Source code for causallib.evaluation.predictor

"""Predictor classes.

Predictors generate sets of predictions for a single fold with no cross-validation
or train-test logic.
"""

import abc
from copy import deepcopy
from typing import Union

import pandas as pd

from ..estimation.base_estimator import IndividualOutcomeEstimator
from ..estimation.base_weight import PropensityEstimator, WeightEstimator
from ..utils.stat_utils import robust_lookup

from .predictions import PropensityPredictions, WeightPredictions, OutcomePredictions


def predict_cv(estimator, X, a, y, cv, refit=True, phases=("train", "valid")):
    """Collect per-fold predictions of an estimator over cross-validation splits.

    Args:
        estimator: Causal model to evaluate. It is wrapped in the predictor
            class chosen by `BasePredictor.from_estimator`.
        X (pd.DataFrame): Covariates.
        a (pd.Series): Treatment assignment.
        y (pd.Series): Outcome.
        cv (list[tuples]): One `(train_idx, validation_idx)` tuple of
            positional indices per fold.
        refit (bool): Whether to refit the model on the training data of
            each fold.
        phases (list[str]): {["train", "valid"], ["train"], ["valid"]}.
            Names of the phases to predict on - train ("train"),
            validation ("valid") or both.
            'train' corresponds to cv[i][0] and 'valid' to cv[i][1].

    Returns:
        (dict[str, list], list): A two-tuple containing:

            * predictions: dictionary keyed by the requested phases; each
              value is a list with one entry per fold in `cv`, holding the
              predictor output on that phase of that fold. For example,
              predictions["valid"][3] is the prediction on the validation
              set of the fold at index 3.
            * models: list with one entry per fold in `cv`, each a deep copy
              of the estimator as it stood after that fold (i.e. fitted on
              the fold's training data when `refit` is True).
    """
    predictor_cls = BasePredictor.from_estimator(estimator)
    predictor = predictor_cls(estimator)

    predictions = {phase: [] for phase in phases}
    models = []
    for train_idx, valid_idx in cv:
        # TODO: derive the phase names from `phases` instead of hard-coding
        # "train" / "valid" here.
        fold_indices = {"train": train_idx, "valid": valid_idx}
        fold_data = {
            name: {"X": X.iloc[idx], "a": a.iloc[idx], "y": y.iloc[idx]}
            for name, idx in fold_indices.items()
        }

        if refit:
            # Expands to fit(X=..., a=..., y=...) on the training slice.
            predictor.fit(**fold_data["train"])

        for phase in phases:
            phase_data = fold_data[phase]
            predictions[phase].append(
                predictor.predict(X=phase_data["X"], a=phase_data["a"])
            )

        # The same estimator object is reused across folds, so store a copy.
        models.append(deepcopy(predictor.estimator))

    return predictions, models
class BasePredictor(abc.ABC):
    """Generate predictions from estimator for evaluation (base class).

    A predictor wraps a single estimator (stored as `self.estimator`) and
    exposes a uniform `fit` / `predict` pair. The class derives from
    `abc.ABC` so that the `abstractmethod` markers below are enforced:
    instantiating this class directly, or a subclass that does not implement
    both `fit` and `predict`, raises `TypeError`.
    """

    @staticmethod
    def from_estimator(
        estimator: "Union[IndividualOutcomeEstimator, PropensityEstimator, WeightEstimator]",
    ):
        """Select subclass based on estimator.

        Args:
            estimator (Union[IndividualOutcomeEstimator, PropensityEstimator, WeightEstimator]):
                Estimator to generate evaluation predictions from.

        Returns:
            Union[PropensityPredictor, WeightPredictor, OutcomePredictor]:
                the correct predictor class (not an instance) for the
                supplied estimator

        Raises:
            ValueError: If `estimator` is an instance of none of the
                supported estimator types.
        """
        # PropensityEstimator is tested before WeightEstimator on purpose:
        # PropensityPredictor requires its estimator to pass both checks, so
        # the more specific predictor has to be matched first.
        if isinstance(estimator, PropensityEstimator):
            return PropensityPredictor
        if isinstance(estimator, WeightEstimator):
            return WeightPredictor
        if isinstance(estimator, IndividualOutcomeEstimator):
            return OutcomePredictor
        raise ValueError(f"Received unsupported estimator type {type(estimator)}")

    def __init__(self, estimator):
        """Store the estimator that `fit` and `predict` operate on.

        Args:
            estimator: Estimator to wrap.
        """
        self.estimator = estimator

    @abc.abstractmethod
    def fit(self, X, a, y):
        """Fit an estimator."""
        raise NotImplementedError

    @abc.abstractmethod
    def predict(self, X, a):
        """Predict (weights, outcomes, etc. depending on the model).

        The output can be as flexible as desired, but score_estimation should
        know to handle it.
        """
        raise NotImplementedError
class OutcomePredictor(BasePredictor):
    """Produce evaluation predictions from IndividualOutcomeEstimator models."""

    def __init__(self, estimator):
        """Wrap an outcome model.

        Args:
            estimator (IndividualOutcomeEstimator): Model to generate
                predictions from.

        Raises:
            TypeError: If `estimator` is not an IndividualOutcomeEstimator.
        """
        if not isinstance(estimator, IndividualOutcomeEstimator):
            received = type(estimator)
            message = (
                "OutcomePredictor must be initialized with IndividualOutcomeEstimator. "
                f"Received ({received}) instead."
            )
            raise TypeError(message)
        super().__init__(estimator)

    def fit(self, X, a, y):
        """Fit the wrapped estimator on covariates, treatment and outcome."""
        self.estimator.fit(X=X, a=a, y=y)

    def predict(self, X, a):
        """Predict individual outcomes on data.

        The estimator is queried twice: once with `predict_proba=False` and
        once with `predict_proba=True`, since the probability output is
        needed for most evaluations.

        Args:
            X (pd.DataFrame): Covariates.
            a (pd.Series): Treatment assignment.

        Returns:
            OutcomePredictions: built from the two estimator outputs, in the
                order (predict_proba=False, predict_proba=True).
        """
        estimate = self.estimator.estimate_individual_outcome
        point_prediction = estimate(X, a, predict_proba=False)
        probability_prediction = estimate(X, a, predict_proba=True)
        return OutcomePredictions(point_prediction, probability_prediction)
class WeightPredictor(BasePredictor):
    """Generate evaluation predictions for WeightEstimator models."""

    def __init__(self, estimator):
        """Wrap a weight model.

        Args:
            estimator (WeightEstimator): Model to generate predictions from.

        Raises:
            TypeError: If `estimator` is not a WeightEstimator.
        """
        if not isinstance(estimator, WeightEstimator):
            # Message wording matches the other predictors in this module
            # (previously the two sentences ran together as
            # "...WeightEstimator.Received got ...").
            raise TypeError(
                "WeightPredictor must be initialized with WeightEstimator. "
                f"Received ({type(estimator)}) instead."
            )
        super().__init__(estimator)

    def fit(self, X, a, y=None):
        """Fit estimator. `y` is ignored."""
        self.estimator.fit(X=X, a=a)

    def predict(self, X, a):
        """Predict on data.

        Args:
            X (pd.DataFrame): Covariates.
            a (pd.Series): Target variable - treatment assignment

        Returns:
            WeightPredictions: built from two non-stabilized weight
                computations: the weights for the observed treatment
                assignment, and the weights for the "treated" level.
        """
        weight_by_treatment_assignment = self.estimator.compute_weights(
            X, a, treatment_values=None, use_stabilized=False
        )
        # The largest value present in `a` is taken as the "treated" level.
        weight_for_being_treated = self.estimator.compute_weights(
            X, a, treatment_values=a.max(), use_stabilized=False
        )
        prediction = WeightPredictions(
            weight_by_treatment_assignment,
            weight_for_being_treated,
        )
        return prediction
class PropensityPredictor(WeightPredictor):
    """Produce evaluation predictions from PropensityEstimator models."""

    def __init__(self, estimator):
        """Wrap a propensity model.

        Args:
            estimator (PropensityEstimator): Model to generate predictions
                from.

        Raises:
            TypeError: If `estimator` is not a PropensityEstimator (or fails
                the parent class's own type check).
        """
        if not isinstance(estimator, PropensityEstimator):
            received = type(estimator)
            message = (
                "PropensityPredictor must be initialized with PropensityEstimator. "
                f"Received ({received}) instead."
            )
            raise TypeError(message)
        super().__init__(estimator)

    def predict(self, X, a):
        """Predict on data.

        Args:
            X (pd.DataFrame): Covariates.
            a (pd.Series): Target variable - treatment assignment

        Returns:
            PropensityPredictions: built from, in order, the weights for the
                observed assignment, the weight for being treated (taken from
                the parent class's prediction), the learner's predicted
                treatment assignment, the propensity for the "treated" level
                and the propensity for the observed assignment.
        """
        model = self.estimator

        # The largest value present in `a` is taken as the "treated" level.
        treated_propensity = model.compute_propensity(
            X, a, treatment_values=a.max()
        )
        assigned_propensity = robust_lookup(model.compute_propensity_matrix(X), a)

        # TODO: maybe add predict_label to interface instead
        predicted_assignment = pd.Series(model.learner.predict(X), index=X.index)

        inherited = super().predict(X, a)
        # Unlike WeightPredictor.predict, `use_stabilized` is not forced to
        # False here; the estimator's own default applies.
        assigned_weight = model.compute_weights(X, a)

        return PropensityPredictions(
            assigned_weight,
            inherited.weight_for_being_treated,
            predicted_assignment,
            treated_propensity,
            assigned_propensity,
        )