Source code for causallib.evaluation.predictor

"""Predictor classes.

Predictors generate sets of predictions for a single fold with no cross-validation
or train-test logic.
"""

import abc
from copy import deepcopy
from typing import Union

import pandas as pd

from ..estimation.base_estimator import IndividualOutcomeEstimator
from ..estimation.base_weight import PropensityEstimator, WeightEstimator
from ..utils.stat_utils import robust_lookup

from .predictions import PropensityPredictions, WeightPredictions, OutcomePredictions


[docs]def predict_cv(estimator, X, a, y, cv, refit=True, phases=("train", "valid")):
    """Obtain predictions on the provided data in cross-validation

    Args:
        X (pd.DataFrame): Covariates.
        a (pd.Series): Treatment assignment.
        y (pd.Series): Outcome.
        cv (list[tuples]): list the number of folds containing tuples of indices
            (train_idx, validation_idx)
        refit (bool): Whether to refit the model on each fold.
        phases (list[str]): {["train", "valid"], ["train"], ["valid"]}.
            Phases names to evaluate on - train ("train"), validation ("valid") or both.
            'train' corresponds to cv[i][0] and 'valid' to  cv[i][1]
    Returns:
        (dict[str, list], list): A two-tuple containing:

            * predictions: dictionary with keys being the phases provided and values are
                list the size of the number of folds in cv and containing the output of
                the estimator on that corresponding fold.
                For example, predictions["valid"][3] contains the prediction of the estimator on
                untrained data of the third fold (i.e. validation set of the third fold)
            * models: list the size of the number of folds in cv containing the fitted estimator
                on the training data of that fold.
    """
    predictor = BasePredictor.from_estimator(estimator)(estimator)
    predictions = {phase: [] for phase in phases}
    models = []
    for train_idx, valid_idx in cv:
        data = {
            "train": {
                "X": X.iloc[train_idx],
                "a": a.iloc[train_idx],
                "y": y.iloc[train_idx],
            },
            "valid": {
                "X": X.iloc[valid_idx],
                "a": a.iloc[valid_idx],
                "y": y.iloc[valid_idx],
            },
        }
        # TODO: use dict-comprehension to map between phases[0] to cv[0]
        # instead of writing "train" explicitly

        if refit:
            predictor.fit(
                X=data["train"]["X"], a=data["train"]["a"], y=data["train"]["y"]
            )

        for phase in phases:
            fold_prediction = predictor.predict(X=data[phase]["X"], a=data[phase]["a"])
            predictions[phase].append(fold_prediction)

        models.append(deepcopy(predictor.estimator))
    return predictions, models


[docs]class BasePredictor:
    """Generate predictions from estimator for evaluation (base class)."""

[docs]    @staticmethod
    def from_estimator(
        estimator: Union[
            IndividualOutcomeEstimator, PropensityEstimator, WeightEstimator
        ]
    ):
        """Select subclass based on estimator.

        Args:
            estimator (Union[IndividualOutcomeEstimator, PropensityEstimator, WeightEstimator]):
                Estimator to generate evaluation predictions from.

        Returns:
            Union[PropensityPredictor, WeightPredictor, OutcomePredictor]: the correct predictor for
                the supplied estimator
        """
        # import outside toplevel is the price you pay for having a factory method
        # of the base class

        if isinstance(estimator, PropensityEstimator):
            return PropensityPredictor
        if isinstance(estimator, WeightEstimator):
            return WeightPredictor
        if isinstance(estimator, IndividualOutcomeEstimator):
            return OutcomePredictor
        raise ValueError(f"Received unsupported estimator type {type(estimator)}")

    def __init__(self, estimator):
        self.estimator = estimator

[docs]    @abc.abstractmethod
    def fit(self, X, a, y):
        """Fit an estimator."""
        raise NotImplementedError

[docs]    @abc.abstractmethod
    def predict(self, X, a):
        """Predict (weights, outcomes, etc. depending on the model).
        The output can be as flexible as desired, but score_estimation should know to handle it."""
        raise NotImplementedError


[docs]class OutcomePredictor(BasePredictor):
    """Generate evaluation predictions for IndividualOutcomeEstimator models."""

    def __init__(self, estimator):
        """
        Args:
            estimator (IndividualOutcomeEstimator):
        """
        if not isinstance(estimator, IndividualOutcomeEstimator):
            raise TypeError(
                f"OutcomePredictor must be initialized with IndividualOutcomeEstimator. "
                f"Received ({ type(estimator)}) instead."
            )
        super().__init__(estimator)

[docs]    def fit(self, X, a, y):
        """Fit estimator."""
        self.estimator.fit(X=X, a=a, y=y)

[docs]    def predict(self, X, a):
        """Predict on data."""
        prediction = self.estimator.estimate_individual_outcome(
            X, a, predict_proba=False
        )
        # Use predict_probability if possible since it is needed for most evaluations:
        prediction_event_prob = self.estimator.estimate_individual_outcome(
            X, a, predict_proba=True
        )
        fold_prediction = OutcomePredictions(prediction, prediction_event_prob)
        return fold_prediction


[docs]class WeightPredictor(BasePredictor):
    """Generate evaluation predictions for WeightEstimator models."""

    def __init__(self, estimator):
        """
        Args:
            estimator (WeightEstimator):
        """
        if not isinstance(estimator, WeightEstimator):
            raise TypeError(
                "WeightPredictor must be initialized with WeightEstimator."
                f"Received got ({type(estimator)}) instead."
            )
        super().__init__(estimator)

[docs]    def fit(self, X, a, y=None):
        """Fit estimator. `y` is ignored."""
        self.estimator.fit(X=X, a=a)

[docs]    def predict(self, X, a):
        """Predict on data.

        Args:
            X (pd.DataFrame): Covariates.
            a (pd.Series): Target variable - treatment assignment

        Returns:
            WeightEvaluatorPredictions
        """
        weight_by_treatment_assignment = self.estimator.compute_weights(
            X, a, treatment_values=None, use_stabilized=False
        )
        weight_for_being_treated = self.estimator.compute_weights(
            X, a, treatment_values=a.max(), use_stabilized=False
        )

        prediction = WeightPredictions(
            weight_by_treatment_assignment,
            weight_for_being_treated,
        )
        return prediction


[docs]class PropensityPredictor(WeightPredictor):
    """Generate evaluation predictions for PropensityEstimator models."""

    def __init__(self, estimator):
        """
        Args:
            estimator (PropensityEstimator):
        """
        if not isinstance(estimator, PropensityEstimator):
            raise TypeError(
                "PropensityPredictor must be initialized with PropensityEstimator. "
                f"Received ({type(estimator)}) instead."
            )
        super().__init__(estimator)

[docs]    def predict(self, X, a):
        """Predict on data.

        Args:
            X (pd.DataFrame): Covariates.
            a (pd.Series): Target variable - treatment assignment

        Returns:
            PropensityEvaluatorPredictions
        """
        propensity = self.estimator.compute_propensity(X, a, treatment_values=a.max())
        propensity_matrix = self.estimator.compute_propensity_matrix(X)
        propensity_by_treatment_assignment = robust_lookup(propensity_matrix, a)

        treatment_assignment_pred = self.estimator.learner.predict(
            X
        )  # TODO: maybe add predict_label to interface instead
        treatment_assignment_pred = pd.Series(treatment_assignment_pred, index=X.index)

        weight_prediction = super().predict(X, a)
        # Do not force stabilize=False as in WeightEvaluator:
        weight_by_treatment_assignment = self.estimator.compute_weights(X, a)
        prediction = PropensityPredictions(
            weight_by_treatment_assignment,
            weight_prediction.weight_for_being_treated,
            treatment_assignment_pred,
            propensity,
            propensity_by_treatment_assignment,
        )
        return prediction