# Source code for causallib.utils.crossfit

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils.metaestimators import _safe_split
import pandas as pd
from sklearn.base import clone


def cross_fitting(estimator, X, y, n_splits=5, predict_proba=False,
                  return_estimator=True):
    """Compute held-out predictions by fitting a clone of the estimator per fold.

    The data is split into ``n_splits`` folds. For every fold, a fresh clone
    of ``estimator`` is fitted on the remaining folds and used to predict the
    fold that was left out. The ``estimator`` object passed in is only
    cloned, never fitted, by this function.

    Args:
        estimator (object): sklearn object
        X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
        y (pd.Series): Observed outcome of size (num_subjects,).
        n_splits (int): number of folds
        predict_proba (bool): If True, folds are built with `StratifiedKFold`
                                (stratified on `y`) and predictions are made
                                with 'predict_proba',
                              If False, folds are built with `KFold` and
                                predictions are made with 'predict'.
        return_estimator (bool): If true return fitted estimators of each fold

    Returns:
        pd.Series: held-out predictions, one entry per test sample. The
            per-fold predictions are concatenated in fold order, and each
            entry carries the index label of its row in `X`.
        if return_estimator:
            a tuple ``(predictions, estimators)`` where `estimators` is a
            tuple holding, per fold, the clone fitted on that fold's
            training samples.
    """
    # Neither splitter is given `shuffle`/`random_state`, so the sklearn
    # defaults for those parameters apply.
    splitter_cls = StratifiedKFold if predict_proba else KFold
    cv = splitter_cls(n_splits=n_splits)

    fold_results = [
        _fit_and_predict(clone(estimator), X, y, train, test,
                         predict_proba=predict_proba)
        for train, test in cv.split(X, y)
    ]

    # Transpose [(pred, est), (pred, est), ...] into (preds...), (ests...).
    fold_predictions, fold_estimators = zip(*fold_results)
    held_out_predictions = pd.concat(fold_predictions)

    if return_estimator:
        return held_out_predictions, fold_estimators
    return held_out_predictions


def _fit_and_predict(estimator, X, y, train, test, predict_proba):
    """
    Fit the estimator on the train samples and predict the test samples.

    Args:
        estimator(object): sklearn object; it is fitted in place.
        X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
        y (pd.Series): Observed outcome of size (num_subjects,).
        train: Indices of the samples used for fitting, as accepted by
               sklearn's `_safe_split`.
        test: Indices of the samples to predict, as accepted by
              sklearn's `_safe_split`.
        predict_proba (bool): If True, call 'predict_proba' and keep
                                column 1 of its output,
                              If False, call 'predict'.

    Returns:
        A tuple ``(predictions, estimator)``: the predictions as a pd.Series
        indexed by the index of the test rows of `X`, and the fitted
        estimator (the same object that was passed in).
    """
    fit_X, fit_y = _safe_split(estimator, X, y, train)
    # The train indices are forwarded as well; `_safe_split` accepts them as
    # its optional fifth argument when slicing the test subset.
    held_out_X, _ = _safe_split(estimator, X, y, test, train)

    estimator.fit(fit_X, fit_y)

    held_out_pred = (
        estimator.predict_proba(held_out_X)[:, 1]
        if predict_proba
        else estimator.predict(held_out_X)
    )
    return pd.Series(held_out_pred, index=held_out_X.index), estimator