Source code for causallib.utils.crossfit

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils.metaestimators import _safe_split
import pandas as pd
from sklearn.base import clone


def cross_fitting(estimator, X, y, n_splits=5, predict_proba=False, return_estimator=True):
    """Produce out-of-fold predictions by fitting a fresh clone of `estimator` per fold.

    The data is partitioned with `StratifiedKFold` when `predict_proba` is
    truthy and with `KFold` otherwise. For every (train, test) pair a clone
    of `estimator` is fitted on the train part and used to predict the test
    part; the per-fold predictions are then concatenated in fold order.

    Args:
        estimator (object): sklearn-style estimator; it is cloned for every
            fold, so the object passed in is never fitted here.
        X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
        y (pd.Series): Observed outcome of size (num_subjects,).
        n_splits (int): Number of folds handed to the splitter.
        predict_proba (bool): If True, use `predict_proba` (second column)
            for the held-out predictions; if False, use `predict`.
        return_estimator (bool): If True, also return the fitted per-fold
            estimators.

    Returns:
        pd.Series: Held-out predictions, indexed by the index of the rows
            they were predicted for.
        If `return_estimator` is True, a 2-tuple
        `(predictions, estimators)` is returned instead, where `estimators`
        is a tuple holding the fitted clone of each fold, in fold order.
    """
    splitter_cls = StratifiedKFold if predict_proba else KFold
    splitter = splitter_cls(n_splits=n_splits)

    fold_predictions = []
    fold_estimators = []
    for train_idx, test_idx in splitter.split(X, y):
        prediction, fitted = _fit_and_predict(
            clone(estimator), X, y, train_idx, test_idx,
            predict_proba=predict_proba,
        )
        fold_predictions.append(prediction)
        fold_estimators.append(fitted)

    held_out = pd.concat(fold_predictions)
    if not return_estimator:
        return held_out
    # Callers receive the estimators as an immutable tuple, one per fold.
    return held_out, tuple(fold_estimators)
def _fit_and_predict(estimator, X, y, train, test, predict_proba):
    """Fit `estimator` on the train rows and predict the test rows.

    Args:
        estimator (object): sklearn-style estimator; it is fitted in place.
        X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
        y (pd.Series): Observed outcome of size (num_subjects,).
        train: Selector of the training samples, forwarded to `_safe_split`.
        test: Selector of the held-out samples, forwarded to `_safe_split`.
        predict_proba (bool): If True, take the second column of
            `estimator.predict_proba`; if False, use `estimator.predict`.

    Returns:
        tuple: `(predictions, estimator)` where `predictions` is a
            `pd.Series` indexed like the held-out rows of `X`, and
            `estimator` is the same object after fitting.
    """
    train_features, train_target = _safe_split(estimator, X, y, train)
    # Only the held-out features are needed; the held-out target is discarded.
    held_out_features, _unused_target = _safe_split(estimator, X, y, test, train)

    estimator.fit(train_features, train_target)

    if not predict_proba:
        raw_prediction = estimator.predict(held_out_features)
    else:
        # Keep only column 1 of the probability matrix.
        raw_prediction = estimator.predict_proba(held_out_features)[:, 1]

    return pd.Series(raw_prediction, index=held_out_features.index), estimator