# Source code for causallib.positivity.trimming

import pandas as pd
import numpy as np
from causallib.positivity import BasePositivity
from sklearn.linear_model import LogisticRegression

# NOTE(review): this constant is not referenced anywhere in the visible part
# of this module; presumably an accuracy tolerance for the threshold
# optimization -- confirm before relying on it or removing it.
OPTIMAL_THRESHOLD_ACCURACY = 5e-6


def _check_is_valid_threshold_value(threshold_value):
    """Validate a user-supplied numeric trimming threshold.

    Args:
        threshold_value (float | int | None): the cutoff to validate.
            Python and NumPy real scalars are accepted; ``None`` is accepted
            and passed through unchanged.

    Returns:
        float | int | None: `threshold_value`, unchanged.

    Raises:
        ValueError: if `threshold_value` is neither ``None`` nor a real
            number (booleans are rejected).
    """
    if threshold_value is None:
        return threshold_value
    # bool is a subclass of int; reject it explicitly so that True/False are
    # never silently interpreted as the cutoffs 1/0.
    if isinstance(threshold_value, bool):
        raise ValueError("invalid threshold_value")
    if not isinstance(threshold_value,
                      (float, int, np.floating, np.integer)):
        raise ValueError("invalid threshold_value")
    return threshold_value


def _check_is_valid_threshold_method(threshold_method):
    """Resolve and validate the name of a threshold-selection method.

    The alias "auto" is resolved to "crump"; the resolved name must be a key
    of the module-level `cutoff_optimizers` registry.

    Args:
        threshold_method (str): requested method name.

    Returns:
        str: the resolved method name.

    Raises:
        ValueError: if the resolved name is not in `cutoff_optimizers`.
    """
    if threshold_method == "auto":
        threshold_method = "crump"
    if threshold_method in cutoff_optimizers:
        return threshold_method
    raise ValueError("invalid threshold_method")


def _check_propensities(prob):
    """Extract the propensity column from binary treatment probabilities.

    Args:
        prob (pd.DataFrame): per-class assignment probabilities of shape
            (n_samples, n_classes).

    Returns:
        pd.Series: the second column of `prob` (positional index 1).

    Raises:
        ValueError: if `prob` does not have exactly two columns, i.e. the
            treatment assignment is not binary.
    """
    # Require exactly two columns: more than two is a multi-valued treatment,
    # and fewer than two would otherwise fail below with an opaque IndexError.
    if prob.shape[1] != 2:
        raise ValueError('This threshold selection method is applicable only '
                         'for binary treatment assignment')
    return prob.iloc[:, 1]


def crump_cutoff(prob, segments=10000):
    """
    A systematic approach to find the optimal trimming cutoff, based on the
    marginal distribution of the propensity score,
    and according to a variance minimization criterion.

    "Crump, R. K., Hotz, V. J., Imbens, G. W., & Mitnik, O. A. (2009).
    Dealing with limited overlap in estimation of average treatment effects."
    Args:
        prob (pd.DataFrame): probability of being assigned to each group,
                             of shape (n_samples, n_classes).
                             Must have exactly two classes.
        segments (int): number of candidate cutoffs, evenly spaced between
                        1e-7 and 0.5 (inclusive). More candidates result in
                        a more precise cutoff.

    Returns:
        float: the optimal cutoff,
               i.e. the smallest candidate that satisfies the criterion.
               If no candidate satisfies it, the largest candidate is
               returned.

    Raises:
        ValueError: if `segments` is smaller than 1, or if `prob` is rejected
                    by `_check_propensities`.
    """
    if segments < 1:
        # Without at least one candidate there is nothing to return.
        raise ValueError("segments must be a positive integer")
    propensities = _check_propensities(prob)
    alphas = np.linspace(1e-7, 0.5, segments)
    alphas_weights = alphas * (1 - alphas)
    overlap_weights = propensities * (1 - propensities)
    # Candidates are scanned in increasing order, so the first hit is the
    # smallest cutoff meeting the criterion.
    for alpha, alpha_weight in zip(alphas, alphas_weights):
        obs_meets_criterion = overlap_weights >= alpha_weight
        # Twice the mean of 1 / (e * (1 - e)) over the retained samples;
        # the np.maximum guards against dividing by an empty selection.
        criterion = 2 * (np.sum(obs_meets_criterion / overlap_weights) /
                         np.maximum(np.sum(obs_meets_criterion), 1e-7))
        if (1 / alpha_weight) <= criterion:
            return alpha
    # No candidate met the criterion: fall back to the last (largest) one.
    return alphas[-1]


# Registry of threshold-selection methods: maps a method name to the function
# that computes the cutoff from the predicted probabilities.
cutoff_optimizers = {'crump': crump_cutoff}


def _lookup_method(threshold_method):
    """Return the cutoff function registered under `threshold_method`.

    Args:
        threshold_method (str): a key of the `cutoff_optimizers` registry.

    Returns:
        callable: the function stored in `cutoff_optimizers` for that key.

    Raises:
        Exception: if `threshold_method` is not a registered key.
    """
    if threshold_method not in cutoff_optimizers:
        raise Exception("Method %s does not exist" % threshold_method)
    return cutoff_optimizers[threshold_method]


class Trimming(BasePositivity):
    """Positivity model that flags samples with extreme propensity scores.

    A probabilistic learner is fitted to predict treatment assignment; a
    sample is kept only if its predicted probability for every treatment
    group is at least the cutoff threshold.
    """

    def __init__(self, learner=None, threshold="auto"):
        """
        Args:
            learner (sklearn object | None): Initialized sklearn model.
                If None, a new `LogisticRegression()` is created for this
                instance, so instances never share a fitted estimator.
            threshold (str | float) : The threshold method or value.
                - if auto: finding the optimized threshold in a principled
                  way.
                - if float, hard-coded value between 0 to 0.5 is used in
                  order to clip the propensity estimation.

        Raises:
            AttributeError: if `learner` has no `predict_proba` method.
            ValueError: if `threshold` is not a known method name or a valid
                threshold value.
        """
        if learner is None:
            # Build the default here rather than in the signature: a default
            # argument is evaluated once and would be shared (and refitted)
            # by every instance created without an explicit learner.
            learner = LogisticRegression()
        self.learner = learner
        if not hasattr(self.learner, "predict_proba"):
            raise AttributeError("Propensity Estimator must use a machine "
                                 "learning that can predict probabilities"
                                 "(i.e., have predict_proba method)")
        if isinstance(threshold, str):
            # A method name: the actual cutoff is computed later, in `fit`.
            self.threshold = _check_is_valid_threshold_method(threshold)
        else:
            # A fixed value: used as-is, `fit` will not overwrite it.
            self.threshold_ = _check_is_valid_threshold_value(threshold)

    def _fit_threshold(self, X):
        """Fit threshold in a principled way.

        Args:
            X (pd.DataFrame): covariate matrix of size
                (num_subjects, num_features)

        Returns:
            float: the cutoff computed by the method named in
                `self.threshold` from the learner's predicted probabilities.
        """
        prob = self.learner.predict_proba(X)
        prob = pd.DataFrame(prob, index=X.index,
                            columns=self.learner.classes_)
        method = _lookup_method(self.threshold)
        threshold = method(prob)
        return threshold

    def fit(self, X, a):
        """Fit propensity model for positivity.

        Args:
            X (pd.DataFrame): covariate matrix of size
                (num_subjects, num_features)
            a (pd.Series): treatment assignment of size (num_subjects,)

        Returns:
            Trimming: the fitted instance itself.
        """
        self.learner.fit(X, a)
        # `threshold` exists only when a method name was given at
        # construction; a fixed numeric threshold is left untouched.
        if hasattr(self, 'threshold'):
            self.threshold_ = self._fit_threshold(X)
        return self

    def predict(self, X, a, threshold=None):
        """Predict whether or not a sample is in the overlap region.

        Find samples that have probabilities to be assigned to one of the
        treatment groups, that is bigger than the cutoff threshold.
        Return a boolean indexer which is `True` if their probabilities are
        higher than the cutoff threshold and `False` otherwise.

        Args:
            X (pd.DataFrame): covariate matrix of size
                (num_subjects, num_features)
            a (pd.Series): treatment assignment of size (num_subjects,).
                Not used by this method.
            threshold (float|None): The cutoff threshold.
                - if float, an optional value between 0 to 0.5 to clip the
                  propensity estimation.
                - if None, use the optimized cutoff in a principled way.

        Returns:
            pd.Series: a Series of length `X.shape[0]` with the same index
                as `X` and only boolean values

        Raises:
            ValueError: if `threshold` is not a valid threshold value.
        """
        prob = self.learner.predict_proba(X)
        prob = pd.DataFrame(prob, index=X.index,
                            columns=self.learner.classes_)
        threshold_value = _check_is_valid_threshold_value(threshold)
        threshold_to_use = (self.threshold_ if threshold_value is None
                            else threshold_value)
        # Keep a sample only if every group's probability reaches the cutoff.
        untrimmed_indices = (prob >= threshold_to_use).all(axis=1)
        return untrimmed_indices