Source code for causallib.positivity.trimming
import pandas as pd
import numpy as np
from causallib.positivity import BasePositivity
from sklearn.linear_model import LogisticRegression
# NOTE(review): this constant is not referenced anywhere in the visible part
# of this module; presumably a numeric tolerance for the optimal-threshold
# search -- confirm against the rest of the package before changing it.
OPTIMAL_THRESHOLD_ACCURACY = 5e-6
def _check_is_valid_threshold_value(threshold_value):
    """Validate an explicitly supplied threshold value and return it as-is.

    Any real number is accepted (Python ``int``/``float`` as well as NumPy
    scalar types), not only ``float`` instances, so that values such as ``0``
    or ``np.float32(0.1)`` are not rejected.

    Args:
        threshold_value (float | int | None): A numeric cutoff, or None when
            no explicit value was supplied.

    Returns:
        float | int | None: `threshold_value`, unchanged.

    Raises:
        ValueError: if `threshold_value` is neither None nor a real number.
            Booleans are rejected too, even though `bool` subclasses `int`.
    """
    # Function-scope import: keeps this block self-contained without touching
    # the module-level import list.
    import numbers

    if threshold_value is None:
        return threshold_value
    # `True`/`False` would pass the `numbers.Real` check, but a boolean
    # threshold is a caller mistake rather than a meaningful cutoff.
    is_real_number = (isinstance(threshold_value, numbers.Real)
                      and not isinstance(threshold_value, bool))
    if not is_real_number:
        raise ValueError("invalid threshold_value")
    return threshold_value
def _check_is_valid_threshold_method(threshold_method):
    """Resolve and validate the name of a threshold-selection method.

    The alias "auto" is translated to "crump" before the name is looked up
    in the module-level `cutoff_optimizers` registry.

    Args:
        threshold_method (str): Name of the requested method, or "auto".

    Returns:
        str: The resolved method name (a key of `cutoff_optimizers`).

    Raises:
        ValueError: if the resolved name is not a key of `cutoff_optimizers`.
    """
    if threshold_method == "auto":
        threshold_method = "crump"
    if threshold_method in cutoff_optimizers:
        return threshold_method
    raise ValueError("invalid threshold_method")
def _check_propensities(prob):
    """Extract the propensity column after checking the treatment is binary.

    Args:
        prob (pd.DataFrame): Predicted probabilities with one column per
            treatment value.

    Returns:
        pd.Series: The second column of `prob` (positional index 1).

    Raises:
        ValueError: if `prob` has more than two columns.
    """
    n_treatment_values = prob.shape[1]
    if n_treatment_values > 2:
        raise ValueError('This threshold selection method is applicable only '
                         'for binary treatment assignment')
    # Positional selection: the second column is used as the propensity score.
    return prob.iloc[:, 1]
def crump_cutoff(prob, segments=10000):
    """
    Find a trimming cutoff from the marginal distribution of the propensity
    score, using the variance-minimization criterion of
    "Crump, R. K., Hotz, V. J., Imbens, G. W., & Mitnik, O. A. (2009).
    Dealing with limited overlap in estimation of average treatment effects."

    Candidate cutoffs are scanned in increasing order over an evenly spaced
    grid, and the scan stops at the first candidate that meets the criterion.

    Args:
        prob (pd.DataFrame): probability of being assigned to each group,
                             of shape (n_samples, n_classes). Only binary
                             treatment (at most two columns) is supported.
        segments (int): number of grid points spanning [1e-7, 0.5].
                        More segments give a more precise cutoff.
    Returns:
        float: the optimal cutoff, i.e. the smallest grid value that
               satisfies the criterion. If no grid value satisfies it, the
               last grid value is returned.
    """
    scores = _check_propensities(prob)
    candidate_cutoffs = np.linspace(1e-7, 0.5, segments)
    candidate_weights = candidate_cutoffs * (1 - candidate_cutoffs)
    score_weights = scores * (1 - scores)
    for cutoff, cutoff_weight in zip(candidate_cutoffs, candidate_weights):
        # Samples whose overlap weight is at least that of the candidate.
        kept = score_weights >= cutoff_weight
        inverse_weight_total = np.sum(kept / score_weights)
        # Floor the count so an empty selection does not divide by zero.
        n_kept = np.maximum(np.sum(kept), 1e-7)
        criterion = 2 * (inverse_weight_total / n_kept)
        if (1 / cutoff_weight) <= criterion:
            break
    # After the loop `cutoff` is either the candidate that triggered the
    # break or, when none did, the last grid value.
    return cutoff
# Registry of threshold-selection methods: maps a method name to the function
# that computes the cutoff from the predicted probabilities.
cutoff_optimizers = {'crump': crump_cutoff}
def _lookup_method(threshold_method):
    """Return the cutoff function registered under `threshold_method`.

    Args:
        threshold_method (str): A key of the `cutoff_optimizers` registry.

    Returns:
        callable: The function stored in `cutoff_optimizers` for that key.

    Raises:
        Exception: if `threshold_method` is not a key of `cutoff_optimizers`.
    """
    if threshold_method not in cutoff_optimizers:
        raise Exception("Method %s does not exist" % threshold_method)
    return cutoff_optimizers[threshold_method]
[docs]
class Trimming(BasePositivity):
    """Positivity check that trims samples with extreme propensity scores.

    A propensity model (`learner`) is fitted on the covariates, and a sample
    is kept only if its predicted probability for every treatment value is
    at least the cutoff threshold.

    Attribute conventions used by this class:
      * `threshold`  -- set only when a threshold *method name* was given;
        `fit` then computes the cutoff with that method.
      * `threshold_` -- the numeric cutoff: set in `__init__` when a numeric
        value was given, otherwise set by `fit`.
    """

    def __init__(self,
                 learner=None,
                 threshold="auto"):
        """
        Args:
            learner (sklearn object | None): Initialized sklearn model that
                implements `predict_proba`. If None, a new
                `LogisticRegression()` is created for this instance.
            threshold (str | float) : The threshold method or value.
                - if auto: finding the optimized threshold in a principled way.
                - if float, hard-coded value between 0 to 0.5 is used
                  in order to clip the propensity estimation.

        Raises:
            AttributeError: if `learner` has no `predict_proba` method.
            ValueError: if `threshold` is an unknown method name or an
                invalid value.
        """
        # Build the default learner per instance. A default created in the
        # signature is evaluated once, so every Trimming object would share
        # (and refit) the very same estimator.
        if learner is None:
            learner = LogisticRegression()
        self.learner = learner
        if not hasattr(self.learner, "predict_proba"):
            raise AttributeError("Propensity Estimator must use a machine "
                                 "learning that can predict probabilities "
                                 "(i.e., have predict_proba method)")
        if isinstance(threshold, str):
            # Method name: the numeric cutoff is computed later, in `fit`.
            self.threshold = _check_is_valid_threshold_method(threshold)
        else:
            # Explicit value: usable as the cutoff right away.
            self.threshold_ = _check_is_valid_threshold_value(threshold)

    def _fit_threshold(self, X):
        """Fit threshold in a principled way.

        Args:
            X (pd.DataFrame): covariate matrix of size
                (num_subjects, num_features)

        Returns:
            The cutoff computed by the method registered under
            `self.threshold`, applied to the learner's predicted
            probabilities for `X`.
        """
        prob = self.learner.predict_proba(X)
        prob = pd.DataFrame(prob, index=X.index, columns=self.learner.classes_)
        method = _lookup_method(self.threshold)
        return method(prob)

    def fit(self, X, a):
        """Fit propensity model for positivity.

        Args:
            X (pd.DataFrame): covariate matrix of size
                (num_subjects, num_features)
            a (pd.Series): treatment assignment of size (num_subjects,)

        Returns:
            Trimming: the fitted instance (`self`).
        """
        self.learner.fit(X, a)
        # `threshold` exists only when a method name was supplied at
        # construction; a hard-coded numeric cutoff is left untouched.
        if hasattr(self, 'threshold'):
            self.threshold_ = self._fit_threshold(X)
        return self

    def predict(self, X, a, threshold=None):
        """Predict whether or not a sample is in the overlap region.

        Find samples that have probabilities to be assigned to one of the
        treatment groups, that is bigger than the cutoff threshold.
        Return a boolean indexer which is `True` if their probabilities are
        higher than the cutoff threshold and `False` otherwise.

        Args:
            X (pd.DataFrame): covariate matrix of size
                (num_subjects, num_features)
            a (pd.Series): treatment assignment of size (num_subjects,).
                Not used by this method.
            threshold (float|None): The cutoff threshold.
                - if float, an optional value between 0 to 0.5 to clip the
                  propensity estimation.
                - if None, use the optimized cutoff in a principled way.

        Returns:
            pd.Series: a Series of length `X.shape[0]` with the same index as
                `X` and only boolean values

        Raises:
            ValueError: if `threshold` is invalid, or if neither `threshold`
                nor a stored cutoff is available.
        """
        prob = self.learner.predict_proba(X)
        prob = pd.DataFrame(prob, index=X.index, columns=self.learner.classes_)
        threshold_value = _check_is_valid_threshold_value(threshold)
        threshold_to_use = (self.threshold_ if threshold_value is None
                            else threshold_value)
        if threshold_to_use is None:
            # Reached when the object was built with `threshold=None` and no
            # value is passed here; report it explicitly instead of letting
            # the comparison below fail on a None cutoff.
            raise ValueError("no threshold available: pass `threshold` to "
                             "predict or construct Trimming with a "
                             "threshold method or value")
        # Keep a sample only if every class probability reaches the cutoff.
        untrimmed_indices = (prob >= threshold_to_use).all(axis=1)
        return untrimmed_indices