Source code for causallib.estimation.ipw

"""
(C) Copyright 2019 IBM Corp.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Created on Apr 25, 2018

"""

import warnings

import pandas as pd

from .base_estimator import PopulationOutcomeEstimator
from .base_weight import PropensityEstimator
from ..utils.stat_utils import robust_lookup


class IPW(PropensityEstimator, PopulationOutcomeEstimator):
    """
    Causal model implementing inverse probability (propensity score) weighting:
    w_i = 1 / Pr[A=a_i|X_i]
    """

    def __init__(self, learner, clip_min=None, clip_max=None, use_stabilized=False, verbose=False):
        """
        Args:
            learner: Initialized sklearn model.
            clip_min (None|float): Optional value between 0 and 0.5 used to lower-bound the propensity estimation
                                   by clipping it. Probabilities under clip_min are clipped to this value.
            clip_max (None|float): Optional value between 0.5 and 1 used to upper-bound the propensity estimation
                                   by clipping it. Probabilities above clip_max are clipped to this value.
            use_stabilized (bool): Whether to re-weigh the learned weights with the prevalence of the treatment.
                                   See Also: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4351790/#S6title
            verbose (bool): Whether to print summary statistics regarding the number of samples clipped
                            due to clip_min and clip_max.
        """
        super(IPW, self).__init__(learner, use_stabilized)
        self.__check_truncation_value_is_legal(clip_min, clip_max)
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.verbose = verbose
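    # Construction sketch (illustrative, assuming scikit-learn is available): any
    # classifier exposing predict_proba can serve as the learner, e.g.:
    #     from sklearn.linear_model import LogisticRegression
    #     ipw = IPW(LogisticRegression(), clip_min=0.05, clip_max=0.95)
    # which bounds every estimated propensity to [0.05, 0.95] before weighting.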
    def fit(self, X, a, y=None):
        if self.use_stabilized:
            # Save the marginal treatment prevalence Pr[A=a] for later weight stabilization:
            self.treatment_prevalence_ = a.value_counts(normalize=True, sort=False)
        self.learner.fit(X, a)
        return self
    def _predict(self, X):
        # Assumes PropensityEstimator checked that learner has predict_proba during initialization:
        prediction_matrix = self.learner.predict_proba(X)
        columns = getattr(self.learner, "classes_", None)  # non-sklearn classifiers might not have the attribute
        prediction_matrix = pd.DataFrame(prediction_matrix, index=X.index, columns=columns)
        return prediction_matrix
    def compute_weights(self, X, a, treatment_values=None, clip_min=None, clip_max=None, use_stabilized=None):
        """
        Computes an individual weight given the individual's treatment assignment:
        w_i = 1 / Pr[A=a_i|X_i] for each individual i.

        Args:
            X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
            a (pd.Series): Treatment assignment of size (num_subjects,).
            treatment_values (Any | None): Desired treatment value(s) to extract weights for
                                           (i.e., for what treatment value(s) the weights should be calculated).
                                           If not specified, weights are chosen by each individual's actual
                                           treatment assignment.
            clip_min (None|float): Optional value between 0 and 0.5 used to lower-bound the propensity estimation
                                   by clipping it. Probabilities under clip_min are clipped to this value.
            clip_max (None|float): Optional value between 0.5 and 1 used to upper-bound the propensity estimation
                                   by clipping it. Probabilities above clip_max are clipped to this value.
            use_stabilized (None|bool): Whether to re-weigh the learned weights with the prevalence of the treatment.
                                        This overrides the use_stabilized parameter provided at initialization.
                                        If True is provided but the model was initialized with use_stabilized=False,
                                        then prevalence is calculated from the data at hand rather than from the
                                        training data.
                                        See Also: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4351790/#S6title

        Returns:
            pd.Series | pd.DataFrame: If treatment_values is not supplied (None) or is a scalar, then a vector of
                                      n_samples with a weight for each sample is returned.
                                      If treatment_values is a list/array, then a DataFrame is returned.
        """
        weight_matrix = self.compute_weight_matrix(X, a, clip_min, clip_max, use_stabilized)
        if treatment_values is None:
            # Lookup table: take the column a[i] for every i in index(a):
            weights = robust_lookup(weight_matrix, a)
        else:
            weights = weight_matrix[treatment_values]
        return weights
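    # Worked example (illustrative): for a binary treatment, if Pr[A=1|X_i] = 0.8,
    # a treated individual (a_i = 1) gets w_i = 1 / 0.8 = 1.25, while an untreated
    # individual with the same covariates gets w_i = 1 / (1 - 0.8) = 5.0; unlikely
    # treatment assignments are thus up-weighted to emulate a randomized population.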
    def compute_weight_matrix(self, X, a, clip_min=None, clip_max=None, use_stabilized=None):
        """
        Computes individual weights across all possible treatment values:
        w_ij = 1 / Pr[A=a_j | X_i] for every individual i and treatment j.

        Args:
            X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
            a (pd.Series): Treatment assignment of size (num_subjects,).
            clip_min (None|float): Optional value between 0 and 0.5 used to lower-bound the propensity estimation
                                   by clipping it. Probabilities under clip_min are clipped to this value.
            clip_max (None|float): Optional value between 0.5 and 1 used to upper-bound the propensity estimation
                                   by clipping it. Probabilities above clip_max are clipped to this value.
            use_stabilized (None|bool): Whether to re-weigh the learned weights with the prevalence of the treatment.
                                        This overrides the use_stabilized parameter provided at initialization.
                                        If True is provided but the model was initialized with use_stabilized=False,
                                        then prevalence is calculated from the data at hand rather than from the
                                        training data.
                                        See Also: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4351790/#S6title

        Returns:
            pd.DataFrame: A matrix of size (num_subjects, num_treatments) with a weight for every individual and
                          every treatment.
        """
        use_stabilized = self.use_stabilized if use_stabilized is None else use_stabilized
        probabilities = self.compute_propensity_matrix(X, a, clip_min, clip_max)
        # weight_matrix = 1.0 / probabilities  # type: pd.DataFrame
        weight_matrix = probabilities.rdiv(1.0)
        if use_stabilized:
            if self.use_stabilized:
                prevalence = self.treatment_prevalence_
            else:
                warnings.warn("Stabilized weights were requested, but the model was not trained using "
                              "stabilization; treatment prevalence is therefore calculated from the provided "
                              "treatment assignment.", RuntimeWarning)
                prevalence = a.value_counts(normalize=True, sort=False)
            prevalence_per_subject = a.replace(prevalence)  # map treatment assignment to prevalence
            # Pointwise multiplication of each column in weights:
            weight_matrix = weight_matrix.multiply(prevalence_per_subject, axis="index")
        return weight_matrix
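    # Stabilization note (illustrative): with use_stabilized=True the weight becomes
    # sw_i = Pr[A=a_i] / Pr[A=a_i|X_i], i.e. the inverse-propensity weight multiplied
    # by the marginal treatment prevalence, which typically shrinks extreme weights
    # and reduces their variance.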
    def compute_propensity(self, X, a, treatment_values=None, clip_min=None, clip_max=None):
        """
        Args:
            X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
            a (pd.Series): Treatment assignment of size (num_subjects,).
            treatment_values (Any | None): Desired treatment value(s) to extract propensity for
                                           (i.e., for what treatment value(s) the probabilities should be
                                           calculated). If not specified, the maximal treatment value is chosen,
                                           since the usual case is a treatment (A=1) vs. control (A=0) setting.
            clip_min (None|float): Optional value between 0 and 0.5 used to lower-bound the propensity estimation
                                   by clipping it. Probabilities under clip_min are clipped to this value.
            clip_max (None|float): Optional value between 0.5 and 1 used to upper-bound the propensity estimation
                                   by clipping it. Probabilities above clip_max are clipped to this value.

        Returns:
            pd.DataFrame | pd.Series: A matrix/vector with num_subjects rows, whose number of columns is the number
                                      of values provided in treatment_values. The content is the probability of
                                      every individual to have the specified treatment value.
                                      If treatment_values is a list/vector, then a pd.DataFrame is returned;
                                      if treatment_values is a scalar, then a pd.Series is returned
                                      (just like slicing a DataFrame's columns).
        """
        treatment_values = a.max() if treatment_values is None else treatment_values
        probabilities = self.compute_propensity_matrix(X, a, clip_min, clip_max)
        probabilities = probabilities[treatment_values]
        return probabilities
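    # Usage sketch (illustrative, `ipw` being a fitted instance): for a binary 0/1
    # treatment, the default call returns the propensity for the maximal treatment
    # value, i.e. Pr[A=1|X]:
    #     propensity = ipw.compute_propensity(X, a)  # pd.Series, one value per subject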
    def compute_propensity_matrix(self, X, a=None, clip_min=None, clip_max=None):
        """
        Args:
            X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
            a (pd.Series): Treatment assignment of size (num_subjects,).
            clip_min (None|float): Optional value between 0 and 0.5 used to lower-bound the propensity estimation
                                   by clipping it. Probabilities under clip_min are clipped to this value.
            clip_max (None|float): Optional value between 0.5 and 1 used to upper-bound the propensity estimation
                                   by clipping it. Probabilities above clip_max are clipped to this value.

        Returns:
            pd.DataFrame: A matrix of size (num_subjects, num_treatments) with the probability of every individual
                          and every treatment.
        """
        clip_min = self.clip_min if clip_min is None else clip_min
        clip_max = self.clip_max if clip_max is None else clip_max
        self.__check_truncation_value_is_legal(clip_min, clip_max)

        probabilities = self._predict(X)
        clip_min = 0 if clip_min is None else clip_min
        clip_max = 1 if clip_max is None else clip_max
        self.__count_truncated(probabilities, clip_min, clip_max)
        probabilities = probabilities.clip(lower=clip_min, upper=clip_max)
        return probabilities
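    # Clipping sketch (illustrative): with clip_min=0.05 and clip_max=0.95, an
    # estimated propensity of 0.01 is raised to 0.05 and one of 0.99 is lowered
    # to 0.95, bounding the resulting inverse weights by 1 / 0.05 = 20.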
    def estimate_population_outcome(self, X, a, y, w=None, treatment_values=None):
        """
        Calculates the weighted population outcome for each subgroup stratified by treatment assignment.

        Args:
            X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
            a (pd.Series): Treatment assignment of size (num_subjects,).
            y (pd.Series): Observed outcome of size (num_subjects,).
            w (pd.Series | None): Individual (sample) weights, used to achieve an unbiased average outcome.
                                  If not provided, will be calculated from the data.
            treatment_values (Any): Desired treatment value(s) to stratify upon. Must be a subset of the values
                                    found in `a`. If not supplied, calculates for all available treatment values.

        Returns:
            pd.Series[Any, float]: A Series whose index is the treatment values and whose values are the
                                   aggregated outcomes for the strata of people whose assigned treatment is the key.
        """
        if w is None:
            w = self.compute_weights(X, a)
        res = self._compute_stratified_weighted_aggregate(y, sample_weight=w, stratify_by=a,
                                                          treatment_values=treatment_values)
        return res
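    # Estimation note (a sketch, assuming the aggregate is a weighted average per
    # stratum): the population outcome per treatment arm a then corresponds to
    #     E[Y^a] ~= sum_{i: a_i = a} w_i * y_i / sum_{i: a_i = a} w_i,
    # so differences between arms estimate average treatment effects.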
    @staticmethod
    def __check_truncation_value_is_legal(clip_min, clip_max):
        if clip_min is not None and not 0 <= clip_min <= 0.5:
            raise AssertionError("Provided value for truncation (clip_min) should be between 0.0 and 0.5")
        if clip_max is not None and not 0.5 <= clip_max <= 1:
            raise AssertionError("Provided value for truncation (clip_max) should be between 0.5 and 1.0")

    def __count_truncated(self, probabilities, clip_min, clip_max):
        num_clipped = probabilities.apply(lambda x: ~x.between(clip_min, clip_max)).sum().sum()
        fraction_clipped = num_clipped / probabilities.size
        if self.verbose:
            print(f'Fraction of values being truncated: {fraction_clipped:.5f}.')  # TODO: do as log
        return fraction_clipped
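
# Minimal end-to-end usage sketch on synthetic data (illustrative; assumes
# scikit-learn is installed, and all variable names below are made up):
if __name__ == "__main__":
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(0)
    n = 1000
    X = pd.DataFrame({"x1": rng.normal(size=n), "x2": rng.normal(size=n)})
    # Treatment assignment depends on covariates (confounding):
    true_propensity = 1 / (1 + np.exp(-(0.5 * X["x1"] - 0.25 * X["x2"])))
    a = pd.Series(rng.binomial(1, true_propensity), name="treatment")
    # Outcome with a true additive treatment effect of 2:
    y = pd.Series(X["x1"] + 2 * a + rng.normal(size=n), name="outcome")

    ipw = IPW(LogisticRegression(max_iter=1000), clip_min=0.05, clip_max=0.95)
    ipw.fit(X, a)
    outcomes = ipw.estimate_population_outcome(X, a, y)
    print(outcomes)  # weighted mean outcome per treatment arm
    print(outcomes[1] - outcomes[0])  # IPW estimate of the average treatment effect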