Source code for causallib.estimation.overlap_weights

"""
(C) Copyright 2021 IBM Corp.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Created on Jun 09, 2021

"""

import warnings

import pandas as pd

from causallib.estimation.base_estimator import PopulationOutcomeEstimator
from causallib.estimation.base_weight import PropensityEstimator
from causallib.estimation.ipw import IPW

# TODO: Move the fit and _predict methods to PropensityEstimator, instead of relying on them from the ipw module.


class OverlapWeights(IPW):
    """Overlap (propensity score) weighting for exactly two treatment groups."""

    def __init__(self, learner, use_stabilized=False):
        """
        Implementation of overlap (propensity score) weighting:
        https://www.tandfonline.com/doi/full/10.1080/01621459.2016.1260466

        A method to balance observed covariates between treatment groups in observational studies.
        It puts less importance on observations with extreme propensity scores, and more emphasis
        on observations with central (i.e. overlapping) propensity scores.

        Each unit's weight is proportional to the probability of that unit being assigned to
        the opposite group:
            w_i = 1 - Pr[A=a_i|Xi]

        This method assumes only two treatment groups exist.

        Args:
            learner: Initialized sklearn model.
            use_stabilized (bool): Whether to re-weigh the learned weights with the prevalence of the treatment.
                                   See Also: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4351790/#S6title
        """
        # Forward `use_stabilized` by keyword: the parent constructor's positional order is not
        # controlled by this class, and a positional pass could bind the flag to a different
        # parameter (e.g. a clipping/truncation argument) instead of `use_stabilized`.
        super(OverlapWeights, self).__init__(learner, use_stabilized=use_stabilized)

    def compute_weight_matrix(self, X, a, clip_min=None, clip_max=None, use_stabilized=None):
        """
        Computes individual weight across all possible treatment values.
        w_ij = 1 - Pr[A=a_j | X_i] for all individual i and treatment j.

        Args:
            X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
            a (pd.Series): Treatment assignment of size (num_subjects,).
            clip_min (None|float): Lower bound for propensity scores. Better be left `None`.
            clip_max (None|float): Upper bound for propensity scores. Better be left `None`.
            use_stabilized (None|bool): Whether to re-weigh the learned weights with the prevalence of the treatment.
                                        This overrides the use_stabilized parameter provided at initialization.
                                        If True provided, but the model was initialized with use_stabilized=False,
                                        then prevalence is calculated from data at hand, rather than the prevalence
                                        from the training data.
                                        See Also: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4351790/#S6title

        Returns:
            pd.DataFrame: A matrix of size (num_subjects, num_treatments) with weight for every individual and every
                          treatment.

        Raises:
            AssertionError: If `a` does not contain exactly two unique treatment values.

        Warns:
            RuntimeWarning: If `clip_min` or `clip_max` is provided.
        """
        use_stabilized = self.use_stabilized if use_stabilized is None else use_stabilized

        # Overlap weighting is only defined here for two treatment groups:
        self.__check_number_of_classes_is_legal(a)
        # Truncation is generally bad, check and warn:
        self.__check_truncation_value_is_none(clip_min, clip_max)

        # Compute propensity scores
        probabilities = self.compute_propensity_matrix(X, a, clip_min, clip_max)

        # Each treatment's weight is the propensity of the *opposite* treatment.
        # Step 1: swap the column labels, so each label now sits on the opposite treatment's scores.
        probabilities.columns = probabilities.columns[::-1]  # Flip name-based indexing
        # Step 2: swap the column positions, restoring the original label order.
        weight_matrix = probabilities.iloc[:, ::-1]  # Flip integer (location)-based indexing

        weight_matrix = self.stabilize_weights(a, weight_matrix, use_stabilized)
        return weight_matrix

    def stabilize_weights(self, a, weight_matrix, use_stabilized=False):
        # TODO: Move this function to IPW
        """
        Adjust sample weights according to class prevalence:
        Pr[A=a_i] * w_i

        Args:
            a (pd.Series): Treatment assignment of size (num_subjects,). Each subject's treatment value
                           is mapped to that treatment's prevalence.
            weight_matrix (pd.DataFrame): Weight matrix with one row per subject, as produced in
                                          `compute_weight_matrix`.
            use_stabilized (bool): Whether to re-weigh the weights with the prevalence of the treatment.
                                   If falsy, `weight_matrix` is returned unchanged.
                                   If True and the model was initialized with use_stabilized=True, the
                                   prevalence stored on the model (`treatment_prevalence_`) is used.
                                   If True but the model was initialized with use_stabilized=False,
                                   a warning is issued and prevalence is calculated from `a`.
                                   See Also: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4351790/#S6title

        Returns:
            pd.DataFrame: A matrix of the same shape as `weight_matrix` with stabilized (if requested)
                          weight for every individual and every treatment.

        Warns:
            RuntimeWarning: If stabilization is requested but the model was initialized with
                            use_stabilized=False.
        """
        if use_stabilized:
            if self.use_stabilized:
                prevalence = self.treatment_prevalence_
            else:
                warnings.warn("Stabilized is asked, however, the model was not trained using stabilization, and "
                              "therefore, stabilized weights are taken from the provided treatment assignment.",
                              RuntimeWarning)
                prevalence = a.value_counts(normalize=True, sort=False)

            prevalence_per_subject = a.replace(prevalence)  # map tx-assign to prevalence
            # pointwise multiplication of each column in weights:
            weight_matrix = weight_matrix.multiply(prevalence_per_subject, axis="index")
        return weight_matrix

    @staticmethod
    def __check_number_of_classes_is_legal(x):
        """Raise AssertionError unless `x` holds exactly two unique values."""
        count_classes = x.nunique()
        if count_classes != 2:
            raise AssertionError("Number of unique classes should be equal 2")

    @staticmethod
    def __check_truncation_value_is_none(clip_min, clip_max):
        """Issue a RuntimeWarning if either clipping bound was provided."""
        if clip_min is not None or clip_max is not None:
            warnings.warn(
                "Trimming observations with Overlap Weighting may be redundant, "
                "as extreme observations can receive greater importance than they should.",
                RuntimeWarning
            )