Source code for causallib.estimation.matching

# (C) Copyright 2021 IBM Corp.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
import pandas as pd
import numpy as np
from itertools import permutations, combinations
from collections import namedtuple, Counter
from sklearn.covariance import EmpiricalCovariance
from sklearn.neighbors import NearestNeighbors
from sklearn.exceptions import NotFittedError
from sklearn.base import clone as sk_clone
from .base_estimator import IndividualOutcomeEstimator
from .base_weight import WeightEstimator
from scipy.optimize import linear_sum_assignment
from scipy.spatial import distance


KNN = namedtuple("KNN", "learner index")
# scipy distance routine requires matrix of valid numerical distances
# we use `VERY_LARGE_NUMBER` to represent an infinite distance
VERY_LARGE_NUMBER = np.finfo('d').max


def majority_rule(x):
    return Counter(x).most_common(1)[0][0]
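
# For example, majority_rule([0, 1, 1]) == 1; it is used to aggregate the
# outcomes of a sample's k nearest neighbors in classification tasks.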


class Matching(IndividualOutcomeEstimator, WeightEstimator):

    def __init__(
        self,
        propensity_transform=None,
        caliper=None,
        with_replacement=True,
        n_neighbors=1,
        matching_mode="both",
        metric="mahalanobis",
        knn_backend="sklearn",
        estimate_observed_outcome=False,
    ):
        """Match treatment and control samples with similar covariates.

        Args:
            propensity_transform (causallib.transformers.PropensityTransformer):
                an object for data preprocessing which adds the propensity
                score as a feature (default: None)
            caliper (float) : maximal distance for a match to be accepted. If
                not defined, all matches will be accepted. If defined, some
                samples may not be matched and their outcomes will not be
                estimated. (default: None)
            with_replacement (bool): whether samples can be used multiple
                times for matching. If set to False, the matching process
                will optimize the linear sum of distances between pairs of
                treatment and control samples and only
                `min(N_treatment, N_control)` samples will be estimated.
                Matching with no replacement does not make use of the `fit`
                data and is therefore not implemented for out-of-sample data
                (default: True)
            n_neighbors (int) : number of nearest neighbors to include in
                match. Must be 1 if `with_replacement` is `False`. If larger
                than 1, the estimate is calculated using the
                `regress_agg_function` or `classify_agg_function` across the
                `n_neighbors`. Note that when the `caliper` variable is set,
                some samples will have fewer than `n_neighbors` matches.
                (default: 1).
            matching_mode (str) : Direction of matching:
                `treatment_to_control`, `control_to_treatment` or `both` to
                indicate which set should be matched to which. All sets are
                cross-matched in `match` and when `with_replacement` is
                `False` all matching modes coincide. With replacement there
                is a difference.
            metric (str) : Distance metric string for calculating distance
                between samples. Note: if an externally built `knn_backend`
                object with a different metric is supplied, `metric` needs
                to be changed to reflect that, because `Matching` will set
                its inverse covariance matrix if "mahalanobis" is set.
                (default: "mahalanobis", also supported: "euclidean")
            knn_backend (str or callable) : Backend to use for nearest
                neighbor search. Options are "sklearn" or a callable which
                returns an object implementing `fit`, `kneighbors` and
                `set_params` like the sklearn `NearestNeighbors` object.
                (default: "sklearn").
            estimate_observed_outcome (bool) : Whether to allow a match of a
                sample to a sample other than itself when looking within its
                own treatment value. If True, the estimated potential
                outcome for the observed outcome may differ from the true
                observed outcome. (default: False)

        Attributes:
            classify_agg_function (callable) : Aggregating function for
                outcome estimation when classifying.
                (default: majority_rule)
                Usage is determined by type of `y` during `fit`
            regress_agg_function (callable) : Aggregating function for
                outcome estimation when regressing or predicting prob_a.
                (default: np.mean)
                Usage is determined by type of `y` during `fit`
            treatments_ (pd.Series) : Series of treatments
                (created after `fit`)
            outcome_ (pd.Series) : Series of outcomes (created after `fit`)
            match_df_ (pd.DataFrame) : Dataframe of most recently calculated
                matches. For details, see `match`. (created after `match`)
            samples_used_ (pd.Series) : Series with count of samples used
                during most recent match. Series includes a count for each
                treatment value. (created after `match`)
        """
        self.propensity_transform = propensity_transform
        self.covariance_conditioner = EmpiricalCovariance()
        self.caliper = caliper
        self.with_replacement = with_replacement
        self.n_neighbors = n_neighbors
        self.matching_mode = matching_mode
        self.metric = metric
        # if classify task, default aggregation function is majority
        self.classify_agg_function = majority_rule
        # if regress task, default aggregation function is mean
        self.regress_agg_function = np.mean
        self.knn_backend = knn_backend
        self.estimate_observed_outcome = estimate_observed_outcome

    def fit(self, X, a, y, sample_weight=None):
        """Load the treatments and outcomes and fit search trees.

        Applies transform to covariates X, initializes search trees for each
        treatment value for performing nearest neighbor searches.
        Note: Running `fit` a second time overwrites any information from
        previous `fit` or `match` and re-fits the propensity_transform
        object.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m
                covariates for n samples.
            a (pd.Series): Series of shape (n,) containing discrete
                treatment values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcomes for the
                n samples.
            sample_weight: IGNORED In signature for compatibility with other
                estimators.

        Note: `X`, `a` and `y` must share the same index.

        Returns:
            self (Matching) the fitted object
        """
        self._clear_post_fit_variables()
        self.outcome_ = y.copy()
        self.treatments_ = a.copy()

        if self.propensity_transform:
            self.propensity_transform.fit(X, a)
            X = self.propensity_transform.transform(X)

        self.conditioned_covariance_ = self._calculate_covariance(X)

        self.treatment_knns_ = {}
        for treatment_value in self.treatments_.unique():
            haystack = X[self.treatments_ == treatment_value]
            self.treatment_knns_[treatment_value] = self._fit_sknn(haystack)

        return self
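
    # Usage sketch (hypothetical `X`, `a`, `y` sharing one index; not part
    # of the original module). After `fit`, one search tree exists per
    # treatment value:
    #   >>> matcher = Matching(metric="euclidean")
    #   >>> matcher = matcher.fit(X, a, y)
    #   >>> sorted(matcher.treatment_knns_)  # e.g. [0, 1] for binary treatment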

    def _execute_matching(self, X, a):
        """Execute matching of samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and
        control samples from within `X` will not be matched with each other,
        unless the same `X` and `a` were passed to `fit`. For matching
        without replacement, the `X` and `a` passed to `match` provide the
        "needle" and the "haystack". If the attribute `caliper` is set, the
        matches are limited to those with a distance less than `caliper`.

        This function ignores the existing `match_df_` and will overwrite
        it. It is thus useful if you have changed the settings and need to
        rematch the samples. For most applications, the `match` function is
        more convenient.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m
                covariates for n samples.
            a (pd.Series): Series of shape (n,) containing discrete
                treatment values for the n samples.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
              `match_df.loc[treatment_value, sample_id]` has columns
               `matches` and `distances` containing lists of indices to
               samples and the respective distances for the matches
               discovered for `sample_id` from within the fitted samples
               with the given `treatment_value`. The indices in the
               `matches` column are from the fitted data, not the X argument
               in `match`. If `sample_id` had no match,
               `match_df.loc[treatment_value, sample_id].matches = []`.
               The DataFrame has shape (n * len(a.unique()), 2).

        Raises:
            NotImplementedError: Raised when with_replacement is False and
               n_neighbors is not 1.
        """
        if self.n_neighbors != 1 and not self.with_replacement:
            raise NotImplementedError(
                "Matching more than one neighbor is only implemented for "
                "matching with replacement"
            )

        if self.propensity_transform:
            X = self.propensity_transform.transform(X)

        if self.with_replacement:
            self.match_df_ = self._withreplacement_match(X, a)
        else:
            self.match_df_ = self._noreplacement_match(X, a)
        sample_id_name = X.index.name if X.index.name is not None else "sample_id"
        self.match_df_.index.set_names(
            ["match_to_treatment", sample_id_name], inplace=True
        )
        # we record the number of samples that were successfully matched of
        # each treatment value
        self.samples_used_ = self._count_samples_used_by_treatment_value(a)
        return self.match_df_

    def estimate_individual_outcome(
        self, X, a, y=None, treatment_values=None, predict_proba=True, dropna=True
    ):
        """
        Calculate the potential outcome for each sample and treatment value.

        Execute match and calculate, for each treatment value and each
        sample, the expected outcome.

        Note: Out-of-sample estimation for matching without replacement
        requires passing a `y` vector here. If no `y` is passed here, the
        values received by `fit` are used, and if the estimation indices are
        not a subset of the fitted indices, the estimation will fail.

        If the attribute `estimate_observed_outcome` is `True`, estimates
        will be calculated for the observed outcomes as well. If not, then
        the observed outcome will be passed through from the corresponding
        element of `y` passed to `fit`.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m
                covariates for n samples.
            a (pd.Series): Series of shape (n,) containing discrete
                treatment values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcome values
                for n samples. This is only used when
                `with_replacement=False`. Otherwise, the outcome values
                passed to `fit` are used.
            predict_proba (bool) : whether to output classifications or
                probabilities for a classification task. If set to False and
                data is non-integer, a warning is issued. (default True)
            dropna (bool) : For samples that were unmatched due to caliper
                restrictions, drop from outcome_df leading to a potentially
                smaller sized output, or include them as NaN. (default: True)
            treatment_values : IGNORED

        Note: The args are assumed to share the same index.

        Returns:
            outcome_df (pd.DataFrame)
        """
        match_df = self.match(X, a, use_cached_result=True)

        outcome_df = self._aggregate_match_df_to_generate_outcome_df(
            match_df, a, predict_proba)
        outcome_df = self._filter_outcome_df_by_matching_mode(outcome_df, a)
        if outcome_df.isna().all(axis=None):
            raise ValueError("Matching was not successful and no outcomes "
                             "can be estimated. Check caliper value.")
        if dropna:
            outcome_df = outcome_df.dropna()

        return outcome_df

    def match(self, X, a, use_cached_result=True, successful_matches_only=False):
        """Match the samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and
        control samples from within `X` will not be matched with each other,
        unless the same `X` and `a` were passed to `fit`. For matching
        without replacement, the `X` and `a` passed to `match` provide the
        "needle" and the "haystack". If the attribute `caliper` is set, the
        matches are limited to those with a distance less than `caliper`.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m
                covariates for n samples.
            a (pd.Series): Series of shape (n,) containing discrete
                treatment values for the n samples.
            use_cached_result (bool): Whether or not to return the
                `match_df` from the most recent matching operation. The
                cached result will only be used if the sample indices of `X`
                and those of `match_df` are identical, otherwise it will
                rematch.
            successful_matches_only (bool): Whether or not to filter the
                matches to those which matched successfully. If set to
                `False`, the resulting DataFrame will have shape
                (n * len(a.unique()), 2), otherwise it may have a smaller
                shape due to unsuccessful matches.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
              `match_df.loc[treatment_value, sample_id]` has columns
               `matches` and `distances` containing lists of indices to
               samples and the respective distances for the matches
               discovered for `sample_id` from within the fitted samples
               with the given `treatment_value`. The indices in the
               `matches` column are from the fitted data, not the X argument
               in `match`. If `sample_id` had no match,
               `match_df.loc[treatment_value, sample_id].matches = []`.
               The DataFrame has shape (n * len(a.unique()), 2), if
               `successful_matches_only` is set to `False`.

        Raises:
            NotImplementedError: Raised when with_replacement is False and
               n_neighbors is not 1.
        """
        cached_result_available = (hasattr(self, "match_df_")
                                   and X.index.equals(self.match_df_.loc[0].index))
        if not (use_cached_result and cached_result_available):
            self._execute_matching(X, a)

        return self._get_match_df(successful_matches_only=successful_matches_only)
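
    # Shape of the result (hypothetical sample ids; not part of the original
    # module):
    #   >>> match_df = matcher.match(X, a)
    #   >>> match_df.loc[1, 17].matches    # e.g. [42]: nearest treated sample
    #   >>> match_df.loc[1, 17].distances  # e.g. [0.31]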

    def matches_to_weights(self, match_df=None):
        """Calculate weights based on a given set of matches.

        For each matching from one treatment value to another, a weight
        vector is generated. The weights are calculated as the number of
        times a sample was selected in a matching, with each occurrence
        weighted according to the number of other samples in that matching.
        The weights can be used to estimate outcomes or to check covariate
        balancing. The function can only be called after `match` has been
        run.

        Args:
            match_df (pd.DataFrame) : a DataFrame of matches returned from
                `match`. If not supplied, will use the `match_df_` attribute
                if available, else raises NotFittedError. Will not execute
                `match` to generate a `match_df`.

        Returns:
            weights_df (pd.DataFrame): DataFrame of shape (n,M) where M is
                the number of permutations of `a.unique()`.
        """
        if match_df is None:
            match_df = self._get_match_df(successful_matches_only=False)

        match_permutations = sorted(permutations(self.treatments_.unique()))
        weights_df = pd.DataFrame([
            self._matches_to_weights_single_matching(s, t, match_df)
            for s, t in match_permutations],).T

        return weights_df
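
    # Usage sketch (binary treatment assumed; not part of the original
    # module). Column names follow the `{source}_to_{target}` convention of
    # `_matches_to_weights_single_matching` below:
    #   >>> weights_df = matcher.matches_to_weights()
    #   >>> weights_df["treatment_to_control"]  # one weight per sample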

    def compute_weights(self, X, a, treatment_values=None, use_stabilized=None, **kwargs):
        """Calculate weights based on a given set of matches.

        Only applicable for `matching_mode` "control_to_treatment" or
        "treatment_to_control".

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m
                covariates for n samples.
            a (pd.Series): Series of shape (n,) containing discrete
                treatment values for the n samples.
            treatment_values: IGNORED.
            use_stabilized: IGNORED.
            **kwargs:

        Returns:
            pd.Series: a Series of shape (n,) with a weight per sample.

        Raises:
            ValueError if `Matching().matching_mode == 'both'`.
        """
        if self.matching_mode == "both":
            raise ValueError(
                f"Matching mode {self.matching_mode} is not supported for "
                f"weight calculation. Please use 'control_to_treatment' "
                f"or 'treatment_to_control'."
            )
        self.match(X, a, successful_matches_only=False)
        w = self.matches_to_weights()
        w = w[self.matching_mode]
        return w

    def compute_weight_matrix(self, X, a, use_stabilized=None, **kwargs):
        # TODO: is that so?
        raise NotImplementedError(
            "Weight matrix is not supported for a Matching estimator.")

    def get_covariates_of_matches(self, s, t, covariates):
        """
        Look up covariates of closest matches for a given matching.

        Using `self.match_df_` and the supplied `covariates`, look up the
        covariates of the last match. The function can only be called after
        `match` has been run.

        Args:
            s (int) : source treatment value
            t (int) : target treatment value
            covariates (pd.DataFrame) : The same covariates which were
                passed to `fit`.

        Returns:
            covariate_df (pd.DataFrame) : a DataFrame of size
                (n_matched_samples, n_covariates * 3 + 2) with the covariate
                values of the sample, covariates of its match, calculated
                distance and number of neighbors found within the given
                caliper (with no caliper this will equal self.n_neighbors)
        """
        match_df = self._get_match_df()
        subdf = match_df.loc[s][self.treatments_ == t]
        sample_id_name = subdf.index.name

        def get_covariate_difference_from_nearest_match(source_row_index):
            j = subdf.loc[source_row_index].matches[0]
            delta_series = pd.Series(
                covariates.loc[source_row_index] - covariates.loc[j])
            source_row = covariates.loc[j].copy()
            source_row.at[sample_id_name] = j
            target_row = covariates.loc[source_row_index].copy()
            covariate_differences = pd.concat(
                {
                    t: target_row,
                    s: source_row,
                    "delta": delta_series,
                    "outcomes": pd.Series(
                        {t: self.outcome_.loc[source_row_index],
                         s: self.outcome_.loc[j]}
                    ),
                    "match": pd.Series(
                        dict(
                            n_neighbors=len(
                                subdf.loc[source_row_index].matches),
                            distance=subdf.loc[source_row_index].distances[0],
                        )
                    ),
                }
            )
            return covariate_differences

        covdf = pd.DataFrame(
            data=[get_covariate_difference_from_nearest_match(i)
                  for i in subdf.index],
            index=subdf.index,
        )
        covdf = covdf.reset_index()
        cols = covdf.columns
        covdf.columns = pd.MultiIndex.from_tuples(
            [(t, sample_id_name)] + list(cols[1:]))
        return covdf
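
    # Usage sketch (binary treatment assumed; not part of the original
    # module): inspect match quality after running `match`:
    #   >>> covdf = matcher.get_covariates_of_matches(0, 1, X)
    #   >>> covdf["match"]["distance"].describe()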

    def _clear_post_fit_variables(self):
        for var in list(vars(self)):
            if var[-1] == "_":
                self.__delattr__(var)

    def _calculate_covariance(self, X):
        if len(X.shape) > 1 and X.shape[1] > 1:
            V_list = []
            for a in self.treatments_.unique():
                X_at_a = X[self.treatments_ == a].copy()
                current_V = self.covariance_conditioner.fit(X_at_a).covariance_
                V_list.append(current_V)
            # following Imbens&Rubin, we average across treatment groups
            V = np.mean(V_list, axis=0)
        else:
            # for 1d data revert to euclidean metric
            V = np.array(1).reshape(1, 1)
        return V

    def _aggregate_match_df_to_generate_outcome_df(self, match_df, a, predict_proba):
        agg_function = self._get_agg_function(predict_proba)

        def outcome_from_matches_by_idx(x):
            return agg_function(self.outcome_.loc[x])

        outcomes = {}
        for i in sorted(a.unique()):
            outcomes[i] = match_df.loc[i].matches.apply(
                outcome_from_matches_by_idx)
        outcome_df = pd.DataFrame(outcomes)
        return outcome_df

    def _get_match_df(self, successful_matches_only=True):
        if not hasattr(self, "match_df_") or self.match_df_ is None:
            raise NotFittedError("You need to run `match` first")
        match_df = self.match_df_.copy()
        if successful_matches_only:
            match_df = match_df[match_df.matches.apply(bool)]
        if match_df.empty:
            raise ValueError(
                "Matching was not successful and no outcomes can be "
                "estimated. Check caliper value."
            )
        return match_df

    def _filter_outcome_df_by_matching_mode(self, outcome_df, a):
        if self.matching_mode == "treatment_to_control":
            outcome_df = outcome_df[a == 1]
        elif self.matching_mode == "control_to_treatment":
            outcome_df = outcome_df[a == 0]
        elif self.matching_mode == "both":
            pass
        else:
            raise NotImplementedError(
                "Matching mode {} is not implemented. Please select one of "
                "'treatment_to_control', 'control_to_treatment', "
                "or 'both'.".format(self.matching_mode)
            )
        return outcome_df

    def _get_agg_function(self, predict_proba):
        if predict_proba:
            agg_function = self.regress_agg_function
        else:
            agg_function = self.classify_agg_function
            try:
                isoutputinteger = np.allclose(
                    self.outcome_.apply(int), self.outcome_)
                if not isoutputinteger:
                    warnings.warn(
                        "Classifying non-categorical outcomes. "
                        "This is probably a mistake."
                    )
            except Exception:
                warnings.warn(
                    "Unable to detect whether outcome is integer-like. ")
        return agg_function

    def _instantiate_nearest_neighbors_object(self):
        backend = self.knn_backend
        if backend == "sklearn":
            backend_instance = NearestNeighbors(algorithm="auto")
        elif callable(backend):
            backend_instance = backend()
            self.metric = backend_instance.metric
        elif hasattr(backend, "fit") and hasattr(backend, "kneighbors"):
            backend_instance = sk_clone(backend)
            self.metric = backend_instance.metric
        else:
            raise NotImplementedError(
                "`knn_backend` must be either a NearestNeighbors-like object,"
                " a callable returning such an object, or the string \"sklearn\"")
        backend_instance.set_params(**self._get_metric_dict())
        return backend_instance

    def _fit_sknn(self, target_df):
        """
        Fit scikit-learn NearestNeighbors object with samples in target_df.

        Fits object, adds metric parameters and returns namedtuple which
        also includes DataFrame indices so that identities can be looked up.

        Args:
            target_df (pd.DataFrame) : DataFrame of covariates to fit

        Returns:
            KNN (namedtuple) : Namedtuple with members `learner` and `index`
                containing the fitted sklearn object and an index lookup
                vector, respectively.
        """
        target_array = target_df.values

        sknn = self._instantiate_nearest_neighbors_object()

        target_array = self._ensure_array_columnlike(target_array)

        sknn.fit(target_array)
        return KNN(sknn, target_df.index)

    @staticmethod
    def _ensure_array_columnlike(target_array):
        if len(target_array.shape) < 2 or target_array.shape[1] == 1:
            target_array = target_array.reshape(-1, 1)
        return target_array

    def _get_metric_dict(
        self,
        VI_in_metric_params=True,
    ):
        metric_dict = dict(metric=self.metric)
        if self.metric == "mahalanobis":
            VI = np.linalg.inv(self.conditioned_covariance_)
            if VI_in_metric_params:
                metric_dict["metric_params"] = {"VI": VI}
            else:
                metric_dict["VI"] = VI
        return metric_dict

    def _kneighbors(self, knn, source_df, n_neighbors):
        """Lookup neighbors in knn object.

        Args:
            knn (namedtuple) : knn named tuple to look for neighbors in. The
                object has `learner` and `index` attributes to reference the
                original df index.
            source_df (pd.DataFrame) : a DataFrame of source data points to
                use as "needles" for the knn "haystack."
            n_neighbors (int) : number of neighbors to look up.

        Returns:
            match_df (pd.DataFrame) : a DataFrame of matches
        """
        source_array = source_df.values
        # 1d data must be in shape (-1, 1) for sklearn.knn
        source_array = self._ensure_array_columnlike(source_array)

        distances, neighbor_array_indices = knn.learner.kneighbors(
            source_array, n_neighbors=n_neighbors
        )
        return self._generate_match_df(
            source_df, knn.index, distances, neighbor_array_indices
        )

    def _generate_match_df(
        self, source_df, target_df_index, distances, neighbor_array_indices
    ):
        """
        Take results of matching and build into match_df DataFrame.

        For clarity we'll call the samples that are being matched "needles"
        and the set of samples that they looked for matches in the
        "haystack".

        Args:
            source_df (pd.DataFrame) : Covariate dataframe of N "needles"
            target_df_index (np.array) : An array of M indices of the
                haystack samples in their original dataframe.
            distances (np.array) : An array of N arrays of floats of length
                K where K is `self.n_neighbors`.
            neighbor_array_indices (np.array) : An array of N arrays of ints
                of length K where K is `self.n_neighbors`.
        """
        # target is the haystack, source is the needle(s)
        # translate array indices back to original indices
        matches_dict = {}
        for source_idx, distance_row, neighbor_array_index_row in zip(
            source_df.index, distances, neighbor_array_indices
        ):
            neighbor_df_indices = \
                target_df_index[neighbor_array_index_row.flatten()]
            if self.caliper is not None:
                neighbor_df_indices = [
                    n for i, n in enumerate(neighbor_df_indices)
                    if distance_row[i] < self.caliper
                ]
                distance_row = [d for d in distance_row if d < self.caliper]
            matches_dict[source_idx] = dict(
                matches=list(neighbor_df_indices),
                distances=list(distance_row)
            )
        # convert dict of dicts like {1: {'matches': [], 'distances': []}} to df
        return pd.DataFrame(matches_dict).T

    def _matches_to_weights_single_matching(self, s, t, match_df):
        """
        For a given match, calculate the resulting weight vector.

        The weight vector adds a count each time a sample is used, weighted
        by the number of other neighbors when it was used. This is necessary
        to make the weighted sum return the correct effect estimate.
        """
        weights = pd.Series(self.treatments_.copy() * 0)
        name = {0: "control", 1: "treatment"}
        weights.name = "{s}_to_{t}".format(s=name[s], t=name[t])
        s_to_t_matches = match_df.loc[t][self.treatments_ == s].matches
        for source_idx, matches_list in s_to_t_matches.items():
            if matches_list:
                weights.loc[source_idx] += 1
            for match in matches_list:
                weights.loc[match] += 1 / len(matches_list)
        return weights

    def _get_distance_matrix(self, source_df, target_df):
        """
        Create distance matrix for no replacement match.

        Combines metric, caliper and source/target data into a precalculated
        distance matrix which can be passed to
        scipy.optimize.linear_sum_assignment.
        """
        cdist_args = dict(
            XA=self._ensure_array_columnlike(source_df.values),
            XB=self._ensure_array_columnlike(target_df.values),
        )
        cdist_args.update(self._get_metric_dict(False))
        distance_matrix = distance.cdist(**cdist_args)

        if self.caliper is not None:
            distance_matrix[distance_matrix > self.caliper] = VERY_LARGE_NUMBER
        return distance_matrix

    def _withreplacement_match(self, X, a):
        matches = {}  # maps treatment value to list of matches TO that value

        for treatment_value, knn in self.treatment_knns_.items():
            n_matchable = sum(a == treatment_value)
            if n_matchable < self.n_neighbors:
                n_neighbors = n_matchable
                warnings.warn(
                    f"Not enough matchable samples in treatment group "
                    f"{treatment_value}. Reducing `n_neighbors` for this "
                    f"direction to {n_neighbors}."
                )
            else:
                n_neighbors = self.n_neighbors

            matches[treatment_value] = self._kneighbors(knn, X, n_neighbors)
            # when producing potential outcomes we may want to force the
            # value of the observed outcome to be the actual observed
            # outcome, and not an average of the k nearest samples.
            if not self.estimate_observed_outcome:

                def limit_within_treatment_matches_to_self_only(row):
                    if (
                        a.loc[row.name] == treatment_value
                        and row.name in row.matches
                    ):
                        row.matches = [row.name]
                        row.distances = [0]
                    return row

                matches[treatment_value] = matches[treatment_value].apply(
                    limit_within_treatment_matches_to_self_only, axis=1
                )

        return pd.concat(matches, sort=True)

    def _noreplacement_match(self, X, a):
        match_combinations = sorted(combinations(a.unique(), 2))
        matches = {}

        for s, t in match_combinations:
            distance_matrix = self._get_distance_matrix(X[a == s], X[a == t])
            source_array, neighbor_array_indices, distances = \
                self._optimally_match_distance_matrix(distance_matrix)
            source_df = X[a == s].iloc[np.array(source_array)]
            target_df = X[a == t].iloc[np.array(neighbor_array_indices)]
            if t in matches or s in matches:
                warnings.warn(
                    "No-replacement matching for more than "
                    "2 treatment values is not supported"
                )

            matches[t] = self._create_match_df_for_no_replacement(
                a, source_df, target_df, distances
            )
            matches[s] = self._create_match_df_for_no_replacement(
                a, target_df, source_df, distances
            )

        match_df = pd.concat(matches, sort=True)
        return match_df

    def _optimally_match_distance_matrix(self, distance_matrix):
        source_array, neighbor_array_indices = linear_sum_assignment(
            distance_matrix
        )
        distances = [
            [distance_matrix[s_idx, t_idx]]
            for s_idx, t_idx in zip(source_array, neighbor_array_indices)
        ]
        source_array, neighbor_array_indices, distances = \
            self._filter_noreplacement_matches_using_caliper(
                source_array, neighbor_array_indices, distances)
        return source_array, neighbor_array_indices, distances

    def _filter_noreplacement_matches_using_caliper(
            self, source_array, neighbor_array_indices, distances):
        if self.caliper is None:
            return source_array, neighbor_array_indices, distances
        keep_indices = [i for i, d in enumerate(distances)
                        if d[0] <= self.caliper]
        source_array = source_array[keep_indices]
        neighbor_array_indices = neighbor_array_indices[keep_indices]
        distances = [distances[i] for i in keep_indices]
        if not keep_indices:
            warnings.warn("No matches found, check caliper. "
                          "No estimation possible.")
        return source_array, neighbor_array_indices, distances

    @staticmethod
    def _create_match_df_for_no_replacement(
        base_series, source_df, target_df, distances
    ):
        match_sub_df = pd.DataFrame(
            index=base_series.index,
            columns=[
                "matches",
                "distances",
            ],
            data=base_series.apply(lambda x: pd.Series([[], []])).values,
            dtype="object",
        )

        # matching from source to target: read distances
        match_sub_df.loc[source_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=distances,
            ),
            index=source_df.index,
        )

        # matching from target to itself: fill with zero distances
        match_sub_df.loc[target_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=[[0]] * len(distances),
            ),
            index=target_df.index,
        )

        return match_sub_df

    def _count_samples_used_by_treatment_value(self, a):
        # we record the number of samples that were successfully matched of
        # each treatment value
        samples_used = {
            treatment_value:
                self.match_df_.loc[treatment_value][a != treatment_value]
                .matches.apply(bool).sum()
            for treatment_value in sorted(a.unique(), reverse=True)
        }

        return pd.Series(samples_used)
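

# No-replacement matching sketch (not part of the original module): the
# optimal pairing minimizes the total distance between pairs, e.g.:
#   >>> from scipy.optimize import linear_sum_assignment
#   >>> import numpy as np
#   >>> D = np.array([[1.0, 2.0], [2.0, 0.5]])
#   >>> rows, cols = linear_sum_assignment(D)  # pairs (0, 0) and (1, 1)
#   >>> D[rows, cols].sum()                    # 1.5, the minimal total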


class PropensityMatching(Matching):
    def __init__(self, learner, **kwargs):
        """Matching on propensity score only.

        This is a convenience class to execute the common task of propensity
        score matching. It shares all of the methods of the `Matching` class
        but offers a shortcut for initialization.

        Args:
            learner (sklearn.estimator) : a trainable propensity model that
                implements `fit` and `predict_proba`. Will be passed to the
                `PropensityTransformer` object.
            **kwargs : see Matching.__init__ for supported kwargs.
        """
        from causallib.preprocessing.transformers import PropensityTransformer

        super().__init__(**kwargs)
        self.learner = learner
        self.propensity_transform = PropensityTransformer(
            include_covariates=False, learner=self.learner
        )
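

# Minimal end-to-end sketch (not part of the original module): synthetic
# data with a hypothetical logistic-regression propensity model. The names
# `X`, `a`, `y` and the sample sizes below are illustrative only.
if __name__ == "__main__":
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(0)
    n = 200
    # covariates, a confounded binary treatment, and a continuous outcome
    X = pd.DataFrame(rng.normal(size=(n, 3)), columns=["x1", "x2", "x3"])
    a = pd.Series((X["x1"] + rng.normal(size=n) > 0).astype(int))
    y = pd.Series(X["x1"] + 2 * a + rng.normal(size=n))

    matcher = PropensityMatching(learner=LogisticRegression(max_iter=1000))
    matcher.fit(X, a, y)
    potential_outcomes = matcher.estimate_individual_outcome(X, a)
    # columns are treatment values; their difference estimates the effect
    print((potential_outcomes[1] - potential_outcomes[0]).mean())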