Source code for causallib.estimation.matching

# (C) Copyright 2021 IBM Corp.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
import pandas as pd
import numpy as np
from itertools import permutations, combinations
from collections import namedtuple, Counter
from sklearn.covariance import EmpiricalCovariance
from sklearn.neighbors import NearestNeighbors
from sklearn.exceptions import NotFittedError
from sklearn.base import clone as sk_clone
from .base_estimator import IndividualOutcomeEstimator
from .base_weight import WeightEstimator
from scipy.optimize import linear_sum_assignment
from scipy.spatial import distance


KNN = namedtuple("KNN", "learner index")
# scipy distance routine requires matrix of valid numerical distances
# we use `VERY_LARGE_NUMBER` to represent an infinite distance
VERY_LARGE_NUMBER = np.finfo('d').max


[docs]def majority_rule(x):
    return Counter(x).most_common(1)[0][0]


[docs]class Matching(IndividualOutcomeEstimator, WeightEstimator):

    def __init__(
        self,
        propensity_transform=None,
        caliper=None,
        with_replacement=True,
        n_neighbors=1,
        matching_mode="both",
        metric="mahalanobis",
        knn_backend="sklearn",
        estimate_observed_outcome=False,
    ):
        """Match treatment and control samples with similar covariates.

        Args:
            propensity_transform (causallib.transformers.PropensityTransformer):
                an object for data preprocessing which adds the propensity
                score as a feature (default: None)
            caliper (float) : maximal distance for a match to be accepted. If
                not defined, all matches will be accepted. If defined, some
                samples may not be matched and their outcomes will not be
                estimated. (default: None)
            with_replacement (bool): whether samples can be used multiple times
                for matching. If set to False, the matching process will optimize
                the linear sum of distances between pairs of treatment and
                control samples and only `min(N_treatment, N_control)` samples
                will be estimated. Matching with no replacement does not make
                use of the `fit` data and is therefore not implemented for
                out-of-sample data (default: True)
            n_neighbors (int) : number of nearest neighbors to include in match.
                Must be 1 if `with_replacement` is `False.` If larger than 1, the
                estimate is calculated using the `regress_agg_function` or 
                `classify_agg_function` across the `n_neighbors`. Note that when
                the `caliper` variable is set, some samples will have fewer than
                `n_neighbors` matches. (default: 1).
            matching_mode (str) : Direction of matching: `treatment_to_control`,
                `control_to_treatment` or `both` to indicate which set should
                be matched to which. All sets are cross-matched in `match`
                and when `with_replacement` is `False` all matching modes 
                coincide. With replacement there is a difference.
            metric (str) : Distance metric string for calculating distance
                between samples. Note: if an external built `knn_backend`
                object with a different metric is supplied, `metric` needs to
                be changed to reflect that, because `Matching` will set its 
                inverse covariance matrix if "mahalanobis" is set. (default: 
                "mahalanobis", also supported: "euclidean")
            knn_backend (str or callable) : Backend to use for nearest neighbor
                search. Options are "sklearn"  or a callable  which returns an 
                object implementing `fit`, `kneighbors` and `set_params` 
                like the sklearn `NearestNeighbors` object. (default: "sklearn"). 
            estimate_observed_outcome (bool) : Whether to allow a match of a
                sample to a sample other than itself when looking within its own
                treatment value. If True, the estimated potential outcome for the
                observed outcome may differ from the true observed outcome.
                (default: False)

        Attributes:
            classify_agg_function (callable) : Aggregating function for outcome
                estimation when classifying. (default: majority_rule)
                Usage is determined by type of `y` during `fit`
            regress_agg_function (callable) : Aggregating function for outcome
                estimation when regressing or predicting prob_a. (default: np.mean)
                Usage is determined by type of `y` during `fit`
            treatments_ (pd.DataFrame) : DataFrame of treatments (created after `fit`)
            outcomes_ (pd.DataFrame) : DataFrame of outcomes (created after `fit`)
            match_df_ (pd.DataFrame) : Dataframe of most recently calculated
                matches. For details, see `match`. (created after `match`)
            samples_used_ (pd.Series) : Series with count of samples used
                during most recent match. Series includes a count for each
                treatment value. (created after `match`)
        """
        self.propensity_transform = propensity_transform
        self.covariance_conditioner = EmpiricalCovariance()
        self.caliper = caliper
        self.with_replacement = with_replacement
        self.n_neighbors = n_neighbors
        self.matching_mode = matching_mode
        self.metric = metric
        # if classify task, default aggregation function is majority
        self.classify_agg_function = majority_rule
        # if regress task,  default aggregation function is mean
        self.regress_agg_function = np.mean
        self.knn_backend = knn_backend
        self.estimate_observed_outcome = estimate_observed_outcome

[docs]    def fit(self, X, a, y, sample_weight=None):
        """Load the treatments and outcomes and fit search trees.

        Applies transform to covariates X, initializes search trees for each
        treatment value for performing nearest neighbor searches.
        Note: Running `fit` a second time overwrites any information from
        previous `fit or `match` and re-fits the propensity_transform object.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcomes for
                the n samples.
            sample_weight: IGNORED In signature for compatibility with other
                estimators.


        Note: `X`, `a` and `y` must share the same index.

        Returns:
            self (Matching) the fitted object
        """
        self._clear_post_fit_variables()
        self.outcome_ = y.copy()
        self.treatments_ = a.copy()

        if self.propensity_transform:
            self.propensity_transform.fit(X, a)
            X = self.propensity_transform.transform(X)

        self.conditioned_covariance_ = self._calculate_covariance(X)

        self.treatment_knns_ = {}
        for a in self.treatments_.unique():
            haystack = X[self.treatments_ == a]
            self.treatment_knns_[a] = self._fit_sknn(haystack)

        return self

    def _execute_matching(self, X, a):
        """Execute matching of samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and control 
        samples from within `X` will not be matched with each other, unless
        the same `X` and `a` were passed to `fit`. For matching without
        replacement, the `X` and `a` passed to `match` provide the "needle" and
        the "haystack". If the attribute `caliper` is set, the matches are
        limited to those with a distance less than `caliper`.

        This function ignores the existing `match_df_` and will overwrite it.
        It is thus useful for if you have changed the settings and need to
        rematch the samples. For most applications, the `match` function is
        more convenient.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
              ` match_df.loc[treatment_value, sample_id]` has columns `matches`
               and `distances` containing lists of indices to samples and the
               respective distances for the matches discovered for `sample_id`
               from within the fitted samples with the given `treatment_value`.
               The indices in the `matches` column are from the fitted data,
               not the X argument in `match`. If `sample_id` had no match,
               `match_df.loc[treatment_value, sample_id].matches = []`.
               The DataFrame has shape (n* len(a.unique()), 2 ).

        Raises:
            NotImplementedError: Raised when with_replacement is False and
               n_neighbors is not 1.
        """
        if self.n_neighbors != 1 and not self.with_replacement:
            raise NotImplementedError(
                "Matching more than one neighbor is only implemented for"
                "no-replacement"
            )

        if self.propensity_transform:
            X = self.propensity_transform.transform(X)
        if self.with_replacement:
            self.match_df_ = self._withreplacement_match(X, a)
        else:
            self.match_df_ = self._noreplacement_match(X, a)
        sample_id_name = X.index.name if X.index.name is not None else "sample_id"
        self.match_df_.index.set_names(
            ["match_to_treatment", sample_id_name], inplace=True
        )
        # we record the number of samples that were successfully matched of
        # each treatment value
        self.samples_used_ = self._count_samples_used_by_treatment_value(a)

        return self.match_df_

[docs]    def estimate_individual_outcome(
        self, X, a, y=None, treatment_values=None, predict_proba=True, dropna=True
    ):
        """
        Calculate the potential outcome for each sample and treatment value.

        Execute match and calculate, for each treatment value and each sample,
        the expected outcome. 

        Note: Out of sample estimation for matching without replacement requires
        passing a `y` vector here. If no 'y' is passed here, the values received
        by `fit` are used, and if the estimation indices are not a subset of the 
        fitted indices, the estimation will fail.

        If the attribute `estimate_observed_outcome` is 
        `True`, estimates will be calculated for the observed outcomes as well.
        If not, then the observed outcome will be passed through from the 
        corresponding element of `y` passed to `fit`.


        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcome values for
                n samples. This is only used when `with_replacemnt=False`. 
                Otherwise, the outcome values passed to `fit` are used.
            predict_proba (bool) : whether to output classifications or
                probabilties for a classification task. If set to False and
                data is non-integer, a warning is issued. (default True)
            dropna (bool) : For samples that were unmatched due to caliper
                restrictions, drop from outcome_df leading to a potentially
                smaller sized output, or include them as NaN. (default: True)
            treatment_values : IGNORED

        Note: The args are assumed to share the same index.

        Returns:
            outcome_df (pd.DataFrame)
        """
        match_df = self.match(X, a, use_cached_result=True)

        outcome_df = self._aggregate_match_df_to_generate_outcome_df(
            match_df, a, predict_proba)
        outcome_df = self._filter_outcome_df_by_matching_mode(outcome_df, a)
        if outcome_df.isna().all(axis=None):
            raise ValueError("Matching was not successful and no outcomes can"
                             "be estimated. Check caliper value.")
        if dropna:
            outcome_df = outcome_df.dropna()
        
        return outcome_df

[docs]    def match(self, X, a, use_cached_result=True, successful_matches_only=False):
        """Matching the samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and control 
        samp    les from within `X` will not be matched with each other, unless
        the same `X` and `a` were passed to `fit`. For matching without
        replacement, the `X` and `a` passed to `match` provide the "needle" and
        the "haystack". If the attribute `caliper` is set, the matches are
        limited to those with a distance less than `caliper`.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            use_cached_result (bool): Whether or not to return the `match_df` 
                from the most recent matching operation. The cached result will
                only be used if the sample indices of `X` and those of `match_df`
                are identical, otherwise it will rematch.
            successful_matches_only (bool): Whether or not to filter the matches
                to those which matched successfully. If set to `False`, the
                resulting DataFrame will have shape (n* len(a.unique()), 2 ),
                otherwise it may have a smaller shape due to unsuccessful matches.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
              ` match_df.loc[treatment_value, sample_id]` has columns `matches`
               and `distances` containing lists of indices to samples and the
               respective distances for the matches discovered for `sample_id`
               from within the fitted samples with the given `treatment_value`.
               The indices in the `matches` column are from the fitted data,
               not the X argument in `match`. If `sample_id` had no match,
               `match_df.loc[treatment_value, sample_id].matches = []`.
               The DataFrame has shape (n* len(a.unique()), 2 ), if
               `successful_matches_only` is set to `False.

        Raises:
            NotImplementedError: Raised when with_replacement is False and
               n_neighbors is not 1.
        """
        cached_result_available = (hasattr(self, "match_df_")
                                   and X.index.equals(self.match_df_.loc[0].index))
        if not (use_cached_result and cached_result_available):
            self._execute_matching(X, a)

        return self._get_match_df(successful_matches_only=successful_matches_only)

[docs]    def matches_to_weights(self, match_df=None):
        """Calculate weights based on a given set of matches.

        For each matching from one treatment value to another, a weight vector
        is generated. The weights are calculated as the number of times a
        sample was selected in a matching, with each occurrence weighted
        according to the number of other samples in that matching. The weights
        can be used to estimate outcomes or to check covariate balancing. The 
        function can only be called after `match` has been run.

        Args:
            match_df (pd.DataFrame) : a DataFrame of matches returned from
                `match`. If not supplied, will use the `match_df_` attribute if
                available, else raises ValueError. Will not execute `match` to
                generate a `match_df`.

        Returns:
            weights_df (pd.DataFrame): DataFrame of shape (n,M) where M is the
                number of permutations of `a.unique()`.
        """
        if match_df is None:
            match_df = self._get_match_df(successful_matches_only=False)

        match_permutations = sorted(permutations(self.treatments_.unique()))
        weights_df = pd.DataFrame([
            self._matches_to_weights_single_matching(s, t, match_df)
            for s, t in match_permutations],).T

        return weights_df

[docs]    def compute_weights(self, X, a, treatment_values=None, use_stabilized=None, **kwargs):
        """Calculate weights based on a given set of matches.

        Only applicable for `matching_mode` "control_to_treatment"
        or "treatment_to_control".

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            treatment_values: IGNORED.
            use_stabilized: IGNORED.
            **kwargs:

        Returns:
            pd.Series: a Series of shape (n,) with a weight per sample.

        Raises:
            ValueError if `Matching().matching_mode == 'both'`.
        """
        self.match(X, a, successful_matches_only=False)

        w = self.matches_to_weights()
        if self.matching_mode == "both":
            raise ValueError(
                f"Matching mode {self.matching_mode} is not supported for weight calculation."
                f"Please use 'control_to_treatment' or 'treatment_to_control'."
            )
        w = w[self.matching_mode]
        return w

[docs]    def compute_weight_matrix(self, X, a, use_stabilized=None, **kwargs):
        raise NotImplementedError("Weight matrix is not supported for a Matching estimator.")
        # TODO: is that so?

[docs]    def get_covariates_of_matches(self, s, t, covariates):
        """
        Look up covariates of closest matches for a given matching.

        Using `self.match_df_` and the supplied `covariates`, look up
        the covariates of the last match. The function can only be called after
        `match` has been run.

            Args:
                s (int) : source treatment value
                t (int) : target treatment value
                covariates (pd.DataFrame) : The same covariates which were
                   passed to `fit`.

            Returns:
                covariate_df (pd.DataFrame) : a DataFrame of size
                (n_matched_samples, n_covariates * 3 + 2) with the covariate
                values of the sample, covariates of its match, calculated
                distance and number of neighbors found within the given
                caliper (with no caliper this will equal self.n_neighbors )

        """
        match_df = self._get_match_df()
        subdf = match_df.loc[s][self.treatments_ == t]
        sample_id_name = subdf.index.name

        def get_covariate_difference_from_nearest_match(source_row_index):
            j = subdf.loc[source_row_index].matches[0]
            delta_series = pd.Series(
                covariates.loc[source_row_index] - covariates.loc[j])
            source_row = covariates.loc[j].copy()
            source_row.at[sample_id_name] = j
            target_row = covariates.loc[source_row_index].copy()
            target_row = target_row
            covariate_differences = pd.concat(
                {
                    t: target_row,
                    s: source_row,
                    "delta": delta_series,
                    "outcomes": pd.Series(
                        {t: self.outcome_.loc[source_row_index],
                            s: self.outcome_.loc[j]}
                    ),
                    "match": pd.Series(
                        dict(
                            n_neighbors=len(
                                subdf.loc[source_row_index].matches),
                            distance=subdf.loc[source_row_index].distances[0],
                        )
                    ),
                }
            )
            return covariate_differences

        covdf = pd.DataFrame(
            data=[get_covariate_difference_from_nearest_match(i)
                  for i in subdf.index], index=subdf.index
        )
        covdf = covdf.reset_index()
        cols = covdf.columns
        covdf.columns = pd.MultiIndex.from_tuples(
            [(t, sample_id_name)] + list(cols[1:]))
        return covdf

    def _clear_post_fit_variables(self):
        for var in list(vars(self)):
            if var[-1] == "_":
                self.__delattr__(var)

    def _calculate_covariance(self, X):
        if len(X.shape) > 1 and X.shape[1] > 1:
            V_list = []
            for a in self.treatments_.unique():
                X_at_a = X[self.treatments_ == a].copy()
                current_V = self.covariance_conditioner.fit(X_at_a).covariance_
                V_list.append(current_V)
            # following Imbens&Rubin, we average across treatment groups
            V = np.mean(V_list, axis=0)
        else:
            # for 1d data revert to euclidean metric
            V = np.array(1).reshape(1, 1)
        return V

    def _aggregate_match_df_to_generate_outcome_df(self, match_df, a, predict_proba):
        agg_function = self._get_agg_function(predict_proba)

        def outcome_from_matches_by_idx(x):
            return agg_function(self.outcome_.loc[x])

        outcomes = {}
        for i in sorted(a.unique()):
            outcomes[i] = match_df.loc[i].matches.apply(
                outcome_from_matches_by_idx)
        outcome_df = pd.DataFrame(outcomes)
        return outcome_df

    def _get_match_df(self, successful_matches_only=True):
        if not hasattr(self, "match_df_") or self.match_df_ is None:
            raise NotFittedError("You need to run `match` first")
        match_df = self.match_df_.copy()
        if successful_matches_only:
            match_df = match_df[match_df.matches.apply(bool)]
        if match_df.empty:
            raise ValueError(
                "Matching was not successful and no outcomes can be "
                "estimated. Check caliper value."
            )
        return match_df

    def _filter_outcome_df_by_matching_mode(self, outcome_df, a):
        if self.matching_mode == "treatment_to_control":
            outcome_df = outcome_df[a == 1]
        elif self.matching_mode == "control_to_treatment":
            outcome_df = outcome_df[a == 0]
        elif self.matching_mode == "both":
            pass
        else:
            raise NotImplementedError(
                "Matching mode {} is not implemented. Please select one of "
                "'treatment_to_control', 'control_to_treatment, "
                "or 'both'.".format(self.matching_mode)
            )
        return outcome_df

    def _get_agg_function(self, predict_proba):
        if predict_proba:
            agg_function = self.regress_agg_function
        else:
            agg_function = self.classify_agg_function
            try:
                isoutputinteger = np.allclose(
                    self.outcome_.apply(int), self.outcome_)
                if not isoutputinteger:
                    warnings.warn(
                        "Classifying non-categorical outcomes. "
                        "This is probably a mistake."
                    )
            except:
                warnings.warn(
                    "Unable to detect whether outcome is integer-like. ")
        return agg_function

    def _instantiate_nearest_neighbors_object(self):
        backend = self.knn_backend
        if backend == "sklearn":
            backend_instance = NearestNeighbors(algorithm="auto")
        elif callable(backend):
            backend_instance = backend()
            self.metric = backend_instance.metric
        elif hasattr(backend, "fit") and hasattr(backend, "kneighbors"):
            backend_instance = sk_clone(backend)
            self.metric = backend_instance.metric
        else:
            raise NotImplementedError(
                "`knn_backend` must be either an NearestNeighbors-like object,"
                " a callable returning such an object, or the string \"sklearn\"")
        backend_instance.set_params(**self._get_metric_dict())
        return backend_instance

    def _fit_sknn(self, target_df):
        """
        Fit scikit-learn NearestNeighbors object with samples in target_df.

        Fits object, adds metric parameters and returns namedtuple which
        also includes DataFrame indices so that identities can looked up.

        Args:
            target_df (pd.DataFrame) : DataFrame of covariates to fit

        Returns:
            KNN (namedtuple) : Namedtuple with members `learner` and `index`
            containing the fitted sklearn object and an index lookup vector,
            respectively.
        """
        target_array = target_df.values

        sknn = self._instantiate_nearest_neighbors_object()

        target_array = self._ensure_array_columnlike(target_array)

        sknn.fit(target_array)
        return KNN(sknn, target_df.index)

    @staticmethod
    def _ensure_array_columnlike(target_array):
        if len(target_array.shape) < 2 or target_array.shape[1] == 1:
            target_array = target_array.reshape(-1, 1)
        return target_array

    def _get_metric_dict(
        self,
        VI_in_metric_params=True,
    ):
        metric_dict = dict(metric=self.metric)
        if self.metric == "mahalanobis":
            VI = np.linalg.inv(self.conditioned_covariance_)
            if VI_in_metric_params:
                metric_dict["metric_params"] = {"VI": VI}
            else:
                metric_dict["VI"] = VI

        return metric_dict

    def _kneighbors(self, knn, source_df, n_neighbors):
        """Lookup neighbors in knn object.

        Args:
           knn (namedtuple) : knn named tuple to look for neighbors in. The
               object has `learner` and `index` attributes to reference the
               original df index.
           source_df (pd.DataFrame) : a DataFrame of source data points to use
               as "needles" for the knn "haystack."
           n_neighbors

        Returns:
            match_df (pd.DataFrame) : a DataFrame of matches
        """
        source_array = source_df.values
        # 1d data must be in shape (-1, 1) for sklearn.knn
        source_array = self._ensure_array_columnlike(source_array)

        distances, neighbor_array_indices = knn.learner.kneighbors(
            source_array, n_neighbors=n_neighbors
        )

        return self._generate_match_df(
            source_df, knn.index, distances, neighbor_array_indices
        )

    def _generate_match_df(
        self, source_df, target_df_index, distances, neighbor_array_indices
    ):
        """
        Take results of matching and build into match_df DataFrame.

        For clarity we'll call the samples that are being matched "needles" and
        the set of samples that they looked for matches in the "haystack".

        Args:
            source_df (pd.DataFrame) : Covariate dataframe of N "needles"
            target_df_index (np.array) : An array of M indices of the haystack
                samples in their original dataframe.
            distances (np.array) : An array of N arrays of floats of length K
                where K is `self.n_neighbors`.
            neighbor_array_indices (np.array) : An array of N arrays of ints of
                length K where K is `self.n_neighbors`.
        """
        # target is the haystack, source is the needle(s)
        # translate array indices back to original indices
        matches_dict = {}
        for source_idx, distance_row, neighbor_array_index_row in zip(
            source_df.index, distances, neighbor_array_indices
        ):
            neighbor_df_indices = \
                target_df_index[neighbor_array_index_row.flatten()]
            if self.caliper is not None:
                neighbor_df_indices = [
                    n
                    for i, n in enumerate(neighbor_df_indices)
                    if distance_row[i] < self.caliper
                ]
                distance_row = [d for d in distance_row if d < self.caliper]
            matches_dict[source_idx] = dict(
                matches=list(neighbor_df_indices), distances=list(distance_row)
            )
        # convert dict of dicts like { 1: {'matches':[], 'distances':[]}} to df
        return pd.DataFrame(matches_dict).T

    def _matches_to_weights_single_matching(self, s, t, match_df):
        """
        For a given match, calculate the resulting weight vector.

        The weight vector adds a count each time a sample is used, weighted by
        the number of other neighbors when it was used. This is necessary to
        make the weighted sum return the correct effect estimate.
        """
        weights = pd.Series(self.treatments_.copy() * 0)
        name = {0: "control", 1: "treatment"}
        weights.name = "{s}_to_{t}".format(s=name[s], t=name[t])
        s_to_t_matches = match_df.loc[t][self.treatments_ == s].matches
        for source_idx, matches_list in s_to_t_matches.items():
            if matches_list:
                weights.loc[source_idx] += 1
            for match in matches_list:
                weights.loc[match] += 1 / len(matches_list)
        return weights

    def _get_distance_matrix(self, source_df, target_df):
        """
        Create distance matrix for no replacement match.

        Combines metric, caliper and source/target data into a
        precalculated distance matrix which can be passed to
        scipy.optimize.linear_sum_assignment.
        """

        cdist_args = dict(
            XA=self._ensure_array_columnlike(source_df.values),
            XB=self._ensure_array_columnlike(target_df.values),
        )
        cdist_args.update(self._get_metric_dict(False))
        distance_matrix = distance.cdist(**cdist_args)

        if self.caliper is not None:
            distance_matrix[distance_matrix > self.caliper] = VERY_LARGE_NUMBER
        return distance_matrix

    def _withreplacement_match(self, X, a):
        matches = {}  # maps treatment value to list of matches TO that value

        for treatment_value, knn in self.treatment_knns_.items():
            n_matchable = sum(a==treatment_value)
            if n_matchable < self.n_neighbors:
                n_neighbors = n_matchable
                warnings.warn(
                    f"Not enough matchable samples in treatment group {treatment_value}. "
                    f"Reducing `n_neighbors` for this direction to {n_neighbors}."
                )
            else:
                n_neighbors = self.n_neighbors

            matches[treatment_value] = self._kneighbors(knn, X, n_neighbors)
            # when producing potential outcomes we may want to force the
            # value of the observed outcome to be the actual observed
            # outcome, and not an average of the k nearest samples.
            if not self.estimate_observed_outcome:

                def limit_within_treatment_matches_to_self_only(row):
                    if (
                        a.loc[row.name] == treatment_value
                        and row.name in row.matches
                    ):
                        row.matches = [row.name]
                        row.distances = [0]
                    return row

                matches[treatment_value] = matches[treatment_value].apply(
                    limit_within_treatment_matches_to_self_only, axis=1
                )

        return pd.concat(matches, sort=True)

    def _noreplacement_match(self, X, a):

        match_combinations = sorted(combinations(a.unique(), 2))
        matches = {}

        for s, t in match_combinations:
            distance_matrix = self._get_distance_matrix(X[a == s], X[a == t])
            source_array, neighbor_array_indices, distances = \
                self._optimally_match_distance_matrix(distance_matrix)
            source_df = X[a == s].iloc[np.array(source_array)]
            target_df = X[a == t].iloc[np.array(neighbor_array_indices)]
            if t in matches or s in matches:
                warnings.warn(
                    "No-replacement matching for more than "
                    "2 treatment values is not supported"
                )

            matches[t] = self._create_match_df_for_no_replacement(
                a, source_df, target_df, distances
            )
            matches[s] = self._create_match_df_for_no_replacement(
                a, target_df, source_df, distances
            )

        match_df = pd.concat(matches, sort=True)
        return match_df

    def _optimally_match_distance_matrix(self, distance_matrix):
        source_array, neighbor_array_indices = linear_sum_assignment(
            distance_matrix
        )
        distances = [
            [distance_matrix[s_idx, t_idx]]
            for s_idx, t_idx in zip(source_array, neighbor_array_indices)
        ]
        source_array, neighbor_array_indices, distances = \
            self._filter_noreplacement_matches_using_caliper(
                source_array, neighbor_array_indices, distances)
        return source_array, neighbor_array_indices, distances

    def _filter_noreplacement_matches_using_caliper(
            self, source_array, neighbor_array_indices, distances):
        if self.caliper is None:
            return source_array, neighbor_array_indices, distances
        keep_indices = [i for i, d in enumerate(
            distances) if d[0] <= self.caliper]
        source_array = source_array[keep_indices]
        neighbor_array_indices = neighbor_array_indices[keep_indices]
        distances = [distances[i] for i in keep_indices]
        if not keep_indices:
            warnings.warn("No matches found, check caliper."
                          "No estimation possible.")
        return source_array, neighbor_array_indices, distances

    @staticmethod
    def _create_match_df_for_no_replacement(
        base_series, source_df, target_df, distances
    ):
        match_sub_df = pd.DataFrame(
            index=base_series.index,
            columns=[
                "matches",
                "distances",
            ],
            data=base_series.apply(lambda x: pd.Series([[], []])).values,
            dtype="object",
        )

        # matching from source to target: read distances
        match_sub_df.loc[source_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=distances,
            ),
            index=source_df.index,
        )

        # matching from target to target: fill with zeros
        match_sub_df.loc[target_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=[[0]] * len(distances),
            ),
            index=target_df.index,
        )
        return match_sub_df

    def _count_samples_used_by_treatment_value(self, a):
        # we record the number of samples that were successfully matched of
        # each treatment value
        samples_used = {
            treatment_value:
            self.match_df_.loc[treatment_value][a != treatment_value]
            .matches.apply(bool).sum()

            for treatment_value in sorted(a.unique(), reverse=True)
        }

        return pd.Series(samples_used)


[docs]class PropensityMatching(Matching):

    def __init__(self, learner, **kwargs):
        """Matching on propensity score only.

        This is a convenience class to execute the common task of propensity
        score matching. It shares all of the methods of the `Matching` class
        but offers a shortcut for initialization.

        Args:
            learner (sklearn.estimator) :  a trainable propensity model that
                implements `fit` and `predict_proba`. Will be passed to the
                `PropensityTransformer` object.
            **kwargs : see Matching.__init__ for supported kwargs.
        """
        from causallib.preprocessing.transformers import PropensityTransformer

        super().__init__(**kwargs)
        self.learner = learner
        self.propensity_transform = PropensityTransformer(
            include_covariates=False, learner=self.learner
        )