import numpy as np
from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone
from sklearn.utils.multiclass import type_of_target
from sklearn.linear_model import LassoCV, LogisticRegressionCV
# Find internal implementations
try: # Version 0.20 - 0.21
from sklearn.feature_selection.base import SelectorMixin
except ModuleNotFoundError:
# Version >= 0.22
from sklearn.feature_selection._base import SelectorMixin
__all__ = ["DoubleLASSO", "RecursiveConfounderElimination"]
# noinspection PyAbstractClass
class _BaseConfounderSelection(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
def __init__(self, importance_getter='auto', covariates=None):
self.importance_getter = importance_getter
self.covariates = covariates
    # TODO: move this functionality to general utils and use it for Estimators as well
@staticmethod
def _filter_covariates(func):
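        """Decorator restricting `X` to `self.covariates` (all columns if None)
        before calling the decorated `func`."""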
def filter_covariates_and_run(self, X, *args, **kwargs):
covariates = self.covariates
if covariates is None:
covariates = X.columns
X = X.loc[:, covariates]
return func(self, X, *args, **kwargs)
return filter_covariates_and_run
# @staticmethod
def _filter_and_re_add_covariates(func):
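        """Decorator restricting `X` to `self.covariates` for the decorated `func`,
        then re-adding the complement (non-covariate) columns of `X` to the result."""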
def filter_covariates_and_run_and_add(self, X, *args, **kwargs):
covariates = self.covariates
if covariates is None:
covariates = X.columns
sub_X = X.loc[:, covariates]
res = func(self, sub_X, *args, **kwargs)
complement_covariates = X.columns.difference(sub_X.columns)
res = res.join(X.loc[:, complement_covariates])
return res
return filter_covariates_and_run_and_add
@_filter_and_re_add_covariates
def transform(self, X, a=None):
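        """Keep only the supported confounder columns of `X`;
        non-covariate columns are re-added by the decorator."""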
X = X.loc[:, self.get_support()]
return X
    # The decorator function cannot be defined as a `staticmethod` before decorating,
    # so converting the decorator into a `staticmethod` is done
    # only after `transform` has been defined using it.
_filter_and_re_add_covariates = staticmethod(_filter_and_re_add_covariates)
class DoubleLASSO(_BaseConfounderSelection):
def __init__(self, treatment_lasso=None, outcome_lasso=None,
mask_fn=None, threshold=1e-6,
importance_getter='auto', covariates=None):
"""
A method for selecting confounders using sparse regression
on both the treatment and the outcomes, and select for
Implementing "Inference on Treatment Effects after Selection
among High-Dimensional Controls"
https://academic.oup.com/restud/article/81/2/608/1523757
Args:
treatment_lasso: Lasso learner to fit confounders and treatment.
For example using scikit-learn,
continuous treatment may use: `Lasso()`,
discrete treatment may use: `LogisticRegression(penalty='l1')`.
                If `None`, will try to automatically assign a lasso model with cross-validation.
outcome_lasso: Lasso learner to fit confounders and outcome.
For example using scikit-learn,
continuous outcome may use: `Lasso()`,
discrete outcome may use: `LogisticRegression(penalty='l1')`.
                If `None`, will try to automatically assign a lasso model with cross-validation.
mask_fn: Function that takes input as two fitted lasso learners
and returns a mask of the length of number of columns where True
corresponds to columns that need to be selected. When set to None,
the default implementation returns a mask based on non-zero
coefficients in either learner. User can supply their own function,
which must return a boolean array (of the length of columns of X)
to indicate which columns are to be included.
threshold: For default mask_fn, absolute value below which a lasso
coefficient is treated as zero.
            importance_getter (str | callable): How to obtain feature importance.
                Either a callable that takes an estimator as input,
                a string of `'coef_'` or `'feature_importances_'`,
                or `'auto'`, which detects `'coef_'` or `'feature_importances_'` automatically.
            covariates (list | np.ndarray): Specifies a subset of columns to perform selection on.
                Columns in `X` but not in `covariates` will be included after `transform`
                no matter the selection.
                Can be either a list of column names, an array of boolean indicators
                the length of `X`'s columns, or anything compatible with pandas' `loc`
                function for columns.
                If `None`, all columns participate in the selection process.
                This is similar to using sklearn's `ColumnTransformer` or `make_column_selector`.
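
        Examples:
            A minimal usage sketch (assuming pandas inputs: a confounder
            DataFrame `X`, a treatment Series `a`, and an outcome Series `y`)::

                selector = DoubleLASSO()  # `None` lassos auto-assigned with CV
                selector.fit(X, a, y)
                X_selected = selector.transform(X)  # union of non-zero-coefficient columns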
"""
        # TODO: Allowing users to provide the models follows the same
        #  design principle as the rest of causallib.
        #  However, this might put some strain on users, who will need to know
        #  to supply `sklearn.linear_model.LogisticRegression(penalty='l1')`
        #  for the treatment (and the same for a categorical outcome), but
        #  `sklearn.linear_model.Lasso` for a continuous outcome.
super().__init__(importance_getter, covariates)
self.treatment_lasso = treatment_lasso
self.outcome_lasso = outcome_lasso
self.mask_fn = mask_fn
self.threshold = threshold
    @_BaseConfounderSelection._filter_covariates
def fit(self, X, a, y):
self.treatment_lasso = self._data_driven_initialization(self.treatment_lasso, a)
self.outcome_lasso = self._data_driven_initialization(self.outcome_lasso, y)
self.treatment_lasso.fit(X, a)
self.outcome_lasso.fit(X, y)
mask_fn = self.mask_fn or self._get_non_zero_coef_mask
self.support_ = mask_fn(self.treatment_lasso, self.outcome_lasso)
self.n_features_ = self.support_.sum()
return self
def _get_support_mask(self):
return self.support_
def _get_non_zero_coef_mask(self, treatment_lasso, outcome_lasso):
# Using _get_feature_importances from sklearn covers many more
# edge cases than writing a vanilla function.
# Specifying transform_func "norm" in the call below
# actually calls np.abs when the coef_ attribute of treatment_lasso
# (or output_lasso) is a one dimensional vector.
treatment_lasso_importances = _get_feature_importances(
treatment_lasso, self.importance_getter, transform_func="norm",
)
outcome_lasso_importances = _get_feature_importances(
outcome_lasso, self.importance_getter, transform_func="norm",
)
treatment_mask = treatment_lasso_importances >= self.threshold
outcome_mask = outcome_lasso_importances >= self.threshold
return treatment_mask | outcome_mask
@staticmethod
def _data_driven_initialization(estimator, target):
if estimator is not None: # User provided an estimator
return estimator
if type_of_target(target) == "continuous":
estimator = LassoCV()
else:
estimator = LogisticRegressionCV(penalty='l1', solver='saga', max_iter=5000)
return estimator
class RecursiveConfounderElimination(_BaseConfounderSelection):
def __init__(self, estimator, n_features_to_select: int = 1, step: int = 1,
importance_getter="auto", covariates=None):
"""Recursively eliminate confounders to prune confounders.
Args:
estimator: Estimator to fit for every step of recursive elimination.
n_features_to_select (int): The number of confounders to keep.
step (int): The number of confounders to eliminate in one iteration.
            importance_getter (str | callable): How to obtain feature importance.
                Either a callable that takes an estimator as input,
                a string of `'coef_'` or `'feature_importances_'`,
                or `'auto'`, which detects `'coef_'` or `'feature_importances_'` automatically.
            covariates (list | np.ndarray): Specifies a subset of columns to perform selection on.
                Columns in `X` but not in `covariates` will be included after `transform`
                no matter the selection.
                Can be either a list of column names, an array of boolean indicators
                the length of `X`'s columns, or anything compatible with pandas' `loc`
                function for columns.
                If `None`, all columns participate in the selection process.
                This is similar to using sklearn's `ColumnTransformer` or `make_column_selector`.
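
        Examples:
            A minimal usage sketch (assuming pandas inputs; `LinearRegression`
            is just an illustrative choice of outcome model)::

                from sklearn.linear_model import LinearRegression

                selector = RecursiveConfounderElimination(
                    estimator=LinearRegression(), n_features_to_select=5,
                )
                selector.fit(X, a, y)
                X_selected = selector.transform(X)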
"""
super().__init__(importance_getter, covariates)
self.estimator = estimator
self.n_features_to_select = n_features_to_select
self.step = step
    @_BaseConfounderSelection._filter_covariates
def fit(self, X, a, y):
# This is like an abbreviated implementation of RFE in sklearn.
# Main differences are (a) Conditioning on treatment for every iteration,
# (b) adjusting ranking/support not to include treatment, and (c) accounting
# for causallib data types for X and a.
# TODO: the entire implementation may be reduced to overwriting the
# `importance_getter` function and rigging it to have infinity
# importance for the treatment assignment every time.
n_features = len(X.columns)
support_ = np.ones(n_features, dtype=bool)
ranking_ = np.ones(n_features, dtype=int)
while np.sum(support_) > self.n_features_to_select:
features = np.arange(n_features)[support_]
estimator = clone(self.estimator)
estimator.fit(a.to_frame().join(X.iloc[:, features]), y)
importances = _get_feature_importances(
estimator, self.importance_getter, transform_func="square",
)
importances = importances[1:] # Do not consider "a" for dropping
ranks = np.argsort(importances)
ranks = np.ravel(ranks)
threshold = min(self.step, np.sum(support_) - self.n_features_to_select)
support_[features[ranks][:threshold]] = False
ranking_[np.logical_not(support_)] += 1
features = np.arange(n_features)[support_]
self.estimator.fit(a.to_frame().join(X.iloc[:, features]), y)
self.n_features_ = support_.sum()
self.support_ = support_
self.ranking_ = ranking_
return self
def _get_support_mask(self):
return self.support_
def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1):
"""
    Retrieve, aggregate (if `ndim > 1`), and optionally transform
    the feature importances from an estimator.
Args:
estimator: A scikit-learn estimator from which we want to get the feature importances.
getter (str | callable): An attribute or a callable to get the feature importance.
If `"auto"`, `estimator` is expected to expose `coef_` or `feature_importances_`.
transform_func (str | None): The transform to apply to the feature importances.
By default (`None`) no transformation is applied.
Only "norm" and "square" are currently supported.
norm_order (int): The norm order to apply when `transform_func="norm"`.
Only applied when `importances.ndim > 1`.
Returns:
        np.ndarray: The feature importances, optionally transformed.
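
    Examples:
        A minimal sketch (assuming `lasso` is a fitted `sklearn.linear_model.Lasso`
        whose `coef_` is a one-dimensional vector)::

            importances = _get_feature_importances(lasso, "auto", transform_func="norm")
            # For a 1-D `coef_`, "norm" reduces to `np.abs(lasso.coef_)`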
"""
    # A local version of sklearn's `_get_feature_importances`.
    # Because there have been multiple changes between versions 0.20 and 0.24,
    # in both API and import location, this replicates the 0.24 version of the function.
    # Once dependencies move to scikit-learn > 0.24, this may be removed.
if isinstance(getter, str):
if getter == "auto":
if hasattr(estimator, "coef_"):
getter = "coef_"
elif hasattr(estimator, "feature_importances_"):
getter = "feature_importances_"
else:
raise ValueError(
f"`importance_getter=='auto'` requires the estimator to have "
f"a `coef_` or `feature_importances_` attribute. "
f"If your estimator should have these attributes, "
f"make sure it is fitted before calling transform."
)
importances = getattr(estimator, getter)
elif callable(getter):
importances = getter(estimator)
else:
raise ValueError("`importance_getter` has to be a string or `callable`")
if transform_func is None:
pass
elif transform_func == "norm":
if importances.ndim == 1:
importances = np.abs(importances)
else:
importances = np.linalg.norm(importances, axis=0, ord=norm_order)
elif transform_func == "square":
importances = importances ** 2
if importances.ndim > 1:
importances = importances.sum(axis=0)
else:
raise ValueError("`transform_func` only supports None, 'norm' and 'square'.")
return importances