# Source code for causallib.contrib.adversarial_balancing.classifier_selection

# (C) Copyright 2019 IBM Corp.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Created on Oct 30, 2019

from sklearn.base import clone
from sklearn.model_selection import KFold, cross_val_predict, ParameterGrid
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

import numpy as np


def select_classifier(model, X, A, n_splits=5, loss_type='01', seed=None):
    """Utility for selecting best classifier using cross-validation.

    Args:
        model: Either one of:
               * scikit-learn SearchCV model (GridSearchCV, RandomizedSearchCV): it is fitted and a fresh clone of
                 its best estimator is returned.
               * list of classifiers: the best-scoring classifier of the list is returned.
               * dict: keyword arguments describing a parameter grid, i.e. the keys `estimator` (a classifier)
                 and `param_grid` (a grid as accepted by scikit-learn's ParameterGrid). One candidate is created
                 per grid point and the best-scoring one is returned.
               * any other object: treated as a regular classifier and returned as is.
        X (np.ndarray): Covariate matrix size (num_samples, num_features)
        A (np.ndarray): binary labels indicating the source and target populations (num_samples,)
        n_splits (int): number of splits in cross-validation.
                        relevant only if a list of classifiers or a dict is passed.
        loss_type (str): name of loss metric to select classifier by. Either '01' for zero-one loss, otherwise
                         cross-entropy is used (and classifiers must implement predict_proba).
                         relevant only if a list of classifiers or a dict is passed.
        seed (int): random seed for cross-validation split.
                    relevant only if a list of classifiers or a dict is passed.

    Returns:
        classifier: best performing classifier on validation set.
    """
    if isinstance(model, (GridSearchCV, RandomizedSearchCV)):
        return _select_classifier_from_sk_search(model, X, A)

    if isinstance(model, list):
        return _select_classifier_from_list(candidates=model, X=X, A=A, n_splits=n_splits, seed=seed,
                                            loss_type=loss_type)

    if isinstance(model, dict):
        # The dict itself supplies the remaining keyword arguments (`estimator`, `param_grid`).
        return _select_classifier_from_grid(X=X, A=A, n_splits=n_splits, seed=seed, loss_type=loss_type,
                                            **model)

    # A regular classifier was passed: there is nothing to select from.
    return model


def _select_classifier_from_sk_search(estimator, X, A):
    """Run a scikit-learn parameter search and hand back its winning model.

    The search object is fitted in place on (X, A); the returned classifier is an
    unfitted copy of the search's `best_estimator_`.

    Args:
        estimator (GridSearchCV | RandomizedSearchCV): An initialized sklearn SearchCV classifier.
        X (np.ndarray): Covariate matrix size (num_samples, num_features)
        A (np.ndarray): binary labels indicating the source and target populations (num_samples,)

    Returns:
        classifier: a clone of `estimator.best_estimator_`, the best-performing configuration found.
                    See scikit-learn's GridSearchCV and RandomizedSearchCV documentation for details
                    on that attribute.
    """
    estimator.fit(X, A)
    winner = estimator.best_estimator_
    return clone(winner)


def _select_classifier_from_grid(estimator, X, A, param_grid, n_splits=5, seed=1, loss_type='01'):
    """Select the best parametrization of a classifier out of a parameter grid.

    One candidate is created per point of `param_grid` and the candidates are then compared by
    `_select_classifier_from_list`.

    Args:
        estimator: scikit-learn classifier serving as a template; it is cloned, never modified.
        X (np.ndarray): Covariate matrix size (num_samples, num_features)
        A (np.ndarray): binary labels indicating the source and target populations (num_samples,)
        param_grid (dict | list[dict]): parameter grid, as accepted by scikit-learn's ParameterGrid.
        n_splits (int): number of splits in cross-validation.
        seed (int): random seed for cross-validation split.
        loss_type (str): '01' for zero-one loss, otherwise cross-entropy is used.

    Returns:
        classifier: the best-scoring candidate.
    """
    # `set_params` is used instead of plain `setattr` so that unknown parameter names raise an error
    # rather than being silently ignored, and nested `<component>__<parameter>` keys are resolved.
    candidates = [clone(estimator).set_params(**params) for params in ParameterGrid(param_grid)]

    return _select_classifier_from_list(candidates, X, A, n_splits=n_splits, seed=seed, loss_type=loss_type)


def _select_classifier_from_list(candidates, X, A, n_splits=5, seed=None, loss_type='01'):
    """Select the best-scoring classifier out of a list of candidates.

    Every candidate is scored with a class-balanced metric (each sample is weighted inversely to the
    frequency of its class) and the candidate with the highest score is returned.

    Args:
        candidates (list): classifiers to choose from. Their `fit` must accept `sample_weight`.
        X (np.ndarray): Covariate matrix size (num_samples, num_features)
        A (np.ndarray): binary labels indicating the source and target populations (num_samples,)
        n_splits (int): number of cross-validation splits. If smaller than 2 no cross-validation takes place:
                        every candidate is fitted (in place) on the entire data and scored on that same data.
        seed (int): random seed for the shuffled cross-validation split.
        loss_type (str): '01' scores by weighted accuracy; any other value scores by weighted log-likelihood
                         (candidates must then implement predict_proba).

    Returns:
        classifier: the candidate with the highest score (first one in case of ties).

    Raises:
        ValueError: if `candidates` is empty.
    """
    if len(candidates) == 0:
        raise ValueError("`candidates` must contain at least one classifier.")

    labels = np.asarray(A)
    classes, class_idx, class_counts = np.unique(labels, return_inverse=True, return_counts=True)
    # 'balanced' class weights, n_samples / (n_classes * class_count), expanded to one weight per sample.
    balanced_weights = labels.size / (classes.size * class_counts.astype(float))
    sample_weight = balanced_weights[class_idx.reshape(-1)]
    total_weight = np.sum(sample_weight)

    # Labels are documented as binary, so the smallest / largest observed values are the two classes.
    # Column 1 of `predict_proba` is taken as the probability of the larger label.
    is_negative = labels == classes[0]
    is_positive = labels == classes[-1]

    def score(pred):
        # Higher is better for both metrics.
        if loss_type == '01':
            return np.sum(sample_weight[pred == labels]) / total_weight
        logl = np.zeros(labels.shape)
        logl[is_negative] = np.log(1.0 - pred[is_negative])
        logl[is_positive] = np.log(pred[is_positive])
        return np.sum(sample_weight * logl) / total_weight

    scores = np.empty(len(candidates))

    if n_splits >= 2:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        # NOTE(review): recent scikit-learn releases appear to rename `fit_params` to `params` in
        # `cross_val_predict` - confirm against the scikit-learn version this package pins.
        for model_idx, m in enumerate(candidates):
            if loss_type == '01':
                pred = cross_val_predict(m, X=X, y=A, cv=cv,
                                         fit_params={'sample_weight': sample_weight}).reshape(-1)
            else:
                ps = cross_val_predict(m, X=X, y=A, cv=cv, fit_params={'sample_weight': sample_weight},
                                       method='predict_proba')
                pred = ps[:, 1]
            # Score inside the loop so that every candidate (not only the last one) is evaluated.
            scores[model_idx] = score(pred)
    else:
        for model_idx, m in enumerate(candidates):
            m.fit(X, A, sample_weight=sample_weight)
            if loss_type == '01':
                pred = m.predict(X=X)
            else:
                pred = m.predict_proba(X=X)[:, 1]
            scores[model_idx] = score(pred)

    return candidates[int(np.argmax(scores))]