# Source code for causallib.contrib.adversarial_balancing.classifier_selection

# (C) Copyright 2019 IBM Corp.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Created on Oct 30, 2019

from sklearn.base import clone
from sklearn.model_selection import KFold, cross_val_predict, ParameterGrid
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

import numpy as np


def select_classifier(model, X, A, n_splits=5, loss_type='01', seed=None):
    """Utility for selecting best classifier using cross-validation.

    Args:
        model: Either one of: scikit-learn classifier, scikit-learn SearchCV model
            (GridSearchCV, RandomizedSearchCV), list of classifiers to choose from,
            or a dict of keyword arguments for grid selection
            (an ``estimator`` and a ``param_grid``).
        X (np.ndarray): Covariate matrix size (num_samples, num_features)
        A (np.ndarray): binary labels indicating the source and target populations (num_samples,)
        n_splits (int): number of splits in cross-validation.
                        relevant only if list of classifiers is passed.
        loss_type (str): name of loss metric to select classifier by. Either '01' for zero-one loss,
                         otherwise cross-entropy is used (and classifiers must implement
                         predict_proba). relevant only if list of classifiers is passed.
        seed (int): random seed for cross-validation split.
                    relevant only if list of classifiers is passed.

    Returns:
        classifier: best performing classifier on validation set.
    """
    if isinstance(model, (GridSearchCV, RandomizedSearchCV)):
        selected_model = _select_classifier_from_sk_search(model, X, A)
    elif isinstance(model, list):
        selected_model = _select_classifier_from_list(candidates=model, X=X, A=A,
                                                      n_splits=n_splits, seed=seed,
                                                      loss_type=loss_type)
    elif isinstance(model, dict):
        # `model` supplies `estimator` and `param_grid` keyword arguments; the grid
        # is expanded and each parameter combination is evaluated by cross-validation.
        selected_model = _select_classifier_from_grid(X=X, A=A, n_splits=n_splits,
                                                      seed=seed, **model,
                                                      loss_type=loss_type)
    else:  # A regular classifier was passed
        selected_model = model
    return selected_model
def _select_classifier_from_sk_search(estimator, X, A):
    """Return best model from a scikit-learn Search-estimator model.

    Args:
        estimator (GridSearchCV | RandomizedSearchCV): An initialized sklearn SearchCV classifier.
        X (np.ndarray): Covariate matrix size (num_samples, num_features)
        A (np.ndarray): binary labels indicating the source and target populations (num_samples,)

    Returns:
        classifier: an unfitted clone of ``estimator.best_estimator_`` — the
            best-performing classifier found by the search. See scikit-learn's
            GridSearchCV and RandomizedSearchCV documentation for details.
    """
    estimator.fit(X, A)
    # Clone so the returned model is unfitted and carries only the winning hyper-parameters.
    best_estimator = clone(estimator.best_estimator_)
    return best_estimator


def _select_classifier_from_grid(estimator, X, A, param_grid, n_splits=5, seed=1, loss_type='01'):
    """Expand ``param_grid`` into concrete candidates and select the best one.

    Each parameter combination is applied to a clone of ``estimator`` and the
    resulting candidates are scored by ``_select_classifier_from_list``.

    Args:
        estimator: a scikit-learn classifier to clone for each parameter combination.
        X (np.ndarray): Covariate matrix size (num_samples, num_features)
        A (np.ndarray): binary labels indicating the source and target populations (num_samples,)
        param_grid (dict): parameter-name -> list-of-values mapping (sklearn ParameterGrid format).
        n_splits (int): number of cross-validation splits.
        seed (int): random seed for the cross-validation split.
        loss_type (str): '01' for zero-one loss, otherwise cross-entropy.

    Returns:
        classifier: best performing candidate.
    """
    candidates = []
    for params in ParameterGrid(param_grid):
        candidate = clone(estimator)
        for key, value in params.items():
            setattr(candidate, key, value)
        candidates.append(candidate)
    return _select_classifier_from_list(candidates, X, A, n_splits=n_splits, seed=seed,
                                        loss_type=loss_type)


def _select_classifier_from_list(candidates, X, A, n_splits=5, seed=None, loss_type='01'):
    """Score every candidate classifier and return the best-performing one.

    With ``n_splits >= 2``, predictions are obtained out-of-fold via K-fold
    cross-validation; otherwise each candidate is fit and scored in-sample.

    Args:
        candidates (list): classifiers to choose from.
        X (np.ndarray): Covariate matrix size (num_samples, num_features)
        A (np.ndarray): binary labels indicating the source and target populations (num_samples,)
        n_splits (int): number of cross-validation splits; values < 2 disable CV.
        seed (int): random seed for the cross-validation split.
        loss_type (str): '01' for weighted accuracy, otherwise weighted log-likelihood
            (candidates must implement ``predict_proba``).

    Returns:
        classifier: the candidate with the highest score.
    """
    accuracies = np.zeros(len(candidates))
    # Per-sample 'balanced' weights so both populations contribute equally to the score.
    class_weight = compute_class_weight(class_weight='balanced', classes=np.unique(A),
                                        y=A)[LabelEncoder().fit_transform(A)]

    use_cv = n_splits >= 2
    if use_cv:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # FIX: score every candidate inside a single loop. In the previous layout the
    # scoring step only followed the no-CV refit loop, so with n_splits >= 2 the
    # scores were never recorded and argmax always returned the first candidate.
    for model_idx, m in enumerate(candidates):
        # Obtain predictions: out-of-fold when cross-validating, in-sample otherwise.
        if use_cv:
            if loss_type == '01':
                pred = cross_val_predict(m, X=X, y=A, cv=cv,
                                         fit_params={'sample_weight': class_weight}).reshape(-1)
            else:
                ps = cross_val_predict(m, X=X, y=A, cv=cv,
                                       fit_params={'sample_weight': class_weight},
                                       method='predict_proba')
                pred = ps[:, 1]
        else:
            m.fit(X, A, sample_weight=class_weight)
            if loss_type == '01':
                pred = m.predict(X=X)
            else:
                pred = m.predict_proba(X=X)[:, 1]

        if loss_type == '01':
            # Weighted accuracy (higher is better).
            accuracies[model_idx] = np.sum(class_weight[pred == A]) / np.sum(class_weight)
        else:
            # Weighted log-likelihood (higher is better).
            # NOTE(review): assumes labels are encoded as {-1, 1}; samples with any
            # other label value contribute zero log-likelihood — confirm against callers,
            # since the docstrings elsewhere only say "binary labels".
            logl = np.zeros(A.shape)
            logl[A == -1] = np.log(1.0 - pred[A == -1])
            logl[A == 1] = np.log(pred[A == 1])
            accuracies[model_idx] = np.sum(class_weight * logl) / np.sum(class_weight)

    i_best = np.argmax(accuracies)
    return candidates[i_best]