Source code for causallib.evaluation.plots.curve_data_makers

"""Functions that calculate curve data for cross validation plots."""
from typing import List
import warnings

import numpy as np
import pandas as pd
from sklearn import metrics


def calculate_roc_curve(curve_data):
    """Calculates ROC curve on the folds.

    Args:
        curve_data (dict): dict of curves produced by
            BaseEvaluationPlotDataExtractor.calculate_curve_data.

    Returns:
        dict[str, list[np.ndarray]]: Keys are "FPR", "TPR" and "AUC" (ROC metrics),
            and values are lists the size of the number of folds,
            with the evaluation of each fold.
    """
    for curve_name in curve_data.keys():
        curve_data[curve_name]["FPR"] = curve_data[curve_name].pop("first_ret_value")
        curve_data[curve_name]["TPR"] = curve_data[curve_name].pop("second_ret_value")
        curve_data[curve_name]["AUC"] = curve_data[curve_name].pop("area")
    return curve_data
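
# Illustrative sketch (not part of the original module): calculate_roc_curve only
# relabels the generic keys produced by the curve-data builders below into ROC
# terminology. The toy dict and helper name here are assumptions for demonstration.
def _example_calculate_roc_curve():
    toy_curve_data = {
        "Treatment=1": {
            "first_ret_value": [np.array([0.0, 0.5, 1.0])],   # becomes "FPR"
            "second_ret_value": [np.array([0.0, 1.0, 1.0])],  # becomes "TPR"
            "Thresholds": [np.array([2.0, 1.0, 0.0])],
            "area": [1.0],                                     # becomes "AUC"
        }
    }
    return calculate_roc_curve(toy_curve_data)
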
def calculate_pr_curve(curve_data, targets):
    """Calculates precision-recall curve on the folds.

    Args:
        curve_data (dict): dict of curves produced by
            BaseEvaluationPlotDataExtractor.calculate_curve_data.
        targets (pd.Series): True labels.

    Returns:
        dict[str, list[np.ndarray]]: Keys are "Precision", "Recall" and "AP"
            (PR metrics), and values are lists the size of the number of folds,
            with the evaluation of each fold.
            An additional "prevalence" key holds the positive-label prevalence,
            used to plot the chance curve.
    """
    for curve_name in curve_data.keys():
        curve_data[curve_name]["Precision"] = curve_data[curve_name].pop(
            "first_ret_value"
        )
        curve_data[curve_name]["Recall"] = curve_data[curve_name].pop(
            "second_ret_value"
        )
        curve_data[curve_name]["AP"] = curve_data[curve_name].pop("area")
    curve_data["prevalence"] = targets.value_counts(normalize=True).loc[targets.max()]
    return curve_data
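
# Illustrative sketch (not part of the original module): the same relabeling for
# precision-recall metrics, plus the positive-label prevalence of `targets`.
# The toy data and helper name are assumptions for demonstration.
def _example_calculate_pr_curve():
    toy_curve_data = {
        "Treatment=1": {
            "first_ret_value": [np.array([0.5, 1.0, 1.0])],   # becomes "Precision"
            "second_ret_value": [np.array([1.0, 0.5, 0.0])],  # becomes "Recall"
            "Thresholds": [np.array([0.4, 0.8])],
            "area": [0.9],                                     # becomes "AP"
        }
    }
    toy_targets = pd.Series([0, 1, 1, 0])
    # Adds curve_data["prevalence"] == 0.5 for these balanced toy targets
    return calculate_pr_curve(toy_curve_data, toy_targets)
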
def calculate_performance_curve_data_on_folds(
    folds_predictions,
    folds_targets,
    sample_weights=None,
    area_metric=metrics.roc_auc_score,
    curve_metric=metrics.roc_curve,
    pos_label=None,
):
    """Calculates performance curves of the predictions across folds.

    Args:
        folds_predictions (list[pd.Series]): Score prediction (as in continuous
            output of classifier, `predict_proba` or `decision_function`)
            for every fold.
        folds_targets (list[pd.Series]): True labels for every fold.
        sample_weights (list[pd.Series] | None): Weight for each sample,
            for every fold.
        area_metric (callable): Performance metric of the area under the curve.
        curve_metric (callable): Performance metric returning 3 output vectors -
            metric1, metric2 and thresholds.
            Where metric1 and metric2 depict the curve when plotted
            on x-axis and y-axis.
        pos_label: What label in `targets` is considered the positive label.

    Returns:
        (list[float], list[np.ndarray], list[np.ndarray], list[np.ndarray]):
            For every fold, the area calculation, the calculated
            metric1 and metric2 (the curves), and the thresholds.
    """
    sample_weights = (
        [None] * len(folds_predictions) if sample_weights is None else sample_weights
    )
    # Scikit-learn's precision_recall_curve and roc_curve do not return values
    # in a consistent way.
    # Namely, roc_curve returns `fpr`, `tpr`, which correspond to x-axis, y-axis,
    # whereas precision_recall_curve returns `precision`, `recall`,
    # which correspond to y-axis, x-axis.
    # That's why this function returns the values in the same order as Scikit's curves,
    # leaving it up to the caller to label what those return values actually are
    # (specifically, whether they're x-axis or y-axis).
    first_ret_folds, second_ret_folds, threshold_folds, area_folds = [], [], [], []
    for fold_prediction, fold_target, fold_weights in zip(
        folds_predictions, folds_targets, sample_weights
    ):
        first_ret_fold, second_ret_fold, threshold_fold = curve_metric(
            fold_target,
            fold_prediction,
            pos_label=pos_label,
            sample_weight=fold_weights,
        )
        try:
            area_fold = area_metric(
                fold_target, fold_prediction, sample_weight=fold_weights
            )
        except ValueError as v:
            # AUC cannot be evaluated if targets are constant
            warnings.warn(f"metric {area_metric.__name__} could not be evaluated")
            warnings.warn(str(v))
            area_fold = np.nan

        first_ret_folds.append(first_ret_fold)
        second_ret_folds.append(second_ret_fold)
        threshold_folds.append(threshold_fold)
        area_folds.append(area_fold)
    return area_folds, first_ret_folds, second_ret_folds, threshold_folds
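
# Illustrative sketch (not part of the original module): computing per-fold ROC data
# for two toy folds. The first returned list holds the per-fold areas (AUCs), followed
# by the two curve vectors and the thresholds, in sklearn's roc_curve order.
# The toy data and helper name are assumptions for demonstration.
def _example_performance_curve_on_folds():
    folds_predictions = [
        pd.Series([0.1, 0.9, 0.8, 0.3]),
        pd.Series([0.2, 0.6, 0.7, 0.4]),
    ]
    folds_targets = [
        pd.Series([0, 1, 1, 0]),
        pd.Series([0, 1, 1, 0]),
    ]
    area_folds, fpr_folds, tpr_folds, threshold_folds = (
        calculate_performance_curve_data_on_folds(folds_predictions, folds_targets)
    )
    return area_folds  # e.g. [1.0, 1.0] for these perfectly separated toy folds
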
def calculate_curve_data_binary_outcome(
    folds_predictions,
    targets,
    curve_metric,
    area_metric,
    stratify_by=None,
):
    """Calculate different performance (ROC or PR) curves.

    Args:
        folds_predictions (list[pd.Series]): Predictions for each fold.
        targets (pd.Series): True labels.
        curve_metric (callable): Performance metric returning 3 output vectors -
            metric1, metric2 and thresholds.
            Where metric1 and metric2 depict the curve when plotted
            on x-axis and y-axis.
        area_metric (callable): Performance metric of the area under the curve.
        stratify_by (pd.Series): Group assignment to stratify by.

    Returns:
        dict[str, dict[str, list[np.ndarray]]]:
            Evaluation of the metric for each fold and for each curve.
            One curve for each group level in `stratify_by`.
            In general: {curve_name: {metric1: [evaluation_fold_1, ...]}}.
            For example: {"Treatment=1": {"FPR": [FPR_fold_1, FPR_fold_2, FPR_fold_3]}}
    """
    # folds_targets = [targets.loc[p.index] for p in folds_predictions]
    # folds_stratify_by = [stratify_by.loc[p.index] for p in folds_predictions]

    stratify_values = sorted(set(stratify_by))
    curve_data = {}
    for stratum_level in stratify_values:
        # Slice data for that stratum level across the folds:
        folds_stratum_predictions, folds_stratum_targets = [], []
        for fold_predictions in folds_predictions:
            # Extract fold:
            fold_targets = targets.loc[fold_predictions.index]
            fold_stratify_by = stratify_by.loc[fold_predictions.index]
            # Extract stratum:
            mask = fold_stratify_by == stratum_level
            fold_predictions = fold_predictions.loc[mask]
            fold_targets = fold_targets.loc[mask]
            # Save:
            folds_stratum_predictions.append(fold_predictions)
            folds_stratum_targets.append(fold_targets)

        (
            area_folds,
            first_ret_folds,
            second_ret_folds,
            threshold_folds,
        ) = calculate_performance_curve_data_on_folds(
            folds_stratum_predictions,
            folds_stratum_targets,
            None,
            area_metric,
            curve_metric,
        )

        curve_data[f"Treatment={stratum_level}"] = {
            "first_ret_value": first_ret_folds,
            "second_ret_value": second_ret_folds,
            "Thresholds": threshold_folds,
            "area": area_folds,
        }

    return curve_data
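
# Illustrative sketch (not part of the original module): building stratified ROC curve
# data from two toy folds, stratified by a binary treatment assignment, and then
# relabeling the keys with calculate_roc_curve above.
# The toy data and helper name are assumptions for demonstration.
def _example_curve_data_binary_outcome():
    predictions = [
        pd.Series([0.2, 0.8, 0.6, 0.3], index=[0, 1, 2, 3]),
        pd.Series([0.1, 0.9, 0.7, 0.4], index=[4, 5, 6, 7]),
    ]
    targets = pd.Series([0, 1, 1, 0, 0, 1, 1, 0], index=range(8))
    treatment = pd.Series([0, 0, 1, 1, 0, 0, 1, 1], index=range(8))
    curve_data = calculate_curve_data_binary_outcome(
        predictions,
        targets,
        curve_metric=metrics.roc_curve,
        area_metric=metrics.roc_auc_score,
        stratify_by=treatment,
    )
    # Result has one curve per stratum: "Treatment=0" and "Treatment=1"
    return calculate_roc_curve(curve_data)
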
def calculate_curve_data_propensity(
    fold_predictions: List[
        "causallib.evaluation.weight_predictor.PropensityPredictions"
    ],
    targets,
    curve_metric,
    area_metric,
):
    """Calculate different performance (ROC or PR) curves.

    Args:
        fold_predictions (list[PropensityEvaluatorPredictions]):
            Predictions for each fold.
        targets (pd.Series): True labels.
        curve_metric (callable): Performance metric returning 3 output vectors -
            metric1, metric2 and thresholds.
            Where metric1 and metric2 depict the curve when plotted
            on x-axis and y-axis.
        area_metric (callable): Performance metric of the area under the curve.

    Returns:
        dict[str, dict[str, list[np.ndarray]]]:
            Evaluation of the metric for each fold and for each curve.
            3 curves:
                * "unweighted" (regular)
                * "weighted" (weighted by inverse propensity)
                * "expected" (duplicated population, weighted by propensity)
            In general: {curve_name: {metric1: [evaluation_fold_1, ...]}}.
            For example: {"weighted": {"FPR": [FPR_fold_1, FPR_fold_2, FPR_fold_3]}}
    """
    curves_sample_weights = {
        "unweighted": [None for _ in fold_predictions],
        "weighted": [
            fold_predictions.weight_by_treatment_assignment
            for fold_predictions in fold_predictions
        ],
        "expected": [
            pd.concat([fold_predictions.propensity, 1 - fold_predictions.propensity])
            for fold_predictions in fold_predictions
        ],
    }
    curves_folds_targets = [
        targets.loc[fold_predictions.weight_by_treatment_assignment.index]
        for fold_predictions in fold_predictions
    ]
    curves_folds_targets = {
        "unweighted": curves_folds_targets,
        "weighted": curves_folds_targets,
        "expected": [
            pd.concat(
                [
                    pd.Series(
                        data=targets.max(), index=fold_predictions.propensity.index
                    ),
                    pd.Series(
                        data=targets.min(), index=fold_predictions.propensity.index
                    ),
                ]
            )
            for fold_predictions in fold_predictions
        ],
    }
    fold_predictions = {
        "unweighted": [
            fold_predictions.propensity for fold_predictions in fold_predictions
        ],
        "weighted": [
            fold_predictions.propensity for fold_predictions in fold_predictions
        ],
        "expected": [
            pd.concat([fold_predictions.propensity, fold_predictions.propensity])
            for fold_predictions in fold_predictions
        ],
    }
    # Expected curve duplicates the population, basically concatenating so that:
    # prediction = [p, p], target = [1, 0], weights = [p, 1-p]

    curve_data = {}
    for curve_name in curves_sample_weights:
        sample_weights = curves_sample_weights[curve_name]
        folds_targets = curves_folds_targets[curve_name]
        folds_predictions = fold_predictions[curve_name]

        (
            area_folds,
            first_ret_folds,
            second_ret_folds,
            threshold_folds,
        ) = calculate_performance_curve_data_on_folds(
            folds_predictions,
            folds_targets,
            sample_weights,
            area_metric,
            curve_metric,
        )

        curve_data[curve_name] = {
            "first_ret_value": first_ret_folds,
            "second_ret_value": second_ret_folds,
            "Thresholds": threshold_folds,
            "area": area_folds,
        }

    # Rename keys (as will be presented as curve labels in legend)
    curve_data["Propensity"] = curve_data.pop("unweighted")
    curve_data["Weighted"] = curve_data.pop("weighted")
    curve_data["Expected"] = curve_data.pop("expected")
    return curve_data
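
# Illustrative sketch (not part of the original module): the real fold predictions are
# PropensityPredictions objects; the class below is a hypothetical stand-in exposing
# only the two attributes this function touches (`propensity` and
# `weight_by_treatment_assignment`), used here to show the shape of the output.
def _example_curve_data_propensity():
    class _FakePropensityPredictions:  # hypothetical stand-in, assumption only
        def __init__(self, propensity, weights):
            self.propensity = propensity
            self.weight_by_treatment_assignment = weights

    index = [0, 1, 2, 3]
    fold = _FakePropensityPredictions(
        propensity=pd.Series([0.2, 0.8, 0.7, 0.4], index=index),
        weights=pd.Series([1.25, 1.25, 1.43, 1.67], index=index),
    )
    targets = pd.Series([0, 1, 1, 0], index=index)  # treatment assignment
    curve_data = calculate_curve_data_propensity(
        [fold], targets, metrics.roc_curve, metrics.roc_auc_score
    )
    # Keys after renaming: "Propensity", "Weighted", "Expected"
    return calculate_roc_curve(curve_data)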