# %%
# FixedThresholdClassifier: Setting the decision threshold of a binary classifier
# -------------------------------------------------------------------------------
# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5
# to convert probability estimates (i.e. output of `predict_proba`) into class
# predictions. However, 0.5 is almost never the desired threshold for a given
# problem. :class:`~model_selection.FixedThresholdClassifier` allows wrapping any
# binary classifier and setting a custom decision threshold.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay

X, y = make_classification(n_samples=10_000, weights=[0.9, 0.1], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

classifier_05 = LogisticRegression(C=1e6, random_state=0).fit(X_train, y_train)
_ = ConfusionMatrixDisplay.from_estimator(classifier_05, X_test, y_test)

# %%
# Lowering the threshold, i.e. allowing more samples to be classified as the
# positive class, increases the number of true positives at the cost of more
# false positives (as is well known from the concavity of the ROC curve).
from sklearn.model_selection import FixedThresholdClassifier

classifier_01 = FixedThresholdClassifier(classifier_05, threshold=0.1)
classifier_01.fit(X_train, y_train)
_ = ConfusionMatrixDisplay.from_estimator(classifier_01, X_test, y_test)
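
# %%
# As a quick numeric check (an addition to this highlight, not required by the
# API), the raw confusion matrices confirm the trade-off described above.
from sklearn.metrics import confusion_matrix

print("threshold=0.5:\n", confusion_matrix(y_test, classifier_05.predict(X_test)))
print("threshold=0.1:\n", confusion_matrix(y_test, classifier_01.predict(X_test)))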

# %%
# TunedThresholdClassifierCV: Tuning the decision threshold of a binary classifier
# --------------------------------------------------------------------------------
# The decision threshold of a binary classifier can be tuned to optimize a
# given metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
#
# It is particularly useful to find the best decision threshold when the model
# is meant to be deployed in a specific application context where we can assign
# different gains or costs for true positives, true negatives, false positives,
# and false negatives.
#
# Let's illustrate this by considering an arbitrary case where:
#
# - each true positive gains 1 unit of profit, e.g. euro, year of life in good
#   health, etc.;
# - true negatives gain or cost nothing;
# - each false negative costs 2;
# - each false positive costs 0.1.
#
# Our metric quantifies the average profit per sample, which is defined by the
# following Python function:
from sklearn.metrics import confusion_matrix


def custom_score(y_observed, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_observed, y_pred, normalize="all").ravel()
    return tp - 2 * fn - 0.1 * fp
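

# A tiny sanity check (added for illustration): on four hand-written labels
# with one true positive and one false negative, the average profit is
# 1 / 4 - 2 * 1 / 4 = -0.25.
print(f"Sanity check: {custom_score([0, 1, 1, 0], [0, 1, 0, 0]):.2f}")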

print("Untuned decision threshold: 0.5")
print(f"Custom score: {custom_score(y_test, classifier_05.predict(X_test)):.2f}")

# %%
# It is interesting to observe that the average gain per prediction is
# negative, which means that this decision system is making a loss on average.
#
# Tuning the threshold to optimize this custom metric gives a smaller threshold
# that allows more samples to be classified as the positive class. As a result,
# the average gain per prediction improves.
from sklearn.model_selection import TunedThresholdClassifierCV
from sklearn.metrics import make_scorer

custom_scorer = make_scorer(
    custom_score, response_method="predict", greater_is_better=True
)
tuned_classifier = TunedThresholdClassifierCV(
    classifier_05, cv=5, scoring=custom_scorer
).fit(X_train, y_train)

print(f"Tuned decision threshold: {tuned_classifier.best_threshold_:.3f}")
print(f"Custom score: {custom_score(y_test, tuned_classifier.predict(X_test)):.2f}")

# %%
# We observe that tuning the decision threshold can turn a machine
# learning-based system that makes a loss on average into a beneficial one.
#
# In practice, defining a meaningful application-specific metric might involve
# making those costs for bad predictions and gains for good predictions depend
# on auxiliary metadata specific to each individual data point, such as the
# amount of a transaction in a fraud detection system.
#
# To achieve this, :class:`~model_selection.TunedThresholdClassifierCV`
# leverages metadata routing support (:ref:`Metadata Routing User
# Guide<metadata_routing>`), making it possible to optimize complex business
# metrics, as detailed in :ref:`Post-tuning the decision threshold for
# cost-sensitive learning
# <sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py>`.
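
# %%
# A minimal sketch of that metadata-routing pattern, using a hypothetical
# per-sample transaction `amount` drawn at random purely for illustration (see
# the linked example for a realistic treatment). The scorer requests the
# `amount` metadata so that the business gain is expressed in the same unit as
# the transactions.
import numpy as np
import sklearn
from sklearn.metrics import make_scorer

sklearn.set_config(enable_metadata_routing=True)

rng = np.random.default_rng(0)
amount = rng.uniform(10, 1_000, size=y.shape[0])
# Re-using the same random_state reproduces the earlier split, so the rows of
# `amount_train` stay aligned with `X_train`.
amount_train, amount_test = train_test_split(amount, random_state=0)


def business_gain(y_observed, y_pred, amount):
    # Gain the transaction amount for each caught positive (true positive) and
    # lose it for each missed positive (false negative); illustrative numbers.
    gained = amount[(y_observed == 1) & (y_pred == 1)].sum()
    lost = amount[(y_observed == 1) & (y_pred == 0)].sum()
    return gained - lost


business_scorer = make_scorer(business_gain).set_score_request(amount=True)
tuned_amount_aware = TunedThresholdClassifierCV(
    classifier_05, cv=5, scoring=business_scorer
).fit(X_train, y_train, amount=amount_train)
print(f"Amount-aware decision threshold: {tuned_amount_aware.best_threshold_:.3f}")
print(
    "Business gain on the test set: "
    f"{business_gain(y_test, tuned_amount_aware.predict(X_test), amount_test):.1f}"
)

sklearn.set_config(enable_metadata_routing=False)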

# %%
# Performance improvements in PCA
# -------------------------------
# :class:`~decomposition.PCA` has a new solver, `"covariance_eigh"`, which is
# up to an order of magnitude faster and more memory efficient than the other
# solvers for datasets with many data points and few features.
from sklearn.datasets import make_low_rank_matrix
from sklearn.decomposition import PCA

X = make_low_rank_matrix(
    n_samples=10_000, n_features=100, tail_strength=0.1, random_state=0
)

pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2f}")

# %%
# The new solver also accepts sparse input data:
from scipy.sparse import random

X = random(10_000, 100, format="csr", random_state=0)

pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2f}")

# %%
# The `"full"` solver has also been improved to use less memory and to
# transform faster. The default `svd_solver="auto"` option takes advantage of
# the new solver and is now able to select an appropriate solver for sparse
# datasets.
#
# Similarly to most other PCA solvers, the new `"covariance_eigh"` solver can
# leverage GPU computation if the input data is passed as a PyTorch or CuPy
# array by enabling the experimental support for :ref:`Array API <array_api>`.
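#
# A minimal sketch of that Array API path follows (it assumes the optional
# `array_api_compat` and PyTorch installations and is skipped otherwise):
import sklearn

try:
    import torch

    sklearn.set_config(array_api_dispatch=True)
    # Densify the sparse matrix: the Array API path works on dense arrays, and
    # a CUDA tensor could be used here instead to run the solver on GPU.
    X_torch = torch.asarray(X.toarray(), dtype=torch.float32)
    pca_torch = PCA(n_components=10, svd_solver="covariance_eigh").fit(X_torch)
    ratio = float(pca_torch.explained_variance_ratio_.sum())
    print(f"Explained variance (torch input): {ratio:.2f}")
except ImportError:
    print("PyTorch and/or array_api_compat not installed: skipping this demo")
finally:
    sklearn.set_config(array_api_dispatch=False)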

# %%
# ColumnTransformer is subscriptable