122 | 122 | print(f"Done in {toc_bwd - tic_bwd:.3f}s")
123 | 123 |
124 | 124 | # %%
125 |     | -# Discussion
126 |     | -# ----------
127 |     | -#
128 | 125 | # Interestingly, forward and backward selection have selected the same set of
129 | 126 | # features. In general, this isn't the case and the two methods would lead to
130 | 127 | # different results.

145 | 142 | # attribute. The forward SFS is faster than the backward SFS because it only
146 | 143 | # needs to perform `n_features_to_select = 2` iterations, while the backward
147 | 144 | # SFS needs to perform `n_features - n_features_to_select = 8` iterations.
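    |     | +#
    |     | +# As a rough, illustrative sketch (an aside, not part of the original
    |     | +# example), we can count the candidate models each direction fits per CV
    |     | +# split: forward selection scores every remaining feature at each of its
    |     | +# 2 iterations, backward selection at each of its 8::
    |     | +#
    |     | +#     n_features, n_to_select = 10, 2
    |     | +#     forward_fits = sum(n_features - i for i in range(n_to_select))  # 10 + 9 = 19
    |     | +#     backward_fits = sum(n_features - i for i in range(n_features - n_to_select))  # 10 + ... + 3 = 52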
    | 145 | +#
    | 146 | +# Using negative tolerance values
    | 147 | +# -------------------------------
    | 148 | +#
    | 149 | +# :class:`~sklearn.feature_selection.SequentialFeatureSelector` can also be
    | 150 | +# used to remove features from the dataset and return a smaller subset of
    | 151 | +# the original features by setting `direction="backward"` together with a
    | 152 | +# negative value of `tol`.
    | 153 | +#
    | 154 | +# We begin by loading the Breast Cancer dataset, consisting of 30 different
    | 155 | +# features and 569 samples.
    | 156 | +import numpy as np
    | 157 | +
    | 158 | +from sklearn.datasets import load_breast_cancer
    | 159 | +
    | 160 | +breast_cancer_data = load_breast_cancer()
    | 161 | +X, y = breast_cancer_data.data, breast_cancer_data.target
    | 162 | +feature_names = np.array(breast_cancer_data.feature_names)
    | 163 | +print(breast_cancer_data.DESCR)
    | 164 | +
    | 165 | +# %%
    | 166 | +# We will use the :class:`~sklearn.linear_model.LogisticRegression`
    | 167 | +# estimator with :class:`~sklearn.feature_selection.SequentialFeatureSelector`
    | 168 | +# to perform the feature selection.
    | 169 | +from sklearn.linear_model import LogisticRegression
    | 170 | +from sklearn.metrics import roc_auc_score
    | 171 | +from sklearn.pipeline import make_pipeline
    | 172 | +from sklearn.preprocessing import StandardScaler
    | 173 | +
    | 174 | +for tol in [-1e-2, -1e-3, -1e-4]:
    | 175 | +    start = time()
    | 176 | +    feature_selector = SequentialFeatureSelector(
    | 177 | +        LogisticRegression(),
    | 178 | +        n_features_to_select="auto",
    | 179 | +        direction="backward",
    | 180 | +        scoring="roc_auc",
    | 181 | +        tol=tol,
    | 182 | +        n_jobs=2,
    | 183 | +    )
    | 184 | +    model = make_pipeline(StandardScaler(), feature_selector, LogisticRegression())
    | 185 | +    model.fit(X, y)
    | 186 | +    end = time()
    | 187 | +    print(f"\ntol: {tol}")
    | 188 | +    print(f"Features selected: {feature_names[model[1].get_support()]}")
    | 189 | +    print(f"ROC AUC score: {roc_auc_score(y, model.predict_proba(X)[:, 1]):.3f}")
    | 190 | +    print(f"Done in {end - start:.3f}s")
    | 191 | +
    | 192 | +# %%
    | 193 | +# We can see that the number of features selected tends to increase as
    | 194 | +# negative values of `tol` approach zero. The time taken for feature
    | 195 | +# selection also decreases as `tol` gets closer to zero.
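    | 196 | +
    | 197 | +# %%
    | 198 | +# As an illustrative extension (a sketch, not part of the original example):
    | 199 | +# the ROC AUC above is computed on the same data used to fit the model, so it
    | 200 | +# is an optimistic estimate. Cross-validating the full pipeline for a single
    | 201 | +# choice of `tol` gives a less biased picture.
    | 202 | +from sklearn.model_selection import cross_val_score
    | 203 | +
    | 204 | +pipeline = make_pipeline(
    | 205 | +    StandardScaler(),
    | 206 | +    SequentialFeatureSelector(
    | 207 | +        LogisticRegression(),
    | 208 | +        n_features_to_select="auto",
    | 209 | +        direction="backward",
    | 210 | +        scoring="roc_auc",
    | 211 | +        tol=-1e-3,
    | 212 | +        n_jobs=2,
    | 213 | +    ),
    | 214 | +    LogisticRegression(),
    | 215 | +)
    | 216 | +cv_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=5, n_jobs=2)
    | 217 | +print(f"Mean CV ROC AUC: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")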