Commit 726e913

Pushing the docs to dev/ for branch: main, commit b98dc797c480b1b9495f918e201d45ee07f29feb
1 parent 4978024 commit 726e913

1,580 files changed: +6501 / -6417 lines changed

6 binary files changed (contents not shown).

dev/_downloads/067cd5d39b097d2c49dd98f563dac13a/plot_iterative_imputer_variants_comparison.ipynb (+1 / -1)
@@ -15,7 +15,7 @@
  },
  "outputs": [],
  "source": [
- "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.datasets import fetch_california_housing\nfrom sklearn.ensemble import RandomForestRegressor\n\n# To use this experimental feature, we need to explicitly ask for it:\nfrom sklearn.experimental import enable_iterative_imputer # noqa\nfrom sklearn.impute import IterativeImputer, SimpleImputer\nfrom sklearn.kernel_approximation import Nystroem\nfrom sklearn.linear_model import BayesianRidge, Ridge\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.neighbors import KNeighborsRegressor\nfrom sklearn.pipeline import make_pipeline\n\nN_SPLITS = 5\n\nrng = np.random.RandomState(0)\n\nX_full, y_full = fetch_california_housing(return_X_y=True)\n# ~2k samples is enough for the purpose of the example.\n# Remove the following two lines for a slower run with different error bars.\nX_full = X_full[::10]\ny_full = y_full[::10]\nn_samples, n_features = X_full.shape\n\n# Estimate the score on the entire dataset, with no missing values\nbr_estimator = BayesianRidge()\nscore_full_data = pd.DataFrame(\n cross_val_score(\n br_estimator, X_full, y_full, scoring=\"neg_mean_squared_error\", cv=N_SPLITS\n ),\n columns=[\"Full Data\"],\n)\n\n# Add a single missing value to each row\nX_missing = X_full.copy()\ny_missing = y_full\nmissing_samples = np.arange(n_samples)\nmissing_features = rng.choice(n_features, n_samples, replace=True)\nX_missing[missing_samples, missing_features] = np.nan\n\n# Estimate the score after imputation (mean and median strategies)\nscore_simple_imputer = pd.DataFrame()\nfor strategy in (\"mean\", \"median\"):\n estimator = make_pipeline(\n SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator\n )\n score_simple_imputer[strategy] = cross_val_score(\n estimator, X_missing, y_missing, scoring=\"neg_mean_squared_error\", cv=N_SPLITS\n )\n\n# Estimate the score after iterative imputation of the missing values\n# with different estimators\nestimators = [\n BayesianRidge(),\n RandomForestRegressor(\n # We tuned the hyperparameters of the RandomForestRegressor to get a good\n # enough predictive performance for a restricted execution time.\n n_estimators=4,\n max_depth=10,\n bootstrap=True,\n max_samples=0.5,\n n_jobs=2,\n random_state=0,\n ),\n make_pipeline(\n Nystroem(kernel=\"polynomial\", degree=2, random_state=0), Ridge(alpha=1e3)\n ),\n KNeighborsRegressor(n_neighbors=15),\n]\nscore_iterative_imputer = pd.DataFrame()\n# iterative imputer is sensible to the tolerance and\n# dependent on the estimator used internally.\n# we tuned the tolerance to keep this example run with limited computational\n# resources while not changing the results too much compared to keeping the\n# stricter default value for the tolerance parameter.\ntolerances = (1e-3, 1e-1, 1e-1, 1e-2)\nfor impute_estimator, tol in zip(estimators, tolerances):\n estimator = make_pipeline(\n IterativeImputer(\n random_state=0, estimator=impute_estimator, max_iter=25, tol=tol\n ),\n br_estimator,\n )\n score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score(\n estimator, X_missing, y_missing, scoring=\"neg_mean_squared_error\", cv=N_SPLITS\n )\n\nscores = pd.concat(\n [score_full_data, score_simple_imputer, score_iterative_imputer],\n keys=[\"Original\", \"SimpleImputer\", \"IterativeImputer\"],\n axis=1,\n)\n\n# plot california housing results\nfig, ax = plt.subplots(figsize=(13, 6))\nmeans = scores.mean()\nerrors = scores.std()\nmeans.plot.barh(xerr=errors, ax=ax)\nax.set_title(\"California Housing Regression with Different Imputation Methods\")\nax.set_xlabel(\"MSE (smaller is better)\")\nax.set_yticks(np.arange(means.shape[0]))\nax.set_yticklabels([\" w/ \".join(label) for label in means.index.tolist()])\nplt.tight_layout(pad=1)\nplt.show()"
+ "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.datasets import fetch_california_housing\nfrom sklearn.ensemble import RandomForestRegressor\n\n# To use this experimental feature, we need to explicitly ask for it:\nfrom sklearn.experimental import enable_iterative_imputer # noqa: F401\nfrom sklearn.impute import IterativeImputer, SimpleImputer\nfrom sklearn.kernel_approximation import Nystroem\nfrom sklearn.linear_model import BayesianRidge, Ridge\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.neighbors import KNeighborsRegressor\nfrom sklearn.pipeline import make_pipeline\n\nN_SPLITS = 5\n\nrng = np.random.RandomState(0)\n\nX_full, y_full = fetch_california_housing(return_X_y=True)\n# ~2k samples is enough for the purpose of the example.\n# Remove the following two lines for a slower run with different error bars.\nX_full = X_full[::10]\ny_full = y_full[::10]\nn_samples, n_features = X_full.shape\n\n# Estimate the score on the entire dataset, with no missing values\nbr_estimator = BayesianRidge()\nscore_full_data = pd.DataFrame(\n cross_val_score(\n br_estimator, X_full, y_full, scoring=\"neg_mean_squared_error\", cv=N_SPLITS\n ),\n columns=[\"Full Data\"],\n)\n\n# Add a single missing value to each row\nX_missing = X_full.copy()\ny_missing = y_full\nmissing_samples = np.arange(n_samples)\nmissing_features = rng.choice(n_features, n_samples, replace=True)\nX_missing[missing_samples, missing_features] = np.nan\n\n# Estimate the score after imputation (mean and median strategies)\nscore_simple_imputer = pd.DataFrame()\nfor strategy in (\"mean\", \"median\"):\n estimator = make_pipeline(\n SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator\n )\n score_simple_imputer[strategy] = cross_val_score(\n estimator, X_missing, y_missing, scoring=\"neg_mean_squared_error\", cv=N_SPLITS\n )\n\n# Estimate the score after iterative imputation of the missing values\n# with different estimators\nestimators = [\n BayesianRidge(),\n RandomForestRegressor(\n # We tuned the hyperparameters of the RandomForestRegressor to get a good\n # enough predictive performance for a restricted execution time.\n n_estimators=4,\n max_depth=10,\n bootstrap=True,\n max_samples=0.5,\n n_jobs=2,\n random_state=0,\n ),\n make_pipeline(\n Nystroem(kernel=\"polynomial\", degree=2, random_state=0), Ridge(alpha=1e3)\n ),\n KNeighborsRegressor(n_neighbors=15),\n]\nscore_iterative_imputer = pd.DataFrame()\n# iterative imputer is sensible to the tolerance and\n# dependent on the estimator used internally.\n# we tuned the tolerance to keep this example run with limited computational\n# resources while not changing the results too much compared to keeping the\n# stricter default value for the tolerance parameter.\ntolerances = (1e-3, 1e-1, 1e-1, 1e-2)\nfor impute_estimator, tol in zip(estimators, tolerances):\n estimator = make_pipeline(\n IterativeImputer(\n random_state=0, estimator=impute_estimator, max_iter=25, tol=tol\n ),\n br_estimator,\n )\n score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score(\n estimator, X_missing, y_missing, scoring=\"neg_mean_squared_error\", cv=N_SPLITS\n )\n\nscores = pd.concat(\n [score_full_data, score_simple_imputer, score_iterative_imputer],\n keys=[\"Original\", \"SimpleImputer\", \"IterativeImputer\"],\n axis=1,\n)\n\n# plot california housing results\nfig, ax = plt.subplots(figsize=(13, 6))\nmeans = scores.mean()\nerrors = scores.std()\nmeans.plot.barh(xerr=errors, ax=ax)\nax.set_title(\"California Housing Regression with Different Imputation Methods\")\nax.set_xlabel(\"MSE (smaller is better)\")\nax.set_yticks(np.arange(means.shape[0]))\nax.set_yticklabels([\" w/ \".join(label) for label in means.index.tolist()])\nplt.tight_layout(pad=1)\nplt.show()"
  ]
 }
],
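The only change in this cell's source is the lint directive on the experimental import: the bare `# noqa` becomes the scoped `# noqa: F401` ("imported but unused"). A minimal, self-contained sketch of the enabling-import pattern it refers to; the toy array and default imputer settings are illustrative assumptions, not taken from the example above.

import numpy as np

# IterativeImputer is still experimental, so the enabling module must be
# imported first; the imported name is never used directly, hence F401.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

X = np.array([[1.0, 2.0], [3.0, np.nan], [np.nan, 6.0]])
imputer = IterativeImputer(random_state=0)
print(imputer.fit_transform(X))  # NaN entries replaced by model-based estimates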
1 binary file changed (contents not shown).

dev/_downloads/0785ea6d45bde062e5beedda88131215/plot_release_highlights_1_3_0.ipynb (+5 / -5)
@@ -29,7 +29,7 @@
  },
  "outputs": [],
  "source": [
- "import numpy as np\nfrom sklearn.cluster import HDBSCAN\nfrom sklearn.datasets import load_digits\nfrom sklearn.metrics import v_measure_score\n\nX, true_labels = load_digits(return_X_y=True)\nprint(f\"number of digits: {len(np.unique(true_labels))}\")\n\nhdbscan = HDBSCAN(min_cluster_size=15).fit(X)\nnon_noisy_labels = hdbscan.labels_[hdbscan.labels_ != -1]\nprint(f\"number of clusters found: {len(np.unique(non_noisy_labels))}\")\n\nprint(v_measure_score(true_labels[hdbscan.labels_ != -1], non_noisy_labels))"
+ "import numpy as np\n\nfrom sklearn.cluster import HDBSCAN\nfrom sklearn.datasets import load_digits\nfrom sklearn.metrics import v_measure_score\n\nX, true_labels = load_digits(return_X_y=True)\nprint(f\"number of digits: {len(np.unique(true_labels))}\")\n\nhdbscan = HDBSCAN(min_cluster_size=15).fit(X)\nnon_noisy_labels = hdbscan.labels_[hdbscan.labels_ != -1]\nprint(f\"number of clusters found: {len(np.unique(non_noisy_labels))}\")\n\nprint(v_measure_score(true_labels[hdbscan.labels_ != -1], non_noisy_labels))"
  ]
 },
 {
@@ -47,7 +47,7 @@
  },
  "outputs": [],
  "source": [
- "import numpy as np\nfrom sklearn.preprocessing import TargetEncoder\n\nX = np.array([[\"cat\"] * 30 + [\"dog\"] * 20 + [\"snake\"] * 38], dtype=object).T\ny = [90.3] * 30 + [20.4] * 20 + [21.2] * 38\n\nenc = TargetEncoder(random_state=0)\nX_trans = enc.fit_transform(X, y)\n\nenc.encodings_"
+ "import numpy as np\n\nfrom sklearn.preprocessing import TargetEncoder\n\nX = np.array([[\"cat\"] * 30 + [\"dog\"] * 20 + [\"snake\"] * 38], dtype=object).T\ny = [90.3] * 30 + [20.4] * 20 + [21.2] * 38\n\nenc = TargetEncoder(random_state=0)\nX_trans = enc.fit_transform(X, y)\n\nenc.encodings_"
  ]
 },
 {
@@ -65,7 +65,7 @@
  },
  "outputs": [],
  "source": [
- "import numpy as np\nfrom sklearn.tree import DecisionTreeClassifier\n\nX = np.array([0, 1, 6, np.nan]).reshape(-1, 1)\ny = [0, 0, 1, 1]\n\ntree = DecisionTreeClassifier(random_state=0).fit(X, y)\ntree.predict(X)"
+ "import numpy as np\n\nfrom sklearn.tree import DecisionTreeClassifier\n\nX = np.array([0, 1, 6, np.nan]).reshape(-1, 1)\ny = [0, 0, 1, 1]\n\ntree = DecisionTreeClassifier(random_state=0).fit(X, y)\ntree.predict(X)"
  ]
 },
 {
@@ -101,7 +101,7 @@
  },
  "outputs": [],
  "source": [
- "import numpy as np\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.datasets import make_low_rank_matrix\nfrom sklearn.ensemble import HistGradientBoostingRegressor\n\nn_samples, n_features = 500, 10\nrng = np.random.RandomState(0)\nX = make_low_rank_matrix(n_samples, n_features, random_state=rng)\ncoef = rng.uniform(low=-10, high=20, size=n_features)\ny = rng.gamma(shape=2, scale=np.exp(X @ coef) / 2)\ngbdt = HistGradientBoostingRegressor(loss=\"gamma\")\ncross_val_score(gbdt, X, y).mean()"
+ "import numpy as np\n\nfrom sklearn.datasets import make_low_rank_matrix\nfrom sklearn.ensemble import HistGradientBoostingRegressor\nfrom sklearn.model_selection import cross_val_score\n\nn_samples, n_features = 500, 10\nrng = np.random.RandomState(0)\nX = make_low_rank_matrix(n_samples, n_features, random_state=rng)\ncoef = rng.uniform(low=-10, high=20, size=n_features)\ny = rng.gamma(shape=2, scale=np.exp(X @ coef) / 2)\ngbdt = HistGradientBoostingRegressor(loss=\"gamma\")\ncross_val_score(gbdt, X, y).mean()"
  ]
 },
 {
@@ -119,7 +119,7 @@
  },
  "outputs": [],
  "source": [
- "from sklearn.preprocessing import OrdinalEncoder\nimport numpy as np\n\nX = np.array(\n [[\"dog\"] * 5 + [\"cat\"] * 20 + [\"rabbit\"] * 10 + [\"snake\"] * 3], dtype=object\n).T\nenc = OrdinalEncoder(min_frequency=6).fit(X)\nenc.infrequent_categories_"
+ "import numpy as np\n\nfrom sklearn.preprocessing import OrdinalEncoder\n\nX = np.array(\n [[\"dog\"] * 5 + [\"cat\"] * 20 + [\"rabbit\"] * 10 + [\"snake\"] * 3], dtype=object\n).T\nenc = OrdinalEncoder(min_frequency=6).fit(X)\nenc.infrequent_categories_"
  ]
 }
],
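All five cells in this notebook change the same way: `import numpy as np` is separated from the scikit-learn imports by a blank line, and the scikit-learn imports are sorted alphabetically. A minimal sketch of that grouping convention follows; the module choices and the final print are illustrative only, and the exact lint tooling enforcing the order is an assumption.

# General third-party imports first, then scikit-learn's own imports,
# alphabetized within each group and separated by a blank line.
import numpy as np

from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler

# The grouping only affects the import block; the code below is unchanged.
X, y = load_digits(return_X_y=True)
print(np.unique(y).size, StandardScaler().fit_transform(X).shape)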
9 binary files changed (contents not shown).

dev/_downloads/138e7c706c17949c3098ff8074b03ce7/plot_release_highlights_1_2_0.py (+6 / -4)
@@ -1,4 +1,4 @@
-# ruff: noqa
+# ruff: noqa: CPY001, E501
 """
 =======================================
 Release Highlights for scikit-learn 1.2
@@ -31,9 +31,10 @@
 # (some examples) <https://youtu.be/5bCg8VfX2x8>`__.

 import numpy as np
-from sklearn.datasets import load_iris
-from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
+
 from sklearn.compose import ColumnTransformer
+from sklearn.datasets import load_iris
+from sklearn.preprocessing import KBinsDiscretizer, StandardScaler

 X, y = load_iris(as_frame=True, return_X_y=True)
 sepal_cols = ["sepal length (cm)", "sepal width (cm)"]
@@ -78,6 +79,7 @@
 # :class:`~metrics.PredictionErrorDisplay` provides a way to analyze regression
 # models in a qualitative manner.
 import matplotlib.pyplot as plt
+
 from sklearn.metrics import PredictionErrorDisplay

 fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
@@ -109,8 +111,8 @@
 X = X.select_dtypes(["number", "category"]).drop(columns=["body"])

 # %%
-from sklearn.preprocessing import OrdinalEncoder
 from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OrdinalEncoder

 categorical_features = ["pclass", "sex", "embarked"]
 model = make_pipeline(
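The last hunk stops at `model = make_pipeline(`, so the rest of the pipeline is not visible here. A hedged sketch of how such a pipeline could be completed is shown below; the encoder options and the downstream HistGradientBoostingClassifier are assumptions for illustration, not taken from the example file.

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

categorical_features = ["pclass", "sex", "embarked"]

# Encode only the categorical columns, pass numeric columns through, and feed
# the result to a gradient-boosted tree model (the estimator here is assumed).
model = make_pipeline(
    ColumnTransformer(
        [
            (
                "categorical",
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
                categorical_features,
            )
        ],
        remainder="passthrough",
    ),
    HistGradientBoostingClassifier(random_state=0),
)
print(model)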
7 binary files changed (contents not shown).

dev/_downloads/1b3f17ff0f112d5b77cbdb90f1c17046/plot_set_output.py (+1 / -1)
@@ -10,7 +10,7 @@
 the `set_output` method or globally by setting `set_config(transform_output="pandas")`.
 For details, see
 `SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
-""" # noqa
+""" # noqa: CPY001

 # %%
 # First, we load the iris dataset as a DataFrame to demonstrate the `set_output` API.
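The docstring touched by this hunk points to the two ways SLEP018 exposes pandas output. A minimal runnable sketch of both follows; the StandardScaler transformer is an illustrative choice, not taken from this file.

from sklearn import set_config
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

X, _ = load_iris(as_frame=True, return_X_y=True)

# Per-estimator: ask a single transformer for DataFrame output.
scaler = StandardScaler().set_output(transform="pandas")
print(type(scaler.fit_transform(X)))  # pandas DataFrame

# Global: every transformer returns a DataFrame until the setting is reset.
set_config(transform_output="pandas")
print(type(StandardScaler().fit_transform(X)))  # pandas DataFrame
set_config(transform_output="default")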
13 binary files changed (contents not shown).

dev/_downloads/23fb33f64b3c23edf25165a3a4f04237/plot_successive_halving_iterations.ipynb (+1 / -1)
@@ -15,7 +15,7 @@
  },
  "outputs": [],
  "source": [
- "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nfrom scipy.stats import randint\n\nfrom sklearn import datasets\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.experimental import enable_halving_search_cv # noqa\nfrom sklearn.model_selection import HalvingRandomSearchCV"
+ "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nfrom scipy.stats import randint\n\nfrom sklearn import datasets\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.experimental import enable_halving_search_cv # noqa: F401\nfrom sklearn.model_selection import HalvingRandomSearchCV"
  ]
 },
 {
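Here again the change is the scoped `# noqa: F401` on the experimental enabling import. A minimal sketch of how the imports in that cell fit together is below; the synthetic data and the parameter distributions are assumptions, not taken from the example.

from scipy.stats import randint

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

# Successive halving is experimental; importing this module enables it.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV

X, y = datasets.make_classification(n_samples=400, random_state=0)
param_distributions = {
    "max_depth": [3, None],
    "min_samples_split": randint(2, 11),
}
search = HalvingRandomSearchCV(
    RandomForestClassifier(n_estimators=10, random_state=0),
    param_distributions,
    factor=2,
    random_state=0,
).fit(X, y)
print(search.best_params_)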
12 binary files changed (contents not shown).
