Commit 2c8fe5d

Pushing the docs to 1.1/ for branch: 1.1.X, commit 0822851f5cb17827939a7d7b4f8c84f43184ae89
1 parent 0c53fba commit 2c8fe5d

File tree

3,837 files changed: +830,436 / -0 lines changed


1.1/.buildinfo

+4
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 44afcf8dd215cc5d065a44ea3a818dd0
tags: 645f666f9bcd5a90fca523b33c5a78b7
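As a rough illustration only (this is not Sphinx's actual code), the `config` value above is a fingerprint of the build configuration: when the stored hash no longer matches the current configuration, a full rebuild is triggered. The sketch below shows one way such a fingerprint could be computed; the `relevant_config` dictionary and its values are made up for the example.

# Illustrative sketch: hash a configuration dict so that any change to it
# yields a different fingerprint, signalling that a full rebuild is needed.
import hashlib

relevant_config = {"html_theme": "scikit-learn-modern", "language": "en"}  # hypothetical values
fingerprint = hashlib.md5(
    repr(sorted(relevant_config.items())).encode("utf-8")
).hexdigest()
print(fingerprint)  # a 32-character hex digest, like the 'config:' line above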
@@ -0,0 +1,72 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Segmenting the picture of Greek coins in regions\n\nThis example uses `spectral_clustering` on a graph created from\nvoxel-to-voxel difference on an image to break this image into multiple\npartly-homogeneous regions.\n\nThis procedure (spectral clustering on an image) is an efficient\napproximate solution for finding normalized graph cuts.\n\nThere are three options to assign labels:\n\n* 'kmeans' spectral clustering clusters samples in the embedding space\n  using a kmeans algorithm\n* 'discretize' iteratively searches for the closest partition\n  space to the embedding space of spectral clustering.\n* 'cluster_qr' assigns labels using the QR factorization with pivoting\n  that directly determines the partition in the embedding space.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Gael Varoquaux <[email protected]>\n# Brian Cheung\n# Andrew Knyazev <[email protected]>\n# License: BSD 3 clause\n\nimport time\n\nimport numpy as np\nfrom scipy.ndimage import gaussian_filter\nimport matplotlib.pyplot as plt\nfrom skimage.data import coins\nfrom skimage.transform import rescale\n\nfrom sklearn.feature_extraction import image\nfrom sklearn.cluster import spectral_clustering\n\n\n# load the coins as a numpy array\norig_coins = coins()\n\n# Resize it to 20% of the original size to speed up the processing\n# Applying a Gaussian filter for smoothing prior to down-scaling\n# reduces aliasing artifacts.\nsmoothened_coins = gaussian_filter(orig_coins, sigma=2)\nrescaled_coins = rescale(\n    smoothened_coins, 0.2, mode=\"reflect\", anti_aliasing=False, multichannel=False\n)\n\n# Convert the image into a graph with the value of the gradient on the\n# edges.\ngraph = image.img_to_graph(rescaled_coins)\n\n# Take a decreasing function of the gradient: an exponential\n# The smaller beta is, the more independent the segmentation is of the\n# actual image. For beta=1, the segmentation is close to a Voronoi\nbeta = 10\neps = 1e-6\ngraph.data = np.exp(-beta * graph.data / graph.data.std()) + eps\n\n# The number of segmented regions to display needs to be chosen manually.\n# The current version of 'spectral_clustering' does not support determining\n# the number of good quality clusters automatically.\nn_regions = 26"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Compute and visualize the resulting regions\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Computing a few extra eigenvectors may speed up the eigen_solver.\n# The spectral clustering quality may also benefit from requesting\n# extra regions for segmentation.\nn_regions_plus = 3\n\n# Apply spectral clustering using the default eigen_solver='arpack'.\n# Any implemented solver can be used: eigen_solver='arpack', 'lobpcg', or 'amg'.\n# Choosing eigen_solver='amg' requires an extra package called 'pyamg'.\n# The quality of segmentation and the speed of calculations is mostly determined\n# by the choice of the solver and the value of the tolerance 'eigen_tol'.\n# TODO: varying eigen_tol seems to have no effect for 'lobpcg' and 'amg' #21243.\nfor assign_labels in (\"kmeans\", \"discretize\", \"cluster_qr\"):\n    t0 = time.time()\n    labels = spectral_clustering(\n        graph,\n        n_clusters=(n_regions + n_regions_plus),\n        eigen_tol=1e-7,\n        assign_labels=assign_labels,\n        random_state=42,\n    )\n\n    t1 = time.time()\n    labels = labels.reshape(rescaled_coins.shape)\n    plt.figure(figsize=(5, 5))\n    plt.imshow(rescaled_coins, cmap=plt.cm.gray)\n\n    plt.xticks(())\n    plt.yticks(())\n    title = \"Spectral clustering: %s, %.2fs\" % (assign_labels, (t1 - t0))\n    print(title)\n    plt.title(title)\n    for l in range(n_regions):\n        colors = [plt.cm.nipy_spectral((l + 4) / float(n_regions + 4))]\n        plt.contour(labels == l, colors=colors)\n    # To view individual segments as they appear, add plt.pause(0.5) here\nplt.show()\n\n# TODO: After #21194 is merged and #21243 is fixed, check which eigen_solver\n# is the best and set eigen_solver='arpack', 'lobpcg', or 'amg' and eigen_tol\n# explicitly in this example."
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
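The notebook's markdown cell lists three label assignment strategies for spectral clustering. As a minimal sketch, not part of the committed notebook, the same affinity matrix can be clustered with each strategy on a tiny synthetic problem; the blob data and the RBF affinity are illustrative choices, and 'cluster_qr' requires scikit-learn 1.1 or later.

# Minimal sketch: compare the three assign_labels strategies on one affinity matrix.
import numpy as np
from sklearn.cluster import spectral_clustering
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel

X, _ = make_blobs(n_samples=60, centers=2, random_state=0)
affinity = rbf_kernel(X, gamma=1.0)  # dense affinity matrices are accepted too

for assign_labels in ("kmeans", "discretize", "cluster_qr"):
    labels = spectral_clustering(
        affinity, n_clusters=2, assign_labels=assign_labels, random_state=42
    )
    print(assign_labels, np.bincount(labels))  # cluster sizes per strategy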
@@ -0,0 +1,87 @@
"""
=======================================================================
Plot the decision surface of decision trees trained on the iris dataset
=======================================================================

Plot the decision surface of a decision tree trained on pairs
of features of the iris dataset.

See :ref:`decision tree <tree>` for more information on the estimator.

For each pair of iris features, the decision tree learns decision
boundaries made of combinations of simple thresholding rules inferred from
the training samples.

We also show the tree structure of a model built on all of the features.
"""
# %%
# First load the copy of the Iris dataset shipped with scikit-learn:
from sklearn.datasets import load_iris

iris = load_iris()


# %%
# Display the decision functions of trees trained on all pairs of features.
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import DecisionBoundaryDisplay


# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02


for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    ax = plt.subplot(2, 3, pairidx + 1)
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=plt.cm.RdYlBu,
        response_method="predict",
        ax=ax,
        xlabel=iris.feature_names[pair[0]],
        ylabel=iris.feature_names[pair[1]],
    )

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(
            X[idx, 0],
            X[idx, 1],
            c=color,
            label=iris.target_names[i],
            cmap=plt.cm.RdYlBu,
            edgecolor="black",
            s=15,
        )

plt.suptitle("Decision surface of decision trees trained on pairs of features")
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
_ = plt.axis("tight")

# %%
# Display the structure of a single decision tree trained on all the features
# together.
from sklearn.tree import plot_tree

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.title("Decision tree trained on all the iris features")
plt.show()
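The docstring above says each tree's decision boundaries are combinations of simple thresholding rules. A short sketch, not part of the committed example, prints those learned thresholds for one feature pair using sklearn.tree.export_text; the pair and max_depth are illustrative choices.

# Minimal sketch: print the thresholding rules learned for a single feature pair.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

iris = load_iris()
pair = [2, 3]  # petal length and petal width, one of the pairs plotted above
clf = DecisionTreeClassifier(max_depth=2).fit(iris.data[:, pair], iris.target)

# Each line of the report is a rule of the form "feature <= threshold".
print(export_text(clf, feature_names=[iris.feature_names[i] for i in pair]))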
@@ -0,0 +1,119 @@
"""
=============================================================
Receiver Operating Characteristic (ROC) with cross validation
=============================================================

Example of Receiver Operating Characteristic (ROC) metric to evaluate
classifier output quality using cross-validation.

ROC curves typically feature true positive rate on the Y axis, and false
positive rate on the X axis. This means that the top left corner of the plot is
the "ideal" point - a false positive rate of zero, and a true positive rate of
one. This is not very realistic, but it does mean that a larger area under the
curve (AUC) is usually better.

The "steepness" of ROC curves is also important, since it is ideal to maximize
the true positive rate while minimizing the false positive rate.

This example shows the ROC response of different datasets, created from K-fold
cross-validation. Taking all of these curves, it is possible to calculate the
mean area under curve, and see the variance of the curve when the
training set is split into different subsets. This roughly shows how the
classifier output is affected by changes in the training data, and how
different the splits generated by K-fold cross-validation are from one another.

.. note::

    See also :func:`sklearn.metrics.roc_auc_score`,
    :func:`sklearn.model_selection.cross_val_score`,
    :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py`,

"""

# %%
# Data IO and generation
# ----------------------
import numpy as np

from sklearn import datasets

# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# Add noisy features
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# %%
# Classification and ROC analysis
# -------------------------------
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold

# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
    title="Receiver operating characteristic example",
)
ax.legend(loc="lower right")
plt.show()
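The docstring's note points to roc_auc_score and cross_val_score. As a hedged complement, not part of the committed file, per-fold AUCs for a comparable classifier and splitter can also be obtained directly with cross_val_score and scoring="roc_auc", without drawing the curves; the setup below omits the noisy features, so the scores will be near perfect.

# Minimal sketch: per-fold ROC AUC via cross_val_score on the two-class iris subset.
from sklearn import datasets, svm
from sklearn.model_selection import StratifiedKFold, cross_val_score

iris = datasets.load_iris()
X, y = iris.data[iris.target != 2], iris.target[iris.target != 2]

clf = svm.SVC(kernel="linear", probability=True, random_state=0)
fold_aucs = cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=6), scoring="roc_auc")
print(fold_aucs, fold_aucs.mean(), fold_aucs.std())  # one AUC per fold, then mean and std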
@@ -0,0 +1,62 @@
"""
=================================================
Concatenating multiple feature extraction methods
=================================================

In many real-world examples, there are many ways to extract features from a
dataset. Often it is beneficial to combine several methods to obtain good
performance. This example shows how to use ``FeatureUnion`` to combine
features obtained by PCA and univariate selection.

Combining features using this transformer has the benefit that it allows
cross validation and grid searches over the whole process.

The combination used in this example is not particularly helpful on this
dataset and is only used to illustrate the usage of FeatureUnion.

"""

# Author: Andreas Mueller <[email protected]>
#
# License: BSD 3 clause

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

iris = load_iris()

X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features were good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:

combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:

pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = dict(
    features__pca__n_components=[1, 2, 3],
    features__univ_select__k=[1, 2],
    svm__C=[0.1, 1, 10],
)

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
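As the docstring notes, FeatureUnion concatenates the features produced by each transformer. A short sketch, not part of the committed example, verifies that the union's output is simply the column-wise stack of the PCA and SelectKBest outputs when each is fit on the same data.

# Minimal check: FeatureUnion output equals the horizontal stack of its transformers' outputs.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion

X, y = load_iris(return_X_y=True)
union = FeatureUnion([("pca", PCA(n_components=2)), ("univ_select", SelectKBest(k=1))])
combined = union.fit_transform(X, y)
manual = np.hstack([
    PCA(n_components=2).fit_transform(X),
    SelectKBest(k=1).fit_transform(X, y),
])
print(combined.shape)                 # (150, 3): 2 PCA components + 1 selected feature
print(np.allclose(combined, manual))  # expected True, since both are fit on the same data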
