"""
Astronomy Tutorial: exercise 3
Dimensionality reduction of stellar spectra
Usage: python exercise_03.py datadir [-m method] [-k n_neighbors]
                                     [-N norm_type] [-n n_samples]
                                     [-s]

  - datadir is $TUTORIAL_DIR/data/sdss_spectra
    This directory should contain the file spec4000_corrected.npz

  - method is one of [pca | lle | mlle | isomap].  If not specified,
    PCA will be performed.

  - n_neighbors is an integer number of neighbors to use with the
    manifold methods (lle, mlle, isomap).

  - norm_type is one of [none | l1 | l2].  It specifies how the data
    should be normalized.

  - n_samples is the number of samples used for the projection.  Only
    1000 of the 4000 samples are used by default.

  - specifying -s shuffles the data.  This can help test the stability
    of the reconstruction.
Description:
  In this exercise, we explore manifold learning techniques to visualize
  4000 SDSS spectra.  This is a much more exploratory exercise than the
  previous two: the goal is to determine how best to visualize this
  high-dimensional space.  You will apply PCA, LLE, Modified LLE, and
  Isomap under various data normalizations, looking for the best
  visualization of the data, where "best" is a qualitative measure of how
  well the different classes of points are separated in the projected
  space.

  To make experimentation more streamlined, each of these choices can be
  set from the command line using the options described above.

  There are several places in this file with code to be filled in as part
  of the exercise.  Each of these is labeled TODO below.
"""
import os, sys
import numpy as np
import matplotlib.pyplot as pl
from matplotlib import ticker
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding, Isomap
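
# Example invocation (hypothetical paths; point datadir at the directory
# containing spec4000_corrected.npz in your checkout):
#
#   python exercise_03.py $TUTORIAL_DIR/data/sdss_spectra -m isomap -k 10 -N l2 -n 2000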
#----------------------------------------------------------------------
# set up command-line option parser
from optparse import OptionParser
parser = OptionParser(usage=__doc__,
                      version="%prog 1.0")
parser.add_option("-m", "--method",
                  dest="method",
                  default='pca',
                  help="Specify method to use: [pca | lle | mlle | isomap]")
parser.add_option("-k", "--neighbors",
                  dest="n_neighbors",
                  type="int",
                  default=15,
                  help='Specify number of neighbors for manifold learning')
parser.add_option("-N", "--normalization",
                  dest="normalization",
                  default="none",
                  help="Specify normalization: [none | l1 | l2]")
parser.add_option("-n", "--n_samples",
                  dest="n_samples",
                  type="int",
                  default=1000,
                  help="Specify number of samples to use, up to 4000 (default 1000)")
parser.add_option("-s", "--shuffle",
                  dest="shuffle",
                  action="store_true",
                  default=False,
                  help="shuffle the data")
options, args = parser.parse_args()
if len(args) == 0:
    parser.error("Must specify a data directory")
elif len(args) > 1:
    parser.error("Must specify a single data directory")
datadir = args[0]
print("data directory: %s" % datadir)
print("  method        = %s" % options.method)
print("  n_neighbors   = %i" % options.n_neighbors)
print("  normalization = %s" % options.normalization)
print("  n_samples     = %i" % options.n_samples)
print("  shuffle       = %s" % options.shuffle)
def three_component_plot(c1, c2, c3, color, labels):
    """Scatter-plot three projected components against each other,
    with points colored by their spectral class."""
    pl.figure(figsize=(8, 8))
    kwargs = dict(s=4, lw=0, c=color, vmin=2, vmax=6)

    ax1 = pl.subplot(221)
    pl.scatter(c1, c2, **kwargs)
    pl.ylabel('component 2')

    ax2 = pl.subplot(223, sharex=ax1)
    pl.scatter(c1, c3, **kwargs)
    pl.xlabel('component 1')
    pl.ylabel('component 3')

    ax3 = pl.subplot(224, sharey=ax2)
    pl.scatter(c2, c3, **kwargs)
    pl.xlabel('component 2')

    for ax in (ax1, ax2, ax3):
        ax.xaxis.set_major_formatter(ticker.NullFormatter())
        ax.yaxis.set_major_formatter(ticker.NullFormatter())

    pl.subplots_adjust(hspace=0.05, wspace=0.05)

    # label the colorbar ticks with class names rather than numeric values
    fmt = ticker.FuncFormatter(lambda i, *args: labels[int(i)])
    pl.colorbar(ticks=range(2, 7), format=fmt,
                cax=pl.axes((0.52, 0.51, 0.02, 0.39)))
    pl.clim(1.5, 6.5)
#----------------------------------------------------------------------
# Load data files
data = np.load(os.path.join(datadir, 'spec4000_corrected.npz'))
X = data['X']
y = data['y']
labels = data['labels']
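
# Optional: report the shapes of the loaded arrays (each row of X is one
# spectrum; y holds the integer class code used to color the plots).
print("loaded X with shape %s and y with shape %s"
      % (str(X.shape), str(y.shape)))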
if options.shuffle:
    i = np.arange(y.shape[0], dtype=int)
    np.random.shuffle(i)
    X = X[i]
    y = y[i]
#----------------------------------------------------------------------
# truncate the data for experimentation
#
# There are 4000 points, which can take a long time to run. By default,
# it is truncated to 1000 samples. This can be changed using the -n
# command-line argument.
X = X[:options.n_samples]
y = y[:options.n_samples]
#----------------------------------------------------------------------
# Normalization:
#
# The results of the dimensionality reduction can depend heavily on the
# data normalization.  Use the -N command-line option to try l1 or l2
# normalization of each spectrum.
if options.normalization.lower() == 'none':
    pass
elif options.normalization.lower() == 'l2':
    X = preprocessing.normalize(X, norm='l2')
elif options.normalization.lower() == 'l1':
    X = preprocessing.normalize(X, norm='l1')
else:
    raise ValueError("Unrecognized normalization: '%s'"
                     % options.normalization)
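
# Optional sanity check (a minimal sketch): after 'l2' normalization every
# spectrum (row of X) should have unit Euclidean norm, and after 'l1' its
# absolute values should sum to 1.  Uncomment to verify:
#
#if options.normalization.lower() == 'l2':
#    assert np.allclose(np.sqrt((X ** 2).sum(axis=1)), 1.0)
#elif options.normalization.lower() == 'l1':
#    assert np.allclose(np.abs(X).sum(axis=1), 1.0)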
#======================================================================
# TODO: compute X_proj for each method.
# In each of the below cases, you should compute a projection of the
# data and store that projection in the matrix X_proj.
# X_proj should have the same number of rows as X, and should have
# at least 3 features.
X_proj = None
if options.method == 'pca':
    print("Performing PCA")
    #{{{ compute a randomized PCA projection of X with n_components >= 3
    rpca = PCA(n_components=3, svd_solver='randomized', random_state=0)
    X_proj = rpca.fit_transform(X)
    #}}}
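    # Optional: the fitted estimator's explained_variance_ratio_ attribute
    # gives the fraction of total variance captured by each component, a
    # useful check on how much structure three components can preserve.
    print("  explained variance ratio: %s" % rpca.explained_variance_ratio_)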
elif options.method == 'lle':
    print("Performing LLE")
    #{{{ compute LLE on X with method='standard' and n_components >= 3
    lle = LocallyLinearEmbedding(n_neighbors=options.n_neighbors,
                                 n_components=3, method='standard')
    X_proj = lle.fit_transform(X)
    #}}}
elif options.method == 'mlle':
    print("Performing MLLE")
    #{{{ compute LLE on X with method='modified' and n_components >= 3
    lle = LocallyLinearEmbedding(n_neighbors=options.n_neighbors,
                                 n_components=3, method='modified')
    X_proj = lle.fit_transform(X)
    #}}}
elif options.method == 'isomap':
    print("Performing Isomap")
    #{{{ compute Isomap on X with n_components >= 3
    iso = Isomap(n_neighbors=options.n_neighbors,
                 n_components=3)
    X_proj = iso.fit_transform(X)
    #}}}
else:
    raise ValueError("Unrecognized method: '%s'" % options.method)
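
# Quick check of the requirements stated in the TODO above: the projection
# should keep one row per input spectrum and provide at least three
# components for the plots below.
assert X_proj.shape[0] == X.shape[0]
assert X_proj.shape[1] >= 3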
three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels)
pl.show()