"""
Astronomy Tutorial: exercise 1
Classification of photometric sources
usage: python exercise_01.py datadir
- datadir is $TUTORIAL_DIR/data/sdss_colors
This directory should contain the files:
- sdssdr6_colors_class_train.npy
- sdssdr6_colors_class.200000.npy
Description:
In the tutorial, we used a Naive Bayes Classifier to separate Quasars
And Stars. In this exercise, we will extend this classification scheme
using Gaussian Mixture Models.
The Gaussian Naive Bayes method starts by fitting an N-dimensional gaussian
distribution to each class of data. When a test point is evaluated, the
relative log-likelihood from each distribution is used to predict the most
likely value. We're going to extend this by fitting a sum of gaussians to
each distribution.
There are several places in this file with code to be filled-in as part of
the exercise. Each of these is labeled TODO below.
"""
import os
import sys

import numpy as np

from sklearn.mixture import GaussianMixture
from sklearn import metrics
try:
    datadir = sys.argv[1]
except IndexError:
    print(__doc__)
    sys.exit(1)
#----------------------------------------------------------------------
# Load data files
train_data = np.load(os.path.join(datadir,
                                  'sdssdr6_colors_class_train.npy'))
test_data = np.load(os.path.join(datadir,
                                 'sdssdr6_colors_class.200000.npy'))
# set the number of training points: using all points leads to a very
# long running time. We'll start with 10000 training points. This
# can be increased if desired.
Ntrain = 10000
#Ntrain = len(train_data)
np.random.seed(0)
np.random.shuffle(train_data)
train_data = train_data[:Ntrain]
#----------------------------------------------------------------------
# Split training data into training and cross-validation sets
N_crossval = Ntrain // 5
# take the cross-validation slice *before* truncating train_data, so that
# the training and cross-validation sets do not overlap
crossval_data = train_data[-N_crossval:]
train_data = train_data[:-N_crossval]
#----------------------------------------------------------------------
# Set up data
#
X_train = np.zeros((train_data.size, 4), dtype=float)
X_train[:, 0] = train_data['u-g']
X_train[:, 1] = train_data['g-r']
X_train[:, 2] = train_data['r-i']
X_train[:, 3] = train_data['i-z']
y_train = (train_data['redshift'] > 0).astype(int)
Ntrain = len(y_train)
X_crossval = np.zeros((crossval_data.size, 4), dtype=float)
X_crossval[:, 0] = crossval_data['u-g']
X_crossval[:, 1] = crossval_data['g-r']
X_crossval[:, 2] = crossval_data['r-i']
X_crossval[:, 3] = crossval_data['i-z']
y_crossval = (crossval_data['redshift'] > 0).astype(int)
Ncrossval = len(y_crossval)
#======================================================================
# Recreating Gaussian Naive Bayes
#
# Here we will use Gaussian Mixture Models to duplicate our Gaussian
# Naive Bayes results from earlier.  You'll create two
# sklearn.mixture.GaussianMixture() classifier instances, named `clf_0`
# and `clf_1`.  Each should be initialized with a single component and
# diagonal covariance (hint: look at the doc string for
# sklearn.mixture.GaussianMixture to see how to set this up).  The
# results should be compared to Gaussian Naive Bayes to check if they're
# correct.
#
# Objects to create:
# - clf_0 : trained on the portion of the training data with y == 0
# - clf_1 : trained on the portion of the training data with y == 1
#{{{ compute clf_0, clf_1
clf_0 = GaussianMixture(n_components=1, covariance_type='diag')
i0 = (y_train == 0)
clf_0.fit(X_train[i0])

clf_1 = GaussianMixture(n_components=1, covariance_type='diag')
i1 = (y_train == 1)
clf_1.fit(X_train[i1])
#}}}
# next we must construct the prior. The prior is the fraction of training
# points of each type.
#
# variables to compute:
# - prior0 : fraction of training points with y == 0
# - prior1 : fraction of training points with y == 1
#{{{ compute prior0, prior1
num0 = i0.sum()
num1 = i1.sum()
prior0 = num0 / float(Ntrain)
prior1 = num1 / float(Ntrain)
#}}}
# Now we use the priors and the fitted models to compute the log-likelihoods
# of the cross-validation points.  The per-sample log-likelihood is given by
#
#   logL(x) = clf.score_samples(x) + log(prior)
#
# You can use the function np.log() to compute the logarithm of the prior.
# variables to compute:
# logL : array, shape = (2, Ncrossval)
# logL[0] is the log-likelihood for y == 0
# logL[1] is the log-likelihood for y == 1
logL = None
#{{{ compute logL
logL = np.zeros((2, Ncrossval))
logL[0] = clf_0.score_samples(X_crossval) + np.log(prior0)
logL[1] = clf_1.score_samples(X_crossval) + np.log(prior1)
#}}}
# the predicted value for each sample is the index with the largest
# log-likelihood.
y_pred = np.argmax(logL, 0)
# now we print the results. We'll use the built-in classification
# report function in sklearn.metrics. This computes the precision,
# recall, and f1-score for each class.
print "------------------------------------------------------------"
print "One-component Gaussian Mixture:"
print " results for cross-validation set:"
print metrics.classification_report(y_crossval, y_pred,
target_names=['stars', 'QSOs'])
#----------------------------------------------------------------------
# Run Gaussian Naive Bayes to double-check that our results are correct.
# Because of implementation differences and rounding, the match will not
# be exact, but the results should be very close.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_crossval)
print "------------------------------------------------------------"
print "Gaussian Naive Bayes"
print " results for cross-validation set:"
print " (results should be within ~0.01 of above results)"
print metrics.classification_report(y_crossval, y_pred,
target_names=['stars', 'QSOs'])
#======================================================================
# Parameter optimization:
#
# Now take some time to experiment with the covariance type and the
# number of components, to see if you can optimize the F1 score
#
# Note that for a large number of components, the fit can take a long
# time and will depend on the starting position.  Use the documentation
# string of GaussianMixture to determine the options for covariance_type.
#
# It may be helpful to use only a subset of the training data while
# experimenting with these parameter values.  This is called
# "meta-parameter" (or hyperparameter) optimization.  It can be
# accomplished automatically, but here we are doing it by hand for
# learning purposes.  (A by-hand grid-search sketch follows the report
# below.)
y_pred = None
#{{{ compute y_pred for cross-validation data
clf_0 = GaussianMixture(n_components=5, covariance_type='full',
                        random_state=0)
i0 = (y_train == 0)
clf_0.fit(X_train[i0])

clf_1 = GaussianMixture(n_components=5, covariance_type='full',
                        random_state=0)
i1 = (y_train == 1)
clf_1.fit(X_train[i1])

logL = np.zeros((2, Ncrossval))
logL[0] = clf_0.score_samples(X_crossval) + np.log(prior0)
logL[1] = clf_1.score_samples(X_crossval) + np.log(prior1)
y_pred = np.argmax(logL, 0)
#}}}
print "------------------------------------------------------------"
print "GMM with tweaked parameters:"
print " results for cross-validation set"
print metrics.classification_report(y_crossval, y_pred,
target_names=['stars', 'QSOs'])
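
#----------------------------------------------------------------------
# Optional: a minimal by-hand grid search over the GMM meta-parameters.
# This is only a sketch; the component counts and covariance types below
# are illustrative choices, not tuned values.  Set run_grid_search = True
# to try it.  For each setting we refit both class models and report the
# F1 score (for the QSO class) on the cross-validation set.
run_grid_search = False
if run_grid_search:
    for cov_type in ['spherical', 'diag', 'tied', 'full']:
        for n_comp in [1, 2, 4, 8]:
            # fit one mixture per class, exactly as above
            c0 = GaussianMixture(n_components=n_comp,
                                 covariance_type=cov_type,
                                 random_state=0).fit(X_train[i0])
            c1 = GaussianMixture(n_components=n_comp,
                                 covariance_type=cov_type,
                                 random_state=0).fit(X_train[i1])
            # prior-weighted log-likelihoods on the cross-validation set
            ll = np.zeros((2, Ncrossval))
            ll[0] = c0.score_samples(X_crossval) + np.log(prior0)
            ll[1] = c1.score_samples(X_crossval) + np.log(prior1)
            f1 = metrics.f1_score(y_crossval, np.argmax(ll, 0))
            print("  covariance_type=%-9s n_components=%i  F1=%.3f"
                  % (cov_type, n_comp, f1))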
#----------------------------------------------------------------------
# Test Data
#
# Once you have maximized the cross-validation score, you can apply the
# estimator to your test data, and check how it compares to the labels
# published by the researchers who compiled the data.
X_test = np.zeros((test_data.size, 4), dtype=float)
X_test[:, 0] = test_data['u-g']
X_test[:, 1] = test_data['g-r']
X_test[:, 2] = test_data['r-i']
X_test[:, 3] = test_data['i-z']
y_pred_literature = (test_data['label'] == 0).astype(int)
Ntest = len(y_pred_literature)
# here you should compute y_pred for the test data, using the classifiers
# clf_0 and clf_1 which you already trained above.
y_pred = None
#{{{ compute y_pred for test data
logL = np.zeros((2, Ntest))
logL[0] = clf_0.score_samples(X_test) + np.log(prior0)
logL[1] = clf_1.score_samples(X_test) + np.log(prior1)
y_pred = np.argmax(logL, 0)
#}}}
print "------------------------------------------------------------"
print "Comparison of current results with published results"
print " results for test set"
print " (treating published results as the 'true' result)"
print metrics.classification_report(y_pred_literature, y_pred,
target_names=['stars', 'QSOs'])
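
# As an extra check (an optional addition, not part of the original
# exercise), the confusion matrix gives the raw counts of agreement and
# disagreement with the published labels:
print(metrics.confusion_matrix(y_pred_literature, y_pred))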