doc/solutions/exercise_02.py

"""
Astronomy Tutorial: exercise 2

Photometric redshift determination

usage: python exercise_02.py datadir

  - datadir is $TUTORIAL_DIR/data/sdss_photoz
    This directory should contain the files:
       - sdss_photoz.npy

Here we will take a closer look at the photometric redshift problem discussed
in section 5 of the tutorial.  Using the decision tree classifier, we'll take
a look at the 4-color observations of just over 400,000 points.

The point of this exercise is to answer the question: how can we get the rms
error down to below 0.1?  Would it be a better use of telescope time to
observe more objects, or to observe additional features of the objects
in the data set?  We'll use the techniques discussed in section 3 of the
tutorial.
"""
import os, sys
import numpy as np
import pylab as pl

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

try:
    datadir = sys.argv[1]
except:
    print __doc__
    sys.exit()

def compute_rms_error(y_pred, y_true):
    """Compute the rms error between the arrays y_pred and y_true"""
    return np.sqrt(metrics.mean_squared_error(y_pred, y_true))

def compute_outlier_fraction(y_pred, y_true, cutoff=0.2):
    """Compute the outlier rate between the arrays y_pred and y_true"""
    return np.sum((abs(y_pred - y_true) > cutoff)) * 1. / len(y_pred)

#------------------------------------------------------------
# load data and compute colors
data = np.load(os.path.join(datadir, 'sdss_photoz.npy'))

# here we'll truncate the data to 50,000 points.  This will allow the code
# below to be run quickly while it's being written.  When you're satisfied
# that the code is ready to go, you can comment out this line.
data = data[:50000]

print '%i points' % data.shape[0]
u, g, r, i, z = [data[f] for f in 'ugriz']

X = np.zeros((len(data), 4))
X[:, 0] = u - g
X[:, 1] = g - r
X[:, 2] = r - i
X[:, 3] = i - z

y = data['redshift']

#------------------------------------------------------------
# divide into training, cross-validation, and test samples
Ntot = len(y)

Ncv = Ntot / 5
Ntest = Ntot / 5
Ntrain = Ntot - Ncv - Ntest

X_train = X[:Ntrain]
y_train = y[:Ntrain]

X_cv = X[Ntrain:Ntrain + Ncv]
y_cv = y[Ntrain:Ntrain + Ncv]

X_test = X[Ntrain + Ncv:]
y_test = y[Ntrain + Ncv:]

#------------------------------------------------------------
# plot the Decision Tree error as a function of max_depth parameter
#
#  This is the first main part of the exercise.  This is photometric
#  redshift determination using DecisionTreeRegressor.  Here you'll plot
#  the training error and cross-validation error as a function of the
#  meta-parameter 'max_depth'.
#
#  You will create three arrays: max_depth_array, train_error, and cv_error.
#  Use at least 10 different values of max_depth, and compute the training
#  and cross-validation error associated with each of them.
#
#  note that the error can be computed with the function compute_rms_error()

max_depth_array = []
train_error = []
cv_error = []

#{{{ compute the arrays max_depth_array, train_error, and cv_error
max_depth_array = np.arange(1, 21)
train_error = np.zeros(max_depth_array.shape)
cv_error = np.zeros(max_depth_array.shape)

for i, max_depth in enumerate(max_depth_array):
    # print progress update
    sys.stdout.write('  %i / %i\r' % (max_depth, max_depth_array[-1]))
    sys.stdout.flush()

    clf = RandomForestRegressor(10, max_depth=max_depth)
    #clf = DecisionTreeRegressor(max_depth=max_depth)
    clf.fit(X_train, y_train)

    y_train_pred = clf.predict(X_train)
    y_cv_pred = clf.predict(X_cv)

    train_error[i] = compute_rms_error(y_train_pred, y_train)
    cv_error[i] = compute_rms_error(y_cv_pred, y_cv)
#}}}

pl.figure()
pl.plot(max_depth_array, cv_error, label='cross-val error')
pl.plot(max_depth_array, train_error, label='training error')

pl.legend()
pl.xlabel('max depth')
pl.ylabel('error')

# select the value of max_depth which led to the best results
max_depth = max_depth_array[np.argmin(cv_error)]
print "max_depth = %i" % max_depth

#------------------------------------------------------------
# plot the Decision Tree error as a function of number of samples
#
#  This is the second main part of the exercise.  Here you'll plot the
#  training error and cross-validation error as a function of the
#  number of training samples.
#
#  You will create three arrays: n_samples_array, train_error, and cv_error.
#  Use at least 40 different values of n_samples, and compute the training
#  and cross-validation error associated with each of them.
#
#  Make sure that when computing the training error for each number of
#  samples, you use the same samples that the model was trained on.

n_samples_array = []
train_error = []
cv_error = []

#{{{ compute the arrays n_samples_array, train_error, and cv_error
n_samples_array = np.linspace(100, Ntrain, 50).astype(int)
train_error = np.zeros(n_samples_array.shape)
cv_error = np.zeros(n_samples_array.shape)

for i, n_samples in enumerate(n_samples_array):
    # print progress update
    sys.stdout.write('  %i / %i\r' % (n_samples, Ntrain))
    sys.stdout.flush()

    clf = RandomForestRegressor(10, max_depth=max_depth)
    #clf = DecisionTreeRegressor(max_depth=max_depth)
    clf.fit(X_train[:n_samples], y_train[:n_samples])

    y_train_pred = clf.predict(X_train[:n_samples])
    y_cv_pred = clf.predict(X_cv)

    train_error[i] = compute_rms_error(y_train_pred,
                                       y_train[:n_samples])
    cv_error[i] = compute_rms_error(y_cv_pred, y_cv)
print
#}}}

pl.figure()
pl.plot(n_samples_array, cv_error, label='cross-val error')
pl.plot(n_samples_array, train_error, label='training error')

pl.legend()
pl.xlabel('number of samples')
pl.ylabel('error')

#----------------------------------------------------------------------
# Use the whole dataset:
#  If you have been running your code on only a part of the dataset,
#  now that you have it working, you can run it on the full dataset
#  (note: this will take a long time to execute!)  You can do this by
#  commenting out the line
#     data = data[:50000]
#  above.  How does this change the results?


#------------------------------------------------------------
# Catastrophic Outliers
#  Though the rms error is one useful measure of the performance of an
#  algorithm, astronomers are often more interested in reducing the
#  'catastrophic outlier' rate.  Catastrophic outliers are points which
#  are given redshifts very far from the true value.  For accuracy of
#  cosmological results, this is often more important than the overall
#  rms error.
#
#  Here, you can re-implement te above tasks, plotting the catastrophic
#  outlier rate as a function of the max_depth parameter, and as a function
#  of the number of training points.  This can be accomplished either by
#  copying and pasting the above code here, or by modifying the above code.
#
#  To compute the catastrophic error rate, you can use the function
#  compute_outlier_fraction()

#{{{ repeat the above two plots using catastrophic error rate
max_depth_array = np.arange(1, 21)
train_error = np.zeros(max_depth_array.shape)
cv_error = np.zeros(max_depth_array.shape)

for i, max_depth in enumerate(max_depth_array):
    # print progress update
    sys.stdout.write('  %i / %i\r' % (max_depth, max_depth_array[-1]))
    sys.stdout.flush()

    clf = RandomForestRegressor(10, max_depth=max_depth)
    #clf = DecisionTreeRegressor(max_depth=max_depth)
    clf.fit(X_train, y_train)

    y_train_pred = clf.predict(X_train)
    y_cv_pred = clf.predict(X_cv)

    train_error[i] = compute_outlier_fraction(y_train_pred, y_train)
    cv_error[i] = compute_outlier_fraction(y_cv_pred, y_cv)

pl.figure()
pl.plot(max_depth_array, cv_error, label='cross-val error')
pl.plot(max_depth_array, train_error, label='training error')

pl.legend()
pl.xlabel('max depth')
pl.ylabel('error')

# select the value of max_depth which led to the best results
max_depth = max_depth_array[np.argmin(cv_error)]
print "max_depth = %i" % max_depth

#------------------------------------------------------------
# plot the Decision Tree error as a function of number of samples

n_samples_array = np.linspace(100, Ntrain, 50).astype(int)
train_error = np.zeros(n_samples_array.shape)
cv_error = np.zeros(n_samples_array.shape)

for i, n_samples in enumerate(n_samples_array):
    # print progress update
    sys.stdout.write('  %i / %i\r' % (n_samples, Ntrain))
    sys.stdout.flush()

    clf = RandomForestRegressor(10, max_depth=max_depth)
    #clf = DecisionTreeRegressor(max_depth=max_depth)
    clf.fit(X_train[:n_samples], y_train[:n_samples])

    y_train_pred = clf.predict(X_train[:n_samples])
    y_cv_pred = clf.predict(X_cv)

    train_error[i] = compute_outlier_fraction(y_train_pred,
                                              y_train[:n_samples])
    cv_error[i] = compute_outlier_fraction(y_cv_pred, y_cv)
print

pl.figure()
pl.plot(n_samples_array, cv_error, label='cross-val error')
pl.plot(n_samples_array, train_error, label='training error')

pl.legend()
pl.xlabel('number of samples')
pl.ylabel('error')

#}}}

#----------------------------------------------------------------------
# Analyze the results
#
#  Compare your results to the discussion of bias and variance in section
#  3.  How do you think these results could be improved?  Is it better to
#  spend telescope time increasing the size of the training set, or would
#  it be better to measure more features of the objects we already have?
#  Does this recommendation change if the astronomer is interested in
#  minimizing the number of catastrophic outliers rather than the rms error?

#----------------------------------------------------------------------
# Bonus
#  Random Forest Regression uses an ensemble of decision trees to reduce
#  the effects of over-fitting.
#  Use sklearn.ensemble.RandomForestRegressor and compare the results to
#  your decision tree.  Does this change the error?  Does it change your
#  recommendations?

pl.show()