""" Astronomy Tutorial: exercise 3 Dimensionality reduction of stellar spectra Usage: python exercise_03.py datadir [-m method] [-k n_neigbors] [-n norm_type] [-N n_samples] [-s] - datadir is $TUTORIAL_DIR/data/sdss_photoz This directory should contain the file sdss_photoz.npy - method is one of [pca | lle | mlle | isomap]. If not specified, PCA will be performed - n_neighbors is an integer number of neighbors to use with manifold methods - norm_type is one of [none | l1 | l2]. It specifies how the data should be normalized. - n_samples is the number of samples used for the projection. Only 1000 of the 4000 samples are used by default. - specifying -s shuffles the data. This can help test for stability of the reconstruction. Description: In this tutorial, we explore manifold learning techniques to visualize 4000 SDSS spectral data. This is a much more exploratory exercise than the previous two. The goal is to determine how to best visualize this high-dimensional space. You will implement PCA, LLE, Modified LLE, and Isomap, for various data normalizations. The goal is to find the best visualization of the data, where "best" in this case is a qualitative measure of how well the different classes of points are separated in the projected space. To make experimentation more streamlined There are several places in this file with code to be filled-in as part of the exercise. Each of these is labeled TODO below. """ import os, sys import numpy as np import pylab as pl from matplotlib import ticker from sklearn import preprocessing from sklearn.decomposition import RandomizedPCA from sklearn.manifold import LocallyLinearEmbedding, Isomap #---------------------------------------------------------------------- # set up command-line option parser from optparse import OptionParser parser = OptionParser(usage=__doc__, version="%prog 1.0") parser.add_option("-m", "--method", dest="method", default='pca', help="Specify method to use: [pca | lle | mlle | isomap]") parser.add_option("-k", "--neighbors", dest="n_neighbors", type="int", default=15, help='Specify number of neighbors for manifold learning') parser.add_option("-N", "--normalization", dest="normalization", default="none", help="Specify normalization: [none | l1 | l2]") parser.add_option("-n", "--n_samples", dest="n_samples", type="int", default=1000, help="Specify number of samples to use, up to 4000 (default 1000)") parser.add_option("-s", "--shuffle", dest="shuffle", action="store_true", default=False, help="shuffle the data") options, args = parser.parse_args() if len(args) == 0: parser.error("Must specify a data directory") elif len(args) > 1: parser.error("Must specify a single data directory") datadir = args[0] print "data directory: %s" % datadir print " method = %s" % options.method print " n_neighbors = %i" % options.n_neighbors print " normalization = %s" % options.normalization print " n_samples: %i" % options.n_samples print " shuffle: %s" % options.shuffle def three_component_plot(c1, c2, c3, color, labels): pl.figure(figsize=(8,8)) kwargs = dict(s=4, lw=0, c=color, vmin=2, vmax=6) ax1 = pl.subplot(221) pl.scatter(c1, c2, **kwargs) pl.ylabel('component 2') ax2 = pl.subplot(223, sharex=ax1) pl.scatter(c1, c3, **kwargs) pl.xlabel('component 1') pl.ylabel('component 3') ax3 = pl.subplot(224, sharey=ax2) pl.scatter(c2, c3, **kwargs) pl.xlabel('component 2') for ax in (ax1, ax2, ax3): ax.xaxis.set_major_formatter(ticker.NullFormatter()) ax.yaxis.set_major_formatter(ticker.NullFormatter()) pl.subplots_adjust(hspace=0.05, wspace=0.05) format = ticker.FuncFormatter(lambda i, *args: labels[i]) pl.colorbar(ticks = range(2, 7), format=format, cax = pl.axes((0.52, 0.51, 0.02, 0.39))) pl.clim(1.5, 6.5) #---------------------------------------------------------------------- # Load data files data = np.load(os.path.join(datadir, 'spec4000_corrected.npz')) X = data['X'] y = data['y'] labels = data['labels'] if options.shuffle: i = np.arange(y.shape[0], dtype=int) np.random.shuffle(i) X = X[i] y = y[i] #---------------------------------------------------------------------- # truncate the data for experimentation # # There are 4000 points, which can take a long time to run. By default, # it is truncated to 1000 samples. This can be changed using the -n # command-line argument. X = X[:options.n_samples] y = y[:options.n_samples] #---------------------------------------------------------------------- # Normalization: # # The results of the dimensionality reduction can depend heavily on the # data normalization. These can be commented or un-commented to try # l1 or l2 normalization. if options.normalization.lower() == 'none': pass elif options.normalization.lower() == 'l2': X = preprocessing.normalize(X, 'l2') elif options.normalization.lower() == 'l1': X = preprocessing.normalize(X, 'l1') else: raise ValueError("Unrecognized normalization: '%s'" % options.normalization) #====================================================================== # TODO: compute X_proj for each method. # In each of the below cases, you should compute a projection of the # data and store that projection in the matrix X_proj. # X_proj should have the same number of rows as X, and should have # at least 3 features. X_proj = None if options.method == 'pca': print "Performing PCA" #{{{ compute a RandomizedPCA projection of X with n_components >= 3 rpca = RandomizedPCA(n_components=3, random_state=0) X_proj = rpca.fit_transform(X) #}}} elif options.method == 'lle': print "Performing LLE" #{{{ compute LLE on X with method='standard', and out_dim >= 3 lle = LocallyLinearEmbedding(n_neighbors=options.n_neighbors, out_dim=3, method='standard') X_proj = lle.fit_transform(X) #}}} elif options.method == 'mlle': print "Performing MLLE" #{{{ compute LLE on X with method='modified' and out_dim >= 3 lle = LocallyLinearEmbedding(n_neighbors=options.n_neighbors, out_dim=3, method='modified') X_proj = lle.fit_transform(X) #}}} elif options.method == 'isomap': print "Performing Isomap" #{{{ compute Isomap on X with out_dim >= 3 iso = Isomap(n_neighbors=options.n_neighbors, out_dim=3) X_proj = iso.fit_transform(X) #}}} else: raise ValueError("Unrecognized method: '%s'" % options.method) three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels) pl.show()