# -*- coding: utf-8 -*-
"""
Created on Fri Sep 11 13:12:03 2015

@author: Camil Staps, s4498062

Use Python 2.*
"""

from __future__ import print_function

import xlrd
import numpy as np
import numpy.linalg as la
import matplotlib.lines as pltlines
import matplotlib.patches as pltpatches
import matplotlib.pyplot as plt

# 1.2.1 a
# Load the eight attribute columns (A..H) from the spreadsheet
xls = xlrd.open_workbook(filename='Data/nanonose.xls')
xls = xls.sheet_by_index(0)

fst_col, fst_row = 3, 2
data = np.asmatrix([xls.col_values(i)[fst_row:]
                    for i in range(fst_col, fst_col + 8)])

# 1.2.1 b
# Scatter plot of attributes A and B, coloured by substance
colors = {'Water': '#61d4fa', 'Ethanol': '#ff3333', 'Acetone': '#549900',
          'Heptane': '#d9910d', 'Pentanol': '#990096'}
graph_colors = [colors[r] for r in xls.col_values(0)[fst_row:]]
xs = xls.col_values(1)[fst_row:]

fig = plt.figure(figsize=(12, 6))
ax = plt.gca()
ax.set_xscale('log')
ax.set_yscale('symlog')
plt.xlim([80, 10 ** 5])
plt.ylim([-1, 500])

ax.scatter(xs, data.tolist()[0], s=60, c=graph_colors, alpha=0.4, marker='s')
ax.scatter(xs, data.tolist()[1], s=60, c=graph_colors, alpha=0.4, marker='o')

line_a = pltlines.Line2D([], [], ls=' ', marker='s', label='A', c='w')
line_b = pltlines.Line2D([], [], ls=' ', marker='o', label='B', c='w')
handles = [pltpatches.Patch(label=k, color=v)
           for k, v in colors.iteritems()] + [line_a, line_b]
ax.legend(handles=handles, numpoints=1, loc=2)

plt.show()

# 1.2.2 a
# PCA is a method that can be used to reduce the dimensionality of a dataset.
# It is most useful when some variables are correlated: the data are then
# projected onto a small number of orthogonal directions (the principal
# components) that capture most of the variance. Of course, in general this
# implies some loss of information.

# 1.2.2 b
# EVD is a way to rewrite a diagonalizable matrix in a canonical form (a
# summation of products of eigenvalues and corresponding eigenvectors). SVD is
# a generalisation which can be applied to any matrix.
#
# In SVD, we write A = U*S*V^T. The columns of U are eigenvectors of A*A^T
# (which can be found using EVD); the columns of V are eigenvectors of A^T*A.
# The singular values on the diagonal of S are the square roots of the
# corresponding eigenvalues.

# 1.2.2 c
# Subtract the per-attribute mean from every observation
means = [np.mean(np.array(data)[i]) for i in range(0, 8)]
means_matrix = np.transpose([means for _ in range(0, len(np.array(data)[0]))])
normalised_data = np.transpose(data - means_matrix)

# Perform SVD; note that la.svd returns V^T, not V
U, S, Vt = la.svd(normalised_data)

# Compute the variance explained per component: sigma_m^2 / sum_m sigma_m^2
squared_sum = sum([S[m] ** 2 for m in range(0, len(S))])
variance = [100 * S[m] ** 2 / squared_sum for m in range(0, len(S))]

# Plot
fig = plt.figure(figsize=(2, 4))
ax = plt.gca()
ax.bar(range(0, len(variance)), variance, 1)
plt.show()

print("The first 3 components account for", sum(variance[:3]), "% variance.")

# 1.2.2 d
V = np.transpose(Vt)  # columns of V are the principal directions
projections = [np.asarray(normalised_data * V[:, i]).ravel() for i in range(0, 8)]

# Plot
fig = plt.figure(figsize=(12, 6))
ax = plt.gca()
ax.scatter(xs, projections[0], c='#ff0000', marker='o',
           label='Projection onto component 1')
ax.scatter(xs, projections[1], c='#00ff00', marker='o',
           label='Projection onto component 2')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc=2)
plt.show()

print("In the graph above we see that roughly 70% of the variance is "
      "accounted for by the first two components. If we plotted only the "
      "first two of the eight original dimensions instead, we would on "
      "average account for only 25% of the variance.")

# 1.2.2 e
print(V[:, 1])
# As you can see, this component mainly takes into account the last two
# attributes (G and H).
# Adherence with attribute A, B or C would give a large negative projection.
# Adherence with attribute G or H would give a large positive projection.
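
# A minimal numerical sanity check of the SVD/EVD relation described in
# 1.2.2 b (a sketch, not part of the exercise answers; it assumes the
# variables normalised_data, U, S and V defined above and introduces A as a
# shorthand): the eigenvalues of A^T*A should equal the squared singular
# values, and A*v_i should equal sigma_i * u_i.
A = normalised_data
eigenvalues, eigenvectors = la.eigh(A.T * A)  # A^T*A is symmetric, so eigh applies
print(sorted(eigenvalues, reverse=True))      # should match S ** 2 up to rounding
print(S ** 2)
print(np.allclose(np.asarray(A * V[:, 0]).ravel(),
                  np.asarray(U[:, 0]).ravel() * S[0]))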