# -*- coding: utf-8 -*-
"""
Created on Fri Sep 11 13:12:03 2015

@author: Camil Staps, s4498062

Use Python 2.*
"""

from __future__ import print_function

import xlrd
import numpy as np
import numpy.linalg as la
import matplotlib.lines as pltlines
import matplotlib.patches as pltpatches
import matplotlib.pyplot as plt

# 1.2.1 a
# Load the eight attribute columns (A..H) from the spreadsheet
xls = xlrd.open_workbook(filename='Data/nanonose.xls')
xls = xls.sheet_by_index(0)

fst_col, fst_row = 3, 2
data = np.asmatrix([xls.col_values(i)[fst_row:]
                    for i in range(fst_col, fst_col + 8)])

# 1.2.1 b
# Scatter plot of attributes A and B, coloured by substance
colors = {'Water': '#61d4fa', 'Ethanol': '#ff3333', 'Acetone': '#549900',
          'Heptane': '#d9910d', 'Pentanol': '#990096'}
graph_colors = [colors[r] for r in xls.col_values(0)[fst_row:]]
xs = xls.col_values(1)[fst_row:]

fig = plt.figure(figsize=(12, 6))
ax = plt.gca()
ax.set_xscale('log')
ax.set_yscale('symlog')
plt.xlim([80, 10 ** 5])
plt.ylim([-1, 500])

ax.scatter(xs, data.tolist()[0], s=60, c=graph_colors, alpha=0.4, marker='s')
ax.scatter(xs, data.tolist()[1], s=60, c=graph_colors, alpha=0.4, marker='o')

line_a = pltlines.Line2D([], [], ls=' ', marker='s', label='A', c='w')
line_b = pltlines.Line2D([], [], ls=' ', marker='o', label='B', c='w')
handles = [pltpatches.Patch(label=k, color=v)
           for k, v in colors.iteritems()] + [line_a, line_b]
ax.legend(handles=handles, numpoints=1, loc=2)

plt.show()

# 1.2.2 a
# PCA is a method that can be used to reduce the dimensionality of a dataset.
# It is most useful when some variables are correlated: the data are then
# projected onto a small number of orthogonal directions (the principal
# components) that capture most of the variance. Of course, in general this
# implies some loss of information.

# 1.2.2 b
# EVD is a way to rewrite a diagonalizable matrix in a canonical form (a
# summation of products of eigenvalues and corresponding eigenvectors). SVD is
# a generalisation which can be applied to any matrix.
#
# In SVD, we write A = U*S*V^T. The columns of U are eigenvectors of A*A^T
# (which can be found using EVD); the columns of V are eigenvectors of A^T*A.
# The singular values on the diagonal of S are the square roots of the
# corresponding eigenvalues.

# 1.2.2 c
# Subtract the per-attribute mean from every observation
means = [np.mean(np.array(data)[i]) for i in range(0, 8)]
means_matrix = np.transpose([means for _ in range(0, len(np.array(data)[0]))])
normalised_data = np.transpose(data - means_matrix)

# Perform SVD; note that la.svd returns V^T, not V
U, S, Vt = la.svd(normalised_data)

# Compute the variance explained per component: sigma_m^2 / sum_m sigma_m^2
squared_sum = sum([S[m] ** 2 for m in range(0, len(S))])
variance = [100 * S[m] ** 2 / squared_sum for m in range(0, len(S))]

# Plot
fig = plt.figure(figsize=(2, 4))
ax = plt.gca()
ax.bar(range(0, len(variance)), variance, 1)
plt.show()

print("The first 3 components account for", sum(variance[:3]), "% variance.")

# 1.2.2 d
V = np.transpose(Vt)  # columns of V are the principal directions
projections = [np.asarray(normalised_data * V[:, i]).ravel() for i in range(0, 8)]

# Plot
fig = plt.figure(figsize=(12, 6))
ax = plt.gca()
ax.scatter(xs, projections[0], c='#ff0000', marker='o',
           label='Projection onto component 1')
ax.scatter(xs, projections[1], c='#00ff00', marker='o',
           label='Projection onto component 2')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc=2)
plt.show()

print("In the graph above we see that roughly 70% of the variance is "
      "accounted for by the first two components. If we plotted only the "
      "first two of the eight original dimensions instead, we would on "
      "average account for only 25% of the variance.")

# 1.2.2 e
print(V[:, 1])
# As you can see, this component mainly takes into account the last two
# attributes (G and H).
# Adherence with attribute A, B or C would give a large negative projection.
# Adherence with attribute G or H would give a large positive projection.
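
# A minimal numerical sanity check of the SVD/EVD relation described in
# 1.2.2 b (a sketch, not part of the exercise answers; it assumes the
# variables normalised_data, U, S and V defined above and introduces A as a
# shorthand): the eigenvalues of A^T*A should equal the squared singular
# values, and A*v_i should equal sigma_i * u_i.
A = normalised_data
eigenvalues, eigenvectors = la.eigh(A.T * A)  # A^T*A is symmetric, so eigh applies
print(sorted(eigenvalues, reverse=True))      # should match S ** 2 up to rounding
print(S ** 2)
print(np.allclose(np.asarray(A * V[:, 0]).ravel(),
                  np.asarray(U[:, 0]).ravel() * S[0]))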