Assignment 2 finished

author: Camil Staps 2015-10-11 21:14:28 +0200
committer: Camil Staps 2015-10-11 21:14:28 +0200
commit: 1beb50ef75a7db236a5ab3fdf88faf4c55f7c19d (patch)
tree: f1e5aa92ad6ac0a9a213aece1f2a0c0444a39e3d
parent: Start assignment 2 (diff)
17 files changed, 299 insertions, 1 deletions
diff --git a/Assignment2/Data/wine.mat b/Assignment 2/Data/wine.mat
index da15efd..da15efd 100644
--- a/Assignment2/Data/wine.mat
+++ b/Assignment 2/Data/wine.mat
diff --git a/Assignment 2/Data/zipdata.mat b/Assignment 2/Data/zipdata.mat
new file mode 100644
index 0000000..a98e796
--- /dev/null
+++ b/Assignment 2/Data/zipdata.mat
diff --git a/Assignment2/ex21.py b/Assignment 2/ex21.py
index 87d68cd..2594c61 100644
--- a/Assignment2/ex21.py
+++ b/Assignment 2/ex21.py
@@ -2,7 +2,9 @@
 """
 Created on Sat Oct 10 21:28:45 2015
 
-@author: camilstaps
+@author: Camil Staps (s4498062)
+
+Run with Python 2.7
 """
 
 import matplotlib.pyplot as plt
diff --git a/Assignment 2/ex22.py b/Assignment 2/ex22.py
new file mode 100644
index 0000000..3487752
--- /dev/null
+++ b/Assignment 2/ex22.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Oct 11 09:28:15 2015
+
+@author: Camil Staps (s4498062)
+
+Run with Python 2.7
+"""
+
+import itertools
+import matplotlib.pyplot as plt
+import matplotlib.pylab as plab
+from mpl_toolkits.mplot3d import Axes3D
+import numpy as np
+import scipy.io
+
+# 2.2.1: load the digit data and keep only the rows whose class label is below 2
+zipd = scipy.io.loadmat('./Data/zipdata.mat')
+traindata = zipd['traindata']
+testdata = zipd['testdata']  # loaded here but not used further in this script
+
+# Column 0 holds the class label; the remaining columns hold the sample itself.
+is_zero_or_one = traindata[:,0] < 2  # boolean row mask instead of a Python-level zip/filter
+data = traindata[is_zero_or_one,1:]
+classes = traindata[is_zero_or_one,0]
+
+mean = data.mean(0)  # per-column mean, subtracted from the data before the SVD
+
+# First visualisation: the first ten selected samples drawn as 16x16 images on a 2x5 grid
+for i in range(10):
+    plt.subplot(2, 5, i + 1)  # subplot positions are 1-based; position 0 is invalid
+    image = plab.reshape(data[i,:], (16, 16))
+    plt.imshow(image, extent=(0, 16, 0, 16), cmap=plab.cm.gray_r)
+    plt.axis('off')
+plt.show()
+
+# PCA: centre the data and take its SVD; the columns of V are the principal directions
+Y = data - np.ones((len(data), 1)) * mean
+U, S, Vt = np.linalg.svd(Y, full_matrices=False)
+V = Vt.T
+Z = np.dot(Y, V[:,0:4])  # coordinates of every sample on the first four components
+
+W = np.dot(Z[:10], V[:,0:4].T) + mean  # first ten samples rebuilt from those four components
+for i in range(10):
+    plt.subplot(2, 5, i + 1)  # subplot positions are 1-based; position 0 is invalid
+    image = plab.reshape(W[i,:], (16, 16))
+    plt.imshow(image, extent=(0, 16, 0, 16), cmap=plab.cm.gray_r)
+    plt.axis('off')
+plt.show()
+
+Y0 = [d for c, d in zip(classes, Y) if c == 0]  # centred rows of class 0
+Y1 = [d for c, d in zip(classes, Y) if c == 1]  # centred rows of class 1
+plt.figure(figsize=(16,16))
+for i, j in itertools.product(*[range(4), range(4)]):
+    plt.subplot(4, 4, 4 * i + j + 1)
+    # Component j goes on the x-axis and component i on the y-axis, so the data match the labels.
+    Zx = np.dot(Y0, V[:,j:j + 1])
+    Zy = np.dot(Y0, V[:,i:i + 1])
+    plt.scatter(Zx, Zy, color='r', marker='.', s=1, label='0')
+    Zx = np.dot(Y1, V[:,j:j + 1])
+    Zy = np.dot(Y1, V[:,i:i + 1])
+    plt.scatter(Zx, Zy, color='b', marker='.', s=1, label='1')
+    
+    plt.ylabel('PC' + str(i))
+    plt.xlabel('PC' + str(j))
+    plt.gca().axes.get_xaxis().set_ticks([])
+    plt.gca().axes.get_yaxis().set_ticks([])
+plt.legend(bbox_to_anchor=(1.05, 1), loc=2)  # attached to the current (last) subplot only
+plt.show()
+
+# 3D scatter of both classes projected on the first three principal components
+fig = plt.figure(figsize=(8,8))
+ax = fig.add_subplot(111, projection='3d')
+for rows, colour, name in ((Y0, 'r', '0'), (Y1, 'b', '1')):
+    # One column of coordinates per principal component
+    coords = [np.dot(rows, V[:,k:k + 1]) for k in range(3)]
+    ax.scatter(coords[0], coords[1], coords[2],
+               color=colour, marker='.', s=10, label=name)
+# The axis labels number the components from 1
+ax.set_xlabel('PC1')
+ax.set_ylabel('PC2')
+ax.set_zlabel('PC3')
+# Hide the ticks on all three axes
+ax.set_xticks([])
+ax.set_yticks([])
+ax.set_zticks([])
+plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
+plt.show()
diff --git a/Assignment 2/ex23.py b/Assignment 2/ex23.py
new file mode 100644
index 0000000..7c763ee
--- /dev/null
+++ b/Assignment 2/ex23.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Oct 11 18:47:35 2015
+
+@author: Camil Staps (s4498062)
+
+Run with Python 2.7
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+def all_samples(data, n):
+    """All samples without replacement or ordering with n elements from data.
+
+    Returns a list of lists ([[]] for n == 0, [] if n < 0 or n > len(data)).
+    """
+    import itertools  # local import: itertools is not among this file's top-level imports
+    if n < 0:
+        return []  # combinations() would raise ValueError for a negative size
+    return [list(c) for c in itertools.combinations(data, n)]
+        
+def nearly_equal(m, n, sig_fig = 5):
+    """Determine whether two numbers are nearly equal (|m - n| < 10 ** -sig_fig)"""
+    # Was http://stackoverflow.com/a/558289/1544337; its int() truncation misjudged e.g. 0.99999999 vs 1.0
+    return m == n or abs(m - n) < 10.0 ** -sig_fig
+
+data = np.array([2,3,6,8,11,18])
+pop_mean, pop_std = data.mean(), data.std()
+# i: mean and standard deviation of the whole population
+print("Mean: %f\nStandard deviation: %f" % (pop_mean, pop_std))
+
+# ii: every 2-element and 4-element sample, each paired with its mean
+samples_2 = all_samples(data, 2)
+samples_4 = all_samples(data, 4)
+samples_2_means = [np.mean(s) for s in samples_2]
+samples_4_means = [np.mean(s) for s in samples_4]
+print(list(zip(samples_2, samples_2_means)))
+print(list(zip(samples_4, samples_4_means)))
+
+# iii: mean and standard deviation of the sample means
+mean_2, std_2 = np.mean(samples_2_means), np.std(samples_2_means)
+mean_4, std_4 = np.mean(samples_4_means), np.std(samples_4_means)
+print("Mean of 2-sample means: %f" % mean_2)
+print("Standard deviation of 2-sample means: %f" % std_2)
+print("Mean of 4-sample means: %f" % mean_4)
+print("Standard deviation of 4-sample means: %f" % std_4)
+
+# iv: compare against the population mean and the scaled population deviation
+print("Means are equal (2): %r" % (mean_2 == pop_mean))
+print("Means are equal (4): %r" % (mean_4 == pop_mean))
+expected_2 = pop_std / np.sqrt(2.) * np.sqrt(4./5.)
+expected_4 = pop_std / np.sqrt(4.) * np.sqrt(2./5.)
+print("σ2 ≈ σ/√2×√(4/5): %r" % nearly_equal(std_2, expected_2))
+print("σ4 ≈ σ/√4×√(2/5): %r" % nearly_equal(std_4, expected_4))
+        
+# v: histograms of the population and of both sample-mean distributions, side by side
+panels = [
+    (data, 'Population distribution'),
+    (samples_2_means, '2-Sample mean distribution'),
+    (samples_4_means, '4-Sample mean distribution'),
+]
+plt.figure(figsize=(10,4))
+for position, (values, heading) in enumerate(panels, 1):
+    plt.subplot(1, 3, position)  # one row, three columns
+    plt.hist(values)
+    plt.title(heading)
+plt.show()
diff --git a/Assignment 2/report/assignment2.tex b/Assignment 2/report/assignment2.tex
new file mode 100644
index 0000000..6ae0b55
--- /dev/null
+++ b/Assignment 2/report/assignment2.tex
@@ -0,0 +1,140 @@
+\documentclass[10pt,a4paper]{article}
+
+\usepackage[margin=2cm]{geometry}
+\usepackage{graphicx}
+
+\let\assignment2
+
+\usepackage{enumitem}
+\setenumerate[1]{label=\assignment.\arabic*.}
+\setenumerate[2]{label=\arabic*.}
+\setenumerate[3]{label=\roman*.}
+
+% textcomp package is not available everywhere, and we only need the Copyright symbol
+% taken from http://tex.stackexchange.com/a/1677/23992
+\DeclareTextCommandDefault{\textregistered}{\textcircled{\check@mathfonts\fontsize\sf@size\z@\math@fontsfalse\selectfont R}}
+
+\usepackage{fancyhdr}
+\renewcommand{\headrulewidth}{0pt}
+\renewcommand{\footrulewidth}{0pt}
+\fancyhead{}
+%\fancyfoot[C]{Copyright {\textcopyright} 2015 Camil Staps}
+\pagestyle{fancy}
+
+\usepackage{caption}
+\usepackage{subcaption}
+
+\parindent0pt
+
+\title{Data Mining - assignment \assignment}
+\author{Camil Staps\\\small{s4498062}}
+
+\begin{document}
+
+\maketitle
+\thispagestyle{fancy}
+
+\begin{enumerate}
+    \item \begin{enumerate}
+            \item See figures \ref{fig:211-boxplots-1} through \ref{fig:211-hists-2}. It is clear that after eliminating the outliers we get a much better idea of the distributions.
+
+                \begin{figure}[p]
+                    \centering
+                    \includegraphics[width=\linewidth]{ex211-boxplots-1}
+                    \caption{Boxplots before eliminating outliers}
+                    \label{fig:211-boxplots-1}
+                \end{figure}
+                \begin{figure}[p]
+                    \centering
+                    \includegraphics[width=\linewidth]{ex211-hists-1}
+                    \caption{Histograms before eliminating outliers}
+                    \label{fig:211-hists-1}
+                \end{figure}
+                \begin{figure}[p]
+                    \centering
+                    \includegraphics[width=\linewidth]{ex211-boxplots-2}
+                    \caption{Boxplots after eliminating outliers}
+                    \label{fig:211-boxplots-2}
+                \end{figure}
+                \begin{figure}[p]
+                    \centering
+                    \includegraphics[width=\linewidth]{ex211-hists-2}
+                    \caption{Histograms after eliminating outliers}
+                    \label{fig:211-hists-2}
+                \end{figure}
+
+            \item See figures \ref{fig:212-scatters} and \ref{fig:212-correlation-bars}. As can be seen in the latter, there is a large (positive) correlation between alcohol percentage and quality, and there are large (negative) correlations between quality and each of density, volatile acidity and chlorides.
+
+                From the first plots we also see that high quality wine has a `citric acid' level of around $0.4$.
+
+                \begin{figure}[p]
+                    \centering
+                    \includegraphics[width=\linewidth]{ex212-scatters}
+                    \caption{Scatter plots between attributes and wine quality}
+                    \label{fig:212-scatters}
+                \end{figure}
+                \begin{figure}[p]
+                    \centering
+                    \includegraphics[width=\linewidth]{ex212-correlation-bars}
+                    \caption{Correlation coefficients between attributes and wine quality}
+                    \label{fig:212-correlation-bars}
+                \end{figure}
+        \end{enumerate}
+
+    \item \begin{enumerate}
+            \item See figure \ref{fig:221-visualisation}. PCA seems to work quite well here. However, it is also clear (not from this picture though) that if we want to reconstruct all ten digits, we need more principal components. But in this case the digits are easily recognisable. This is even more clear in the scatter plot of PC0 against PC0 in figure \ref{fig:221-scatters}. It is clear that almost all zeroes and ones can be recognised by checking whether the first principal component is below some threshold.
+
+                \begin{figure}[p]
+                    \centering
+                    \begin{subfigure}{.45\linewidth}
+                        \includegraphics[width=\linewidth]{ex221-visualisation}
+                        \caption{Initially}
+                    \end{subfigure}
+                    \begin{subfigure}{.45\linewidth}
+                        \includegraphics[width=\linewidth]{ex221-reconstructed-visualisation}
+                        \caption{Reconstructed with four principal components}
+                    \end{subfigure}
+                    \caption{Visualisations of the first ten zeroes and ones}
+                    \label{fig:221-visualisation}
+                \end{figure}
+
+                See figures \ref{fig:221-scatters} and \ref{fig:221-scatter-3d} for scatter plots.
+
+                \begin{figure}[p]
+                    \centering
+                    \includegraphics[width=\linewidth]{ex221-scatters}
+                    \caption{Scatter plots of handwritten digits, projected on two principal components (NB: the principal components are numbered starting from $0$)}
+                    \label{fig:221-scatters}
+                \end{figure}
+                \begin{figure}[p]
+                    \centering
+                    \includegraphics[width=\linewidth]{ex221-scatter-3d}
+                    \caption{Scatter plot of handwritten digits, projected on three principal components}
+                    \label{fig:221-scatter-3d}
+                \end{figure}
+        \end{enumerate}
+
+    \item \begin{enumerate}
+            \item \begin{enumerate}
+                    \item \texttt{Mean: 8.000000\\Standard deviation: 5.385165}
+                    \item These are all combinations along with their means:
+                        
+                        \texttt{[([2, 3], 2.5), ([2, 6], 4.0), ([2, 8], 5.0), ([2, 11], 6.5), ([2, 18], 10.0), ([3, 6], 4.5), ([3, 8], 5.5), ([3, 11], 7.0), ([3, 18], 10.5), ([6, 8], 7.0), ([6, 11], 8.5), ([6, 18], 12.0), ([8, 11], 9.5), ([8, 18], 13.0), ([11, 18], 14.5)]{\\}[([2, 3, 6, 8], 4.75), ([2, 3, 6, 11], 5.5), ([2, 3, 6, 18], 7.25), ([2, 3, 8, 11], 6.0), ([2, 3, 8, 18], 7.75), ([2, 3, 11, 18], 8.5), ([2, 6, 8, 11], 6.75), ([2, 6, 8, 18], 8.5), ([2, 6, 11, 18], 9.25), ([2, 8, 11, 18], 9.75), ([3, 6, 8, 11], 7.0), ([3, 6, 8, 18], 8.75), ([3, 6, 11, 18], 9.5), ([3, 8, 11, 18], 10.0), ([6, 8, 11, 18], 10.75)]}
+                    \item \texttt{Mean of 2-sample means: 8.000000\\Standard deviation of 2-sample means: 3.405877\\Mean of 4-sample means: 8.000000\\Standard deviation of 4-sample means: 1.702939}
+                    \item \texttt{Means are equal (2): True\\Means are equal (4): True\\$\sigma_2 \approx \sigma/\sqrt2\times\sqrt{4/5}$: True\\$\sigma_4 \approx \sigma/\sqrt4\times\sqrt{2/5}$: True}
+
+                        Therefore, the Central Limit Theorem seems to be correct judging from this dataset.
+                    \item See figure \ref{fig:231-hists}. The top of the shape shifts from left to right as $N$ increases.
+
+                        \begin{figure}[p]
+                            \centering
+                            \includegraphics[width=\linewidth]{ex231-hists}
+                            \caption{Histograms of the population distribution and the sample means distributions}
+                            \label{fig:231-hists}
+                        \end{figure}
+                \end{enumerate}
+        \end{enumerate}
+\end{enumerate}
+
+\end{document}
+
diff --git a/Assignment 2/report/ex211-boxplots-1.png b/Assignment 2/report/ex211-boxplots-1.png
new file mode 100644
index 0000000..67ec78b
--- /dev/null
+++ b/Assignment 2/report/ex211-boxplots-1.png
diff --git a/Assignment 2/report/ex211-boxplots-2.png b/Assignment 2/report/ex211-boxplots-2.png
new file mode 100644
index 0000000..cacb3cb
--- /dev/null
+++ b/Assignment 2/report/ex211-boxplots-2.png
diff --git a/Assignment 2/report/ex211-hists-1.png b/Assignment 2/report/ex211-hists-1.png
new file mode 100644
index 0000000..bde8f94
--- /dev/null
+++ b/Assignment 2/report/ex211-hists-1.png
diff --git a/Assignment 2/report/ex211-hists-2.png b/Assignment 2/report/ex211-hists-2.png
new file mode 100644
index 0000000..88d9386
--- /dev/null
+++ b/Assignment 2/report/ex211-hists-2.png
diff --git a/Assignment 2/report/ex212-correlation-bars.png b/Assignment 2/report/ex212-correlation-bars.png
new file mode 100644
index 0000000..3918fb1
--- /dev/null
+++ b/Assignment 2/report/ex212-correlation-bars.png
diff --git a/Assignment 2/report/ex212-scatters.png b/Assignment 2/report/ex212-scatters.png
new file mode 100644
index 0000000..0ab049c
--- /dev/null
+++ b/Assignment 2/report/ex212-scatters.png
diff --git a/Assignment 2/report/ex221-reconstructed-visualisation.png b/Assignment 2/report/ex221-reconstructed-visualisation.png
new file mode 100644
index 0000000..ea070db
--- /dev/null
+++ b/Assignment 2/report/ex221-reconstructed-visualisation.png
diff --git a/Assignment 2/report/ex221-scatter-3d.png b/Assignment 2/report/ex221-scatter-3d.png
new file mode 100644
index 0000000..f5b45e1
--- /dev/null
+++ b/Assignment 2/report/ex221-scatter-3d.png
diff --git a/Assignment 2/report/ex221-scatters.png b/Assignment 2/report/ex221-scatters.png
new file mode 100644
index 0000000..ca6a932
--- /dev/null
+++ b/Assignment 2/report/ex221-scatters.png
diff --git a/Assignment 2/report/ex221-visualisation.png b/Assignment 2/report/ex221-visualisation.png
new file mode 100644
index 0000000..2a0de19
--- /dev/null
+++ b/Assignment 2/report/ex221-visualisation.png
diff --git a/Assignment 2/report/ex231-hists.png b/Assignment 2/report/ex231-hists.png
new file mode 100644
index 0000000..3e24297
--- /dev/null
+++ b/Assignment 2/report/ex231-hists.png
author	Camil Staps	2015-10-11 21:14:28 +0200
committer	Camil Staps	2015-10-11 21:14:28 +0200
commit	1beb50ef75a7db236a5ab3fdf88faf4c55f7c19d (patch)
tree	f1e5aa92ad6ac0a9a213aece1f2a0c0444a39e3d
parent	Start assignment 2 (diff)