diff options
author | Camil Staps | 2015-10-11 21:14:28 +0200 |
---|---|---|
committer | Camil Staps | 2015-10-11 21:14:28 +0200 |
commit | 1beb50ef75a7db236a5ab3fdf88faf4c55f7c19d (patch) | |
tree | f1e5aa92ad6ac0a9a213aece1f2a0c0444a39e3d | |
parent | Start assignment 2 (diff) |
Assignment 2 finished
17 files changed, 299 insertions, 1 deletions
diff --git a/Assignment2/Data/wine.mat b/Assignment 2/Data/wine.mat Binary files differindex da15efd..da15efd 100644 --- a/Assignment2/Data/wine.mat +++ b/Assignment 2/Data/wine.mat diff --git a/Assignment 2/Data/zipdata.mat b/Assignment 2/Data/zipdata.mat Binary files differnew file mode 100644 index 0000000..a98e796 --- /dev/null +++ b/Assignment 2/Data/zipdata.mat diff --git a/Assignment2/ex21.py b/Assignment 2/ex21.py index 87d68cd..2594c61 100644 --- a/Assignment2/ex21.py +++ b/Assignment 2/ex21.py @@ -2,7 +2,9 @@ """ Created on Sat Oct 10 21:28:45 2015 -@author: camilstaps +@author: Camil Staps (s4498062) + +Run with Python 2.7 """ import matplotlib.pyplot as plt diff --git a/Assignment 2/ex22.py b/Assignment 2/ex22.py new file mode 100644 index 0000000..3487752 --- /dev/null +++ b/Assignment 2/ex22.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Oct 11 09:28:15 2015 + +@author: Camil Staps (s4498062) + +Run with Python 2.7 +""" + +import itertools +import matplotlib.pyplot as plt +import matplotlib.pylab as plab +from mpl_toolkits.mplot3d import Axes3D +import numpy as np +import scipy.io + +# 2.2.1 +zipd = scipy.io.loadmat('./Data/zipdata.mat') +traindata = zipd['traindata'] +testdata = zipd['testdata'] + +data = traindata[:,1:] +classes = traindata[:,0] +temp = [(d, c) for d, c in zip(data, classes) if c < 2] +[data, classes] = [np.array(t) for t in zip(*temp)] + +mean = data.mean(0) + +# First visualisation +for i in range(10): + plt.subplot(2, 5, i) + image = plab.reshape(data[i,:], (16, 16)) + plt.imshow(image, extent=(0, 16, 0, 16), cmap=plab.cm.gray_r) + plt.axis('off') +plt.show() + +# PCA +Y = data - np.ones((len(data), 1)) * mean +U, S, Vt = np.linalg.svd(Y, full_matrices=False) +V = Vt.T +Z = np.dot(Y, V[:,0:4]) + +W = np.dot(Z[:10], V[:,0:4].T) + mean +for i in range(10): + plt.subplot(2, 5, i) + image = plab.reshape(W[i,:], (16, 16)) + plt.imshow(image, extent=(0, 16, 0, 16), cmap=plab.cm.gray_r) + plt.axis('off') +plt.show() + 
+Y0 = [d for c, d in zip(classes, Y) if c == 0] +Y1 = [d for c, d in zip(classes, Y) if c == 1] +plt.figure(figsize=(16,16)) +for i, j in itertools.product(*[range(4), range(4)]): + plt.subplot(4, 4, 4 * i + j + 1) + + Z1 = np.dot(Y0, V[:,i:i + 1]) + Z2 = np.dot(Y0, V[:,j:j + 1]) + plt.scatter(Z1, Z2, color='r', marker='.', s=1, label='0') + Z1 = np.dot(Y1, V[:,i:i + 1]) + Z2 = np.dot(Y1, V[:,j:j + 1]) + plt.scatter(Z1, Z2, color='b', marker='.', s=1, label='1') + + plt.ylabel('PC' + str(i)) + plt.xlabel('PC' + str(j)) + plt.gca().axes.get_xaxis().set_ticks([]) + plt.gca().axes.get_yaxis().set_ticks([]) +plt.legend(bbox_to_anchor=(1.05, 1), loc=2) +plt.show() + +fig = plt.figure(figsize=(8,8)) +ax = fig.add_subplot(111, projection='3d') +Z1 = np.dot(Y0, V[:,0:1]) +Z2 = np.dot(Y0, V[:,1:2]) +Z3 = np.dot(Y0, V[:,2:3]) +ax.scatter(Z1, Z2, Z3, color='r', marker='.', s=10, label='0') +Z1 = np.dot(Y1, V[:,0:1]) +Z2 = np.dot(Y1, V[:,1:2]) +Z3 = np.dot(Y1, V[:,2:3]) +ax.scatter(Z1, Z2, Z3, color='b', marker='.', s=10, label='1') +ax.set_xlabel('PC1') +ax.set_ylabel('PC2') +ax.set_zlabel('PC3') +ax.set_xticks([]) +ax.set_yticks([]) +ax.set_zticks([]) +plt.legend(bbox_to_anchor=(1.05, 1), loc=2) +plt.show() diff --git a/Assignment 2/ex23.py b/Assignment 2/ex23.py new file mode 100644 index 0000000..7c763ee --- /dev/null +++ b/Assignment 2/ex23.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Oct 11 18:47:35 2015 + +@author: Camil Staps (s4498062) + +Run with Python 2.7 +""" + +import matplotlib.pyplot as plt +import numpy as np + +def all_samples(data, n): + """All samples without replacement or ordering with n elements from data""" + if n == 0: + return [[]] + else: + samples = [] + for i, d in enumerate(data): + samples = samples + [[d] + s for s in all_samples(data[i+1:], n-1)] + return samples + +def nearly_equal(m, n, sig_fig = 5): + """Determine whether two numbers are nearly equal""" + # http://stackoverflow.com/a/558289/1544337 + return m == n or 
int(m * 10 ** sig_fig) == int(n * 10 ** sig_fig) + +data = np.array([2,3,6,8,11,18]) + +# i +print("Mean: %f\nStandard deviation: %f" % (data.mean(), data.std())) + +samples_2 = all_samples(data, 2) +samples_4 = all_samples(data, 4) + +# ii +print([(s, np.mean(s)) for s in samples_2]) +print([(s, np.mean(s)) for s in samples_4]) + +# iii +samples_2_means = [np.mean(s) for s in samples_2] +samples_4_means = [np.mean(s) for s in samples_4] + +print("Mean of 2-sample means: %f" % np.mean(samples_2_means)) +print("Standard deviation of 2-sample means: %f" % np.std(samples_2_means)) +print("Mean of 4-sample means: %f" % np.mean(samples_4_means)) +print("Standard deviation of 4-sample means: %f" % np.std(samples_4_means)) + +# iv +print("Means are equal (2): %r" % (np.mean(samples_2_means) == data.mean())) +print("Means are equal (4): %r" % (np.mean(samples_4_means) == data.mean())) +print("σ2 ≈ σ/√2×√(4/5): %r" % nearly_equal( + np.std(samples_2_means), data.std() / np.sqrt(2.) * np.sqrt(4./5.))) +print("σ4 ≈ σ/√4×√(2/5): %r" % nearly_equal( + np.std(samples_4_means), data.std() / np.sqrt(4.) 
* np.sqrt(2./5.))) + +# v +plt.figure(figsize=(10,4)) +plt.subplot(1, 3, 1) +plt.hist(data) +plt.title('Population distribution') +plt.subplot(1, 3, 2) +plt.hist(samples_2_means) +plt.title('2-Sample mean distribution') +plt.subplot(1, 3, 3) +plt.hist(samples_4_means) +plt.title('4-Sample mean distribution') +plt.show() diff --git a/Assignment 2/report/assignment2.tex b/Assignment 2/report/assignment2.tex new file mode 100644 index 0000000..6ae0b55 --- /dev/null +++ b/Assignment 2/report/assignment2.tex @@ -0,0 +1,140 @@ +\documentclass[10pt,a4paper]{article} + +\usepackage[margin=2cm]{geometry} +\usepackage{graphicx} + +\let\assignment2 + +\usepackage{enumitem} +\setenumerate[1]{label=\assignment.\arabic*.} +\setenumerate[2]{label=\arabic*.} +\setenumerate[3]{label=\roman*.} + +% textcomp package is not available everywhere, and we only need the Copyright symbol +% taken from http://tex.stackexchange.com/a/1677/23992 +\DeclareTextCommandDefault{\textregistered}{\textcircled{\check@mathfonts\fontsize\sf@size\z@\math@fontsfalse\selectfont R}} + +\usepackage{fancyhdr} +\renewcommand{\headrulewidth}{0pt} +\renewcommand{\footrulewidth}{0pt} +\fancyhead{} +%\fancyfoot[C]{Copyright {\textcopyright} 2015 Camil Staps} +\pagestyle{fancy} + +\usepackage{caption} +\usepackage{subcaption} + +\parindent0pt + +\title{Data Mining - assignment \assignment} +\author{Camil Staps\\\small{s4498062}} + +\begin{document} + +\maketitle +\thispagestyle{fancy} + +\begin{enumerate} + \item \begin{enumerate} + \item See figure \ref{fig:211-boxplots-1} through \ref{fig:211-hists-2}. It is clear that after eliminating the outliers we get a much better idea of the distributions. 
+ + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-boxplots-1} + \caption{Boxplots before eliminating outliers} + \label{fig:211-boxplots-1} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-hists-1} + \caption{Histograms before eliminating outliers} + \label{fig:211-hists-1} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-boxplots-2} + \caption{Boxplots after eliminating outliers} + \label{fig:211-boxplots-2} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-hists-2} + \caption{Histograms after eliminating outliers} + \label{fig:211-hists-2} + \end{figure} + + \item See figures \ref{fig:212-scatters} and \ref{fig:212-correlation-bars}. As can be seen in the latter, there is a large (positive) correlation between alcohol percentage and quality, and there is a large (negative) correlation between density, volatile acidity and chlorides and quality. + + From the first plots we also see that high quality wine has a `citric acid' level of around $0.4$. + + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex212-scatters} + \caption{Scatter plots between attributes and wine quality} + \label{fig:212-scatters} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex212-correlation-bars} + \caption{Correlation coefficients between attributes and wine quality} + \label{fig:212-correlation-bars} + \end{figure} + \end{enumerate} + + \item \begin{enumerate} + \item See figure \ref{fig:221-visualisation}. PCA seems to work quite well here. However, it is also clear (not from this picture though) that if we want to reconstruct all ten digits, we need more principal components. But in this case the digits are easily recognisable. This is even more clear in the scatter plot of PC0 against PC0 in figure \ref{fig:221-scatters}. 
It is clear that almost all zeroes and ones can be recognised by checking whether the first principal component is below some threshold. + + \begin{figure}[p] + \centering + \begin{subfigure}{.45\linewidth} + \includegraphics[width=\linewidth]{ex221-visualisation} + \caption{Initially} + \end{subfigure} + \begin{subfigure}{.45\linewidth} + \includegraphics[width=\linewidth]{ex221-reconstructed-visualisation} + \caption{Reconstructed with four principal components} + \end{subfigure} + \caption{Visualisations of the first ten zeroes and ones} + \label{fig:221-visualisation} + \end{figure} + + See figure \ref{fig:221-scatters} and \ref{fig:221-scatter-3d} for scatter plots. + + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex221-scatters} + \caption{Scatter plots of handwritten digits, projected on two principal components (NB: the principal components are numbered starting from $0$)} + \label{fig:221-scatters} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex221-scatter-3d} + \caption{Scatter plot of handwritten digits, projected on three principal components} + \label{fig:221-scatter-3d} + \end{figure} + \end{enumerate} + + \item \begin{enumerate} + \item \begin{enumerate} + \item \texttt{Mean: 8.000000\\Standard deviation: 5.385165} + \item These are all combinations along with their means: + + \texttt{[([2, 3], 2.5), ([2, 6], 4.0), ([2, 8], 5.0), ([2, 11], 6.5), ([2, 18], 10.0), ([3, 6], 4.5), ([3, 8], 5.5), ([3, 11], 7.0), ([3, 18], 10.5), ([6, 8], 7.0), ([6, 11], 8.5), ([6, 18], 12.0), ([8, 11], 9.5), ([8, 18], 13.0), ([11, 18], 14.5)]{\\}[([2, 3, 6, 8], 4.75), ([2, 3, 6, 11], 5.5), ([2, 3, 6, 18], 7.25), ([2, 3, 8, 11], 6.0), ([2, 3, 8, 18], 7.75), ([2, 3, 11, 18], 8.5), ([2, 6, 8, 11], 6.75), ([2, 6, 8, 18], 8.5), ([2, 6, 11, 18], 9.25), ([2, 8, 11, 18], 9.75), ([3, 6, 8, 11], 7.0), ([3, 6, 8, 18], 8.75), ([3, 6, 11, 18], 9.5), ([3, 8, 11, 18], 10.0), ([6, 8, 11, 18], 10.75)]} + \item \texttt{Mean of 
2-sample means: 8.000000\\Standard deviation of 2-sample means: 3.405877\\Mean of 4-sample means: 8.000000\\Standard deviation of 4-sample means: 1.702939} + \item \texttt{Means are equal (2): True\\Means are equal (4): True\\$\sigma_2 \approx \sigma/\sqrt2\times\sqrt{4/5}$: True\\$\sigma_4 \approx \sigma/\sqrt4\times\sqrt{2/5}$: True} + + Therefore, the Central Limit Theorem seems to be correct judging from this dataset. + \item See figure \ref{fig:231-hists}. The top of the shape shifts from left to right as $N$ increases. + + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex231-hists} + \caption{Histograms of the population distribution and the sample means distributions} + \label{fig:231-hists} + \end{figure} + \end{enumerate} + \end{enumerate} +\end{enumerate} + +\end{document} + diff --git a/Assignment 2/report/ex211-boxplots-1.png b/Assignment 2/report/ex211-boxplots-1.png Binary files differnew file mode 100644 index 0000000..67ec78b --- /dev/null +++ b/Assignment 2/report/ex211-boxplots-1.png diff --git a/Assignment 2/report/ex211-boxplots-2.png b/Assignment 2/report/ex211-boxplots-2.png Binary files differnew file mode 100644 index 0000000..cacb3cb --- /dev/null +++ b/Assignment 2/report/ex211-boxplots-2.png diff --git a/Assignment 2/report/ex211-hists-1.png b/Assignment 2/report/ex211-hists-1.png Binary files differnew file mode 100644 index 0000000..bde8f94 --- /dev/null +++ b/Assignment 2/report/ex211-hists-1.png diff --git a/Assignment 2/report/ex211-hists-2.png b/Assignment 2/report/ex211-hists-2.png Binary files differnew file mode 100644 index 0000000..88d9386 --- /dev/null +++ b/Assignment 2/report/ex211-hists-2.png diff --git a/Assignment 2/report/ex212-correlation-bars.png b/Assignment 2/report/ex212-correlation-bars.png Binary files differnew file mode 100644 index 0000000..3918fb1 --- /dev/null +++ b/Assignment 2/report/ex212-correlation-bars.png diff --git a/Assignment 2/report/ex212-scatters.png b/Assignment 
2/report/ex212-scatters.png Binary files differnew file mode 100644 index 0000000..0ab049c --- /dev/null +++ b/Assignment 2/report/ex212-scatters.png diff --git a/Assignment 2/report/ex221-reconstructed-visualisation.png b/Assignment 2/report/ex221-reconstructed-visualisation.png Binary files differnew file mode 100644 index 0000000..ea070db --- /dev/null +++ b/Assignment 2/report/ex221-reconstructed-visualisation.png diff --git a/Assignment 2/report/ex221-scatter-3d.png b/Assignment 2/report/ex221-scatter-3d.png Binary files differnew file mode 100644 index 0000000..f5b45e1 --- /dev/null +++ b/Assignment 2/report/ex221-scatter-3d.png diff --git a/Assignment 2/report/ex221-scatters.png b/Assignment 2/report/ex221-scatters.png Binary files differnew file mode 100644 index 0000000..ca6a932 --- /dev/null +++ b/Assignment 2/report/ex221-scatters.png diff --git a/Assignment 2/report/ex221-visualisation.png b/Assignment 2/report/ex221-visualisation.png Binary files differnew file mode 100644 index 0000000..2a0de19 --- /dev/null +++ b/Assignment 2/report/ex221-visualisation.png diff --git a/Assignment 2/report/ex231-hists.png b/Assignment 2/report/ex231-hists.png Binary files differnew file mode 100644 index 0000000..3e24297 --- /dev/null +++ b/Assignment 2/report/ex231-hists.png |