From 1beb50ef75a7db236a5ab3fdf88faf4c55f7c19d Mon Sep 17 00:00:00 2001 From: Camil Staps Date: Sun, 11 Oct 2015 21:14:28 +0200 Subject: Assignment 2 finished --- Assignment 2/Data/wine.mat | Bin 0 -> 126090 bytes Assignment 2/Data/zipdata.mat | Bin 0 -> 3862827 bytes Assignment 2/ex21.py | 64 ++++++++++ Assignment 2/ex22.py | 88 +++++++++++++ Assignment 2/ex23.py | 68 ++++++++++ Assignment 2/report/assignment2.tex | 140 +++++++++++++++++++++ Assignment 2/report/ex211-boxplots-1.png | Bin 0 -> 82809 bytes Assignment 2/report/ex211-boxplots-2.png | Bin 0 -> 83829 bytes Assignment 2/report/ex211-hists-1.png | Bin 0 -> 134486 bytes Assignment 2/report/ex211-hists-2.png | Bin 0 -> 137724 bytes Assignment 2/report/ex212-correlation-bars.png | Bin 0 -> 48225 bytes Assignment 2/report/ex212-scatters.png | Bin 0 -> 177256 bytes .../report/ex221-reconstructed-visualisation.png | Bin 0 -> 45269 bytes Assignment 2/report/ex221-scatter-3d.png | Bin 0 -> 141551 bytes Assignment 2/report/ex221-scatters.png | Bin 0 -> 226468 bytes Assignment 2/report/ex221-visualisation.png | Bin 0 -> 29592 bytes Assignment 2/report/ex231-hists.png | Bin 0 -> 39326 bytes Assignment2/Data/wine.mat | Bin 126090 -> 0 bytes Assignment2/ex21.py | 62 --------- 19 files changed, 360 insertions(+), 62 deletions(-) create mode 100644 Assignment 2/Data/wine.mat create mode 100644 Assignment 2/Data/zipdata.mat create mode 100644 Assignment 2/ex21.py create mode 100644 Assignment 2/ex22.py create mode 100644 Assignment 2/ex23.py create mode 100644 Assignment 2/report/assignment2.tex create mode 100644 Assignment 2/report/ex211-boxplots-1.png create mode 100644 Assignment 2/report/ex211-boxplots-2.png create mode 100644 Assignment 2/report/ex211-hists-1.png create mode 100644 Assignment 2/report/ex211-hists-2.png create mode 100644 Assignment 2/report/ex212-correlation-bars.png create mode 100644 Assignment 2/report/ex212-scatters.png create mode 100644 Assignment 2/report/ex221-reconstructed-visualisation.png 
create mode 100644 Assignment 2/report/ex221-scatter-3d.png create mode 100644 Assignment 2/report/ex221-scatters.png create mode 100644 Assignment 2/report/ex221-visualisation.png create mode 100644 Assignment 2/report/ex231-hists.png delete mode 100644 Assignment2/Data/wine.mat delete mode 100644 Assignment2/ex21.py diff --git a/Assignment 2/Data/wine.mat b/Assignment 2/Data/wine.mat new file mode 100644 index 0000000..da15efd Binary files /dev/null and b/Assignment 2/Data/wine.mat differ diff --git a/Assignment 2/Data/zipdata.mat b/Assignment 2/Data/zipdata.mat new file mode 100644 index 0000000..a98e796 Binary files /dev/null and b/Assignment 2/Data/zipdata.mat differ diff --git a/Assignment 2/ex21.py b/Assignment 2/ex21.py new file mode 100644 index 0000000..2594c61 --- /dev/null +++ b/Assignment 2/ex21.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +""" +Created on Sat Oct 10 21:28:45 2015 + +@author: Camil Staps (s4498062) + +Run with Python 2.7 +""" + +import matplotlib.pyplot as plt +from scipy import io as sciio, stats +import numpy as np + +# 2.1.1 +wine = sciio.loadmat('./Data/wine.mat') +data = wine['X'] +atts = [str(s[0]) for s in wine['attributeNames'][0]] + +# Initial boxplots & histograms +plt.figure(figsize=(20,10)) +plt.boxplot(stats.zscore(data)) +plt.xticks(range(len(atts) + 1), [''] + atts, rotation=45, ha='right') +plt.show() + +plt.figure(figsize=(20,10)) +for i in range(len(data[0])): + plt.subplot(3, 4, i + 1) + plt.hist(data[:,i]) + plt.xlabel(atts[i]) +plt.show() + +# Removing known outliers +data = np.array([d for d in data if d[1] < 20 and # Volatide acidity + 0.01 < d[7] and d[7] < 10 and # Density + 0.5 < d[10] and d[10] < 200]) # Alcohol + +# Clean boxplots & histograms +plt.figure(figsize=(20,10)) +plt.boxplot(stats.zscore(data)) +plt.xticks(range(len(atts) + 1), [''] + atts, rotation=45, ha='right') +plt.show() + +plt.figure(figsize=(20,10)) +for i in range(len(data[0])): + plt.subplot(3, 4, i + 1) + plt.hist(data[:,i]) + 
plt.xlabel(atts[i]) +plt.show() + +# 2.1.2 +data = np.transpose(data) +plt.figure(figsize=(20,10)) +for i in range(len(data) - 1): + plt.subplot(3, 4, i + 1) + plt.scatter(data[i], data[11], marker='.', alpha=0.2) + plt.xlabel(atts[i]) +plt.show() + +fig, ax = plt.subplots(figsize=(10,5)) +it = np.arange(len(data) - 1) +ax.bar(it, [stats.pearsonr(data[i], data[11])[0] for i in it]) +ax.set_xticks(it + 0.5) +ax.set_xticklabels(atts, rotation=90, ha='center') +plt.show() diff --git a/Assignment 2/ex22.py b/Assignment 2/ex22.py new file mode 100644 index 0000000..3487752 --- /dev/null +++ b/Assignment 2/ex22.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Oct 11 09:28:15 2015 + +@author: Camil Staps (s4498062) + +Run with Python 2.7 +""" + +import itertools +import matplotlib.pyplot as plt +import matplotlib.pylab as plab +from mpl_toolkits.mplot3d import Axes3D +import numpy as np +import scipy.io + +# 2.2.1 +zipd = scipy.io.loadmat('./Data/zipdata.mat') +traindata = zipd['traindata'] +testdata = zipd['testdata'] + +data = traindata[:,1:] +classes = traindata[:,0] +temp = [(d, c) for d, c in zip(data, classes) if c < 2] +[data, classes] = [np.array(t) for t in zip(*temp)] + +mean = data.mean(0) + +# First visualisation +for i in range(10): + plt.subplot(2, 5, i) + image = plab.reshape(data[i,:], (16, 16)) + plt.imshow(image, extent=(0, 16, 0, 16), cmap=plab.cm.gray_r) + plt.axis('off') +plt.show() + +# PCA +Y = data - np.ones((len(data), 1)) * mean +U, S, Vt = np.linalg.svd(Y, full_matrices=False) +V = Vt.T +Z = np.dot(Y, V[:,0:4]) + +W = np.dot(Z[:10], V[:,0:4].T) + mean +for i in range(10): + plt.subplot(2, 5, i) + image = plab.reshape(W[i,:], (16, 16)) + plt.imshow(image, extent=(0, 16, 0, 16), cmap=plab.cm.gray_r) + plt.axis('off') +plt.show() + +Y0 = [d for c, d in zip(classes, Y) if c == 0] +Y1 = [d for c, d in zip(classes, Y) if c == 1] +plt.figure(figsize=(16,16)) +for i, j in itertools.product(*[range(4), range(4)]): + plt.subplot(4, 4, 4 * 
i + j + 1) + + Z1 = np.dot(Y0, V[:,i:i + 1]) + Z2 = np.dot(Y0, V[:,j:j + 1]) + plt.scatter(Z1, Z2, color='r', marker='.', s=1, label='0') + Z1 = np.dot(Y1, V[:,i:i + 1]) + Z2 = np.dot(Y1, V[:,j:j + 1]) + plt.scatter(Z1, Z2, color='b', marker='.', s=1, label='1') + + plt.ylabel('PC' + str(i)) + plt.xlabel('PC' + str(j)) + plt.gca().axes.get_xaxis().set_ticks([]) + plt.gca().axes.get_yaxis().set_ticks([]) +plt.legend(bbox_to_anchor=(1.05, 1), loc=2) +plt.show() + +fig = plt.figure(figsize=(8,8)) +ax = fig.add_subplot(111, projection='3d') +Z1 = np.dot(Y0, V[:,0:1]) +Z2 = np.dot(Y0, V[:,1:2]) +Z3 = np.dot(Y0, V[:,2:3]) +ax.scatter(Z1, Z2, Z3, color='r', marker='.', s=10, label='0') +Z1 = np.dot(Y1, V[:,0:1]) +Z2 = np.dot(Y1, V[:,1:2]) +Z3 = np.dot(Y1, V[:,2:3]) +ax.scatter(Z1, Z2, Z3, color='b', marker='.', s=10, label='1') +ax.set_xlabel('PC1') +ax.set_ylabel('PC2') +ax.set_zlabel('PC3') +ax.set_xticks([]) +ax.set_yticks([]) +ax.set_zticks([]) +plt.legend(bbox_to_anchor=(1.05, 1), loc=2) +plt.show() diff --git a/Assignment 2/ex23.py b/Assignment 2/ex23.py new file mode 100644 index 0000000..7c763ee --- /dev/null +++ b/Assignment 2/ex23.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Oct 11 18:47:35 2015 + +@author: Camil Staps (s4498062) + +Run with Python 2.7 +""" + +import matplotlib.pyplot as plt +import numpy as np + +def all_samples(data, n): + """All samples without replacement or ordering with n elements from data""" + if n == 0: + return [[]] + else: + samples = [] + for i, d in enumerate(data): + samples = samples + [[d] + s for s in all_samples(data[i+1:], n-1)] + return samples + +def nearly_equal(m, n, sig_fig = 5): + """Determine whether two numbers are nearly equal""" + # http://stackoverflow.com/a/558289/1544337 + return m == n or int(m * 10 ** sig_fig) == int(n * 10 ** sig_fig) + +data = np.array([2,3,6,8,11,18]) + +# i +print("Mean: %f\nStandard deviation: %f" % (data.mean(), data.std())) + +samples_2 = all_samples(data, 2) 
+samples_4 = all_samples(data, 4) + +# ii +print([(s, np.mean(s)) for s in samples_2]) +print([(s, np.mean(s)) for s in samples_4]) + +# iii +samples_2_means = [np.mean(s) for s in samples_2] +samples_4_means = [np.mean(s) for s in samples_4] + +print("Mean of 2-sample means: %f" % np.mean(samples_2_means)) +print("Standard deviation of 2-sample means: %f" % np.std(samples_2_means)) +print("Mean of 4-sample means: %f" % np.mean(samples_4_means)) +print("Standard deviation of 4-sample means: %f" % np.std(samples_4_means)) + +# iv +print("Means are equal (2): %r" % (np.mean(samples_2_means) == data.mean())) +print("Means are equal (4): %r" % (np.mean(samples_4_means) == data.mean())) +print("σ2 ≈ σ/√2×√(4/5): %r" % nearly_equal( + np.std(samples_2_means), data.std() / np.sqrt(2.) * np.sqrt(4./5.))) +print("σ4 ≈ σ/√4×√(2/5): %r" % nearly_equal( + np.std(samples_4_means), data.std() / np.sqrt(4.) * np.sqrt(2./5.))) + +# v +plt.figure(figsize=(10,4)) +plt.subplot(1, 3, 1) +plt.hist(data) +plt.title('Population distribution') +plt.subplot(1, 3, 2) +plt.hist(samples_2_means) +plt.title('2-Sample mean distribution') +plt.subplot(1, 3, 3) +plt.hist(samples_4_means) +plt.title('4-Sample mean distribution') +plt.show() diff --git a/Assignment 2/report/assignment2.tex b/Assignment 2/report/assignment2.tex new file mode 100644 index 0000000..6ae0b55 --- /dev/null +++ b/Assignment 2/report/assignment2.tex @@ -0,0 +1,140 @@ +\documentclass[10pt,a4paper]{article} + +\usepackage[margin=2cm]{geometry} +\usepackage{graphicx} + +\let\assignment2 + +\usepackage{enumitem} +\setenumerate[1]{label=\assignment.\arabic*.} +\setenumerate[2]{label=\arabic*.} +\setenumerate[3]{label=\roman*.} + +% textcomp package is not available everywhere, and we only need the Copyright symbol +% taken from http://tex.stackexchange.com/a/1677/23992 +\DeclareTextCommandDefault{\textregistered}{\textcircled{\check@mathfonts\fontsize\sf@size\z@\math@fontsfalse\selectfont R}} + +\usepackage{fancyhdr} 
+\renewcommand{\headrulewidth}{0pt} +\renewcommand{\footrulewidth}{0pt} +\fancyhead{} +%\fancyfoot[C]{Copyright {\textcopyright} 2015 Camil Staps} +\pagestyle{fancy} + +\usepackage{caption} +\usepackage{subcaption} + +\parindent0pt + +\title{Data Mining - assignment \assignment} +\author{Camil Staps\\\small{s4498062}} + +\begin{document} + +\maketitle +\thispagestyle{fancy} + +\begin{enumerate} + \item \begin{enumerate} + \item See figures \ref{fig:211-boxplots-1} through \ref{fig:211-hists-2}. It is clear that after eliminating the outliers we get a much better idea of the distributions. + + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-boxplots-1} + \caption{Boxplots before eliminating outliers} + \label{fig:211-boxplots-1} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-hists-1} + \caption{Histograms before eliminating outliers} + \label{fig:211-hists-1} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-boxplots-2} + \caption{Boxplots after eliminating outliers} + \label{fig:211-boxplots-2} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-hists-2} + \caption{Histograms after eliminating outliers} + \label{fig:211-hists-2} + \end{figure} + + \item See figures \ref{fig:212-scatters} and \ref{fig:212-correlation-bars}. As can be seen in the latter, there is a large (positive) correlation between alcohol percentage and quality, and there is a large (negative) correlation between quality and each of density, volatile acidity and chlorides. + + From the scatter plots we also see that high-quality wine has a `citric acid' level of around $0.4$.
+ + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex212-scatters} + \caption{Scatter plots of each attribute against wine quality} + \label{fig:212-scatters} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex212-correlation-bars} + \caption{Correlation coefficients between attributes and wine quality} + \label{fig:212-correlation-bars} + \end{figure} + \end{enumerate} + + \item \begin{enumerate} + \item See figure \ref{fig:221-visualisation}. PCA seems to work quite well here. However, it is also clear (though not from this picture) that if we want to reconstruct all ten digits, we need more principal components. But in this case the digits are easily recognisable. This is even clearer in the scatter plot of PC0 against PC0 in figure \ref{fig:221-scatters}. That plot shows that almost all zeroes and ones can be recognised by checking whether the first principal component is below some threshold. + + \begin{figure}[p] + \centering + \begin{subfigure}{.45\linewidth} + \includegraphics[width=\linewidth]{ex221-visualisation} + \caption{Initially} + \end{subfigure} + \begin{subfigure}{.45\linewidth} + \includegraphics[width=\linewidth]{ex221-reconstructed-visualisation} + \caption{Reconstructed with four principal components} + \end{subfigure} + \caption{Visualisations of the first ten zeroes and ones} + \label{fig:221-visualisation} + \end{figure} + + See figures \ref{fig:221-scatters} and \ref{fig:221-scatter-3d} for scatter plots.
+ + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex221-scatters} + \caption{Scatter plots of handwritten digits, projected on two principal components (NB: the principal components are numbered starting from $0$)} + \label{fig:221-scatters} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex221-scatter-3d} + \caption{Scatter plot of handwritten digits, projected on three principal components} + \label{fig:221-scatter-3d} + \end{figure} + \end{enumerate} + + \item \begin{enumerate} + \item \begin{enumerate} + \item \texttt{Mean: 8.000000\\Standard deviation: 5.385165} + \item These are all combinations along with their means: + + \texttt{[([2, 3], 2.5), ([2, 6], 4.0), ([2, 8], 5.0), ([2, 11], 6.5), ([2, 18], 10.0), ([3, 6], 4.5), ([3, 8], 5.5), ([3, 11], 7.0), ([3, 18], 10.5), ([6, 8], 7.0), ([6, 11], 8.5), ([6, 18], 12.0), ([8, 11], 9.5), ([8, 18], 13.0), ([11, 18], 14.5)]{\\}[([2, 3, 6, 8], 4.75), ([2, 3, 6, 11], 5.5), ([2, 3, 6, 18], 7.25), ([2, 3, 8, 11], 6.0), ([2, 3, 8, 18], 7.75), ([2, 3, 11, 18], 8.5), ([2, 6, 8, 11], 6.75), ([2, 6, 8, 18], 8.5), ([2, 6, 11, 18], 9.25), ([2, 8, 11, 18], 9.75), ([3, 6, 8, 11], 7.0), ([3, 6, 8, 18], 8.75), ([3, 6, 11, 18], 9.5), ([3, 8, 11, 18], 10.0), ([6, 8, 11, 18], 10.75)]} + \item \texttt{Mean of 2-sample means: 8.000000\\Standard deviation of 2-sample means: 3.405877\\Mean of 4-sample means: 8.000000\\Standard deviation of 4-sample means: 1.702939} + \item \texttt{Means are equal (2): True\\Means are equal (4): True\\$\sigma_2 \approx \sigma/\sqrt2\times\sqrt{4/5}$: True\\$\sigma_4 \approx \sigma/\sqrt4\times\sqrt{2/5}$: True} + + Therefore, the Central Limit Theorem seems to be correct judging from this dataset. + \item See figure \ref{fig:231-hists}. The top of the shape shifts from left to right as $N$ increases. 
+ + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex231-hists} + \caption{Histograms of the population distribution and the sample means distributions} + \label{fig:231-hists} + \end{figure} + \end{enumerate} + \end{enumerate} +\end{enumerate} + +\end{document} + diff --git a/Assignment 2/report/ex211-boxplots-1.png b/Assignment 2/report/ex211-boxplots-1.png new file mode 100644 index 0000000..67ec78b Binary files /dev/null and b/Assignment 2/report/ex211-boxplots-1.png differ diff --git a/Assignment 2/report/ex211-boxplots-2.png b/Assignment 2/report/ex211-boxplots-2.png new file mode 100644 index 0000000..cacb3cb Binary files /dev/null and b/Assignment 2/report/ex211-boxplots-2.png differ diff --git a/Assignment 2/report/ex211-hists-1.png b/Assignment 2/report/ex211-hists-1.png new file mode 100644 index 0000000..bde8f94 Binary files /dev/null and b/Assignment 2/report/ex211-hists-1.png differ diff --git a/Assignment 2/report/ex211-hists-2.png b/Assignment 2/report/ex211-hists-2.png new file mode 100644 index 0000000..88d9386 Binary files /dev/null and b/Assignment 2/report/ex211-hists-2.png differ diff --git a/Assignment 2/report/ex212-correlation-bars.png b/Assignment 2/report/ex212-correlation-bars.png new file mode 100644 index 0000000..3918fb1 Binary files /dev/null and b/Assignment 2/report/ex212-correlation-bars.png differ diff --git a/Assignment 2/report/ex212-scatters.png b/Assignment 2/report/ex212-scatters.png new file mode 100644 index 0000000..0ab049c Binary files /dev/null and b/Assignment 2/report/ex212-scatters.png differ diff --git a/Assignment 2/report/ex221-reconstructed-visualisation.png b/Assignment 2/report/ex221-reconstructed-visualisation.png new file mode 100644 index 0000000..ea070db Binary files /dev/null and b/Assignment 2/report/ex221-reconstructed-visualisation.png differ diff --git a/Assignment 2/report/ex221-scatter-3d.png b/Assignment 2/report/ex221-scatter-3d.png new file mode 100644 index 0000000..f5b45e1 
Binary files /dev/null and b/Assignment 2/report/ex221-scatter-3d.png differ diff --git a/Assignment 2/report/ex221-scatters.png b/Assignment 2/report/ex221-scatters.png new file mode 100644 index 0000000..ca6a932 Binary files /dev/null and b/Assignment 2/report/ex221-scatters.png differ diff --git a/Assignment 2/report/ex221-visualisation.png b/Assignment 2/report/ex221-visualisation.png new file mode 100644 index 0000000..2a0de19 Binary files /dev/null and b/Assignment 2/report/ex221-visualisation.png differ diff --git a/Assignment 2/report/ex231-hists.png b/Assignment 2/report/ex231-hists.png new file mode 100644 index 0000000..3e24297 Binary files /dev/null and b/Assignment 2/report/ex231-hists.png differ diff --git a/Assignment2/Data/wine.mat b/Assignment2/Data/wine.mat deleted file mode 100644 index da15efd..0000000 Binary files a/Assignment2/Data/wine.mat and /dev/null differ diff --git a/Assignment2/ex21.py b/Assignment2/ex21.py deleted file mode 100644 index 87d68cd..0000000 --- a/Assignment2/ex21.py +++ /dev/null @@ -1,62 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Sat Oct 10 21:28:45 2015 - -@author: camilstaps -""" - -import matplotlib.pyplot as plt -from scipy import io as sciio, stats -import numpy as np - -# 2.1.1 -wine = sciio.loadmat('./Data/wine.mat') -data = wine['X'] -atts = [str(s[0]) for s in wine['attributeNames'][0]] - -# Initial boxplots & histograms -plt.figure(figsize=(20,10)) -plt.boxplot(stats.zscore(data)) -plt.xticks(range(len(atts) + 1), [''] + atts, rotation=45, ha='right') -plt.show() - -plt.figure(figsize=(20,10)) -for i in range(len(data[0])): - plt.subplot(3, 4, i + 1) - plt.hist(data[:,i]) - plt.xlabel(atts[i]) -plt.show() - -# Removing known outliers -data = np.array([d for d in data if d[1] < 20 and # Volatide acidity - 0.01 < d[7] and d[7] < 10 and # Density - 0.5 < d[10] and d[10] < 200]) # Alcohol - -# Clean boxplots & histograms -plt.figure(figsize=(20,10)) -plt.boxplot(stats.zscore(data)) 
-plt.xticks(range(len(atts) + 1), [''] + atts, rotation=45, ha='right') -plt.show() - -plt.figure(figsize=(20,10)) -for i in range(len(data[0])): - plt.subplot(3, 4, i + 1) - plt.hist(data[:,i]) - plt.xlabel(atts[i]) -plt.show() - -# 2.1.2 -data = np.transpose(data) -plt.figure(figsize=(20,10)) -for i in range(len(data) - 1): - plt.subplot(3, 4, i + 1) - plt.scatter(data[i], data[11], marker='.', alpha=0.2) - plt.xlabel(atts[i]) -plt.show() - -fig, ax = plt.subplots(figsize=(10,5)) -it = np.arange(len(data) - 1) -ax.bar(it, [stats.pearsonr(data[i], data[11])[0] for i in it]) -ax.set_xticks(it + 0.5) -ax.set_xticklabels(atts, rotation=90, ha='center') -plt.show() -- cgit v1.2.3