From 1beb50ef75a7db236a5ab3fdf88faf4c55f7c19d Mon Sep 17 00:00:00 2001 From: Camil Staps Date: Sun, 11 Oct 2015 21:14:28 +0200 Subject: Assignment 2 finished --- Assignment 2/report/assignment2.tex | 140 ++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 Assignment 2/report/assignment2.tex (limited to 'Assignment 2/report/assignment2.tex') diff --git a/Assignment 2/report/assignment2.tex b/Assignment 2/report/assignment2.tex new file mode 100644 index 0000000..6ae0b55 --- /dev/null +++ b/Assignment 2/report/assignment2.tex @@ -0,0 +1,140 @@ +\documentclass[10pt,a4paper]{article} + +\usepackage[margin=2cm]{geometry} +\usepackage{graphicx} + +\let\assignment2 + +\usepackage{enumitem} +\setenumerate[1]{label=\assignment.\arabic*.} +\setenumerate[2]{label=\arabic*.} +\setenumerate[3]{label=\roman*.} + +% textcomp package is not available everywhere, and we only need the Copyright symbol +% taken from http://tex.stackexchange.com/a/1677/23992 +\DeclareTextCommandDefault{\textregistered}{\textcircled{\check@mathfonts\fontsize\sf@size\z@\math@fontsfalse\selectfont R}} + +\usepackage{fancyhdr} +\renewcommand{\headrulewidth}{0pt} +\renewcommand{\footrulewidth}{0pt} +\fancyhead{} +%\fancyfoot[C]{Copyright {\textcopyright} 2015 Camil Staps} +\pagestyle{fancy} + +\usepackage{caption} +\usepackage{subcaption} + +\parindent0pt + +\title{Data Mining - assignment \assignment} +\author{Camil Staps\\\small{s4498062}} + +\begin{document} + +\maketitle +\thispagestyle{fancy} + +\begin{enumerate} + \item \begin{enumerate} + \item See figure \ref{fig:211-boxplots-1} through \ref{fig:211-hists-2}. It is clear that after eliminating the outliers we get a much better idea of the distributions. 
+ + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-boxplots-1} + \caption{Boxplots before eliminating outliers} + \label{fig:211-boxplots-1} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-hists-1} + \caption{Histograms before eliminating outliers} + \label{fig:211-hists-1} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-boxplots-2} + \caption{Boxplots after eliminating outliers} + \label{fig:211-boxplots-2} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex211-hists-2} + \caption{Histograms after eliminating outliers} + \label{fig:211-hists-2} + \end{figure} + + \item See figures~\ref{fig:212-scatters} and~\ref{fig:212-correlation-bars}. As can be seen in the latter, there is a large (positive) correlation between alcohol percentage and quality, and a large (negative) correlation between quality and each of density, volatile acidity and chlorides. + + From the first plots we also see that high-quality wine has a `citric acid' level of around $0.4$. + + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex212-scatters} + \caption{Scatter plots between attributes and wine quality} + \label{fig:212-scatters} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex212-correlation-bars} + \caption{Correlation coefficients between attributes and wine quality} + \label{fig:212-correlation-bars} + \end{figure} + \end{enumerate} + + \item \begin{enumerate} + \item See figure~\ref{fig:221-visualisation}. PCA seems to work quite well here. However, it is also clear (not from this picture though) that if we want to reconstruct all ten digits, we need more principal components. But in this case the digits are easily recognisable. This is even clearer in the scatter plot of PC0 against PC1 in figure~\ref{fig:221-scatters}. 
It is clear that almost all zeroes and ones can be distinguished by checking whether the first principal component is below some threshold. + + \begin{figure}[p] + \centering + \begin{subfigure}{.45\linewidth} + \includegraphics[width=\linewidth]{ex221-visualisation} + \caption{Initially} + \end{subfigure} + \begin{subfigure}{.45\linewidth} + \includegraphics[width=\linewidth]{ex221-reconstructed-visualisation} + \caption{Reconstructed with four principal components} + \end{subfigure} + \caption{Visualisations of the first ten zeroes and ones} + \label{fig:221-visualisation} + \end{figure} + + See figures~\ref{fig:221-scatters} and~\ref{fig:221-scatter-3d} for scatter plots. + + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex221-scatters} + \caption{Scatter plots of handwritten digits, projected on two principal components (NB: the principal components are numbered starting from $0$)} + \label{fig:221-scatters} + \end{figure} + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex221-scatter-3d} + \caption{Scatter plot of handwritten digits, projected on three principal components} + \label{fig:221-scatter-3d} + \end{figure} + \end{enumerate} + + \item \begin{enumerate} + \item \begin{enumerate} + \item \texttt{Mean: 8.000000\\Standard deviation: 5.385165} + \item These are all combinations of two and of four elements, along with their means: + + \texttt{[([2, 3], 2.5), ([2, 6], 4.0), ([2, 8], 5.0), ([2, 11], 6.5), ([2, 18], 10.0), ([3, 6], 4.5), ([3, 8], 5.5), ([3, 11], 7.0), ([3, 18], 10.5), ([6, 8], 7.0), ([6, 11], 8.5), ([6, 18], 12.0), ([8, 11], 9.5), ([8, 18], 13.0), ([11, 18], 14.5)]{\\}[([2, 3, 6, 8], 4.75), ([2, 3, 6, 11], 5.5), ([2, 3, 6, 18], 7.25), ([2, 3, 8, 11], 6.0), ([2, 3, 8, 18], 7.75), ([2, 3, 11, 18], 8.5), ([2, 6, 8, 11], 6.75), ([2, 6, 8, 18], 8.5), ([2, 6, 11, 18], 9.25), ([2, 8, 11, 18], 9.75), ([3, 6, 8, 11], 7.0), ([3, 6, 8, 18], 8.75), ([3, 6, 11, 18], 9.5), ([3, 8, 11, 18], 10.0), ([6, 8, 11, 18], 10.75)]} + \item \texttt{Mean of 
2-sample means: 8.000000\\Standard deviation of 2-sample means: 3.405877\\Mean of 4-sample means: 8.000000\\Standard deviation of 4-sample means: 1.702939} + \item \texttt{Means are equal (2): True\\Means are equal (4): True\\$\sigma_2 \approx \sigma/\sqrt2\times\sqrt{4/5}$: True\\$\sigma_4 \approx \sigma/\sqrt4\times\sqrt{2/5}$: True} + + Therefore, judging from this dataset, the Central Limit Theorem seems to hold. + \item See figure~\ref{fig:231-hists}. The peak of the histogram shifts from left to right as $N$ increases. + + \begin{figure}[p] + \centering + \includegraphics[width=\linewidth]{ex231-hists} + \caption{Histograms of the population distribution and the sample mean distributions} + \label{fig:231-hists} + \end{figure} + \end{enumerate} + \end{enumerate} +\end{enumerate} + +\end{document} + -- cgit v1.2.3