author | Camil Staps | 2015-11-27 14:45:18 +0100 |
---|---|---|
committer | Camil Staps | 2015-11-27 14:45:18 +0100 |
commit | d8ac227e7e587669843598989bdeec41654a3fa7 (patch) | |
tree | 8905e41256dd2a7ea7c67799cdbe09a9db2fbba2 /Assignment 4/report/assignment4.tex | |
parent | Start assignment 4 (diff) |
Finish assignment 4
Diffstat (limited to 'Assignment 4/report/assignment4.tex')
-rw-r--r-- | Assignment 4/report/assignment4.tex | 140 |
1 files changed, 140 insertions, 0 deletions
diff --git a/Assignment 4/report/assignment4.tex b/Assignment 4/report/assignment4.tex
new file mode 100644
index 0000000..e7c23ba
--- /dev/null
+++ b/Assignment 4/report/assignment4.tex
@@ -0,0 +1,140 @@
+\documentclass[10pt,a4paper]{article}
+
+\usepackage[margin=2cm]{geometry}
+\usepackage{graphicx}
+
+\let\assignment4
+
+\usepackage{enumitem}
+\setenumerate[1]{label=\assignment.\arabic*.}
+\setenumerate[2]{label=\arabic*.}
+\setenumerate[3]{label=\roman*.}
+
+\usepackage{fancyhdr}
+\renewcommand{\headrulewidth}{0pt}
+\renewcommand{\footrulewidth}{0pt}
+\fancyhead{}
+%\fancyfoot[C]{Copyright {\textcopyright} 2015 Camil Staps}
+\pagestyle{fancy}
+
+\usepackage{caption}
+\usepackage{subcaption}
+\usepackage[hidelinks]{hyperref}
+
+\parindent0pt
+
+\title{Data Mining - assignment \assignment}
+\author{Camil Staps\\\small{s4498062}}
+
+\begin{document}
+
+\maketitle
+\thispagestyle{fancy}
+
+\begin{enumerate}
+    \item \begin{enumerate}
+        \item See \autoref{fig:411}. K-means clustering only seems to work well on \texttt{synth1}. For \texttt{synth2}, \texttt{synth3} and \texttt{synth4} the clustering does not coincide with the true classes.
+
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=.7\linewidth]{ex41_clusterings}
+            \caption{Clustering the \texttt{synth} data}
+            \label{fig:411}
+        \end{figure}
+
+        \item See \autoref{fig:412}. On this data set, Jaccard (and Rand) seem to be the best choices for picking a good $K$: we would choose the $K$ for which the validity measure is maximal.
+
+        When more than four clusters are used, classes are split up internally, because there are only four classes. This does not affect the purity, because the clusters themselves are still pure in that case. With Jaccard or Rand, however, we can see that we are actually overfitting.
+
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=.4\linewidth]{ex41_validity_measures}
+            \caption{Validity measures for K-means clustering on \texttt{synth1}}
+            \label{fig:412}
+        \end{figure}
+
+        \item See \autoref{fig:413}. The faces are not recognisable at all: the data is not represented very well by the cluster centroids. Even with a higher $K$ (e.g.\ $30$ or $100$) the faces aren't recognisable, and decreasing $K$ obviously wouldn't help with that either.
+
+        \begin{figure}[h]
+            \begin{subfigure}{\linewidth}
+                \centering
+                \includegraphics[width=\linewidth]{ex41_clusterfaces_k10}
+                \caption{$K=10$}
+            \end{subfigure}
+            \begin{subfigure}{\linewidth}
+                \centering
+                \includegraphics[width=\linewidth]{ex41_clusterfaces_k30}
+                \caption{$K=30$}
+            \end{subfigure}
+            \begin{subfigure}{\linewidth}
+                \centering
+                \includegraphics[width=\linewidth]{ex41_clusterfaces_k100}
+                \caption{$K=100$}
+            \end{subfigure}
+            \caption{Clustering wild faces}
+            \label{fig:413}
+        \end{figure}
+
+        \item See \autoref{fig:414}. Some digits can be written in different ways ($1$ with or without a diagonal bar, $9$ with or without a bottom curve, etc.), and some digits look a lot like each other ($4$ and $9$, $5$ and $8$, $3$ and $8$, etc.), so we should not expect the clusters to correspond exactly to the actual digits.
+
+        With $K=20$ the reconstruction looks a lot better (though still not perfect). $K=30$ produces good results, on these records at least.
+
+        As above, the algorithm seems to confuse mostly $4$ and $9$, $5$ and $8$, and $3$ and $8$.
+
+        \begin{figure}[h]
+            \centering
+            \begin{subfigure}{.24\linewidth}
+                \centering
+                \includegraphics[width=\linewidth]{ex41_digits_org}
+                \caption{Original}
+            \end{subfigure}
+            \begin{subfigure}{.24\linewidth}
+                \centering
+                \includegraphics[width=\linewidth]{ex41_digits_cls_k10}
+                \caption{$K=10$}
+            \end{subfigure}
+            \begin{subfigure}{.24\linewidth}
+                \centering
+                \includegraphics[width=\linewidth]{ex41_digits_cls_k20}
+                \caption{$K=20$}
+            \end{subfigure}
+            \begin{subfigure}{.24\linewidth}
+                \centering
+                \includegraphics[width=\linewidth]{ex41_digits_cls_k30}
+                \caption{$K=30$}
+            \end{subfigure}
+            \caption{Clustering digits}
+            \label{fig:414}
+        \end{figure}
+    \end{enumerate}
+
+    \item See \autoref{fig:42-single} through \ref{fig:42-average}.
+
+    In the dendrograms of \texttt{synth1} we can see a clear difference between the \emph{single} and \emph{complete} methods. Reading the dendrogram top-down, the former tends to split off final clusters early on, peeling away one final cluster at a time in the beginning. The latter tends to first break the set into two more or less equally sized clusters (in this case), and then to break both into halves again, which gives a more balanced dendrogram. We see something similar in the other datasets, but sadly the results aren't as good there.
+
+    The \emph{average} method unsurprisingly gives a result somewhere in between: its dendrograms are slightly unbalanced, but not as much as the ones produced by the \emph{single} method.
+
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=\linewidth]{ex42_single}
+        \caption{Hierarchical clustering of \texttt{synth1} with the \emph{single} method}
+        \label{fig:42-single}
+    \end{figure}
+
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=\linewidth]{ex42_complete}
+        \caption{Hierarchical clustering of \texttt{synth1} with the \emph{complete} method}
+        \label{fig:42-complete}
+    \end{figure}
+
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=\linewidth]{ex42_average}
+        \caption{Hierarchical clustering of \texttt{synth1} with the \emph{average} method}
+        \label{fig:42-average}
+    \end{figure}
+\end{enumerate}
+
+\end{document}
+
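The commit contains only the report, not the analysis code behind it. As a minimal Python sketch of how the validity measures from exercise 4.1.2 can be computed: the pair-counting definitions of Rand and Jaccard and the majority-class definition of purity are standard, but the scikit-learn K-means call and the X/y arrays standing in for synth1 are assumptions, not the author's actual setup.

    import numpy as np
    from itertools import combinations
    from sklearn.cluster import KMeans  # assumed tooling, not confirmed by the report

    def pair_counts(y_true, y_pred):
        """Classify all point pairs by class/cluster agreement."""
        a = b = c = d = 0
        for i, j in combinations(range(len(y_true)), 2):
            same_class = y_true[i] == y_true[j]
            same_cluster = y_pred[i] == y_pred[j]
            if same_cluster and same_class:
                a += 1
            elif same_cluster:
                b += 1
            elif same_class:
                c += 1
            else:
                d += 1
        return a, b, c, d

    def rand_index(y_true, y_pred):
        a, b, c, d = pair_counts(y_true, y_pred)
        return (a + d) / (a + b + c + d)  # fraction of consistently treated pairs

    def jaccard_index(y_true, y_pred):
        a, b, c, _ = pair_counts(y_true, y_pred)
        return a / (a + b + c)  # like Rand, but ignores the "easy" d pairs

    def purity(y_true, y_pred):
        # Each cluster is credited with its majority class (integer labels
        # assumed). Splitting an already pure cluster leaves this unchanged,
        # which is why purity cannot detect overfitting.
        hits = sum(np.bincount(y_true[y_pred == k]).max()
                   for k in np.unique(y_pred))
        return hits / len(y_true)

    # Hypothetical sweep over K on synth1 (X: features, y: true classes):
    # for k in range(2, 9):
    #     pred = KMeans(n_clusters=k, n_init=10).fit_predict(X)
    #     print(k, rand_index(y, pred), jaccard_index(y, pred), purity(y, pred))

This makes the report's overfitting observation concrete: splitting a pure cluster turns same-class pairs that shared a cluster into same-class pairs in different clusters, which lowers both Rand and Jaccard while leaving purity untouched.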
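Exercises 4.1.3 and 4.1.4 judge the clusterings by rendering the centroids as images. A sketch of that visualisation, again hedged: it assumes each row of the data matrix is a flattened grey-scale image, and the image dimensions in the example calls are guesses rather than values taken from the report.

    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans

    def show_centroids(X, k, shape):
        """Fit K-means and draw each cluster centroid as a grey-scale image."""
        km = KMeans(n_clusters=k, n_init=10).fit(X)
        fig, axes = plt.subplots(1, k, figsize=(1.2 * k, 1.5))
        for ax, centroid in zip(axes, km.cluster_centers_):
            ax.imshow(centroid.reshape(shape), cmap='gray')
            ax.axis('off')
        plt.show()

    # Hypothetical calls; variable names and shapes are assumptions:
    # show_centroids(digits, 10, (8, 8))   # cf. the digit centroid figures
    # show_centroids(faces, 30, (64, 64))  # cf. the blurry face centroids

The blurriness of the face centroids is expected: a K-means centroid is the mean of all points in its cluster, and averaging many unaligned faces washes out detail no matter how large K is.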
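For exercise 4.2, the three dendrograms can be produced with SciPy's hierarchical clustering routines. The report does not say which tool was used, so this is only a sketch under that assumption; X again stands in for the synth1 features.

    import matplotlib.pyplot as plt
    from scipy.cluster.hierarchy import linkage, dendrogram

    def compare_linkages(X, methods=('single', 'complete', 'average')):
        """Draw one dendrogram per linkage method for the same data."""
        fig, axes = plt.subplots(1, len(methods), figsize=(4 * len(methods), 3))
        for ax, method in zip(axes, methods):
            Z = linkage(X, method=method)  # agglomerative: the tree is built bottom-up
            dendrogram(Z, ax=ax, no_labels=True)
            ax.set_title(method)
        plt.show()

    # compare_linkages(X)  # X: the synth1 feature matrix (assumed variable)

The shapes the report describes follow from the linkage definitions: single linkage merges the two clusters containing the closest pair of points, which encourages chaining and the one-leaf-at-a-time splits seen in its unbalanced dendrogram, while complete linkage merges on the farthest pair, favouring compact, similarly sized clusters and hence a balanced tree; average linkage sits between the two.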