From d8ac227e7e587669843598989bdeec41654a3fa7 Mon Sep 17 00:00:00 2001 From: Camil Staps Date: Fri, 27 Nov 2015 14:45:18 +0100 Subject: Finish assignment 4 --- Assignment 4/report/assignment4.tex | 140 +++++++++++++++++++++++++ Assignment 4/report/ex41_clusterfaces_k10.png | Bin 0 -> 213379 bytes Assignment 4/report/ex41_clusterfaces_k100.png | Bin 0 -> 252533 bytes Assignment 4/report/ex41_clusterfaces_k30.png | Bin 0 -> 244668 bytes Assignment 4/report/ex41_clusterings.png | Bin 0 -> 197798 bytes Assignment 4/report/ex41_digits_cls_k10.png | Bin 0 -> 37994 bytes Assignment 4/report/ex41_digits_cls_k20.png | Bin 0 -> 35710 bytes Assignment 4/report/ex41_digits_cls_k30.png | Bin 0 -> 35277 bytes Assignment 4/report/ex41_digits_org.png | Bin 0 -> 43141 bytes Assignment 4/report/ex41_validity_measures.png | Bin 0 -> 27765 bytes Assignment 4/report/ex42_average.png | Bin 0 -> 207962 bytes Assignment 4/report/ex42_complete.png | Bin 0 -> 209757 bytes Assignment 4/report/ex42_single.png | Bin 0 -> 204073 bytes 13 files changed, 140 insertions(+) create mode 100644 Assignment 4/report/assignment4.tex create mode 100644 Assignment 4/report/ex41_clusterfaces_k10.png create mode 100644 Assignment 4/report/ex41_clusterfaces_k100.png create mode 100644 Assignment 4/report/ex41_clusterfaces_k30.png create mode 100644 Assignment 4/report/ex41_clusterings.png create mode 100644 Assignment 4/report/ex41_digits_cls_k10.png create mode 100644 Assignment 4/report/ex41_digits_cls_k20.png create mode 100644 Assignment 4/report/ex41_digits_cls_k30.png create mode 100644 Assignment 4/report/ex41_digits_org.png create mode 100644 Assignment 4/report/ex41_validity_measures.png create mode 100644 Assignment 4/report/ex42_average.png create mode 100644 Assignment 4/report/ex42_complete.png create mode 100644 Assignment 4/report/ex42_single.png (limited to 'Assignment 4/report') diff --git a/Assignment 4/report/assignment4.tex b/Assignment 4/report/assignment4.tex new file mode 100644 index 
0000000..e7c23ba --- /dev/null +++ b/Assignment 4/report/assignment4.tex @@ -0,0 +1,140 @@ +\documentclass[10pt,a4paper]{article} + +\usepackage[margin=2cm]{geometry} +\usepackage{graphicx} + +\let\assignment4 + +\usepackage{enumitem} +\setenumerate[1]{label=\assignment.\arabic*.} +\setenumerate[2]{label=\arabic*.} +\setenumerate[3]{label=\roman*.} + +\usepackage{fancyhdr} +\renewcommand{\headrulewidth}{0pt} +\renewcommand{\footrulewidth}{0pt} +\fancyhead{} +%\fancyfoot[C]{Copyright {\textcopyright} 2015 Camil Staps} +\pagestyle{fancy} + +\usepackage{caption} +\usepackage{subcaption} +\usepackage[hidelinks]{hyperref} + +\parindent0pt + +\title{Data Mining - assignment \assignment} +\author{Camil Staps\\\small{s4498062}} + +\begin{document} + +\maketitle +\thispagestyle{fancy} + +\begin{enumerate} + \item \begin{enumerate} + \item See \autoref{fig:411}. K-means clustering only seems to work well on \texttt{synth1}. For \texttt{synth2}, \texttt{synth3} and \texttt{synth4} the clustering does not coincide with the true classes. + + \begin{figure}[h] + \centering + \includegraphics[width=.7\linewidth]{ex41_clusterings} + \caption{Clustering the \texttt{synth} data} + \label{fig:411} + \end{figure} + + \item See \autoref{fig:412}. On this data set, Jaccard (and Rand) seem to be the best choices to pick a good $K$. We would choose the $K$ for which the validity measure is maximal. + + When more than four clusters are used, classes are internally separated, because there are only four classes. This doesn't matter for the purity, because the clusters themselves are still pure in that case. But using Jaccard or Rand we can see that we're actually overfitting. + + \begin{figure}[h] + \centering + \includegraphics[width=.4\linewidth]{ex41_validity_measures} + \caption{Validity measures for K-means clustering on \texttt{synth1}} + \label{fig:412} + \end{figure} + + \item See \autoref{fig:413}. The faces are not recognisable at all. 
The data is not represented very well by the cluster centroids. Even with higher $K$ (e.g.\ $30$, $100$), the faces aren't recognisable. Decreasing $K$ obviously wouldn't help with that either. + + \begin{figure}[h] + \begin{subfigure}{\linewidth} + \centering + \includegraphics[width=\linewidth]{ex41_clusterfaces_k10} + \caption{$K=10$} + \end{subfigure} + \begin{subfigure}{\linewidth} + \centering + \includegraphics[width=\linewidth]{ex41_clusterfaces_k30} + \caption{$K=30$} + \end{subfigure} + \begin{subfigure}{\linewidth} + \centering + \includegraphics[width=\linewidth]{ex41_clusterfaces_k100} + \caption{$K=100$} + \end{subfigure} + \caption{Clustering wild faces} + \label{fig:413} + \end{figure} + + \item See \autoref{fig:414}. Some digits can be written in different ways ($1$ with or without diagonal bar; $9$ with or without bottom curve, etc.), and some numbers look a lot like each other ($4$ and $9$, $5$ and $8$, $3$ and $8$, etc.), so why would we expect the clusters to correspond to the actual digits? + + With $K=20$ visualising goes a lot better (though not perfect yet). $K=30$ produces good results, on these records at least. + + As above, the algorithm seems to confuse mostly $4$ and $9$, $5$ and $8$ and $3$ and $8$. 
+ + \begin{figure}[h] + \centering + \begin{subfigure}{.24\linewidth} + \centering + \includegraphics[width=\linewidth]{ex41_digits_org} + \caption{Original} + \end{subfigure} + \begin{subfigure}{.24\linewidth} + \centering + \includegraphics[width=\linewidth]{ex41_digits_cls_k10} + \caption{$K=10$} + \end{subfigure} + \begin{subfigure}{.24\linewidth} + \centering + \includegraphics[width=\linewidth]{ex41_digits_cls_k20} + \caption{$K=20$} + \end{subfigure} + \begin{subfigure}{.24\linewidth} + \centering + \includegraphics[width=\linewidth]{ex41_digits_cls_k30} + \caption{$K=30$} + \end{subfigure} + \caption{Clustering digits} + \label{fig:414} + \end{figure} + \end{enumerate} + + \item See \autoref{fig:42-single} through \ref{fig:42-average}. + + In the dendrograms of \texttt{synth1} we can see a clear difference between the \emph{single} and \emph{complete} methods. The first tends to split off final clusters early on (assuming a top-down approach), and splits off one final cluster at a time in the beginning. The second method tends to first break the set into two more or less equally sized clusters (in this case), then breaks both into halves again. This gives a more balanced dendrogram. We see something similar also in the other datasets, but sadly the results aren't as good there. + + The \emph{average} method gives a result somewhere in between (not surprisingly): its dendrograms are slightly unbalanced, but not as much as the ones where we used the \emph{single} method. 
+ + \begin{figure}[h] + \centering + \includegraphics[width=\linewidth]{ex42_single} + \caption{Hierarchical clustering \texttt{synth1} with the \emph{single} method} + \label{fig:42-single} + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=\linewidth]{ex42_complete} + \caption{Hierarchical clustering \texttt{synth1} with the \emph{complete} method} + \label{fig:42-complete} + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=\linewidth]{ex42_average} + \caption{Hierarchical clustering \texttt{synth1} with the \emph{average} method} + \label{fig:42-average} + \end{figure} +\end{enumerate} + +\end{document} + diff --git a/Assignment 4/report/ex41_clusterfaces_k10.png b/Assignment 4/report/ex41_clusterfaces_k10.png new file mode 100644 index 0000000..60e11a9 Binary files /dev/null and b/Assignment 4/report/ex41_clusterfaces_k10.png differ diff --git a/Assignment 4/report/ex41_clusterfaces_k100.png b/Assignment 4/report/ex41_clusterfaces_k100.png new file mode 100644 index 0000000..903b130 Binary files /dev/null and b/Assignment 4/report/ex41_clusterfaces_k100.png differ diff --git a/Assignment 4/report/ex41_clusterfaces_k30.png b/Assignment 4/report/ex41_clusterfaces_k30.png new file mode 100644 index 0000000..93065a6 Binary files /dev/null and b/Assignment 4/report/ex41_clusterfaces_k30.png differ diff --git a/Assignment 4/report/ex41_clusterings.png b/Assignment 4/report/ex41_clusterings.png new file mode 100644 index 0000000..df61d40 Binary files /dev/null and b/Assignment 4/report/ex41_clusterings.png differ diff --git a/Assignment 4/report/ex41_digits_cls_k10.png b/Assignment 4/report/ex41_digits_cls_k10.png new file mode 100644 index 0000000..1a24303 Binary files /dev/null and b/Assignment 4/report/ex41_digits_cls_k10.png differ diff --git a/Assignment 4/report/ex41_digits_cls_k20.png b/Assignment 4/report/ex41_digits_cls_k20.png new file mode 100644 index 0000000..028388f Binary files /dev/null and b/Assignment 
4/report/ex41_digits_cls_k20.png differ diff --git a/Assignment 4/report/ex41_digits_cls_k30.png b/Assignment 4/report/ex41_digits_cls_k30.png new file mode 100644 index 0000000..3c75b72 Binary files /dev/null and b/Assignment 4/report/ex41_digits_cls_k30.png differ diff --git a/Assignment 4/report/ex41_digits_org.png b/Assignment 4/report/ex41_digits_org.png new file mode 100644 index 0000000..d10849f Binary files /dev/null and b/Assignment 4/report/ex41_digits_org.png differ diff --git a/Assignment 4/report/ex41_validity_measures.png b/Assignment 4/report/ex41_validity_measures.png new file mode 100644 index 0000000..916388b Binary files /dev/null and b/Assignment 4/report/ex41_validity_measures.png differ diff --git a/Assignment 4/report/ex42_average.png b/Assignment 4/report/ex42_average.png new file mode 100644 index 0000000..48a969c Binary files /dev/null and b/Assignment 4/report/ex42_average.png differ diff --git a/Assignment 4/report/ex42_complete.png b/Assignment 4/report/ex42_complete.png new file mode 100644 index 0000000..53cb86b Binary files /dev/null and b/Assignment 4/report/ex42_complete.png differ diff --git a/Assignment 4/report/ex42_single.png b/Assignment 4/report/ex42_single.png new file mode 100644 index 0000000..599361a Binary files /dev/null and b/Assignment 4/report/ex42_single.png differ -- cgit v1.2.3