From d8ac227e7e587669843598989bdeec41654a3fa7 Mon Sep 17 00:00:00 2001
From: Camil Staps
Date: Fri, 27 Nov 2015 14:45:18 +0100
Subject: Finish assignment 4

---
 Assignment 4/report/assignment4.tex            | 140 +++++++++++++++++++++++++
 Assignment 4/report/ex41_clusterfaces_k10.png  | Bin 0 -> 213379 bytes
 Assignment 4/report/ex41_clusterfaces_k100.png | Bin 0 -> 252533 bytes
 Assignment 4/report/ex41_clusterfaces_k30.png  | Bin 0 -> 244668 bytes
 Assignment 4/report/ex41_clusterings.png       | Bin 0 -> 197798 bytes
 Assignment 4/report/ex41_digits_cls_k10.png    | Bin 0 -> 37994 bytes
 Assignment 4/report/ex41_digits_cls_k20.png    | Bin 0 -> 35710 bytes
 Assignment 4/report/ex41_digits_cls_k30.png    | Bin 0 -> 35277 bytes
 Assignment 4/report/ex41_digits_org.png        | Bin 0 -> 43141 bytes
 Assignment 4/report/ex41_validity_measures.png | Bin 0 -> 27765 bytes
 Assignment 4/report/ex42_average.png           | Bin 0 -> 207962 bytes
 Assignment 4/report/ex42_complete.png          | Bin 0 -> 209757 bytes
 Assignment 4/report/ex42_single.png            | Bin 0 -> 204073 bytes
 13 files changed, 140 insertions(+)
 create mode 100644 Assignment 4/report/assignment4.tex
 create mode 100644 Assignment 4/report/ex41_clusterfaces_k10.png
 create mode 100644 Assignment 4/report/ex41_clusterfaces_k100.png
 create mode 100644 Assignment 4/report/ex41_clusterfaces_k30.png
 create mode 100644 Assignment 4/report/ex41_clusterings.png
 create mode 100644 Assignment 4/report/ex41_digits_cls_k10.png
 create mode 100644 Assignment 4/report/ex41_digits_cls_k20.png
 create mode 100644 Assignment 4/report/ex41_digits_cls_k30.png
 create mode 100644 Assignment 4/report/ex41_digits_org.png
 create mode 100644 Assignment 4/report/ex41_validity_measures.png
 create mode 100644 Assignment 4/report/ex42_average.png
 create mode 100644 Assignment 4/report/ex42_complete.png
 create mode 100644 Assignment 4/report/ex42_single.png

(limited to 'Assignment 4/report')

diff --git a/Assignment 4/report/assignment4.tex b/Assignment 4/report/assignment4.tex
new file mode 100644
index 0000000..e7c23ba
--- /dev/null
+++ b/Assignment 4/report/assignment4.tex	
@@ -0,0 +1,140 @@
+\documentclass[10pt,a4paper]{article}
+% Page layout (2cm margins) and \includegraphics support.
+\usepackage[margin=2cm]{geometry}
+\usepackage{graphicx}
+% Assignment number: appears in the title and prefixes the first-level item labels.
+\newcommand*{\assignment}{4}
+% List labels: "<assignment>.<n>." at level 1, arabic at level 2, roman at level 3.
+\usepackage{enumitem}
+\setenumerate[1]{label=\assignment.\arabic*.}
+\setenumerate[2]{label=\arabic*.}
+\setenumerate[3]{label=\roman*.}
+% Page style: fancyhdr with the header cleared and both rules set to zero width.
+\usepackage{fancyhdr}
+\renewcommand{\headrulewidth}{0pt}
+\renewcommand{\footrulewidth}{0pt}
+\fancyhead{}
+%\fancyfoot[C]{Copyright {\textcopyright} 2015 Camil Staps}
+\pagestyle{fancy}
+% Captions, subfigures and (borderless) hyperlinks; hyperref is loaded last.
+\usepackage{caption}
+\usepackage{subcaption}
+\usepackage[hidelinks]{hyperref}
+% No first-line paragraph indentation.
+\setlength{\parindent}{0pt}
+% Title block; the student number is set in a smaller size below the name.
+\title{Data Mining -- assignment \assignment}
+\author{Camil Staps\\{\small s4498062}}
+
+\begin{document}
+
+\maketitle
+\thispagestyle{fancy}
+
+\begin{enumerate}
+    \item \begin{enumerate}
+            \item See \autoref{fig:411}. K-means clustering only seems to work well on \texttt{synth1}. For \texttt{synth2}, \texttt{synth3} and \texttt{synth4} the clustering does not coincide with the true classes.
+
+                \begin{figure}[h]
+                    \centering
+                    \includegraphics[width=.7\linewidth]{ex41_clusterings}
+                    \caption{Clustering the \texttt{synth} data}
+                    \label{fig:411}
+                \end{figure}
+
+            \item See \autoref{fig:412}. On this data set, Jaccard (and Rand) seem to be the best choices to pick a good $K$. We would choose the $K$ for which the validity measure is maximal.
+
+                When more than four clusters are used, individual classes get split over several clusters, because there are only four classes. This doesn't matter for the purity, because the clusters themselves are still pure in that case. But using Jaccard or Rand we can see that we're actually overfitting.
+
+                \begin{figure}[h]
+                    \centering
+                    \includegraphics[width=.4\linewidth]{ex41_validity_measures}
+                    \caption{Validity measures for K-means clustering on \texttt{synth1}}
+                    \label{fig:412}
+                \end{figure}
+
+            \item See \autoref{fig:413}. The faces are not recognisable at all. The data is not represented very well by the cluster centroids. Even with higher $K$ (e.g.\ $30$, $100$), the faces aren't recognisable. Decreasing $K$ obviously wouldn't help with that either.
+
+                \begin{figure}[h]
+                    \begin{subfigure}{\linewidth}
+                        \centering
+                        \includegraphics[width=\linewidth]{ex41_clusterfaces_k10}
+                        \caption{$K=10$}
+                    \end{subfigure}
+                    \begin{subfigure}{\linewidth}
+                        \centering
+                        \includegraphics[width=\linewidth]{ex41_clusterfaces_k30}
+                        \caption{$K=30$}
+                    \end{subfigure}
+                    \begin{subfigure}{\linewidth}
+                        \centering
+                        \includegraphics[width=\linewidth]{ex41_clusterfaces_k100}
+                        \caption{$K=100$}
+                    \end{subfigure}
+                    \caption{Clustering wild faces}
+                    \label{fig:413}
+                \end{figure}
+
+            \item See \autoref{fig:414}. Some digits can be written in different ways ($1$ with or without diagonal bar; $9$ with or without bottom curve, etc.), and some digits look a lot alike ($4$ and $9$, $5$ and $8$, $3$ and $8$, etc.), so why would we expect the clusters to correspond to the actual digits?
+
+                With $K=20$ the visualisation is a lot better (though not perfect yet). $K=30$ produces good results, on these records at least.
+                
+                As noted above, the algorithm mostly seems to confuse $4$ and $9$, $5$ and $8$, and $3$ and $8$.
+
+                \begin{figure}[h]
+                    \centering
+                    \begin{subfigure}{.24\linewidth}
+                        \centering
+                        \includegraphics[width=\linewidth]{ex41_digits_org}
+                        \caption{Original}
+                    \end{subfigure}
+                    \begin{subfigure}{.24\linewidth}
+                        \centering
+                        \includegraphics[width=\linewidth]{ex41_digits_cls_k10}
+                        \caption{$K=10$}
+                    \end{subfigure}
+                    \begin{subfigure}{.24\linewidth}
+                        \centering
+                        \includegraphics[width=\linewidth]{ex41_digits_cls_k20}
+                        \caption{$K=20$}
+                    \end{subfigure}
+                    \begin{subfigure}{.24\linewidth}
+                        \centering
+                        \includegraphics[width=\linewidth]{ex41_digits_cls_k30}
+                        \caption{$K=30$}
+                    \end{subfigure}
+                    \caption{Clustering digits}
+                    \label{fig:414}
+                \end{figure}
+        \end{enumerate}
+
+    \item See \autoref{fig:42-single} through \ref{fig:42-average}.
+
+        In the dendrograms of \texttt{synth1} we can see a clear difference between the \emph{single} and \emph{complete} method. The first tends to split off final clusters early on (assuming a top-down approach), and splits off one final cluster at a time in the beginning. The second method tends to first break the set into two more or less equally sized clusters (in this case), then breaks both into halves again. This gives a more balanced dendrogram. We see something similar also in the other datasets, but sadly the results aren't as good there.
+
+        The \emph{average} method gives a result somewhere in between (not surprisingly): its dendrograms are slightly unbalanced, but not as much as the ones where we used the \emph{single} method.
+
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\linewidth]{ex42_single}
+            \caption{Hierarchical clustering \texttt{synth1} with the \emph{single} method}
+            \label{fig:42-single}
+        \end{figure}
+
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\linewidth]{ex42_complete}
+            \caption{Hierarchical clustering \texttt{synth1} with the \emph{complete} method}
+            \label{fig:42-complete}
+        \end{figure}
+
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\linewidth]{ex42_average}
+            \caption{Hierarchical clustering \texttt{synth1} with the \emph{average} method}
+            \label{fig:42-average}
+        \end{figure}
+\end{enumerate}
+
+\end{document}
+
diff --git a/Assignment 4/report/ex41_clusterfaces_k10.png b/Assignment 4/report/ex41_clusterfaces_k10.png
new file mode 100644
index 0000000..60e11a9
Binary files /dev/null and b/Assignment 4/report/ex41_clusterfaces_k10.png differ
diff --git a/Assignment 4/report/ex41_clusterfaces_k100.png b/Assignment 4/report/ex41_clusterfaces_k100.png
new file mode 100644
index 0000000..903b130
Binary files /dev/null and b/Assignment 4/report/ex41_clusterfaces_k100.png differ
diff --git a/Assignment 4/report/ex41_clusterfaces_k30.png b/Assignment 4/report/ex41_clusterfaces_k30.png
new file mode 100644
index 0000000..93065a6
Binary files /dev/null and b/Assignment 4/report/ex41_clusterfaces_k30.png differ
diff --git a/Assignment 4/report/ex41_clusterings.png b/Assignment 4/report/ex41_clusterings.png
new file mode 100644
index 0000000..df61d40
Binary files /dev/null and b/Assignment 4/report/ex41_clusterings.png differ
diff --git a/Assignment 4/report/ex41_digits_cls_k10.png b/Assignment 4/report/ex41_digits_cls_k10.png
new file mode 100644
index 0000000..1a24303
Binary files /dev/null and b/Assignment 4/report/ex41_digits_cls_k10.png differ
diff --git a/Assignment 4/report/ex41_digits_cls_k20.png b/Assignment 4/report/ex41_digits_cls_k20.png
new file mode 100644
index 0000000..028388f
Binary files /dev/null and b/Assignment 4/report/ex41_digits_cls_k20.png differ
diff --git a/Assignment 4/report/ex41_digits_cls_k30.png b/Assignment 4/report/ex41_digits_cls_k30.png
new file mode 100644
index 0000000..3c75b72
Binary files /dev/null and b/Assignment 4/report/ex41_digits_cls_k30.png differ
diff --git a/Assignment 4/report/ex41_digits_org.png b/Assignment 4/report/ex41_digits_org.png
new file mode 100644
index 0000000..d10849f
Binary files /dev/null and b/Assignment 4/report/ex41_digits_org.png differ
diff --git a/Assignment 4/report/ex41_validity_measures.png b/Assignment 4/report/ex41_validity_measures.png
new file mode 100644
index 0000000..916388b
Binary files /dev/null and b/Assignment 4/report/ex41_validity_measures.png differ
diff --git a/Assignment 4/report/ex42_average.png b/Assignment 4/report/ex42_average.png
new file mode 100644
index 0000000..48a969c
Binary files /dev/null and b/Assignment 4/report/ex42_average.png differ
diff --git a/Assignment 4/report/ex42_complete.png b/Assignment 4/report/ex42_complete.png
new file mode 100644
index 0000000..53cb86b
Binary files /dev/null and b/Assignment 4/report/ex42_complete.png differ
diff --git a/Assignment 4/report/ex42_single.png b/Assignment 4/report/ex42_single.png
new file mode 100644
index 0000000..599361a
Binary files /dev/null and b/Assignment 4/report/ex42_single.png differ
-- 
cgit v1.2.3