1 files changed, 85 insertions, 0 deletions
diff --git a/Assignment 3/report/assignment3.tex b/Assignment 3/report/assignment3.tex
new file mode 100644
index 0000000..1577d2a
--- /dev/null
+++ b/Assignment 3/report/assignment3.tex
@@ -0,0 +1,85 @@
+\documentclass[10pt,a4paper]{article}
+
+\usepackage[margin=2cm]{geometry}
+\usepackage{graphicx}
+
+\newcommand*{\assignment}{3} % assignment number; used in the title and in the top-level item labels
+
+\usepackage{enumitem}
+\setlist[enumerate,1]{label=\assignment.\arabic*.} % level 1: <assignment>.<n>.
+\setlist[enumerate,2]{label=\arabic*.}             % level 2: <n>.
+\setlist[enumerate,3]{label=\roman*.}              % level 3: lowercase roman numeral
+
+% textcomp package is not available everywhere, and we only need the Registered symbol (\textregistered).
+% Taken from http://tex.stackexchange.com/a/1677/23992; the body uses internal @-macros, hence \makeatletter.
+\makeatletter\DeclareTextCommandDefault{\textregistered}{\textcircled{\check@mathfonts\fontsize\sf@size\z@\math@fontsfalse\selectfont R}}\makeatother
+
+\usepackage{fancyhdr}
+\pagestyle{fancy}
+\fancyhead{} % clear all header fields
+%\fancyfoot[C]{Copyright {\textcopyright} 2015 Camil Staps}
+\renewcommand*{\headrulewidth}{0pt} % no rule under the header
+\renewcommand*{\footrulewidth}{0pt} % no rule above the footer
+
+\usepackage{caption}
+\usepackage{subcaption}
+
+\setlength{\parindent}{0pt} % no paragraph indentation
+
+\title{Data Mining - assignment \assignment}
+\author{Camil Staps\\{\small s4498062}}
+
+\begin{document}
+
+\maketitle
+\thispagestyle{fancy}
+
+\begin{enumerate}
+    \item \begin{enumerate}
+            \setcounter{enumii}{1}
+            \item If we \emph{increase} the value of \texttt{min\_samples\_split}, we allow fewer splits, making the tree (in general) smaller. If we \emph{decrease} the value, we allow more splits, making the tree (in general) larger.
+                
+                See Figure~\ref{fig:312} for the tree.
+
+                \begin{figure}[htbp] % a lone [h] is too restrictive; also allow top, bottom and float-page placement
+                    \centering
+                    \includegraphics[width=\linewidth]{ex31_wine_tree}
+                    \caption{Decision tree for wine data (also included as \texttt{ex31\_wine\_tree.pdf})}
+                    \label{fig:312}
+                \end{figure}
+
+            \item Python predicts this as a white wine. We can verify this by going through the tree ourselves: left, left, right. In this leaf there are $14$ red wines and $17$ white wines, so the prediction is `white'. Only the attributes Total sulfur dioxide (7), Chlorides (5) and Sulphates (10) are used for this. The attributes in the tree are zero-indexed.
+
+            \item 98\%.
+        \end{enumerate}
+
+    \item See Figure~\ref{fig:321}. As we can see there (or in the output of the Python program), the optimal tree depth reported by both Holdout CV and 10-Fold CV is $14$. However, this is not very stable across different runs. Unsurprisingly, Holdout CV is the less stable of the two.
+
+        \begin{figure}[htbp] % a lone [h] is too restrictive; also allow top, bottom and float-page placement
+            \centering
+            \includegraphics[width=.8\linewidth]{ex32_classification_error}
+            \caption{Classification error for different tree depths}
+            \label{fig:321}
+        \end{figure}
+
+        100-Fold CV is even more stable (though still not completely), and usually returns an optimal tree depth of around $8$.
+
+    \item \begin{enumerate}
+            \setcounter{enumii}{1}
+            \item See Figure~\ref{fig:332}. Both classifiers perform better than random guessing, because their curves lie everywhere above the line of the null hypothesis. The area under the curve of M1 is greater than the area under the curve of M2, so M1 performs better.
+
+                \begin{figure}[htbp] % a lone [h] is too restrictive; also allow top, bottom and float-page placement
+                    \centering
+                    \includegraphics[width=.8\linewidth]{ex33_roc_curves}
+                    \caption{ROC curves for two different classifiers}
+                    \label{fig:332}
+                \end{figure}
+
+            \item The baseline AUC would be $\frac12$. The AUC of M1 is $0.956$, and the AUC of M2 is $0.765$, so both perform better than the baseline.
+
+            \item The accuracy of the first classifier is 86\%, and the accuracy of the second is 69\%.
+        \end{enumerate}
+\end{enumerate}
+
+\end{document}
+