aboutsummaryrefslogtreecommitdiff
path: root/Assignment 6/ex61.py
blob: 89a152c6bdb3003091ad72235f00cc3579cef595 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding: utf-8 -*-
"""
Created on Fri Jan  8 13:15:01 2016

@author: camil
"""

from random import shuffle

import numpy as np

import scipy.io as sciio
from scipy.spatial.distance import cosine, correlation

import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
from sklearn import datasets

def ex611(sets=range(1, 5)):
    """Plot KNN cross-validated accuracy / error rate on the synth data sets.

    For each data set ./data/synth<n>.mat, draws a scatter plot of the data
    followed by one accuracy/error curve per distance measure (euclidean,
    manhattan, cosine, correlation) over n_neighbors = 1..40, using 10-fold
    cross-validation.

    Parameters
    ----------
    sets : iterable of int
        Indices of the synth<n>.mat files to process (default 1..4).
    """
    fs = 15  # title font size
    for n in sets:
        synth = sciio.loadmat('./data/synth' + str(n) + '.mat')
        X = synth['X']
        y = synth['y']

        plt.figure(figsize=(18, 3))
        plt.subplot(1, 5, 1)
        plt.scatter(X[:, 1], X[:, 0], c=y, s=50, alpha=0.5, marker='s')
        plt.title('synth' + str(n), fontsize=fs)

        # String metrics are handled natively by sklearn; cosine/correlation
        # are passed as scipy callables.
        measures = ['euclidean', 'manhattan', cosine, correlation]
        measure_names = ['euclidean', 'manhattan', 'cosine', 'correlation']
        ns = np.arange(1, 41, 1)

        # Shuffle X and y in unison before the (non-shuffling) KFold split.
        # zip() must be materialized into a list: random.shuffle requires a
        # mutable sequence, and in Python 3 zip() returns an iterator.
        combined = list(zip(X, y))
        shuffle(combined)
        X[:], y[:] = zip(*combined)

        cv = cross_validation.KFold(len(X), n_folds=10)

        for d, (measure, measure_name) in enumerate(zip(measures, measure_names)):
            accuracies = []
            for i in ns:
                acc = []
                for train, test in cv:
                    X_train, X_test = X[train], X[test]
                    y_train, y_test = y[train], y[test]

                    clf = KNeighborsClassifier(n_neighbors=i, metric=measure)
                    clf.fit(X_train, y_train.ravel())
                    acc.append(clf.score(X_test, y_test.ravel()))
                accuracies.append(np.mean(acc))
            # Error rate is simply the complement of the mean CV accuracy.
            errors = 1.0 - np.asarray(accuracies)

            plt.subplot(1, 5, 2 + d)
            plt.plot(ns, accuracies, c='blue')
            plt.plot(ns, errors, c='red')
            plt.xlabel('N neighbors')
            plt.ylabel('Accuracy / Error rate')
            plt.ylim([-0.05, 1.05])
            plt.title(measure_name, fontsize=fs)

        plt.tight_layout()
        plt.show()
       
def ex612():
    """Plot the leave-one-out KNN error rate on the Iris data set.

    Runs leave-one-out cross-validation for n_neighbors = 1..40 and plots the
    mean misclassification rate against the neighbor count.
    """
    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    # The LOO split depends only on len(X): build it once instead of
    # recreating the identical splitter on every iteration of the n-loop
    # (old-style sklearn CV objects are re-iterable).
    cv = cross_validation.LeaveOneOut(len(X))

    ns, scores = range(1, 41), []
    for n in ns:
        fold_errors = []
        for train, test in cv:
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]

            clf = KNeighborsClassifier(n_neighbors=n)
            clf.fit(X_train, y_train)
            # score() is accuracy (0 or 1 for a single held-out point);
            # 1 - accuracy is the error indicator.
            fold_errors.append(1 - clf.score(X_test, y_test))
        scores.append(np.mean(fold_errors))

    plt.plot(ns, scores, color='red')
    # Pad the y-limits by 10% of the score range for readability.
    plt.ylim([min(scores) - (max(scores) - min(scores)) * 0.1,
              max(scores) + (max(scores) - min(scores)) * 0.1])
    plt.title('Classifying the Iris data set with KNN', fontsize=18)
    plt.xlabel('N neighbors')
    plt.ylabel('Error rate')
    plt.show()
    
def ex613():
    """Predict wine alcohol percentage with leave-one-out KNN regression.

    For n_neighbors = 1..40, predicts each sample's last attribute (alcohol
    percentage) as the mean of its n nearest neighbors' values, excluding the
    sample itself, and plots the mean squared prediction error.
    """
    wine = sciio.loadmat('./data/wine.mat')
    #classNames = [str(n[0][0]) for n in wine['classNames']]
    X = wine['X'][:, :-1]                       # features: all but last column
    y = np.array(wine['X'][:, -1], dtype='f')   # target: alcohol percentage

    ns, scores = range(1, 41, 1), []
    for n in ns:
        # Dummy constant labels: the classifier is used only as a neighbor
        # index via kneighbors(), never for classification.
        clf = KNeighborsClassifier(n_neighbors=n)
        clf.fit(X, np.repeat(0, len(X)))
        sq_errors = []
        for i, (r, p) in enumerate(zip(X, y)):
            # Query n+1 neighbors so the query point itself can be discarded.
            ind = clf.kneighbors([r], n_neighbors=n + 1, return_distance=False)
            # Cap the filtered list at n: with duplicate rows the query point
            # may not appear among its own neighbors, and without the cap the
            # mean would be taken over n+1 values instead of n.
            neighbors = [j for j in ind[0] if j != i][:n]
            prediction = np.mean(y[neighbors])
            sq_errors.append((prediction - p) ** 2)
        scores.append(np.mean(sq_errors))

    plt.plot(ns, scores, color='red')
    # Pad the y-limits by 10% of the score range for readability.
    plt.ylim([min(scores) - (max(scores) - min(scores)) * 0.1,
              max(scores) + (max(scores) - min(scores)) * 0.1])
    plt.title('Predicting alcohol percentage with KNN', fontsize=18)
    plt.xlabel('N neighbors')
    plt.ylabel('Average error')
    plt.show()

if __name__ == '__main__':
    # Uncomment the exercise to run; only ex613 (wine regression) is active.
    #ex611()
    #ex612()
    ex613()