# -*- coding: utf-8 -*-
"""
Created on Fri Jan 8 13:15:01 2016
@author: camil
"""
import numpy as np
import scipy.io as sciio
from scipy.spatial.distance import cosine, correlation
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn import datasets
def ex611(sets=range(1,5)):
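    """Plot each synth dataset and, for four distance measures, the 10-fold
    cross-validated accuracy and error rate of KNN for k = 1..40."""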
fs = 15
for n in sets:
synth = sciio.loadmat('./data/synth' + str(n) + '.mat')
X = synth['X']
y = synth['y']
plt.figure(figsize=(18,3))
plt.subplot(1, 5, 1)
        # ravel() flattens the (n, 1) label column loaded from MATLAB into 1-D
        plt.scatter(X[:, 1], X[:, 0], c=y.ravel(), s=50, alpha=0.5, marker='s')
plt.title('synth' + str(n), fontsize=fs)
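        # two built-in metric strings plus two SciPy distance callables,
        # which KNeighborsClassifier also accepts as `metric`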
measures = ['euclidean', 'manhattan', cosine, correlation]
measure_names = ['euclidean', 'manhattan', 'cosine', 'correlation']
        ns = np.arange(1, 41)
        # shuffle the samples so the contiguous CV folds are not class-ordered
        perm = np.random.permutation(len(X))
        X, y = X[perm], y[perm]
        # materialise the folds once so they can be reused for every k and metric
        cv = list(KFold(n_splits=10).split(X))
        for d, (measure, measure_name) in enumerate(zip(measures, measure_names)):
accuracies = []
for i in ns:
acc = []
for train, test in cv:
X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
clf = KNeighborsClassifier(n_neighbors=i, metric=measure)
clf.fit(X_train, y_train.ravel())
acc.append(clf.score(X_test, y_test.ravel()))
accuracies.append(np.mean(acc))
            errors = 1.0 - np.array(accuracies)
plt.subplot(1, 5, 2+d)
            plt.plot(ns, accuracies, c='blue', label='accuracy')
            plt.plot(ns, errors, c='red', label='error rate')
            plt.legend(loc='best', fontsize=8)
plt.xlabel('N neighbors')
plt.ylabel('Accuracy / Error rate')
plt.ylim([-0.05, 1.05])
plt.title(measure_name, fontsize=fs)
plt.tight_layout()
plt.show()
def ex612():
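    """Plot the leave-one-out error rate of KNN on the Iris dataset for
    k = 1..40."""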
iris = datasets.load_iris()
X, y = iris.data, iris.target
ns, scores = range(1,41), []
for n in ns:
        cv = LeaveOneOut().split(X)
scores_ = []
for train, test in cv:
X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
clf = KNeighborsClassifier(n_neighbors=n)
clf.fit(X_train, y_train)
scores_.append(1-clf.score(X_test, y_test))
scores.append(np.mean(scores_))
plt.plot(ns, scores, color='red')
plt.ylim([min(scores) - (max(scores)-min(scores))*0.1,
max(scores) + (max(scores)-min(scores))*0.1])
plt.title('Classifying the Iris data set with KNN', fontsize=18)
plt.xlabel('N neighbors')
plt.ylabel('Error rate')
plt.show()
def ex613():
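    """Predict the wine alcohol percentage (last column of X) as the mean
    over the k nearest other samples and plot the mean squared error for
    k = 1..40."""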
wine = sciio.loadmat('./data/wine.mat')
#classNames = [str(n[0][0]) for n in wine['classNames']]
X = wine['X'][:,:-1]
y = np.array(wine['X'][:,-1], dtype='f')
    ns, scores = range(1, 41), []
    for n in ns:
        # NearestNeighbors yields neighbour indices directly, with no need to
        # fit a classifier on dummy labels first
        nn = NearestNeighbors(n_neighbors=n + 1).fit(X)
scores_ = []
        for i, (r, p) in enumerate(zip(X, y)):
            # query n+1 neighbours so the query point itself can be dropped,
            # then keep the n nearest remaining points
            ind = nn.kneighbors([r], return_distance=False)
            neighbors = [j for j in ind[0] if j != i][:n]
            prediction = np.mean(y[neighbors])
            scores_.append((prediction - p) ** 2)
scores.append(np.mean(scores_))
plt.plot(ns, scores, color='red')
plt.ylim([min(scores) - (max(scores)-min(scores))*0.1,
max(scores) + (max(scores)-min(scores))*0.1])
plt.title('Predicting alcohol percentage with KNN', fontsize=18)
plt.xlabel('N neighbors')
    plt.ylabel('Mean squared error')
plt.show()
if __name__ == '__main__':
#ex611()
#ex612()
ex613()