# -*- coding: utf-8 -*-
"""
KNN exercises: classification on synthetic data sets and Iris, and
nearest-neighbor "regression" of wine alcohol percentage.

Created on Fri Jan 8 13:15:01 2016

@author: camil
"""
import numpy as np
import scipy.io as sciio
from scipy.spatial.distance import cosine, correlation
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn import datasets


def ex611(sets=range(1, 5)):
    """Plot 10-fold CV accuracy and error rate of KNN on ./data/synth<n>.mat
    for four distance measures and 1..40 neighbors.

    Parameters
    ----------
    sets : iterable of int
        Indices n of the synth<n>.mat files to process (default 1..4).
    """
    fs = 15
    for n in sets:
        synth = sciio.loadmat('./data/synth' + str(n) + '.mat')
        X = synth['X']
        y = synth['y']

        plt.figure(figsize=(18, 3))
        plt.subplot(1, 5, 1)
        plt.scatter(X[:, 1], X[:, 0], c=y.ravel(), s=50, alpha=0.5, marker='s')
        plt.title('synth' + str(n), fontsize=fs)

        measures = ['euclidean', 'manhattan', cosine, correlation]
        measure_names = ['euclidean', 'manhattan', 'cosine', 'correlation']
        ns = np.arange(1, 41, 1)

        # Shuffle X and y together via an index permutation.
        # (The original zip/shuffle/unzip trick fails on Python 3,
        # where zip() returns an iterator with no len().)
        perm = np.random.permutation(len(X))
        X, y = X[perm], y[perm]

        # Folds are deterministic (no shuffle) since the data was
        # already shuffled above; sklearn >= 0.20 API.
        cv = KFold(n_splits=10)

        for d, (measure, measure_name) in enumerate(zip(measures, measure_names)):
            accuracies = []
            for i in ns:
                acc = []
                for train, test in cv.split(X):
                    X_train, X_test = X[train], X[test]
                    y_train, y_test = y[train], y[test]
                    clf = KNeighborsClassifier(n_neighbors=int(i), metric=measure)
                    clf.fit(X_train, y_train.ravel())
                    acc.append(clf.score(X_test, y_test.ravel()))
                accuracies.append(np.mean(acc))
            errors = 1.0 - np.asarray(accuracies)

            plt.subplot(1, 5, 2 + d)
            plt.plot(ns, accuracies, c='blue')
            plt.plot(ns, errors, c='red')
            plt.xlabel('N neighbors')
            plt.ylabel('Accuracy / Error rate')
            plt.ylim([-0.05, 1.05])
            plt.title(measure_name, fontsize=fs)
        plt.tight_layout()
        plt.show()


def ex612():
    """Plot leave-one-out CV error rate of KNN on Iris for 1..40 neighbors."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    ns, scores = range(1, 41), []
    loo = LeaveOneOut()  # sklearn >= 0.20 API (no sample count argument)
    for n in ns:
        scores_ = []
        for train, test in loo.split(X):
            clf = KNeighborsClassifier(n_neighbors=n)
            clf.fit(X[train], y[train])
            # score() on a single held-out sample is 0 or 1 -> error rate.
            scores_.append(1 - clf.score(X[test], y[test]))
        scores.append(np.mean(scores_))
    plt.plot(ns, scores, color='red')
    span = max(scores) - min(scores)
    plt.ylim([min(scores) - span * 0.1, max(scores) + span * 0.1])
    plt.title('Classifying the Iris data set with KNN', fontsize=18)
    plt.xlabel('N neighbors')
    plt.ylabel('Error rate')
    plt.show()


def ex613():
    """Predict wine alcohol percentage as the mean of each sample's n
    nearest neighbors (self excluded) and plot mean squared error vs n.
    """
    wine = sciio.loadmat('./data/wine.mat')
    # Last column of X is the regression target (alcohol percentage).
    X = wine['X'][:, :-1]
    y = np.array(wine['X'][:, -1], dtype='f')
    ns, scores = range(1, 41, 1), []
    for n in ns:
        # The classifier is fit with dummy labels: only the neighbor
        # index structure is used, via kneighbors().
        clf = KNeighborsClassifier(n_neighbors=n)
        clf.fit(X, np.repeat(0, len(X)))
        scores_ = []
        for i, (r, p) in enumerate(zip(X, y)):
            # Query n+1 neighbors so the sample itself can be dropped.
            # NOTE(review): with duplicate rows the self index may not be
            # returned, leaving n+1 neighbors in the mean — confirm intent.
            ind = clf.kneighbors([r], n_neighbors=n + 1, return_distance=False)
            prediction = np.mean(y[[j for j in ind[0] if j != i]])
            scores_.append((prediction - p) ** 2)
        scores.append(np.mean(scores_))
    plt.plot(ns, scores, color='red')
    span = max(scores) - min(scores)
    plt.ylim([min(scores) - span * 0.1, max(scores) + span * 0.1])
    plt.title('Predicting alcohol percentage with KNN', fontsize=18)
    plt.xlabel('N neighbors')
    plt.ylabel('Average error')
    plt.show()


if __name__ == '__main__':
    # ex611()
    # ex612()
    ex613()