diff options
author | Camil Staps | 2015-11-27 00:18:32 +0100 |
---|---|---|
committer | Camil Staps | 2015-11-27 00:18:32 +0100 |
commit | d88d00232cfdbfd508834911af6ad89a217b84e1 (patch) | |
tree | 20308e5e89f76ce8f987598e26f75db6ad4cbd4e /Assignment 4 | |
parent | Assignment 3 report (diff) |
Start assignment 4
Diffstat (limited to 'Assignment 4')
-rw-r--r-- | Assignment 4/data/digits.mat | bin | 0 -> 3863457 bytes | |||
-rw-r--r-- | Assignment 4/data/synth1.mat | bin | 0 -> 7121 bytes | |||
-rw-r--r-- | Assignment 4/data/synth2.mat | bin | 0 -> 7091 bytes | |||
-rw-r--r-- | Assignment 4/data/synth3.mat | bin | 0 -> 7217 bytes | |||
-rw-r--r-- | Assignment 4/data/synth4.mat | bin | 0 -> 50386 bytes | |||
-rw-r--r-- | Assignment 4/data/wildfaces.mat | bin | 0 -> 36145247 bytes | |||
-rw-r--r-- | Assignment 4/ex41.py | 92 | ||||
-rw-r--r-- | Assignment 4/packages/clusterPlot.py | 75 | ||||
-rw-r--r-- | Assignment 4/packages/clusterVal.py | 47 |
9 files changed, 214 insertions, 0 deletions
# Binary data files added by this commit (new file mode 100644, contents not
# shown in the diff):
#   Assignment 4/data/digits.mat     index 0000000..434cf47
#   Assignment 4/data/synth1.mat     index 0000000..4eb623f
#   Assignment 4/data/synth2.mat     index 0000000..99838d2
#   Assignment 4/data/synth3.mat     index 0000000..adefbcf
#   Assignment 4/data/synth4.mat     index 0000000..8a445f9
#   Assignment 4/data/wildfaces.mat  index 0000000..1f5894a
#
# --- Assignment 4/ex41.py (new file mode 100644, index 0000000..5ae66db) ---
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 23 14:45:21 2015

@author: Camil Staps, s4498062

This is Python 2 code.

Exercise 4.1: k-means clustering of a synthetic data set, cluster validity
as a function of the number of clusters, and k-means "compression" of the
wildfaces and digits image sets (each image is replaced by the centroid of
the cluster it was assigned to).
"""

import sys
sys.path.insert(0, './packages')

import numpy as np
from scipy import io as sciio
from sklearn import cluster
from clusterPlot import clusterPlot
from clusterVal import clusterVal
import matplotlib.pyplot as plt

# 4.1.1 -- cluster one synthetic data set into 4 clusters and plot the result
n = 1  # index of the synthetic data set to load (./data/synth<n>.mat)
synth = sciio.loadmat('./data/synth' + str(n) + '.mat')
X = synth['X']
y = synth['y']
centroid, label, inertia = cluster.k_means(X, 4)
clusterPlot(X, label, centroid, y)

# 4.1.2 -- cluster validity measures for 1..10 clusters
entropies, purities, rands, jaccards = [], [], [], []
for i in range(1, 11):
    _, label, _ = cluster.k_means(X, i)
    entropy, purity, rand, jaccard = clusterVal(y, label)
    entropies.append(entropy)
    purities.append(purity)
    rands.append(rand)
    jaccards.append(jaccard)

print(entropies, purities, rands, jaccards)

# One subplot per validity measure, number of clusters on the x-axis
x = np.arange(1,11)
plt.figure(figsize=(8,8))
plt.subplot(2,2,1)
plt.plot(x, entropies, label='Entropy')
plt.legend()
plt.subplot(2,2,2)
plt.plot(x, purities, label='Purity')
plt.legend(loc=4)
plt.subplot(2,2,3)
plt.plot(x, rands, label='Rand')
plt.legend(loc=4)
plt.subplot(2,2,4)
plt.plot(x, jaccards, label='Jaccard')
plt.legend(loc=4)
plt.show()

# 4.1.3 -- faces: original image (top row) vs. centroid of its cluster (bottom row)
faces = sciio.loadmat('./data/wildfaces.mat')
X = faces['X']
centroid, label, inertia = cluster.k_means(X, 10)

n = 10  # number of faces to display
plt.figure(figsize=(n*2,4))
for i in range(0,n):
    plt.subplot(2, n, i + 1)
    # each row is reshaped to (3,40,40) and transposed for imshow
    plt.imshow(np.reshape(X[i,:], (3,40,40)).T)
    plt.axis('off')
    plt.subplot(2, n, i + 1 + n)
    plt.imshow(np.reshape(centroid[label[i],:], (3,40,40)).T)
    plt.axis('off')
plt.show()

# 4.1.4 -- digits: first 24 images, then the centroids they were assigned to
digits = sciio.loadmat('./data/digits.mat')
X = digits['X']
k = 20  # number of clusters

# The display loops use their own index `i`. They previously reused `k`, which
# silently changed the number of clusters passed to k_means from 20 to 23.
plt.figure(figsize=(6,4))
for i in range(0,24):
    plt.subplot(4, 6, i + 1)
    plt.imshow(np.reshape(X[i], (16,16)), cmap=plt.cm.binary)
    plt.axis('off')
plt.show()

centroid, label, inertia = cluster.k_means(X, k)

plt.figure(figsize=(6,4))
for i in range(0,24):
    plt.subplot(4, 6, i + 1)
    plt.imshow(np.reshape(centroid[label[i]], (16,16)), cmap=plt.cm.binary)
    plt.axis('off')
plt.show()

# --- Assignment 4/packages/clusterPlot.py (new file mode 100644, index 0000000..2f37a3d) ---
# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr 14 09:01:18 2014
+
+"""
+
def _is_unset(value):
    '''Return True if an optional argument is None or the legacy 'None' string sentinel.'''
    # The isinstance check avoids comparing a numpy array with a string, which
    # is elementwise (and has no single truth value) on current NumPy.
    return value is None or (isinstance(value, str) and value == 'None')


def _gauss_2d(centroid, ccov, std=2, points=100):
    '''
    Return the x and y coordinates of the `std`-standard-deviation contour
    (an ellipse with `points` vertices) of a 2-D Gaussian.

    Only the first two dimensions of `centroid` and `ccov` are used, matching
    the two attributes that clusterPlot draws.
    '''
    import numpy as np

    mean = np.asarray(centroid, dtype=float).ravel()[:2]
    cov = np.asarray(ccov, dtype=float)[:2, :2]
    angles = np.linspace(0, 2*np.pi, points)
    unit_circle = np.vstack((np.cos(angles), np.sin(angles)))
    # eigh: a covariance matrix is symmetric; clip tiny negative eigenvalues
    # caused by round-off before taking the square root.
    eigvals, eigvecs = np.linalg.eigh(cov)
    radii = std * np.sqrt(np.maximum(eigvals, 0))
    ellipse = np.dot(eigvecs * radii, unit_circle) + mean[:, np.newaxis]
    return ellipse[0], ellipse[1]


def clusterPlot(X, clusterid, centroids='None', y='None', covars='None', figsize=(16,10)):
    '''
    CLUSTERPLOT Plots a clustering of a data set as well as the true class
    labels. If data is more than 2-dimensional it should be first projected
    onto the first two principal components. Data objects are plotted as a dot
    with a circle around. The color of the dot indicates the true class,
    and the circle indicates the cluster index. Optionally, the centroids are
    plotted as filled-star markers, and ellipsoids corresponding to covariance
    matrices (e.g. for gaussian mixture models).

    Usage:
    clusterplot(X, clusterid)
    clusterplot(X, clusterid, centroids=c_matrix, y=y_matrix)
    clusterplot(X, clusterid, centroids=c_matrix, y=y_matrix, covars=c_tensor)

    Input:
    X           N-by-M data matrix (N data objects with M attributes);
                only the first two attributes are drawn
    clusterid   N-by-1 vector of cluster indices
    centroids   K-by-M matrix of cluster centroids (optional)
    y           N-by-1 vector of true class labels (optional)
    covars      K-by-M-by-M tensor of covariance matrices, indexed by
                centroid first (optional; requires centroids)
    figsize     figure size passed to matplotlib's figure()

    Optional arguments are considered "not given" when they are None or the
    string 'None' (the historical default).

    Raises:
    ValueError  if covars is given without centroids.

    The function opens a new figure, draws into it and calls show(); it
    returns nothing.
    '''
    import numpy as np
    # `hold` is no longer imported: it was removed from matplotlib, and axes
    # always accumulate successive plot() calls.
    from matplotlib.pyplot import figure, cm, plot, legend, xlim, show

    X = np.asarray(X)
    cls = np.asarray(clusterid)
    has_centroids = not _is_unset(centroids)
    has_covars = not _is_unset(covars)

    if _is_unset(y):
        # no true labels: treat all objects as one class
        y = np.zeros((X.shape[0],1))
    else:
        y = np.asarray(y)
    if has_centroids:
        centroids = np.asarray(centroids)
    elif has_covars:
        raise ValueError('covars requires centroids (the centres of the ellipses)')
    if has_covars:
        covars = np.asarray(covars)

    class_ids = np.unique(y)
    cluster_ids = np.unique(cls)
    C = class_ids.size
    K = cluster_ids.size
    n_centroids = centroids.shape[0] if has_centroids else 0
    # Enough colors for classes, clusters and centroids (centroids are colored
    # by row index, which may exceed the number of non-empty clusters).
    ncolors = max(C, K, n_centroids)

    # plot data points color-coded by class, cluster markers and centroids
    figure(figsize=figsize)
    # max(..., 1) avoids a division by zero when only one color is needed
    color_steps = max(ncolors - 1, 1)
    colors = [cm.jet(float(idx) / color_steps)[:3] for idx in range(ncolors)]

    for i, cs in enumerate(class_ids):
        members = (y == cs).ravel()
        plot(X[members,0], X[members,1], 'o', markeredgecolor='k',
             markerfacecolor=colors[i], markersize=6, zorder=2)
    for i, cr in enumerate(cluster_ids):
        members = (cls == cr).ravel()
        plot(X[members,0], X[members,1], 'o', markersize=12,
             markeredgecolor=colors[i], markerfacecolor='None',
             markeredgewidth=3, zorder=1)
    if has_centroids:
        for cd in range(n_centroids):
            plot(centroids[cd,0], centroids[cd,1], '*', markersize=22,
                 markeredgecolor='k', markerfacecolor=colors[cd],
                 markeredgewidth=2, zorder=3)
    # plot cluster shapes:
    if has_covars:
        for cd in range(n_centroids):
            x1, x2 = _gauss_2d(centroids[cd], covars[cd,:,:])
            plot(x1, x2, '-', color=colors[cd], linewidth=3, zorder=5)

    # create legend: one label per plotted class, cluster and centroid, in
    # the order the corresponding lines were drawn
    legend_items = ['Class: {0}'.format(c) for c in class_ids.tolist()]
    legend_items += ['Cluster: {0}'.format(c) for c in cluster_ids.tolist()]
    if has_centroids:
        if n_centroids == K:
            centroid_names = cluster_ids.tolist()
        else:
            centroid_names = list(range(n_centroids))
        legend_items += ['Centroid: {0}'.format(c) for c in centroid_names]
    legend(legend_items, numpoints=1, markerscale=.75, prop={'size': 9})

    # NOTE(review): presumably widens the x-range to leave room for the
    # legend; scaling only widens the left side when the minimum is negative.
    xlim(X[:,0].min()*1.1, X[:,0].max()*1.2)

    show()
# --- Assignment 4/packages/clusterVal.py (new file mode 100644, index 0000000..df97334) ---
def clusterVal(y, clusterid):
    '''
    CLUSTERVAL Estimate cluster validity using Entropy, Purity, Rand Statistic,
    and Jaccard coefficient.

    Usage:
    Entropy, Purity, Rand, Jaccard = clusterval(y, clusterid);

    Input:
    y           N-by-1 vector of class labels
    clusterid   N-by-1 vector of cluster indices

    Labels and cluster indices may be arbitrary values (they need not be
    0-based or contiguous); both inputs are flattened.

    Output:
    Entropy     Entropy measure.
    Purity      Purity measure.
    Rand        Rand index.
    Jaccard     Jaccard coefficient.

    Rand is NaN when there are fewer than two objects, and Jaccard is NaN
    when no pair of objects shares a class or a cluster (the ratios are
    undefined in those cases).

    Raises:
    ValueError  if the inputs are empty or differ in length.
    '''

    import numpy as np

    y = np.asarray(y).ravel()
    clusterid = np.asarray(clusterid).ravel()
    N = y.shape[0]
    if clusterid.shape[0] != N:
        raise ValueError('y and clusterid must have the same number of elements')
    if N == 0:
        raise ValueError('y and clusterid must not be empty')

    # Map the actual label values onto 0..C-1 and 0..K-1 so that labels such
    # as 1..C (MATLAB style) or non-contiguous cluster ids are counted correctly.
    class_idx = np.unique(y, return_inverse=True)[1].ravel()
    cluster_idx = np.unique(clusterid, return_inverse=True)[1].ravel()
    C = int(class_idx.max()) + 1
    K = int(cluster_idx.max()) + 1
    EPS = 2.22e-16

    # m_ij[i,j]: number of objects of j'th class in i'th cluster
    m_ij = np.bincount(cluster_idx * C + class_idx, minlength=K * C).reshape(K, C)
    m_i = m_ij.sum(axis=1)   # total number of objects in i'th cluster
    m_j = m_ij.sum(axis=0)   # total number of objects in j'th class
    # probability that member of i'th cluster belongs to j'th class
    p_ij = m_ij.astype(float) / m_i[:, np.newaxis]

    # NOTE(review): entropy and purity are computed exactly as before: entropy
    # is sum_i m_i*(1 - sum_j p_ij*log2(p_ij)) / (N*K), and purity is the
    # unweighted mean over clusters of max_j p_ij. Confirm these match the
    # definitions intended by the course material.
    entropy = ( (1-(p_ij*np.log2(p_ij+EPS)).sum(axis=1))*m_i ).sum() / float(N*K)
    purity = ( p_ij.max(axis=1) ).sum() / K

    # Pair counts from the contingency table (same values as comparing all
    # N*(N-1)/2 pairs, without the quadratic loop).
    f11 = int((m_ij * (m_ij - 1) // 2).sum())             # same class, same cluster
    same_class = int((m_j * (m_j - 1) // 2).sum())
    same_cluster = int((m_i * (m_i - 1) // 2).sum())
    total = N * (N - 1) // 2
    f10 = same_class - f11                                # same class, different cluster
    f01 = same_cluster - f11                              # different class, same cluster
    f00 = total - f11 - f10 - f01                         # different class, different cluster

    nan = float('nan')
    rand = float(f00 + f11) / total if total else nan
    jaccard = float(f11) / (f01 + f10 + f11) if (f01 + f10 + f11) else nan

    return entropy, purity, rand, jaccard
|