Start assignment 4

author: Camil Staps 2015-11-27 00:18:32 +0100
committer: Camil Staps 2015-11-27 00:18:32 +0100
commit: d88d00232cfdbfd508834911af6ad89a217b84e1 (patch)
tree: 20308e5e89f76ce8f987598e26f75db6ad4cbd4e /Assignment 4
parent: Assignment 3 report (diff)
9 files changed, 214 insertions, 0 deletions
diff --git a/Assignment 4/data/digits.mat b/Assignment 4/data/digits.mat
new file mode 100644
index 0000000..434cf47
--- /dev/null
+++ b/Assignment 4/data/digits.mat
diff --git a/Assignment 4/data/synth1.mat b/Assignment 4/data/synth1.mat
new file mode 100644
index 0000000..4eb623f
--- /dev/null
+++ b/Assignment 4/data/synth1.mat
diff --git a/Assignment 4/data/synth2.mat b/Assignment 4/data/synth2.mat
new file mode 100644
index 0000000..99838d2
--- /dev/null
+++ b/Assignment 4/data/synth2.mat
diff --git a/Assignment 4/data/synth3.mat b/Assignment 4/data/synth3.mat
new file mode 100644
index 0000000..adefbcf
--- /dev/null
+++ b/Assignment 4/data/synth3.mat
diff --git a/Assignment 4/data/synth4.mat b/Assignment 4/data/synth4.mat
new file mode 100644
index 0000000..8a445f9
--- /dev/null
+++ b/Assignment 4/data/synth4.mat
diff --git a/Assignment 4/data/wildfaces.mat b/Assignment 4/data/wildfaces.mat
new file mode 100644
index 0000000..1f5894a
--- /dev/null
+++ b/Assignment 4/data/wildfaces.mat
diff --git a/Assignment 4/ex41.py b/Assignment 4/ex41.py
new file mode 100644
index 0000000..5ae66db
--- /dev/null
+++ b/Assignment 4/ex41.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Oct 23 14:45:21 2015
+
+@author: Camil Staps, s4498062
+
+This is Python 2 code.
+"""
+
+import sys
+sys.path.insert(0, './packages')
+
+import numpy as np
+from scipy import io as sciio
+from sklearn import cluster
+from clusterPlot import clusterPlot
+from clusterVal import clusterVal
+import matplotlib.pyplot as plt
+
+# 4.1.1 -- k-means with 4 clusters on one synthetic data set
+n = 1  # index of the synth<n>.mat file to load
+synth_path = './data/synth' + str(n) + '.mat'
+synth = sciio.loadmat(synth_path)
+X, y = synth['X'], synth['y']
+centroid, label, inertia = cluster.k_means(X, n_clusters=4)
+# Show the clustering together with the labels stored in the data file.
+clusterPlot(X, label, centroids=centroid, y=y)
+
+# 4.1.2 -- cluster validity measures for 1 up to 10 clusters
+scores = []
+for i in range(1, 11):
+    _, label, _ = cluster.k_means(X, i)
+    scores.append(clusterVal(y, label))
+# Transpose: one list per measure, in the order clusterVal returns them.
+entropies, purities, rands, jaccards = [list(column) for column in zip(*scores)]
+
+print(entropies, purities, rands, jaccards)
+
+x = np.arange(1,11)
+plt.figure(figsize=(8,8))
+# (values, legend label, keyword arguments for plt.legend) per subplot.
+panels = [
+    (entropies, 'Entropy', {}),
+    (purities, 'Purity', {'loc': 4}),
+    (rands, 'Rand', {'loc': 4}),
+    (jaccards, 'Jaccard', {'loc': 4}),
+]
+for position, (values, name, legend_kwargs) in enumerate(panels, 1):
+    plt.subplot(2, 2, position)
+    plt.plot(x, values, label=name)
+    plt.legend(**legend_kwargs)
+plt.show()
+
+# 4.1.3 -- k-means with 10 clusters on the wildfaces data
+faces = sciio.loadmat('./data/wildfaces.mat')
+X = faces['X']
+k = 0
+centroid, label, inertia = cluster.k_means(X, 10)
+
+n = 10
+
+
+def as_image(pixels):
+    """Reshape a flat pixel vector to (3,40,40) and transpose it for imshow."""
+    return np.reshape(pixels, (3,40,40)).T
+
+
+# Top row: the first n faces; bottom row: the centroid of each face's cluster.
+plt.figure(figsize=(n*2,4))
+for k in range(n):
+    for row, pixels in enumerate((X[k,:], centroid[label[k],:])):
+        plt.subplot(2, n, k + 1 + row * n)
+        plt.imshow(as_image(pixels))
+        plt.axis('off')
+plt.show()
+    
+# 4.1.4 -- k-means on the digits data
+digits = sciio.loadmat('./data/digits.mat')
+X = digits['X']
+k = 20          # number of clusters
+n_shown = 24    # number of digits displayed, on a 4-by-6 grid
+
+# The first n_shown digits as stored in the data set.
+plt.figure(figsize=(6,4))
+for i in range(n_shown):
+    plt.subplot(4, 6, i + 1)
+    plt.imshow(np.reshape(X[i], (16,16)), cmap=plt.cm.binary)
+    plt.axis('off')
+plt.show()
+
+# The plotting loops iterate over `i`, not `k`: reusing `k` as the loop
+# variable left it at 23, so the clustering silently ran with 23 clusters
+# instead of the 20 configured above.
+centroid, label, inertia = cluster.k_means(X, k)
+
+# The same digits, each replaced by the centroid of the cluster it belongs to.
+plt.figure(figsize=(6,4))
+for i in range(n_shown):
+    plt.subplot(4, 6, i + 1)
+    plt.imshow(np.reshape(centroid[label[i]], (16,16)), cmap=plt.cm.binary)
+    plt.axis('off')
+plt.show()
diff --git a/Assignment 4/packages/clusterPlot.py b/Assignment 4/packages/clusterPlot.py
new file mode 100644
index 0000000..2f37a3d
--- /dev/null
+++ b/Assignment 4/packages/clusterPlot.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr 14 09:01:18 2014
+
+"""
+
+def _is_unset(arg):
+    '''Return True when an optional argument still holds its 'None' placeholder (or is None).'''
+    return arg is None or (isinstance(arg, str) and arg == 'None')
+
+
+def _gauss_2d(centroid, ccov, std=2, points=100):
+    '''
+    Return the x and y coordinates of a closed contour of a 2-D Gaussian with
+    mean `centroid` and 2-by-2 covariance `ccov`, drawn `std` standard
+    deviations away from the mean along each principal axis.
+    '''
+    import numpy as np
+
+    angles = np.linspace(0, 2*np.pi, points)
+    unit_circle = np.vstack((np.cos(angles), np.sin(angles)))
+    # eigh because a covariance matrix is symmetric; the clipping guards
+    # against tiny negative eigenvalues caused by round-off.
+    eigvals, eigvecs = np.linalg.eigh(np.asarray(ccov, dtype=float))
+    radii = std * np.sqrt(np.maximum(eigvals, 0))
+    contour = np.dot(eigvecs, radii[:, np.newaxis] * unit_circle)
+    contour = contour + np.asarray(centroid, dtype=float).reshape(2, 1)
+    return contour[0], contour[1]
+
+
+def clusterPlot(X, clusterid, centroids='None', y='None', covars='None', figsize=(16,10)):
+    '''
+    CLUSTERPLOT Plots a clustering of a data set as well as the true class
+    labels. Only the first two columns of X are drawn, so data with more than
+    two attributes should first be projected onto its first two principal
+    components. Data objects are plotted as a dot with a circle around. The
+    color of the dot indicates the true class, and the circle indicates the
+    cluster index. Optionally, the centroids are plotted as filled-star
+    markers, and ellipses corresponding to covariance matrices (e.g. for
+    gaussian mixture models).
+
+    Usage:
+    clusterplot(X, clusterid)
+    clusterplot(X, clusterid, centroids=c_matrix, y=y_matrix)
+    clusterplot(X, clusterid, centroids=c_matrix, y=y_matrix, covars=c_tensor)
+
+    Input:
+    X           N-by-M data matrix (N data objects with M attributes)
+    clusterid   N-by-1 vector of cluster indices
+    centroids   K-by-M matrix of cluster centroids (optional)
+    y           N-by-1 vector of true class labels (optional; all objects
+                are put in a single class when omitted)
+    covars      K-by-M-by-M tensor of covariance matrices, indexed as
+                covars[k,:,:] (optional, requires centroids)
+    figsize     size of the figure, passed on to matplotlib (optional)
+
+    Optional arguments count as omitted when they are the string 'None' (the
+    default) or None.
+
+    Raises:
+    ValueError  if covars is given without centroids
+    '''
+    import numpy as np
+    # `hold` is deliberately not imported: it was removed in matplotlib 3.0
+    # and successive plot() calls already draw onto the same axes.
+    from matplotlib.pyplot import figure, cm, plot, legend, xlim, show
+
+    X = np.asarray(X)
+    cls = np.asarray(clusterid)
+    # Decide which optional arguments were given before converting them:
+    # comparing a numpy array with the string 'None' is elementwise and has
+    # no unambiguous truth value.
+    has_centroids = not _is_unset(centroids)
+    has_covars = not _is_unset(covars)
+    if _is_unset(y):
+        y = np.zeros((X.shape[0],1))
+    else:
+        y = np.asarray(y)
+    if has_centroids:
+        centroids = np.asarray(centroids)
+    elif has_covars:
+        raise ValueError('covars can only be plotted when centroids are given')
+    classes = np.unique(y)
+    clusters = np.unique(cls)
+    C = classes.size
+    K = clusters.size
+    n_centroids = centroids.shape[0] if has_centroids else 0
+    # Centroids are colored by row index, so there must be a color for each.
+    ncolors = max(C, K, n_centroids)
+
+    # plot data points color-coded by class, cluster markers and centroids
+    figure(figsize=figsize)
+    color_steps = max(ncolors - 1, 1)   # avoid 0/0 when there is one color
+    colors = [cm.jet(i*1.0/color_steps)[:3] for i in range(ncolors)]
+    for i, cs in enumerate(classes):
+        members = (y==cs).ravel()
+        plot(X[members,0], X[members,1], 'o', markeredgecolor='k', markerfacecolor=colors[i], markersize=6, zorder=2)
+    for i, cr in enumerate(clusters):
+        members = (cls==cr).ravel()
+        plot(X[members,0], X[members,1], 'o', markersize=12, markeredgecolor=colors[i], markerfacecolor='None', markeredgewidth=3, zorder=1)
+    if has_centroids:
+        for cd in range(n_centroids):
+            plot(centroids[cd,0], centroids[cd,1], '*', markersize=22, markeredgecolor='k', markerfacecolor=colors[cd], markeredgewidth=2, zorder=3)
+    # plot cluster shapes:
+    if has_covars:
+        covars = np.asarray(covars)
+        for cd in range(n_centroids):
+            x1, x2 = _gauss_2d(centroids[cd], covars[cd,:,:])
+            plot(x1, x2, '-', color=colors[cd], linewidth=3, zorder=5)
+
+    # create legend; labels are matched to the plotted lines by position
+    legend_items = ['Class: {0}'.format(c) for c in classes.tolist()]
+    legend_items += ['Cluster: {0}'.format(c) for c in clusters.tolist()]
+    if has_centroids:
+        legend_items += ['Centroid: {0}'.format(c) for c in clusters.tolist()]
+    legend(legend_items, numpoints=1, markerscale=.75, prop={'size': 9})
+
+    # Pad the x-axis relative to the data range, with extra room on the right.
+    # Scaling min and max themselves (min*1.1, max*1.2) cuts off data whose
+    # minimum is positive or whose maximum is negative.
+    x_min, x_max = X[:,0].min(), X[:,0].max()
+    x_span = x_max - x_min
+    xlim(x_min - 0.05*x_span, x_max + 0.1*x_span)
+
+    show()
diff --git a/Assignment 4/packages/clusterVal.py b/Assignment 4/packages/clusterVal.py
new file mode 100644
index 0000000..df97334
--- /dev/null
+++ b/Assignment 4/packages/clusterVal.py
@@ -0,0 +1,47 @@
+def clusterVal(y, clusterid):
+    '''
+    CLUSTERVAL Estimate cluster validity using Entropy, Purity, Rand Statistic,
+    and Jaccard coefficient.
+
+    Usage:
+      Entropy, Purity, Rand, Jaccard = clusterval(y, clusterid);
+
+    Input:
+       y         N-by-1 vector of class labels
+       clusterid N-by-1 vector of cluster indices
+
+    Labels and cluster indices may be arbitrary values; they do not have to
+    be the consecutive integers 0, 1, 2, ...
+
+    Output:
+      Entropy    Entropy measure.
+      Purity     Purity measure (mean over clusters of the largest class
+                 fraction within the cluster).
+      Rand       Rand index.
+      Jaccard    Jaccard coefficient.
+
+    Raises:
+      ValueError         if y and clusterid differ in length.
+      ZeroDivisionError  if there are no object pairs to compare (N < 2), or
+                         no pair shares a class or a cluster (Jaccard).
+    '''
+
+    import numpy as np
+
+    y = np.asarray(y).ravel()
+    clusterid = np.asarray(clusterid).ravel()
+    N = y.shape[0]
+    if clusterid.shape[0] != N:
+        raise ValueError('y and clusterid must have the same number of elements')
+    EPS = 2.22e-16
+
+    # Map the label values onto 0..C-1 and 0..K-1. Looping over range(K) and
+    # range(C) directly only works when the labels already are those
+    # integers; anything else gives empty clusters and 0/0 = NaN.
+    classes, class_idx = np.unique(y, return_inverse=True)
+    clusters, cluster_idx = np.unique(clusterid, return_inverse=True)
+    C = classes.size
+    K = clusters.size
+
+    # m_ij[i,j]: number of objects of j'th class in i'th cluster
+    m_ij = np.zeros((K, C), dtype=np.int64)
+    np.add.at(m_ij, (cluster_idx, class_idx), 1)
+    m_i = m_ij.sum(axis=1)          # total number of objects in i'th cluster
+    # probability that member of i'th cluster belongs to j'th class
+    p_ij = m_ij.astype(float) / m_i[:, np.newaxis]
+
+    # NOTE: the entropy expression is kept exactly as in the original code.
+    entropy = ( (1-(p_ij*np.log2(p_ij+EPS)).sum(axis=1))*m_i ).sum() / float(N*K)
+    purity = ( p_ij.max(axis=1) ).sum() / K
+
+    def count_pairs(counts):
+        # number of unordered pairs inside each group, summed over all groups
+        return int((counts * (counts - 1) // 2).sum())
+
+    # Pair counts derived from the contingency table instead of visiting all
+    # N*(N-1)/2 pairs in a Python loop; the counts are identical.
+    f11 = count_pairs(m_ij)                         # same class, same cluster
+    f10 = count_pairs(m_ij.sum(axis=0)) - f11       # same class, different cluster
+    f01 = count_pairs(m_i) - f11                    # different class, same cluster
+    f00 = N*(N-1)//2 - f11 - f10 - f01              # different class, different cluster
+
+    # Builtin float: the np.float alias no longer exists in current NumPy.
+    rand = float(f00+f11)/(f00+f01+f10+f11)
+    jaccard = float(f11)/(f01+f10+f11)
+
+    return entropy, purity, rand, jaccard
author	Camil Staps	2015-11-27 00:18:32 +0100
committer	Camil Staps	2015-11-27 00:18:32 +0100
commit	d88d00232cfdbfd508834911af6ad89a217b84e1 (patch)
tree	20308e5e89f76ce8f987598e26f75db6ad4cbd4e /Assignment 4
parent	Assignment 3 report (diff)