diff options
Diffstat (limited to 'Assignment 4/packages/clusterVal.py')
-rw-r--r-- | Assignment 4/packages/clusterVal.py | 47 |
1 files changed, 47 insertions, 0 deletions
def clusterVal(y, clusterid):
    """Estimate cluster validity against known class labels.

    Computes an entropy measure, a purity measure, the Rand statistic and the
    Jaccard coefficient for a clustering ``clusterid`` of objects whose true
    classes are ``y``.

    Usage:
        entropy, purity, rand, jaccard = clusterVal(y, clusterid)

    Input:
        y          N-by-1 vector of class labels
        clusterid  N-by-1 vector of cluster indices

        Labels and cluster indices may be arbitrary values (1-based ids,
        non-contiguous ids, strings, ...); they are mapped internally to
        dense indices, so they do not have to be 0..C-1 / 0..K-1.

    Output:
        entropy  Entropy measure, computed as
                 sum_i m_i * (1 + H_i) / (N * K), where m_i is the size of
                 cluster i and H_i = -sum_j p_ij * log2(p_ij) is the class
                 entropy inside cluster i.
        purity   Purity measure, computed as the unweighted mean over
                 clusters of max_j p_ij.
        rand     Rand index, (f00 + f11) / (number of object pairs).
                 NaN when there are fewer than two objects.
        jaccard  Jaccard coefficient, f11 / (f01 + f10 + f11).
                 NaN when no pair shares a class or a cluster.

    Raises:
        ValueError: if the inputs are empty or differ in length.

    NOTE(review): the entropy and purity formulas are kept exactly as in the
    previous implementation for backward compatibility; they are not the
    usual size-weighted textbook definitions -- confirm this is intended.
    """
    import numpy as np

    y = np.asarray(y).ravel()
    clusterid = np.asarray(clusterid).ravel()
    N = y.shape[0]
    if clusterid.shape[0] != N:
        raise ValueError(
            "y and clusterid must have the same number of elements "
            "(got %d and %d)" % (N, clusterid.shape[0])
        )
    if N == 0:
        raise ValueError("y and clusterid must not be empty")

    EPS = 2.22e-16  # keeps log2 finite for zero probabilities

    # Map arbitrary label values onto dense indices 0..C-1 and 0..K-1.
    class_idx = np.unique(y, return_inverse=True)[1].ravel()
    cluster_idx = np.unique(clusterid, return_inverse=True)[1].ravel()
    C = int(class_idx.max()) + 1
    K = int(cluster_idx.max()) + 1

    # counts[i, j]: number of objects of the j'th class in the i'th cluster.
    counts = np.bincount(cluster_idx * C + class_idx, minlength=K * C)
    counts = counts.reshape(K, C)
    m_i = counts.sum(axis=1)  # total number of objects in the i'th cluster

    # p_ij: probability that a member of the i'th cluster belongs to the
    # j'th class. Every cluster has at least one member, so m_i > 0.
    p_ij = counts / m_i[:, np.newaxis].astype(float)

    cluster_entropy = -(p_ij * np.log2(p_ij + EPS)).sum(axis=1)
    entropy = ((1 + cluster_entropy) * m_i).sum() / (N * K)
    purity = p_ij.max(axis=1).sum() / K

    def _pair_count(sizes):
        # Number of unordered pairs that can be drawn inside each group.
        return int((sizes * (sizes - 1) // 2).sum())

    # Pair counts derived from the contingency table instead of an O(N^2)
    # loop over all object pairs; the integers are identical.
    total_pairs = N * (N - 1) // 2
    f11 = _pair_count(counts)                     # same class, same cluster
    f10 = _pair_count(counts.sum(axis=0)) - f11   # same class, different cluster
    f01 = _pair_count(m_i) - f11                  # different class, same cluster
    f00 = total_pairs - f11 - f10 - f01           # different class, different cluster

    nan = float("nan")
    rand = float(f00 + f11) / total_pairs if total_pairs else nan
    jaccard_den = f01 + f10 + f11
    jaccard = float(f11) / jaccard_den if jaccard_den else nan

    return entropy, purity, rand, jaccard
|