GitHub Repository: ethen8181/machine-learning
Path: blob/master/clustering_old/clustering/clustering_functions.R
# Functions for evaluating clustering

# test example data for all the functions ; remove row.names, otherwise
# bootstrapping will produce warnings (complaining about duplicated row.names)

# mtcars_scaled <- scale(mtcars)
# row.names(mtcars_scaled) <- NULL

# ------------------------------------------------------------------------------------
#### Choosing the right k for clustering
library(dplyr)
library(tidyr)
library(ggplot2)

## method 1. WSS : compute the total within-cluster sum of squared error,
# this measures how close the points in a cluster are to each other

# [Distance] : calculates the sum of squared distances between every point in a
# cluster and the cluster's center ; this sum of squared distances is the measure
# of variance used here
Distance <- function(cluster)
{
    # the center of the cluster, mean of all the points
    center <- colMeans(cluster)

    # calculate the summed squared error between every point and
    # the center of that cluster
    distance <- apply( cluster, 1, function(row)
    {
        sum( ( row - center )^2 )
    }) %>% sum()

    return(distance)
}
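
# a quick sanity-check sketch, assuming mtcars_scaled has been created as in the
# commented example above : for the whole dataset, Distance() should match the
# total sum of squared deviations from the column means
# all.equal( Distance(mtcars_scaled),
#            sum( scale( mtcars_scaled, center = TRUE, scale = FALSE )^2 ) )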


# calculate the within-cluster sum of squared error manually for hierarchical clustering
# [WSS] : pass in the dataset and the resulting groups (clusters)
WSS <- function( data, groups )
{
    k <- max(groups)

    # loop through each group (cluster) and obtain its
    # within-cluster sum of squared error
    total <- lapply( 1:k, function(k)
    {
        # extract the data points within the cluster
        cluster <- subset( data, groups == k )

        distance <- Distance(cluster)
        return(distance)
    }) %>% unlist()

    return( sum(total) )
}

# testing
# sum_squared_error <- WSS( data = mtcars_scaled, groups = groups )

# this value will decrease as the number of clusters increases,
# because each cluster will be smaller and tighter,
# and the rate of the decrease will slow down after the optimal cluster number


## method 2 : Calinski-Harabasz index, the ratio of the between-cluster variance
# to the total within-cluster variance
# http://www.mathworks.com/help/stats/clustering.evaluation.calinskiharabaszevaluation-class.html

# TSS (total sum of squares) : the sum of squared distances of all the data points
# from the dataset's centroid

# BSS (between sum of squares) = TSS - WSS, measures how far apart the clusters
# are from each other
# !! a good clustering has a small WSS and a high BSS

# CHIndex = B / W, the ratio should be maximized at the optimal k
# B = BSS(k) / (k-1) ; k = # of clusters
# W = WSS(k) / (n-k) ; n = # of data points
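
# a small worked sketch of the formula above, using base kmeans' betweenss and
# tot.withinss as BSS and WSS ; assumes mtcars_scaled from the commented example above
# k  <- 4
# n  <- nrow(mtcars_scaled)
# km <- kmeans( mtcars_scaled, centers = k, nstart = 10 )
# ch <- ( km$betweenss / ( k - 1 ) ) / ( km$tot.withinss / ( n - k ) )
# ch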

# [CHCriterion] : calculates both the Calinski-Harabasz index and the
#                 within-cluster sum of squared error
# @kmax          = maximum cluster number, calculates the CH index from 2 clusters to kmax
# @clustermethod = "kmeanspp", "hclust"

CHCriterion <- function( data, kmax, clustermethod, ... )
{
    if( !clustermethod %in% c( "kmeanspp", "hclust" ) )
        stop( "clustermethod must be one of 'kmeanspp' or 'hclust'" )

    # total sum of squared error (independent of the number of clusters k)
    tss <- Distance( cluster = data )

    # initialize a numeric vector storing the score
    wss <- numeric(kmax)

    # k starts from 2, a single cluster is meaningless
    if( clustermethod == "kmeanspp" )
    {
        for( k in 2:kmax )
        {
            results <- Kmeanspp( data, k, ... )
            wss[k]  <- results$tot.withinss
        }
    }else # "hclust"
    {
        d <- dist( data, method = "euclidean" )
        clustering <- hclust( d, ... )
        for( k in 2:kmax )
        {
            groups <- cutree( clustering, k )
            wss[k] <- WSS( data = data, groups = groups )
        }
    }

    # between sum of squares
    bss <- tss - wss[-1]

    # cluster count starts from 2 !
    numerator   <- bss / ( 1:(kmax-1) )
    denominator <- wss[-1] / ( nrow(data) - 2:kmax )

    criteria <- data.frame( k = 2:kmax,
                            CHIndex = numerator / denominator,
                            wss = wss[-1] )

    # convert to long format for plotting
    criteria_long <- gather( criteria, "index", "value", -1 )

    plot <- ggplot( criteria_long, aes( k, value, color = index ) ) +
            geom_line() + geom_point( aes( shape = index ), size = 3 ) +
            facet_wrap( ~ index, scales = "free_y" ) +
            guides( color = FALSE, shape = FALSE )

    return( list( data = criteria,
                  plot = plot ) )
}


# testing
# criteria <- CHCriterion( data = mtcars_scaled, kmax = 10,
#                          clustermethod = "hclust", method = "ward.D" )

# criteria <- CHCriterion( data = mtcars_scaled, kmax = 10,
#                          clustermethod = "kmeanspp", nstart = 10, iter.max = 100 )


# for CHIndex, the local maximum should be your ideal cluster number
# for WSS, the center of the "elbow" should be your ideal cluster number

# for the mtcars_scaled example, CHIndex shows a local optimum at 4 clusters,
# which seems reasonable when looking at the dendrogram

# test <- hclust( dist(mtcars_scaled), method = "ward.D" )
# plot(test)
# rect.hclust( test, k = 4 )


# ----------------------------------------------------------------------------------------------
#### bootstrap evaluation of a clustering result
# clustering algorithms will often produce several clusters that represent
# actual clusters of the data, plus one or two clusters that represent "others",
# meaning they're made up of data points that have no relationship with each other,
# they just don't fit anywhere else

# use bootstrap resampling to evaluate the stability of the clusters, steps :
# 1. Cluster the original data.
# 2. Draw a new dataset of the same size as the original by resampling the original
#    dataset with replacement, therefore some data points may show up more than once
#    while others not at all. Cluster this new data.
# 3. For every cluster in the original clustering, find the most similar cluster in the
#    new clustering, which is the one with the max Jaccard similarity. If this
#    Jaccard similarity is less than .5, the original cluster is considered to
#    be "dissolved", that is, it did not show up in the new clustering. A cluster that
#    dissolves too often is most likely not a real cluster.
# Jaccard similarity of two vectors = intersect / union
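
# a tiny worked example of this Jaccard similarity on two logical membership vectors,
# mirroring the jaccardSimilarity helper defined inside ClusterBootstrap below :
# x <- c( TRUE, TRUE, FALSE, TRUE )
# y <- c( TRUE, FALSE, FALSE, TRUE )
# sum( x & y ) / sum( x | y )   # intersect / union = 2 / 3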



# ----------------------------------------------------------------------------------------------
# [ClusterMethod] : supports hierarchical clustering and kmeans++ for the bootstrap evaluation

# kmeans++ explanation and source code
# source in function Kmeanspp
source("/Users/ethen/machine-learning/clustering_old/clustering/kmeanspp.R")

# @data          = data frame type data, matrix also works
# @k             = specify the number of clusters
# @clustermethod = "hclust" for hierarchical clustering, "kmeanspp" for kmeans++
# @noise.cut     = if specified, any resulting cluster whose size is smaller than this
#                  number will be considered as noise, and all of these noise clusters
#                  will be grouped together as one cluster
# @...           = pass in other parameters for hclust or kmeans++ (same as kmeans)

ClusterMethod <- function( data, k, noise.cut = 0, clustermethod, ... )
{
    if( !clustermethod %in% c( "kmeanspp", "hclust" ) )
        stop( "clustermethod must be one of 'kmeanspp' or 'hclust'" )

    # hierarchical clustering
    if( clustermethod == "hclust" )
    {
        cluster   <- hclust( dist(data), ... )
        partition <- cutree( cluster, k )

    }else # kmeanspp
    {
        cluster   <- Kmeanspp( data = data, k = k, ... )
        partition <- cluster$cluster
    }

    # equivalent to k
    cluster_num <- max(partition)

    # calculate each cluster's size
    cluster_size <- numeric(cluster_num)
    for( i in 1:cluster_num )
        cluster_size[i] <- sum( partition == i )

    # if there are clusters whose size is smaller than the specified noise.cut, do :
    not_noise_num <- sum( cluster_size > noise.cut )

    if( cluster_num > not_noise_num )
    {
        # extract the clusters whose size is larger than noise.cut
        cluster_new <- (1:cluster_num)[ cluster_size > noise.cut ]

        # all the data points whose original cluster is smaller than the noise.cut
        # will be assigned to the same new cluster
        cluster_num <- not_noise_num + 1

        # new cluster numbering, assign the noise cluster's number first,
        # then adjust the original clusters' numbers
        new <- rep( cluster_num, nrow(data) )

        for( i in 1:not_noise_num )
            new[ ( partition == cluster_new[i] ) ] <- i

        partition <- new
    }

    # list of logical vectors, one per cluster,
    # indicating which data points belong to that cluster
    cluster_list <- lapply( 1:cluster_num, function(x)
    {
        return( partition == x )
    })

    cluster_result <- list( result      = cluster,
                            partition   = partition,
                            clusternum  = cluster_num,
                            clusterlist = cluster_list )
    return(cluster_result)
}

# testing
# cluster_result <- ClusterMethod( data = mtcars_scaled, k = 5, clustermethod = "hclust" )
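
# an additional sketch using noise.cut : with noise.cut = 1 (an illustrative value),
# any cluster containing a single point is treated as noise and merged into one cluster
# cluster_result <- ClusterMethod( data = mtcars_scaled, k = 5, noise.cut = 1,
#                                  clustermethod = "hclust", method = "ward.D" )
# cluster_result$clusternum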



# ----------------------------------------------------------------------------------------------
# [ClusterBootstrap] :
# @bootstrap = number of bootstrap iterations
# @dissolve  = if a cluster's Jaccard similarity is smaller than this number,
#              it is considered to be "dissolved"
# Returns : 1. result : the original clustering object.
#           2. bootmean : mean of each cluster's Jaccard similarity across the
#              bootstrap iterations.
#           3. partition : the original clustering result, a vector specifying which
#              group each data point belongs to.
#           4. clusternum : final cluster count,
#              if you specified noise.cut then it might be different from k.
#           5. bootdissolved : number of times each cluster's Jaccard similarity fell
#              below the dissolve value.

ClusterBootstrap <- function( data, k, noise.cut = 0, bootstrap = 100,
                              dissolve = .5, clustermethod, ... )
{
    # step 1
    cluster_result <- ClusterMethod( data = data, k = k, noise.cut = noise.cut,
                                     clustermethod = clustermethod, ... )

    cluster_num  <- cluster_result$clusternum
    boot_jaccard <- matrix( 0, nrow = bootstrap, ncol = cluster_num )

    # pass in two logical vectors containing TRUE and FALSE
    # ( do not use the built-in intersect or union ! )
    jaccardSimilarity <- function( x, y )
    {
        jaccard <- sum( x & y ) / ( sum(x) + sum(y) - sum( x & y ) )
        return(jaccard)
    }

    n <- nrow(data)
    for( i in 1:bootstrap )
    {
        # step 2, cluster the newly sampled data
        sampling  <- sample( n, n, replace = TRUE )
        boot_data <- data[ sampling, ]

        boot_result <- ClusterMethod( data = boot_data, k = k, noise.cut = noise.cut,
                                      clustermethod = clustermethod, ... )
        boot_num <- boot_result$clusternum

        # step 3
        for( j in 1:cluster_num )
        {
            # compare the original cluster with every bootstrapped cluster
            similarity <- lapply( 1:boot_num, function(b)
            {
                jaccardSimilarity( x = cluster_result$clusterlist[[j]][sampling],
                                   y = boot_result$clusterlist[[b]] )
            }) %>% unlist()

            # keep the largest Jaccard similarity
            boot_jaccard[ i, j ] <- max(similarity)
        }
    }

    # cluster stability, mean of all the bootstrapped Jaccard similarities
    boot_mean <- colMeans(boot_jaccard)

    # how many times each cluster's Jaccard similarity falls below the
    # specified "dissolve" value
    boot_dissolved <- apply( boot_jaccard, 2, function(x)
    {
        sum( x < dissolve, na.rm = TRUE )
    })

    boot_result <- list( result        = cluster_result$result,
                         bootmean      = boot_mean,
                         partition     = cluster_result$partition,
                         clusternum    = cluster_num,
                         bootdissolved = boot_dissolved )
    return(boot_result)
}

# test
# set.seed(1234)
# boot_clust <- ClusterBootstrap( data = mtcars_scaled, k = 4, clustermethod = "kmeanspp",
#                                 nstart = 10, iter.max = 100 )

# additional parameters for hclust are passed through ... ,
# e.g. method = "ward.D"
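
# a sketch of the same call with the hclust variant, passing method = "ward.D" through ... :
# boot_clust_hc <- ClusterBootstrap( data = mtcars_scaled, k = 4,
#                                    clustermethod = "hclust", method = "ward.D" )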


# boot_clust


# rule of thumb : values below 0.6 should be considered unstable
# boot_clust$bootmean

# clusters that have a low bootmean or a high bootdissolved count
# have the characteristics of what we've been calling the "other" cluster
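
# a sketch of flagging such clusters using the 0.6 rule of thumb above ; dissolving in
# more than half of the 100 default bootstrap iterations is an illustrative cutoff
# unstable <- which( boot_clust$bootmean < 0.6 | boot_clust$bootdissolved > 50 )
# unstable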