GitHub Repository: ethen8181/machine-learning
Path: blob/master/clustering_old/clustering/clustering_functions.R
# Functions for evaluating clustering

# test example data for all the functions ; remove row.names, otherwise
# bootstrapping will produce warnings (complaining about duplicated row.names)

# mtcars_scaled <- scale(mtcars)
# row.names(mtcars_scaled) <- NULL

# ------------------------------------------------------------------------------------
#### Choosing the right k for clustering
library(dplyr)
library(tidyr)
library(ggplot2)

## method 1. WSS : compute the total within-cluster sum of squared error,
# this measures how close the points in a cluster are to each other

# [Distance] : calculates the sum of squared distances between every point in a
# cluster and the cluster's center ; this sum of squared distances is the measure
# of variance used here
Distance <- function(cluster)
{
    # the center of the cluster, mean of all the points
    center <- colMeans(cluster)

    # calculate the summed squared error between every point and
    # the center of that cluster
    distance <- apply( cluster, 1, function(row)
    {
        sum( ( row - center )^2 )
    }) %>% sum()

    return(distance)
}
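
# a quick sanity-check sketch, assuming mtcars_scaled has been created as in the
# commented example above : for the whole dataset, Distance() should match the
# total sum of squared deviations from the column means
# all.equal( Distance(mtcars_scaled),
#            sum( scale( mtcars_scaled, center = TRUE, scale = FALSE )^2 ) )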


# calculate the within-cluster sum of squared error manually for hierarchical clustering
# [WSS] : pass in the dataset and the resulting groups (clusters)
WSS <- function( data, groups )
{
    k <- max(groups)

    # loop through each group (cluster) and obtain its
    # within-cluster sum of squared error
    total <- lapply( 1:k, function(k)
    {
        # extract the data points within the cluster
        cluster <- subset( data, groups == k )

        distance <- Distance(cluster)
        return(distance)
    }) %>% unlist()

    return( sum(total) )
}

# testing
# sum_squared_error <- WSS( data = mtcars_scaled, groups = groups )

# this value will decrease as the number of clusters increases,
# because each cluster will be smaller and tighter,
# and the rate of the decrease will slow down after the optimal cluster number


## method 2 : Calinski-Harabasz index, the ratio of the between-cluster variance
# to the total within-cluster variance
# http://www.mathworks.com/help/stats/clustering.evaluation.calinskiharabaszevaluation-class.html

# TSS (total sum of squares) : the sum of squared distances of all the data points
# from the dataset's centroid

# BSS (between sum of squares) = TSS - WSS, measures how far apart the clusters
# are from each other
# !! a good clustering has a small WSS and a high BSS

# CHIndex = B / W, the ratio should be maximized at the optimal k
# B = BSS(k) / (k-1) ; k = # of clusters
# W = WSS(k) / (n-k) ; n = # of data points
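
# a small worked sketch of the formula above, using base kmeans' betweenss and
# tot.withinss as BSS and WSS ; assumes mtcars_scaled from the commented example above
# k  <- 4
# n  <- nrow(mtcars_scaled)
# km <- kmeans( mtcars_scaled, centers = k, nstart = 10 )
# ch <- ( km$betweenss / ( k - 1 ) ) / ( km$tot.withinss / ( n - k ) )
# ch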

# [CHCriterion] : calculates both the Calinski-Harabasz index and the
#                 within-cluster sum of squared error
# @kmax          = maximum cluster number, calculates the CH index from 2 clusters to kmax
# @clustermethod = "kmeanspp", "hclust"

CHCriterion <- function( data, kmax, clustermethod, ... )
{
    if( !clustermethod %in% c( "kmeanspp", "hclust" ) )
        stop( "clustermethod must be one of 'kmeanspp' or 'hclust'" )

    # total sum of squared error (independent of the number of clusters k)
    tss <- Distance( cluster = data )

    # initialize a numeric vector storing the score
    wss <- numeric(kmax)

    # k starts from 2, a single cluster is meaningless
    if( clustermethod == "kmeanspp" )
    {
        for( k in 2:kmax )
        {
            results <- Kmeanspp( data, k, ... )
            wss[k]  <- results$tot.withinss
        }
    }else # "hclust"
    {
        d <- dist( data, method = "euclidean" )
        clustering <- hclust( d, ... )
        for( k in 2:kmax )
        {
            groups <- cutree( clustering, k )
            wss[k] <- WSS( data = data, groups = groups )
        }
    }

    # between sum of squares
    bss <- tss - wss[-1]

    # cluster count starts from 2 !
    numerator   <- bss / ( 1:(kmax-1) )
    denominator <- wss[-1] / ( nrow(data) - 2:kmax )

    criteria <- data.frame( k = 2:kmax,
                            CHIndex = numerator / denominator,
                            wss = wss[-1] )

    # convert to long format for plotting
    criteria_long <- gather( criteria, "index", "value", -1 )

    plot <- ggplot( criteria_long, aes( k, value, color = index ) ) +
            geom_line() + geom_point( aes( shape = index ), size = 3 ) +
            facet_wrap( ~ index, scales = "free_y" ) +
            guides( color = FALSE, shape = FALSE )

    return( list( data = criteria,
                  plot = plot ) )
}


# testing
# criteria <- CHCriterion( data = mtcars_scaled, kmax = 10,
#                          clustermethod = "hclust", method = "ward.D" )

# criteria <- CHCriterion( data = mtcars_scaled, kmax = 10,
#                          clustermethod = "kmeanspp", nstart = 10, iter.max = 100 )


# for CHIndex, the local maximum should be your ideal cluster number
# for WSS, the center of the "elbow" should be your ideal cluster number

# for the mtcars_scaled example, CHIndex shows a local optimum at 4 clusters,
# which seems reasonable when looking at the dendrogram

# test <- hclust( dist(mtcars_scaled), method = "ward.D" )
# plot(test)
# rect.hclust( test, k = 4 )


# ----------------------------------------------------------------------------------------------
#### bootstrap evaluation of a clustering result
# clustering algorithms will often produce several clusters that represent
# actual clusters of the data, plus one or two clusters that represent "others",
# meaning they're made up of data points that have no relationship with each other,
# they just don't fit anywhere else

# use bootstrap resampling to evaluate the stability of the clusters, steps :
# 1. Cluster the original data.
# 2. Draw a new dataset of the same size as the original by resampling the original
#    dataset with replacement, therefore some data points may show up more than once
#    while others not at all. Cluster this new data.
# 3. For every cluster in the original clustering, find the most similar cluster in the
#    new clustering, which is the one with the max Jaccard similarity. If this
#    Jaccard similarity is less than .5, the original cluster is considered to
#    be "dissolved", that is, it did not show up in the new clustering. A cluster that
#    dissolves too often is most likely not a real cluster.
# Jaccard similarity of two vectors = intersect / union
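
# a tiny worked example of this Jaccard similarity on two logical membership vectors,
# mirroring the jaccardSimilarity helper defined inside ClusterBootstrap below :
# x <- c( TRUE, TRUE, FALSE, TRUE )
# y <- c( TRUE, FALSE, FALSE, TRUE )
# sum( x & y ) / sum( x | y )   # intersect / union = 2 / 3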



# ----------------------------------------------------------------------------------------------
# [ClusterMethod] : supports hierarchical clustering and kmeans++ for the bootstrap evaluation

# kmeans++ explanation and source code
# source in function Kmeanspp
source("/Users/ethen/machine-learning/clustering_old/clustering/kmeanspp.R")

# @data          = data frame type data, matrix also works
# @k             = specify the number of clusters
# @clustermethod = "hclust" for hierarchical clustering, "kmeanspp" for kmeans++
# @noise.cut     = if specified, any resulting cluster whose size is smaller than this
#                  number will be considered as noise, and all of these noise clusters
#                  will be grouped together as one cluster
# @...           = pass in other parameters for hclust or kmeans++ (same as kmeans)

ClusterMethod <- function( data, k, noise.cut = 0, clustermethod, ... )
{
    if( !clustermethod %in% c( "kmeanspp", "hclust" ) )
        stop( "clustermethod must be one of 'kmeanspp' or 'hclust'" )

    # hierarchical clustering
    if( clustermethod == "hclust" )
    {
        cluster   <- hclust( dist(data), ... )
        partition <- cutree( cluster, k )

    }else # kmeanspp
    {
        cluster   <- Kmeanspp( data = data, k = k, ... )
        partition <- cluster$cluster
    }

    # equivalent to k
    cluster_num <- max(partition)

    # calculate each cluster's size
    cluster_size <- numeric(cluster_num)
    for( i in 1:cluster_num )
        cluster_size[i] <- sum( partition == i )

    # if there are clusters whose size is smaller than the specified noise.cut, do :
    not_noise_num <- sum( cluster_size > noise.cut )

    if( cluster_num > not_noise_num )
    {
        # extract the clusters whose size is larger than noise.cut
        cluster_new <- (1:cluster_num)[ cluster_size > noise.cut ]

        # all the data points whose original cluster is smaller than the noise.cut
        # will be assigned to the same new cluster
        cluster_num <- not_noise_num + 1

        # new cluster numbering, assign the noise cluster's number first,
        # then adjust the original clusters' numbers
        new <- rep( cluster_num, nrow(data) )

        for( i in 1:not_noise_num )
            new[ ( partition == cluster_new[i] ) ] <- i

        partition <- new
    }

    # list of logical vectors, one per cluster,
    # indicating which data points belong to that cluster
    cluster_list <- lapply( 1:cluster_num, function(x)
    {
        return( partition == x )
    })

    cluster_result <- list( result      = cluster,
                            partition   = partition,
                            clusternum  = cluster_num,
                            clusterlist = cluster_list )
    return(cluster_result)
}

# testing
# cluster_result <- ClusterMethod( data = mtcars_scaled, k = 5, clustermethod = "hclust" )
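
# an additional sketch using noise.cut : with noise.cut = 1 (an illustrative value),
# any cluster containing a single point is treated as noise and merged into one cluster
# cluster_result <- ClusterMethod( data = mtcars_scaled, k = 5, noise.cut = 1,
#                                  clustermethod = "hclust", method = "ward.D" )
# cluster_result$clusternum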



# ----------------------------------------------------------------------------------------------
# [ClusterBootstrap] :
# @bootstrap = number of bootstrap iterations
# @dissolve  = if a cluster's Jaccard similarity is smaller than this number,
#              it is considered to be "dissolved"
# Returns : 1. result : the original clustering object.
#           2. bootmean : mean of each cluster's Jaccard similarity across the
#              bootstrap iterations.
#           3. partition : the original clustering result, a vector specifying which
#              group each data point belongs to.
#           4. clusternum : final cluster count,
#              if you specified noise.cut then it might be different from k.
#           5. bootdissolved : number of times each cluster's Jaccard similarity fell
#              below the dissolve value.

ClusterBootstrap <- function( data, k, noise.cut = 0, bootstrap = 100,
                              dissolve = .5, clustermethod, ... )
{
    # step 1
    cluster_result <- ClusterMethod( data = data, k = k, noise.cut = noise.cut,
                                     clustermethod = clustermethod, ... )

    cluster_num  <- cluster_result$clusternum
    boot_jaccard <- matrix( 0, nrow = bootstrap, ncol = cluster_num )

    # pass in two logical vectors containing TRUE and FALSE
    # ( do not use the built-in intersect or union ! )
    jaccardSimilarity <- function( x, y )
    {
        jaccard <- sum( x & y ) / ( sum(x) + sum(y) - sum( x & y ) )
        return(jaccard)
    }

    n <- nrow(data)
    for( i in 1:bootstrap )
    {
        # step 2, cluster the newly sampled data
        sampling  <- sample( n, n, replace = TRUE )
        boot_data <- data[ sampling, ]

        boot_result <- ClusterMethod( data = boot_data, k = k, noise.cut = noise.cut,
                                      clustermethod = clustermethod, ... )
        boot_num <- boot_result$clusternum

        # step 3
        for( j in 1:cluster_num )
        {
            # compare the original cluster with every bootstrapped cluster
            similarity <- lapply( 1:boot_num, function(b)
            {
                jaccardSimilarity( x = cluster_result$clusterlist[[j]][sampling],
                                   y = boot_result$clusterlist[[b]] )
            }) %>% unlist()

            # keep the largest Jaccard similarity
            boot_jaccard[ i, j ] <- max(similarity)
        }
    }

    # cluster stability, mean of all the bootstrapped Jaccard similarities
    boot_mean <- colMeans(boot_jaccard)

    # how many times each cluster's Jaccard similarity falls below the
    # specified "dissolve" value
    boot_dissolved <- apply( boot_jaccard, 2, function(x)
    {
        sum( x < dissolve, na.rm = TRUE )
    })

    boot_result <- list( result        = cluster_result$result,
                         bootmean      = boot_mean,
                         partition     = cluster_result$partition,
                         clusternum    = cluster_num,
                         bootdissolved = boot_dissolved )
    return(boot_result)
}

# test
# set.seed(1234)
# boot_clust <- ClusterBootstrap( data = mtcars_scaled, k = 4, clustermethod = "kmeanspp",
#                                 nstart = 10, iter.max = 100 )

# additional parameters for hclust are passed through ... ,
# e.g. method = "ward.D"
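
# a sketch of the same call with the hclust variant, passing method = "ward.D" through ... :
# boot_clust_hc <- ClusterBootstrap( data = mtcars_scaled, k = 4,
#                                    clustermethod = "hclust", method = "ward.D" )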


# boot_clust


# rule of thumb : values below 0.6 should be considered unstable
# boot_clust$bootmean

# clusters that have a low bootmean or a high bootdissolved count
# have the characteristics of what we've been calling the "other" cluster
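
# a sketch of flagging such clusters using the 0.6 rule of thumb above ; dissolving in
# more than half of the 100 default bootstrap iterations is an illustrative cutoff
# unstable <- which( boot_clust$bootmean < 0.6 | boot_clust$bootdissolved > 50 )
# unstable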