GitHub Repository: ethen8181/machine-learning
Path: blob/master/clustering_old/clustering/kmeanspp.R
library(data.table)

Kmeanspp <- function( data, k, ... )
{
    # kmeans++: generates better initial centers for kmeans. Workflow:
    # 1. choose a data point at random from the dataset; this serves as the first center.
    # 2. compute the SQUARED distance of every other data point to the randomly chosen center.
    # 3. to generate the next center, each data point is chosen with probability (weight)
    #    equal to its squared distance to the center chosen in this round divided by the
    #    total squared distance (R's sample functions treat `prob` as weights, so they
    #    do not need to be normalized to sum to one).
    # 4. then recompute the weight of each data point as the minimum of its squared distance
    #    to ALL the centers generated so far (e.g. for the second iteration, compare the data
    #    point's distance to the first and second center and keep the smaller one).
    # 5. repeat steps 3 and 4 until k centers have been generated.
    #    A small sketch of the weighted draw used in step 3 follows the function below.
    #
    # Parameters
    # ----------
    # data : data.frame, data.table or matrix
    #
    # k : int
    #     number of clusters
    #
    # ... :
    #     all other parameters that can be passed to R's kmeans except for the data
    #     and the centers, see ?kmeans for more detail
    #
    # Returns
    # -------
    # result : list
    #     R's original kmeans output
    #
    # Reference
    # ---------
    # https://datasciencelab.wordpress.com/2014/01/15/improved-seeding-for-clustering-with-k-means/

    if( !is.data.table(data) )
        data <- data.table(data)

    # often used with bootstrapped data, so drop duplicate rows first,
    # otherwise kmeans will warn about identical cluster centers
    unique_data <- unique(data)

    # generate the first center randomly
    n <- nrow(unique_data)
    center_ids <- integer(k)
    center_ids[1] <- sample.int( n, 1 )

    # seq_len handles the k = 1 edge case, where no additional centers are needed
    for( i in seq_len( k - 1 ) ){

        # calculate the squared distance between the current center
        # and all the data points
        center <- unlist( unique_data[ center_ids[i], ] )
        dists <- apply( unique_data, 1, function(datapoint){
            sum( ( datapoint - center )^2 )
        })

        # sample the next center using the squared distance as the sampling weight;
        # starting from the second center, the "squared distance" of each data point
        # is the minimum of its squared distances to all centers generated so far
        if( i == 1 ){
            distance <- dists
        }else{
            distance <- pmin( distance, dists )
        }
        center_ids[ i + 1 ] <- sample.int( n, 1, prob = distance )
    }

    # cluster the whole `data`, using the centers chosen by kmeans++
    results <- kmeans( data, centers = unique_data[ center_ids, ], ... )
    return(results)
}
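
# A minimal, illustrative sketch (not part of the original script; the toy data and
# the function name are made up for demonstration) of the weighted draw referenced
# in step 3 of the workflow above: sample.int() accepts unnormalized weights through
# `prob`, so the squared distances can be passed in directly.
seeding_weight_demo <- function(){
    x <- c( 0, 0.1, 0.2, 10, 10.1, 10.2 )
    first <- sample.int( length(x), 1 )
    weight <- ( x - x[first] )^2
    second <- sample.int( length(x), 1, prob = weight )
    # `second` almost always lands in the group of points far away from `first`
    c( first = first, second = second )
}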

test <- function(){
    # test with example data
    # the example code is wrapped in the string below (it is not executed)
    "
    # remove the species column
    iris_data <- iris[ , -5 ]

    # normalize the dataset
    iris_data <- data.table( scale(iris_data) )
    results <- Kmeanspp( data = iris_data, k = 3 )

    # example output: the generated centers, the size of each cluster and
    # the confusion matrix of the original species versus the clustered result
    results$centers
    results$size
    table( iris$Species, results$cluster )

    iris_data[ , `:=`( Species = iris$Species, cluster = results$cluster ) ]
    split( iris_data, iris_data$cluster )
    "
    print('testing')
}
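
# For reference, a minimal sketch (not part of the original script; names are
# illustrative) of the kmeans() call pattern that Kmeanspp() relies on: base R's
# kmeans() accepts an explicit matrix of starting centers in place of a cluster count.
explicit_centers_demo <- function(){
    X <- matrix( rnorm(100 * 2), ncol = 2 )
    # any k distinct rows of the data can serve as the initial centers
    starts <- X[ sample.int( nrow(X), 3 ), ]
    kmeans( X, centers = starts )
}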