CoCalc -- kmeanspp.R

GitHub Repository: ethen8181/machine-learning
Path: blob/master/clustering_old/clustering/kmeanspp.R
2615 views
1
library(data.table)
2

3
Kmeanspp <- function( data, k, ... )
{
	# kmeans++ : choose k well-spread initial centers, then run R's kmeans with them. Workflow:
	# 1. choose a data point uniformly at random from the de-duplicated dataset,
	#    this serves as the first center point.
	# 2. compute the SQUARED distance of all data points to that center point.
	# 3. to generate the next center point, each data point is chosen with a probability
	#    proportional to its weight, i.e. its squared distance divided by the total squared
	#    distance (sample.int normalizes the weights itself, so they do not need to be
	#    scaled to add up to one).
	# 4. next recompute the weight of each data point as the minimum of the squared distance
	#    between it and ALL the centers that are already generated ( e.g. for the second
	#    iteration, compare the distance of the data point to the first and to the second
	#    center and keep the smaller one ).
	# 5. repeat step 3 and 4 until having k centers.
	#
	# Parameters
	# ----------
	# data : data.frame, data.table, matrix data
	#     numeric (or logical) observations, one row per data point
	#
	# k : int
	#     number of clusters, must be between 1 and the number of distinct rows of data
	#
	# ... :
	#     all other parameters that can be passed into R's kmeans except for the data and center
	#     , see ?kmeans for more detail
	#
	# Returns
	# -------
	# result : list
	#     R's kmeans original output
	#
	# Raises
	# ------
	# error if k is not a single number >= 1, if data is not numeric, or if k is larger
	# than the number of distinct data points
	#
	# Reference
	# ---------
	# https://datasciencelab.wordpress.com/2014/01/15/improved-seeding-for-clustering-with-k-means/

	stopifnot( is.numeric(k), length(k) == 1L, !is.na(k), k >= 1 )
	k <- as.integer(k)

	# work on a plain matrix: kmeans converts its input with as.matrix anyway, and
	# row-wise arithmetic on a matrix avoids re-coercing a table on every iteration
	points <- as.matrix(data)
	if( !is.numeric(points) && !is.logical(points) )
		stop( "data must contain only numeric columns", call. = FALSE )

	# keep the output shape of the table-based implementation: no row names, and
	# V1..Vp as column names when the input matrix carries none
	rownames(points) <- NULL
	if( is.null( colnames(points) ) )
		colnames(points) <- paste0( "V", seq_len( ncol(points) ) )

	# used with bootstrapped data. so unique the data
	# to avoid duplicates, or kmeans will warn about
	# identical cluster center
	unique_points <- unique(points)
	n <- nrow(unique_points)
	if( k > n )
		stop( "k (", k, ") is larger than the number of distinct data points (", n, ")",
		      call. = FALSE )

	# generate the first center randomly
	center_ids <- integer(k)
	center_ids[1] <- sample.int( n, 1 )

	# running minimum of the squared distance of every point to the centers chosen so far;
	# already chosen centers get weight 0 and therefore can not be drawn again
	distance <- rep( Inf, n )

	# seq_len yields an empty loop for k == 1, whereas 1:( k - 1 ) would iterate over c( 1, 0 )
	for( i in seq_len( k - 1 ) ){

		# squared distance between the newest center and all the data points
		center <- unique_points[ center_ids[i], ]
		dists <- rowSums( sweep( unique_points, 2, center )^2 )

		# sample the next center using the min squared distance as the weighted probability
		distance <- pmin( distance, dists )
		center_ids[ i + 1 ] <- sample.int( n, 1, prob = distance )
	}

	centers <- unique_points[ center_ids, , drop = FALSE ]

	# kmeans interprets a length one "centers" as the NUMBER of clusters. That only
	# happens here for one cluster on one-column data, where the starting point is
	# irrelevant, so simply ask for a single cluster
	if( length(centers) == 1L )
		return( kmeans( points, centers = 1L, ... ) )

	# cluster the whole "data", using the centers generated using kmeanspp
	kmeans( points, centers = centers, ... )
}
76

77

78
test <- function(){
	# Usage example for Kmeanspp on the iris data.
	# The example code is wrapped in the string below, so it is never executed;
	# copy it into a session to run it. Calling test() itself only prints 'testing'
	# and returns the value of that print call.
	"
	# remove the species column
	iris_data <- iris[ , -5 ]

	# normalize the dataset
	iris_data <- data.table( scale(iris_data) )
	results <- Kmeanspp( data = iris_data, k = 3 )

	# example output, the generated center, size of each cluster
	# and confusion matrix of the original cluster and clustered result
	results$centers
	results$size
	table( iris$Species, results$cluster )

	iris_data[ , `:=`( Species = iris$Species, cluster = results$cluster ) ]
	split( iris_data, iris_data$cluster )
	"
	print('testing')
}
100

101

102

103

104
Product

Resources

Company