# Path: blob/master/clustering_old/clustering/kmeanspp.R
# 2615 views
library(data.table)

Kmeanspp <- function(data, k, ...) {
  # kmeans++: choose k well-spread initial centers, then run R's kmeans
  # starting from them. Workflow:
  # 1. choose a data point at random from the dataset, this serves as the
  #    first center point.
  # 2. compute the SQUARED distance of all data points to that center.
  # 3. to generate the next center point, each data point is chosen with a
  #    probability (weight) proportional to its squared distance to the
  #    nearest center chosen so far (sample.int's `prob` takes weights, they
  #    do not have to add up to one).
  # 4. after a new center is chosen, the weight of each data point becomes
  #    the minimum of its squared distances to ALL the centers generated so
  #    far (e.g. for the second round, compare the distance of the data point
  #    to the first and to the second center and keep the smaller one).
  # 5. repeat step 3 and 4 until having k centers.
  #
  # Parameters
  # ----------
  # data : data.frame, data.table, matrix data
  #     numeric observations, one row per data point; anything that is not
  #     already a data.table is converted with data.table()
  #
  # k : int
  #     number of clusters, must be between 1 and the number of distinct
  #     rows of data
  #
  # ... :
  #     all other parameters that can be passed into R's kmeans except for
  #     the data and center, see ?kmeans for more detail
  #
  # Returns
  # -------
  # result : list
  #     R's kmeans original output
  #
  # Reference
  # ---------
  # https://datasciencelab.wordpress.com/2014/01/15/improved-seeding-for-clustering-with-k-means/

  if (!is.data.table(data)) {
    data <- data.table(data)
  }

  # used with bootstrapped data, so unique the data to avoid duplicates:
  # two identical rows must never both be picked as initial centers,
  # kmeans complains about identical cluster centers
  unique_data <- unique(data)
  n <- nrow(unique_data)

  # fail early with a clear message; otherwise an oversized k only surfaces
  # as a sample.int error once every remaining weight has dropped to zero
  if (!is.numeric(k) || length(k) != 1L || is.na(k) || k < 1 ||
      k != round(k)) {
    stop("k must be a single positive whole number", call. = FALSE)
  }
  if (k > n) {
    stop(
      "k (", k, ") exceeds the number of distinct rows in data (", n, ")",
      call. = FALSE
    )
  }

  # plain numeric matrix so the distances can be computed in one vectorized
  # step per round instead of row by row
  points <- as.matrix(unique_data)

  # generate the first center randomly
  center_ids <- integer(k)
  center_ids[1] <- sample.int(n, 1)

  # squared distance of every point to its nearest center chosen so far
  nearest <- rep(Inf, n)

  # seq_len() makes this loop a no-op for k == 1; 1:(k - 1) would have
  # iterated over c(1, 0) and drawn a spurious extra center
  for (i in seq_len(k - 1)) {

    # squared distance between the most recent center and all the points
    center <- points[center_ids[i], ]
    dists <- rowSums(sweep(points, 2L, center)^2)

    # keep, per point, the smallest squared distance seen so far; points
    # that already are centers sit at 0 and therefore cannot be drawn again
    nearest <- pmin(nearest, dists)

    # sample the next center using the squared distance as the weight
    center_ids[i + 1] <- sample.int(n, 1, prob = nearest)
  }

  # cluster the whole "data", seeded with the centers picked by kmeans++
  results <- kmeans(data, centers = unique_data[center_ids, ], ...)
  return(results)
}


test <- function() {
  # test example data
  # the example code is wrapped in the string below; the string is only
  # evaluated and discarded, nothing inside it is executed
  "
# remove the species column
iris_data <- iris[ , -5 ]

# normalize the dataset
iris_data <- data.table( scale(iris_data) )
results <- Kmeanspp( data = iris_data, k = 3 )

# example output, the generated center, size of each cluster
# and confusion matrix of the original cluster and clustered result
results$center
results$size
table( iris$Species, results$cluster )

iris_data[ , `:=`( Species = iris$Species, cluster = results$cluster ) ]
split( iris_data, iris_data$cluster )
"
  print('testing')
}