Path: blob/master/clustering_old/clustering/clustering_functions.R
# Functions for evaluating clustering

# test example data for all the functions, remove the row.names or else
# it will produce warnings when used for bootstrapping (complaining about
# duplicated row.names)

# mtcars_scaled <- scale(mtcars)
# row.names(mtcars_scaled) <- NULL

# ------------------------------------------------------------------------------------
#### Choosing the right k for clustering
library(dplyr)
library(tidyr)
library(ggplot2)

## method 1. WSS : compute the total within sum of squares error,
# this measures how close the points in a cluster are to each other

# [Distance] : calculates the sum of squared distances for a given cluster of points,
# note that the "sum of squared distances" is used here for measuring variance
Distance <- function(cluster)
{
    # the center of the cluster, mean of all the points
    center <- colMeans(cluster)

    # calculate the summed squared error between every point and
    # the center of that cluster
    distance <- apply( cluster, 1, function(row)
    {
        sum( ( row - center )^2 )
    }) %>% sum()

    return(distance)
}

# calculate the within sum of squares error manually for hierarchical clustering
# [WSS] : pass in the dataset, and the resulting groups (clusters)
WSS <- function( data, groups )
{
    k <- max(groups)

    # loop through each group (cluster) and obtain its
    # within sum of squares error
    total <- lapply( 1:k, function(k)
    {
        # extract the data points within the cluster
        cluster <- subset( data, groups == k )

        distance <- Distance(cluster)
        return(distance)
    }) %>% unlist()

    return( sum(total) )
}

# testing
# sum_squared_error <- WSS( data = mtcars_scaled, groups = groups )

# this value will decrease as the number of clusters increases,
# because each cluster will be smaller and tighter,
# and the rate of the decrease will slow down after the optimal cluster number
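
# a minimal sketch of the elbow heuristic described above, assuming the commented-out
# mtcars_scaled example data from the top of this script : compute the WSS for
# k = 2 to 6 with ward hierarchical clustering, plot it, and look for the k where
# the decrease starts to flatten out

# hc <- hclust( dist(mtcars_scaled), method = "ward.D" )
# wss_by_k <- sapply( 2:6, function(k) WSS( data = mtcars_scaled, groups = cutree( hc, k ) ) )
# plot( 2:6, wss_by_k, type = "b", xlab = "k", ylab = "within sum of squares" )
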
## method 2 : Calinski-Harabasz index, the ratio of the between cluster variance
# to the total within cluster variance
# http://www.mathworks.com/help/stats/clustering.evaluation.calinskiharabaszevaluation-class.html

# TSS (total sum of squares) : the squared distance of all the data points from
# the dataset's centroid

# BSS (between sum of squares) = TSS - WSS, measures how far apart the clusters
# are from each other
# !! a good clustering has a small WSS and a large BSS

# CHIndex = B / W, the ratio should be maximized at the optimal k
# B = BSS(k) / (k-1) ; k = # of clusters
# W = WSS(k) / (n-k) ; n = # of data points

# [CHCriterion] : calculates both the Calinski-Harabasz index and the within sum of squares error
# @kmax = maximum number of clusters, calculates the CH index from 2 clusters up to kmax
# @clustermethod = "kmeanspp", "hclust"

CHCriterion <- function( data, kmax, clustermethod, ... )
{
    if( !clustermethod %in% c( "kmeanspp", "hclust" ) )
        stop( "method must be one of 'kmeanspp' or 'hclust'" )

    # total sum of squares error (independent of the number of clusters k)
    tss <- Distance( cluster = data )

    # initialize a numeric vector for storing the scores
    wss <- numeric(kmax)

    # k starts from 2, a single cluster is meaningless
    if( clustermethod == "kmeanspp" )
    {
        for( k in 2:kmax )
        {
            results <- Kmeanspp( data, k, ... )
            wss[k]  <- results$tot.withinss
        }
    }else # "hclust"
    {
        d <- dist( data, method = "euclidean" )
        clustering <- hclust( d, ... )
        for( k in 2:kmax )
        {
            groups <- cutree( clustering, k )
            wss[k] <- WSS( data = data, groups = groups )
        }
    }

    # between sum of squares
    bss <- tss - wss[-1]

    # the cluster count starts from 2 !
    numerator   <- bss / ( 1:(kmax-1) )
    denominator <- wss[-1] / ( nrow(data) - 2:kmax )

    criteria <- data.frame( k = 2:kmax,
                            CHIndex = numerator / denominator,
                            wss = wss[-1] )

    # convert to long format for plotting
    criteria_long <- gather( criteria, "index", "value", -1 )

    plot <- ggplot( criteria_long, aes( k, value, color = index ) ) +
            geom_line() + geom_point( aes( shape = index ), size = 3 ) +
            facet_wrap( ~ index, scale = "free_y" ) +
            guides( color = FALSE, shape = FALSE )

    return( list( data = criteria,
                  plot = plot ) )
}


# testing
# criteria <- CHCriterion( data = mtcars_scaled, kmax = 10,
#                          clustermethod = "hclust", method = "ward.D" )

# criteria <- CHCriterion( data = mtcars_scaled, kmax = 10,
#                          clustermethod = "kmeanspp", nstart = 10, iter.max = 100 )


# for the CH index, the local maximum should be your ideal cluster number
# for WSS, the "elbow" of the curve should be your ideal cluster number

# for the mtcars_scaled example, the CH index shows a local optimum at 4 clusters,
# which seems reasonable when looking at the dendrogram

# test <- hclust( dist(mtcars_scaled), method = "ward.D" )
# plot(test)
# rect.hclust( test, k = 4 )
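
# a small follow-up sketch, assuming the `criteria` object returned by the hclust
# example above : programmatically pick the k with the largest CH index as a
# starting point, then double check it against the plot for local maxima

# best_k <- criteria$data$k[ which.max( criteria$data$CHIndex ) ]
# criteria$plot
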
# ----------------------------------------------------------------------------------------------
#### bootstrap evaluation of a clustering result
# clustering algorithms will often produce several clusters that represent
# actual clusters of the data, and then one or two clusters that represent "others",
# meaning that they're made up of data points that have no relationship with each other,
# they just don't fit anywhere else.

# use bootstrap resampling to evaluate the stability of each cluster, steps :
# 1. Cluster the original data.
# 2. Draw a new dataset of the same size as the original by resampling the original
#    dataset with replacement, so some data points may show up more than once
#    while others do not show up at all. Cluster this new data.
# 3. For every cluster in the original clustering, find the most similar cluster in the
#    new clustering, which is the one with the maximum Jaccard Similarity. If this
#    Jaccard Similarity is less than .5, then the original cluster is considered to
#    be "dissolved", that is, it did not show up in the new clustering. A cluster that
#    dissolves too often is most likely not a real cluster.
# Jaccard Similarity of two clusters = size of their intersection / size of their union
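
# a tiny illustration of the jaccard similarity on two boolean membership vectors
# (the values below are hypothetical, just for intuition) :

# x <- c( TRUE, TRUE, FALSE, TRUE )   # points belonging to the original cluster
# y <- c( TRUE, FALSE, FALSE, TRUE )  # points belonging to the bootstrapped cluster
# sum( x & y ) / sum( x | y )         # intersection / union = 2 / 3
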
# ----------------------------------------------------------------------------------------------
# [ClusterMethod] : supports hierarchical clustering and kmeans++ for the bootstrap

# kmeans++ explanation and source code
# sources in the function Kmeanspp
source("/Users/ethen/machine-learning/clustering_old/clustering/kmeanspp.R")

# @data = data frame type data, matrix also works
# @k = specify the number of clusters
# @clustermethod = "hclust" for hierarchical clustering, and "kmeanspp" for kmeans++
# @noise.cut = if specified, any resulting cluster whose size is smaller than this number
#              will be considered noise, and all of these noise clusters will be
#              grouped together into one single cluster
# @... = pass in other parameters for hclust or kmeans++ (same as kmeans)

ClusterMethod <- function( data, k, noise.cut = 0, clustermethod, ... )
{
    if( !clustermethod %in% c( "kmeanspp", "hclust" ) )
        stop( "method must be one of 'kmeanspp' or 'hclust'" )

    # hierarchical clustering
    if( clustermethod == "hclust" )
    {
        cluster   <- hclust( dist(data), ... )
        partition <- cutree( cluster, k )

    }else # kmeanspp
    {
        cluster   <- Kmeanspp( data = data, k = k, ... )
        partition <- cluster$cluster
    }

    # equivalent to k
    cluster_num <- max(partition)

    # calculate each cluster's size
    cluster_size <- numeric(cluster_num)
    for( i in 1:cluster_num )
        cluster_size[i] <- sum( partition == i )

    # if there are clusters whose size is smaller than the specified noise.cut, do :
    not_noise_num <- sum( cluster_size > noise.cut )

    if( cluster_num > not_noise_num )
    {
        # extract the clusters whose size is larger than noise.cut
        cluster_new <- (1:cluster_num)[ cluster_size > noise.cut ]

        # all the data points whose original cluster is smaller than the noise.cut
        # will be assigned to the same new cluster
        cluster_num <- not_noise_num + 1

        # new cluster numbering, assign the noise cluster's number first,
        # then adjust the original clusters' numbers
        new <- rep( cluster_num, nrow(data) )

        for( i in 1:not_noise_num )
            new[ ( partition == cluster_new[i] ) ] <- i

        partition <- new
    }

    # boolean vectors indicating which data points belong to which cluster
    cluster_list <- lapply( 1:cluster_num, function(x)
    {
        return( partition == x )
    })

    cluster_result <- list( result      = cluster,
                            partition   = partition,
                            clusternum  = cluster_num,
                            clusterlist = cluster_list )
    return(cluster_result)
}

# cluster_result <- ClusterMethod( data = mtcars_scaled, k = 5, clustermethod = "hclust" )


# ----------------------------------------------------------------------------------------------
# [ClusterBootstrap] :
# @bootstrap : number of bootstrap iterations
# @dissolve  : if the jaccard similarity is smaller than this number, the cluster is
#              considered to be "dissolved"
# Returns : 1. result : the original clustering object.
#           2. bootmean : mean of the Jaccard Similarity over the specified number of bootstraps.
#           3. partition : the original clustering result, a vector specifying which
#              group each data point belongs to.
#           4. clusternum : final cluster count,
#              if you specified noise.cut then it might be different from k.
#           5. bootdissolved : number of times each cluster's jaccard similarity was smaller
#              than the dissolve value.

ClusterBootstrap <- function( data, k, noise.cut = 0, bootstrap = 100,
                              dissolve = .5, clustermethod, ... )
{
    # step 1
    cluster_result <- ClusterMethod( data = data, k = k, noise.cut = noise.cut,
                                     clustermethod = clustermethod, ... )

    cluster_num  <- cluster_result$clusternum
    boot_jaccard <- matrix( 0, nrow = bootstrap, ncol = cluster_num )

    # pass in two vectors containing TRUE and FALSE
    # ( do not use the built-in intersect or union ! )
    jaccardSimilarity <- function( x, y )
    {
        jaccard <- sum( x & y ) / ( sum(x) + sum(y) - sum( x & y ) )
        return(jaccard)
    }

    n <- nrow(data)
    for( i in 1:bootstrap )
    {
        # step 2, cluster the newly sampled data
        sampling  <- sample( n, n, replace = TRUE )
        boot_data <- data[ sampling, ]

        boot_result <- ClusterMethod( data = boot_data, k = k, noise.cut = noise.cut,
                                      clustermethod = clustermethod, ... )
        boot_num <- boot_result$clusternum

        # step 3
        for( j in 1:cluster_num )
        {
            # compare the original cluster with every bootstrapped cluster
            similarity <- lapply( 1:boot_num, function(k)
            {
                jaccard <- jaccardSimilarity( x = cluster_result$clusterlist[[j]][sampling],
                                              y = boot_result$clusterlist[[k]] )
            }) %>% unlist()

            # keep the largest jaccard similarity
            boot_jaccard[ i, j ] <- max(similarity)
        }
    }

    # each cluster's stability, mean of all the bootstrapped jaccard similarities
    boot_mean <- colMeans(boot_jaccard)

    # how many times each cluster's jaccard similarity fell below the
    # specified "dissolve" value
    boot_dissolved <- apply( boot_jaccard, 2, function(x)
    {
        sum( x < dissolve, na.rm = TRUE )
    })

    boot_result <- list( result        = cluster_result$result,
                         bootmean      = boot_mean,
                         partition     = cluster_result$partition,
                         clusternum    = cluster_num,
                         bootdissolved = boot_dissolved )
    return(boot_result)
}

# test
# set.seed(1234)
# boot_clust <- ClusterBootstrap( data = mtcars_scaled, k = 4, clustermethod = "kmeanspp",
#                                 nstart = 10, iter.max = 100 )

# ... parameters for hclust
# method = "ward.D"

# boot_clust

# rule of thumb, values below 0.6 should be considered unstable
# boot_clust$bootmean

# clusters that have a low bootmean or a high bootdissolved count
# have the characteristics of what we've been calling the "other" cluster.
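
# a short follow-up sketch, assuming the `boot_clust` object from the test above :
# flag the clusters whose stability falls below the 0.6 rule of thumb and
# see how often each of them dissolved

# unstable <- which( boot_clust$bootmean < 0.6 )
# boot_clust$bootdissolved[unstable]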