Path: blob/master/2019-spring/slides/kmeans_example.Rmd
2051 views
---
title: "cluster" output: github_document
---
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse) library(broom) library(gganimate) library(gridExtra)
## Code to do each iteration of Kmeans and prepare the data for future plotting # I Definetly should be able to clean up this code with a loop or function but i cannot think right now... set.seed(1230) # Initialize Clustering iris_sample0 <- iris %>% select(x = Sepal.Length, y = Sepal.Width) %>% mutate(iter0 = factor(sample(c(1,2,3), size = n(), replace = TRUE))) iris_means0 <- iris_sample0 %>% select(group = iter0, x, y) %>% group_by(group) %>% summarize(center_x = mean(x), center_y = mean(y)) %>% mutate(iteration = "iter0") # Perform First Iteration iris_sample1 <- iris_sample0 %>% mutate(dist1 = sqrt((x - iris_means0$center_x[1])^2 + (y - iris_means0$center_y[1])^2), dist2 = sqrt((x - iris_means0$center_x[2])^2 + (y - iris_means0$center_y[2])^2), dist3 = sqrt((x - iris_means0$center_x[3])^2 + (y - iris_means0$center_y[3])^2), iter1 = case_when( pmin(dist1, dist2, dist3) == dist1 ~ 1, pmin(dist1, dist2, dist3) == dist2 ~ 2, pmin(dist1, dist2, dist3) == dist3 ~ 3), iter1 = factor(iter1)) %>% select(-starts_with("dist")) iris_means1 <- iris_sample1 %>% select(group = iter1, x, y) %>% group_by(group) %>% summarize(center_x = mean(x), center_y = mean(y)) %>% mutate(iteration = "iter1") # Perform Second Iteration iris_sample2 <- iris_sample1 %>% mutate(dist1 = sqrt((x - iris_means1$center_x[1])^2 + (y - iris_means1$center_y[1])^2), dist2 = sqrt((x - iris_means1$center_x[2])^2 + (y - iris_means1$center_y[2])^2), dist3 = sqrt((x - iris_means1$center_x[3])^2 + (y - iris_means1$center_y[3])^2), iter2 = case_when( pmin(dist1, dist2, dist3) == dist1 ~ 1, pmin(dist1, dist2, dist3) == dist2 ~ 2, pmin(dist1, dist2, dist3) == dist3 ~ 3), iter2 = factor(iter2)) %>% select(-starts_with("dist")) iris_means2 <- iris_sample2 %>% select(group = iter2, x, y) %>% group_by(group) %>% summarize(center_x = mean(x), center_y = mean(y)) %>% mutate(iteration = "iter2") # Perform Third Iteration iris_sample3 <- iris_sample2 %>% mutate(dist1 = sqrt((x - iris_means2$center_x[1])^2 + (y - iris_means2$center_y[1])^2), dist2 = sqrt((x - iris_means2$center_x[2])^2 + (y - iris_means2$center_y[2])^2), dist3 = sqrt((x - iris_means2$center_x[3])^2 + (y - iris_means2$center_y[3])^2), iter3 = case_when( pmin(dist1, dist2, dist3) == dist1 ~ 1, pmin(dist1, dist2, dist3) == dist2 ~ 2, pmin(dist1, dist2, dist3) == dist3 ~ 3), iter3 = factor(iter3)) %>% select(-starts_with("dist")) iris_means3 <- iris_sample3 %>% select(group = iter3, x, y) %>% group_by(group) %>% summarize(center_x = mean(x), center_y = mean(y)) %>% mutate(iteration = "iter3") # Perform Fourth Iteration iris_sample4 <- iris_sample3 %>% mutate(dist1 = sqrt((x - iris_means3$center_x[1])^2 + (y - iris_means3$center_y[1])^2), dist2 = sqrt((x - iris_means3$center_x[2])^2 + (y - iris_means3$center_y[2])^2), dist3 = sqrt((x - iris_means3$center_x[3])^2 + (y - iris_means3$center_y[3])^2), iter4 = case_when( pmin(dist1, dist2, dist3) == dist1 ~ 1, pmin(dist1, dist2, dist3) == dist2 ~ 2, pmin(dist1, dist2, dist3) == dist3 ~ 3), iter4 = factor(iter4)) %>% select(-starts_with("dist")) iris_means4 <- iris_sample4 %>% select(group = iter4, x, y) %>% group_by(group) %>% summarize(center_x = mean(x), center_y = mean(y)) %>% mutate(iteration = "iter4") # Perform Fifth Iteration iris_sample5 <- iris_sample4 %>% mutate(dist1 = sqrt((x - iris_means4$center_x[1])^2 + (y - iris_means4$center_y[1])^2), dist2 = sqrt((x - iris_means4$center_x[2])^2 + (y - iris_means4$center_y[2])^2), dist3 = sqrt((x - iris_means4$center_x[3])^2 + (y - iris_means4$center_y[3])^2), iter5 = case_when( pmin(dist1, dist2, dist3) == dist1 ~ 1, pmin(dist1, dist2, dist3) == dist2 ~ 2, pmin(dist1, dist2, dist3) == dist3 ~ 3), iter5 = factor(iter5)) %>% select(-starts_with("dist")) iris_means5 <- iris_sample5 %>% select(group = iter5, x, y) %>% group_by(group) %>% summarize(center_x = mean(x), center_y = mean(y)) %>% mutate(iteration = "iter5") df_kmeans_example <- iris_sample5 df_kmeans_ex_center <- bind_rows(iris_means0, iris_means1, iris_means2, iris_means3, iris_means4, iris_means5)
# Create facet plot for K means algorithm df_kmeans_centers <- df_kmeans_ex_center %>% mutate(sample = 1) %>% bind_rows(df_kmeans_ex_center) %>% replace_na(list(sample = 2)) %>% mutate(iteration = case_when( sample == 1 ~ paste(iteration, "b", sep = ""), sample == 2 ~ paste("iter", as.numeric(str_sub(iteration, start = -1)) + 1, "a", sep = ""))) %>% select(-sample, cluster = group) %>% filter(iteration != "iter6a") df_kmeans <- df_kmeans_example %>% rename(iter0a = iter0, iter1a = iter1, iter2a = iter2, iter3a = iter3, iter4a = iter4, iter5a = iter5) %>% mutate(iter0b = iter0a, iter1b = iter1a, iter2b = iter2a, iter3b = iter3a, iter4b = iter4a, iter5b = iter5a) %>% gather(starts_with("iter"), key = "iteration", value = "group") %>% mutate(iteration = factor(iteration)) %>% rename(cluster = group) iris_sample0 %>% ggplot(aes(x,y, col = "NA")) + geom_point() + scale_color_manual(values= c("#000000")) + labs(title = "K Means Clustering Algorithm", col = "cluster") df_kmeans %>% filter(str_detect(iteration, "b")) %>% ggplot(aes(x = x, y = y, col = cluster)) + geom_point(alpha = 0.6) + geom_point(data = filter(df_kmeans_centers, str_detect(iteration, "b")), aes(x = center_x, y = center_y, fill = cluster), shape = 21, col = "black",size = 3) + facet_wrap(~iteration) + labs(title = "K Means Clustering Algorithm")
tot <- df_kmeans %>% filter(iteration == "iter5b") %>% summarize(center_x = mean(x), center_y = mean(y)) %>% mutate(cluster = "total") df_kmeans %>% filter(iteration == "iter5b") %>% ggplot(aes(x = x, y = y, col = cluster)) + geom_point(alpha = 0.6) + geom_point(data = filter(df_kmeans_centers, iteration == "iter5b"), aes(x = center_x, y = center_y, fill = cluster), shape = 21, col = "black",size = 4) + geom_point(data = tot, aes(x = center_x, y = center_y), shape = 3, col = "black",size = 7) + labs(title = "K Means Clustering Algorithm")
df <- iris %>% select(x = Sepal.Length, y = Sepal.Width) for(k in 1:9){ model <- kmeans(df, centers = k, nstart = 20) clusters <- tibble(k = model$cluster) df <- df %>% bind_cols(clusters) } df %>% gather(-x, -y, key = "Number_of_clusters", value = "Value") %>% mutate(No_of_Clusters = factor(sort(rep(1:9, times = 150)))) %>% ggplot(aes(x, y, col = factor(Value))) + geom_point() + facet_wrap(~No_of_Clusters) + labs(title = "K means models with varying K", col = "Cluster") tibble(k = 1:9) %>% mutate(clusts = map(k, ~kmeans(df, .x, nstart = 20)), glanced = map(clusts, glance)) %>% unnest(glanced) %>% ggplot(aes(x = k, y = tot.withinss)) + geom_line() + geom_point() + scale_x_continuous(breaks = 1:9) + labs(title = "K Means Elbow Plot")
# Create gganimation for kmeans algorithm df_kmeans %>% mutate(group = seq_len(nrow(df_kmeans))) %>% ggplot(aes(x = x, y = y, col = cluster, group = group)) + geom_point(alpha = 0.8) + geom_point(data = df_kmeans_centers, aes(x = center_x, y = center_y, fill = cluster, group = cluster), shape = 21, col = "black",size = 6) + labs(title = "{closest_state}") + transition_states(iteration, transition_length = 5, state_length = 10) + enter_fade() + exit_fade() + ease_aes('sine-in-out')