\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{K-Means Clustering: Algorithm and Analysis}
\author{Machine Learning Foundations}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents a comprehensive study of K-means clustering, including an algorithm implementation, convergence analysis, cluster quality metrics (silhouette score, inertia), the elbow method for selecting the number of clusters, and a comparison of initialization strategies. We demonstrate practical considerations for centroid initialization.
\end{abstract}

\section{Introduction}
K-means clustering partitions $n$ observations into $K$ clusters by minimizing the within-cluster variance:
\begin{equation}
J = \sum_{k=1}^{K} \sum_{i \in C_k} \|x_i - \mu_k\|^2
\end{equation}
where $C_k$ is the set of points assigned to cluster $k$ and $\mu_k$ is its centroid.

The algorithm alternates between two steps, formalized below:
\begin{enumerate}
\item \textbf{Assignment}: Assign each point to its nearest centroid
\item \textbf{Update}: Recompute each centroid as the mean of its cluster
\end{enumerate}
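In symbols, one iteration performs
\begin{align*}
c_i &\leftarrow \arg\min_{k} \|x_i - \mu_k\|^2, \\
\mu_k &\leftarrow \frac{1}{|C_k|} \sum_{i \in C_k} x_i,
\end{align*}
where $c_i$ denotes the cluster label of point $x_i$.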

\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    """Save the current figure and emit a LaTeX figure environment."""
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}
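Since the document uses PythonTeX, it is typically compiled in three passes: \texttt{pdflatex} on the file, then \texttt{pythontex} on the same file, then \texttt{pdflatex} again so that the generated figures and \verb|\py| values are picked up.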

\section{Data Generation}
\begin{pycode}
# Generate clustered data: a minimal stand-in for scikit-learn's make_blobs
def make_blobs(n_samples, centers, cluster_std=1.0, random_state=42):
    np.random.seed(random_state)
    n_centers = len(centers)
    n_per_cluster = n_samples // n_centers
    X = []
    y = []
    for i, center in enumerate(centers):
        X.append(np.random.randn(n_per_cluster, 2) * cluster_std + center)
        y.extend([i] * n_per_cluster)
    return np.vstack(X), np.array(y)

# Create dataset with 4 clusters
true_centers = [[-4, -4], [-4, 4], [4, -4], [4, 4]]
X, y_true = make_blobs(400, true_centers, cluster_std=1.2)

n_samples = len(X)
n_features = X.shape[1]
true_k = len(true_centers)

# Plot original data
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis',
                     alpha=0.7, edgecolors='black', s=40)
for center in true_centers:
    ax.plot(center[0], center[1], 'r*', markersize=15)
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Generated Data with True Clusters')
ax.grid(True, alpha=0.3)
plt.colorbar(scatter, label='True Cluster')
save_plot('km_data.pdf', 'Synthetic dataset with 4 well-separated clusters.')
\end{pycode}

Dataset: $n = \py{n_samples}$ samples, $p = \py{n_features}$ features, $K_{\mathrm{true}} = \py{true_k}$ clusters.
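Each cluster is sampled from an isotropic Gaussian around its center,
\begin{equation*}
x \sim \mathcal{N}(\mu_k, \sigma^2 I_2), \qquad \sigma = 1.2,
\end{equation*}
so the four groups are well separated relative to their spread.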

\section{K-Means Implementation}
\begin{pycode}
class KMeans:
    def __init__(self, n_clusters=3, max_iter=300, tol=1e-4, init='kmeans++'):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.init = init
        self.centroids = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0
        self.history = []

    def _init_centroids(self, X):
        n_samples = X.shape[0]
        if self.init == 'random':
            idx = np.random.choice(n_samples, self.n_clusters, replace=False)
            return X[idx].copy()
        elif self.init == 'kmeans++':
            # D^2 seeding: each new centroid is drawn with probability
            # proportional to its squared distance from the nearest
            # centroid chosen so far
            centroids = [X[np.random.randint(n_samples)]]
            for _ in range(1, self.n_clusters):
                distances = np.min(cdist(X, centroids), axis=1)
                probs = distances**2 / np.sum(distances**2)
                idx = np.random.choice(n_samples, p=probs)
                centroids.append(X[idx])
            return np.array(centroids)
        else:
            raise ValueError(f'Unknown init method: {self.init}')

    def fit(self, X):
        self.centroids = self._init_centroids(X)
        self.history = [self.centroids.copy()]

        for i in range(self.max_iter):
            # Assignment step
            distances = cdist(X, self.centroids)
            self.labels_ = np.argmin(distances, axis=1)

            # Update step (an empty cluster keeps its previous centroid)
            new_centroids = np.array([X[self.labels_ == k].mean(axis=0)
                                      if np.sum(self.labels_ == k) > 0
                                      else self.centroids[k]
                                      for k in range(self.n_clusters)])

            # Check convergence via total centroid shift
            shift = np.linalg.norm(new_centroids - self.centroids)
            self.centroids = new_centroids
            self.history.append(self.centroids.copy())
            self.n_iter_ = i + 1

            if shift < self.tol:
                break

        # Compute inertia (within-cluster sum of squared distances)
        distances = cdist(X, self.centroids)
        self.inertia_ = np.sum(np.min(distances, axis=1)**2)
        return self

    def predict(self, X):
        distances = cdist(X, self.centroids)
        return np.argmin(distances, axis=1)

# Run K-means
kmeans = KMeans(n_clusters=4, init='kmeans++')
kmeans.fit(X)
labels = kmeans.labels_
\end{pycode}
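As a quick sanity check, the fitted model can label unseen points; this minimal sketch probes two hypothetical points placed at opposite true centers:

\begin{pycode}
# Sanity check (illustrative only): two hypothetical probe points near
# opposite true centers should land in different clusters.
test_points = np.array([[-4.0, -4.0], [4.0, 4.0]])
test_labels = kmeans.predict(test_points)
\end{pycode}

The probes at $(-4,-4)$ and $(4,4)$ are assigned to clusters \py{test_labels[0]} and \py{test_labels[1]}, respectively.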

\section{Algorithm Convergence}
\begin{pycode}
# Visualize convergence; clamp indices so a short run cannot index
# past the end of the centroid history
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
iterations_to_show = [min(i, len(kmeans.history) - 1)
                      for i in [0, 1, 2, 3, 5, 10]]

for ax, it in zip(axes.flat, iterations_to_show):
    # Recompute labels for the centroids at this iteration
    centroids = kmeans.history[it]
    distances = cdist(X, centroids)
    labels_it = np.argmin(distances, axis=1)

    ax.scatter(X[:, 0], X[:, 1], c=labels_it, cmap='viridis',
               alpha=0.5, s=20)
    ax.scatter(centroids[:, 0], centroids[:, 1], c='red',
               marker='X', s=200, edgecolors='black', linewidths=2)
    ax.set_title(f'Iteration {it}')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_convergence.pdf', 'K-means convergence showing centroid movement over iterations.')
\end{pycode}

The algorithm converged in \py{kmeans.n_iter_} iterations with inertia $J = \py{f"{kmeans.inertia_:.2f}"}$.
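Convergence is guaranteed: the assignment step minimizes $J$ over labels with centroids fixed, and the update step minimizes $J$ over centroids with labels fixed, so
\begin{equation*}
J^{(t+1)} \le J^{(t)}.
\end{equation*}
Since there are only finitely many partitions of the data, the objective cannot decrease forever, and the algorithm terminates at a local (not necessarily global) minimum.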

\section{Elbow Method for Optimal K}
\begin{pycode}
# Compute inertia and silhouette score for a range of K values
k_range = range(1, 11)
inertias = []
silhouettes = []

def silhouette_samples(X, labels):
    """Per-sample silhouette coefficients s(i) = (b - a) / max(a, b)."""
    n = len(X)
    unique_labels = np.unique(labels)
    scores = np.zeros(n)
    if len(unique_labels) < 2:
        return scores
    for i in range(n):
        # a(i): mean distance to the other points in the same cluster
        # (the zero self-distance is excluded, per the standard definition)
        same_cluster = X[labels == labels[i]]
        if len(same_cluster) > 1:
            dists = np.linalg.norm(same_cluster - X[i], axis=1)
            a = np.sum(dists) / (len(same_cluster) - 1)
        else:
            a = 0

        # b(i): smallest mean distance to any other cluster
        b = np.inf
        for k in unique_labels:
            if k != labels[i]:
                other_cluster = X[labels == k]
                if len(other_cluster) > 0:
                    b = min(b, np.mean(np.linalg.norm(other_cluster - X[i], axis=1)))
        if b == np.inf:
            b = 0

        if max(a, b) > 0:
            scores[i] = (b - a) / max(a, b)
    return scores

def silhouette_score(X, labels):
    """Mean silhouette coefficient over all samples."""
    return float(np.mean(silhouette_samples(X, labels)))

for k in k_range:
    km = KMeans(n_clusters=k, init='kmeans++')
    km.fit(X)
    inertias.append(km.inertia_)
    if k > 1:
        silhouettes.append(silhouette_score(X, km.labels_))
    else:
        silhouettes.append(0)  # silhouette is undefined for a single cluster

# Find the elbow as the point of maximum curvature of the inertia curve,
# i.e. the largest second difference
diffs = np.diff(inertias)
diffs2 = np.diff(diffs)
elbow_k = np.argmax(diffs2) + 2  # diffs2[i] corresponds to k = i + 2

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Elbow plot
axes[0].plot(k_range, inertias, 'b-o', linewidth=2, markersize=8)
axes[0].axvline(elbow_k, color='r', linestyle='--', label=f'Elbow at K={elbow_k}')
axes[0].set_xlabel('Number of Clusters (K)')
axes[0].set_ylabel('Inertia')
axes[0].set_title('Elbow Method')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Silhouette plot
axes[1].plot(k_range, silhouettes, 'g-s', linewidth=2, markersize=8)
best_sil_k = k_range[np.argmax(silhouettes)]
axes[1].axvline(best_sil_k, color='r', linestyle='--', label=f'Best at K={best_sil_k}')
axes[1].set_xlabel('Number of Clusters (K)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Analysis')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_elbow.pdf', 'Elbow method and silhouette analysis for optimal K selection.')

best_silhouette = max(silhouettes)
\end{pycode}

The elbow method suggests $K = \py{elbow_k}$; silhouette analysis suggests $K = \py{best_sil_k}$ with a score of \py{f"{best_silhouette:.3f}"}.
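Two standard definitions underlie these plots. The silhouette coefficient of sample $i$ compares its mean intra-cluster distance $a(i)$ with the smallest mean distance $b(i)$ to any other cluster,
\begin{equation*}
s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}} \in [-1, 1],
\end{equation*}
and the elbow is located at the $K$ maximizing the discrete curvature of the inertia curve, $J(K+1) - 2J(K) + J(K-1)$.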

\section{Silhouette Analysis}
\begin{pycode}
# Detailed silhouette plot for K=4, reusing silhouette_samples from above
km_4 = KMeans(n_clusters=4, init='kmeans++')
km_4.fit(X)
labels_4 = km_4.labels_

sil_samples = silhouette_samples(X, labels_4)

fig, ax = plt.subplots(figsize=(8, 6))

y_lower = 10
for k in range(4):
    cluster_sil = sil_samples[labels_4 == k]
    cluster_sil.sort()
    size_cluster = len(cluster_sil)
    y_upper = y_lower + size_cluster

    color = plt.cm.viridis(k / 4)
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_sil,
                     facecolor=color, edgecolor=color, alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size_cluster, str(k))
    y_lower = y_upper + 10

ax.axvline(np.mean(sil_samples), color='red', linestyle='--',
           label=f'Mean: {np.mean(sil_samples):.3f}')
ax.set_xlabel('Silhouette Coefficient')
ax.set_ylabel('Cluster')
ax.set_title('Silhouette Plot for K=4')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('km_silhouette.pdf', 'Silhouette plot showing cluster quality for each sample.')
\end{pycode}

Coefficients near 1 indicate points well inside their cluster, values near 0 indicate points on a boundary between clusters, and negative values suggest likely misassignment.

\section{Initialization Comparison}
\begin{pycode}
# Compare random vs. K-means++ initialization over repeated runs
n_runs = 20
random_inertias = []
kpp_inertias = []

for _ in range(n_runs):
    km_random = KMeans(n_clusters=4, init='random')
    km_random.fit(X)
    random_inertias.append(km_random.inertia_)

    km_kpp = KMeans(n_clusters=4, init='kmeans++')
    km_kpp.fit(X)
    kpp_inertias.append(km_kpp.inertia_)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Box plot
bp = axes[0].boxplot([random_inertias, kpp_inertias],
                     labels=['Random', 'K-Means++'])
axes[0].set_ylabel('Inertia')
axes[0].set_title('Initialization Method Comparison')
axes[0].grid(True, alpha=0.3)

# Histogram
axes[1].hist(random_inertias, bins=10, alpha=0.6, label='Random', color='blue')
axes[1].hist(kpp_inertias, bins=10, alpha=0.6, label='K-Means++', color='green')
axes[1].set_xlabel('Inertia')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution of Final Inertia')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_initialization.pdf', 'Comparison of random vs K-means++ initialization over 20 runs.')

random_mean = np.mean(random_inertias)
kpp_mean = np.mean(kpp_inertias)
\end{pycode}

Mean inertia over \py{n_runs} runs: Random = \py{f"{random_mean:.1f}"}, K-Means++ = \py{f"{kpp_mean:.1f}"}.
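The advantage of K-means++ comes from its $D^2$ seeding rule, implemented in \texttt{\_init\_centroids}: after the first centroid is chosen uniformly at random, each subsequent centroid is drawn with probability proportional to the squared distance $D(x_i)$ from the nearest centroid chosen so far,
\begin{equation*}
\Pr(x_i \text{ chosen}) = \frac{D(x_i)^2}{\sum_{j} D(x_j)^2},
\end{equation*}
which spreads the initial centroids apart and makes poor local minima less likely.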

\section{Cluster Visualization}
\begin{pycode}
# Final clustering result
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Clustering result
scatter = axes[0].scatter(X[:, 0], X[:, 1], c=km_4.labels_, cmap='viridis',
                          alpha=0.7, edgecolors='black', s=40)
axes[0].scatter(km_4.centroids[:, 0], km_4.centroids[:, 1], c='red',
                marker='X', s=200, edgecolors='black', linewidths=2,
                label='Centroids')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].set_title('K-Means Clustering Result (K=4)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Voronoi regions, approximated by classifying a dense grid of points
h = 0.05
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = km_4.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[1].contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
axes[1].contour(xx, yy, Z, colors='black', linewidths=0.5)
axes[1].scatter(X[:, 0], X[:, 1], c=km_4.labels_, cmap='viridis',
                alpha=0.7, edgecolors='black', s=40)
axes[1].scatter(km_4.centroids[:, 0], km_4.centroids[:, 1], c='red',
                marker='X', s=200, edgecolors='black', linewidths=2)
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')
axes[1].set_title('Voronoi Regions')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_result.pdf', 'Final K-means clustering with Voronoi decision boundaries.')
\end{pycode}
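Because each point is assigned to its nearest centroid, the induced regions form a Voronoi tessellation: the boundary between the regions of clusters $k$ and $l$ is the perpendicular bisector
\begin{equation*}
\{\, x : \|x - \mu_k\| = \|x - \mu_l\| \,\},
\end{equation*}
so K-means can only produce convex, linearly separated cluster regions.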

\section{Cluster Statistics}
\begin{pycode}
# Compute cluster statistics
cluster_sizes = [np.sum(km_4.labels_ == k) for k in range(4)]
cluster_stds = [np.mean(np.std(X[km_4.labels_ == k], axis=0)) for k in range(4)]

# Compute mean distances to centroids
cluster_distances = []
for k in range(4):
    points = X[km_4.labels_ == k]
    dists = np.linalg.norm(points - km_4.centroids[k], axis=1)
    cluster_distances.append(np.mean(dists))
\end{pycode}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Cluster Statistics}')
print(r'\begin{tabular}{ccccc}')
print(r'\toprule')
print(r'Cluster & Size & Mean Feature Std.\ Dev. & Mean Dist.\ to Centroid & Centroid \\')
print(r'\midrule')
for k in range(4):
    print(f'{k} & {cluster_sizes[k]} & {cluster_stds[k]:.2f} & {cluster_distances[k]:.2f} & ({km_4.centroids[k, 0]:.2f}, {km_4.centroids[k, 1]:.2f}) \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Results Summary}
\begin{table}[H]
\centering
\caption{K-Means Clustering Summary}
\begin{tabular}{lr}
\toprule
Metric & Value \\
\midrule
Dataset Size & \py{n_samples} \\
True Clusters & \py{true_k} \\
Optimal K (Elbow) & \py{elbow_k} \\
Optimal K (Silhouette) & \py{best_sil_k} \\
Best Silhouette Score & \py{f"{best_silhouette:.3f}"} \\
Final Inertia ($K=4$) & \py{f"{km_4.inertia_:.2f}"} \\
Convergence Iterations ($K=4$) & \py{km_4.n_iter_} \\
\bottomrule
\end{tabular}
\end{table}

\section{Conclusion}
This analysis demonstrated:
\begin{itemize}
\item A K-means implementation with K-means++ initialization
\item Convergence visualization showing centroid updates
\item The elbow method and silhouette analysis for selecting $K$
\item The importance of initialization (K-means++ outperforms random)
\item Voronoi regions showing cluster boundaries
\item Detailed cluster quality metrics
\end{itemize}

The K-means++ initialization achieves a lower mean inertia across runs (\py{f"{kpp_mean:.1f}"} vs.\ \py{f"{random_mean:.1f}"} for random initialization), and the number of clusters selected above is consistent with the true value of \py{true_k}.

\end{document}