% latex-templates/templates/machine-learning/kmeans.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{K-Means Clustering: Algorithm and Analysis}
\author{Machine Learning Foundations}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents a comprehensive study of K-means clustering, including algorithm implementation, convergence analysis, cluster quality metrics (silhouette score, inertia), the elbow method for optimal $K$ selection, and a comparison of initialization strategies (random versus K-means++). We demonstrate practical considerations for initialization and feature scaling.
\end{abstract}

\section{Introduction}
K-means clustering partitions $n$ observations into $K$ clusters by minimizing the within-cluster variance:
\begin{equation}
J = \sum_{k=1}^{K} \sum_{i \in C_k} \|x_i - \mu_k\|^2
\end{equation}
where $C_k$ is the set of points in cluster $k$ and $\mu_k$ is its centroid.

The algorithm alternates between two steps:
\begin{enumerate}
\item \textbf{Assignment}: Assign each point to its nearest centroid
\item \textbf{Update}: Recompute each centroid as the mean of its assigned points
\end{enumerate}

\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Data Generation}
\begin{pycode}
# Generate clustered data
def make_blobs(n_samples, centers, cluster_std=1.0, random_state=42):
    np.random.seed(random_state)
    n_centers = len(centers)
    n_per_cluster = n_samples // n_centers
    X = []
    y = []
    for i, center in enumerate(centers):
        X.append(np.random.randn(n_per_cluster, 2) * cluster_std + center)
        y.extend([i] * n_per_cluster)
    return np.vstack(X), np.array(y)

# Create dataset with 4 clusters
true_centers = [[-4, -4], [-4, 4], [4, -4], [4, 4]]
X, y_true = make_blobs(400, true_centers, cluster_std=1.2)

n_samples = len(X)
n_features = X.shape[1]
true_k = len(true_centers)

# Plot original data
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis',
                     alpha=0.7, edgecolors='black', s=40)
for i, center in enumerate(true_centers):
    ax.plot(center[0], center[1], 'r*', markersize=15)
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Generated Data with True Clusters')
ax.grid(True, alpha=0.3)
plt.colorbar(scatter, label='True Cluster')
save_plot('km_data.pdf', 'Synthetic dataset with 4 well-separated clusters.')
\end{pycode}

Dataset: $n = \py{n_samples}$ samples, $p = \py{n_features}$ features, $K_{\text{true}} = \py{true_k}$ clusters.
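Because K-means relies on Euclidean distances, a feature with a much larger scale than the others would dominate the objective. The synthetic features above share a common scale, so no rescaling is needed here; for real data, the usual preprocessing is z-score standardization. A minimal sketch follows (not executed in this build; the helper name \texttt{standardize} is ours, not part of the pipeline above):

\begin{pyverbatim}
def standardize(X):
    """Z-score each feature so no single scale dominates the distances."""
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma[sigma == 0] = 1.0  # guard against constant features
    return (X - mu) / sigma

# X_scaled = standardize(X)  # then cluster X_scaled instead of X
\end{pyverbatim}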
\section{K-Means Implementation}
\begin{pycode}
class KMeans:
    def __init__(self, n_clusters=3, max_iter=300, tol=1e-4, init='kmeans++'):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.init = init
        self.centroids = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0
        self.history = []

    def _init_centroids(self, X):
        n_samples = X.shape[0]
        if self.init == 'random':
            idx = np.random.choice(n_samples, self.n_clusters, replace=False)
            return X[idx].copy()
        elif self.init == 'kmeans++':
            # K-means++: sample each new centroid with probability
            # proportional to squared distance from existing centroids
            centroids = [X[np.random.randint(n_samples)]]
            for _ in range(1, self.n_clusters):
                distances = np.min(cdist(X, centroids), axis=1)
                probs = distances**2 / np.sum(distances**2)
                idx = np.random.choice(n_samples, p=probs)
                centroids.append(X[idx])
            return np.array(centroids)

    def fit(self, X):
        self.centroids = self._init_centroids(X)
        self.history = [self.centroids.copy()]

        for i in range(self.max_iter):
            # Assignment step
            distances = cdist(X, self.centroids)
            self.labels_ = np.argmin(distances, axis=1)

            # Update step (an empty cluster keeps its previous centroid)
            new_centroids = np.array([X[self.labels_ == k].mean(axis=0)
                                      if np.sum(self.labels_ == k) > 0
                                      else self.centroids[k]
                                      for k in range(self.n_clusters)])

            # Check convergence
            shift = np.linalg.norm(new_centroids - self.centroids)
            self.centroids = new_centroids
            self.history.append(self.centroids.copy())
            self.n_iter_ = i + 1

            if shift < self.tol:
                break

        # Compute inertia
        distances = cdist(X, self.centroids)
        self.inertia_ = np.sum(np.min(distances, axis=1)**2)
        return self

    def predict(self, X):
        distances = cdist(X, self.centroids)
        return np.argmin(distances, axis=1)

# Run K-means
kmeans = KMeans(n_clusters=4, init='kmeans++')
kmeans.fit(X)
labels = kmeans.labels_
\end{pycode}

\section{Algorithm Convergence}
\begin{pycode}
# Visualize convergence; clamp all indices so early convergence
# cannot index past the end of the stored history
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
last = len(kmeans.history) - 1
iterations_to_show = [min(it, last) for it in [0, 1, 2, 3, 5, 10]]

for ax, it in zip(axes.flat, iterations_to_show):
    # Get labels at this iteration
    centroids = kmeans.history[it]
    distances = cdist(X, centroids)
    labels_it = np.argmin(distances, axis=1)

    ax.scatter(X[:, 0], X[:, 1], c=labels_it, cmap='viridis',
               alpha=0.5, s=20)
    ax.scatter(centroids[:, 0], centroids[:, 1], c='red',
               marker='X', s=200, edgecolors='black', linewidths=2)
    ax.set_title(f'Iteration {it}')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_convergence.pdf', 'K-means convergence showing centroid movement over iterations.')
\end{pycode}

The algorithm converged in \py{kmeans.n_iter_} iterations with inertia $J = \py{f"{kmeans.inertia_:.2f}"}$.
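Convergence is guaranteed because both steps are monotone: the assignment step is optimal for the current centroids, and the mean minimizes the summed squared distances for a fixed assignment, so $J$ never increases and the algorithm terminates at a local (not necessarily global) optimum. This can be verified directly from the centroid snapshots that \texttt{fit} records in \texttt{history}; a minimal sketch, not executed in this build (\texttt{inertia\_at} is a helper we introduce):

\begin{pyverbatim}
def inertia_at(centroids, X):
    """Objective J evaluated with a fresh nearest-centroid assignment."""
    return np.sum(np.min(cdist(X, centroids), axis=1) ** 2)

J_path = [inertia_at(c, X) for c in kmeans.history]
# Monotone non-increasing, up to floating-point noise
assert all(J_path[t + 1] <= J_path[t] + 1e-9 for t in range(len(J_path) - 1))
\end{pyverbatim}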
\section{Elbow Method for Optimal K}
\begin{pycode}
# Compute inertia for different K values
k_range = range(1, 11)
inertias = []
silhouettes = []

def silhouette_score(X, labels):
    """Compute the mean silhouette score."""
    n = len(X)
    unique_labels = np.unique(labels)
    if len(unique_labels) < 2:
        return 0

    scores = []
    for i in range(n):
        # a(i): mean distance to the other points in the same cluster
        # (the point itself is excluded from the average)
        same_cluster = X[labels == labels[i]]
        if len(same_cluster) > 1:
            a = (np.sum(np.linalg.norm(same_cluster - X[i], axis=1))
                 / (len(same_cluster) - 1))
        else:
            a = 0

        # b(i): smallest mean distance to any other cluster
        b = np.inf
        for k in unique_labels:
            if k != labels[i]:
                other_cluster = X[labels == k]
                if len(other_cluster) > 0:
                    b = min(b, np.mean(np.linalg.norm(other_cluster - X[i], axis=1)))

        if b == np.inf:
            b = 0

        if max(a, b) > 0:
            scores.append((b - a) / max(a, b))
        else:
            scores.append(0)

    return np.mean(scores)

for k in k_range:
    km = KMeans(n_clusters=k, init='kmeans++')
    km.fit(X)
    inertias.append(km.inertia_)
    if k > 1:
        silhouettes.append(silhouette_score(X, km.labels_))
    else:
        silhouettes.append(0)

# Find the elbow via the discrete second derivative of the inertia curve;
# diffs2[i] is centered at k = i + 2 because each np.diff shifts the index
diffs = np.diff(inertias)
diffs2 = np.diff(diffs)
elbow_k = np.argmax(diffs2) + 2

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Elbow plot
axes[0].plot(k_range, inertias, 'b-o', linewidth=2, markersize=8)
axes[0].axvline(elbow_k, color='r', linestyle='--', label=f'Elbow at K={elbow_k}')
axes[0].set_xlabel('Number of Clusters (K)')
axes[0].set_ylabel('Inertia')
axes[0].set_title('Elbow Method')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Silhouette plot
axes[1].plot(k_range, silhouettes, 'g-s', linewidth=2, markersize=8)
best_sil_k = k_range[np.argmax(silhouettes)]
axes[1].axvline(best_sil_k, color='r', linestyle='--', label=f'Best at K={best_sil_k}')
axes[1].set_xlabel('Number of Clusters (K)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Analysis')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_elbow.pdf', 'Elbow method and silhouette analysis for optimal K selection.')

best_silhouette = max(silhouettes)
\end{pycode}

The elbow method suggests $K = \py{elbow_k}$; silhouette analysis suggests $K = \py{best_sil_k}$ with a score of \py{f"{best_silhouette:.3f}"}.

\section{Silhouette Analysis}
\begin{pycode}
# Detailed silhouette plot for K=4
km_4 = KMeans(n_clusters=4, init='kmeans++')
km_4.fit(X)
labels_4 = km_4.labels_

# Compute per-sample silhouette coefficients
n = len(X)
sil_samples = []
for i in range(n):
    same_cluster = X[labels_4 == labels_4[i]]
    if len(same_cluster) > 1:
        # Exclude the point itself from the intra-cluster average
        a = (np.sum(np.linalg.norm(same_cluster - X[i], axis=1))
             / (len(same_cluster) - 1))
    else:
        a = 0

    b = np.inf
    for k in range(4):
        if k != labels_4[i]:
            other_cluster = X[labels_4 == k]
            if len(other_cluster) > 0:
                b = min(b, np.mean(np.linalg.norm(other_cluster - X[i], axis=1)))
    if b == np.inf:
        b = 0

    if max(a, b) > 0:
        sil_samples.append((b - a) / max(a, b))
    else:
        sil_samples.append(0)

sil_samples = np.array(sil_samples)

fig, ax = plt.subplots(figsize=(8, 6))

y_lower = 10
for k in range(4):
    cluster_sil = sil_samples[labels_4 == k]
    cluster_sil.sort()
    size_cluster = len(cluster_sil)
    y_upper = y_lower + size_cluster

    color = plt.cm.viridis(k / 4)
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_sil,
                     facecolor=color, edgecolor=color, alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size_cluster, str(k))
    y_lower = y_upper + 10

ax.axvline(np.mean(sil_samples), color='red', linestyle='--',
           label=f'Mean: {np.mean(sil_samples):.3f}')
ax.set_xlabel('Silhouette Coefficient')
ax.set_ylabel('Cluster')
ax.set_title('Silhouette Plot for K=4')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('km_silhouette.pdf', 'Silhouette plot showing cluster quality for each sample.')
\end{pycode}
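The per-sample loop above costs $O(n^2)$ distance evaluations, which is fine at $n = 400$ but slow for large datasets. If scikit-learn happens to be available (it is not a dependency of this template), its implementation makes a convenient cross-check for the hand-rolled score; a sketch, not executed in this build:

\begin{pyverbatim}
from sklearn.metrics import silhouette_score as sk_silhouette

# Both should agree up to floating-point noise, since the same
# (Euclidean, self-excluding) definition is used.
print('ours   :', silhouette_score(X, labels_4))
print('sklearn:', sk_silhouette(X, labels_4))
\end{pyverbatim}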
\section{Initialization Comparison}
\begin{pycode}
# Compare random vs. K-means++ initialization
n_runs = 20
random_inertias = []
kpp_inertias = []

for _ in range(n_runs):
    km_random = KMeans(n_clusters=4, init='random')
    km_random.fit(X)
    random_inertias.append(km_random.inertia_)

    km_kpp = KMeans(n_clusters=4, init='kmeans++')
    km_kpp.fit(X)
    kpp_inertias.append(km_kpp.inertia_)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Box plot
bp = axes[0].boxplot([random_inertias, kpp_inertias],
                     labels=['Random', 'K-Means++'])
axes[0].set_ylabel('Inertia')
axes[0].set_title('Initialization Method Comparison')
axes[0].grid(True, alpha=0.3)

# Histogram
axes[1].hist(random_inertias, bins=10, alpha=0.6, label='Random', color='blue')
axes[1].hist(kpp_inertias, bins=10, alpha=0.6, label='K-Means++', color='green')
axes[1].set_xlabel('Inertia')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution of Final Inertia')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_initialization.pdf', 'Comparison of random vs. K-means++ initialization over 20 runs.')

random_mean = np.mean(random_inertias)
kpp_mean = np.mean(kpp_inertias)
\end{pycode}

Mean inertia: Random = \py{f"{random_mean:.1f}"}, K-Means++ = \py{f"{kpp_mean:.1f}"}.

\section{Cluster Visualization}
\begin{pycode}
# Final clustering result
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Clustering result
scatter = axes[0].scatter(X[:, 0], X[:, 1], c=km_4.labels_, cmap='viridis',
                          alpha=0.7, edgecolors='black', s=40)
axes[0].scatter(km_4.centroids[:, 0], km_4.centroids[:, 1], c='red',
                marker='X', s=200, edgecolors='black', linewidths=2,
                label='Centroids')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].set_title('K-Means Clustering Result (K=4)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Voronoi regions: classify a dense grid of points by nearest centroid
h = 0.05
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = km_4.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[1].contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
axes[1].contour(xx, yy, Z, colors='black', linewidths=0.5)
axes[1].scatter(X[:, 0], X[:, 1], c=km_4.labels_, cmap='viridis',
                alpha=0.7, edgecolors='black', s=40)
axes[1].scatter(km_4.centroids[:, 0], km_4.centroids[:, 1], c='red',
                marker='X', s=200, edgecolors='black', linewidths=2)
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')
axes[1].set_title('Voronoi Regions')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_result.pdf', 'Final K-means clustering with Voronoi decision boundaries.')
\end{pycode}

\section{Cluster Statistics}
\begin{pycode}
# Compute cluster statistics
cluster_sizes = [np.sum(km_4.labels_ == k) for k in range(4)]
cluster_stds = [np.mean(np.std(X[km_4.labels_ == k], axis=0)) for k in range(4)]

# Compute mean distances to centroids
cluster_distances = []
for k in range(4):
    points = X[km_4.labels_ == k]
    dists = np.linalg.norm(points - km_4.centroids[k], axis=1)
    cluster_distances.append(np.mean(dists))
\end{pycode}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Cluster Statistics}')
print(r'\begin{tabular}{ccccc}')
print(r'\toprule')
print(r'Cluster & Size & Mean Feature Std.\ Dev. & Mean Dist.\ to Centroid & Centroid \\')
print(r'\midrule')
for k in range(4):
    print(f'{k} & {cluster_sizes[k]} & {cluster_stds[k]:.2f} & {cluster_distances[k]:.2f} & ({km_4.centroids[k, 0]:.2f}, {km_4.centroids[k, 1]:.2f}) \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}
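The initialization experiment above shows that even K-means++ has run-to-run variance, since Lloyd's algorithm only finds a local optimum. The standard remedy is to run several initializations and keep the fit with the lowest inertia, which is roughly what scikit-learn's \texttt{n\_init} parameter does. A sketch on top of the \texttt{KMeans} class above (not executed in this build; \texttt{best\_of} is our name):

\begin{pyverbatim}
def best_of(X, n_clusters=4, n_init=10):
    """Keep the lowest-inertia fit over n_init independent runs."""
    fits = [KMeans(n_clusters=n_clusters, init='kmeans++').fit(X)
            for _ in range(n_init)]
    return min(fits, key=lambda km: km.inertia_)

# km_best = best_of(X)
\end{pyverbatim}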
\section{Results Summary}
\begin{table}[H]
\centering
\caption{K-Means Clustering Summary}
\begin{tabular}{lr}
\toprule
Metric & Value \\
\midrule
Dataset Size & \py{n_samples} \\
True Clusters & \py{true_k} \\
Optimal K (Elbow) & \py{elbow_k} \\
Optimal K (Silhouette) & \py{best_sil_k} \\
Best Silhouette Score & \py{f"{best_silhouette:.3f}"} \\
Final Inertia & \py{f"{km_4.inertia_:.2f}"} \\
Convergence Iterations & \py{kmeans.n_iter_} \\
\bottomrule
\end{tabular}
\end{table}

\section{Conclusion}
This analysis demonstrated:
\begin{itemize}
\item A K-means implementation with K-means++ initialization
\item Convergence visualization showing centroid updates
\item The elbow method and silhouette analysis for optimal K selection
\item The importance of initialization (K-means++ outperforms random)
\item Voronoi regions showing cluster boundaries
\item Detailed cluster quality metrics
\end{itemize}

On average, K-means++ initialization achieves lower inertia than random initialization (\py{f"{kpp_mean:.1f}"} vs.\ \py{f"{random_mean:.1f}"}), and on this dataset the number of clusters selected by both criteria matches the true value of \py{true_k}.

\end{document}