% latex-templates/templates/machine-learning/kmeans.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{K-Means Clustering: Algorithm and Analysis}
\author{Machine Learning Foundations}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents a comprehensive study of K-means clustering, including algorithm implementation, convergence analysis, cluster quality metrics (silhouette score, inertia), the elbow method for optimal $K$ selection, and a comparison of initialization strategies (random versus K-means++). We demonstrate practical considerations for initialization and feature scaling.
\end{abstract}

\section{Introduction}
K-means clustering partitions $n$ observations into $K$ clusters by minimizing the within-cluster variance:
\begin{equation}
J = \sum_{k=1}^{K} \sum_{i \in C_k} \|x_i - \mu_k\|^2
\end{equation}
where $C_k$ is the set of points in cluster $k$ and $\mu_k$ is its centroid.

The algorithm alternates between two steps:
\begin{enumerate}
\item \textbf{Assignment}: Assign each point to its nearest centroid
\item \textbf{Update}: Recompute each centroid as the mean of its assigned points
\end{enumerate}

\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Data Generation}
\begin{pycode}
# Generate clustered data
def make_blobs(n_samples, centers, cluster_std=1.0, random_state=42):
    np.random.seed(random_state)
    n_centers = len(centers)
    n_per_cluster = n_samples // n_centers
    X = []
    y = []
    for i, center in enumerate(centers):
        X.append(np.random.randn(n_per_cluster, 2) * cluster_std + center)
        y.extend([i] * n_per_cluster)
    return np.vstack(X), np.array(y)

# Create dataset with 4 clusters
true_centers = [[-4, -4], [-4, 4], [4, -4], [4, 4]]
X, y_true = make_blobs(400, true_centers, cluster_std=1.2)

n_samples = len(X)
n_features = X.shape[1]
true_k = len(true_centers)

# Plot original data
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis',
                     alpha=0.7, edgecolors='black', s=40)
for i, center in enumerate(true_centers):
    ax.plot(center[0], center[1], 'r*', markersize=15)
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Generated Data with True Clusters')
ax.grid(True, alpha=0.3)
plt.colorbar(scatter, label='True Cluster')
save_plot('km_data.pdf', 'Synthetic dataset with 4 well-separated clusters.')
\end{pycode}

Dataset: $n = \py{n_samples}$ samples, $p = \py{n_features}$ features, $K_{\text{true}} = \py{true_k}$ clusters.
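Because K-means relies on Euclidean distances, a feature with a much larger scale than the others would dominate the objective. The synthetic features above share a common scale, so no rescaling is needed here; for real data, the usual preprocessing is z-score standardization. A minimal sketch follows (not executed in this build; the helper name \texttt{standardize} is ours, not part of the pipeline above):

\begin{pyverbatim}
def standardize(X):
    """Z-score each feature so no single scale dominates the distances."""
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma[sigma == 0] = 1.0  # guard against constant features
    return (X - mu) / sigma

# X_scaled = standardize(X)  # then cluster X_scaled instead of X
\end{pyverbatim}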
\section{K-Means Implementation}
\begin{pycode}
class KMeans:
    def __init__(self, n_clusters=3, max_iter=300, tol=1e-4, init='kmeans++'):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.init = init
        self.centroids = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0
        self.history = []

    def _init_centroids(self, X):
        n_samples = X.shape[0]
        if self.init == 'random':
            idx = np.random.choice(n_samples, self.n_clusters, replace=False)
            return X[idx].copy()
        elif self.init == 'kmeans++':
            # K-means++: sample each new centroid with probability
            # proportional to squared distance from existing centroids
            centroids = [X[np.random.randint(n_samples)]]
            for _ in range(1, self.n_clusters):
                distances = np.min(cdist(X, centroids), axis=1)
                probs = distances**2 / np.sum(distances**2)
                idx = np.random.choice(n_samples, p=probs)
                centroids.append(X[idx])
            return np.array(centroids)

    def fit(self, X):
        self.centroids = self._init_centroids(X)
        self.history = [self.centroids.copy()]

        for i in range(self.max_iter):
            # Assignment step
            distances = cdist(X, self.centroids)
            self.labels_ = np.argmin(distances, axis=1)

            # Update step (an empty cluster keeps its previous centroid)
            new_centroids = np.array([X[self.labels_ == k].mean(axis=0)
                                      if np.sum(self.labels_ == k) > 0
                                      else self.centroids[k]
                                      for k in range(self.n_clusters)])

            # Check convergence
            shift = np.linalg.norm(new_centroids - self.centroids)
            self.centroids = new_centroids
            self.history.append(self.centroids.copy())
            self.n_iter_ = i + 1

            if shift < self.tol:
                break

        # Compute inertia
        distances = cdist(X, self.centroids)
        self.inertia_ = np.sum(np.min(distances, axis=1)**2)
        return self

    def predict(self, X):
        distances = cdist(X, self.centroids)
        return np.argmin(distances, axis=1)

# Run K-means
kmeans = KMeans(n_clusters=4, init='kmeans++')
kmeans.fit(X)
labels = kmeans.labels_
\end{pycode}

\section{Algorithm Convergence}
\begin{pycode}
# Visualize convergence; clamp all indices so early convergence
# cannot index past the end of the stored history
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
last = len(kmeans.history) - 1
iterations_to_show = [min(it, last) for it in [0, 1, 2, 3, 5, 10]]

for ax, it in zip(axes.flat, iterations_to_show):
    # Get labels at this iteration
    centroids = kmeans.history[it]
    distances = cdist(X, centroids)
    labels_it = np.argmin(distances, axis=1)

    ax.scatter(X[:, 0], X[:, 1], c=labels_it, cmap='viridis',
               alpha=0.5, s=20)
    ax.scatter(centroids[:, 0], centroids[:, 1], c='red',
               marker='X', s=200, edgecolors='black', linewidths=2)
    ax.set_title(f'Iteration {it}')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_convergence.pdf', 'K-means convergence showing centroid movement over iterations.')
\end{pycode}

The algorithm converged in \py{kmeans.n_iter_} iterations with inertia $J = \py{f"{kmeans.inertia_:.2f}"}$.
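Convergence is guaranteed because both steps are monotone: the assignment step is optimal for the current centroids, and the mean minimizes the summed squared distances for a fixed assignment, so $J$ never increases and the algorithm terminates at a local (not necessarily global) optimum. This can be verified directly from the centroid snapshots that \texttt{fit} records in \texttt{history}; a minimal sketch, not executed in this build (\texttt{inertia\_at} is a helper we introduce):

\begin{pyverbatim}
def inertia_at(centroids, X):
    """Objective J evaluated with a fresh nearest-centroid assignment."""
    return np.sum(np.min(cdist(X, centroids), axis=1) ** 2)

J_path = [inertia_at(c, X) for c in kmeans.history]
# Monotone non-increasing, up to floating-point noise
assert all(J_path[t + 1] <= J_path[t] + 1e-9 for t in range(len(J_path) - 1))
\end{pyverbatim}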
\section{Elbow Method for Optimal K}
\begin{pycode}
# Compute inertia for different K values
k_range = range(1, 11)
inertias = []
silhouettes = []

def silhouette_score(X, labels):
    """Compute the mean silhouette score."""
    n = len(X)
    unique_labels = np.unique(labels)
    if len(unique_labels) < 2:
        return 0

    scores = []
    for i in range(n):
        # a(i): mean distance to the other points in the same cluster
        # (the point itself is excluded from the average)
        same_cluster = X[labels == labels[i]]
        if len(same_cluster) > 1:
            a = (np.sum(np.linalg.norm(same_cluster - X[i], axis=1))
                 / (len(same_cluster) - 1))
        else:
            a = 0

        # b(i): smallest mean distance to any other cluster
        b = np.inf
        for k in unique_labels:
            if k != labels[i]:
                other_cluster = X[labels == k]
                if len(other_cluster) > 0:
                    b = min(b, np.mean(np.linalg.norm(other_cluster - X[i], axis=1)))

        if b == np.inf:
            b = 0

        if max(a, b) > 0:
            scores.append((b - a) / max(a, b))
        else:
            scores.append(0)

    return np.mean(scores)

for k in k_range:
    km = KMeans(n_clusters=k, init='kmeans++')
    km.fit(X)
    inertias.append(km.inertia_)
    if k > 1:
        silhouettes.append(silhouette_score(X, km.labels_))
    else:
        silhouettes.append(0)

# Find the elbow via the discrete second derivative of the inertia curve;
# diffs2[i] is centered at k = i + 2 because each np.diff shifts the index
diffs = np.diff(inertias)
diffs2 = np.diff(diffs)
elbow_k = np.argmax(diffs2) + 2

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Elbow plot
axes[0].plot(k_range, inertias, 'b-o', linewidth=2, markersize=8)
axes[0].axvline(elbow_k, color='r', linestyle='--', label=f'Elbow at K={elbow_k}')
axes[0].set_xlabel('Number of Clusters (K)')
axes[0].set_ylabel('Inertia')
axes[0].set_title('Elbow Method')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Silhouette plot
axes[1].plot(k_range, silhouettes, 'g-s', linewidth=2, markersize=8)
best_sil_k = k_range[np.argmax(silhouettes)]
axes[1].axvline(best_sil_k, color='r', linestyle='--', label=f'Best at K={best_sil_k}')
axes[1].set_xlabel('Number of Clusters (K)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Analysis')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_elbow.pdf', 'Elbow method and silhouette analysis for optimal K selection.')

best_silhouette = max(silhouettes)
\end{pycode}

The elbow method suggests $K = \py{elbow_k}$; silhouette analysis suggests $K = \py{best_sil_k}$ with a score of \py{f"{best_silhouette:.3f}"}.

\section{Silhouette Analysis}
\begin{pycode}
# Detailed silhouette plot for K=4
km_4 = KMeans(n_clusters=4, init='kmeans++')
km_4.fit(X)
labels_4 = km_4.labels_

# Compute per-sample silhouette coefficients
n = len(X)
sil_samples = []
for i in range(n):
    same_cluster = X[labels_4 == labels_4[i]]
    if len(same_cluster) > 1:
        # Exclude the point itself from the intra-cluster average
        a = (np.sum(np.linalg.norm(same_cluster - X[i], axis=1))
             / (len(same_cluster) - 1))
    else:
        a = 0

    b = np.inf
    for k in range(4):
        if k != labels_4[i]:
            other_cluster = X[labels_4 == k]
            if len(other_cluster) > 0:
                b = min(b, np.mean(np.linalg.norm(other_cluster - X[i], axis=1)))
    if b == np.inf:
        b = 0

    if max(a, b) > 0:
        sil_samples.append((b - a) / max(a, b))
    else:
        sil_samples.append(0)

sil_samples = np.array(sil_samples)

fig, ax = plt.subplots(figsize=(8, 6))

y_lower = 10
for k in range(4):
    cluster_sil = sil_samples[labels_4 == k]
    cluster_sil.sort()
    size_cluster = len(cluster_sil)
    y_upper = y_lower + size_cluster

    color = plt.cm.viridis(k / 4)
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_sil,
                     facecolor=color, edgecolor=color, alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size_cluster, str(k))
    y_lower = y_upper + 10

ax.axvline(np.mean(sil_samples), color='red', linestyle='--',
           label=f'Mean: {np.mean(sil_samples):.3f}')
ax.set_xlabel('Silhouette Coefficient')
ax.set_ylabel('Cluster')
ax.set_title('Silhouette Plot for K=4')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('km_silhouette.pdf', 'Silhouette plot showing cluster quality for each sample.')
\end{pycode}
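The per-sample loop above costs $O(n^2)$ distance evaluations, which is fine at $n = 400$ but slow for large datasets. If scikit-learn happens to be available (it is not a dependency of this template), its implementation makes a convenient cross-check for the hand-rolled score; a sketch, not executed in this build:

\begin{pyverbatim}
from sklearn.metrics import silhouette_score as sk_silhouette

# Both should agree up to floating-point noise, since the same
# (Euclidean, self-excluding) definition is used.
print('ours   :', silhouette_score(X, labels_4))
print('sklearn:', sk_silhouette(X, labels_4))
\end{pyverbatim}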
\section{Initialization Comparison}
\begin{pycode}
# Compare random vs. K-means++ initialization
n_runs = 20
random_inertias = []
kpp_inertias = []

for _ in range(n_runs):
    km_random = KMeans(n_clusters=4, init='random')
    km_random.fit(X)
    random_inertias.append(km_random.inertia_)

    km_kpp = KMeans(n_clusters=4, init='kmeans++')
    km_kpp.fit(X)
    kpp_inertias.append(km_kpp.inertia_)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Box plot
bp = axes[0].boxplot([random_inertias, kpp_inertias],
                     labels=['Random', 'K-Means++'])
axes[0].set_ylabel('Inertia')
axes[0].set_title('Initialization Method Comparison')
axes[0].grid(True, alpha=0.3)

# Histogram
axes[1].hist(random_inertias, bins=10, alpha=0.6, label='Random', color='blue')
axes[1].hist(kpp_inertias, bins=10, alpha=0.6, label='K-Means++', color='green')
axes[1].set_xlabel('Inertia')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution of Final Inertia')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_initialization.pdf', 'Comparison of random vs. K-means++ initialization over 20 runs.')

random_mean = np.mean(random_inertias)
kpp_mean = np.mean(kpp_inertias)
\end{pycode}

Mean inertia: Random = \py{f"{random_mean:.1f}"}, K-Means++ = \py{f"{kpp_mean:.1f}"}.

\section{Cluster Visualization}
\begin{pycode}
# Final clustering result
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Clustering result
scatter = axes[0].scatter(X[:, 0], X[:, 1], c=km_4.labels_, cmap='viridis',
                          alpha=0.7, edgecolors='black', s=40)
axes[0].scatter(km_4.centroids[:, 0], km_4.centroids[:, 1], c='red',
                marker='X', s=200, edgecolors='black', linewidths=2,
                label='Centroids')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].set_title('K-Means Clustering Result (K=4)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Voronoi regions: classify a dense grid of points by nearest centroid
h = 0.05
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = km_4.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[1].contourf(xx, yy, Z, alpha=0.3, cmap='viridis')
axes[1].contour(xx, yy, Z, colors='black', linewidths=0.5)
axes[1].scatter(X[:, 0], X[:, 1], c=km_4.labels_, cmap='viridis',
                alpha=0.7, edgecolors='black', s=40)
axes[1].scatter(km_4.centroids[:, 0], km_4.centroids[:, 1], c='red',
                marker='X', s=200, edgecolors='black', linewidths=2)
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')
axes[1].set_title('Voronoi Regions')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('km_result.pdf', 'Final K-means clustering with Voronoi decision boundaries.')
\end{pycode}

\section{Cluster Statistics}
\begin{pycode}
# Compute cluster statistics
cluster_sizes = [np.sum(km_4.labels_ == k) for k in range(4)]
cluster_stds = [np.mean(np.std(X[km_4.labels_ == k], axis=0)) for k in range(4)]

# Compute mean distances to centroids
cluster_distances = []
for k in range(4):
    points = X[km_4.labels_ == k]
    dists = np.linalg.norm(points - km_4.centroids[k], axis=1)
    cluster_distances.append(np.mean(dists))
\end{pycode}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Cluster Statistics}')
print(r'\begin{tabular}{ccccc}')
print(r'\toprule')
print(r'Cluster & Size & Mean Feature Std.\ Dev. & Mean Dist.\ to Centroid & Centroid \\')
print(r'\midrule')
for k in range(4):
    print(f'{k} & {cluster_sizes[k]} & {cluster_stds[k]:.2f} & {cluster_distances[k]:.2f} & ({km_4.centroids[k, 0]:.2f}, {km_4.centroids[k, 1]:.2f}) \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}
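The initialization experiment above shows that even K-means++ has run-to-run variance, since Lloyd's algorithm only finds a local optimum. The standard remedy is to run several initializations and keep the fit with the lowest inertia, which is roughly what scikit-learn's \texttt{n\_init} parameter does. A sketch on top of the \texttt{KMeans} class above (not executed in this build; \texttt{best\_of} is our name):

\begin{pyverbatim}
def best_of(X, n_clusters=4, n_init=10):
    """Keep the lowest-inertia fit over n_init independent runs."""
    fits = [KMeans(n_clusters=n_clusters, init='kmeans++').fit(X)
            for _ in range(n_init)]
    return min(fits, key=lambda km: km.inertia_)

# km_best = best_of(X)
\end{pyverbatim}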
\section{Results Summary}
\begin{table}[H]
\centering
\caption{K-Means Clustering Summary}
\begin{tabular}{lr}
\toprule
Metric & Value \\
\midrule
Dataset Size & \py{n_samples} \\
True Clusters & \py{true_k} \\
Optimal K (Elbow) & \py{elbow_k} \\
Optimal K (Silhouette) & \py{best_sil_k} \\
Best Silhouette Score & \py{f"{best_silhouette:.3f}"} \\
Final Inertia & \py{f"{km_4.inertia_:.2f}"} \\
Convergence Iterations & \py{kmeans.n_iter_} \\
\bottomrule
\end{tabular}
\end{table}

\section{Conclusion}
This analysis demonstrated:
\begin{itemize}
\item A K-means implementation with K-means++ initialization
\item Convergence visualization showing centroid updates
\item The elbow method and silhouette analysis for optimal K selection
\item The importance of initialization (K-means++ outperforms random)
\item Voronoi regions showing cluster boundaries
\item Detailed cluster quality metrics
\end{itemize}

On average, K-means++ initialization achieves lower inertia than random initialization (\py{f"{kpp_mean:.1f}"} vs.\ \py{f"{random_mean:.1f}"}), and on this dataset the number of clusters selected by both criteria matches the true value of \py{true_k}.

\end{document}