Path: blob/main/latex-templates/templates/machine-learning/svm.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{Support Vector Machines: Kernels and Classification}
\author{Machine Learning Foundations}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents an analysis of Support Vector Machines (SVMs) covering hard and soft margin classification, the kernel trick for nonlinear decision boundaries, hyperparameter tuning ($C$ and $\gamma$), and kernel comparison. We visualize decision boundaries, support vectors, and margin regions.
\end{abstract}

\section{Introduction}
Support Vector Machines find the maximum-margin hyperplane separating two classes:
\begin{equation}
\min_{w,b} \frac{1}{2}\|w\|^2 \quad \text{subject to} \quad y_i(w \cdot x_i + b) \geq 1
\end{equation}

For the soft margin SVM with slack variables $\xi_i$:
\begin{equation}
\min_{w,b,\xi} \frac{1}{2}\|w\|^2 + C\sum_{i=1}^{n}\xi_i \quad \text{subject to} \quad y_i(w \cdot x_i + b) \geq 1 - \xi_i, \quad \xi_i \geq 0
\end{equation}

The kernel trick replaces dot products with kernel evaluations, $K(x_i, x_j) = \phi(x_i) \cdot \phi(x_j)$, so that a linear separator in the feature space induced by $\phi$ corresponds to a nonlinear boundary in the input space.

Common kernels:
\begin{itemize}
\item Linear: $K(x, x') = x \cdot x'$
\item RBF: $K(x, x') = \exp(-\gamma \|x - x'\|^2)$
\item Polynomial: $K(x, x') = (x \cdot x' + c)^d$
\end{itemize}
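
Because the optimal $w$ is a linear combination of training points, training can be expressed entirely in terms of inner products. For reference, the soft-margin dual problem, which the simplified SMO-style solver later in this document approximates, is
\begin{equation}
\max_{\alpha} \sum_{i=1}^{n}\alpha_i - \frac{1}{2}\sum_{i=1}^{n}\sum_{j=1}^{n}\alpha_i \alpha_j y_i y_j K(x_i, x_j)
\quad \text{subject to} \quad 0 \leq \alpha_i \leq C, \quad \sum_{i=1}^{n}\alpha_i y_i = 0
\end{equation}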

\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Data Generation}
\begin{pycode}
# Generate linearly separable data
def make_linear_data(n_samples, margin=0.5, random_state=42):
    np.random.seed(random_state)
    n_per_class = n_samples // 2
    X = np.vstack([
        np.random.randn(n_per_class, 2) + [2, 2],
        np.random.randn(n_per_class, 2) + [-2, -2]
    ])
    y = np.array([1]*n_per_class + [-1]*n_per_class)
    idx = np.random.permutation(n_samples)
    return X[idx], y[idx]

# Generate nonlinear data (circles)
def make_circles_data(n_samples, noise=0.1, random_state=42):
    np.random.seed(random_state)
    n_per_class = n_samples // 2
    theta = np.linspace(0, 2*np.pi, n_per_class)
    X_outer = np.column_stack([2*np.cos(theta), 2*np.sin(theta)])
    X_inner = np.column_stack([0.5*np.cos(theta), 0.5*np.sin(theta)])
    X = np.vstack([X_outer + noise*np.random.randn(n_per_class, 2),
                   X_inner + noise*np.random.randn(n_per_class, 2)])
    y = np.array([1]*n_per_class + [-1]*n_per_class)
    idx = np.random.permutation(n_samples)
    return X[idx], y[idx]

# Create datasets
X_linear, y_linear = make_linear_data(200)
X_circles, y_circles = make_circles_data(200, noise=0.15)

# Split
X_linear_train, X_linear_test = X_linear[:150], X_linear[150:]
y_linear_train, y_linear_test = y_linear[:150], y_linear[150:]
X_circles_train, X_circles_test = X_circles[:150], X_circles[150:]
y_circles_train, y_circles_test = y_circles[:150], y_circles[150:]

# Plot datasets
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

axes[0].scatter(X_linear[y_linear==1, 0], X_linear[y_linear==1, 1],
                c='blue', alpha=0.6, s=40, label='Class +1')
axes[0].scatter(X_linear[y_linear==-1, 0], X_linear[y_linear==-1, 1],
                c='red', alpha=0.6, s=40, label='Class -1')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].set_title('Linearly Separable Data')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

axes[1].scatter(X_circles[y_circles==1, 0], X_circles[y_circles==1, 1],
                c='blue', alpha=0.6, s=40, label='Class +1')
axes[1].scatter(X_circles[y_circles==-1, 0], X_circles[y_circles==-1, 1],
                c='red', alpha=0.6, s=40, label='Class -1')
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')
axes[1].set_title('Nonlinearly Separable Data (Circles)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('svm_data.pdf', 'Classification datasets: linearly and nonlinearly separable.')
\end{pycode}

\section{SVM Implementation}
\begin{pycode}
class SVM:
    def __init__(self, kernel='linear', C=1.0, gamma=1.0, degree=3):
        self.kernel = kernel
        self.C = C
        self.gamma = gamma
        self.degree = degree
        self.alpha = None
        self.support_vectors = None
        self.support_labels = None
        self.b = 0

    def _kernel_function(self, X1, X2):
        if self.kernel == 'linear':
            return X1 @ X2.T
        elif self.kernel == 'rbf':
            sq_dist = cdist(X1, X2, 'sqeuclidean')
            return np.exp(-self.gamma * sq_dist)
        elif self.kernel == 'poly':
            return (1 + X1 @ X2.T)**self.degree

    def fit(self, X, y):
        n_samples = len(X)
        K = self._kernel_function(X, X)

        # Simple SMO-like algorithm (simplified)
        self.alpha = np.zeros(n_samples)
        for _ in range(100):  # Iterations
            for i in range(n_samples):
                Ei = np.sum(self.alpha * y * K[i]) - y[i]
                if (y[i] * Ei < -0.01 and self.alpha[i] < self.C) or \
                   (y[i] * Ei > 0.01 and self.alpha[i] > 0):
                    j = np.random.choice([k for k in range(n_samples) if k != i])
                    Ej = np.sum(self.alpha * y * K[j]) - y[j]

                    alpha_i_old, alpha_j_old = self.alpha[i], self.alpha[j]

                    if y[i] != y[j]:
                        L = max(0, self.alpha[j] - self.alpha[i])
                        H = min(self.C, self.C + self.alpha[j] - self.alpha[i])
                    else:
                        L = max(0, self.alpha[i] + self.alpha[j] - self.C)
                        H = min(self.C, self.alpha[i] + self.alpha[j])

                    if L == H:
                        continue

                    eta = 2 * K[i, j] - K[i, i] - K[j, j]
                    if eta >= 0:
                        continue

                    self.alpha[j] -= y[j] * (Ei - Ej) / eta
                    self.alpha[j] = np.clip(self.alpha[j], L, H)
                    self.alpha[i] += y[i] * y[j] * (alpha_j_old - self.alpha[j])

        # Extract support vectors
        sv_mask = self.alpha > 1e-5
        self.support_vectors = X[sv_mask]
        self.support_labels = y[sv_mask]
        self.alpha = self.alpha[sv_mask]

        # Compute bias
        K_sv = self._kernel_function(self.support_vectors, self.support_vectors)
        self.b = np.mean(self.support_labels -
                         np.sum(self.alpha * self.support_labels * K_sv, axis=1))

        return self

    def decision_function(self, X):
        K = self._kernel_function(X, self.support_vectors)
        return np.sum(self.alpha * self.support_labels * K, axis=1) + self.b

    def predict(self, X):
        return np.sign(self.decision_function(X))

    def score(self, X, y):
        return np.mean(self.predict(X) == y)
\end{pycode}
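
The \texttt{decision\_function} method evaluates the learned classifier as a kernel expansion over the retained support vectors,
\begin{equation}
f(x) = \sum_{i \in \mathcal{S}} \alpha_i y_i K(x_i, x) + b,
\end{equation}
where $\mathcal{S}$ indexes the support vectors, and \texttt{predict} returns $\operatorname{sign}(f(x))$.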

\section{Linear SVM}
\begin{pycode}
def plot_svm_decision_boundary(model, X, y, ax, title):
    h = 0.05
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Decision boundary and margins
    ax.contourf(xx, yy, Z, levels=[-100, 0, 100], alpha=0.3, colors=['red', 'blue'])
    ax.contour(xx, yy, Z, levels=[-1, 0, 1], colors=['red', 'black', 'blue'],
               linestyles=['--', '-', '--'], linewidths=[1, 2, 1])

    # Data points
    ax.scatter(X[y==1, 0], X[y==1, 1], c='blue', alpha=0.6, s=40)
    ax.scatter(X[y==-1, 0], X[y==-1, 1], c='red', alpha=0.6, s=40)

    # Support vectors
    if model.support_vectors is not None:
        ax.scatter(model.support_vectors[:, 0], model.support_vectors[:, 1],
                   s=150, facecolors='none', edgecolors='black', linewidths=2)

    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

# Train linear SVM
svm_linear = SVM(kernel='linear', C=1.0)
svm_linear.fit(X_linear_train, y_linear_train)

linear_train_acc = svm_linear.score(X_linear_train, y_linear_train)
linear_test_acc = svm_linear.score(X_linear_test, y_linear_test)
n_sv_linear = len(svm_linear.support_vectors)

# Visualize
fig, ax = plt.subplots(figsize=(8, 6))
plot_svm_decision_boundary(svm_linear, X_linear, y_linear, ax,
                           f'Linear SVM (Accuracy: {linear_test_acc:.2f}, SVs: {n_sv_linear})')
save_plot('svm_linear.pdf', 'Linear SVM decision boundary with support vectors highlighted.')
\end{pycode}

Linear SVM: Training accuracy = \py{f"{linear_train_acc:.3f}"}, Test accuracy = \py{f"{linear_test_acc:.3f}"}, Support vectors = \py{n_sv_linear}.

\section{RBF Kernel SVM}
\begin{pycode}
# Train RBF SVM on circles data
svm_rbf = SVM(kernel='rbf', C=10.0, gamma=1.0)
svm_rbf.fit(X_circles_train, y_circles_train)

rbf_train_acc = svm_rbf.score(X_circles_train, y_circles_train)
rbf_test_acc = svm_rbf.score(X_circles_test, y_circles_test)
n_sv_rbf = len(svm_rbf.support_vectors)

fig, ax = plt.subplots(figsize=(8, 6))
plot_svm_decision_boundary(svm_rbf, X_circles, y_circles, ax,
                           f'RBF SVM (Accuracy: {rbf_test_acc:.2f}, SVs: {n_sv_rbf})')
save_plot('svm_rbf.pdf', 'RBF kernel SVM decision boundary for nonlinear data.')
\end{pycode}

RBF SVM: Training accuracy = \py{f"{rbf_train_acc:.3f}"}, Test accuracy = \py{f"{rbf_test_acc:.3f}"}.

\section{Effect of C Parameter}
The regularization parameter $C$ trades margin width against training errors: small $C$ tolerates margin violations and yields a wider margin, while large $C$ penalizes slack heavily and fits the training data more closely.
\begin{pycode}
# Compare different C values
C_values = [0.01, 0.1, 1.0, 10.0]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, C in zip(axes.flat, C_values):
    svm_temp = SVM(kernel='rbf', C=C, gamma=1.0)
    svm_temp.fit(X_circles_train, y_circles_train)
    acc = svm_temp.score(X_circles_test, y_circles_test)
    n_sv = len(svm_temp.support_vectors)
    plot_svm_decision_boundary(svm_temp, X_circles, y_circles, ax,
                               f'C = {C}, Accuracy = {acc:.2f}, SVs = {n_sv}')

plt.tight_layout()
save_plot('svm_c_effect.pdf', 'Effect of regularization parameter C on SVM decision boundary.')
\end{pycode}

\section{Effect of Gamma Parameter}
The RBF parameter $\gamma$ controls the kernel bandwidth: small $\gamma$ gives smooth, nearly linear boundaries, while large $\gamma$ makes the boundary highly flexible and prone to overfitting.
\begin{pycode}
# Compare different gamma values
gamma_values = [0.1, 0.5, 1.0, 5.0]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, gamma in zip(axes.flat, gamma_values):
    svm_temp = SVM(kernel='rbf', C=1.0, gamma=gamma)
    svm_temp.fit(X_circles_train, y_circles_train)
    acc = svm_temp.score(X_circles_test, y_circles_test)
    n_sv = len(svm_temp.support_vectors)
    plot_svm_decision_boundary(svm_temp, X_circles, y_circles, ax,
                               f'gamma = {gamma}, Acc = {acc:.2f}, SVs = {n_sv}')

plt.tight_layout()
save_plot('svm_gamma_effect.pdf', 'Effect of RBF kernel parameter gamma on decision boundary.')
\end{pycode}
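
As a small addition to the figures above (using only objects defined in earlier blocks), the following block tabulates training versus test accuracy for the same $\gamma$ values, making the generalization gap explicit.
\begin{pycode}
# Added illustration: train vs. test accuracy across the gamma values above,
# emitted as a LaTeX table in the same style as the other generated tables.
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Training versus test accuracy across $\gamma$ (RBF kernel, $C = 1$)}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'$\gamma$ & Train Accuracy & Test Accuracy \\')
print(r'\midrule')
for gamma in gamma_values:
    svm_tmp = SVM(kernel='rbf', C=1.0, gamma=gamma)
    svm_tmp.fit(X_circles_train, y_circles_train)
    train_acc = svm_tmp.score(X_circles_train, y_circles_train)
    test_acc = svm_tmp.score(X_circles_test, y_circles_test)
    print(f'{gamma} & {train_acc:.3f} & {test_acc:.3f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}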

\section{Kernel Comparison}
\begin{pycode}
# Compare different kernels
kernels = ['linear', 'rbf', 'poly']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

kernel_results = []
for ax, kernel in zip(axes, kernels):
    if kernel == 'linear':
        svm_temp = SVM(kernel=kernel, C=1.0)
    elif kernel == 'rbf':
        svm_temp = SVM(kernel=kernel, C=10.0, gamma=1.0)
    else:
        svm_temp = SVM(kernel=kernel, C=1.0, degree=3)

    svm_temp.fit(X_circles_train, y_circles_train)
    acc = svm_temp.score(X_circles_test, y_circles_test)
    n_sv = len(svm_temp.support_vectors)
    kernel_results.append((kernel, acc, n_sv))
    plot_svm_decision_boundary(svm_temp, X_circles, y_circles, ax,
                               f'{kernel.upper()} Kernel (Acc: {acc:.2f})')

plt.tight_layout()
save_plot('svm_kernels.pdf', 'Comparison of different kernels on nonlinear data.')
\end{pycode}

\section{Hyperparameter Tuning}
\begin{pycode}
# Grid search over C and gamma
C_range = [0.1, 1.0, 10.0]
gamma_range = [0.1, 1.0, 10.0]

results = np.zeros((len(C_range), len(gamma_range)))
for i, C in enumerate(C_range):
    for j, gamma in enumerate(gamma_range):
        svm_temp = SVM(kernel='rbf', C=C, gamma=gamma)
        svm_temp.fit(X_circles_train, y_circles_train)
        results[i, j] = svm_temp.score(X_circles_test, y_circles_test)

# Find best parameters
best_idx = np.unravel_index(np.argmax(results), results.shape)
best_C = C_range[best_idx[0]]
best_gamma = gamma_range[best_idx[1]]
best_score = results[best_idx]

fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(results, cmap='YlOrRd', aspect='auto')
ax.set_xticks(range(len(gamma_range)))
ax.set_yticks(range(len(C_range)))
ax.set_xticklabels([str(g) for g in gamma_range])
ax.set_yticklabels([str(c) for c in C_range])
ax.set_xlabel('Gamma')
ax.set_ylabel('C')
ax.set_title('Grid Search: Test Accuracy')

# Annotate cells
for i in range(len(C_range)):
    for j in range(len(gamma_range)):
        ax.text(j, i, f'{results[i, j]:.2f}', ha='center', va='center',
                color='white' if results[i, j] > 0.8 else 'black')

plt.colorbar(im, label='Accuracy')
save_plot('svm_gridsearch.pdf', 'Grid search heatmap for C and gamma hyperparameters.')
\end{pycode}

Best hyperparameters: $C = \py{best_C}$, $\gamma = \py{best_gamma}$ with test accuracy \py{f"{best_score:.3f}"}.
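
The grid search above scores each $(C, \gamma)$ pair on the held-out test split for simplicity; in practice one would select hyperparameters by cross-validation on the training data alone. A minimal $k$-fold sketch using only the training split and the \texttt{SVM} class defined above (the fold count $k = 5$ is an arbitrary choice) is:
\begin{pycode}
# Minimal k-fold cross-validation sketch: select C and gamma using only the
# training split (k = 5 folds chosen arbitrarily for illustration).
def cv_score(C, gamma, X, y, k=5):
    folds = np.array_split(np.arange(len(X)), k)
    scores = []
    for fold in folds:
        train_mask = np.ones(len(X), dtype=bool)
        train_mask[fold] = False
        model = SVM(kernel='rbf', C=C, gamma=gamma)
        model.fit(X[train_mask], y[train_mask])
        scores.append(model.score(X[~train_mask], y[~train_mask]))
    return np.mean(scores)

cv_results = {(C, gamma): cv_score(C, gamma, X_circles_train, y_circles_train)
              for C in C_range for gamma in gamma_range}
cv_best_C, cv_best_gamma = max(cv_results, key=cv_results.get)
\end{pycode}

Cross-validated selection over the same grid: $C = \py{cv_best_C}$, $\gamma = \py{cv_best_gamma}$.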

\section{Results Summary}
\begin{table}[H]
\centering
\caption{SVM Model Performance}
\begin{tabular}{lccc}
\toprule
Dataset & Kernel & Test Accuracy & Support Vectors \\
\midrule
Linear Data & Linear & \py{f"{linear_test_acc:.3f}"} & \py{n_sv_linear} \\
Circles Data & RBF & \py{f"{rbf_test_acc:.3f}"} & \py{n_sv_rbf} \\
\bottomrule
\end{tabular}
\end{table}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Kernel Comparison on Circles Data}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'Kernel & Test Accuracy & Support Vectors \\')
print(r'\midrule')
for k, a, s in kernel_results:
    print(f'{k.upper()} & {a:.3f} & {s} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Conclusion}
This analysis demonstrated:
\begin{itemize}
\item Hard and soft margin SVM classification
\item The kernel trick for nonlinear decision boundaries
\item Effect of the C parameter on margin width and misclassifications
\item Effect of gamma on RBF kernel flexibility
\item Kernel selection based on data characteristics
\item Grid search for hyperparameter optimization
\end{itemize}

The RBF kernel with appropriate hyperparameters ($C = \py{best_C}$, $\gamma = \py{best_gamma}$) achieves a test accuracy of \py{f"{100*best_score:.1f}"}\% on the nonlinear circles dataset.

\end{document}