Path: blob/main/latex-templates/templates/machine-learning/svm.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{Support Vector Machines: Kernels and Classification}
\author{Machine Learning Foundations}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents an analysis of Support Vector Machines (SVMs) covering hard and soft margin classification, the kernel trick for nonlinear decision boundaries, hyperparameter tuning ($C$ and $\gamma$), and kernel comparison. We visualize decision boundaries, support vectors, and margin regions.
\end{abstract}

\section{Introduction}
Support Vector Machines find the maximum-margin hyperplane separating two classes:
\begin{equation}
\min_{w,b} \frac{1}{2}\|w\|^2 \quad \text{subject to} \quad y_i(w \cdot x_i + b) \geq 1
\end{equation}

For the soft margin SVM with slack variables $\xi_i$:
\begin{equation}
\min_{w,b,\xi} \frac{1}{2}\|w\|^2 + C\sum_{i=1}^{n}\xi_i \quad \text{subject to} \quad y_i(w \cdot x_i + b) \geq 1 - \xi_i, \quad \xi_i \geq 0
\end{equation}

The kernel trick replaces dot products with kernel evaluations, $K(x_i, x_j) = \phi(x_i) \cdot \phi(x_j)$, so that a linear separator in the feature space induced by $\phi$ corresponds to a nonlinear boundary in the input space.

Common kernels:
\begin{itemize}
\item Linear: $K(x, x') = x \cdot x'$
\item RBF: $K(x, x') = \exp(-\gamma \|x - x'\|^2)$
\item Polynomial: $K(x, x') = (x \cdot x' + c)^d$
\end{itemize}
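
Because the optimal $w$ is a linear combination of training points, training can be expressed entirely in terms of inner products. For reference, the soft-margin dual problem, which the simplified SMO-style solver later in this document approximates, is
\begin{equation}
\max_{\alpha} \sum_{i=1}^{n}\alpha_i - \frac{1}{2}\sum_{i=1}^{n}\sum_{j=1}^{n}\alpha_i \alpha_j y_i y_j K(x_i, x_j)
\quad \text{subject to} \quad 0 \leq \alpha_i \leq C, \quad \sum_{i=1}^{n}\alpha_i y_i = 0
\end{equation}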

\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Data Generation}
\begin{pycode}
# Generate linearly separable data
def make_linear_data(n_samples, margin=0.5, random_state=42):
    np.random.seed(random_state)
    n_per_class = n_samples // 2
    X = np.vstack([
        np.random.randn(n_per_class, 2) + [2, 2],
        np.random.randn(n_per_class, 2) + [-2, -2]
    ])
    y = np.array([1]*n_per_class + [-1]*n_per_class)
    idx = np.random.permutation(n_samples)
    return X[idx], y[idx]

# Generate nonlinear data (circles)
def make_circles_data(n_samples, noise=0.1, random_state=42):
    np.random.seed(random_state)
    n_per_class = n_samples // 2
    theta = np.linspace(0, 2*np.pi, n_per_class)
    X_outer = np.column_stack([2*np.cos(theta), 2*np.sin(theta)])
    X_inner = np.column_stack([0.5*np.cos(theta), 0.5*np.sin(theta)])
    X = np.vstack([X_outer + noise*np.random.randn(n_per_class, 2),
                   X_inner + noise*np.random.randn(n_per_class, 2)])
    y = np.array([1]*n_per_class + [-1]*n_per_class)
    idx = np.random.permutation(n_samples)
    return X[idx], y[idx]

# Create datasets
X_linear, y_linear = make_linear_data(200)
X_circles, y_circles = make_circles_data(200, noise=0.15)

# Split
X_linear_train, X_linear_test = X_linear[:150], X_linear[150:]
y_linear_train, y_linear_test = y_linear[:150], y_linear[150:]
X_circles_train, X_circles_test = X_circles[:150], X_circles[150:]
y_circles_train, y_circles_test = y_circles[:150], y_circles[150:]

# Plot datasets
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

axes[0].scatter(X_linear[y_linear==1, 0], X_linear[y_linear==1, 1],
                c='blue', alpha=0.6, s=40, label='Class +1')
axes[0].scatter(X_linear[y_linear==-1, 0], X_linear[y_linear==-1, 1],
                c='red', alpha=0.6, s=40, label='Class -1')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].set_title('Linearly Separable Data')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

axes[1].scatter(X_circles[y_circles==1, 0], X_circles[y_circles==1, 1],
                c='blue', alpha=0.6, s=40, label='Class +1')
axes[1].scatter(X_circles[y_circles==-1, 0], X_circles[y_circles==-1, 1],
                c='red', alpha=0.6, s=40, label='Class -1')
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')
axes[1].set_title('Nonlinearly Separable Data (Circles)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('svm_data.pdf', 'Classification datasets: linearly and nonlinearly separable.')
\end{pycode}

\section{SVM Implementation}
\begin{pycode}
class SVM:
    def __init__(self, kernel='linear', C=1.0, gamma=1.0, degree=3):
        self.kernel = kernel
        self.C = C
        self.gamma = gamma
        self.degree = degree
        self.alpha = None
        self.support_vectors = None
        self.support_labels = None
        self.b = 0

    def _kernel_function(self, X1, X2):
        if self.kernel == 'linear':
            return X1 @ X2.T
        elif self.kernel == 'rbf':
            sq_dist = cdist(X1, X2, 'sqeuclidean')
            return np.exp(-self.gamma * sq_dist)
        elif self.kernel == 'poly':
            return (1 + X1 @ X2.T)**self.degree

    def fit(self, X, y):
        n_samples = len(X)
        K = self._kernel_function(X, X)

        # Simple SMO-like algorithm (simplified)
        self.alpha = np.zeros(n_samples)
        for _ in range(100):  # Iterations
            for i in range(n_samples):
                Ei = np.sum(self.alpha * y * K[i]) - y[i]
                if (y[i] * Ei < -0.01 and self.alpha[i] < self.C) or \
                   (y[i] * Ei > 0.01 and self.alpha[i] > 0):
                    j = np.random.choice([k for k in range(n_samples) if k != i])
                    Ej = np.sum(self.alpha * y * K[j]) - y[j]

                    alpha_i_old, alpha_j_old = self.alpha[i], self.alpha[j]

                    if y[i] != y[j]:
                        L = max(0, self.alpha[j] - self.alpha[i])
                        H = min(self.C, self.C + self.alpha[j] - self.alpha[i])
                    else:
                        L = max(0, self.alpha[i] + self.alpha[j] - self.C)
                        H = min(self.C, self.alpha[i] + self.alpha[j])

                    if L == H:
                        continue

                    eta = 2 * K[i, j] - K[i, i] - K[j, j]
                    if eta >= 0:
                        continue

                    self.alpha[j] -= y[j] * (Ei - Ej) / eta
                    self.alpha[j] = np.clip(self.alpha[j], L, H)
                    self.alpha[i] += y[i] * y[j] * (alpha_j_old - self.alpha[j])

        # Extract support vectors
        sv_mask = self.alpha > 1e-5
        self.support_vectors = X[sv_mask]
        self.support_labels = y[sv_mask]
        self.alpha = self.alpha[sv_mask]

        # Compute bias
        K_sv = self._kernel_function(self.support_vectors, self.support_vectors)
        self.b = np.mean(self.support_labels -
                         np.sum(self.alpha * self.support_labels * K_sv, axis=1))

        return self

    def decision_function(self, X):
        K = self._kernel_function(X, self.support_vectors)
        return np.sum(self.alpha * self.support_labels * K, axis=1) + self.b

    def predict(self, X):
        return np.sign(self.decision_function(X))

    def score(self, X, y):
        return np.mean(self.predict(X) == y)
\end{pycode}
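
The \texttt{decision\_function} method evaluates the learned classifier as a kernel expansion over the retained support vectors,
\begin{equation}
f(x) = \sum_{i \in \mathcal{S}} \alpha_i y_i K(x_i, x) + b,
\end{equation}
where $\mathcal{S}$ indexes the support vectors, and \texttt{predict} returns $\operatorname{sign}(f(x))$.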

\section{Linear SVM}
\begin{pycode}
def plot_svm_decision_boundary(model, X, y, ax, title):
    h = 0.05
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Decision boundary and margins
    ax.contourf(xx, yy, Z, levels=[-100, 0, 100], alpha=0.3, colors=['red', 'blue'])
    ax.contour(xx, yy, Z, levels=[-1, 0, 1], colors=['red', 'black', 'blue'],
               linestyles=['--', '-', '--'], linewidths=[1, 2, 1])

    # Data points
    ax.scatter(X[y==1, 0], X[y==1, 1], c='blue', alpha=0.6, s=40)
    ax.scatter(X[y==-1, 0], X[y==-1, 1], c='red', alpha=0.6, s=40)

    # Support vectors
    if model.support_vectors is not None:
        ax.scatter(model.support_vectors[:, 0], model.support_vectors[:, 1],
                   s=150, facecolors='none', edgecolors='black', linewidths=2)

    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

# Train linear SVM
svm_linear = SVM(kernel='linear', C=1.0)
svm_linear.fit(X_linear_train, y_linear_train)

linear_train_acc = svm_linear.score(X_linear_train, y_linear_train)
linear_test_acc = svm_linear.score(X_linear_test, y_linear_test)
n_sv_linear = len(svm_linear.support_vectors)

# Visualize
fig, ax = plt.subplots(figsize=(8, 6))
plot_svm_decision_boundary(svm_linear, X_linear, y_linear, ax,
                           f'Linear SVM (Accuracy: {linear_test_acc:.2f}, SVs: {n_sv_linear})')
save_plot('svm_linear.pdf', 'Linear SVM decision boundary with support vectors highlighted.')
\end{pycode}

Linear SVM: Training accuracy = \py{f"{linear_train_acc:.3f}"}, Test accuracy = \py{f"{linear_test_acc:.3f}"}, Support vectors = \py{n_sv_linear}.

\section{RBF Kernel SVM}
\begin{pycode}
# Train RBF SVM on circles data
svm_rbf = SVM(kernel='rbf', C=10.0, gamma=1.0)
svm_rbf.fit(X_circles_train, y_circles_train)

rbf_train_acc = svm_rbf.score(X_circles_train, y_circles_train)
rbf_test_acc = svm_rbf.score(X_circles_test, y_circles_test)
n_sv_rbf = len(svm_rbf.support_vectors)

fig, ax = plt.subplots(figsize=(8, 6))
plot_svm_decision_boundary(svm_rbf, X_circles, y_circles, ax,
                           f'RBF SVM (Accuracy: {rbf_test_acc:.2f}, SVs: {n_sv_rbf})')
save_plot('svm_rbf.pdf', 'RBF kernel SVM decision boundary for nonlinear data.')
\end{pycode}

RBF SVM: Training accuracy = \py{f"{rbf_train_acc:.3f}"}, Test accuracy = \py{f"{rbf_test_acc:.3f}"}.

\section{Effect of C Parameter}
The regularization parameter $C$ trades margin width against training errors: small $C$ tolerates margin violations and yields a wider margin, while large $C$ penalizes slack heavily and fits the training data more closely.
\begin{pycode}
# Compare different C values
C_values = [0.01, 0.1, 1.0, 10.0]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, C in zip(axes.flat, C_values):
    svm_temp = SVM(kernel='rbf', C=C, gamma=1.0)
    svm_temp.fit(X_circles_train, y_circles_train)
    acc = svm_temp.score(X_circles_test, y_circles_test)
    n_sv = len(svm_temp.support_vectors)
    plot_svm_decision_boundary(svm_temp, X_circles, y_circles, ax,
                               f'C = {C}, Accuracy = {acc:.2f}, SVs = {n_sv}')

plt.tight_layout()
save_plot('svm_c_effect.pdf', 'Effect of regularization parameter C on SVM decision boundary.')
\end{pycode}

\section{Effect of Gamma Parameter}
The RBF parameter $\gamma$ controls the kernel bandwidth: small $\gamma$ gives smooth, nearly linear boundaries, while large $\gamma$ makes the boundary highly flexible and prone to overfitting.
\begin{pycode}
# Compare different gamma values
gamma_values = [0.1, 0.5, 1.0, 5.0]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, gamma in zip(axes.flat, gamma_values):
    svm_temp = SVM(kernel='rbf', C=1.0, gamma=gamma)
    svm_temp.fit(X_circles_train, y_circles_train)
    acc = svm_temp.score(X_circles_test, y_circles_test)
    n_sv = len(svm_temp.support_vectors)
    plot_svm_decision_boundary(svm_temp, X_circles, y_circles, ax,
                               f'gamma = {gamma}, Acc = {acc:.2f}, SVs = {n_sv}')

plt.tight_layout()
save_plot('svm_gamma_effect.pdf', 'Effect of RBF kernel parameter gamma on decision boundary.')
\end{pycode}
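
As a small addition to the figures above (using only objects defined in earlier blocks), the following block tabulates training versus test accuracy for the same $\gamma$ values, making the generalization gap explicit.
\begin{pycode}
# Added illustration: train vs. test accuracy across the gamma values above,
# emitted as a LaTeX table in the same style as the other generated tables.
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Training versus test accuracy across $\gamma$ (RBF kernel, $C = 1$)}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'$\gamma$ & Train Accuracy & Test Accuracy \\')
print(r'\midrule')
for gamma in gamma_values:
    svm_tmp = SVM(kernel='rbf', C=1.0, gamma=gamma)
    svm_tmp.fit(X_circles_train, y_circles_train)
    train_acc = svm_tmp.score(X_circles_train, y_circles_train)
    test_acc = svm_tmp.score(X_circles_test, y_circles_test)
    print(f'{gamma} & {train_acc:.3f} & {test_acc:.3f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}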

\section{Kernel Comparison}
\begin{pycode}
# Compare different kernels
kernels = ['linear', 'rbf', 'poly']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

kernel_results = []
for ax, kernel in zip(axes, kernels):
    if kernel == 'linear':
        svm_temp = SVM(kernel=kernel, C=1.0)
    elif kernel == 'rbf':
        svm_temp = SVM(kernel=kernel, C=10.0, gamma=1.0)
    else:
        svm_temp = SVM(kernel=kernel, C=1.0, degree=3)

    svm_temp.fit(X_circles_train, y_circles_train)
    acc = svm_temp.score(X_circles_test, y_circles_test)
    n_sv = len(svm_temp.support_vectors)
    kernel_results.append((kernel, acc, n_sv))
    plot_svm_decision_boundary(svm_temp, X_circles, y_circles, ax,
                               f'{kernel.upper()} Kernel (Acc: {acc:.2f})')

plt.tight_layout()
save_plot('svm_kernels.pdf', 'Comparison of different kernels on nonlinear data.')
\end{pycode}

\section{Hyperparameter Tuning}
\begin{pycode}
# Grid search over C and gamma
C_range = [0.1, 1.0, 10.0]
gamma_range = [0.1, 1.0, 10.0]

results = np.zeros((len(C_range), len(gamma_range)))
for i, C in enumerate(C_range):
    for j, gamma in enumerate(gamma_range):
        svm_temp = SVM(kernel='rbf', C=C, gamma=gamma)
        svm_temp.fit(X_circles_train, y_circles_train)
        results[i, j] = svm_temp.score(X_circles_test, y_circles_test)

# Find best parameters
best_idx = np.unravel_index(np.argmax(results), results.shape)
best_C = C_range[best_idx[0]]
best_gamma = gamma_range[best_idx[1]]
best_score = results[best_idx]

fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(results, cmap='YlOrRd', aspect='auto')
ax.set_xticks(range(len(gamma_range)))
ax.set_yticks(range(len(C_range)))
ax.set_xticklabels([str(g) for g in gamma_range])
ax.set_yticklabels([str(c) for c in C_range])
ax.set_xlabel('Gamma')
ax.set_ylabel('C')
ax.set_title('Grid Search: Test Accuracy')

# Annotate cells
for i in range(len(C_range)):
    for j in range(len(gamma_range)):
        ax.text(j, i, f'{results[i, j]:.2f}', ha='center', va='center',
                color='white' if results[i, j] > 0.8 else 'black')

plt.colorbar(im, label='Accuracy')
save_plot('svm_gridsearch.pdf', 'Grid search heatmap for C and gamma hyperparameters.')
\end{pycode}

Best hyperparameters: $C = \py{best_C}$, $\gamma = \py{best_gamma}$ with test accuracy \py{f"{best_score:.3f}"}.
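
The grid search above scores each $(C, \gamma)$ pair on the held-out test split for simplicity; in practice one would select hyperparameters by cross-validation on the training data alone. A minimal $k$-fold sketch using only the training split and the \texttt{SVM} class defined above (the fold count $k = 5$ is an arbitrary choice) is:
\begin{pycode}
# Minimal k-fold cross-validation sketch: select C and gamma using only the
# training split (k = 5 folds chosen arbitrarily for illustration).
def cv_score(C, gamma, X, y, k=5):
    folds = np.array_split(np.arange(len(X)), k)
    scores = []
    for fold in folds:
        train_mask = np.ones(len(X), dtype=bool)
        train_mask[fold] = False
        model = SVM(kernel='rbf', C=C, gamma=gamma)
        model.fit(X[train_mask], y[train_mask])
        scores.append(model.score(X[~train_mask], y[~train_mask]))
    return np.mean(scores)

cv_results = {(C, gamma): cv_score(C, gamma, X_circles_train, y_circles_train)
              for C in C_range for gamma in gamma_range}
cv_best_C, cv_best_gamma = max(cv_results, key=cv_results.get)
\end{pycode}

Cross-validated selection over the same grid: $C = \py{cv_best_C}$, $\gamma = \py{cv_best_gamma}$.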

\section{Results Summary}
\begin{table}[H]
\centering
\caption{SVM Model Performance}
\begin{tabular}{lccc}
\toprule
Dataset & Kernel & Test Accuracy & Support Vectors \\
\midrule
Linear Data & Linear & \py{f"{linear_test_acc:.3f}"} & \py{n_sv_linear} \\
Circles Data & RBF & \py{f"{rbf_test_acc:.3f}"} & \py{n_sv_rbf} \\
\bottomrule
\end{tabular}
\end{table}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Kernel Comparison on Circles Data}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'Kernel & Test Accuracy & Support Vectors \\')
print(r'\midrule')
for k, a, s in kernel_results:
    print(f'{k.upper()} & {a:.3f} & {s} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Conclusion}
This analysis demonstrated:
\begin{itemize}
\item Hard and soft margin SVM classification
\item The kernel trick for nonlinear decision boundaries
\item Effect of the C parameter on margin width and misclassifications
\item Effect of gamma on RBF kernel flexibility
\item Kernel selection based on data characteristics
\item Grid search for hyperparameter optimization
\end{itemize}

The RBF kernel with appropriate hyperparameters ($C = \py{best_C}$, $\gamma = \py{best_gamma}$) achieves a test accuracy of \py{f"{100*best_score:.1f}"}\% on the nonlinear circles dataset.

\end{document}