\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{Linear Regression: OLS, Gradient Descent, and Regularization}
\author{Machine Learning Foundations}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents a comprehensive analysis of linear regression methods, including ordinary least squares (OLS), gradient descent optimization, and regularization techniques (Ridge and Lasso). We examine model diagnostics, multicollinearity detection, and hyperparameter tuning via validation curves.
\end{abstract}

\section{Introduction}
Linear regression models the relationship between features $X$ and target $y$:
\begin{equation}
y = X\beta + \epsilon, \quad \epsilon \sim \mathcal{N}(0, \sigma^2 I)
\end{equation}

\textbf{OLS Solution:}
\begin{equation}
\hat{\beta}_{OLS} = (X^T X)^{-1} X^T y
\end{equation}

\textbf{Ridge Regression:}
\begin{equation}
\hat{\beta}_{Ridge} = (X^T X + \lambda I)^{-1} X^T y
\end{equation}

\textbf{Lasso Regression:}
\begin{equation}
\hat{\beta}_{Lasso} = \arg\min_\beta \|y - X\beta\|_2^2 + \lambda \|\beta\|_1
\end{equation}

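As a quick numerical sanity check, the short block below is a minimal sketch (the tiny synthetic problem and names such as \texttt{X\_demo} are purely illustrative and independent of the dataset analyzed later) confirming that the Ridge closed form collapses to the OLS solution as $\lambda \to 0$.

\begin{pycode}
# Minimal sketch: check the OLS and Ridge closed forms on a tiny synthetic
# problem (illustrative only; unrelated to the dataset generated below).
import numpy as np
rng = np.random.default_rng(0)
X_demo = rng.standard_normal((20, 3))
y_demo = X_demo @ np.array([2.0, -1.0, 0.5]) + 0.1 * rng.standard_normal(20)

# OLS: solve (X^T X) beta = X^T y rather than forming an explicit inverse
beta_ols = np.linalg.solve(X_demo.T @ X_demo, X_demo.T @ y_demo)

# Ridge: (X^T X + lambda I)^{-1} X^T y, with a vanishingly small lambda
lam = 1e-8
beta_ridge = np.linalg.solve(X_demo.T @ X_demo + lam * np.eye(3),
                             X_demo.T @ y_demo)

assert np.allclose(beta_ols, beta_ridge, atol=1e-6)
\end{pycode}
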
\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Data Generation}
\begin{pycode}
# Generate regression dataset with known coefficients
n_samples = 200
n_features = 5
true_coefs = np.array([3.0, -2.0, 1.5, 0, 0])  # Last two are irrelevant

# Generate features with some correlation
X = np.random.randn(n_samples, n_features)
X[:, 1] = X[:, 0] * 0.5 + X[:, 1] * 0.5  # Introduce correlation

# Generate target with noise
y = X @ true_coefs + np.random.normal(0, 1, n_samples)

# Split data
n_train = 150
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

# Standardize features
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)
X_train_scaled = (X_train - X_mean) / X_std
X_test_scaled = (X_test - X_mean) / X_std

# Plot data overview
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# Feature distributions
for i in range(n_features):
    axes[0].hist(X_train[:, i], bins=20, alpha=0.5, label=f'X{i+1}')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Feature Distributions')
axes[0].legend()

# Correlation matrix
corr = np.corrcoef(X_train.T)
im = axes[1].imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1)
axes[1].set_xticks(range(n_features))
axes[1].set_yticks(range(n_features))
axes[1].set_xticklabels([f'X{i+1}' for i in range(n_features)])
axes[1].set_yticklabels([f'X{i+1}' for i in range(n_features)])
axes[1].set_title('Feature Correlation')
plt.colorbar(im, ax=axes[1])

# Target distribution
axes[2].hist(y_train, bins=20, color='steelblue', edgecolor='black', alpha=0.7)
axes[2].set_xlabel('y')
axes[2].set_ylabel('Frequency')
axes[2].set_title('Target Distribution')

plt.tight_layout()
save_plot('lr_data.pdf', 'Dataset overview showing feature distributions and correlations.')
\end{pycode}

Dataset: $n = \py{n_samples}$ samples, $p = \py{n_features}$ features.

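The correlation heat map highlights the deliberate dependence between $X_1$ and $X_2$. As a complementary multicollinearity check, the sketch below (illustrative only; the helper name \texttt{vif} is an assumption and is not used elsewhere in this document) computes a variance inflation factor for each standardized feature by regressing it on the remaining features and reporting $1/(1 - R_j^2)$.

\begin{pycode}
# Illustrative sketch: variance inflation factors for the standardized
# training features. VIF_j = 1 / (1 - R^2_j), where R^2_j comes from
# regressing feature j on the remaining features (no intercept needed,
# since the standardized features have zero mean on the training set).
def vif(Xs):
    out = []
    for j in range(Xs.shape[1]):
        others = np.delete(Xs, j, axis=1)
        beta_j, *_ = np.linalg.lstsq(others, Xs[:, j], rcond=None)
        resid = Xs[:, j] - others @ beta_j
        r2_j = 1 - resid.var() / Xs[:, j].var()
        out.append(1.0 / (1.0 - r2_j))
    return np.array(out)

vifs = vif(X_train_scaled)
print('Variance inflation factors: ' + ', '.join(f'{v:.2f}' for v in vifs) + '.')
\end{pycode}
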
\section{Ordinary Least Squares}
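Throughout this document, model quality is summarized by the coefficient of determination, which each model's \texttt{score} method computes as
\begin{equation}
R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}.
\end{equation}
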
\begin{pycode}
class LinearRegression:
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        # Add intercept
        X_b = np.column_stack([np.ones(len(X)), X])
        # OLS solution
        self.coef_ = np.linalg.lstsq(X_b, y, rcond=None)[0]
        self.intercept_ = self.coef_[0]
        self.coef_ = self.coef_[1:]
        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

    def score(self, X, y):
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        return 1 - ss_res / ss_tot

# Fit OLS model
ols = LinearRegression()
ols.fit(X_train_scaled, y_train)

y_train_pred = ols.predict(X_train_scaled)
y_test_pred = ols.predict(X_test_scaled)

train_r2 = ols.score(X_train_scaled, y_train)
test_r2 = ols.score(X_test_scaled, y_test)
train_mse = np.mean((y_train - y_train_pred)**2)
test_mse = np.mean((y_test - y_test_pred)**2)

# Residual analysis
residuals = y_train - y_train_pred

fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Predicted vs Actual
axes[0, 0].scatter(y_train, y_train_pred, alpha=0.5, s=30)
lims = [min(y_train.min(), y_train_pred.min()), max(y_train.max(), y_train_pred.max())]
axes[0, 0].plot(lims, lims, 'r--', linewidth=2)
axes[0, 0].set_xlabel('Actual')
axes[0, 0].set_ylabel('Predicted')
axes[0, 0].set_title(f'Predicted vs Actual (R2 = {train_r2:.3f})')
axes[0, 0].grid(True, alpha=0.3)

# Residuals vs Fitted
axes[0, 1].scatter(y_train_pred, residuals, alpha=0.5, s=30)
axes[0, 1].axhline(0, color='r', linestyle='--')
axes[0, 1].set_xlabel('Fitted Values')
axes[0, 1].set_ylabel('Residuals')
axes[0, 1].set_title('Residuals vs Fitted')
axes[0, 1].grid(True, alpha=0.3)

# Q-Q plot
stats.probplot(residuals, dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Normal Q-Q Plot')
axes[1, 0].grid(True, alpha=0.3)

# Residual histogram
axes[1, 1].hist(residuals, bins=20, density=True, alpha=0.7, color='steelblue', edgecolor='black')
x_norm = np.linspace(residuals.min(), residuals.max(), 100)
axes[1, 1].plot(x_norm, stats.norm.pdf(x_norm, 0, np.std(residuals)), 'r-', linewidth=2)
axes[1, 1].set_xlabel('Residual')
axes[1, 1].set_ylabel('Density')
axes[1, 1].set_title('Residual Distribution')

plt.tight_layout()
save_plot('lr_diagnostics.pdf', 'OLS regression diagnostics showing residual analysis.')
\end{pycode}

OLS Performance: Train $R^2 = \py{f"{train_r2:.3f}"}$, Test $R^2 = \py{f"{test_r2:.3f}"}$.

\section{Gradient Descent}
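The implementation below minimizes the mean squared error by full-batch gradient descent. Writing $\hat{y} = X\beta + \beta_0$, each iteration applies
\begin{equation}
\nabla_{\beta} = \frac{2}{n} X^T(\hat{y} - y), \qquad
\nabla_{\beta_0} = \frac{2}{n} \sum_{i=1}^{n} (\hat{y}_i - y_i), \qquad
\beta \leftarrow \beta - \eta\, \nabla_{\beta}, \quad
\beta_0 \leftarrow \beta_0 - \eta\, \nabla_{\beta_0},
\end{equation}
where $\eta$ is the learning rate.
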
\begin{pycode}
class GradientDescentRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.coef_ = None
        self.intercept_ = None
        self.loss_history = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.coef_ = np.zeros(n_features)
        self.intercept_ = 0
        self.loss_history = []

        for _ in range(self.n_iter):
            y_pred = X @ self.coef_ + self.intercept_
            error = y_pred - y
            loss = np.mean(error**2)
            self.loss_history.append(loss)

            # Gradients
            grad_coef = (2/n_samples) * X.T @ error
            grad_intercept = (2/n_samples) * np.sum(error)

            # Update
            self.coef_ -= self.lr * grad_coef
            self.intercept_ -= self.lr * grad_intercept

        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

# Fit with gradient descent
gd = GradientDescentRegression(learning_rate=0.1, n_iterations=500)
gd.fit(X_train_scaled, y_train)

# Compare learning rates
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

learning_rates = [0.001, 0.01, 0.1, 0.5]
for lr in learning_rates:
    gd_temp = GradientDescentRegression(learning_rate=lr, n_iterations=200)
    gd_temp.fit(X_train_scaled, y_train)
    axes[0].plot(gd_temp.loss_history, label=f'lr={lr}', linewidth=1.5)

axes[0].set_xlabel('Iteration')
axes[0].set_ylabel('MSE Loss')
axes[0].set_title('Gradient Descent Convergence')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].set_yscale('log')

# Coefficient comparison
x_pos = np.arange(n_features)
width = 0.35
axes[1].bar(x_pos - width/2, ols.coef_, width, label='OLS', alpha=0.8)
axes[1].bar(x_pos + width/2, gd.coef_, width, label='GD', alpha=0.8)
axes[1].set_xticks(x_pos)
axes[1].set_xticklabels([f'X{i+1}' for i in range(n_features)])
axes[1].set_xlabel('Feature')
axes[1].set_ylabel('Coefficient')
axes[1].set_title('OLS vs Gradient Descent Coefficients')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_gradient_descent.pdf', 'Gradient descent convergence and coefficient comparison with OLS.')
\end{pycode}

\section{Ridge Regression}
\begin{pycode}
class RidgeRegression:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        n_features = X.shape[1]
        X_b = np.column_stack([np.ones(len(X)), X])
        I = np.eye(n_features + 1)
        I[0, 0] = 0  # Don't regularize intercept
        coef = np.linalg.solve(X_b.T @ X_b + self.alpha * I, X_b.T @ y)
        self.intercept_ = coef[0]
        self.coef_ = coef[1:]
        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

    def score(self, X, y):
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        return 1 - ss_res / ss_tot

# Ridge path
alphas = np.logspace(-3, 3, 50)
ridge_coefs = []
ridge_scores = []

for alpha in alphas:
    ridge = RidgeRegression(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    ridge_coefs.append(ridge.coef_)
    ridge_scores.append(ridge.score(X_test_scaled, y_test))

ridge_coefs = np.array(ridge_coefs)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Coefficient path
for i in range(n_features):
    axes[0].plot(alphas, ridge_coefs[:, i], label=f'X{i+1}', linewidth=1.5)
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha (Regularization)')
axes[0].set_ylabel('Coefficient Value')
axes[0].set_title('Ridge Regression Coefficient Path')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Validation curve
axes[1].plot(alphas, ridge_scores, 'b-o', linewidth=1.5, markersize=4)
best_alpha_idx = np.argmax(ridge_scores)
best_alpha = alphas[best_alpha_idx]
axes[1].axvline(best_alpha, color='r', linestyle='--', label=f'Best alpha={best_alpha:.3f}')
axes[1].set_xscale('log')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Test R2 Score')
axes[1].set_title('Ridge Regression Validation Curve')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_ridge.pdf', 'Ridge regression coefficient path and validation curve.')

best_ridge = RidgeRegression(alpha=best_alpha)
best_ridge.fit(X_train_scaled, y_train)
ridge_test_r2 = best_ridge.score(X_test_scaled, y_test)
\end{pycode}

Best Ridge alpha: \py{f"{best_alpha:.3f}"} with test $R^2 = \py{f"{ridge_test_r2:.3f}"}$.

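The validation curve above scores each $\alpha$ on the held-out test set. A common alternative is to tune $\alpha$ by $k$-fold cross-validation on the training data alone; the block below is a minimal sketch of that idea (the fold count and the names \texttt{cv\_score\_ridge} and \texttt{cv\_best\_alpha} are illustrative assumptions, not part of the analysis above).

\begin{pycode}
# Minimal cross-validation sketch (illustrative, not used by the main
# analysis): choose the Ridge alpha from 5 folds of the training set.
def cv_score_ridge(alpha, X, y, k=5):
    fold_ids = np.arange(len(y)) % k
    fold_scores = []
    for fold in range(k):
        val = fold_ids == fold
        model = RidgeRegression(alpha=alpha).fit(X[~val], y[~val])
        fold_scores.append(model.score(X[val], y[val]))
    return np.mean(fold_scores)

cv_scores = [cv_score_ridge(a, X_train_scaled, y_train) for a in alphas]
cv_best_alpha = alphas[int(np.argmax(cv_scores))]  # cross-validated choice
\end{pycode}
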
\section{Lasso Regression}
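Unlike OLS and Ridge, the Lasso problem has no closed-form solution. The implementation below uses coordinate descent: holding all other coefficients fixed, each $\beta_j$ is updated with the soft-thresholding operator,
\begin{equation}
S(x, t) = \operatorname{sign}(x)\,\max(|x| - t,\ 0), \qquad
\beta_j \leftarrow \frac{S(\rho_j,\ t)}{\|x_j\|_2^2}, \qquad
\rho_j = x_j^T r_{(j)},
\end{equation}
where $r_{(j)}$ is the residual computed with feature $j$ excluded and the threshold $t$ is proportional to the regularization strength $\alpha$.
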
\begin{pycode}
class LassoRegression:
    def __init__(self, alpha=1.0, n_iterations=1000):
        self.alpha = alpha
        self.n_iter = n_iterations
        self.coef_ = None
        self.intercept_ = None

    def _soft_threshold(self, x, threshold):
        return np.sign(x) * np.maximum(np.abs(x) - threshold, 0)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.coef_ = np.zeros(n_features)
        self.intercept_ = np.mean(y)

        # Coordinate descent
        for _ in range(self.n_iter):
            for j in range(n_features):
                residual = y - self.intercept_ - X @ self.coef_ + X[:, j] * self.coef_[j]
                rho = X[:, j] @ residual
                z = np.sum(X[:, j]**2)
                self.coef_[j] = self._soft_threshold(rho, self.alpha * n_samples / 2) / z

            self.intercept_ = np.mean(y - X @ self.coef_)

        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

    def score(self, X, y):
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        return 1 - ss_res / ss_tot

# Lasso path
lasso_alphas = np.logspace(-4, 0, 50)
lasso_coefs = []
lasso_scores = []

for alpha in lasso_alphas:
    lasso = LassoRegression(alpha=alpha, n_iterations=500)
    lasso.fit(X_train_scaled, y_train)
    lasso_coefs.append(lasso.coef_)
    lasso_scores.append(lasso.score(X_test_scaled, y_test))

lasso_coefs = np.array(lasso_coefs)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Coefficient path
for i in range(n_features):
    axes[0].plot(lasso_alphas, lasso_coefs[:, i], label=f'X{i+1}', linewidth=1.5)
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha (Regularization)')
axes[0].set_ylabel('Coefficient Value')
axes[0].set_title('Lasso Regression Coefficient Path')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Validation curve
axes[1].plot(lasso_alphas, lasso_scores, 'g-s', linewidth=1.5, markersize=4)
best_lasso_idx = np.argmax(lasso_scores)
best_lasso_alpha = lasso_alphas[best_lasso_idx]
axes[1].axvline(best_lasso_alpha, color='r', linestyle='--', label=f'Best alpha={best_lasso_alpha:.4f}')
axes[1].set_xscale('log')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Test R2 Score')
axes[1].set_title('Lasso Regression Validation Curve')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_lasso.pdf', 'Lasso regression coefficient path showing feature selection.')

best_lasso = LassoRegression(alpha=best_lasso_alpha, n_iterations=500)
best_lasso.fit(X_train_scaled, y_train)
lasso_test_r2 = best_lasso.score(X_test_scaled, y_test)
n_selected = np.sum(np.abs(best_lasso.coef_) > 0.01)
\end{pycode}

Best Lasso alpha: \py{f"{best_lasso_alpha:.4f}"} with test $R^2 = \py{f"{lasso_test_r2:.3f}"}$. Selected \py{n_selected} features.

\section{Model Comparison}
\begin{pycode}
# Compare all methods
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Coefficient comparison
methods = ['True', 'OLS', 'Ridge', 'Lasso']
coefs = [true_coefs, ols.coef_, best_ridge.coef_, best_lasso.coef_]

x = np.arange(n_features)
width = 0.2

for i, (method, coef) in enumerate(zip(methods, coefs)):
    axes[0].bar(x + i*width, coef, width, label=method, alpha=0.8)

axes[0].set_xticks(x + 1.5*width)
axes[0].set_xticklabels([f'X{i+1}' for i in range(n_features)])
axes[0].set_xlabel('Feature')
axes[0].set_ylabel('Coefficient')
axes[0].set_title('Coefficient Comparison Across Methods')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Performance comparison
methods_names = ['OLS', 'Ridge', 'Lasso']
test_scores = [test_r2, ridge_test_r2, lasso_test_r2]
train_scores = [train_r2, best_ridge.score(X_train_scaled, y_train),
                best_lasso.score(X_train_scaled, y_train)]

x = np.arange(len(methods_names))
width = 0.35
axes[1].bar(x - width/2, train_scores, width, label='Train', alpha=0.8)
axes[1].bar(x + width/2, test_scores, width, label='Test', alpha=0.8)
axes[1].set_xticks(x)
axes[1].set_xticklabels(methods_names)
axes[1].set_ylabel('R2 Score')
axes[1].set_title('Model Performance Comparison')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_comparison.pdf', 'Comparison of OLS, Ridge, and Lasso regression methods.')
\end{pycode}

\section{Results Summary}
\begin{table}[H]
\centering
\caption{Regression Model Performance}
\begin{tabular}{lccc}
\toprule
Model & Train $R^2$ & Test $R^2$ & Hyperparameter \\
\midrule
OLS & \py{f"{train_r2:.3f}"} & \py{f"{test_r2:.3f}"} & - \\
Ridge & \py{f"{best_ridge.score(X_train_scaled, y_train):.3f}"} & \py{f"{ridge_test_r2:.3f}"} & $\alpha = \py{f"{best_alpha:.3f}"}$ \\
Lasso & \py{f"{best_lasso.score(X_train_scaled, y_train):.3f}"} & \py{f"{lasso_test_r2:.3f}"} & $\alpha = \py{f"{best_lasso_alpha:.4f}"}$ \\
\bottomrule
\end{tabular}
\end{table}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Coefficient Estimates}')
print(r'\begin{tabular}{ccccc}')
print(r'\toprule')
print(r'Feature & True & OLS & Ridge & Lasso \\')
print(r'\midrule')
for i in range(n_features):
    print(f'X{i+1} & {true_coefs[i]:.2f} & {ols.coef_[i]:.2f} & {best_ridge.coef_[i]:.2f} & {best_lasso.coef_[i]:.2f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Conclusion}
This analysis demonstrated:
\begin{itemize}
\item OLS as the baseline method with a closed-form solution
\item Gradient descent as an iterative optimization alternative
\item Ridge regression for handling multicollinearity (L2 penalty)
\item Lasso regression for feature selection (L1 penalty)
\item Hyperparameter tuning via validation curves
\end{itemize}

The Lasso method identified the relevant features and drove the irrelevant coefficients to near zero, demonstrating its utility for sparse solutions.

\end{document}