Path: blob/main/latex-templates/templates/machine-learning/linear_regression.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{Linear Regression: OLS, Gradient Descent, and Regularization}
\author{Machine Learning Foundations}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents a comprehensive analysis of linear regression methods including ordinary least squares (OLS), gradient descent optimization, and regularization techniques (Ridge and Lasso). We examine model diagnostics, multicollinearity detection, and cross-validation for hyperparameter tuning.
\end{abstract}

\section{Introduction}
Linear regression models the relationship between features $X$ and target $y$:
\begin{equation}
y = X\beta + \epsilon, \quad \epsilon \sim \mathcal{N}(0, \sigma^2 I)
\end{equation}

\textbf{OLS Solution:}
\begin{equation}
\hat{\beta}_{OLS} = (X^T X)^{-1} X^T y
\end{equation}

\textbf{Ridge Regression:}
\begin{equation}
\hat{\beta}_{Ridge} = (X^T X + \lambda I)^{-1} X^T y
\end{equation}

\textbf{Lasso Regression:}
\begin{equation}
\hat{\beta}_{Lasso} = \arg\min_\beta \|y - X\beta\|_2^2 + \lambda \|\beta\|_1
\end{equation}
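For completeness, the OLS closed form follows from minimizing the residual sum of squares: setting the gradient to zero yields the normal equations (assuming $X^T X$ is invertible, i.e.\ the features are not perfectly collinear):
\begin{equation}
\nabla_\beta \|y - X\beta\|_2^2 = -2X^T y + 2X^T X \beta = 0
\quad\Longrightarrow\quad
X^T X \hat{\beta} = X^T y .
\end{equation}
Ridge regression adds $\lambda\|\beta\|_2^2$ to the same objective, which simply adds $\lambda I$ to $X^T X$ in the normal equations and keeps the system well-posed even when $X^T X$ is ill-conditioned. The Lasso's $\|\beta\|_1$ penalty admits no closed form and is solved iteratively (coordinate descent, below).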
\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Data Generation}
\begin{pycode}
# Generate regression dataset with known coefficients
n_samples = 200
n_features = 5
true_coefs = np.array([3.0, -2.0, 1.5, 0, 0])  # Last two are irrelevant

# Generate features with some correlation
X = np.random.randn(n_samples, n_features)
X[:, 1] = X[:, 0] * 0.5 + X[:, 1] * 0.5  # Introduce correlation

# Generate target with noise
y = X @ true_coefs + np.random.normal(0, 1, n_samples)

# Split data
n_train = 150
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

# Standardize features
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)
X_train_scaled = (X_train - X_mean) / X_std
X_test_scaled = (X_test - X_mean) / X_std

# Plot data overview
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# Feature distributions
for i in range(n_features):
    axes[0].hist(X_train[:, i], bins=20, alpha=0.5, label=f'X{i+1}')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Feature Distributions')
axes[0].legend()

# Correlation matrix
corr = np.corrcoef(X_train.T)
im = axes[1].imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1)
axes[1].set_xticks(range(n_features))
axes[1].set_yticks(range(n_features))
axes[1].set_xticklabels([f'X{i+1}' for i in range(n_features)])
axes[1].set_yticklabels([f'X{i+1}' for i in range(n_features)])
axes[1].set_title('Feature Correlation')
plt.colorbar(im, ax=axes[1])

# Target distribution
axes[2].hist(y_train, bins=20, color='steelblue', edgecolor='black', alpha=0.7)
axes[2].set_xlabel('y')
axes[2].set_ylabel('Frequency')
axes[2].set_title('Target Distribution')

plt.tight_layout()
save_plot('lr_data.pdf', 'Dataset overview showing feature distributions and correlations.')
\end{pycode}

Dataset: $n = \py{n_samples}$ samples, $p = \py{n_features}$ features.
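Since $X_2$ is constructed as a mix of $X_1$ and noise, the design matrix is mildly collinear. As a quick multicollinearity check, the sketch below (a minimal NumPy addition; \texttt{vifs} and \texttt{cond\_number} are new names introduced only for this illustration) reports variance inflation factors (VIFs), taken from the diagonal of the inverse correlation matrix, and the condition number of the scaled design. VIFs near 1 indicate little collinearity; values above roughly 5--10 are commonly read as problematic.

\begin{pycode}
# Multicollinearity diagnostics (sketch): for standardized features the VIF of
# feature j equals the j-th diagonal entry of the inverse correlation matrix.
corr_train = np.corrcoef(X_train_scaled.T)
vifs = np.diag(np.linalg.inv(corr_train))
cond_number = np.linalg.cond(X_train_scaled)

print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Variance Inflation Factors}')
print(r'\begin{tabular}{lc}')
print(r'\toprule')
print(r'Feature & VIF \\')
print(r'\midrule')
for i in range(n_features):
    print(f'X{i+1} & {vifs[i]:.2f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

Condition number of the scaled design matrix: \py{f"{cond_number:.1f}"}.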
\section{Ordinary Least Squares}
\begin{pycode}
class LinearRegression:
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        # Add intercept
        X_b = np.column_stack([np.ones(len(X)), X])
        # OLS solution
        self.coef_ = np.linalg.lstsq(X_b, y, rcond=None)[0]
        self.intercept_ = self.coef_[0]
        self.coef_ = self.coef_[1:]
        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

    def score(self, X, y):
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        return 1 - ss_res / ss_tot

# Fit OLS model
ols = LinearRegression()
ols.fit(X_train_scaled, y_train)

y_train_pred = ols.predict(X_train_scaled)
y_test_pred = ols.predict(X_test_scaled)

train_r2 = ols.score(X_train_scaled, y_train)
test_r2 = ols.score(X_test_scaled, y_test)
train_mse = np.mean((y_train - y_train_pred)**2)
test_mse = np.mean((y_test - y_test_pred)**2)

# Residual analysis
residuals = y_train - y_train_pred

fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Predicted vs Actual
axes[0, 0].scatter(y_train, y_train_pred, alpha=0.5, s=30)
lims = [min(y_train.min(), y_train_pred.min()), max(y_train.max(), y_train_pred.max())]
axes[0, 0].plot(lims, lims, 'r--', linewidth=2)
axes[0, 0].set_xlabel('Actual')
axes[0, 0].set_ylabel('Predicted')
axes[0, 0].set_title(f'Predicted vs Actual (R2 = {train_r2:.3f})')
axes[0, 0].grid(True, alpha=0.3)

# Residuals vs Fitted
axes[0, 1].scatter(y_train_pred, residuals, alpha=0.5, s=30)
axes[0, 1].axhline(0, color='r', linestyle='--')
axes[0, 1].set_xlabel('Fitted Values')
axes[0, 1].set_ylabel('Residuals')
axes[0, 1].set_title('Residuals vs Fitted')
axes[0, 1].grid(True, alpha=0.3)

# Q-Q plot
stats.probplot(residuals, dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Normal Q-Q Plot')
axes[1, 0].grid(True, alpha=0.3)

# Residual histogram
axes[1, 1].hist(residuals, bins=20, density=True, alpha=0.7, color='steelblue', edgecolor='black')
x_norm = np.linspace(residuals.min(), residuals.max(), 100)
axes[1, 1].plot(x_norm, stats.norm.pdf(x_norm, 0, np.std(residuals)), 'r-', linewidth=2)
axes[1, 1].set_xlabel('Residual')
axes[1, 1].set_ylabel('Density')
axes[1, 1].set_title('Residual Distribution')

plt.tight_layout()
save_plot('lr_diagnostics.pdf', 'OLS regression diagnostics showing residual analysis.')
\end{pycode}

OLS Performance: Train $R^2 = \py{f"{train_r2:.3f}"}$, Test $R^2 = \py{f"{test_r2:.3f}"}$.

\section{Gradient Descent}
\begin{pycode}
class GradientDescentRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.coef_ = None
        self.intercept_ = None
        self.loss_history = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.coef_ = np.zeros(n_features)
        self.intercept_ = 0
        self.loss_history = []

        for _ in range(self.n_iter):
            y_pred = X @ self.coef_ + self.intercept_
            error = y_pred - y
            loss = np.mean(error**2)
            self.loss_history.append(loss)

            # Gradients
            grad_coef = (2/n_samples) * X.T @ error
            grad_intercept = (2/n_samples) * np.sum(error)

            # Update
            self.coef_ -= self.lr * grad_coef
            self.intercept_ -= self.lr * grad_intercept

        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

# Fit with gradient descent
gd = GradientDescentRegression(learning_rate=0.1, n_iterations=500)
gd.fit(X_train_scaled, y_train)

# Compare learning rates
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

learning_rates = [0.001, 0.01, 0.1, 0.5]
for lr in learning_rates:
    gd_temp = GradientDescentRegression(learning_rate=lr, n_iterations=200)
    gd_temp.fit(X_train_scaled, y_train)
    axes[0].plot(gd_temp.loss_history, label=f'lr={lr}', linewidth=1.5)

axes[0].set_xlabel('Iteration')
axes[0].set_ylabel('MSE Loss')
axes[0].set_title('Gradient Descent Convergence')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].set_yscale('log')

# Coefficient comparison
x_pos = np.arange(n_features)
width = 0.35
axes[1].bar(x_pos - width/2, ols.coef_, width, label='OLS', alpha=0.8)
axes[1].bar(x_pos + width/2, gd.coef_, width, label='GD', alpha=0.8)
axes[1].set_xticks(x_pos)
axes[1].set_xticklabels([f'X{i+1}' for i in range(n_features)])
axes[1].set_xlabel('Feature')
axes[1].set_ylabel('Coefficient')
axes[1].set_title('OLS vs Gradient Descent Coefficients')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_gradient_descent.pdf', 'Gradient descent convergence and coefficient comparison with OLS.')
\end{pycode}
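The behaviour of the different learning rates has a standard explanation: for the mean-squared-error loss the gradient is Lipschitz with constant $L = \tfrac{2}{n}\lambda_{\max}(X^T X)$, and full-batch gradient descent is only guaranteed to converge for step sizes below $2/L$. The short sketch below (an illustration added here; \texttt{lam\_max} and \texttt{lr\_max} are new names, and the intercept column is ignored, which changes little for centered features) computes this threshold for the training design. Step sizes below the threshold converge, increasingly slowly or with oscillation as they approach it, while larger ones diverge.

\begin{pycode}
# Stability threshold for full-batch gradient descent on the MSE loss:
# gradient Lipschitz constant L = 2 * lambda_max(X^T X) / n, stable for lr < 2/L.
lam_max = np.linalg.eigvalsh(X_train_scaled.T @ X_train_scaled / len(X_train_scaled)).max()
lr_max = 2.0 / (2.0 * lam_max)
\end{pycode}

Largest stable learning rate for this dataset: approximately \py{f"{lr_max:.2f}"}.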
\section{Ridge Regression}
\begin{pycode}
class RidgeRegression:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        n_features = X.shape[1]
        X_b = np.column_stack([np.ones(len(X)), X])
        I = np.eye(n_features + 1)
        I[0, 0] = 0  # Don't regularize intercept
        coef = np.linalg.solve(X_b.T @ X_b + self.alpha * I, X_b.T @ y)
        self.intercept_ = coef[0]
        self.coef_ = coef[1:]
        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

    def score(self, X, y):
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        return 1 - ss_res / ss_tot

# Ridge path
alphas = np.logspace(-3, 3, 50)
ridge_coefs = []
ridge_scores = []

for alpha in alphas:
    ridge = RidgeRegression(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    ridge_coefs.append(ridge.coef_)
    ridge_scores.append(ridge.score(X_test_scaled, y_test))

ridge_coefs = np.array(ridge_coefs)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Coefficient path
for i in range(n_features):
    axes[0].plot(alphas, ridge_coefs[:, i], label=f'X{i+1}', linewidth=1.5)
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha (Regularization)')
axes[0].set_ylabel('Coefficient Value')
axes[0].set_title('Ridge Regression Coefficient Path')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Validation curve
axes[1].plot(alphas, ridge_scores, 'b-o', linewidth=1.5, markersize=4)
best_alpha_idx = np.argmax(ridge_scores)
best_alpha = alphas[best_alpha_idx]
axes[1].axvline(best_alpha, color='r', linestyle='--', label=f'Best alpha={best_alpha:.3f}')
axes[1].set_xscale('log')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Test R2 Score')
axes[1].set_title('Ridge Regression Validation Curve')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_ridge.pdf', 'Ridge regression coefficient path and validation curve.')

best_ridge = RidgeRegression(alpha=best_alpha)
best_ridge.fit(X_train_scaled, y_train)
ridge_test_r2 = best_ridge.score(X_test_scaled, y_test)
\end{pycode}

Best Ridge alpha: \py{f"{best_alpha:.3f}"} with test $R^2 = \py{f"{ridge_test_r2:.3f}"}$.
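The validation curve above selects $\alpha$ directly on the test set, which reuses the held-out data for tuning. A cross-validation protocol keeps the test set untouched until the final comparison; the sketch below is an illustrative addition (\texttt{k\_folds}, \texttt{fold\_ids}, \texttt{ridge\_cv\_scores}, and \texttt{cv\_alpha} are new names, and five folds is an arbitrary choice) that re-selects the Ridge penalty on training folds only.

\begin{pycode}
# 5-fold cross-validation on the training set for the Ridge penalty (sketch).
k_folds = 5
fold_ids = np.arange(len(X_train_scaled)) % k_folds
ridge_cv_scores = []
for alpha in alphas:
    fold_scores = []
    for k in range(k_folds):
        tr, va = fold_ids != k, fold_ids == k
        model = RidgeRegression(alpha=alpha)
        model.fit(X_train_scaled[tr], y_train[tr])
        fold_scores.append(model.score(X_train_scaled[va], y_train[va]))
    ridge_cv_scores.append(np.mean(fold_scores))
cv_alpha = alphas[int(np.argmax(ridge_cv_scores))]
\end{pycode}

Cross-validated choice: $\alpha = \py{f"{cv_alpha:.3f}"}$, to be compared with the test-set choice of $\alpha = \py{f"{best_alpha:.3f}"}$ above.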
\section{Lasso Regression}
\begin{pycode}
class LassoRegression:
    def __init__(self, alpha=1.0, n_iterations=1000):
        self.alpha = alpha
        self.n_iter = n_iterations
        self.coef_ = None
        self.intercept_ = None

    def _soft_threshold(self, x, threshold):
        return np.sign(x) * np.maximum(np.abs(x) - threshold, 0)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.coef_ = np.zeros(n_features)
        self.intercept_ = np.mean(y)

        # Coordinate descent
        for _ in range(self.n_iter):
            for j in range(n_features):
                residual = y - self.intercept_ - X @ self.coef_ + X[:, j] * self.coef_[j]
                rho = X[:, j] @ residual
                z = np.sum(X[:, j]**2)
                self.coef_[j] = self._soft_threshold(rho, self.alpha * n_samples / 2) / z

            self.intercept_ = np.mean(y - X @ self.coef_)

        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

    def score(self, X, y):
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        return 1 - ss_res / ss_tot

# Lasso path
lasso_alphas = np.logspace(-4, 0, 50)
lasso_coefs = []
lasso_scores = []

for alpha in lasso_alphas:
    lasso = LassoRegression(alpha=alpha, n_iterations=500)
    lasso.fit(X_train_scaled, y_train)
    lasso_coefs.append(lasso.coef_)
    lasso_scores.append(lasso.score(X_test_scaled, y_test))

lasso_coefs = np.array(lasso_coefs)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Coefficient path
for i in range(n_features):
    axes[0].plot(lasso_alphas, lasso_coefs[:, i], label=f'X{i+1}', linewidth=1.5)
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha (Regularization)')
axes[0].set_ylabel('Coefficient Value')
axes[0].set_title('Lasso Regression Coefficient Path')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Validation curve
axes[1].plot(lasso_alphas, lasso_scores, 'g-s', linewidth=1.5, markersize=4)
best_lasso_idx = np.argmax(lasso_scores)
best_lasso_alpha = lasso_alphas[best_lasso_idx]
axes[1].axvline(best_lasso_alpha, color='r', linestyle='--', label=f'Best alpha={best_lasso_alpha:.4f}')
axes[1].set_xscale('log')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Test R2 Score')
axes[1].set_title('Lasso Regression Validation Curve')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_lasso.pdf', 'Lasso regression coefficient path showing feature selection.')

best_lasso = LassoRegression(alpha=best_lasso_alpha, n_iterations=500)
best_lasso.fit(X_train_scaled, y_train)
lasso_test_r2 = best_lasso.score(X_test_scaled, y_test)
n_selected = np.sum(np.abs(best_lasso.coef_) > 0.01)
\end{pycode}

Best Lasso alpha: \py{f"{best_lasso_alpha:.4f}"} with test $R^2 = \py{f"{lasso_test_r2:.3f}"}$. Selected \py{n_selected} features.

\section{Model Comparison}
\begin{pycode}
# Compare all methods
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Coefficient comparison
methods = ['True', 'OLS', 'Ridge', 'Lasso']
coefs = [true_coefs, ols.coef_, best_ridge.coef_, best_lasso.coef_]

x = np.arange(n_features)
width = 0.2

for i, (method, coef) in enumerate(zip(methods, coefs)):
    axes[0].bar(x + i*width, coef, width, label=method, alpha=0.8)

axes[0].set_xticks(x + 1.5*width)
axes[0].set_xticklabels([f'X{i+1}' for i in range(n_features)])
axes[0].set_xlabel('Feature')
axes[0].set_ylabel('Coefficient')
axes[0].set_title('Coefficient Comparison Across Methods')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Performance comparison
methods_names = ['OLS', 'Ridge', 'Lasso']
test_scores = [test_r2, ridge_test_r2, lasso_test_r2]
train_scores = [train_r2, best_ridge.score(X_train_scaled, y_train),
                best_lasso.score(X_train_scaled, y_train)]

x = np.arange(len(methods_names))
width = 0.35
axes[1].bar(x - width/2, train_scores, width, label='Train', alpha=0.8)
axes[1].bar(x + width/2, test_scores, width, label='Test', alpha=0.8)
axes[1].set_xticks(x)
axes[1].set_xticklabels(methods_names)
axes[1].set_ylabel('R2 Score')
axes[1].set_title('Model Performance Comparison')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_comparison.pdf', 'Comparison of OLS, Ridge, and Lasso regression methods.')
\end{pycode}

\section{Results Summary}
\begin{table}[H]
\centering
\caption{Regression Model Performance}
\begin{tabular}{lccc}
\toprule
Model & Train $R^2$ & Test $R^2$ & Hyperparameter \\
\midrule
OLS & \py{f"{train_r2:.3f}"} & \py{f"{test_r2:.3f}"} & - \\
Ridge & \py{f"{best_ridge.score(X_train_scaled, y_train):.3f}"} & \py{f"{ridge_test_r2:.3f}"} & $\alpha = \py{f"{best_alpha:.3f}"}$ \\
Lasso & \py{f"{best_lasso.score(X_train_scaled, y_train):.3f}"} & \py{f"{lasso_test_r2:.3f}"} & $\alpha = \py{f"{best_lasso_alpha:.4f}"}$ \\
\bottomrule
\end{tabular}
\end{table}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Coefficient Estimates}')
print(r'\begin{tabular}{ccccc}')
print(r'\toprule')
print(r'Feature & True & OLS & Ridge & Lasso \\')
print(r'\midrule')
for i in range(n_features):
    print(f'X{i+1} & {true_coefs[i]:.2f} & {ols.coef_[i]:.2f} & {best_ridge.coef_[i]:.2f} & {best_lasso.coef_[i]:.2f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Conclusion}
This analysis demonstrated:
\begin{itemize}
\item OLS as the baseline method with a closed-form solution
\item Gradient descent as an iterative optimization alternative
\item Ridge regression for handling multicollinearity (L2 penalty)
\item Lasso regression for feature selection (L1 penalty)
\item Hyperparameter tuning via validation curves
\end{itemize}

The Lasso successfully identified the relevant features and drove the irrelevant coefficients toward zero, demonstrating its utility for obtaining sparse solutions.

\end{document}