Path: blob/main/latex-templates/templates/machine-learning/linear_regression.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{Linear Regression: OLS, Gradient Descent, and Regularization}
\author{Machine Learning Foundations}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents a comprehensive analysis of linear regression methods including ordinary least squares (OLS), gradient descent optimization, and regularization techniques (Ridge and Lasso). We examine model diagnostics, multicollinearity detection, and cross-validation for hyperparameter tuning.
\end{abstract}

\section{Introduction}
Linear regression models the relationship between features $X$ and target $y$:
\begin{equation}
y = X\beta + \epsilon, \quad \epsilon \sim \mathcal{N}(0, \sigma^2 I)
\end{equation}

\textbf{OLS Solution:}
\begin{equation}
\hat{\beta}_{OLS} = (X^T X)^{-1} X^T y
\end{equation}

\textbf{Ridge Regression:}
\begin{equation}
\hat{\beta}_{Ridge} = (X^T X + \lambda I)^{-1} X^T y
\end{equation}

\textbf{Lasso Regression:}
\begin{equation}
\hat{\beta}_{Lasso} = \arg\min_\beta \|y - X\beta\|_2^2 + \lambda \|\beta\|_1
\end{equation}
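For completeness, the OLS closed form follows from minimizing the residual sum of squares: setting the gradient to zero yields the normal equations (assuming $X^T X$ is invertible, i.e.\ the features are not perfectly collinear):
\begin{equation}
\nabla_\beta \|y - X\beta\|_2^2 = -2X^T y + 2X^T X \beta = 0
\quad\Longrightarrow\quad
X^T X \hat{\beta} = X^T y .
\end{equation}
Ridge regression adds $\lambda\|\beta\|_2^2$ to the same objective, which simply adds $\lambda I$ to $X^T X$ in the normal equations and keeps the system well-posed even when $X^T X$ is ill-conditioned. The Lasso's $\|\beta\|_1$ penalty admits no closed form and is solved iteratively (coordinate descent, below).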
\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Data Generation}
\begin{pycode}
# Generate regression dataset with known coefficients
n_samples = 200
n_features = 5
true_coefs = np.array([3.0, -2.0, 1.5, 0, 0])  # Last two are irrelevant

# Generate features with some correlation
X = np.random.randn(n_samples, n_features)
X[:, 1] = X[:, 0] * 0.5 + X[:, 1] * 0.5  # Introduce correlation

# Generate target with noise
y = X @ true_coefs + np.random.normal(0, 1, n_samples)

# Split data
n_train = 150
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

# Standardize features
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)
X_train_scaled = (X_train - X_mean) / X_std
X_test_scaled = (X_test - X_mean) / X_std

# Plot data overview
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# Feature distributions
for i in range(n_features):
    axes[0].hist(X_train[:, i], bins=20, alpha=0.5, label=f'X{i+1}')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Feature Distributions')
axes[0].legend()

# Correlation matrix
corr = np.corrcoef(X_train.T)
im = axes[1].imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1)
axes[1].set_xticks(range(n_features))
axes[1].set_yticks(range(n_features))
axes[1].set_xticklabels([f'X{i+1}' for i in range(n_features)])
axes[1].set_yticklabels([f'X{i+1}' for i in range(n_features)])
axes[1].set_title('Feature Correlation')
plt.colorbar(im, ax=axes[1])

# Target distribution
axes[2].hist(y_train, bins=20, color='steelblue', edgecolor='black', alpha=0.7)
axes[2].set_xlabel('y')
axes[2].set_ylabel('Frequency')
axes[2].set_title('Target Distribution')

plt.tight_layout()
save_plot('lr_data.pdf', 'Dataset overview showing feature distributions and correlations.')
\end{pycode}

Dataset: $n = \py{n_samples}$ samples, $p = \py{n_features}$ features.
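Since $X_2$ is constructed as a mix of $X_1$ and noise, the design matrix is mildly collinear. As a quick multicollinearity check, the sketch below (a minimal NumPy addition; \texttt{vifs} and \texttt{cond\_number} are new names introduced only for this illustration) reports variance inflation factors (VIFs), taken from the diagonal of the inverse correlation matrix, and the condition number of the scaled design. VIFs near 1 indicate little collinearity; values above roughly 5--10 are commonly read as problematic.

\begin{pycode}
# Multicollinearity diagnostics (sketch): for standardized features the VIF of
# feature j equals the j-th diagonal entry of the inverse correlation matrix.
corr_train = np.corrcoef(X_train_scaled.T)
vifs = np.diag(np.linalg.inv(corr_train))
cond_number = np.linalg.cond(X_train_scaled)

print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Variance Inflation Factors}')
print(r'\begin{tabular}{lc}')
print(r'\toprule')
print(r'Feature & VIF \\')
print(r'\midrule')
for i in range(n_features):
    print(f'X{i+1} & {vifs[i]:.2f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

Condition number of the scaled design matrix: \py{f"{cond_number:.1f}"}.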
\section{Ordinary Least Squares}
\begin{pycode}
class LinearRegression:
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        # Add intercept
        X_b = np.column_stack([np.ones(len(X)), X])
        # OLS solution
        self.coef_ = np.linalg.lstsq(X_b, y, rcond=None)[0]
        self.intercept_ = self.coef_[0]
        self.coef_ = self.coef_[1:]
        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

    def score(self, X, y):
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        return 1 - ss_res / ss_tot

# Fit OLS model
ols = LinearRegression()
ols.fit(X_train_scaled, y_train)

y_train_pred = ols.predict(X_train_scaled)
y_test_pred = ols.predict(X_test_scaled)

train_r2 = ols.score(X_train_scaled, y_train)
test_r2 = ols.score(X_test_scaled, y_test)
train_mse = np.mean((y_train - y_train_pred)**2)
test_mse = np.mean((y_test - y_test_pred)**2)

# Residual analysis
residuals = y_train - y_train_pred

fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Predicted vs Actual
axes[0, 0].scatter(y_train, y_train_pred, alpha=0.5, s=30)
lims = [min(y_train.min(), y_train_pred.min()), max(y_train.max(), y_train_pred.max())]
axes[0, 0].plot(lims, lims, 'r--', linewidth=2)
axes[0, 0].set_xlabel('Actual')
axes[0, 0].set_ylabel('Predicted')
axes[0, 0].set_title(f'Predicted vs Actual (R2 = {train_r2:.3f})')
axes[0, 0].grid(True, alpha=0.3)

# Residuals vs Fitted
axes[0, 1].scatter(y_train_pred, residuals, alpha=0.5, s=30)
axes[0, 1].axhline(0, color='r', linestyle='--')
axes[0, 1].set_xlabel('Fitted Values')
axes[0, 1].set_ylabel('Residuals')
axes[0, 1].set_title('Residuals vs Fitted')
axes[0, 1].grid(True, alpha=0.3)

# Q-Q plot
stats.probplot(residuals, dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Normal Q-Q Plot')
axes[1, 0].grid(True, alpha=0.3)

# Residual histogram
axes[1, 1].hist(residuals, bins=20, density=True, alpha=0.7, color='steelblue', edgecolor='black')
x_norm = np.linspace(residuals.min(), residuals.max(), 100)
axes[1, 1].plot(x_norm, stats.norm.pdf(x_norm, 0, np.std(residuals)), 'r-', linewidth=2)
axes[1, 1].set_xlabel('Residual')
axes[1, 1].set_ylabel('Density')
axes[1, 1].set_title('Residual Distribution')

plt.tight_layout()
save_plot('lr_diagnostics.pdf', 'OLS regression diagnostics showing residual analysis.')
\end{pycode}

OLS Performance: Train $R^2 = \py{f"{train_r2:.3f}"}$, Test $R^2 = \py{f"{test_r2:.3f}"}$.

\section{Gradient Descent}
\begin{pycode}
class GradientDescentRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.coef_ = None
        self.intercept_ = None
        self.loss_history = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.coef_ = np.zeros(n_features)
        self.intercept_ = 0
        self.loss_history = []

        for _ in range(self.n_iter):
            y_pred = X @ self.coef_ + self.intercept_
            error = y_pred - y
            loss = np.mean(error**2)
            self.loss_history.append(loss)

            # Gradients
            grad_coef = (2/n_samples) * X.T @ error
            grad_intercept = (2/n_samples) * np.sum(error)

            # Update
            self.coef_ -= self.lr * grad_coef
            self.intercept_ -= self.lr * grad_intercept

        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

# Fit with gradient descent
gd = GradientDescentRegression(learning_rate=0.1, n_iterations=500)
gd.fit(X_train_scaled, y_train)

# Compare learning rates
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

learning_rates = [0.001, 0.01, 0.1, 0.5]
for lr in learning_rates:
    gd_temp = GradientDescentRegression(learning_rate=lr, n_iterations=200)
    gd_temp.fit(X_train_scaled, y_train)
    axes[0].plot(gd_temp.loss_history, label=f'lr={lr}', linewidth=1.5)

axes[0].set_xlabel('Iteration')
axes[0].set_ylabel('MSE Loss')
axes[0].set_title('Gradient Descent Convergence')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].set_yscale('log')

# Coefficient comparison
x_pos = np.arange(n_features)
width = 0.35
axes[1].bar(x_pos - width/2, ols.coef_, width, label='OLS', alpha=0.8)
axes[1].bar(x_pos + width/2, gd.coef_, width, label='GD', alpha=0.8)
axes[1].set_xticks(x_pos)
axes[1].set_xticklabels([f'X{i+1}' for i in range(n_features)])
axes[1].set_xlabel('Feature')
axes[1].set_ylabel('Coefficient')
axes[1].set_title('OLS vs Gradient Descent Coefficients')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_gradient_descent.pdf', 'Gradient descent convergence and coefficient comparison with OLS.')
\end{pycode}
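The behaviour of the different learning rates has a standard explanation: for the mean-squared-error loss the gradient is Lipschitz with constant $L = \tfrac{2}{n}\lambda_{\max}(X^T X)$, and full-batch gradient descent is only guaranteed to converge for step sizes below $2/L$. The short sketch below (an illustration added here; \texttt{lam\_max} and \texttt{lr\_max} are new names, and the intercept column is ignored, which changes little for centered features) computes this threshold for the training design. Step sizes below the threshold converge, increasingly slowly or with oscillation as they approach it, while larger ones diverge.

\begin{pycode}
# Stability threshold for full-batch gradient descent on the MSE loss:
# gradient Lipschitz constant L = 2 * lambda_max(X^T X) / n, stable for lr < 2/L.
lam_max = np.linalg.eigvalsh(X_train_scaled.T @ X_train_scaled / len(X_train_scaled)).max()
lr_max = 2.0 / (2.0 * lam_max)
\end{pycode}

Largest stable learning rate for this dataset: approximately \py{f"{lr_max:.2f}"}.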
\section{Ridge Regression}
\begin{pycode}
class RidgeRegression:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        n_features = X.shape[1]
        X_b = np.column_stack([np.ones(len(X)), X])
        I = np.eye(n_features + 1)
        I[0, 0] = 0  # Don't regularize intercept
        coef = np.linalg.solve(X_b.T @ X_b + self.alpha * I, X_b.T @ y)
        self.intercept_ = coef[0]
        self.coef_ = coef[1:]
        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

    def score(self, X, y):
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        return 1 - ss_res / ss_tot

# Ridge path
alphas = np.logspace(-3, 3, 50)
ridge_coefs = []
ridge_scores = []

for alpha in alphas:
    ridge = RidgeRegression(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    ridge_coefs.append(ridge.coef_)
    ridge_scores.append(ridge.score(X_test_scaled, y_test))

ridge_coefs = np.array(ridge_coefs)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Coefficient path
for i in range(n_features):
    axes[0].plot(alphas, ridge_coefs[:, i], label=f'X{i+1}', linewidth=1.5)
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha (Regularization)')
axes[0].set_ylabel('Coefficient Value')
axes[0].set_title('Ridge Regression Coefficient Path')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Validation curve
axes[1].plot(alphas, ridge_scores, 'b-o', linewidth=1.5, markersize=4)
best_alpha_idx = np.argmax(ridge_scores)
best_alpha = alphas[best_alpha_idx]
axes[1].axvline(best_alpha, color='r', linestyle='--', label=f'Best alpha={best_alpha:.3f}')
axes[1].set_xscale('log')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Test R2 Score')
axes[1].set_title('Ridge Regression Validation Curve')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_ridge.pdf', 'Ridge regression coefficient path and validation curve.')

best_ridge = RidgeRegression(alpha=best_alpha)
best_ridge.fit(X_train_scaled, y_train)
ridge_test_r2 = best_ridge.score(X_test_scaled, y_test)
\end{pycode}

Best Ridge alpha: \py{f"{best_alpha:.3f}"} with test $R^2 = \py{f"{ridge_test_r2:.3f}"}$.
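The validation curve above selects $\alpha$ directly on the test set, which reuses the held-out data for tuning. A cross-validation protocol keeps the test set untouched until the final comparison; the sketch below is an illustrative addition (\texttt{k\_folds}, \texttt{fold\_ids}, \texttt{ridge\_cv\_scores}, and \texttt{cv\_alpha} are new names, and five folds is an arbitrary choice) that re-selects the Ridge penalty on training folds only.

\begin{pycode}
# 5-fold cross-validation on the training set for the Ridge penalty (sketch).
k_folds = 5
fold_ids = np.arange(len(X_train_scaled)) % k_folds
ridge_cv_scores = []
for alpha in alphas:
    fold_scores = []
    for k in range(k_folds):
        tr, va = fold_ids != k, fold_ids == k
        model = RidgeRegression(alpha=alpha)
        model.fit(X_train_scaled[tr], y_train[tr])
        fold_scores.append(model.score(X_train_scaled[va], y_train[va]))
    ridge_cv_scores.append(np.mean(fold_scores))
cv_alpha = alphas[int(np.argmax(ridge_cv_scores))]
\end{pycode}

Cross-validated choice: $\alpha = \py{f"{cv_alpha:.3f}"}$, to be compared with the test-set choice of $\alpha = \py{f"{best_alpha:.3f}"}$ above.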
\section{Lasso Regression}
\begin{pycode}
class LassoRegression:
    def __init__(self, alpha=1.0, n_iterations=1000):
        self.alpha = alpha
        self.n_iter = n_iterations
        self.coef_ = None
        self.intercept_ = None

    def _soft_threshold(self, x, threshold):
        return np.sign(x) * np.maximum(np.abs(x) - threshold, 0)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.coef_ = np.zeros(n_features)
        self.intercept_ = np.mean(y)

        # Coordinate descent
        for _ in range(self.n_iter):
            for j in range(n_features):
                residual = y - self.intercept_ - X @ self.coef_ + X[:, j] * self.coef_[j]
                rho = X[:, j] @ residual
                z = np.sum(X[:, j]**2)
                self.coef_[j] = self._soft_threshold(rho, self.alpha * n_samples / 2) / z

            self.intercept_ = np.mean(y - X @ self.coef_)

        return self

    def predict(self, X):
        return X @ self.coef_ + self.intercept_

    def score(self, X, y):
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        return 1 - ss_res / ss_tot

# Lasso path
lasso_alphas = np.logspace(-4, 0, 50)
lasso_coefs = []
lasso_scores = []

for alpha in lasso_alphas:
    lasso = LassoRegression(alpha=alpha, n_iterations=500)
    lasso.fit(X_train_scaled, y_train)
    lasso_coefs.append(lasso.coef_)
    lasso_scores.append(lasso.score(X_test_scaled, y_test))

lasso_coefs = np.array(lasso_coefs)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Coefficient path
for i in range(n_features):
    axes[0].plot(lasso_alphas, lasso_coefs[:, i], label=f'X{i+1}', linewidth=1.5)
axes[0].set_xscale('log')
axes[0].set_xlabel('Alpha (Regularization)')
axes[0].set_ylabel('Coefficient Value')
axes[0].set_title('Lasso Regression Coefficient Path')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Validation curve
axes[1].plot(lasso_alphas, lasso_scores, 'g-s', linewidth=1.5, markersize=4)
best_lasso_idx = np.argmax(lasso_scores)
best_lasso_alpha = lasso_alphas[best_lasso_idx]
axes[1].axvline(best_lasso_alpha, color='r', linestyle='--', label=f'Best alpha={best_lasso_alpha:.4f}')
axes[1].set_xscale('log')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Test R2 Score')
axes[1].set_title('Lasso Regression Validation Curve')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_lasso.pdf', 'Lasso regression coefficient path showing feature selection.')

best_lasso = LassoRegression(alpha=best_lasso_alpha, n_iterations=500)
best_lasso.fit(X_train_scaled, y_train)
lasso_test_r2 = best_lasso.score(X_test_scaled, y_test)
n_selected = np.sum(np.abs(best_lasso.coef_) > 0.01)
\end{pycode}

Best Lasso alpha: \py{f"{best_lasso_alpha:.4f}"} with test $R^2 = \py{f"{lasso_test_r2:.3f}"}$. Selected \py{n_selected} features.

\section{Model Comparison}
\begin{pycode}
# Compare all methods
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Coefficient comparison
methods = ['True', 'OLS', 'Ridge', 'Lasso']
coefs = [true_coefs, ols.coef_, best_ridge.coef_, best_lasso.coef_]

x = np.arange(n_features)
width = 0.2

for i, (method, coef) in enumerate(zip(methods, coefs)):
    axes[0].bar(x + i*width, coef, width, label=method, alpha=0.8)

axes[0].set_xticks(x + 1.5*width)
axes[0].set_xticklabels([f'X{i+1}' for i in range(n_features)])
axes[0].set_xlabel('Feature')
axes[0].set_ylabel('Coefficient')
axes[0].set_title('Coefficient Comparison Across Methods')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Performance comparison
methods_names = ['OLS', 'Ridge', 'Lasso']
test_scores = [test_r2, ridge_test_r2, lasso_test_r2]
train_scores = [train_r2, best_ridge.score(X_train_scaled, y_train),
                best_lasso.score(X_train_scaled, y_train)]

x = np.arange(len(methods_names))
width = 0.35
axes[1].bar(x - width/2, train_scores, width, label='Train', alpha=0.8)
axes[1].bar(x + width/2, test_scores, width, label='Test', alpha=0.8)
axes[1].set_xticks(x)
axes[1].set_xticklabels(methods_names)
axes[1].set_ylabel('R2 Score')
axes[1].set_title('Model Performance Comparison')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('lr_comparison.pdf', 'Comparison of OLS, Ridge, and Lasso regression methods.')
\end{pycode}

\section{Results Summary}
\begin{table}[H]
\centering
\caption{Regression Model Performance}
\begin{tabular}{lccc}
\toprule
Model & Train $R^2$ & Test $R^2$ & Hyperparameter \\
\midrule
OLS & \py{f"{train_r2:.3f}"} & \py{f"{test_r2:.3f}"} & - \\
Ridge & \py{f"{best_ridge.score(X_train_scaled, y_train):.3f}"} & \py{f"{ridge_test_r2:.3f}"} & $\alpha = \py{f"{best_alpha:.3f}"}$ \\
Lasso & \py{f"{best_lasso.score(X_train_scaled, y_train):.3f}"} & \py{f"{lasso_test_r2:.3f}"} & $\alpha = \py{f"{best_lasso_alpha:.4f}"}$ \\
\bottomrule
\end{tabular}
\end{table}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Coefficient Estimates}')
print(r'\begin{tabular}{ccccc}')
print(r'\toprule')
print(r'Feature & True & OLS & Ridge & Lasso \\')
print(r'\midrule')
for i in range(n_features):
    print(f'X{i+1} & {true_coefs[i]:.2f} & {ols.coef_[i]:.2f} & {best_ridge.coef_[i]:.2f} & {best_lasso.coef_[i]:.2f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Conclusion}
This analysis demonstrated:
\begin{itemize}
\item OLS as the baseline method with a closed-form solution
\item Gradient descent as an iterative optimization alternative
\item Ridge regression for handling multicollinearity (L2 penalty)
\item Lasso regression for feature selection (L1 penalty)
\item Hyperparameter tuning via validation curves
\end{itemize}

The Lasso successfully identified the relevant features and drove the irrelevant coefficients toward zero, demonstrating its utility for obtaining sparse solutions.

\end{document}