Path: blob/main/latex-templates/templates/other/optimization.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{siunitx}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{float}
\usepackage[makestderr]{pythontex}

\title{Optimization: Gradient Descent Methods and Convergence Analysis}
\author{Computational Science Templates}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
Optimization algorithms find minima of objective functions and are fundamental to machine learning, scientific computing, and engineering design. This document demonstrates gradient descent variants (vanilla, momentum, RMSprop, Adam), Newton's method, quasi-Newton methods (BFGS), and constrained optimization. Using PythonTeX, we compare convergence behavior on standard test functions and analyze the effects of hyperparameters on optimization performance.
\end{abstract}

\section{Introduction}
Optimization algorithms find minima of objective functions. This analysis compares gradient descent variants on test functions, demonstrating convergence behavior, hyperparameter sensitivity, and the tradeoffs between different optimization strategies. Applications span machine learning (neural network training), scientific computing (parameter estimation), and engineering (design optimization).

\section{Mathematical Framework}

\subsection{First-Order Methods}
Gradient descent update:
\begin{equation}
x_{k+1} = x_k - \alpha \nabla f(x_k)
\end{equation}

Momentum update (heavy ball method):
\begin{equation}
v_{k+1} = \beta v_k + \alpha \nabla f(x_k), \quad x_{k+1} = x_k - v_{k+1}
\end{equation}

Nesterov accelerated gradient:
\begin{equation}
v_{k+1} = \beta v_k + \alpha \nabla f(x_k - \beta v_k), \quad x_{k+1} = x_k - v_{k+1}
\end{equation}

\subsection{Adaptive Learning Rates}
RMSprop:
\begin{equation}
s_k = \rho s_{k-1} + (1-\rho) (\nabla f(x_k))^2, \quad x_{k+1} = x_k - \frac{\alpha}{\sqrt{s_k + \epsilon}} \nabla f(x_k)
\end{equation}

Adam (Adaptive Moment Estimation):
\begin{align}
m_k &= \beta_1 m_{k-1} + (1-\beta_1) \nabla f(x_k) \\
v_k &= \beta_2 v_{k-1} + (1-\beta_2) (\nabla f(x_k))^2 \\
\hat{m}_k &= \frac{m_k}{1-\beta_1^k}, \quad \hat{v}_k = \frac{v_k}{1-\beta_2^k} \\
x_{k+1} &= x_k - \frac{\alpha}{\sqrt{\hat{v}_k} + \epsilon} \hat{m}_k
\end{align}

\subsection{Second-Order Methods}
Newton's method:
\begin{equation}
x_{k+1} = x_k - [H_f(x_k)]^{-1} \nabla f(x_k)
\end{equation}
where $H_f$ is the Hessian matrix.

BFGS (quasi-Newton):
\begin{equation}
x_{k+1} = x_k - \alpha_k B_k^{-1} \nabla f(x_k)
\end{equation}
where $B_k$ approximates the Hessian using gradient differences.
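
Concretely, the curvature information enters through the difference vectors $s_k = x_{k+1} - x_k$ and $y_k = \nabla f(x_{k+1}) - \nabla f(x_k)$. The BFGS implementation later in this document maintains the inverse approximation $H_k \approx B_k^{-1}$ directly and updates it with the standard formula
\begin{equation}
H_{k+1} = \left(I - \rho_k s_k y_k^{\top}\right) H_k \left(I - \rho_k y_k s_k^{\top}\right) + \rho_k s_k s_k^{\top},
\qquad \rho_k = \frac{1}{y_k^{\top} s_k},
\end{equation}
which preserves symmetry and positive definiteness as long as the curvature condition $y_k^{\top} s_k > 0$ holds; the code accordingly skips the update whenever this quantity is not sufficiently positive.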
\subsection{Convergence Rates}
For $\mu$-strongly convex functions with $L$-Lipschitz gradients:
\begin{itemize}
\item Gradient descent: $\mathcal{O}((1 - \mu/L)^k)$
\item Momentum: $\mathcal{O}((1 - \sqrt{\mu/L})^k)$
\item Newton's method: quadratic convergence near optimum
\end{itemize}
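
For the quadratic model problems analyzed later, the gradient-descent rate can be made explicit. With $f(x) = \tfrac{1}{2} x^{\top} A x$ and the eigenvalues of $A$ contained in $[\mu, L]$, the iteration satisfies $x_{k+1} = (I - \alpha A) x_k$, so the error contracts by at most $\max(|1-\alpha\mu|, |1-\alpha L|)$ per step. This factor is minimized at
\begin{equation}
\alpha^{*} = \frac{2}{\mu + L}, \qquad
\|x_{k+1} - x^{*}\| \le \frac{\kappa - 1}{\kappa + 1}\,\|x_k - x^{*}\|, \qquad \kappa = \frac{L}{\mu},
\end{equation}
which is exactly the step size and theoretical rate used in the condition-number experiments below.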
\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize, minimize_scalar
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')

np.random.seed(42)

def save_plot(filename, caption=""):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.95\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Test Functions}

\begin{pycode}
# Define test functions and their gradients

# Rosenbrock function (banana function)
def rosenbrock(x):
    return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2

def rosenbrock_grad(x):
    dx = -2*(1 - x[0]) - 400*x[0]*(x[1] - x[0]**2)
    dy = 200*(x[1] - x[0]**2)
    return np.array([dx, dy])

def rosenbrock_hess(x):
    h11 = 2 - 400*x[1] + 1200*x[0]**2
    h12 = -400*x[0]
    h22 = 200
    return np.array([[h11, h12], [h12, h22]])

# Beale function
def beale(x):
    term1 = (1.5 - x[0] + x[0]*x[1])**2
    term2 = (2.25 - x[0] + x[0]*x[1]**2)**2
    term3 = (2.625 - x[0] + x[0]*x[1]**3)**2
    return term1 + term2 + term3

def beale_grad(x):
    dx = (2*(1.5 - x[0] + x[0]*x[1])*(-1 + x[1]) +
          2*(2.25 - x[0] + x[0]*x[1]**2)*(-1 + x[1]**2) +
          2*(2.625 - x[0] + x[0]*x[1]**3)*(-1 + x[1]**3))
    dy = (2*(1.5 - x[0] + x[0]*x[1])*x[0] +
          2*(2.25 - x[0] + x[0]*x[1]**2)*2*x[0]*x[1] +
          2*(2.625 - x[0] + x[0]*x[1]**3)*3*x[0]*x[1]**2)
    return np.array([dx, dy])

# Rastrigin function (multimodal)
def rastrigin(x):
    A = 10
    n = len(x)
    return A*n + sum(xi**2 - A*np.cos(2*np.pi*xi) for xi in x)

def rastrigin_grad(x):
    A = 10
    return np.array([2*xi + 2*np.pi*A*np.sin(2*np.pi*xi) for xi in x])

# Quadratic function (for analysis)
def quadratic(x, A=None, b=None):
    if A is None:
        A = np.array([[10, 0], [0, 1]])
    if b is None:
        b = np.zeros(2)
    return 0.5 * x @ A @ x - b @ x

def quadratic_grad(x, A=None, b=None):
    if A is None:
        A = np.array([[10, 0], [0, 1]])
    if b is None:
        b = np.zeros(2)
    return A @ x - b

# Store optimal values
rosenbrock_opt = np.array([1.0, 1.0])
beale_opt = np.array([3.0, 0.5])
rastrigin_opt = np.array([0.0, 0.0])
quadratic_opt = np.array([0.0, 0.0])
\end{pycode}
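
Because every optimizer below consumes these hand-coded derivatives, it is worth confirming that they agree with a numerical approximation. The following minimal check compares each analytic gradient against central differences at a few arbitrary points; the \texttt{central\_diff\_grad} helper is only a verification utility, not one of the methods under study.

\begin{pycode}
# Sanity check: compare analytic gradients against central differences
def central_diff_grad(f, x, h=1e-6):
    """Central-difference approximation of the gradient of f at x."""
    g = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        e = np.zeros_like(x, dtype=float)
        e[i] = h
        g[i] = (f(x + e) - f(x - e)) / (2*h)
    return g

check_points = [np.array([-1.2, 1.0]), np.array([2.0, 1.5]), np.array([0.3, -0.7])]
max_rel_err = 0.0
for f_check, grad_check in [(rosenbrock, rosenbrock_grad),
                            (beale, beale_grad),
                            (rastrigin, rastrigin_grad)]:
    for p in check_points:
        num = central_diff_grad(f_check, p)
        ana = grad_check(p)
        rel_err = np.max(np.abs(num - ana)) / (1.0 + np.max(np.abs(ana)))
        max_rel_err = max(max_rel_err, rel_err)
\end{pycode}

The largest relative discrepancy over the sampled points is \py{f"{max_rel_err:.1e}"}, so the analytic gradients can be trusted in the experiments that follow.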
\section{Gradient Descent Variants}

\begin{pycode}
# Implement optimization algorithms

def gradient_descent(x0, grad_f, alpha=0.001, n_iter=1000):
    """Vanilla gradient descent."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        x = x - alpha * grad_f(x)
        history.append(x.copy())
    return np.array(history)

def momentum_gd(x0, grad_f, alpha=0.001, beta=0.9, n_iter=1000):
    """Gradient descent with momentum."""
    x = x0.copy()
    v = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        v = beta*v + alpha*grad_f(x)
        x = x - v
        history.append(x.copy())
    return np.array(history)

def nesterov_gd(x0, grad_f, alpha=0.001, beta=0.9, n_iter=1000):
    """Nesterov accelerated gradient."""
    x = x0.copy()
    v = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        x_lookahead = x - beta*v
        v = beta*v + alpha*grad_f(x_lookahead)
        x = x - v
        history.append(x.copy())
    return np.array(history)

def rmsprop(x0, grad_f, alpha=0.001, rho=0.9, eps=1e-8, n_iter=1000):
    """RMSprop optimizer."""
    x = x0.copy()
    s = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        g = grad_f(x)
        s = rho*s + (1-rho)*g**2
        x = x - alpha * g / (np.sqrt(s) + eps)
        history.append(x.copy())
    return np.array(history)

def adam(x0, grad_f, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8, n_iter=1000):
    """Adam optimizer."""
    x = x0.copy()
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    history = [x.copy()]
    for t in range(1, n_iter+1):
        g = grad_f(x)
        m = beta1*m + (1-beta1)*g
        v = beta2*v + (1-beta2)*g**2
        m_hat = m / (1 - beta1**t)
        v_hat = v / (1 - beta2**t)
        x = x - alpha * m_hat / (np.sqrt(v_hat) + eps)
        history.append(x.copy())
    return np.array(history)

# Run optimizers on Rosenbrock
x0 = np.array([-1.5, 1.5])
n_iter = 5000

hist_gd = gradient_descent(x0, rosenbrock_grad, alpha=0.001, n_iter=n_iter)
hist_mom = momentum_gd(x0, rosenbrock_grad, alpha=0.001, beta=0.9, n_iter=n_iter)
hist_nest = nesterov_gd(x0, rosenbrock_grad, alpha=0.001, beta=0.9, n_iter=n_iter)
hist_rms = rmsprop(x0, rosenbrock_grad, alpha=0.01, n_iter=n_iter)
hist_adam = adam(x0, rosenbrock_grad, alpha=0.01, n_iter=n_iter)

# Compute objective values
f_gd = [rosenbrock(x) for x in hist_gd]
f_mom = [rosenbrock(x) for x in hist_mom]
f_nest = [rosenbrock(x) for x in hist_nest]
f_rms = [rosenbrock(x) for x in hist_rms]
f_adam = [rosenbrock(x) for x in hist_adam]

# Contour plot of Rosenbrock
x = np.linspace(-2, 2, 100)
y = np.linspace(-1, 3, 100)
X, Y = np.meshgrid(x, y)
Z = (1 - X)**2 + 100*(Y - X**2)**2

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Optimization paths
axes[0, 0].contour(X, Y, Z, levels=np.logspace(0, 3, 20), cmap='viridis', alpha=0.7)
axes[0, 0].plot(hist_gd[:, 0], hist_gd[:, 1], 'b.-', markersize=1, linewidth=0.5, label='GD', alpha=0.7)
axes[0, 0].plot(hist_mom[:, 0], hist_mom[:, 1], 'r.-', markersize=1, linewidth=0.5, label='Momentum', alpha=0.7)
axes[0, 0].plot(hist_adam[:, 0], hist_adam[:, 1], 'g.-', markersize=1, linewidth=0.5, label='Adam', alpha=0.7)
axes[0, 0].plot(1, 1, 'k*', markersize=15, label='Optimum')
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title('Optimization Paths on Rosenbrock')
axes[0, 0].legend(fontsize=8)

# Convergence comparison
axes[0, 1].semilogy(f_gd, 'b-', linewidth=1, label='GD')
axes[0, 1].semilogy(f_mom, 'r-', linewidth=1, label='Momentum')
axes[0, 1].semilogy(f_nest, 'm-', linewidth=1, label='Nesterov')
axes[0, 1].semilogy(f_rms, 'c-', linewidth=1, label='RMSprop')
axes[0, 1].semilogy(f_adam, 'g-', linewidth=1, label='Adam')
axes[0, 1].set_xlabel('Iteration')
axes[0, 1].set_ylabel('$f(x)$')
axes[0, 1].set_title('Convergence Comparison')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# Effect of learning rate
alphas = [0.0001, 0.0005, 0.001, 0.002]
for alpha in alphas:
    hist = gradient_descent(x0, rosenbrock_grad, alpha=alpha, n_iter=2000)
    f_vals = [rosenbrock(x) for x in hist]
    axes[1, 0].semilogy(f_vals, linewidth=1, label=f'$\\alpha = {alpha}$')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$')
axes[1, 0].set_title('Effect of Learning Rate (GD)')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)

# Effect of momentum
betas = [0.0, 0.5, 0.9, 0.99]
for beta in betas:
    hist = momentum_gd(x0, rosenbrock_grad, alpha=0.001, beta=beta, n_iter=2000)
    f_vals = [rosenbrock(x) for x in hist]
    axes[1, 1].semilogy(f_vals, linewidth=1, label=f'$\\beta = {beta}$')
axes[1, 1].set_xlabel('Iteration')
axes[1, 1].set_ylabel('$f(x)$')
axes[1, 1].set_title('Effect of Momentum')
axes[1, 1].legend(fontsize=8)
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_gradient_descent.pdf', 'Gradient descent variants on Rosenbrock function')

# Store final values
final_gd = rosenbrock(hist_gd[-1])
final_mom = rosenbrock(hist_mom[-1])
final_nest = rosenbrock(hist_nest[-1])
final_rms = rosenbrock(hist_rms[-1])
final_adam = rosenbrock(hist_adam[-1])
\end{pycode}

\section{Second-Order Methods}

\begin{pycode}
# Newton's method
def newton_method(x0, grad_f, hess_f, n_iter=100, tol=1e-10):
    """Newton's method with Hessian."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        g = grad_f(x)
        H = hess_f(x)
        try:
            dx = np.linalg.solve(H, -g)
        except np.linalg.LinAlgError:
            break
        x = x + dx
        history.append(x.copy())
        if np.linalg.norm(g) < tol:
            break
    return np.array(history)

# BFGS quasi-Newton
def bfgs(x0, f, grad_f, n_iter=1000, tol=1e-10):
    """BFGS quasi-Newton method."""
    n = len(x0)
    x = x0.copy()
    B = np.eye(n)  # Approximate inverse Hessian
    history = [x.copy()]

    for _ in range(n_iter):
        g = grad_f(x)
        if np.linalg.norm(g) < tol:
            break

        # Search direction
        d = -B @ g

        # Line search (backtracking)
        alpha = 1.0
        c = 0.5
        rho = 0.5
        while f(x + alpha*d) > f(x) + c*alpha*g@d:
            alpha *= rho

        # Update
        s = alpha * d
        x_new = x + s
        y = grad_f(x_new) - g

        # BFGS update
        if y @ s > 1e-10:
            rho_k = 1.0 / (y @ s)
            I = np.eye(n)
            B = (I - rho_k * np.outer(s, y)) @ B @ (I - rho_k * np.outer(y, s)) + rho_k * np.outer(s, s)

        x = x_new
        history.append(x.copy())

    return np.array(history)

# Run second-order methods
x0_newton = np.array([0.5, 0.5])  # Start closer for Newton
hist_newton = newton_method(x0_newton, rosenbrock_grad, rosenbrock_hess, n_iter=100)
hist_bfgs = bfgs(x0, rosenbrock, rosenbrock_grad, n_iter=1000)

f_newton = [rosenbrock(x) for x in hist_newton]
f_bfgs = [rosenbrock(x) for x in hist_bfgs]

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Newton's method path
axes[0, 0].contour(X, Y, Z, levels=np.logspace(0, 3, 20), cmap='viridis', alpha=0.7)
axes[0, 0].plot(hist_newton[:, 0], hist_newton[:, 1], 'ro-', markersize=6, linewidth=1.5, label='Newton')
axes[0, 0].plot(1, 1, 'k*', markersize=15)
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title(f"Newton's Method ({len(hist_newton)-1} iterations)")
axes[0, 0].legend()

# BFGS path
axes[0, 1].contour(X, Y, Z, levels=np.logspace(0, 3, 20), cmap='viridis', alpha=0.7)
axes[0, 1].plot(hist_bfgs[:, 0], hist_bfgs[:, 1], 'go-', markersize=3, linewidth=0.8, label='BFGS')
axes[0, 1].plot(1, 1, 'k*', markersize=15)
axes[0, 1].set_xlabel('$x_1$')
axes[0, 1].set_ylabel('$x_2$')
axes[0, 1].set_title(f'BFGS Method ({len(hist_bfgs)-1} iterations)')
axes[0, 1].legend()

# Convergence comparison (log scale)
axes[1, 0].semilogy(f_newton, 'r-o', markersize=6, linewidth=1.5, label='Newton')
axes[1, 0].semilogy(f_bfgs, 'g-', linewidth=1, label='BFGS')
axes[1, 0].semilogy(f_adam[:500], 'b-', linewidth=1, label='Adam')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$')
axes[1, 0].set_title('Second-Order vs First-Order Convergence')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Convergence rate analysis (distance to optimum)
dist_newton = [np.linalg.norm(x - rosenbrock_opt) for x in hist_newton]
dist_bfgs = [np.linalg.norm(x - rosenbrock_opt) for x in hist_bfgs]
dist_adam = [np.linalg.norm(x - rosenbrock_opt) for x in hist_adam[:500]]

axes[1, 1].semilogy(dist_newton, 'r-o', markersize=6, linewidth=1.5, label='Newton')
axes[1, 1].semilogy(dist_bfgs, 'g-', linewidth=1, label='BFGS')
axes[1, 1].semilogy(dist_adam, 'b-', linewidth=1, label='Adam')
axes[1, 1].set_xlabel('Iteration')
axes[1, 1].set_ylabel('$||x - x^*||$')
axes[1, 1].set_title('Distance to Optimum')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_second_order.pdf', "Second-order methods: Newton and BFGS")

final_newton = rosenbrock(hist_newton[-1])
final_bfgs = rosenbrock(hist_bfgs[-1])
newton_iters = len(hist_newton) - 1
bfgs_iters = len(hist_bfgs) - 1
\end{pycode}
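
As a quick cross-check on the hand-rolled BFGS above, the same problem can be handed to SciPy's reference implementation through \texttt{scipy.optimize.minimize}, which is already imported in the setup block; the snippet below is only a sketch of that comparison.

\begin{pycode}
# Cross-check: SciPy's reference BFGS from the same starting point
res_scipy = minimize(rosenbrock, x0, jac=rosenbrock_grad, method='BFGS')
\end{pycode}

SciPy's BFGS reaches an objective value of \py{f"{res_scipy.fun:.2e}"} after \py{res_scipy.nit} iterations from the same starting point, which provides a baseline for the iteration counts reported above.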
\section{Condition Number and Convergence}

\begin{pycode}
# Analyze effect of condition number on convergence
# Quadratic function f(x) = 0.5 * x'Ax with different condition numbers

def make_quadratic(kappa):
    """Create quadratic with condition number kappa."""
    A = np.array([[kappa, 0], [0, 1]])
    return A

def run_gd_quadratic(A, x0, alpha, n_iter):
    """Run GD on quadratic."""
    history = [x0.copy()]
    x = x0.copy()
    for _ in range(n_iter):
        g = A @ x
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

# Test different condition numbers
kappas = [1, 5, 10, 50, 100]
x0_quad = np.array([5.0, 5.0])
n_iter_quad = 200

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Convergence for different condition numbers
for kappa in kappas:
    A = make_quadratic(kappa)
    # Optimal step size for quadratic
    L = kappa
    mu = 1
    alpha_opt = 2 / (L + mu)

    hist = run_gd_quadratic(A, x0_quad, alpha_opt, n_iter_quad)
    f_vals = [0.5 * x @ A @ x for x in hist]
    axes[0, 0].semilogy(f_vals, linewidth=1.5, label=f'$\\kappa = {kappa}$')

axes[0, 0].set_xlabel('Iteration')
axes[0, 0].set_ylabel('$f(x)$')
axes[0, 0].set_title('Effect of Condition Number (optimal $\\alpha$)')
axes[0, 0].legend(fontsize=8)
axes[0, 0].grid(True, alpha=0.3)

# Theoretical vs actual convergence rate
kappa_range = np.arange(1, 101)
theory_rate = (kappa_range - 1) / (kappa_range + 1)
axes[0, 1].plot(kappa_range, theory_rate, 'b-', linewidth=2, label='$(\\kappa-1)/(\\kappa+1)$')
axes[0, 1].set_xlabel('Condition number $\\kappa$')
axes[0, 1].set_ylabel('Convergence rate')
axes[0, 1].set_title('Theoretical Convergence Rate (GD)')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Effect of step size on ill-conditioned problem
kappa_ill = 100
A_ill = make_quadratic(kappa_ill)
alphas_quad = [0.001, 0.005, 0.01, 0.019, 0.02]

for alpha in alphas_quad:
    hist = run_gd_quadratic(A_ill, x0_quad, alpha, n_iter_quad)
    f_vals = [0.5 * x @ A_ill @ x for x in hist]
    axes[1, 0].semilogy(f_vals, linewidth=1, label=f'$\\alpha = {alpha}$')

axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$')
axes[1, 0].set_title(f'Step Size Sensitivity ($\\kappa = {kappa_ill}$)')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].set_ylim([1e-15, 1e5])

# Compare GD vs momentum on ill-conditioned
A_ill = make_quadratic(100)
hist_gd_ill = run_gd_quadratic(A_ill, x0_quad, 0.01, 500)

# Momentum for quadratic
def run_momentum_quadratic(A, x0, alpha, beta, n_iter):
    x = x0.copy()
    v = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        g = A @ x
        v = beta*v + alpha*g
        x = x - v
        history.append(x.copy())
    return np.array(history)

hist_mom_ill = run_momentum_quadratic(A_ill, x0_quad, 0.01, 0.9, 500)

f_gd_ill = [0.5 * x @ A_ill @ x for x in hist_gd_ill]
f_mom_ill = [0.5 * x @ A_ill @ x for x in hist_mom_ill]

axes[1, 1].semilogy(f_gd_ill, 'b-', linewidth=1.5, label='GD')
axes[1, 1].semilogy(f_mom_ill, 'r-', linewidth=1.5, label='Momentum')
axes[1, 1].set_xlabel('Iteration')
axes[1, 1].set_ylabel('$f(x)$')
axes[1, 1].set_title('GD vs Momentum ($\\kappa = 100$)')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_condition_number.pdf', 'Effect of condition number on convergence')
\end{pycode}
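
The momentum run above uses the generic setting $\alpha = 0.01$, $\beta = 0.9$ rather than problem-specific tuning. For a quadratic with eigenvalues in $[\mu, L]$, the classical heavy-ball choice is
\begin{equation}
\alpha = \frac{4}{(\sqrt{L} + \sqrt{\mu})^2}, \qquad
\beta = \left(\frac{\sqrt{\kappa} - 1}{\sqrt{\kappa} + 1}\right)^{2},
\end{equation}
which attains the accelerated $\sqrt{\kappa}$-type rate listed in the convergence-rate summary: for $\kappa = 100$ the per-step contraction is $9/11 \approx 0.82$, versus $99/101 \approx 0.98$ for plain gradient descent.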
\section{Constrained Optimization}
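
The test problem implemented below is
\begin{equation}
\min_{x \in \mathbb{R}^2} \; (x_1 - 1)^2 + (x_2 - 2)^2
\quad \text{subject to} \quad x_1^2 + x_2^2 \le 1 .
\end{equation}
Because the unconstrained minimizer $(1, 2)$ lies outside the unit disc, the constraint is active at the solution. The KKT stationarity condition $\nabla f(x) + \lambda \nabla g(x) = 0$ gives $x(1 + \lambda) = (1, 2)$, and enforcing $\|x\| = 1$ yields
\begin{equation}
x^{*} = \frac{(1, 2)}{\sqrt{5}}, \qquad \lambda^{*} = \sqrt{5} - 1 \approx 1.236 ,
\end{equation}
which serve as the reference values in the comparison below.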
\begin{pycode}
# Constrained optimization using penalty methods and Lagrange multipliers

# Problem: minimize f(x) subject to g(x) <= 0
# f(x) = (x1-1)^2 + (x2-2)^2
# g(x) = x1^2 + x2^2 - 1 <= 0

def objective_constrained(x):
    return (x[0] - 1)**2 + (x[1] - 2)**2

def objective_grad(x):
    return np.array([2*(x[0] - 1), 2*(x[1] - 2)])

def constraint(x):
    return x[0]**2 + x[1]**2 - 1

def constraint_grad(x):
    return np.array([2*x[0], 2*x[1]])

# Penalty method
def penalty_method(x0, f, g, grad_f, grad_g, rho_init=1, rho_mult=10, n_outer=5, n_inner=500):
    """Solve constrained problem using quadratic penalty."""
    x = x0.copy()
    rho = rho_init
    history = [x.copy()]
    rhos = [rho]

    for _ in range(n_outer):
        # Inner optimization
        for _ in range(n_inner):
            violation = max(0, g(x))
            grad = grad_f(x) + 2*rho*violation*grad_g(x)
            x = x - 0.01 * grad
            history.append(x.copy())

        rho *= rho_mult
        rhos.append(rho)

    return np.array(history), rhos

# Augmented Lagrangian method
def augmented_lagrangian(x0, f, g, grad_f, grad_g, rho=10, n_outer=10, n_inner=100):
    """Augmented Lagrangian method."""
    x = x0.copy()
    lam = 0  # Lagrange multiplier
    history = [x.copy()]
    multipliers = [lam]

    for _ in range(n_outer):
        # Inner optimization
        for _ in range(n_inner):
            violation = g(x)
            grad = grad_f(x) + lam*grad_g(x) + rho*max(0, violation)*grad_g(x)
            x = x - 0.01 * grad
            history.append(x.copy())

        # Update multiplier
        lam = max(0, lam + rho * g(x))
        multipliers.append(lam)

    return np.array(history), multipliers

# Projected gradient descent
def projected_gd(x0, grad_f, project, alpha=0.1, n_iter=500):
    """Projected gradient descent."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        x = x - alpha * grad_f(x)
        x = project(x)  # Project onto feasible set
        history.append(x.copy())
    return np.array(history)

def project_to_ball(x, radius=1):
    """Project onto unit ball."""
    norm = np.linalg.norm(x)
    if norm > radius:
        return radius * x / norm
    return x

# Run methods
x0_con = np.array([0.0, 0.0])
hist_penalty, rhos = penalty_method(x0_con, objective_constrained, constraint,
                                    objective_grad, constraint_grad)
hist_al, multipliers = augmented_lagrangian(x0_con, objective_constrained, constraint,
                                            objective_grad, constraint_grad)
hist_proj = projected_gd(x0_con, objective_grad, project_to_ball, alpha=0.1, n_iter=500)

# Analytical solution
# Optimum on boundary: x* = (1, 2)/sqrt(5)
x_opt_con = np.array([1, 2]) / np.sqrt(5)
f_opt_con = objective_constrained(x_opt_con)

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Contour plot with constraint
theta_c = np.linspace(0, 2*np.pi, 100)
x_circle = np.cos(theta_c)
y_circle = np.sin(theta_c)

x_con = np.linspace(-1.5, 2, 100)
y_con = np.linspace(-1, 3, 100)
X_con, Y_con = np.meshgrid(x_con, y_con)
Z_con = (X_con - 1)**2 + (Y_con - 2)**2

axes[0, 0].contour(X_con, Y_con, Z_con, levels=20, cmap='viridis', alpha=0.7)
axes[0, 0].plot(x_circle, y_circle, 'r-', linewidth=2, label='Constraint')
axes[0, 0].fill(x_circle, y_circle, alpha=0.2, color='gray')
axes[0, 0].plot(hist_penalty[-1, 0], hist_penalty[-1, 1], 'bo', markersize=10, label='Penalty')
axes[0, 0].plot(hist_al[-1, 0], hist_al[-1, 1], 'g^', markersize=10, label='Aug. Lagrangian')
axes[0, 0].plot(hist_proj[-1, 0], hist_proj[-1, 1], 'rs', markersize=10, label='Projected GD')
axes[0, 0].plot(x_opt_con[0], x_opt_con[1], 'k*', markersize=15, label='Optimum')
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title('Constrained Optimization')
axes[0, 0].legend(fontsize=8)
axes[0, 0].set_xlim([-1.5, 2])
axes[0, 0].set_ylim([-1, 3])

# Convergence to optimum
f_penalty = [objective_constrained(x) for x in hist_penalty]
f_al = [objective_constrained(x) for x in hist_al]
f_proj = [objective_constrained(x) for x in hist_proj]

axes[0, 1].plot(f_penalty, 'b-', linewidth=1, label='Penalty', alpha=0.7)
axes[0, 1].plot(f_al, 'g-', linewidth=1, label='Aug. Lagrangian', alpha=0.7)
axes[0, 1].plot(f_proj, 'r-', linewidth=1, label='Projected GD', alpha=0.7)
axes[0, 1].axhline(y=f_opt_con, color='k', linestyle='--', label=f'Optimal = {f_opt_con:.4f}')
axes[0, 1].set_xlabel('Iteration')
axes[0, 1].set_ylabel('$f(x)$')
axes[0, 1].set_title('Objective Value')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# Constraint violation
viol_penalty = [max(0, constraint(x)) for x in hist_penalty]
viol_al = [max(0, constraint(x)) for x in hist_al]
viol_proj = [max(0, constraint(x)) for x in hist_proj]

axes[1, 0].semilogy(viol_penalty, 'b-', linewidth=1, label='Penalty')
axes[1, 0].semilogy(viol_al, 'g-', linewidth=1, label='Aug. Lagrangian')
axes[1, 0].semilogy(viol_proj, 'r-', linewidth=1, label='Projected GD')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('Constraint violation')
axes[1, 0].set_title('Constraint Satisfaction')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)

# Lagrange multiplier evolution
axes[1, 1].plot(multipliers, 'g-o', markersize=6, linewidth=1.5)
axes[1, 1].set_xlabel('Outer iteration')
axes[1, 1].set_ylabel('$\\lambda$')
axes[1, 1].set_title('Lagrange Multiplier Evolution')
axes[1, 1].grid(True, alpha=0.3)

# Theoretical optimal multiplier from the KKT conditions: lambda* = sqrt(5) - 1
lam_opt = np.sqrt(5) - 1
axes[1, 1].axhline(y=lam_opt, color='r', linestyle='--', label=f'$\\lambda^* = {lam_opt:.3f}$')
axes[1, 1].legend()

plt.tight_layout()
save_plot('optimization_constrained.pdf', 'Constrained optimization methods')

final_penalty = objective_constrained(hist_penalty[-1])
final_al = objective_constrained(hist_al[-1])
final_proj = objective_constrained(hist_proj[-1])
\end{pycode}
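
For reference, the same problem can also be solved with SciPy's SLSQP solver through the \texttt{minimize} interface imported earlier; the sketch below passes the inequality in SciPy's $g(x) \ge 0$ convention and is included only as a cross-check against the hand-written methods.

\begin{pycode}
# Cross-check with SciPy's SLSQP (constraint supplied as 1 - x1^2 - x2^2 >= 0)
res_slsqp = minimize(objective_constrained, x0_con, jac=objective_grad, method='SLSQP',
                     constraints=[{'type': 'ineq',
                                   'fun': lambda x: 1 - x[0]**2 - x[1]**2,
                                   'jac': lambda x: np.array([-2*x[0], -2*x[1]])}])
\end{pycode}

SLSQP returns $f(x) = $ \py{f"{res_slsqp.fun:.4f}"} at \py{f"({res_slsqp.x[0]:.3f}, {res_slsqp.x[1]:.3f})"}, to be compared with the analytical optimum \py{f"{f_opt_con:.4f}"}.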
\section{Stochastic Optimization}

\begin{pycode}
# Stochastic gradient descent for noisy gradients

def sgd(x0, grad_f, alpha=0.01, n_iter=1000, noise_std=0.1):
    """Stochastic gradient descent with noisy gradients."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        # Noisy gradient estimate
        g = grad_f(x) + noise_std * np.random.randn(len(x))
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

def sgd_with_decay(x0, grad_f, alpha0=0.1, decay=0.001, n_iter=1000, noise_std=0.5):
    """SGD with learning rate decay."""
    x = x0.copy()
    history = [x.copy()]
    for t in range(1, n_iter+1):
        alpha = alpha0 / (1 + decay * t)
        g = grad_f(x) + noise_std * np.random.randn(len(x))
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

def mini_batch_sgd(x0, grad_f, batch_size=10, alpha=0.01, n_iter=1000, noise_std=0.5):
    """Mini-batch SGD with averaged gradients."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        # Averaging a batch scales the noise standard deviation by 1/sqrt(batch_size)
        g = grad_f(x) + noise_std * np.random.randn(len(x)) / np.sqrt(batch_size)
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

# Run stochastic methods on simple quadratic
A_quad = np.array([[5, 0], [0, 1]])
def quad_grad(x):
    return A_quad @ x

x0_sgd = np.array([5.0, 5.0])
n_iter_sgd = 2000

hist_sgd = sgd(x0_sgd, quad_grad, alpha=0.01, noise_std=1.0, n_iter=n_iter_sgd)
hist_sgd_decay = sgd_with_decay(x0_sgd, quad_grad, alpha0=0.1, decay=0.01, noise_std=1.0, n_iter=n_iter_sgd)
hist_minibatch = mini_batch_sgd(x0_sgd, quad_grad, batch_size=32, alpha=0.05, noise_std=1.0, n_iter=n_iter_sgd)

# Full batch for comparison
hist_full = run_gd_quadratic(A_quad, x0_sgd, 0.1, n_iter_sgd)

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Optimization paths
axes[0, 0].plot(hist_sgd[:500, 0], hist_sgd[:500, 1], 'b-', linewidth=0.3, alpha=0.5, label='SGD')
axes[0, 0].plot(hist_full[:500, 0], hist_full[:500, 1], 'r-', linewidth=1.5, label='Full batch')
axes[0, 0].plot(0, 0, 'k*', markersize=15)
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title('SGD vs Full Batch GD')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Convergence with noise
f_sgd = [0.5 * x @ A_quad @ x for x in hist_sgd]
f_decay = [0.5 * x @ A_quad @ x for x in hist_sgd_decay]
f_mini = [0.5 * x @ A_quad @ x for x in hist_minibatch]
f_full = [0.5 * x @ A_quad @ x for x in hist_full]

axes[0, 1].semilogy(f_sgd, 'b-', linewidth=0.5, alpha=0.5, label='SGD')
axes[0, 1].semilogy(f_decay, 'g-', linewidth=0.5, alpha=0.5, label='SGD + decay')
axes[0, 1].semilogy(f_mini, 'c-', linewidth=0.5, alpha=0.5, label='Mini-batch')
axes[0, 1].semilogy(f_full, 'r-', linewidth=1.5, label='Full batch')
axes[0, 1].set_xlabel('Iteration')
axes[0, 1].set_ylabel('$f(x)$')
axes[0, 1].set_title('Stochastic Optimization Convergence')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# Moving average
window = 50
f_sgd_avg = np.convolve(f_sgd, np.ones(window)/window, mode='valid')
f_decay_avg = np.convolve(f_decay, np.ones(window)/window, mode='valid')
f_mini_avg = np.convolve(f_mini, np.ones(window)/window, mode='valid')

axes[1, 0].semilogy(f_sgd_avg, 'b-', linewidth=1.5, label='SGD')
axes[1, 0].semilogy(f_decay_avg, 'g-', linewidth=1.5, label='SGD + decay')
axes[1, 0].semilogy(f_mini_avg, 'c-', linewidth=1.5, label='Mini-batch')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$ (moving avg)')
axes[1, 0].set_title(f'Smoothed Convergence (window={window})')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)

# Variance reduction with batch size
batch_sizes = [1, 4, 16, 64]
final_variances = []
for bs in batch_sizes:
    # Run multiple trials
    finals = []
    for _ in range(20):
        hist = mini_batch_sgd(x0_sgd, quad_grad, batch_size=bs, alpha=0.01, noise_std=1.0, n_iter=500)
        finals.append(0.5 * hist[-1] @ A_quad @ hist[-1])
    final_variances.append(np.var(finals))

axes[1, 1].loglog(batch_sizes, final_variances, 'o-', markersize=8, linewidth=2)
axes[1, 1].set_xlabel('Batch size')
axes[1, 1].set_ylabel('Variance of final $f(x)$')
axes[1, 1].set_title('Variance Reduction with Batch Size')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_stochastic.pdf', 'Stochastic gradient descent methods')
\end{pycode}
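
The final panel illustrates the standard variance-reduction argument: if a single stochastic gradient has noise variance $\sigma^2$ per component, the average over a mini-batch of $B$ independent samples has variance
\begin{equation}
\operatorname{Var}\!\left[\hat{g}_B\right] = \frac{\sigma^2}{B},
\end{equation}
so the noise standard deviation, and with it the residual fluctuation around the minimum, shrinks like $1/\sqrt{B}$. The simulation models this directly by dividing the injected noise by $\sqrt{B}$.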
\section{Comparison on Multiple Test Functions}

\begin{pycode}
# Compare optimizers on different test functions

test_functions = {
    'Rosenbrock': (rosenbrock, rosenbrock_grad, np.array([-1.5, 1.5]), np.array([1, 1])),
    'Beale': (beale, beale_grad, np.array([0.0, 0.0]), np.array([3, 0.5])),
    'Quadratic': (lambda x: quadratic(x), lambda x: quadratic_grad(x), np.array([5, 5]), np.array([0, 0]))
}

results = {}
n_iter_compare = 2000

for name, (f, grad, x0, opt) in test_functions.items():
    results[name] = {}

    # Run optimizers
    hist = gradient_descent(x0, grad, alpha=0.001, n_iter=n_iter_compare)
    results[name]['GD'] = np.linalg.norm(hist[-1] - opt)

    hist = momentum_gd(x0, grad, alpha=0.001, beta=0.9, n_iter=n_iter_compare)
    results[name]['Momentum'] = np.linalg.norm(hist[-1] - opt)

    hist = adam(x0, grad, alpha=0.01, n_iter=n_iter_compare)
    results[name]['Adam'] = np.linalg.norm(hist[-1] - opt)

    hist = bfgs(x0, f, grad, n_iter=n_iter_compare)
    results[name]['BFGS'] = np.linalg.norm(hist[-1] - opt)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart comparison
functions = list(results.keys())
methods = ['GD', 'Momentum', 'Adam', 'BFGS']
x_pos = np.arange(len(functions))
width = 0.2

for i, method in enumerate(methods):
    values = [results[fn][method] for fn in functions]
    axes[0].bar(x_pos + i*width, values, width, label=method)

axes[0].set_ylabel('Final distance to optimum')
axes[0].set_xticks(x_pos + 1.5*width)
axes[0].set_xticklabels(functions)
axes[0].set_title('Optimizer Comparison')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].set_yscale('log')

# Iteration counts to reach tolerance
tol = 1e-3
iter_counts = {}

for name, (f, grad, x0, opt) in test_functions.items():
    iter_counts[name] = {}

    for method_name, method in [('GD', lambda x0, g: gradient_descent(x0, g, alpha=0.001, n_iter=10000)),
                                ('Momentum', lambda x0, g: momentum_gd(x0, g, alpha=0.001, beta=0.9, n_iter=10000)),
                                ('Adam', lambda x0, g: adam(x0, g, alpha=0.01, n_iter=10000))]:
        hist = method(x0, grad)
        distances = [np.linalg.norm(x - opt) for x in hist]
        # Find first iteration below tolerance
        below_tol = np.where(np.array(distances) < tol)[0]
        if len(below_tol) > 0:
            iter_counts[name][method_name] = below_tol[0]
        else:
            iter_counts[name][method_name] = 10000

# Heatmap of iteration counts
data = np.array([[iter_counts[fn][m] for m in ['GD', 'Momentum', 'Adam']] for fn in functions])
im = axes[1].imshow(data, cmap='YlOrRd', aspect='auto')

axes[1].set_xticks(range(3))
axes[1].set_yticks(range(len(functions)))
axes[1].set_xticklabels(['GD', 'Momentum', 'Adam'])
axes[1].set_yticklabels(functions)

for i in range(len(functions)):
    for j in range(3):
        axes[1].text(j, i, f'{data[i, j]:.0f}', ha='center', va='center')

plt.colorbar(im, ax=axes[1], label='Iterations')
axes[1].set_title(f'Iterations to reach $||x - x^*|| < {tol}$')

plt.tight_layout()
save_plot('optimization_comparison.pdf', 'Optimizer comparison across test functions')
\end{pycode}
\section{Results Summary}

\begin{pycode}
# Create comprehensive results table
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Optimization Results on Rosenbrock Function}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'Method & Final $f(x)$ & Iterations \\')
print(r'\midrule')
print(f'Gradient Descent & {final_gd:.6f} & {n_iter} \\\\')
print(f'Momentum & {final_mom:.6f} & {n_iter} \\\\')
print(f'Nesterov & {final_nest:.6f} & {n_iter} \\\\')
print(f'RMSprop & {final_rms:.6f} & {n_iter} \\\\')
print(f'Adam & {final_adam:.6f} & {n_iter} \\\\')
print(f"Newton's Method & {final_newton:.6f} & {newton_iters} \\\\")
print(f'BFGS & {final_bfgs:.6f} & {bfgs_iters} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')

# Constrained optimization results
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Constrained Optimization Results}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'Method & Final $f(x)$ & Constraint Violation \\')
print(r'\midrule')
print(f'Penalty Method & {final_penalty:.4f} & {max(0, constraint(hist_penalty[-1])):.6f} \\\\')
print(f'Aug. Lagrangian & {final_al:.4f} & {max(0, constraint(hist_al[-1])):.6f} \\\\')
print(f'Projected GD & {final_proj:.4f} & {max(0, constraint(hist_proj[-1])):.6f} \\\\')
print(f'Analytical Opt. & {f_opt_con:.4f} & 0.0 \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')

# Method comparison
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Optimizer Performance Comparison (final distance to optimum)}')
print(r'\begin{tabular}{lcccc}')
print(r'\toprule')
print(r'Function & GD & Momentum & Adam & BFGS \\')
print(r'\midrule')
for name in results.keys():
    gd = results[name]['GD']
    mom = results[name]['Momentum']
    adam_r = results[name]['Adam']
    bfgs_r = results[name]['BFGS']
    print(f'{name} & {gd:.2e} & {mom:.2e} & {adam_r:.2e} & {bfgs_r:.2e} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Statistical Summary}

Optimization results (Rosenbrock function unless noted otherwise):
\begin{itemize}
\item Final value (GD): \py{f"{final_gd:.6f}"}
\item Final value (Momentum): \py{f"{final_mom:.6f}"}
\item Final value (Nesterov): \py{f"{final_nest:.6f}"}
\item Final value (RMSprop): \py{f"{final_rms:.6f}"}
\item Final value (Adam): \py{f"{final_adam:.6f}"}
\item Final value (Newton): \py{f"{final_newton:.6f}"} in \py{f"{newton_iters}"} iterations
\item Final value (BFGS): \py{f"{final_bfgs:.6f}"} in \py{f"{bfgs_iters}"} iterations
\item True optimum of the Rosenbrock function: $(1, 1)$
\item Constrained optimum: \py{f"{f_opt_con:.4f}"} at \py{f"({x_opt_con[0]:.3f}, {x_opt_con[1]:.3f})"}
\end{itemize}
\section{Conclusion}

This analysis compared optimization methods across different problem classes:

\begin{enumerate}
\item \textbf{First-order methods}: Momentum accelerates convergence by accumulating past gradients. Adaptive methods (RMSprop, Adam) automatically tune per-parameter learning rates, making them robust across problems.

\item \textbf{Second-order methods}: Newton's method achieves quadratic convergence near the optimum but requires Hessian computation. BFGS approximates the Hessian using gradient differences, providing superlinear convergence without explicit second derivatives.

\item \textbf{Condition number effects}: Ill-conditioned problems (high $\kappa$) slow gradient descent significantly. Momentum and adaptive methods partially mitigate this effect.

\item \textbf{Constrained optimization}: Penalty methods, the augmented Lagrangian, and projected gradient descent handle constraints through different mechanisms. The augmented Lagrangian combines the benefits of penalty and dual methods.

\item \textbf{Stochastic optimization}: Mini-batch SGD with learning rate decay balances noise reduction against computational cost. The batch size controls the variance of the gradient estimates.
\end{enumerate}

Learning rate selection is critical: too small a step leads to slow convergence, too large a step causes divergence. Adaptive methods such as Adam are popular in deep learning because of their robustness to hyperparameter choices.

\end{document}