Path: blob/main/latex-templates/templates/other/optimization.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{siunitx}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{float}
\usepackage[makestderr]{pythontex}

\title{Optimization: Gradient Descent Methods and Convergence Analysis}
\author{Computational Science Templates}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
Optimization algorithms find minima of objective functions and are fundamental to machine learning, scientific computing, and engineering design. This document demonstrates gradient descent variants (vanilla, momentum, RMSprop, Adam), Newton's method, quasi-Newton methods (BFGS), and constrained optimization. Using PythonTeX, we compare convergence behavior on standard test functions and analyze the effects of hyperparameters on optimization performance.
\end{abstract}

\section{Introduction}
Optimization algorithms find minima of objective functions. This analysis compares gradient descent variants on test functions, demonstrating convergence behavior, hyperparameter sensitivity, and the tradeoffs between different optimization strategies. Applications span machine learning (neural network training), scientific computing (parameter estimation), and engineering (design optimization).

\section{Mathematical Framework}

\subsection{First-Order Methods}
Gradient descent update:
\begin{equation}
x_{k+1} = x_k - \alpha \nabla f(x_k)
\end{equation}

Momentum update (heavy ball method):
\begin{equation}
v_{k+1} = \beta v_k + \alpha \nabla f(x_k), \quad x_{k+1} = x_k - v_{k+1}
\end{equation}

Nesterov accelerated gradient:
\begin{equation}
v_{k+1} = \beta v_k + \alpha \nabla f(x_k - \beta v_k), \quad x_{k+1} = x_k - v_{k+1}
\end{equation}

\subsection{Adaptive Learning Rates}
RMSprop:
\begin{equation}
s_k = \rho s_{k-1} + (1-\rho) (\nabla f(x_k))^2, \quad x_{k+1} = x_k - \frac{\alpha}{\sqrt{s_k + \epsilon}} \nabla f(x_k)
\end{equation}

Adam (Adaptive Moment Estimation):
\begin{align}
m_k &= \beta_1 m_{k-1} + (1-\beta_1) \nabla f(x_k) \\
v_k &= \beta_2 v_{k-1} + (1-\beta_2) (\nabla f(x_k))^2 \\
\hat{m}_k &= \frac{m_k}{1-\beta_1^k}, \quad \hat{v}_k = \frac{v_k}{1-\beta_2^k} \\
x_{k+1} &= x_k - \frac{\alpha}{\sqrt{\hat{v}_k} + \epsilon} \hat{m}_k
\end{align}

\subsection{Second-Order Methods}
Newton's method:
\begin{equation}
x_{k+1} = x_k - [H_f(x_k)]^{-1} \nabla f(x_k)
\end{equation}
where $H_f$ is the Hessian matrix.

BFGS (quasi-Newton):
\begin{equation}
x_{k+1} = x_k - \alpha_k B_k^{-1} \nabla f(x_k)
\end{equation}
where $B_k$ approximates the Hessian using gradient differences.
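
Concretely, the curvature information enters through the difference vectors $s_k = x_{k+1} - x_k$ and $y_k = \nabla f(x_{k+1}) - \nabla f(x_k)$. The BFGS implementation later in this document maintains the inverse approximation $H_k \approx B_k^{-1}$ directly and updates it with the standard formula
\begin{equation}
H_{k+1} = \left(I - \rho_k s_k y_k^{\top}\right) H_k \left(I - \rho_k y_k s_k^{\top}\right) + \rho_k s_k s_k^{\top},
\qquad \rho_k = \frac{1}{y_k^{\top} s_k},
\end{equation}
which preserves symmetry and positive definiteness as long as the curvature condition $y_k^{\top} s_k > 0$ holds; the code accordingly skips the update whenever this quantity is not sufficiently positive.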
\subsection{Convergence Rates}
For $\mu$-strongly convex functions with $L$-Lipschitz gradients:
\begin{itemize}
\item Gradient descent: $\mathcal{O}((1 - \mu/L)^k)$
\item Momentum: $\mathcal{O}((1 - \sqrt{\mu/L})^k)$
\item Newton's method: quadratic convergence near optimum
\end{itemize}
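
For the quadratic model problems analyzed later, the gradient-descent rate can be made explicit. With $f(x) = \tfrac{1}{2} x^{\top} A x$ and the eigenvalues of $A$ contained in $[\mu, L]$, the iteration satisfies $x_{k+1} = (I - \alpha A) x_k$, so the error contracts by at most $\max(|1-\alpha\mu|, |1-\alpha L|)$ per step. This factor is minimized at
\begin{equation}
\alpha^{*} = \frac{2}{\mu + L}, \qquad
\|x_{k+1} - x^{*}\| \le \frac{\kappa - 1}{\kappa + 1}\,\|x_k - x^{*}\|, \qquad \kappa = \frac{L}{\mu},
\end{equation}
which is exactly the step size and theoretical rate used in the condition-number experiments below.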
\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize, minimize_scalar
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')

np.random.seed(42)

def save_plot(filename, caption=""):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.95\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Test Functions}

\begin{pycode}
# Define test functions and their gradients

# Rosenbrock function (banana function)
def rosenbrock(x):
    return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2

def rosenbrock_grad(x):
    dx = -2*(1 - x[0]) - 400*x[0]*(x[1] - x[0]**2)
    dy = 200*(x[1] - x[0]**2)
    return np.array([dx, dy])

def rosenbrock_hess(x):
    h11 = 2 - 400*x[1] + 1200*x[0]**2
    h12 = -400*x[0]
    h22 = 200
    return np.array([[h11, h12], [h12, h22]])

# Beale function
def beale(x):
    term1 = (1.5 - x[0] + x[0]*x[1])**2
    term2 = (2.25 - x[0] + x[0]*x[1]**2)**2
    term3 = (2.625 - x[0] + x[0]*x[1]**3)**2
    return term1 + term2 + term3

def beale_grad(x):
    dx = (2*(1.5 - x[0] + x[0]*x[1])*(-1 + x[1]) +
          2*(2.25 - x[0] + x[0]*x[1]**2)*(-1 + x[1]**2) +
          2*(2.625 - x[0] + x[0]*x[1]**3)*(-1 + x[1]**3))
    dy = (2*(1.5 - x[0] + x[0]*x[1])*x[0] +
          2*(2.25 - x[0] + x[0]*x[1]**2)*2*x[0]*x[1] +
          2*(2.625 - x[0] + x[0]*x[1]**3)*3*x[0]*x[1]**2)
    return np.array([dx, dy])

# Rastrigin function (multimodal)
def rastrigin(x):
    A = 10
    n = len(x)
    return A*n + sum(xi**2 - A*np.cos(2*np.pi*xi) for xi in x)

def rastrigin_grad(x):
    A = 10
    return np.array([2*xi + 2*np.pi*A*np.sin(2*np.pi*xi) for xi in x])

# Quadratic function (for analysis)
def quadratic(x, A=None, b=None):
    if A is None:
        A = np.array([[10, 0], [0, 1]])
    if b is None:
        b = np.zeros(2)
    return 0.5 * x @ A @ x - b @ x

def quadratic_grad(x, A=None, b=None):
    if A is None:
        A = np.array([[10, 0], [0, 1]])
    if b is None:
        b = np.zeros(2)
    return A @ x - b

# Store optimal values
rosenbrock_opt = np.array([1.0, 1.0])
beale_opt = np.array([3.0, 0.5])
rastrigin_opt = np.array([0.0, 0.0])
quadratic_opt = np.array([0.0, 0.0])
\end{pycode}
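
Because every optimizer below consumes these hand-coded derivatives, it is worth confirming that they agree with a numerical approximation. The following minimal check compares each analytic gradient against central differences at a few arbitrary points; the \texttt{central\_diff\_grad} helper is only a verification utility, not one of the methods under study.

\begin{pycode}
# Sanity check: compare analytic gradients against central differences
def central_diff_grad(f, x, h=1e-6):
    """Central-difference approximation of the gradient of f at x."""
    g = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        e = np.zeros_like(x, dtype=float)
        e[i] = h
        g[i] = (f(x + e) - f(x - e)) / (2*h)
    return g

check_points = [np.array([-1.2, 1.0]), np.array([2.0, 1.5]), np.array([0.3, -0.7])]
max_rel_err = 0.0
for f_check, grad_check in [(rosenbrock, rosenbrock_grad),
                            (beale, beale_grad),
                            (rastrigin, rastrigin_grad)]:
    for p in check_points:
        num = central_diff_grad(f_check, p)
        ana = grad_check(p)
        rel_err = np.max(np.abs(num - ana)) / (1.0 + np.max(np.abs(ana)))
        max_rel_err = max(max_rel_err, rel_err)
\end{pycode}

The largest relative discrepancy over the sampled points is \py{f"{max_rel_err:.1e}"}, so the analytic gradients can be trusted in the experiments that follow.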
\section{Gradient Descent Variants}

\begin{pycode}
# Implement optimization algorithms

def gradient_descent(x0, grad_f, alpha=0.001, n_iter=1000):
    """Vanilla gradient descent."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        x = x - alpha * grad_f(x)
        history.append(x.copy())
    return np.array(history)

def momentum_gd(x0, grad_f, alpha=0.001, beta=0.9, n_iter=1000):
    """Gradient descent with momentum."""
    x = x0.copy()
    v = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        v = beta*v + alpha*grad_f(x)
        x = x - v
        history.append(x.copy())
    return np.array(history)

def nesterov_gd(x0, grad_f, alpha=0.001, beta=0.9, n_iter=1000):
    """Nesterov accelerated gradient."""
    x = x0.copy()
    v = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        x_lookahead = x - beta*v
        v = beta*v + alpha*grad_f(x_lookahead)
        x = x - v
        history.append(x.copy())
    return np.array(history)

def rmsprop(x0, grad_f, alpha=0.001, rho=0.9, eps=1e-8, n_iter=1000):
    """RMSprop optimizer."""
    x = x0.copy()
    s = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        g = grad_f(x)
        s = rho*s + (1-rho)*g**2
        x = x - alpha * g / (np.sqrt(s) + eps)
        history.append(x.copy())
    return np.array(history)

def adam(x0, grad_f, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8, n_iter=1000):
    """Adam optimizer."""
    x = x0.copy()
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    history = [x.copy()]
    for t in range(1, n_iter+1):
        g = grad_f(x)
        m = beta1*m + (1-beta1)*g
        v = beta2*v + (1-beta2)*g**2
        m_hat = m / (1 - beta1**t)
        v_hat = v / (1 - beta2**t)
        x = x - alpha * m_hat / (np.sqrt(v_hat) + eps)
        history.append(x.copy())
    return np.array(history)

# Run optimizers on Rosenbrock
x0 = np.array([-1.5, 1.5])
n_iter = 5000

hist_gd = gradient_descent(x0, rosenbrock_grad, alpha=0.001, n_iter=n_iter)
hist_mom = momentum_gd(x0, rosenbrock_grad, alpha=0.001, beta=0.9, n_iter=n_iter)
hist_nest = nesterov_gd(x0, rosenbrock_grad, alpha=0.001, beta=0.9, n_iter=n_iter)
hist_rms = rmsprop(x0, rosenbrock_grad, alpha=0.01, n_iter=n_iter)
hist_adam = adam(x0, rosenbrock_grad, alpha=0.01, n_iter=n_iter)

# Compute objective values
f_gd = [rosenbrock(x) for x in hist_gd]
f_mom = [rosenbrock(x) for x in hist_mom]
f_nest = [rosenbrock(x) for x in hist_nest]
f_rms = [rosenbrock(x) for x in hist_rms]
f_adam = [rosenbrock(x) for x in hist_adam]

# Contour plot of Rosenbrock
x = np.linspace(-2, 2, 100)
y = np.linspace(-1, 3, 100)
X, Y = np.meshgrid(x, y)
Z = (1 - X)**2 + 100*(Y - X**2)**2

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Optimization paths
axes[0, 0].contour(X, Y, Z, levels=np.logspace(0, 3, 20), cmap='viridis', alpha=0.7)
axes[0, 0].plot(hist_gd[:, 0], hist_gd[:, 1], 'b.-', markersize=1, linewidth=0.5, label='GD', alpha=0.7)
axes[0, 0].plot(hist_mom[:, 0], hist_mom[:, 1], 'r.-', markersize=1, linewidth=0.5, label='Momentum', alpha=0.7)
axes[0, 0].plot(hist_adam[:, 0], hist_adam[:, 1], 'g.-', markersize=1, linewidth=0.5, label='Adam', alpha=0.7)
axes[0, 0].plot(1, 1, 'k*', markersize=15, label='Optimum')
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title('Optimization Paths on Rosenbrock')
axes[0, 0].legend(fontsize=8)

# Convergence comparison
axes[0, 1].semilogy(f_gd, 'b-', linewidth=1, label='GD')
axes[0, 1].semilogy(f_mom, 'r-', linewidth=1, label='Momentum')
axes[0, 1].semilogy(f_nest, 'm-', linewidth=1, label='Nesterov')
axes[0, 1].semilogy(f_rms, 'c-', linewidth=1, label='RMSprop')
axes[0, 1].semilogy(f_adam, 'g-', linewidth=1, label='Adam')
axes[0, 1].set_xlabel('Iteration')
axes[0, 1].set_ylabel('$f(x)$')
axes[0, 1].set_title('Convergence Comparison')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# Effect of learning rate
alphas = [0.0001, 0.0005, 0.001, 0.002]
for alpha in alphas:
    hist = gradient_descent(x0, rosenbrock_grad, alpha=alpha, n_iter=2000)
    f_vals = [rosenbrock(x) for x in hist]
    axes[1, 0].semilogy(f_vals, linewidth=1, label=f'$\\alpha = {alpha}$')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$')
axes[1, 0].set_title('Effect of Learning Rate (GD)')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)

# Effect of momentum
betas = [0.0, 0.5, 0.9, 0.99]
for beta in betas:
    hist = momentum_gd(x0, rosenbrock_grad, alpha=0.001, beta=beta, n_iter=2000)
    f_vals = [rosenbrock(x) for x in hist]
    axes[1, 1].semilogy(f_vals, linewidth=1, label=f'$\\beta = {beta}$')
axes[1, 1].set_xlabel('Iteration')
axes[1, 1].set_ylabel('$f(x)$')
axes[1, 1].set_title('Effect of Momentum')
axes[1, 1].legend(fontsize=8)
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_gradient_descent.pdf', 'Gradient descent variants on Rosenbrock function')

# Store final values
final_gd = rosenbrock(hist_gd[-1])
final_mom = rosenbrock(hist_mom[-1])
final_nest = rosenbrock(hist_nest[-1])
final_rms = rosenbrock(hist_rms[-1])
final_adam = rosenbrock(hist_adam[-1])
\end{pycode}

\section{Second-Order Methods}

\begin{pycode}
# Newton's method
def newton_method(x0, grad_f, hess_f, n_iter=100, tol=1e-10):
    """Newton's method with Hessian."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        g = grad_f(x)
        H = hess_f(x)
        try:
            dx = np.linalg.solve(H, -g)
        except np.linalg.LinAlgError:
            break
        x = x + dx
        history.append(x.copy())
        if np.linalg.norm(g) < tol:
            break
    return np.array(history)

# BFGS quasi-Newton
def bfgs(x0, f, grad_f, n_iter=1000, tol=1e-10):
    """BFGS quasi-Newton method."""
    n = len(x0)
    x = x0.copy()
    B = np.eye(n)  # Approximate inverse Hessian
    history = [x.copy()]

    for _ in range(n_iter):
        g = grad_f(x)
        if np.linalg.norm(g) < tol:
            break

        # Search direction
        d = -B @ g

        # Line search (backtracking)
        alpha = 1.0
        c = 0.5
        rho = 0.5
        while f(x + alpha*d) > f(x) + c*alpha*g@d:
            alpha *= rho

        # Update
        s = alpha * d
        x_new = x + s
        y = grad_f(x_new) - g

        # BFGS update
        if y @ s > 1e-10:
            rho_k = 1.0 / (y @ s)
            I = np.eye(n)
            B = (I - rho_k * np.outer(s, y)) @ B @ (I - rho_k * np.outer(y, s)) + rho_k * np.outer(s, s)

        x = x_new
        history.append(x.copy())

    return np.array(history)

# Run second-order methods
x0_newton = np.array([0.5, 0.5])  # Start closer for Newton
hist_newton = newton_method(x0_newton, rosenbrock_grad, rosenbrock_hess, n_iter=100)
hist_bfgs = bfgs(x0, rosenbrock, rosenbrock_grad, n_iter=1000)

f_newton = [rosenbrock(x) for x in hist_newton]
f_bfgs = [rosenbrock(x) for x in hist_bfgs]

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Newton's method path
axes[0, 0].contour(X, Y, Z, levels=np.logspace(0, 3, 20), cmap='viridis', alpha=0.7)
axes[0, 0].plot(hist_newton[:, 0], hist_newton[:, 1], 'ro-', markersize=6, linewidth=1.5, label='Newton')
axes[0, 0].plot(1, 1, 'k*', markersize=15)
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title(f"Newton's Method ({len(hist_newton)-1} iterations)")
axes[0, 0].legend()

# BFGS path
axes[0, 1].contour(X, Y, Z, levels=np.logspace(0, 3, 20), cmap='viridis', alpha=0.7)
axes[0, 1].plot(hist_bfgs[:, 0], hist_bfgs[:, 1], 'go-', markersize=3, linewidth=0.8, label='BFGS')
axes[0, 1].plot(1, 1, 'k*', markersize=15)
axes[0, 1].set_xlabel('$x_1$')
axes[0, 1].set_ylabel('$x_2$')
axes[0, 1].set_title(f'BFGS Method ({len(hist_bfgs)-1} iterations)')
axes[0, 1].legend()

# Convergence comparison (log scale)
axes[1, 0].semilogy(f_newton, 'r-o', markersize=6, linewidth=1.5, label='Newton')
axes[1, 0].semilogy(f_bfgs, 'g-', linewidth=1, label='BFGS')
axes[1, 0].semilogy(f_adam[:500], 'b-', linewidth=1, label='Adam')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$')
axes[1, 0].set_title('Second-Order vs First-Order Convergence')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Convergence rate analysis (distance to optimum)
dist_newton = [np.linalg.norm(x - rosenbrock_opt) for x in hist_newton]
dist_bfgs = [np.linalg.norm(x - rosenbrock_opt) for x in hist_bfgs]
dist_adam = [np.linalg.norm(x - rosenbrock_opt) for x in hist_adam[:500]]

axes[1, 1].semilogy(dist_newton, 'r-o', markersize=6, linewidth=1.5, label='Newton')
axes[1, 1].semilogy(dist_bfgs, 'g-', linewidth=1, label='BFGS')
axes[1, 1].semilogy(dist_adam, 'b-', linewidth=1, label='Adam')
axes[1, 1].set_xlabel('Iteration')
axes[1, 1].set_ylabel('$||x - x^*||$')
axes[1, 1].set_title('Distance to Optimum')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_second_order.pdf', "Second-order methods: Newton and BFGS")

final_newton = rosenbrock(hist_newton[-1])
final_bfgs = rosenbrock(hist_bfgs[-1])
newton_iters = len(hist_newton) - 1
bfgs_iters = len(hist_bfgs) - 1
\end{pycode}
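
As a quick cross-check on the hand-rolled BFGS above, the same problem can be handed to SciPy's reference implementation through \texttt{scipy.optimize.minimize}, which is already imported in the setup block; the snippet below is only a sketch of that comparison.

\begin{pycode}
# Cross-check: SciPy's reference BFGS from the same starting point
res_scipy = minimize(rosenbrock, x0, jac=rosenbrock_grad, method='BFGS')
\end{pycode}

SciPy's BFGS reaches an objective value of \py{f"{res_scipy.fun:.2e}"} after \py{res_scipy.nit} iterations from the same starting point, which provides a baseline for the iteration counts reported above.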
\section{Condition Number and Convergence}

\begin{pycode}
# Analyze effect of condition number on convergence
# Quadratic function f(x) = 0.5 * x'Ax with different condition numbers

def make_quadratic(kappa):
    """Create quadratic with condition number kappa."""
    A = np.array([[kappa, 0], [0, 1]])
    return A

def run_gd_quadratic(A, x0, alpha, n_iter):
    """Run GD on quadratic."""
    history = [x0.copy()]
    x = x0.copy()
    for _ in range(n_iter):
        g = A @ x
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

# Test different condition numbers
kappas = [1, 5, 10, 50, 100]
x0_quad = np.array([5.0, 5.0])
n_iter_quad = 200

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Convergence for different condition numbers
for kappa in kappas:
    A = make_quadratic(kappa)
    # Optimal step size for quadratic
    L = kappa
    mu = 1
    alpha_opt = 2 / (L + mu)

    hist = run_gd_quadratic(A, x0_quad, alpha_opt, n_iter_quad)
    f_vals = [0.5 * x @ A @ x for x in hist]
    axes[0, 0].semilogy(f_vals, linewidth=1.5, label=f'$\\kappa = {kappa}$')

axes[0, 0].set_xlabel('Iteration')
axes[0, 0].set_ylabel('$f(x)$')
axes[0, 0].set_title('Effect of Condition Number (optimal $\\alpha$)')
axes[0, 0].legend(fontsize=8)
axes[0, 0].grid(True, alpha=0.3)

# Theoretical vs actual convergence rate
kappa_range = np.arange(1, 101)
theory_rate = (kappa_range - 1) / (kappa_range + 1)
axes[0, 1].plot(kappa_range, theory_rate, 'b-', linewidth=2, label='$(\\kappa-1)/(\\kappa+1)$')
axes[0, 1].set_xlabel('Condition number $\\kappa$')
axes[0, 1].set_ylabel('Convergence rate')
axes[0, 1].set_title('Theoretical Convergence Rate (GD)')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Effect of step size on ill-conditioned problem
kappa_ill = 100
A_ill = make_quadratic(kappa_ill)
alphas_quad = [0.001, 0.005, 0.01, 0.019, 0.02]

for alpha in alphas_quad:
    hist = run_gd_quadratic(A_ill, x0_quad, alpha, n_iter_quad)
    f_vals = [0.5 * x @ A_ill @ x for x in hist]
    axes[1, 0].semilogy(f_vals, linewidth=1, label=f'$\\alpha = {alpha}$')

axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$')
axes[1, 0].set_title(f'Step Size Sensitivity ($\\kappa = {kappa_ill}$)')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].set_ylim([1e-15, 1e5])

# Compare GD vs momentum on ill-conditioned
A_ill = make_quadratic(100)
hist_gd_ill = run_gd_quadratic(A_ill, x0_quad, 0.01, 500)

# Momentum for quadratic
def run_momentum_quadratic(A, x0, alpha, beta, n_iter):
    x = x0.copy()
    v = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        g = A @ x
        v = beta*v + alpha*g
        x = x - v
        history.append(x.copy())
    return np.array(history)

hist_mom_ill = run_momentum_quadratic(A_ill, x0_quad, 0.01, 0.9, 500)

f_gd_ill = [0.5 * x @ A_ill @ x for x in hist_gd_ill]
f_mom_ill = [0.5 * x @ A_ill @ x for x in hist_mom_ill]

axes[1, 1].semilogy(f_gd_ill, 'b-', linewidth=1.5, label='GD')
axes[1, 1].semilogy(f_mom_ill, 'r-', linewidth=1.5, label='Momentum')
axes[1, 1].set_xlabel('Iteration')
axes[1, 1].set_ylabel('$f(x)$')
axes[1, 1].set_title('GD vs Momentum ($\\kappa = 100$)')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_condition_number.pdf', 'Effect of condition number on convergence')
\end{pycode}
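
The momentum run above uses the generic setting $\alpha = 0.01$, $\beta = 0.9$ rather than problem-specific tuning. For a quadratic with eigenvalues in $[\mu, L]$, the classical heavy-ball choice is
\begin{equation}
\alpha = \frac{4}{(\sqrt{L} + \sqrt{\mu})^2}, \qquad
\beta = \left(\frac{\sqrt{\kappa} - 1}{\sqrt{\kappa} + 1}\right)^{2},
\end{equation}
which attains the accelerated $\sqrt{\kappa}$-type rate listed in the convergence-rate summary: for $\kappa = 100$ the per-step contraction is $9/11 \approx 0.82$, versus $99/101 \approx 0.98$ for plain gradient descent.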
\section{Constrained Optimization}
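
The test problem implemented below is
\begin{equation}
\min_{x \in \mathbb{R}^2} \; (x_1 - 1)^2 + (x_2 - 2)^2
\quad \text{subject to} \quad x_1^2 + x_2^2 \le 1 .
\end{equation}
Because the unconstrained minimizer $(1, 2)$ lies outside the unit disc, the constraint is active at the solution. The KKT stationarity condition $\nabla f(x) + \lambda \nabla g(x) = 0$ gives $x(1 + \lambda) = (1, 2)$, and enforcing $\|x\| = 1$ yields
\begin{equation}
x^{*} = \frac{(1, 2)}{\sqrt{5}}, \qquad \lambda^{*} = \sqrt{5} - 1 \approx 1.236 ,
\end{equation}
which serve as the reference values in the comparison below.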
\begin{pycode}
# Constrained optimization using penalty methods and Lagrange multipliers

# Problem: minimize f(x) subject to g(x) <= 0
# f(x) = (x1-1)^2 + (x2-2)^2
# g(x) = x1^2 + x2^2 - 1 <= 0

def objective_constrained(x):
    return (x[0] - 1)**2 + (x[1] - 2)**2

def objective_grad(x):
    return np.array([2*(x[0] - 1), 2*(x[1] - 2)])

def constraint(x):
    return x[0]**2 + x[1]**2 - 1

def constraint_grad(x):
    return np.array([2*x[0], 2*x[1]])

# Penalty method
def penalty_method(x0, f, g, grad_f, grad_g, rho_init=1, rho_mult=10, n_outer=5, n_inner=500):
    """Solve constrained problem using quadratic penalty."""
    x = x0.copy()
    rho = rho_init
    history = [x.copy()]
    rhos = [rho]

    for _ in range(n_outer):
        # Inner optimization
        for _ in range(n_inner):
            violation = max(0, g(x))
            grad = grad_f(x) + 2*rho*violation*grad_g(x)
            x = x - 0.01 * grad
            history.append(x.copy())

        rho *= rho_mult
        rhos.append(rho)

    return np.array(history), rhos

# Augmented Lagrangian method
def augmented_lagrangian(x0, f, g, grad_f, grad_g, rho=10, n_outer=10, n_inner=100):
    """Augmented Lagrangian method."""
    x = x0.copy()
    lam = 0  # Lagrange multiplier
    history = [x.copy()]
    multipliers = [lam]

    for _ in range(n_outer):
        # Inner optimization
        for _ in range(n_inner):
            violation = g(x)
            grad = grad_f(x) + lam*grad_g(x) + rho*max(0, violation)*grad_g(x)
            x = x - 0.01 * grad
            history.append(x.copy())

        # Update multiplier
        lam = max(0, lam + rho * g(x))
        multipliers.append(lam)

    return np.array(history), multipliers

# Projected gradient descent
def projected_gd(x0, grad_f, project, alpha=0.1, n_iter=500):
    """Projected gradient descent."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        x = x - alpha * grad_f(x)
        x = project(x)  # Project onto feasible set
        history.append(x.copy())
    return np.array(history)

def project_to_ball(x, radius=1):
    """Project onto unit ball."""
    norm = np.linalg.norm(x)
    if norm > radius:
        return radius * x / norm
    return x

# Run methods
x0_con = np.array([0.0, 0.0])
hist_penalty, rhos = penalty_method(x0_con, objective_constrained, constraint,
                                    objective_grad, constraint_grad)
hist_al, multipliers = augmented_lagrangian(x0_con, objective_constrained, constraint,
                                            objective_grad, constraint_grad)
hist_proj = projected_gd(x0_con, objective_grad, project_to_ball, alpha=0.1, n_iter=500)

# Analytical solution
# Optimum on boundary: x* = (1, 2)/sqrt(5)
x_opt_con = np.array([1, 2]) / np.sqrt(5)
f_opt_con = objective_constrained(x_opt_con)

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Contour plot with constraint
theta_c = np.linspace(0, 2*np.pi, 100)
x_circle = np.cos(theta_c)
y_circle = np.sin(theta_c)

x_con = np.linspace(-1.5, 2, 100)
y_con = np.linspace(-1, 3, 100)
X_con, Y_con = np.meshgrid(x_con, y_con)
Z_con = (X_con - 1)**2 + (Y_con - 2)**2

axes[0, 0].contour(X_con, Y_con, Z_con, levels=20, cmap='viridis', alpha=0.7)
axes[0, 0].plot(x_circle, y_circle, 'r-', linewidth=2, label='Constraint')
axes[0, 0].fill(x_circle, y_circle, alpha=0.2, color='gray')
axes[0, 0].plot(hist_penalty[-1, 0], hist_penalty[-1, 1], 'bo', markersize=10, label='Penalty')
axes[0, 0].plot(hist_al[-1, 0], hist_al[-1, 1], 'g^', markersize=10, label='Aug. Lagrangian')
axes[0, 0].plot(hist_proj[-1, 0], hist_proj[-1, 1], 'rs', markersize=10, label='Projected GD')
axes[0, 0].plot(x_opt_con[0], x_opt_con[1], 'k*', markersize=15, label='Optimum')
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title('Constrained Optimization')
axes[0, 0].legend(fontsize=8)
axes[0, 0].set_xlim([-1.5, 2])
axes[0, 0].set_ylim([-1, 3])

# Convergence to optimum
f_penalty = [objective_constrained(x) for x in hist_penalty]
f_al = [objective_constrained(x) for x in hist_al]
f_proj = [objective_constrained(x) for x in hist_proj]

axes[0, 1].plot(f_penalty, 'b-', linewidth=1, label='Penalty', alpha=0.7)
axes[0, 1].plot(f_al, 'g-', linewidth=1, label='Aug. Lagrangian', alpha=0.7)
axes[0, 1].plot(f_proj, 'r-', linewidth=1, label='Projected GD', alpha=0.7)
axes[0, 1].axhline(y=f_opt_con, color='k', linestyle='--', label=f'Optimal = {f_opt_con:.4f}')
axes[0, 1].set_xlabel('Iteration')
axes[0, 1].set_ylabel('$f(x)$')
axes[0, 1].set_title('Objective Value')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# Constraint violation
viol_penalty = [max(0, constraint(x)) for x in hist_penalty]
viol_al = [max(0, constraint(x)) for x in hist_al]
viol_proj = [max(0, constraint(x)) for x in hist_proj]

axes[1, 0].semilogy(viol_penalty, 'b-', linewidth=1, label='Penalty')
axes[1, 0].semilogy(viol_al, 'g-', linewidth=1, label='Aug. Lagrangian')
axes[1, 0].semilogy(viol_proj, 'r-', linewidth=1, label='Projected GD')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('Constraint violation')
axes[1, 0].set_title('Constraint Satisfaction')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)

# Lagrange multiplier evolution
axes[1, 1].plot(multipliers, 'g-o', markersize=6, linewidth=1.5)
axes[1, 1].set_xlabel('Outer iteration')
axes[1, 1].set_ylabel('$\\lambda$')
axes[1, 1].set_title('Lagrange Multiplier Evolution')
axes[1, 1].grid(True, alpha=0.3)

# Theoretical optimal multiplier from the KKT conditions: lambda* = sqrt(5) - 1
lam_opt = np.sqrt(5) - 1
axes[1, 1].axhline(y=lam_opt, color='r', linestyle='--', label=f'$\\lambda^* = {lam_opt:.3f}$')
axes[1, 1].legend()

plt.tight_layout()
save_plot('optimization_constrained.pdf', 'Constrained optimization methods')

final_penalty = objective_constrained(hist_penalty[-1])
final_al = objective_constrained(hist_al[-1])
final_proj = objective_constrained(hist_proj[-1])
\end{pycode}
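
For reference, the same problem can also be solved with SciPy's SLSQP solver through the \texttt{minimize} interface imported earlier; the sketch below passes the inequality in SciPy's $g(x) \ge 0$ convention and is included only as a cross-check against the hand-written methods.

\begin{pycode}
# Cross-check with SciPy's SLSQP (constraint supplied as 1 - x1^2 - x2^2 >= 0)
res_slsqp = minimize(objective_constrained, x0_con, jac=objective_grad, method='SLSQP',
                     constraints=[{'type': 'ineq',
                                   'fun': lambda x: 1 - x[0]**2 - x[1]**2,
                                   'jac': lambda x: np.array([-2*x[0], -2*x[1]])}])
\end{pycode}

SLSQP returns $f(x) = $ \py{f"{res_slsqp.fun:.4f}"} at \py{f"({res_slsqp.x[0]:.3f}, {res_slsqp.x[1]:.3f})"}, to be compared with the analytical optimum \py{f"{f_opt_con:.4f}"}.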
\section{Stochastic Optimization}

\begin{pycode}
# Stochastic gradient descent for noisy gradients

def sgd(x0, grad_f, alpha=0.01, n_iter=1000, noise_std=0.1):
    """Stochastic gradient descent with noisy gradients."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        # Noisy gradient estimate
        g = grad_f(x) + noise_std * np.random.randn(len(x))
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

def sgd_with_decay(x0, grad_f, alpha0=0.1, decay=0.001, n_iter=1000, noise_std=0.5):
    """SGD with learning rate decay."""
    x = x0.copy()
    history = [x.copy()]
    for t in range(1, n_iter+1):
        alpha = alpha0 / (1 + decay * t)
        g = grad_f(x) + noise_std * np.random.randn(len(x))
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

def mini_batch_sgd(x0, grad_f, batch_size=10, alpha=0.01, n_iter=1000, noise_std=0.5):
    """Mini-batch SGD with averaged gradients."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        # Averaging a batch scales the noise standard deviation by 1/sqrt(batch_size)
        g = grad_f(x) + noise_std * np.random.randn(len(x)) / np.sqrt(batch_size)
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

# Run stochastic methods on simple quadratic
A_quad = np.array([[5, 0], [0, 1]])
def quad_grad(x):
    return A_quad @ x

x0_sgd = np.array([5.0, 5.0])
n_iter_sgd = 2000

hist_sgd = sgd(x0_sgd, quad_grad, alpha=0.01, noise_std=1.0, n_iter=n_iter_sgd)
hist_sgd_decay = sgd_with_decay(x0_sgd, quad_grad, alpha0=0.1, decay=0.01, noise_std=1.0, n_iter=n_iter_sgd)
hist_minibatch = mini_batch_sgd(x0_sgd, quad_grad, batch_size=32, alpha=0.05, noise_std=1.0, n_iter=n_iter_sgd)

# Full batch for comparison
hist_full = run_gd_quadratic(A_quad, x0_sgd, 0.1, n_iter_sgd)

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Optimization paths
axes[0, 0].plot(hist_sgd[:500, 0], hist_sgd[:500, 1], 'b-', linewidth=0.3, alpha=0.5, label='SGD')
axes[0, 0].plot(hist_full[:500, 0], hist_full[:500, 1], 'r-', linewidth=1.5, label='Full batch')
axes[0, 0].plot(0, 0, 'k*', markersize=15)
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title('SGD vs Full Batch GD')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Convergence with noise
f_sgd = [0.5 * x @ A_quad @ x for x in hist_sgd]
f_decay = [0.5 * x @ A_quad @ x for x in hist_sgd_decay]
f_mini = [0.5 * x @ A_quad @ x for x in hist_minibatch]
f_full = [0.5 * x @ A_quad @ x for x in hist_full]

axes[0, 1].semilogy(f_sgd, 'b-', linewidth=0.5, alpha=0.5, label='SGD')
axes[0, 1].semilogy(f_decay, 'g-', linewidth=0.5, alpha=0.5, label='SGD + decay')
axes[0, 1].semilogy(f_mini, 'c-', linewidth=0.5, alpha=0.5, label='Mini-batch')
axes[0, 1].semilogy(f_full, 'r-', linewidth=1.5, label='Full batch')
axes[0, 1].set_xlabel('Iteration')
axes[0, 1].set_ylabel('$f(x)$')
axes[0, 1].set_title('Stochastic Optimization Convergence')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# Moving average
window = 50
f_sgd_avg = np.convolve(f_sgd, np.ones(window)/window, mode='valid')
f_decay_avg = np.convolve(f_decay, np.ones(window)/window, mode='valid')
f_mini_avg = np.convolve(f_mini, np.ones(window)/window, mode='valid')

axes[1, 0].semilogy(f_sgd_avg, 'b-', linewidth=1.5, label='SGD')
axes[1, 0].semilogy(f_decay_avg, 'g-', linewidth=1.5, label='SGD + decay')
axes[1, 0].semilogy(f_mini_avg, 'c-', linewidth=1.5, label='Mini-batch')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$ (moving avg)')
axes[1, 0].set_title(f'Smoothed Convergence (window={window})')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)

# Variance reduction with batch size
batch_sizes = [1, 4, 16, 64]
final_variances = []
for bs in batch_sizes:
    # Run multiple trials
    finals = []
    for _ in range(20):
        hist = mini_batch_sgd(x0_sgd, quad_grad, batch_size=bs, alpha=0.01, noise_std=1.0, n_iter=500)
        finals.append(0.5 * hist[-1] @ A_quad @ hist[-1])
    final_variances.append(np.var(finals))

axes[1, 1].loglog(batch_sizes, final_variances, 'o-', markersize=8, linewidth=2)
axes[1, 1].set_xlabel('Batch size')
axes[1, 1].set_ylabel('Variance of final $f(x)$')
axes[1, 1].set_title('Variance Reduction with Batch Size')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_stochastic.pdf', 'Stochastic gradient descent methods')
\end{pycode}
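
The final panel illustrates the standard variance-reduction argument: if a single stochastic gradient has noise variance $\sigma^2$ per component, the average over a mini-batch of $B$ independent samples has variance
\begin{equation}
\operatorname{Var}\!\left[\hat{g}_B\right] = \frac{\sigma^2}{B},
\end{equation}
so the noise standard deviation, and with it the residual fluctuation around the minimum, shrinks like $1/\sqrt{B}$. The simulation models this directly by dividing the injected noise by $\sqrt{B}$.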
\section{Comparison on Multiple Test Functions}

\begin{pycode}
# Compare optimizers on different test functions

test_functions = {
    'Rosenbrock': (rosenbrock, rosenbrock_grad, np.array([-1.5, 1.5]), np.array([1, 1])),
    'Beale': (beale, beale_grad, np.array([0.0, 0.0]), np.array([3, 0.5])),
    'Quadratic': (lambda x: quadratic(x), lambda x: quadratic_grad(x), np.array([5, 5]), np.array([0, 0]))
}

results = {}
n_iter_compare = 2000

for name, (f, grad, x0, opt) in test_functions.items():
    results[name] = {}

    # Run optimizers
    hist = gradient_descent(x0, grad, alpha=0.001, n_iter=n_iter_compare)
    results[name]['GD'] = np.linalg.norm(hist[-1] - opt)

    hist = momentum_gd(x0, grad, alpha=0.001, beta=0.9, n_iter=n_iter_compare)
    results[name]['Momentum'] = np.linalg.norm(hist[-1] - opt)

    hist = adam(x0, grad, alpha=0.01, n_iter=n_iter_compare)
    results[name]['Adam'] = np.linalg.norm(hist[-1] - opt)

    hist = bfgs(x0, f, grad, n_iter=n_iter_compare)
    results[name]['BFGS'] = np.linalg.norm(hist[-1] - opt)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart comparison
functions = list(results.keys())
methods = ['GD', 'Momentum', 'Adam', 'BFGS']
x_pos = np.arange(len(functions))
width = 0.2

for i, method in enumerate(methods):
    values = [results[fn][method] for fn in functions]
    axes[0].bar(x_pos + i*width, values, width, label=method)

axes[0].set_ylabel('Final distance to optimum')
axes[0].set_xticks(x_pos + 1.5*width)
axes[0].set_xticklabels(functions)
axes[0].set_title('Optimizer Comparison')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].set_yscale('log')

# Iteration counts to reach tolerance
tol = 1e-3
iter_counts = {}

for name, (f, grad, x0, opt) in test_functions.items():
    iter_counts[name] = {}

    for method_name, method in [('GD', lambda x0, g: gradient_descent(x0, g, alpha=0.001, n_iter=10000)),
                                ('Momentum', lambda x0, g: momentum_gd(x0, g, alpha=0.001, beta=0.9, n_iter=10000)),
                                ('Adam', lambda x0, g: adam(x0, g, alpha=0.01, n_iter=10000))]:
        hist = method(x0, grad)
        distances = [np.linalg.norm(x - opt) for x in hist]
        # Find first iteration below tolerance
        below_tol = np.where(np.array(distances) < tol)[0]
        if len(below_tol) > 0:
            iter_counts[name][method_name] = below_tol[0]
        else:
            iter_counts[name][method_name] = 10000

# Heatmap of iteration counts
data = np.array([[iter_counts[fn][m] for m in ['GD', 'Momentum', 'Adam']] for fn in functions])
im = axes[1].imshow(data, cmap='YlOrRd', aspect='auto')

axes[1].set_xticks(range(3))
axes[1].set_yticks(range(len(functions)))
axes[1].set_xticklabels(['GD', 'Momentum', 'Adam'])
axes[1].set_yticklabels(functions)

for i in range(len(functions)):
    for j in range(3):
        axes[1].text(j, i, f'{data[i, j]:.0f}', ha='center', va='center')

plt.colorbar(im, ax=axes[1], label='Iterations')
axes[1].set_title(f'Iterations to reach $||x - x^*|| < {tol}$')

plt.tight_layout()
save_plot('optimization_comparison.pdf', 'Optimizer comparison across test functions')
\end{pycode}
\section{Results Summary}

\begin{pycode}
# Create comprehensive results table
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Optimization Results on Rosenbrock Function}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'Method & Final $f(x)$ & Iterations \\')
print(r'\midrule')
print(f'Gradient Descent & {final_gd:.6f} & {n_iter} \\\\')
print(f'Momentum & {final_mom:.6f} & {n_iter} \\\\')
print(f'Nesterov & {final_nest:.6f} & {n_iter} \\\\')
print(f'RMSprop & {final_rms:.6f} & {n_iter} \\\\')
print(f'Adam & {final_adam:.6f} & {n_iter} \\\\')
print(f"Newton's Method & {final_newton:.6f} & {newton_iters} \\\\")
print(f'BFGS & {final_bfgs:.6f} & {bfgs_iters} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')

# Constrained optimization results
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Constrained Optimization Results}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'Method & Final $f(x)$ & Constraint Violation \\')
print(r'\midrule')
print(f'Penalty Method & {final_penalty:.4f} & {max(0, constraint(hist_penalty[-1])):.6f} \\\\')
print(f'Aug. Lagrangian & {final_al:.4f} & {max(0, constraint(hist_al[-1])):.6f} \\\\')
print(f'Projected GD & {final_proj:.4f} & {max(0, constraint(hist_proj[-1])):.6f} \\\\')
print(f'Analytical Opt. & {f_opt_con:.4f} & 0.0 \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')

# Method comparison
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Optimizer Performance Comparison (final distance to optimum)}')
print(r'\begin{tabular}{lcccc}')
print(r'\toprule')
print(r'Function & GD & Momentum & Adam & BFGS \\')
print(r'\midrule')
for name in results.keys():
    gd = results[name]['GD']
    mom = results[name]['Momentum']
    adam_r = results[name]['Adam']
    bfgs_r = results[name]['BFGS']
    print(f'{name} & {gd:.2e} & {mom:.2e} & {adam_r:.2e} & {bfgs_r:.2e} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Statistical Summary}

Optimization results (Rosenbrock function unless noted otherwise):
\begin{itemize}
\item Final value (GD): \py{f"{final_gd:.6f}"}
\item Final value (Momentum): \py{f"{final_mom:.6f}"}
\item Final value (Nesterov): \py{f"{final_nest:.6f}"}
\item Final value (RMSprop): \py{f"{final_rms:.6f}"}
\item Final value (Adam): \py{f"{final_adam:.6f}"}
\item Final value (Newton): \py{f"{final_newton:.6f}"} in \py{f"{newton_iters}"} iterations
\item Final value (BFGS): \py{f"{final_bfgs:.6f}"} in \py{f"{bfgs_iters}"} iterations
\item True optimum of the Rosenbrock function: $(1, 1)$
\item Constrained optimum: \py{f"{f_opt_con:.4f}"} at \py{f"({x_opt_con[0]:.3f}, {x_opt_con[1]:.3f})"}
\end{itemize}
\section{Conclusion}

This analysis compared optimization methods across different problem classes:

\begin{enumerate}
\item \textbf{First-order methods}: Momentum accelerates convergence by accumulating past gradients. Adaptive methods (RMSprop, Adam) automatically tune per-parameter learning rates, making them robust across problems.

\item \textbf{Second-order methods}: Newton's method achieves quadratic convergence near the optimum but requires Hessian computation. BFGS approximates the Hessian using gradient differences, providing superlinear convergence without explicit second derivatives.

\item \textbf{Condition number effects}: Ill-conditioned problems (high $\kappa$) slow gradient descent significantly. Momentum and adaptive methods partially mitigate this effect.

\item \textbf{Constrained optimization}: Penalty methods, the augmented Lagrangian, and projected gradient descent handle constraints through different mechanisms. The augmented Lagrangian combines the benefits of penalty and dual methods.

\item \textbf{Stochastic optimization}: Mini-batch SGD with learning rate decay balances noise reduction against computational cost. The batch size controls the variance of the gradient estimates.
\end{enumerate}

Learning rate selection is critical: too small a step leads to slow convergence, too large a step causes divergence. Adaptive methods such as Adam are popular in deep learning because of their robustness to hyperparameter choices.

\end{document}