\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{siunitx}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{float}
\usepackage[makestderr]{pythontex}

\title{Optimization: Gradient Descent Methods and Convergence Analysis}
\author{Computational Science Templates}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
Optimization algorithms find minima of objective functions and are fundamental to machine learning, scientific computing, and engineering design. This document demonstrates gradient descent variants (vanilla, momentum, RMSprop, Adam), Newton's method, quasi-Newton methods (BFGS), and constrained optimization. Using PythonTeX, we compare convergence behavior on standard test functions and analyze the effects of hyperparameters on optimization performance.
\end{abstract}

\section{Introduction}
Optimization algorithms find minima of objective functions. This analysis compares gradient descent variants on test functions, demonstrating convergence behavior, hyperparameter sensitivity, and the tradeoffs between different optimization strategies. Applications span machine learning (neural network training), scientific computing (parameter estimation), and engineering (design optimization).

\section{Mathematical Framework}

\subsection{First-Order Methods}
Gradient descent update:
\begin{equation}
x_{k+1} = x_k - \alpha \nabla f(x_k)
\end{equation}

Momentum update (heavy ball method):
\begin{equation}
v_{k+1} = \beta v_k + \alpha \nabla f(x_k), \quad x_{k+1} = x_k - v_{k+1}
\end{equation}

Nesterov accelerated gradient:
\begin{equation}
v_{k+1} = \beta v_k + \alpha \nabla f(x_k - \beta v_k), \quad x_{k+1} = x_k - v_{k+1}
\end{equation}

\subsection{Adaptive Learning Rates}
RMSprop:
\begin{equation}
s_k = \rho s_{k-1} + (1-\rho) (\nabla f(x_k))^2, \quad x_{k+1} = x_k - \frac{\alpha}{\sqrt{s_k} + \epsilon} \nabla f(x_k)
\end{equation}

Adam (Adaptive Moment Estimation):
\begin{align}
m_k &= \beta_1 m_{k-1} + (1-\beta_1) \nabla f(x_k) \\
v_k &= \beta_2 v_{k-1} + (1-\beta_2) (\nabla f(x_k))^2 \\
\hat{m}_k &= \frac{m_k}{1-\beta_1^k}, \quad \hat{v}_k = \frac{v_k}{1-\beta_2^k} \\
x_{k+1} &= x_k - \frac{\alpha}{\sqrt{\hat{v}_k} + \epsilon} \hat{m}_k
\end{align}
The bias-corrected estimates $\hat{m}_k$ and $\hat{v}_k$ compensate for the zero initialization of $m_0$ and $v_0$, which would otherwise bias the moment estimates toward zero during early iterations; $\epsilon$ is a small constant that prevents division by zero.

\subsection{Second-Order Methods}
Newton's method:
\begin{equation}
x_{k+1} = x_k - [H_f(x_k)]^{-1} \nabla f(x_k)
\end{equation}
where $H_f$ is the Hessian matrix.
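Each Newton step is the exact minimizer of the local quadratic model
\begin{equation}
m_k(p) = f(x_k) + \nabla f(x_k)^\top p + \tfrac{1}{2} p^\top H_f(x_k)\, p,
\end{equation}
provided $H_f(x_k)$ is positive definite; setting $\nabla m_k(p) = 0$ recovers the step $p = -[H_f(x_k)]^{-1} \nabla f(x_k)$.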

BFGS (quasi-Newton):
\begin{equation}
x_{k+1} = x_k - \alpha_k B_k^{-1} \nabla f(x_k)
\end{equation}
where $B_k$ approximates the Hessian using gradient differences.
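With $s_k = x_{k+1} - x_k$ and $y_k = \nabla f(x_{k+1}) - \nabla f(x_k)$, the BFGS update of the inverse approximation $H_k = B_k^{-1}$ (the quantity maintained directly in the implementation below) is
\begin{equation}
H_{k+1} = \left(I - \rho_k s_k y_k^\top\right) H_k \left(I - \rho_k y_k s_k^\top\right) + \rho_k s_k s_k^\top, \qquad \rho_k = \frac{1}{y_k^\top s_k},
\end{equation}
which enforces the secant condition $H_{k+1} y_k = s_k$ and preserves positive definiteness whenever the curvature condition $y_k^\top s_k > 0$ holds.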

\subsection{Convergence Rates}
For $\mu$-strongly convex functions with $L$-Lipschitz gradients (condition number $\kappa = L/\mu$):
\begin{itemize}
\item Gradient descent: $\mathcal{O}((1 - \mu/L)^k)$
\item Momentum: $\mathcal{O}((1 - \sqrt{\mu/L})^k)$ (see the worked comparison below)
\item Newton's method: quadratic convergence near the optimum
\end{itemize}
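
As a concrete comparison, for $\kappa = L/\mu = 100$ the gradient descent factor is $1 - \mu/L = 0.99$ per iteration, so reducing the error by a factor of $10^{6}$ takes roughly $6 \ln 10 / 0.01 \approx 1400$ iterations, whereas the momentum factor $1 - \sqrt{\mu/L} = 0.9$ requires only about $6 \ln 10 / 0.105 \approx 130$ iterations; the iteration count scales as $\kappa$ for gradient descent but only as $\sqrt{\kappa}$ for accelerated methods.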

\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize, minimize_scalar
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')

np.random.seed(42)

def save_plot(filename, caption=""):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.95\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Test Functions}

\begin{pycode}
# Define test functions and their gradients

# Rosenbrock function (banana function)
def rosenbrock(x):
    return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2

def rosenbrock_grad(x):
    dx = -2*(1 - x[0]) - 400*x[0]*(x[1] - x[0]**2)
    dy = 200*(x[1] - x[0]**2)
    return np.array([dx, dy])

def rosenbrock_hess(x):
    h11 = 2 - 400*x[1] + 1200*x[0]**2
    h12 = -400*x[0]
    h22 = 200
    return np.array([[h11, h12], [h12, h22]])

# Beale function
def beale(x):
    term1 = (1.5 - x[0] + x[0]*x[1])**2
    term2 = (2.25 - x[0] + x[0]*x[1]**2)**2
    term3 = (2.625 - x[0] + x[0]*x[1]**3)**2
    return term1 + term2 + term3

def beale_grad(x):
    dx = (2*(1.5 - x[0] + x[0]*x[1])*(-1 + x[1]) +
          2*(2.25 - x[0] + x[0]*x[1]**2)*(-1 + x[1]**2) +
          2*(2.625 - x[0] + x[0]*x[1]**3)*(-1 + x[1]**3))
    dy = (2*(1.5 - x[0] + x[0]*x[1])*x[0] +
          2*(2.25 - x[0] + x[0]*x[1]**2)*2*x[0]*x[1] +
          2*(2.625 - x[0] + x[0]*x[1]**3)*3*x[0]*x[1]**2)
    return np.array([dx, dy])

# Rastrigin function (multimodal)
def rastrigin(x):
    A = 10
    n = len(x)
    return A*n + sum(xi**2 - A*np.cos(2*np.pi*xi) for xi in x)

def rastrigin_grad(x):
    A = 10
    return np.array([2*xi + 2*np.pi*A*np.sin(2*np.pi*xi) for xi in x])

# Quadratic function (for analysis)
def quadratic(x, A=None, b=None):
    if A is None:
        A = np.array([[10, 0], [0, 1]])
    if b is None:
        b = np.zeros(2)
    return 0.5 * x @ A @ x - b @ x

def quadratic_grad(x, A=None, b=None):
    if A is None:
        A = np.array([[10, 0], [0, 1]])
    if b is None:
        b = np.zeros(2)
    return A @ x - b

# Store optimal values
rosenbrock_opt = np.array([1.0, 1.0])
beale_opt = np.array([3.0, 0.5])
rastrigin_opt = np.array([0.0, 0.0])
quadratic_opt = np.array([0.0, 0.0])
\end{pycode}
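
The Rosenbrock function $f(x) = (1 - x_1)^2 + 100\,(x_2 - x_1^2)^2$ has a narrow curved valley with its global minimum at $(1, 1)$; the Beale function attains its minimum at $(3, 0.5)$; the Rastrigin function $f(x) = A n + \sum_{i=1}^{n} \left[ x_i^2 - A \cos(2\pi x_i) \right]$ with $A = 10$ is highly multimodal with global minimum at the origin; and the quadratic $f(x) = \tfrac{1}{2} x^\top A x$ with $A = \operatorname{diag}(10, 1)$ provides a controlled problem with condition number $\kappa = 10$.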

\section{Gradient Descent Variants}

\begin{pycode}
# Implement optimization algorithms

def gradient_descent(x0, grad_f, alpha=0.001, n_iter=1000):
    """Vanilla gradient descent."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        x = x - alpha * grad_f(x)
        history.append(x.copy())
    return np.array(history)

def momentum_gd(x0, grad_f, alpha=0.001, beta=0.9, n_iter=1000):
    """Gradient descent with momentum."""
    x = x0.copy()
    v = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        v = beta*v + alpha*grad_f(x)
        x = x - v
        history.append(x.copy())
    return np.array(history)

def nesterov_gd(x0, grad_f, alpha=0.001, beta=0.9, n_iter=1000):
    """Nesterov accelerated gradient."""
    x = x0.copy()
    v = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        x_lookahead = x - beta*v
        v = beta*v + alpha*grad_f(x_lookahead)
        x = x - v
        history.append(x.copy())
    return np.array(history)

def rmsprop(x0, grad_f, alpha=0.001, rho=0.9, eps=1e-8, n_iter=1000):
    """RMSprop optimizer."""
    x = x0.copy()
    s = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        g = grad_f(x)
        s = rho*s + (1-rho)*g**2
        x = x - alpha * g / (np.sqrt(s) + eps)
        history.append(x.copy())
    return np.array(history)

def adam(x0, grad_f, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8, n_iter=1000):
    """Adam optimizer."""
    x = x0.copy()
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    history = [x.copy()]
    for t in range(1, n_iter+1):
        g = grad_f(x)
        m = beta1*m + (1-beta1)*g
        v = beta2*v + (1-beta2)*g**2
        m_hat = m / (1 - beta1**t)
        v_hat = v / (1 - beta2**t)
        x = x - alpha * m_hat / (np.sqrt(v_hat) + eps)
        history.append(x.copy())
    return np.array(history)

# Run optimizers on Rosenbrock
x0 = np.array([-1.5, 1.5])
n_iter = 5000

hist_gd = gradient_descent(x0, rosenbrock_grad, alpha=0.001, n_iter=n_iter)
hist_mom = momentum_gd(x0, rosenbrock_grad, alpha=0.001, beta=0.9, n_iter=n_iter)
hist_nest = nesterov_gd(x0, rosenbrock_grad, alpha=0.001, beta=0.9, n_iter=n_iter)
hist_rms = rmsprop(x0, rosenbrock_grad, alpha=0.01, n_iter=n_iter)
hist_adam = adam(x0, rosenbrock_grad, alpha=0.01, n_iter=n_iter)

# Compute objective values
f_gd = [rosenbrock(x) for x in hist_gd]
f_mom = [rosenbrock(x) for x in hist_mom]
f_nest = [rosenbrock(x) for x in hist_nest]
f_rms = [rosenbrock(x) for x in hist_rms]
f_adam = [rosenbrock(x) for x in hist_adam]

# Contour plot of Rosenbrock
x = np.linspace(-2, 2, 100)
y = np.linspace(-1, 3, 100)
X, Y = np.meshgrid(x, y)
Z = (1 - X)**2 + 100*(Y - X**2)**2

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Optimization paths
axes[0, 0].contour(X, Y, Z, levels=np.logspace(0, 3, 20), cmap='viridis', alpha=0.7)
axes[0, 0].plot(hist_gd[:, 0], hist_gd[:, 1], 'b.-', markersize=1, linewidth=0.5, label='GD', alpha=0.7)
axes[0, 0].plot(hist_mom[:, 0], hist_mom[:, 1], 'r.-', markersize=1, linewidth=0.5, label='Momentum', alpha=0.7)
axes[0, 0].plot(hist_adam[:, 0], hist_adam[:, 1], 'g.-', markersize=1, linewidth=0.5, label='Adam', alpha=0.7)
axes[0, 0].plot(1, 1, 'k*', markersize=15, label='Optimum')
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title('Optimization Paths on Rosenbrock')
axes[0, 0].legend(fontsize=8)

# Convergence comparison
axes[0, 1].semilogy(f_gd, 'b-', linewidth=1, label='GD')
axes[0, 1].semilogy(f_mom, 'r-', linewidth=1, label='Momentum')
axes[0, 1].semilogy(f_nest, 'm-', linewidth=1, label='Nesterov')
axes[0, 1].semilogy(f_rms, 'c-', linewidth=1, label='RMSprop')
axes[0, 1].semilogy(f_adam, 'g-', linewidth=1, label='Adam')
axes[0, 1].set_xlabel('Iteration')
axes[0, 1].set_ylabel('$f(x)$')
axes[0, 1].set_title('Convergence Comparison')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# Effect of learning rate
alphas = [0.0001, 0.0005, 0.001, 0.002]
for alpha in alphas:
    hist = gradient_descent(x0, rosenbrock_grad, alpha=alpha, n_iter=2000)
    f_vals = [rosenbrock(x) for x in hist]
    axes[1, 0].semilogy(f_vals, linewidth=1, label=f'$\\alpha = {alpha}$')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$')
axes[1, 0].set_title('Effect of Learning Rate (GD)')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)

# Effect of momentum
betas = [0.0, 0.5, 0.9, 0.99]
for beta in betas:
    hist = momentum_gd(x0, rosenbrock_grad, alpha=0.001, beta=beta, n_iter=2000)
    f_vals = [rosenbrock(x) for x in hist]
    axes[1, 1].semilogy(f_vals, linewidth=1, label=f'$\\beta = {beta}$')
axes[1, 1].set_xlabel('Iteration')
axes[1, 1].set_ylabel('$f(x)$')
axes[1, 1].set_title('Effect of Momentum')
axes[1, 1].legend(fontsize=8)
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_gradient_descent.pdf', 'Gradient descent variants on Rosenbrock function')

# Store final values
final_gd = rosenbrock(hist_gd[-1])
final_mom = rosenbrock(hist_mom[-1])
final_nest = rosenbrock(hist_nest[-1])
final_rms = rosenbrock(hist_rms[-1])
final_adam = rosenbrock(hist_adam[-1])
\end{pycode}

\section{Second-Order Methods}

\begin{pycode}
# Newton's method
def newton_method(x0, grad_f, hess_f, n_iter=100, tol=1e-10):
    """Newton's method with Hessian."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        g = grad_f(x)
        H = hess_f(x)
        try:
            dx = np.linalg.solve(H, -g)
        except np.linalg.LinAlgError:
            # Singular Hessian: stop rather than crash
            break
        x = x + dx
        history.append(x.copy())
        if np.linalg.norm(g) < tol:
            break
    return np.array(history)

# BFGS quasi-Newton
def bfgs(x0, f, grad_f, n_iter=1000, tol=1e-10):
    """BFGS quasi-Newton method."""
    n = len(x0)
    x = x0.copy()
    B = np.eye(n)  # Approximate inverse Hessian
    history = [x.copy()]

    for _ in range(n_iter):
        g = grad_f(x)
        if np.linalg.norm(g) < tol:
            break

        # Search direction
        d = -B @ g

        # Backtracking line search (Armijo condition)
        alpha = 1.0
        c = 0.5
        rho = 0.5
        while f(x + alpha*d) > f(x) + c*alpha*g@d:
            alpha *= rho

        # Update
        s = alpha * d
        x_new = x + s
        y = grad_f(x_new) - g

        # BFGS update of the inverse Hessian approximation
        if y @ s > 1e-10:
            rho_k = 1.0 / (y @ s)
            I = np.eye(n)
            B = (I - rho_k * np.outer(s, y)) @ B @ (I - rho_k * np.outer(y, s)) + rho_k * np.outer(s, s)

        x = x_new
        history.append(x.copy())

    return np.array(history)

# Run second-order methods
x0_newton = np.array([0.5, 0.5])  # Start closer for Newton
hist_newton = newton_method(x0_newton, rosenbrock_grad, rosenbrock_hess, n_iter=100)
hist_bfgs = bfgs(x0, rosenbrock, rosenbrock_grad, n_iter=1000)

f_newton = [rosenbrock(x) for x in hist_newton]
f_bfgs = [rosenbrock(x) for x in hist_bfgs]

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Newton's method path
axes[0, 0].contour(X, Y, Z, levels=np.logspace(0, 3, 20), cmap='viridis', alpha=0.7)
axes[0, 0].plot(hist_newton[:, 0], hist_newton[:, 1], 'ro-', markersize=6, linewidth=1.5, label='Newton')
axes[0, 0].plot(1, 1, 'k*', markersize=15)
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title(f"Newton's Method ({len(hist_newton)-1} iterations)")
axes[0, 0].legend()

# BFGS path
axes[0, 1].contour(X, Y, Z, levels=np.logspace(0, 3, 20), cmap='viridis', alpha=0.7)
axes[0, 1].plot(hist_bfgs[:, 0], hist_bfgs[:, 1], 'go-', markersize=3, linewidth=0.8, label='BFGS')
axes[0, 1].plot(1, 1, 'k*', markersize=15)
axes[0, 1].set_xlabel('$x_1$')
axes[0, 1].set_ylabel('$x_2$')
axes[0, 1].set_title(f'BFGS Method ({len(hist_bfgs)-1} iterations)')
axes[0, 1].legend()

# Convergence comparison (log scale)
axes[1, 0].semilogy(f_newton, 'r-o', markersize=6, linewidth=1.5, label='Newton')
axes[1, 0].semilogy(f_bfgs, 'g-', linewidth=1, label='BFGS')
axes[1, 0].semilogy(f_adam[:500], 'b-', linewidth=1, label='Adam')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$')
axes[1, 0].set_title('Second-Order vs First-Order Convergence')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Convergence rate analysis (distance to optimum)
dist_newton = [np.linalg.norm(x - rosenbrock_opt) for x in hist_newton]
dist_bfgs = [np.linalg.norm(x - rosenbrock_opt) for x in hist_bfgs]
dist_adam = [np.linalg.norm(x - rosenbrock_opt) for x in hist_adam[:500]]

axes[1, 1].semilogy(dist_newton, 'r-o', markersize=6, linewidth=1.5, label='Newton')
axes[1, 1].semilogy(dist_bfgs, 'g-', linewidth=1, label='BFGS')
axes[1, 1].semilogy(dist_adam, 'b-', linewidth=1, label='Adam')
axes[1, 1].set_xlabel('Iteration')
axes[1, 1].set_ylabel('$||x - x^*||$')
axes[1, 1].set_title('Distance to Optimum')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_second_order.pdf', "Second-order methods: Newton and BFGS")

final_newton = rosenbrock(hist_newton[-1])
final_bfgs = rosenbrock(hist_bfgs[-1])
newton_iters = len(hist_newton) - 1
bfgs_iters = len(hist_bfgs) - 1
\end{pycode}
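
Near a minimizer with a positive definite, Lipschitz-continuous Hessian, Newton's method converges quadratically,
\begin{equation}
\|x_{k+1} - x^*\| \le C \, \|x_k - x^*\|^2,
\end{equation}
which is why its error curve in the plots above drops off far more steeply than those of the first-order methods, while BFGS attains superlinear convergence without forming the Hessian.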

\section{Condition Number and Convergence}

\begin{pycode}
# Analyze effect of condition number on convergence
# Quadratic function f(x) = 0.5 * x'Ax with different condition numbers

def make_quadratic(kappa):
    """Create quadratic with condition number kappa."""
    A = np.array([[kappa, 0], [0, 1]])
    return A

def run_gd_quadratic(A, x0, alpha, n_iter):
    """Run GD on quadratic."""
    history = [x0.copy()]
    x = x0.copy()
    for _ in range(n_iter):
        g = A @ x
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

# Test different condition numbers
kappas = [1, 5, 10, 50, 100]
x0_quad = np.array([5.0, 5.0])
n_iter_quad = 200

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Convergence for different condition numbers
for kappa in kappas:
    A = make_quadratic(kappa)
    # Optimal step size for quadratic
    L = kappa
    mu = 1
    alpha_opt = 2 / (L + mu)

    hist = run_gd_quadratic(A, x0_quad, alpha_opt, n_iter_quad)
    f_vals = [0.5 * x @ A @ x for x in hist]
    axes[0, 0].semilogy(f_vals, linewidth=1.5, label=f'$\\kappa = {kappa}$')

axes[0, 0].set_xlabel('Iteration')
axes[0, 0].set_ylabel('$f(x)$')
axes[0, 0].set_title('Effect of Condition Number (optimal $\\alpha$)')
axes[0, 0].legend(fontsize=8)
axes[0, 0].grid(True, alpha=0.3)

# Theoretical vs actual convergence rate
kappa_range = np.arange(1, 101)
theory_rate = (kappa_range - 1) / (kappa_range + 1)
axes[0, 1].plot(kappa_range, theory_rate, 'b-', linewidth=2, label='$(\\kappa-1)/(\\kappa+1)$')
axes[0, 1].set_xlabel('Condition number $\\kappa$')
axes[0, 1].set_ylabel('Convergence rate')
axes[0, 1].set_title('Theoretical Convergence Rate (GD)')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Effect of step size on ill-conditioned problem
kappa_ill = 100
A_ill = make_quadratic(kappa_ill)
alphas_quad = [0.001, 0.005, 0.01, 0.019, 0.02]

for alpha in alphas_quad:
    hist = run_gd_quadratic(A_ill, x0_quad, alpha, n_iter_quad)
    f_vals = [0.5 * x @ A_ill @ x for x in hist]
    axes[1, 0].semilogy(f_vals, linewidth=1, label=f'$\\alpha = {alpha}$')

axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$')
axes[1, 0].set_title(f'Step Size Sensitivity ($\\kappa = {kappa_ill}$)')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].set_ylim([1e-15, 1e5])

# Compare GD vs momentum on the ill-conditioned problem
hist_gd_ill = run_gd_quadratic(A_ill, x0_quad, 0.01, 500)

# Momentum for quadratic
def run_momentum_quadratic(A, x0, alpha, beta, n_iter):
    x = x0.copy()
    v = np.zeros_like(x)
    history = [x.copy()]
    for _ in range(n_iter):
        g = A @ x
        v = beta*v + alpha*g
        x = x - v
        history.append(x.copy())
    return np.array(history)

hist_mom_ill = run_momentum_quadratic(A_ill, x0_quad, 0.01, 0.9, 500)

f_gd_ill = [0.5 * x @ A_ill @ x for x in hist_gd_ill]
f_mom_ill = [0.5 * x @ A_ill @ x for x in hist_mom_ill]

axes[1, 1].semilogy(f_gd_ill, 'b-', linewidth=1.5, label='GD')
axes[1, 1].semilogy(f_mom_ill, 'r-', linewidth=1.5, label='Momentum')
axes[1, 1].set_xlabel('Iteration')
axes[1, 1].set_ylabel('$f(x)$')
axes[1, 1].set_title('GD vs Momentum ($\\kappa = 100$)')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_condition_number.pdf', 'Effect of condition number on convergence')
\end{pycode}
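
For the quadratic $f(x) = \tfrac{1}{2} x^\top A x$, gradient descent iterates as $x_{k+1} = (I - \alpha A) x_k$, so the error contracts by $\max_i |1 - \alpha \lambda_i|$ per step. The step size $\alpha = 2/(L + \mu)$ used above balances the extreme eigenvalues $\mu$ and $L$ and yields the contraction factor
\begin{equation}
\frac{L - \mu}{L + \mu} = \frac{\kappa - 1}{\kappa + 1},
\end{equation}
which is the theoretical rate plotted against $\kappa$ above.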

\section{Constrained Optimization}

\begin{pycode}
# Constrained optimization using penalty methods and Lagrange multipliers

# Problem: minimize f(x) subject to g(x) <= 0
# f(x) = (x1-1)^2 + (x2-2)^2
# g(x) = x1^2 + x2^2 - 1 <= 0

def objective_constrained(x):
    return (x[0] - 1)**2 + (x[1] - 2)**2

def objective_grad(x):
    return np.array([2*(x[0] - 1), 2*(x[1] - 2)])

def constraint(x):
    return x[0]**2 + x[1]**2 - 1

def constraint_grad(x):
    return np.array([2*x[0], 2*x[1]])

# Penalty method
def penalty_method(x0, f, g, grad_f, grad_g, rho_init=1, rho_mult=10, n_outer=5, n_inner=500):
    """Solve constrained problem using quadratic penalty."""
    x = x0.copy()
    rho = rho_init
    history = [x.copy()]
    rhos = [rho]

    for _ in range(n_outer):
        # Inner optimization
        for _ in range(n_inner):
            violation = max(0, g(x))
            grad = grad_f(x) + 2*rho*violation*grad_g(x)
            x = x - 0.01 * grad
            history.append(x.copy())

        rho *= rho_mult
        rhos.append(rho)

    return np.array(history), rhos

# Augmented Lagrangian method
def augmented_lagrangian(x0, f, g, grad_f, grad_g, rho=10, n_outer=10, n_inner=100):
    """Augmented Lagrangian method."""
    x = x0.copy()
    lam = 0  # Lagrange multiplier
    history = [x.copy()]
    multipliers = [lam]

    for _ in range(n_outer):
        # Inner optimization
        for _ in range(n_inner):
            violation = g(x)
            grad = grad_f(x) + lam*grad_g(x) + rho*max(0, violation)*grad_g(x)
            x = x - 0.01 * grad
            history.append(x.copy())

        # Update multiplier
        lam = max(0, lam + rho * g(x))
        multipliers.append(lam)

    return np.array(history), multipliers

# Projected gradient descent
def projected_gd(x0, grad_f, project, alpha=0.1, n_iter=500):
    """Projected gradient descent."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        x = x - alpha * grad_f(x)
        x = project(x)  # Project onto feasible set
        history.append(x.copy())
    return np.array(history)

def project_to_ball(x, radius=1):
    """Project onto unit ball."""
    norm = np.linalg.norm(x)
    if norm > radius:
        return radius * x / norm
    return x

# Run methods
x0_con = np.array([0.0, 0.0])
hist_penalty, rhos = penalty_method(x0_con, objective_constrained, constraint,
                                    objective_grad, constraint_grad)
hist_al, multipliers = augmented_lagrangian(x0_con, objective_constrained, constraint,
                                            objective_grad, constraint_grad)
hist_proj = projected_gd(x0_con, objective_grad, project_to_ball, alpha=0.1, n_iter=500)

# Analytical solution
# Optimum on boundary: x* = (1, 2)/sqrt(5)
x_opt_con = np.array([1, 2]) / np.sqrt(5)
f_opt_con = objective_constrained(x_opt_con)

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Contour plot with constraint
theta_c = np.linspace(0, 2*np.pi, 100)
x_circle = np.cos(theta_c)
y_circle = np.sin(theta_c)

x_con = np.linspace(-1.5, 2, 100)
y_con = np.linspace(-1, 3, 100)
X_con, Y_con = np.meshgrid(x_con, y_con)
Z_con = (X_con - 1)**2 + (Y_con - 2)**2

axes[0, 0].contour(X_con, Y_con, Z_con, levels=20, cmap='viridis', alpha=0.7)
axes[0, 0].plot(x_circle, y_circle, 'r-', linewidth=2, label='Constraint')
axes[0, 0].fill(x_circle, y_circle, alpha=0.2, color='gray')
axes[0, 0].plot(hist_penalty[-1, 0], hist_penalty[-1, 1], 'bo', markersize=10, label='Penalty')
axes[0, 0].plot(hist_al[-1, 0], hist_al[-1, 1], 'g^', markersize=10, label='Aug. Lagrangian')
axes[0, 0].plot(hist_proj[-1, 0], hist_proj[-1, 1], 'rs', markersize=10, label='Projected GD')
axes[0, 0].plot(x_opt_con[0], x_opt_con[1], 'k*', markersize=15, label='Optimum')
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title('Constrained Optimization')
axes[0, 0].legend(fontsize=8)
axes[0, 0].set_xlim([-1.5, 2])
axes[0, 0].set_ylim([-1, 3])

# Convergence to optimum
f_penalty = [objective_constrained(x) for x in hist_penalty]
f_al = [objective_constrained(x) for x in hist_al]
f_proj = [objective_constrained(x) for x in hist_proj]

axes[0, 1].plot(f_penalty, 'b-', linewidth=1, label='Penalty', alpha=0.7)
axes[0, 1].plot(f_al, 'g-', linewidth=1, label='Aug. Lagrangian', alpha=0.7)
axes[0, 1].plot(f_proj, 'r-', linewidth=1, label='Projected GD', alpha=0.7)
axes[0, 1].axhline(y=f_opt_con, color='k', linestyle='--', label=f'Optimal = {f_opt_con:.4f}')
axes[0, 1].set_xlabel('Iteration')
axes[0, 1].set_ylabel('$f(x)$')
axes[0, 1].set_title('Objective Value')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# Constraint violation
viol_penalty = [max(0, constraint(x)) for x in hist_penalty]
viol_al = [max(0, constraint(x)) for x in hist_al]
viol_proj = [max(0, constraint(x)) for x in hist_proj]

axes[1, 0].semilogy(viol_penalty, 'b-', linewidth=1, label='Penalty')
axes[1, 0].semilogy(viol_al, 'g-', linewidth=1, label='Aug. Lagrangian')
axes[1, 0].semilogy(viol_proj, 'r-', linewidth=1, label='Projected GD')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('Constraint violation')
axes[1, 0].set_title('Constraint Satisfaction')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)

# Lagrange multiplier evolution
axes[1, 1].plot(multipliers, 'g-o', markersize=6, linewidth=1.5)
axes[1, 1].set_xlabel('Outer iteration')
axes[1, 1].set_ylabel('$\\lambda$')
axes[1, 1].set_title('Lagrange Multiplier Evolution')
axes[1, 1].grid(True, alpha=0.3)

# Theoretical optimal multiplier for g(x) = x1^2 + x2^2 - 1: lambda* = sqrt(5) - 1
lam_opt = np.sqrt(5) - 1
axes[1, 1].axhline(y=lam_opt, color='r', linestyle='--', label=f'$\\lambda^* = {lam_opt:.3f}$')
axes[1, 1].legend()

plt.tight_layout()
save_plot('optimization_constrained.pdf', 'Constrained optimization methods')

final_penalty = objective_constrained(hist_penalty[-1])
final_al = objective_constrained(hist_al[-1])
final_proj = objective_constrained(hist_proj[-1])
\end{pycode}
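
The analytical solution follows from the KKT conditions. Stationarity, $\nabla f(x^*) + \lambda^* \nabla g(x^*) = 0$ with $f(x) = (x_1 - 1)^2 + (x_2 - 2)^2$ and $g(x) = x_1^2 + x_2^2 - 1$, gives $x^* = (1, 2)/(1 + \lambda^*)$, and the active constraint $\|x^*\| = 1$ forces $1 + \lambda^* = \sqrt{5}$. Hence
\begin{equation}
x^* = \frac{1}{\sqrt{5}} (1, 2), \qquad \lambda^* = \sqrt{5} - 1 \approx 1.236,
\end{equation}
the reference values used in the figure.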

\section{Stochastic Optimization}

\begin{pycode}
# Stochastic gradient descent for noisy gradients

def sgd(x0, grad_f, alpha=0.01, n_iter=1000, noise_std=0.1):
    """Stochastic gradient descent with noisy gradients."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        # Noisy gradient estimate
        g = grad_f(x) + noise_std * np.random.randn(len(x))
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

def sgd_with_decay(x0, grad_f, alpha0=0.1, decay=0.001, n_iter=1000, noise_std=0.5):
    """SGD with learning rate decay."""
    x = x0.copy()
    history = [x.copy()]
    for t in range(1, n_iter+1):
        alpha = alpha0 / (1 + decay * t)
        g = grad_f(x) + noise_std * np.random.randn(len(x))
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

def mini_batch_sgd(x0, grad_f, batch_size=10, alpha=0.01, n_iter=1000, noise_std=0.5):
    """Mini-batch SGD with averaged gradients."""
    x = x0.copy()
    history = [x.copy()]
    for _ in range(n_iter):
        # Averaging over a batch reduces the noise standard deviation by sqrt(batch_size)
        g = grad_f(x) + noise_std * np.random.randn(len(x)) / np.sqrt(batch_size)
        x = x - alpha * g
        history.append(x.copy())
    return np.array(history)

# Run stochastic methods on simple quadratic
A_quad = np.array([[5, 0], [0, 1]])
def quad_grad(x):
    return A_quad @ x

x0_sgd = np.array([5.0, 5.0])
n_iter_sgd = 2000

hist_sgd = sgd(x0_sgd, quad_grad, alpha=0.01, noise_std=1.0, n_iter=n_iter_sgd)
hist_sgd_decay = sgd_with_decay(x0_sgd, quad_grad, alpha0=0.1, decay=0.01, noise_std=1.0, n_iter=n_iter_sgd)
hist_minibatch = mini_batch_sgd(x0_sgd, quad_grad, batch_size=32, alpha=0.05, noise_std=1.0, n_iter=n_iter_sgd)

# Full batch for comparison
hist_full = run_gd_quadratic(A_quad, x0_sgd, 0.1, n_iter_sgd)

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Optimization paths
axes[0, 0].plot(hist_sgd[:500, 0], hist_sgd[:500, 1], 'b-', linewidth=0.3, alpha=0.5, label='SGD')
axes[0, 0].plot(hist_full[:500, 0], hist_full[:500, 1], 'r-', linewidth=1.5, label='Full batch')
axes[0, 0].plot(0, 0, 'k*', markersize=15)
axes[0, 0].set_xlabel('$x_1$')
axes[0, 0].set_ylabel('$x_2$')
axes[0, 0].set_title('SGD vs Full Batch GD')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Convergence with noise
f_sgd = [0.5 * x @ A_quad @ x for x in hist_sgd]
f_decay = [0.5 * x @ A_quad @ x for x in hist_sgd_decay]
f_mini = [0.5 * x @ A_quad @ x for x in hist_minibatch]
f_full = [0.5 * x @ A_quad @ x for x in hist_full]

axes[0, 1].semilogy(f_sgd, 'b-', linewidth=0.5, alpha=0.5, label='SGD')
axes[0, 1].semilogy(f_decay, 'g-', linewidth=0.5, alpha=0.5, label='SGD + decay')
axes[0, 1].semilogy(f_mini, 'c-', linewidth=0.5, alpha=0.5, label='Mini-batch')
axes[0, 1].semilogy(f_full, 'r-', linewidth=1.5, label='Full batch')
axes[0, 1].set_xlabel('Iteration')
axes[0, 1].set_ylabel('$f(x)$')
axes[0, 1].set_title('Stochastic Optimization Convergence')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# Moving average
window = 50
f_sgd_avg = np.convolve(f_sgd, np.ones(window)/window, mode='valid')
f_decay_avg = np.convolve(f_decay, np.ones(window)/window, mode='valid')
f_mini_avg = np.convolve(f_mini, np.ones(window)/window, mode='valid')

axes[1, 0].semilogy(f_sgd_avg, 'b-', linewidth=1.5, label='SGD')
axes[1, 0].semilogy(f_decay_avg, 'g-', linewidth=1.5, label='SGD + decay')
axes[1, 0].semilogy(f_mini_avg, 'c-', linewidth=1.5, label='Mini-batch')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('$f(x)$ (moving avg)')
axes[1, 0].set_title(f'Smoothed Convergence (window={window})')
axes[1, 0].legend(fontsize=8)
axes[1, 0].grid(True, alpha=0.3)

# Variance reduction with batch size
batch_sizes = [1, 4, 16, 64]
final_variances = []
for bs in batch_sizes:
    # Run multiple trials
    finals = []
    for _ in range(20):
        hist = mini_batch_sgd(x0_sgd, quad_grad, batch_size=bs, alpha=0.01, noise_std=1.0, n_iter=500)
        finals.append(0.5 * hist[-1] @ A_quad @ hist[-1])
    final_variances.append(np.var(finals))

axes[1, 1].loglog(batch_sizes, final_variances, 'o-', markersize=8, linewidth=2)
axes[1, 1].set_xlabel('Batch size')
axes[1, 1].set_ylabel('Variance of final $f(x)$')
axes[1, 1].set_title('Variance Reduction with Batch Size')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('optimization_stochastic.pdf', 'Stochastic gradient descent methods')
\end{pycode}
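
Averaging a mini-batch of $B$ independent stochastic gradients with per-sample noise variance $\sigma^2$ reduces the variance of the gradient estimate,
\begin{equation}
\operatorname{Var}\!\left[ \frac{1}{B} \sum_{i=1}^{B} g_i \right] = \frac{\sigma^2}{B},
\end{equation}
which is what the division by $\sqrt{\texttt{batch\_size}}$ in \texttt{mini\_batch\_sgd} simulates; larger batches therefore yield less variable final objective values, at the cost of more gradient evaluations per step.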

\section{Comparison on Multiple Test Functions}

\begin{pycode}
# Compare optimizers on different test functions

test_functions = {
    'Rosenbrock': (rosenbrock, rosenbrock_grad, np.array([-1.5, 1.5]), np.array([1.0, 1.0])),
    'Beale': (beale, beale_grad, np.array([0.0, 0.0]), np.array([3.0, 0.5])),
    'Quadratic': (quadratic, quadratic_grad, np.array([5.0, 5.0]), np.array([0.0, 0.0]))
}

results = {}
n_iter_compare = 2000

for name, (f, grad, x0, opt) in test_functions.items():
    results[name] = {}

    # Run optimizers
    hist = gradient_descent(x0, grad, alpha=0.001, n_iter=n_iter_compare)
    results[name]['GD'] = np.linalg.norm(hist[-1] - opt)

    hist = momentum_gd(x0, grad, alpha=0.001, beta=0.9, n_iter=n_iter_compare)
    results[name]['Momentum'] = np.linalg.norm(hist[-1] - opt)

    hist = adam(x0, grad, alpha=0.01, n_iter=n_iter_compare)
    results[name]['Adam'] = np.linalg.norm(hist[-1] - opt)

    hist = bfgs(x0, f, grad, n_iter=n_iter_compare)
    results[name]['BFGS'] = np.linalg.norm(hist[-1] - opt)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart comparison
functions = list(results.keys())
methods = ['GD', 'Momentum', 'Adam', 'BFGS']
x_pos = np.arange(len(functions))
width = 0.2

for i, method in enumerate(methods):
    values = [results[fn][method] for fn in functions]
    axes[0].bar(x_pos + i*width, values, width, label=method)

axes[0].set_ylabel('Final distance to optimum')
axes[0].set_xticks(x_pos + 1.5*width)
axes[0].set_xticklabels(functions)
axes[0].set_title('Optimizer Comparison')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].set_yscale('log')

# Iteration counts to reach tolerance
tol = 1e-3
iter_counts = {}

for name, (f, grad, x0, opt) in test_functions.items():
    iter_counts[name] = {}

    for method_name, method in [('GD', lambda x0, g: gradient_descent(x0, g, alpha=0.001, n_iter=10000)),
                                ('Momentum', lambda x0, g: momentum_gd(x0, g, alpha=0.001, beta=0.9, n_iter=10000)),
                                ('Adam', lambda x0, g: adam(x0, g, alpha=0.01, n_iter=10000))]:
        hist = method(x0, grad)
        distances = [np.linalg.norm(x - opt) for x in hist]
        # Find first iteration below tolerance
        below_tol = np.where(np.array(distances) < tol)[0]
        if len(below_tol) > 0:
            iter_counts[name][method_name] = below_tol[0]
        else:
            iter_counts[name][method_name] = 10000

# Heatmap of iteration counts
data = np.array([[iter_counts[fn][m] for m in ['GD', 'Momentum', 'Adam']] for fn in functions])
im = axes[1].imshow(data, cmap='YlOrRd', aspect='auto')

axes[1].set_xticks(range(3))
axes[1].set_yticks(range(len(functions)))
axes[1].set_xticklabels(['GD', 'Momentum', 'Adam'])
axes[1].set_yticklabels(functions)

for i in range(len(functions)):
    for j in range(3):
        axes[1].text(j, i, f'{data[i, j]:.0f}', ha='center', va='center')

plt.colorbar(im, ax=axes[1], label='Iterations')
axes[1].set_title(f'Iterations to reach $||x - x^*|| < {tol}$')

plt.tight_layout()
save_plot('optimization_comparison.pdf', 'Optimizer comparison across test functions')
\end{pycode}

\section{Results Summary}

\begin{pycode}
# Create comprehensive results tables
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Optimization Results on Rosenbrock Function}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'Method & Final $f(x)$ & Iterations \\')
print(r'\midrule')
print(f'Gradient Descent & {final_gd:.6f} & {n_iter} \\\\')
print(f'Momentum & {final_mom:.6f} & {n_iter} \\\\')
print(f'Nesterov & {final_nest:.6f} & {n_iter} \\\\')
print(f'RMSprop & {final_rms:.6f} & {n_iter} \\\\')
print(f'Adam & {final_adam:.6f} & {n_iter} \\\\')
print(f"Newton's Method & {final_newton:.6f} & {newton_iters} \\\\")
print(f'BFGS & {final_bfgs:.6f} & {bfgs_iters} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')

# Constrained optimization results
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Constrained Optimization Results}')
print(r'\begin{tabular}{lcc}')
print(r'\toprule')
print(r'Method & Final $f(x)$ & Constraint Violation \\')
print(r'\midrule')
print(f'Penalty Method & {final_penalty:.4f} & {max(0, constraint(hist_penalty[-1])):.6f} \\\\')
print(f'Aug. Lagrangian & {final_al:.4f} & {max(0, constraint(hist_al[-1])):.6f} \\\\')
print(f'Projected GD & {final_proj:.4f} & {max(0, constraint(hist_proj[-1])):.6f} \\\\')
print(f'Analytical Opt. & {f_opt_con:.4f} & 0.0 \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')

# Method comparison: final distance to the known optimum
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Final Distance to Optimum $\|x - x^*\|$ Across Test Functions}')
print(r'\begin{tabular}{lcccc}')
print(r'\toprule')
print(r'Function & GD & Momentum & Adam & BFGS \\')
print(r'\midrule')
for name in results.keys():
    gd = results[name]['GD']
    mom = results[name]['Momentum']
    adam_r = results[name]['Adam']
    bfgs_r = results[name]['BFGS']
    print(f'{name} & {gd:.2e} & {mom:.2e} & {adam_r:.2e} & {bfgs_r:.2e} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\section{Statistical Summary}

Final objective values on the Rosenbrock function, together with the constrained-problem results:
\begin{itemize}
\item Final value (GD): \py{f"{final_gd:.6f}"}
\item Final value (Momentum): \py{f"{final_mom:.6f}"}
\item Final value (Nesterov): \py{f"{final_nest:.6f}"}
\item Final value (RMSprop): \py{f"{final_rms:.6f}"}
\item Final value (Adam): \py{f"{final_adam:.6f}"}
\item Final value (Newton): \py{f"{final_newton:.6f}"} in \py{f"{newton_iters}"} iterations
\item Final value (BFGS): \py{f"{final_bfgs:.6f}"} in \py{f"{bfgs_iters}"} iterations
\item Rosenbrock optimum: $(1, 1)$
\item Constrained optimum: \py{f"{f_opt_con:.4f}"} at \py{f"({x_opt_con[0]:.3f}, {x_opt_con[1]:.3f})"}
\end{itemize}

\section{Conclusion}

This analysis compared optimization methods across different problem classes:

\begin{enumerate}
\item \textbf{First-order methods}: Momentum accelerates convergence by accumulating past gradients. Adaptive methods (RMSprop, Adam) automatically tune per-parameter learning rates, making them robust across problems.

\item \textbf{Second-order methods}: Newton's method achieves quadratic convergence near the optimum but requires Hessian computation. BFGS approximates the Hessian using gradient differences, providing superlinear convergence without explicit second derivatives.

\item \textbf{Condition number effects}: Ill-conditioned problems (high $\kappa$) slow gradient descent significantly. Momentum and adaptive methods partially mitigate this effect.

\item \textbf{Constrained optimization}: Penalty methods, the augmented Lagrangian, and projected gradient descent handle constraints through different mechanisms. The augmented Lagrangian combines the benefits of penalty and dual methods.

\item \textbf{Stochastic optimization}: Mini-batch SGD with learning rate decay balances noise reduction and computational efficiency. The batch size controls the variance of the gradient estimates.
\end{enumerate}

Learning rate selection is critical: too small leads to slow convergence, too large causes divergence. Adaptive methods like Adam are popular in deep learning due to their robustness to hyperparameter choices.
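
For the quadratic model analyzed earlier, gradient descent is stable only for $\alpha < 2/L$ with $L = \lambda_{\max}(A)$; the largest step size in the sensitivity experiment ($\alpha = 0.02$ for $\kappa = 100$, i.e.\ exactly $2/L$) sits on this boundary, where the stiffest component oscillates without decaying.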

\end{document}