Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Ok-landscape
GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/machine-learning/neural_network.tex
51 views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{siunitx}
7
\usepackage{booktabs}
8
\usepackage{algorithm2e}
9
\usepackage{subcaption}
10
\usepackage[makestderr]{pythontex}
11
12
% Theorem environments for tutorial style
13
\newtheorem{definition}{Definition}
14
\newtheorem{remark}{Remark}
15
16
\title{Neural Network Training: A Complete Pipeline\\
17
\large From Architecture Design to Performance Analysis}
18
\author{Machine Learning Research Group\\Computational Science Templates}
19
\date{\today}
20
21
\begin{document}
22
\maketitle
23
24
\begin{abstract}
25
This tutorial provides a comprehensive walkthrough of training a neural network for function approximation. We implement a multi-layer perceptron from scratch using NumPy, demonstrating forward propagation, backpropagation, and gradient descent optimization. The analysis includes architecture comparison, learning rate sensitivity, and convergence diagnostics.
26
\end{abstract}
27
28
\section{Introduction}
29
Artificial neural networks are universal function approximators capable of learning complex nonlinear mappings. This document presents a complete training pipeline, from data generation to model evaluation, with emphasis on understanding the mathematical foundations.
30
31
\begin{definition}[Feedforward Neural Network]
32
A feedforward neural network is a function $f: \mathbb{R}^n \to \mathbb{R}^m$ composed of alternating linear transformations and nonlinear activations:
33
\begin{equation}
34
f(\mathbf{x}) = \sigma_L(W_L \cdot \sigma_{L-1}(W_{L-1} \cdots \sigma_1(W_1 \mathbf{x} + \mathbf{b}_1) \cdots + \mathbf{b}_{L-1}) + \mathbf{b}_L)
35
\end{equation}
36
where $W_l$ are weight matrices, $\mathbf{b}_l$ are bias vectors, and $\sigma_l$ are activation functions.
37
\end{definition}
38
39
\section{Mathematical Framework}
40
41
\subsection{Forward Propagation}
42
For a network with $L$ layers, the forward pass computes:
43
\begin{align}
44
\mathbf{z}^{(l)} &= W^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)} \quad &\text{(pre-activation)}\\
45
\mathbf{a}^{(l)} &= \sigma(\mathbf{z}^{(l)}) \quad &\text{(activation)}
46
\end{align}
47
48
\subsection{Backpropagation}
49
The gradient of the loss with respect to weights is computed via the chain rule:
50
\begin{equation}
51
\frac{\partial \mathcal{L}}{\partial W^{(l)}} = \boldsymbol{\delta}^{(l)} (\mathbf{a}^{(l-1)})^T
52
\end{equation}
53
where the error signal propagates backward:
54
\begin{equation}
55
\boldsymbol{\delta}^{(l)} = (W^{(l+1)})^T \boldsymbol{\delta}^{(l+1)} \odot \sigma'(\mathbf{z}^{(l)})
56
\end{equation}
57
58
\subsection{Activation Functions}
59
We compare several common activation functions:
60
\begin{align}
61
\text{Sigmoid:} \quad &\sigma(z) = \frac{1}{1 + e^{-z}} \\
62
\text{Tanh:} \quad &\sigma(z) = \frac{e^z - e^{-z}}{e^z + e^{-z}} \\
63
\text{ReLU:} \quad &\sigma(z) = \max(0, z)
64
\end{align}
65
66
\section{Implementation}
67
68
\begin{pycode}
69
import numpy as np
70
import matplotlib.pyplot as plt
71
from time import time
72
plt.rc('text', usetex=True)
73
plt.rc('font', family='serif')
74
75
np.random.seed(42)
76
77
# Activation functions and their derivatives
78
def sigmoid(z):
79
return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
80
81
def sigmoid_derivative(z):
82
s = sigmoid(z)
83
return s * (1 - s)
84
85
def relu(z):
86
return np.maximum(0, z)
87
88
def relu_derivative(z):
89
return (z > 0).astype(float)
90
91
def tanh(z):
92
return np.tanh(z)
93
94
def tanh_derivative(z):
95
return 1 - np.tanh(z)**2
96
97
class NeuralNetwork:
98
"""Multi-layer perceptron with configurable architecture."""
99
100
def __init__(self, layers, activation='relu'):
101
self.layers = layers
102
self.n_layers = len(layers)
103
104
# Choose activation function
105
if activation == 'sigmoid':
106
self.activation = sigmoid
107
self.activation_derivative = sigmoid_derivative
108
elif activation == 'tanh':
109
self.activation = tanh
110
self.activation_derivative = tanh_derivative
111
else:
112
self.activation = relu
113
self.activation_derivative = relu_derivative
114
115
# Initialize weights using He initialization
116
self.weights = []
117
self.biases = []
118
for i in range(len(layers) - 1):
119
w = np.random.randn(layers[i+1], layers[i]) * np.sqrt(2.0 / layers[i])
120
b = np.zeros((layers[i+1], 1))
121
self.weights.append(w)
122
self.biases.append(b)
123
124
# Store training history
125
self.loss_history = []
126
self.val_loss_history = []
127
128
def forward(self, X):
129
"""Forward propagation."""
130
self.activations = [X]
131
self.z_values = []
132
133
a = X
134
for i in range(len(self.weights) - 1):
135
z = self.weights[i] @ a + self.biases[i]
136
self.z_values.append(z)
137
a = self.activation(z)
138
self.activations.append(a)
139
140
# Output layer (linear for regression)
141
z = self.weights[-1] @ a + self.biases[-1]
142
self.z_values.append(z)
143
self.activations.append(z)
144
145
return z
146
147
def backward(self, y, learning_rate):
148
"""Backpropagation with gradient descent."""
149
m = y.shape[1]
150
151
# Output layer error
152
delta = self.activations[-1] - y
153
154
# Backpropagate through layers
155
for i in range(len(self.weights) - 1, -1, -1):
156
dW = (1/m) * delta @ self.activations[i].T
157
db = (1/m) * np.sum(delta, axis=1, keepdims=True)
158
159
if i > 0:
160
delta = self.weights[i].T @ delta * self.activation_derivative(self.z_values[i-1])
161
162
# Update weights
163
self.weights[i] -= learning_rate * dW
164
self.biases[i] -= learning_rate * db
165
166
def compute_loss(self, y_pred, y_true):
167
"""Mean squared error loss."""
168
return np.mean((y_pred - y_true)**2)
169
170
def train(self, X_train, y_train, X_val, y_val, epochs, learning_rate):
171
"""Train the network."""
172
for epoch in range(epochs):
173
# Forward pass
174
y_pred = self.forward(X_train)
175
176
# Compute loss
177
loss = self.compute_loss(y_pred, y_train)
178
self.loss_history.append(loss)
179
180
# Validation loss
181
val_pred = self.forward(X_val)
182
val_loss = self.compute_loss(val_pred, y_val)
183
self.val_loss_history.append(val_loss)
184
185
# Backward pass - need to re-forward to restore activations
186
self.forward(X_train)
187
self.backward(y_train, learning_rate)
188
189
return self.loss_history, self.val_loss_history
190
191
# Generate training data: approximate a complex function
192
def target_function(x):
193
return np.sin(2*x) * np.exp(-0.1*x**2) + 0.5*np.cos(5*x)
194
195
n_train = 200
196
n_val = 50
197
X_train = np.random.uniform(-3, 3, (1, n_train))
198
y_train = target_function(X_train)
199
X_val = np.random.uniform(-3, 3, (1, n_val))
200
y_val = target_function(X_val)
201
202
# For plotting
203
X_test = np.linspace(-3, 3, 500).reshape(1, -1)
204
y_test = target_function(X_test)
205
206
# Train networks with different architectures
207
architectures = [
208
([1, 32, 1], 'Small'),
209
([1, 64, 32, 1], 'Medium'),
210
([1, 128, 64, 32, 1], 'Large')
211
]
212
213
results = {}
214
for arch, name in architectures:
215
nn = NeuralNetwork(arch, activation='tanh')
216
start_time = time()
217
train_loss, val_loss = nn.train(X_train, y_train, X_val, y_val,
218
epochs=1000, learning_rate=0.01)
219
training_time = time() - start_time
220
221
# Get predictions
222
y_pred = nn.forward(X_test)
223
224
results[name] = {
225
'model': nn,
226
'train_loss': train_loss,
227
'val_loss': val_loss,
228
'predictions': y_pred,
229
'final_loss': train_loss[-1],
230
'time': training_time,
231
'params': sum(w.size + b.size for w, b in zip(nn.weights, nn.biases))
232
}
233
234
# Learning rate comparison
235
learning_rates = [0.001, 0.01, 0.1]
236
lr_results = {}
237
for lr in learning_rates:
238
nn = NeuralNetwork([1, 64, 32, 1], activation='tanh')
239
train_loss, _ = nn.train(X_train, y_train, X_val, y_val,
240
epochs=500, learning_rate=lr)
241
lr_results[lr] = train_loss
242
243
# Create comprehensive visualization
244
fig = plt.figure(figsize=(12, 10))
245
246
# Plot 1: Function approximation comparison
247
ax1 = fig.add_subplot(2, 2, 1)
248
ax1.plot(X_test.flatten(), y_test.flatten(), 'k-', linewidth=2, label='Target', alpha=0.7)
249
colors = ['#2ecc71', '#3498db', '#9b59b6']
250
for (name, res), color in zip(results.items(), colors):
251
ax1.plot(X_test.flatten(), res['predictions'].flatten(),
252
linestyle='--', linewidth=1.5, color=color, label=name)
253
ax1.scatter(X_train.flatten(), y_train.flatten(), s=10, alpha=0.3, color='gray', label='Training data')
254
ax1.set_xlabel('$x$')
255
ax1.set_ylabel('$y$')
256
ax1.set_title('Function Approximation by Architecture')
257
ax1.legend(loc='upper right', fontsize=8)
258
ax1.grid(True, alpha=0.3)
259
260
# Plot 2: Training curves
261
ax2 = fig.add_subplot(2, 2, 2)
262
for (name, res), color in zip(results.items(), colors):
263
ax2.semilogy(res['train_loss'], color=color, linewidth=1.5, label=f'{name} (train)')
264
ax2.semilogy(res['val_loss'], color=color, linewidth=1, linestyle='--', alpha=0.7)
265
ax2.set_xlabel('Epoch')
266
ax2.set_ylabel('MSE Loss')
267
ax2.set_title('Training Convergence')
268
ax2.legend(fontsize=8)
269
ax2.grid(True, alpha=0.3)
270
271
# Plot 3: Learning rate sensitivity
272
ax3 = fig.add_subplot(2, 2, 3)
273
lr_colors = ['#e74c3c', '#f39c12', '#27ae60']
274
for lr, color in zip(learning_rates, lr_colors):
275
ax3.semilogy(lr_results[lr], color=color, linewidth=1.5, label=f'$\\eta = {lr}$')
276
ax3.set_xlabel('Epoch')
277
ax3.set_ylabel('Training Loss')
278
ax3.set_title('Learning Rate Sensitivity')
279
ax3.legend(fontsize=8)
280
ax3.grid(True, alpha=0.3)
281
282
# Plot 4: Model comparison summary
283
ax4 = fig.add_subplot(2, 2, 4)
284
names = list(results.keys())
285
final_losses = [results[n]['final_loss'] for n in names]
286
params = [results[n]['params'] for n in names]
287
times = [results[n]['time']*1000 for n in names]
288
289
x_pos = np.arange(len(names))
290
width = 0.25
291
292
bars1 = ax4.bar(x_pos - width, [f*1000 for f in final_losses], width,
293
label='Final Loss ($\\times 10^3$)', color='#3498db', alpha=0.8)
294
ax4_twin = ax4.twinx()
295
bars2 = ax4_twin.bar(x_pos, [p/100 for p in params], width,
296
label='Parameters ($\\times 100$)', color='#2ecc71', alpha=0.8)
297
bars3 = ax4_twin.bar(x_pos + width, times, width,
298
label='Time (ms)', color='#9b59b6', alpha=0.8)
299
300
ax4.set_xlabel('Architecture')
301
ax4.set_ylabel('Loss ($\\times 10^{-3}$)', color='#3498db')
302
ax4_twin.set_ylabel('Parameters/Time', color='#2ecc71')
303
ax4.set_xticks(x_pos)
304
ax4.set_xticklabels(names)
305
ax4.set_title('Model Comparison')
306
307
# Combined legend
308
lines1, labels1 = ax4.get_legend_handles_labels()
309
lines2, labels2 = ax4_twin.get_legend_handles_labels()
310
ax4.legend(lines1 + lines2, labels1 + labels2, loc='upper left', fontsize=7)
311
312
plt.tight_layout()
313
plt.savefig('neural_network_plot.pdf', bbox_inches='tight', dpi=150)
314
print(r'\begin{center}')
315
print(r'\includegraphics[width=\textwidth]{neural_network_plot.pdf}')
316
print(r'\end{center}')
317
plt.close()
318
319
# Extract key results
320
best_model = min(results.items(), key=lambda x: x[1]['final_loss'])
321
best_name = best_model[0]
322
best_loss = best_model[1]['final_loss']
323
best_params = best_model[1]['params']
324
\end{pycode}
325
326
\section{Training Algorithm}
327
328
\begin{algorithm}[H]
329
\SetAlgoLined
330
\KwIn{Training data $(X, y)$, learning rate $\eta$, epochs $E$}
331
\KwOut{Trained weights $\{W^{(l)}, b^{(l)}\}$}
332
Initialize weights with He initialization\;
333
\For{epoch $= 1$ \KwTo $E$}{
334
\tcc{Forward propagation}
335
$\mathbf{a}^{(0)} \leftarrow X$\;
336
\For{$l = 1$ \KwTo $L$}{
337
$\mathbf{z}^{(l)} \leftarrow W^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)}$\;
338
$\mathbf{a}^{(l)} \leftarrow \sigma(\mathbf{z}^{(l)})$\;
339
}
340
\tcc{Backpropagation}
341
$\boldsymbol{\delta}^{(L)} \leftarrow \mathbf{a}^{(L)} - y$\;
342
\For{$l = L$ \KwTo $1$}{
343
$\nabla W^{(l)} \leftarrow \boldsymbol{\delta}^{(l)} (\mathbf{a}^{(l-1)})^T / m$\;
344
$\nabla \mathbf{b}^{(l)} \leftarrow \text{mean}(\boldsymbol{\delta}^{(l)})$\;
345
$\boldsymbol{\delta}^{(l-1)} \leftarrow (W^{(l)})^T \boldsymbol{\delta}^{(l)} \odot \sigma'(\mathbf{z}^{(l-1)})$\;
346
}
347
\tcc{Gradient descent update}
348
$W^{(l)} \leftarrow W^{(l)} - \eta \nabla W^{(l)}$\;
349
$\mathbf{b}^{(l)} \leftarrow \mathbf{b}^{(l)} - \eta \nabla \mathbf{b}^{(l)}$\;
350
}
351
\caption{Backpropagation with Gradient Descent}
352
\end{algorithm}
353
354
\section{Results and Discussion}
355
356
\subsection{Architecture Comparison}
357
358
\begin{pycode}
359
# Create results table
360
print(r'\begin{table}[h]')
361
print(r'\centering')
362
print(r'\caption{Neural Network Architecture Comparison}')
363
print(r'\begin{tabular}{lccc}')
364
print(r'\toprule')
365
print(r'Architecture & Parameters & Final Loss & Training Time (ms) \\')
366
print(r'\midrule')
367
for name in ['Small', 'Medium', 'Large']:
368
res = results[name]
369
print(f"{name} & {res['params']} & {res['final_loss']:.2e} & {res['time']*1000:.1f} \\\\")
370
print(r'\bottomrule')
371
print(r'\end{tabular}')
372
print(r'\end{table}')
373
\end{pycode}
374
375
The \py{best_name} architecture achieved the best performance with a final MSE of \py{f"{best_loss:.2e}"} using \py{best_params} trainable parameters.
376
377
\subsection{Observations}
378
379
\begin{remark}[Capacity vs. Generalization]
380
While larger networks have more representational capacity, they also require more training time and are more prone to overfitting. The gap between training and validation loss indicates generalization performance.
381
\end{remark}
382
383
\begin{remark}[Learning Rate Selection]
384
The learning rate $\eta = 0.01$ provides a good balance between convergence speed and stability. Too small ($\eta = 0.001$) results in slow convergence, while too large ($\eta = 0.1$) may cause oscillations or divergence.
385
\end{remark}
386
387
\subsection{Key Findings}
388
\begin{itemize}
389
\item Training samples: \py{n_train}, Validation samples: \py{n_val}
390
\item Best architecture: \py{best_name} with loss \py{f"{best_loss:.4f}"}
391
\item The medium network [1, 64, 32, 1] offers the best trade-off between complexity and performance
392
\item Tanh activation outperforms ReLU for this smooth target function
393
\item He initialization is crucial for training deep networks
394
\end{itemize}
395
396
\section{Limitations and Extensions}
397
398
\subsection{Current Limitations}
399
\begin{enumerate}
400
\item \textbf{Optimization}: Plain gradient descent converges slowly. Momentum, Adam, or RMSprop would improve convergence.
401
\item \textbf{Regularization}: No L2 penalty or dropout is implemented, risking overfitting on larger networks.
402
\item \textbf{Batch Training}: Full-batch gradient descent is used; mini-batch SGD would scale better.
403
\end{enumerate}
404
405
\subsection{Possible Extensions}
406
\begin{itemize}
407
\item Implement Adam optimizer with adaptive learning rates
408
\item Add batch normalization between layers
409
\item Implement early stopping based on validation loss
410
\item Extend to classification with softmax output and cross-entropy loss
411
\end{itemize}
412
413
\section{Conclusion}
414
This tutorial demonstrated a complete neural network training pipeline from scratch. Key insights include the importance of architecture selection, the sensitivity to hyperparameters like learning rate, and the trade-offs between model capacity and generalization. The implementation provides a foundation for understanding more advanced deep learning frameworks.
415
416
\section*{Further Reading}
417
\begin{itemize}
418
\item Goodfellow, I., Bengio, Y., \& Courville, A. (2016). \textit{Deep Learning}. MIT Press.
419
\item He, K., et al. (2015). Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification.
420
\item Kingma, D. P., \& Ba, J. (2015). Adam: A method for stochastic optimization.
421
\end{itemize}
422
423
\end{document}
424
425