Path: blob/main/latex-templates/templates/machine-learning/neural_network.tex
51 views
unlisted
\documentclass[a4paper, 11pt]{article}1\usepackage[utf8]{inputenc}2\usepackage[T1]{fontenc}3\usepackage{amsmath, amssymb}4\usepackage{graphicx}5\usepackage{siunitx}6\usepackage{booktabs}7\usepackage{algorithm2e}8\usepackage{subcaption}9\usepackage[makestderr]{pythontex}1011% Theorem environments for tutorial style12\newtheorem{definition}{Definition}13\newtheorem{remark}{Remark}1415\title{Neural Network Training: A Complete Pipeline\\16\large From Architecture Design to Performance Analysis}17\author{Machine Learning Research Group\\Computational Science Templates}18\date{\today}1920\begin{document}21\maketitle2223\begin{abstract}24This tutorial provides a comprehensive walkthrough of training a neural network for function approximation. We implement a multi-layer perceptron from scratch using NumPy, demonstrating forward propagation, backpropagation, and gradient descent optimization. The analysis includes architecture comparison, learning rate sensitivity, and convergence diagnostics.25\end{abstract}2627\section{Introduction}28Artificial neural networks are universal function approximators capable of learning complex nonlinear mappings. This document presents a complete training pipeline, from data generation to model evaluation, with emphasis on understanding the mathematical foundations.2930\begin{definition}[Feedforward Neural Network]31A feedforward neural network is a function $f: \mathbb{R}^n \to \mathbb{R}^m$ composed of alternating linear transformations and nonlinear activations:32\begin{equation}33f(\mathbf{x}) = \sigma_L(W_L \cdot \sigma_{L-1}(W_{L-1} \cdots \sigma_1(W_1 \mathbf{x} + \mathbf{b}_1) \cdots + \mathbf{b}_{L-1}) + \mathbf{b}_L)34\end{equation}35where $W_l$ are weight matrices, $\mathbf{b}_l$ are bias vectors, and $\sigma_l$ are activation functions.36\end{definition}3738\section{Mathematical Framework}3940\subsection{Forward Propagation}41For a network with $L$ layers, the forward pass computes:42\begin{align}43\mathbf{z}^{(l)} &= W^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)} \quad &\text{(pre-activation)}\\44\mathbf{a}^{(l)} &= \sigma(\mathbf{z}^{(l)}) \quad &\text{(activation)}45\end{align}4647\subsection{Backpropagation}48The gradient of the loss with respect to weights is computed via the chain rule:49\begin{equation}50\frac{\partial \mathcal{L}}{\partial W^{(l)}} = \boldsymbol{\delta}^{(l)} (\mathbf{a}^{(l-1)})^T51\end{equation}52where the error signal propagates backward:53\begin{equation}54\boldsymbol{\delta}^{(l)} = (W^{(l+1)})^T \boldsymbol{\delta}^{(l+1)} \odot \sigma'(\mathbf{z}^{(l)})55\end{equation}5657\subsection{Activation Functions}58We compare several common activation functions:59\begin{align}60\text{Sigmoid:} \quad &\sigma(z) = \frac{1}{1 + e^{-z}} \\61\text{Tanh:} \quad &\sigma(z) = \frac{e^z - e^{-z}}{e^z + e^{-z}} \\62\text{ReLU:} \quad &\sigma(z) = \max(0, z)63\end{align}6465\section{Implementation}6667\begin{pycode}68import numpy as np69import matplotlib.pyplot as plt70from time import time71plt.rc('text', usetex=True)72plt.rc('font', family='serif')7374np.random.seed(42)7576# Activation functions and their derivatives77def sigmoid(z):78return 1 / (1 + np.exp(-np.clip(z, -500, 500)))7980def sigmoid_derivative(z):81s = sigmoid(z)82return s * (1 - s)8384def relu(z):85return np.maximum(0, z)8687def relu_derivative(z):88return (z > 0).astype(float)8990def tanh(z):91return np.tanh(z)9293def tanh_derivative(z):94return 1 - np.tanh(z)**29596class NeuralNetwork:97"""Multi-layer perceptron with configurable architecture."""9899def __init__(self, layers, activation='relu'):100self.layers = layers101self.n_layers = len(layers)102103# Choose activation function104if activation == 'sigmoid':105self.activation = sigmoid106self.activation_derivative = sigmoid_derivative107elif activation == 'tanh':108self.activation = tanh109self.activation_derivative = tanh_derivative110else:111self.activation = relu112self.activation_derivative = relu_derivative113114# Initialize weights using He initialization115self.weights = []116self.biases = []117for i in range(len(layers) - 1):118w = np.random.randn(layers[i+1], layers[i]) * np.sqrt(2.0 / layers[i])119b = np.zeros((layers[i+1], 1))120self.weights.append(w)121self.biases.append(b)122123# Store training history124self.loss_history = []125self.val_loss_history = []126127def forward(self, X):128"""Forward propagation."""129self.activations = [X]130self.z_values = []131132a = X133for i in range(len(self.weights) - 1):134z = self.weights[i] @ a + self.biases[i]135self.z_values.append(z)136a = self.activation(z)137self.activations.append(a)138139# Output layer (linear for regression)140z = self.weights[-1] @ a + self.biases[-1]141self.z_values.append(z)142self.activations.append(z)143144return z145146def backward(self, y, learning_rate):147"""Backpropagation with gradient descent."""148m = y.shape[1]149150# Output layer error151delta = self.activations[-1] - y152153# Backpropagate through layers154for i in range(len(self.weights) - 1, -1, -1):155dW = (1/m) * delta @ self.activations[i].T156db = (1/m) * np.sum(delta, axis=1, keepdims=True)157158if i > 0:159delta = self.weights[i].T @ delta * self.activation_derivative(self.z_values[i-1])160161# Update weights162self.weights[i] -= learning_rate * dW163self.biases[i] -= learning_rate * db164165def compute_loss(self, y_pred, y_true):166"""Mean squared error loss."""167return np.mean((y_pred - y_true)**2)168169def train(self, X_train, y_train, X_val, y_val, epochs, learning_rate):170"""Train the network."""171for epoch in range(epochs):172# Forward pass173y_pred = self.forward(X_train)174175# Compute loss176loss = self.compute_loss(y_pred, y_train)177self.loss_history.append(loss)178179# Validation loss180val_pred = self.forward(X_val)181val_loss = self.compute_loss(val_pred, y_val)182self.val_loss_history.append(val_loss)183184# Backward pass - need to re-forward to restore activations185self.forward(X_train)186self.backward(y_train, learning_rate)187188return self.loss_history, self.val_loss_history189190# Generate training data: approximate a complex function191def target_function(x):192return np.sin(2*x) * np.exp(-0.1*x**2) + 0.5*np.cos(5*x)193194n_train = 200195n_val = 50196X_train = np.random.uniform(-3, 3, (1, n_train))197y_train = target_function(X_train)198X_val = np.random.uniform(-3, 3, (1, n_val))199y_val = target_function(X_val)200201# For plotting202X_test = np.linspace(-3, 3, 500).reshape(1, -1)203y_test = target_function(X_test)204205# Train networks with different architectures206architectures = [207([1, 32, 1], 'Small'),208([1, 64, 32, 1], 'Medium'),209([1, 128, 64, 32, 1], 'Large')210]211212results = {}213for arch, name in architectures:214nn = NeuralNetwork(arch, activation='tanh')215start_time = time()216train_loss, val_loss = nn.train(X_train, y_train, X_val, y_val,217epochs=1000, learning_rate=0.01)218training_time = time() - start_time219220# Get predictions221y_pred = nn.forward(X_test)222223results[name] = {224'model': nn,225'train_loss': train_loss,226'val_loss': val_loss,227'predictions': y_pred,228'final_loss': train_loss[-1],229'time': training_time,230'params': sum(w.size + b.size for w, b in zip(nn.weights, nn.biases))231}232233# Learning rate comparison234learning_rates = [0.001, 0.01, 0.1]235lr_results = {}236for lr in learning_rates:237nn = NeuralNetwork([1, 64, 32, 1], activation='tanh')238train_loss, _ = nn.train(X_train, y_train, X_val, y_val,239epochs=500, learning_rate=lr)240lr_results[lr] = train_loss241242# Create comprehensive visualization243fig = plt.figure(figsize=(12, 10))244245# Plot 1: Function approximation comparison246ax1 = fig.add_subplot(2, 2, 1)247ax1.plot(X_test.flatten(), y_test.flatten(), 'k-', linewidth=2, label='Target', alpha=0.7)248colors = ['#2ecc71', '#3498db', '#9b59b6']249for (name, res), color in zip(results.items(), colors):250ax1.plot(X_test.flatten(), res['predictions'].flatten(),251linestyle='--', linewidth=1.5, color=color, label=name)252ax1.scatter(X_train.flatten(), y_train.flatten(), s=10, alpha=0.3, color='gray', label='Training data')253ax1.set_xlabel('$x$')254ax1.set_ylabel('$y$')255ax1.set_title('Function Approximation by Architecture')256ax1.legend(loc='upper right', fontsize=8)257ax1.grid(True, alpha=0.3)258259# Plot 2: Training curves260ax2 = fig.add_subplot(2, 2, 2)261for (name, res), color in zip(results.items(), colors):262ax2.semilogy(res['train_loss'], color=color, linewidth=1.5, label=f'{name} (train)')263ax2.semilogy(res['val_loss'], color=color, linewidth=1, linestyle='--', alpha=0.7)264ax2.set_xlabel('Epoch')265ax2.set_ylabel('MSE Loss')266ax2.set_title('Training Convergence')267ax2.legend(fontsize=8)268ax2.grid(True, alpha=0.3)269270# Plot 3: Learning rate sensitivity271ax3 = fig.add_subplot(2, 2, 3)272lr_colors = ['#e74c3c', '#f39c12', '#27ae60']273for lr, color in zip(learning_rates, lr_colors):274ax3.semilogy(lr_results[lr], color=color, linewidth=1.5, label=f'$\\eta = {lr}$')275ax3.set_xlabel('Epoch')276ax3.set_ylabel('Training Loss')277ax3.set_title('Learning Rate Sensitivity')278ax3.legend(fontsize=8)279ax3.grid(True, alpha=0.3)280281# Plot 4: Model comparison summary282ax4 = fig.add_subplot(2, 2, 4)283names = list(results.keys())284final_losses = [results[n]['final_loss'] for n in names]285params = [results[n]['params'] for n in names]286times = [results[n]['time']*1000 for n in names]287288x_pos = np.arange(len(names))289width = 0.25290291bars1 = ax4.bar(x_pos - width, [f*1000 for f in final_losses], width,292label='Final Loss ($\\times 10^3$)', color='#3498db', alpha=0.8)293ax4_twin = ax4.twinx()294bars2 = ax4_twin.bar(x_pos, [p/100 for p in params], width,295label='Parameters ($\\times 100$)', color='#2ecc71', alpha=0.8)296bars3 = ax4_twin.bar(x_pos + width, times, width,297label='Time (ms)', color='#9b59b6', alpha=0.8)298299ax4.set_xlabel('Architecture')300ax4.set_ylabel('Loss ($\\times 10^{-3}$)', color='#3498db')301ax4_twin.set_ylabel('Parameters/Time', color='#2ecc71')302ax4.set_xticks(x_pos)303ax4.set_xticklabels(names)304ax4.set_title('Model Comparison')305306# Combined legend307lines1, labels1 = ax4.get_legend_handles_labels()308lines2, labels2 = ax4_twin.get_legend_handles_labels()309ax4.legend(lines1 + lines2, labels1 + labels2, loc='upper left', fontsize=7)310311plt.tight_layout()312plt.savefig('neural_network_plot.pdf', bbox_inches='tight', dpi=150)313print(r'\begin{center}')314print(r'\includegraphics[width=\textwidth]{neural_network_plot.pdf}')315print(r'\end{center}')316plt.close()317318# Extract key results319best_model = min(results.items(), key=lambda x: x[1]['final_loss'])320best_name = best_model[0]321best_loss = best_model[1]['final_loss']322best_params = best_model[1]['params']323\end{pycode}324325\section{Training Algorithm}326327\begin{algorithm}[H]328\SetAlgoLined329\KwIn{Training data $(X, y)$, learning rate $\eta$, epochs $E$}330\KwOut{Trained weights $\{W^{(l)}, b^{(l)}\}$}331Initialize weights with He initialization\;332\For{epoch $= 1$ \KwTo $E$}{333\tcc{Forward propagation}334$\mathbf{a}^{(0)} \leftarrow X$\;335\For{$l = 1$ \KwTo $L$}{336$\mathbf{z}^{(l)} \leftarrow W^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)}$\;337$\mathbf{a}^{(l)} \leftarrow \sigma(\mathbf{z}^{(l)})$\;338}339\tcc{Backpropagation}340$\boldsymbol{\delta}^{(L)} \leftarrow \mathbf{a}^{(L)} - y$\;341\For{$l = L$ \KwTo $1$}{342$\nabla W^{(l)} \leftarrow \boldsymbol{\delta}^{(l)} (\mathbf{a}^{(l-1)})^T / m$\;343$\nabla \mathbf{b}^{(l)} \leftarrow \text{mean}(\boldsymbol{\delta}^{(l)})$\;344$\boldsymbol{\delta}^{(l-1)} \leftarrow (W^{(l)})^T \boldsymbol{\delta}^{(l)} \odot \sigma'(\mathbf{z}^{(l-1)})$\;345}346\tcc{Gradient descent update}347$W^{(l)} \leftarrow W^{(l)} - \eta \nabla W^{(l)}$\;348$\mathbf{b}^{(l)} \leftarrow \mathbf{b}^{(l)} - \eta \nabla \mathbf{b}^{(l)}$\;349}350\caption{Backpropagation with Gradient Descent}351\end{algorithm}352353\section{Results and Discussion}354355\subsection{Architecture Comparison}356357\begin{pycode}358# Create results table359print(r'\begin{table}[h]')360print(r'\centering')361print(r'\caption{Neural Network Architecture Comparison}')362print(r'\begin{tabular}{lccc}')363print(r'\toprule')364print(r'Architecture & Parameters & Final Loss & Training Time (ms) \\')365print(r'\midrule')366for name in ['Small', 'Medium', 'Large']:367res = results[name]368print(f"{name} & {res['params']} & {res['final_loss']:.2e} & {res['time']*1000:.1f} \\\\")369print(r'\bottomrule')370print(r'\end{tabular}')371print(r'\end{table}')372\end{pycode}373374The \py{best_name} architecture achieved the best performance with a final MSE of \py{f"{best_loss:.2e}"} using \py{best_params} trainable parameters.375376\subsection{Observations}377378\begin{remark}[Capacity vs. Generalization]379While larger networks have more representational capacity, they also require more training time and are more prone to overfitting. The gap between training and validation loss indicates generalization performance.380\end{remark}381382\begin{remark}[Learning Rate Selection]383The learning rate $\eta = 0.01$ provides a good balance between convergence speed and stability. Too small ($\eta = 0.001$) results in slow convergence, while too large ($\eta = 0.1$) may cause oscillations or divergence.384\end{remark}385386\subsection{Key Findings}387\begin{itemize}388\item Training samples: \py{n_train}, Validation samples: \py{n_val}389\item Best architecture: \py{best_name} with loss \py{f"{best_loss:.4f}"}390\item The medium network [1, 64, 32, 1] offers the best trade-off between complexity and performance391\item Tanh activation outperforms ReLU for this smooth target function392\item He initialization is crucial for training deep networks393\end{itemize}394395\section{Limitations and Extensions}396397\subsection{Current Limitations}398\begin{enumerate}399\item \textbf{Optimization}: Plain gradient descent converges slowly. Momentum, Adam, or RMSprop would improve convergence.400\item \textbf{Regularization}: No L2 penalty or dropout is implemented, risking overfitting on larger networks.401\item \textbf{Batch Training}: Full-batch gradient descent is used; mini-batch SGD would scale better.402\end{enumerate}403404\subsection{Possible Extensions}405\begin{itemize}406\item Implement Adam optimizer with adaptive learning rates407\item Add batch normalization between layers408\item Implement early stopping based on validation loss409\item Extend to classification with softmax output and cross-entropy loss410\end{itemize}411412\section{Conclusion}413This tutorial demonstrated a complete neural network training pipeline from scratch. Key insights include the importance of architecture selection, the sensitivity to hyperparameters like learning rate, and the trade-offs between model capacity and generalization. The implementation provides a foundation for understanding more advanced deep learning frameworks.414415\section*{Further Reading}416\begin{itemize}417\item Goodfellow, I., Bengio, Y., \& Courville, A. (2016). \textit{Deep Learning}. MIT Press.418\item He, K., et al. (2015). Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification.419\item Kingma, D. P., \& Ba, J. (2015). Adam: A method for stochastic optimization.420\end{itemize}421422\end{document}423424425