# coding: utf-8


import sys
from python_environment_check import check_packages
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add the parent folder to the path in order to load the check_packages.py script:

sys.path.insert(0, '..')


# Check recommended package versions:

d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
}
check_packages(d)


# # Chapter 11 - Implementing a Multi-layer Artificial Neural Network from Scratch
#

# ### Overview

# - [Modeling complex functions with artificial neural networks](#Modeling-complex-functions-with-artificial-neural-networks)
#   - [Single-layer neural network recap](#Single-layer-neural-network-recap)
#   - [Introducing the multi-layer neural network architecture](#Introducing-the-multi-layer-neural-network-architecture)
#   - [Activating a neural network via forward propagation](#Activating-a-neural-network-via-forward-propagation)
# - [Classifying handwritten digits](#Classifying-handwritten-digits)
#   - [Obtaining and preparing the MNIST dataset](#Obtaining-and-preparing-the-MNIST-dataset)
#   - [Implementing a multi-layer perceptron](#Implementing-a-multi-layer-perceptron)
#   - [Coding the neural network training loop](#Coding-the-neural-network-training-loop)
#   - [Evaluating the neural network performance](#Evaluating-the-neural-network-performance)
# - [Training an artificial neural network](#Training-an-artificial-neural-network)
#   - [Computing the loss function](#Computing-the-loss-function)
#   - [Developing your intuition for backpropagation](#Developing-your-intuition-for-backpropagation)
#   - [Training neural networks via backpropagation](#Training-neural-networks-via-backpropagation)
# - [Convergence in neural networks](#Convergence-in-neural-networks)
# - [Summary](#Summary)


# # Modeling complex functions with artificial neural networks

# ...

# ## Single-layer neural network recap

# ## Introducing the multi-layer neural network architecture

# ## Activating a neural network via forward propagation


# # Classifying handwritten digits

# ...

# ## Obtaining and preparing the MNIST dataset

# The MNIST dataset is publicly available at http://yann.lecun.com/exdb/mnist/ and consists of the following four parts:
#
# - Training set images: train-images-idx3-ubyte.gz (9.9 MB, 47 MB unzipped, 60,000 examples)
# - Training set labels: train-labels-idx1-ubyte.gz (29 KB, 60 KB unzipped, 60,000 labels)
# - Test set images: t10k-images-idx3-ubyte.gz (1.6 MB, 7.8 MB unzipped, 10,000 examples)
# - Test set labels: t10k-labels-idx1-ubyte.gz (5 KB, 10 KB unzipped, 10,000 labels)
#
# Here, we fetch the dataset conveniently via scikit-learn's OpenML interface:

X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X.values
y = y.astype(int).values

print(X.shape)
print(y.shape)


# Normalize the pixel values from the original [0, 255] range to [-1, 1]:

X = ((X / 255.) - .5) * 2
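
# As a quick sanity check (not part of the original code), we can confirm that
# the rescaled pixel values now fall within the expected [-1, 1] range:

print(f'Min: {X.min():.1f}, Max: {X.max():.1f}')  # expected: Min: -1.0, Max: 1.0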

# Visualize the first digit of each class:

fig, ax = plt.subplots(nrows=2, ncols=5, sharex=True, sharey=True)
ax = ax.flatten()
for i in range(10):
    img = X[y == i][0].reshape(28, 28)
    ax[i].imshow(img, cmap='Greys')

ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
# plt.savefig('figures/11_4.png', dpi=300)
plt.show()


# Visualize 25 different versions of "7":

fig, ax = plt.subplots(nrows=5, ncols=5, sharex=True, sharey=True)
ax = ax.flatten()
for i in range(25):
    img = X[y == 7][i].reshape(28, 28)
    ax[i].imshow(img, cmap='Greys')

ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
# plt.savefig('figures/11_5.png', dpi=300)
plt.show()


# Split into training, validation, and test sets:

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=10000, random_state=123, stratify=y)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=5000, random_state=123, stratify=y_temp)


# Optionally, free up some memory by deleting the arrays we no longer need:

del X_temp, y_temp, X, y


# ## Implementing a multi-layer perceptron

##########################
### MODEL
##########################

def sigmoid(z):
    return 1. / (1. + np.exp(-z))


def int_to_onehot(y, num_labels):
    ary = np.zeros((y.shape[0], num_labels))
    for i, val in enumerate(y):
        ary[i, val] = 1

    return ary
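
# A small illustrative example (not in the original code) of what `int_to_onehot`
# produces: each integer class label becomes a row with a single 1 at the index
# of that label.

example_labels = np.array([0, 2, 1])  # hypothetical labels for illustration
print(int_to_onehot(example_labels, num_labels=3))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]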

class NeuralNetMLP:

    def __init__(self, num_features, num_hidden, num_classes, random_seed=123):
        super().__init__()

        self.num_classes = num_classes

        # hidden
        rng = np.random.RandomState(random_seed)

        self.weight_h = rng.normal(
            loc=0.0, scale=0.1, size=(num_hidden, num_features))
        self.bias_h = np.zeros(num_hidden)

        # output
        self.weight_out = rng.normal(
            loc=0.0, scale=0.1, size=(num_classes, num_hidden))
        self.bias_out = np.zeros(num_classes)

    def forward(self, x):
        # Hidden layer
        # input dim: [n_examples, n_features] dot [n_hidden, n_features].T
        # output dim: [n_examples, n_hidden]
        z_h = np.dot(x, self.weight_h.T) + self.bias_h
        a_h = sigmoid(z_h)

        # Output layer
        # input dim: [n_examples, n_hidden] dot [n_classes, n_hidden].T
        # output dim: [n_examples, n_classes]
        z_out = np.dot(a_h, self.weight_out.T) + self.bias_out
        a_out = sigmoid(z_out)
        return a_h, a_out

    def backward(self, x, a_h, a_out, y):

        #########################
        ### Output layer weights
        #########################

        # one-hot encoding
        y_onehot = int_to_onehot(y, self.num_classes)

        # Part 1: dLoss/dOutWeights
        ## = dLoss/dOutAct * dOutAct/dOutNet * dOutNet/dOutWeight
        ## where DeltaOut = dLoss/dOutAct * dOutAct/dOutNet
        ## for convenient re-use

        # input/output dim: [n_examples, n_classes]
        d_loss__d_a_out = 2.*(a_out - y_onehot) / y.shape[0]

        # input/output dim: [n_examples, n_classes]
        d_a_out__d_z_out = a_out * (1. - a_out)  # sigmoid derivative

        # output dim: [n_examples, n_classes]
        delta_out = d_loss__d_a_out * d_a_out__d_z_out  # "delta (rule) placeholder"

        # gradient for output weights

        # [n_examples, n_hidden]
        d_z_out__dw_out = a_h

        # input dim: [n_classes, n_examples] dot [n_examples, n_hidden]
        # output dim: [n_classes, n_hidden]
        d_loss__dw_out = np.dot(delta_out.T, d_z_out__dw_out)
        d_loss__db_out = np.sum(delta_out, axis=0)


        #################################
        # Part 2: dLoss/dHiddenWeights
        ## = DeltaOut * dOutNet/dHiddenAct * dHiddenAct/dHiddenNet * dHiddenNet/dWeight

        # [n_classes, n_hidden]
        d_z_out__a_h = self.weight_out

        # output dim: [n_examples, n_hidden]
        d_loss__a_h = np.dot(delta_out, d_z_out__a_h)

        # [n_examples, n_hidden]
        d_a_h__d_z_h = a_h * (1. - a_h)  # sigmoid derivative

        # [n_examples, n_features]
        d_z_h__d_w_h = x

        # output dim: [n_hidden, n_features]
        d_loss__d_w_h = np.dot((d_loss__a_h * d_a_h__d_z_h).T, d_z_h__d_w_h)
        d_loss__d_b_h = np.sum((d_loss__a_h * d_a_h__d_z_h), axis=0)

        return (d_loss__dw_out, d_loss__db_out,
                d_loss__d_w_h, d_loss__d_b_h)


model = NeuralNetMLP(num_features=28*28,
                     num_hidden=50,
                     num_classes=10)
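
# A quick shape check (not in the original code): a forward pass on a small
# slice of the training data should yield hidden activations of shape
# [n_examples, n_hidden] and output activations of shape [n_examples, n_classes].
# The `_tmp` variable names below are only for this illustration:

a_h_tmp, a_out_tmp = model.forward(X_train[:5])
print(a_h_tmp.shape)    # (5, 50)
print(a_out_tmp.shape)  # (5, 10)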

# ## Coding the neural network training loop

# Defining data loaders:

num_epochs = 50
minibatch_size = 100


def minibatch_generator(X, y, minibatch_size):
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)

    for start_idx in range(0, indices.shape[0] - minibatch_size
                           + 1, minibatch_size):
        batch_idx = indices[start_idx:start_idx + minibatch_size]

        yield X[batch_idx], y[batch_idx]


# Confirm that the minibatches have the expected dimensions by fetching
# the first minibatch of the first epoch:

# iterate over training epochs
for i in range(num_epochs):

    # iterate over minibatches
    minibatch_gen = minibatch_generator(
        X_train, y_train, minibatch_size)

    for X_train_mini, y_train_mini in minibatch_gen:
        break

    break

print(X_train_mini.shape)
print(y_train_mini.shape)


# Defining a function to compute the loss and accuracy:

def mse_loss(targets, probas, num_labels=10):
    onehot_targets = int_to_onehot(targets, num_labels=num_labels)
    return np.mean((onehot_targets - probas)**2)


def accuracy(targets, predicted_labels):
    return np.mean(predicted_labels == targets)


_, probas = model.forward(X_valid)
mse = mse_loss(y_valid, probas)

predicted_labels = np.argmax(probas, axis=1)
acc = accuracy(y_valid, predicted_labels)

print(f'Initial validation MSE: {mse:.1f}')
print(f'Initial validation accuracy: {acc*100:.1f}%')


# For larger datasets, it is more memory-efficient to compute the MSE and
# accuracy incrementally, one minibatch at a time:

def compute_mse_and_acc(nnet, X, y, num_labels=10, minibatch_size=100):
    mse, correct_pred, num_examples = 0., 0, 0
    minibatch_gen = minibatch_generator(X, y, minibatch_size)

    for i, (features, targets) in enumerate(minibatch_gen):
        _, probas = nnet.forward(features)
        predicted_labels = np.argmax(probas, axis=1)

        onehot_targets = int_to_onehot(targets, num_labels=num_labels)
        loss = np.mean((onehot_targets - probas)**2)
        correct_pred += (predicted_labels == targets).sum()

        num_examples += targets.shape[0]
        mse += loss

    mse = mse/(i+1)  # average the per-minibatch losses
    acc = correct_pred/num_examples
    return mse, acc


mse, acc = compute_mse_and_acc(model, X_valid, y_valid)
print(f'Initial valid MSE: {mse:.1f}')
print(f'Initial valid accuracy: {acc*100:.1f}%')


def train(model, X_train, y_train, X_valid, y_valid, num_epochs,
          learning_rate=0.1):

    epoch_loss = []
    epoch_train_acc = []
    epoch_valid_acc = []

    for e in range(num_epochs):

        # iterate over minibatches
        minibatch_gen = minibatch_generator(
            X_train, y_train, minibatch_size)

        for X_train_mini, y_train_mini in minibatch_gen:

            #### Compute outputs ####
            a_h, a_out = model.forward(X_train_mini)

            #### Compute gradients ####
            d_loss__d_w_out, d_loss__d_b_out, d_loss__d_w_h, d_loss__d_b_h = \
                model.backward(X_train_mini, a_h, a_out, y_train_mini)

            #### Update weights ####
            model.weight_h -= learning_rate * d_loss__d_w_h
            model.bias_h -= learning_rate * d_loss__d_b_h
            model.weight_out -= learning_rate * d_loss__d_w_out
            model.bias_out -= learning_rate * d_loss__d_b_out

        #### Epoch Logging ####
        train_mse, train_acc = compute_mse_and_acc(model, X_train, y_train)
        valid_mse, valid_acc = compute_mse_and_acc(model, X_valid, y_valid)
        train_acc, valid_acc = train_acc*100, valid_acc*100
        epoch_train_acc.append(train_acc)
        epoch_valid_acc.append(valid_acc)
        epoch_loss.append(train_mse)
        print(f'Epoch: {e+1:03d}/{num_epochs:03d} '
              f'| Train MSE: {train_mse:.2f} '
              f'| Train Acc: {train_acc:.2f}% '
              f'| Valid Acc: {valid_acc:.2f}%')

    return epoch_loss, epoch_train_acc, epoch_valid_acc


np.random.seed(123)  # for the training set shuffling

epoch_loss, epoch_train_acc, epoch_valid_acc = train(
    model, X_train, y_train, X_valid, y_valid,
    num_epochs=50, learning_rate=0.1)


# ## Evaluating the neural network performance

plt.plot(range(len(epoch_loss)), epoch_loss)
plt.ylabel('Mean squared error')
plt.xlabel('Epoch')
# plt.savefig('figures/11_07.png', dpi=300)
plt.show()


plt.plot(range(len(epoch_train_acc)), epoch_train_acc,
         label='Training')
plt.plot(range(len(epoch_valid_acc)), epoch_valid_acc,
         label='Validation')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='lower right')
# plt.savefig('figures/11_08.png', dpi=300)
plt.show()


test_mse, test_acc = compute_mse_and_acc(model, X_test, y_test)
print(f'Test accuracy: {test_acc*100:.2f}%')


# Plot failure cases:

X_test_subset = X_test[:1000, :]
y_test_subset = y_test[:1000]

_, probas = model.forward(X_test_subset)
test_pred = np.argmax(probas, axis=1)

misclassified_images = X_test_subset[y_test_subset != test_pred][:25]
misclassified_labels = test_pred[y_test_subset != test_pred][:25]
correct_labels = y_test_subset[y_test_subset != test_pred][:25]


fig, ax = plt.subplots(nrows=5, ncols=5,
                       sharex=True, sharey=True, figsize=(8, 8))
ax = ax.flatten()
for i in range(25):
    img = misclassified_images[i].reshape(28, 28)
    ax[i].imshow(img, cmap='Greys', interpolation='nearest')
    ax[i].set_title(f'{i+1}) '
                    f'True: {correct_labels[i]}\n'
                    f' Predicted: {misclassified_labels[i]}')

ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
# plt.savefig('figures/11_09.png', dpi=300)
plt.show()


# # Training an artificial neural network

# ...

# ## Computing the loss function

# ## Developing your intuition for backpropagation

# ...

# ## Training neural networks via backpropagation


# # Convergence in neural networks

# ...

# # Summary

# ...

# ---
#
# Readers may ignore the next cell.