# coding: utf-8


import sys
from python_environment_check import check_packages
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add the parent folder to the path in order to load the check_packages.py script:

sys.path.insert(0, '..')


# Check recommended package versions:

d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
}
check_packages(d)


# # Chapter 11 - Implementing a Multi-layer Artificial Neural Network from Scratch
#

# ### Overview

# - [Modeling complex functions with artificial neural networks](#Modeling-complex-functions-with-artificial-neural-networks)
#   - [Single-layer neural network recap](#Single-layer-neural-network-recap)
#   - [Introducing the multi-layer neural network architecture](#Introducing-the-multi-layer-neural-network-architecture)
#   - [Activating a neural network via forward propagation](#Activating-a-neural-network-via-forward-propagation)
# - [Classifying handwritten digits](#Classifying-handwritten-digits)
#   - [Obtaining and preparing the MNIST dataset](#Obtaining-and-preparing-the-MNIST-dataset)
#   - [Implementing a multi-layer perceptron](#Implementing-a-multi-layer-perceptron)
#   - [Coding the neural network training loop](#Coding-the-neural-network-training-loop)
#   - [Evaluating the neural network performance](#Evaluating-the-neural-network-performance)
# - [Training an artificial neural network](#Training-an-artificial-neural-network)
#   - [Computing the loss function](#Computing-the-loss-function)
#   - [Developing your intuition for backpropagation](#Developing-your-intuition-for-backpropagation)
#   - [Training neural networks via backpropagation](#Training-neural-networks-via-backpropagation)
# - [Convergence in neural networks](#Convergence-in-neural-networks)
# - [Summary](#Summary)


# # Modeling complex functions with artificial neural networks

# ...

# ## Single-layer neural network recap

# ## Introducing the multi-layer neural network architecture

# ## Activating a neural network via forward propagation


# # Classifying handwritten digits

# ...

# ## Obtaining and preparing the MNIST dataset

# The MNIST dataset is publicly available at http://yann.lecun.com/exdb/mnist/ and consists of the following four parts:
#
# - Training set images: train-images-idx3-ubyte.gz (9.9 MB, 47 MB unzipped, 60,000 examples)
# - Training set labels: train-labels-idx1-ubyte.gz (29 KB, 60 KB unzipped, 60,000 labels)
# - Test set images: t10k-images-idx3-ubyte.gz (1.6 MB, 7.8 MB unzipped, 10,000 examples)
# - Test set labels: t10k-labels-idx1-ubyte.gz (5 KB, 10 KB unzipped, 10,000 labels)
#
# Here, we fetch the dataset conveniently via scikit-learn's OpenML interface:

X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X.values
y = y.astype(int).values

print(X.shape)
print(y.shape)


# Normalize the pixel values from the original [0, 255] range to [-1, 1]:

X = ((X / 255.) - .5) * 2
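
# As a quick sanity check (not part of the original code), we can confirm that
# the rescaled pixel values now fall within the expected [-1, 1] range:

print(f'Min: {X.min():.1f}, Max: {X.max():.1f}')  # expected: Min: -1.0, Max: 1.0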

# Visualize the first digit of each class:

fig, ax = plt.subplots(nrows=2, ncols=5, sharex=True, sharey=True)
ax = ax.flatten()
for i in range(10):
    img = X[y == i][0].reshape(28, 28)
    ax[i].imshow(img, cmap='Greys')

ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
# plt.savefig('figures/11_4.png', dpi=300)
plt.show()


# Visualize 25 different versions of "7":

fig, ax = plt.subplots(nrows=5, ncols=5, sharex=True, sharey=True)
ax = ax.flatten()
for i in range(25):
    img = X[y == 7][i].reshape(28, 28)
    ax[i].imshow(img, cmap='Greys')

ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
# plt.savefig('figures/11_5.png', dpi=300)
plt.show()


# Split into training, validation, and test sets:

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=10000, random_state=123, stratify=y)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=5000, random_state=123, stratify=y_temp)


# Optionally, free up some memory by deleting the arrays we no longer need:

del X_temp, y_temp, X, y


# ## Implementing a multi-layer perceptron

##########################
### MODEL
##########################

def sigmoid(z):
    return 1. / (1. + np.exp(-z))


def int_to_onehot(y, num_labels):
    ary = np.zeros((y.shape[0], num_labels))
    for i, val in enumerate(y):
        ary[i, val] = 1

    return ary
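
# A small illustrative example (not in the original code) of what `int_to_onehot`
# produces: each integer class label becomes a row with a single 1 at the index
# of that label.

example_labels = np.array([0, 2, 1])  # hypothetical labels for illustration
print(int_to_onehot(example_labels, num_labels=3))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]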

class NeuralNetMLP:

    def __init__(self, num_features, num_hidden, num_classes, random_seed=123):
        super().__init__()

        self.num_classes = num_classes

        # hidden
        rng = np.random.RandomState(random_seed)

        self.weight_h = rng.normal(
            loc=0.0, scale=0.1, size=(num_hidden, num_features))
        self.bias_h = np.zeros(num_hidden)

        # output
        self.weight_out = rng.normal(
            loc=0.0, scale=0.1, size=(num_classes, num_hidden))
        self.bias_out = np.zeros(num_classes)

    def forward(self, x):
        # Hidden layer
        # input dim: [n_examples, n_features] dot [n_hidden, n_features].T
        # output dim: [n_examples, n_hidden]
        z_h = np.dot(x, self.weight_h.T) + self.bias_h
        a_h = sigmoid(z_h)

        # Output layer
        # input dim: [n_examples, n_hidden] dot [n_classes, n_hidden].T
        # output dim: [n_examples, n_classes]
        z_out = np.dot(a_h, self.weight_out.T) + self.bias_out
        a_out = sigmoid(z_out)
        return a_h, a_out

    def backward(self, x, a_h, a_out, y):

        #########################
        ### Output layer weights
        #########################

        # one-hot encoding
        y_onehot = int_to_onehot(y, self.num_classes)

        # Part 1: dLoss/dOutWeights
        ## = dLoss/dOutAct * dOutAct/dOutNet * dOutNet/dOutWeight
        ## where DeltaOut = dLoss/dOutAct * dOutAct/dOutNet
        ## for convenient re-use

        # input/output dim: [n_examples, n_classes]
        d_loss__d_a_out = 2.*(a_out - y_onehot) / y.shape[0]

        # input/output dim: [n_examples, n_classes]
        d_a_out__d_z_out = a_out * (1. - a_out)  # sigmoid derivative

        # output dim: [n_examples, n_classes]
        delta_out = d_loss__d_a_out * d_a_out__d_z_out  # "delta (rule) placeholder"

        # gradient for output weights

        # [n_examples, n_hidden]
        d_z_out__dw_out = a_h

        # input dim: [n_classes, n_examples] dot [n_examples, n_hidden]
        # output dim: [n_classes, n_hidden]
        d_loss__dw_out = np.dot(delta_out.T, d_z_out__dw_out)
        d_loss__db_out = np.sum(delta_out, axis=0)


        #################################
        # Part 2: dLoss/dHiddenWeights
        ## = DeltaOut * dOutNet/dHiddenAct * dHiddenAct/dHiddenNet * dHiddenNet/dWeight

        # [n_classes, n_hidden]
        d_z_out__a_h = self.weight_out

        # output dim: [n_examples, n_hidden]
        d_loss__a_h = np.dot(delta_out, d_z_out__a_h)

        # [n_examples, n_hidden]
        d_a_h__d_z_h = a_h * (1. - a_h)  # sigmoid derivative

        # [n_examples, n_features]
        d_z_h__d_w_h = x

        # output dim: [n_hidden, n_features]
        d_loss__d_w_h = np.dot((d_loss__a_h * d_a_h__d_z_h).T, d_z_h__d_w_h)
        d_loss__d_b_h = np.sum((d_loss__a_h * d_a_h__d_z_h), axis=0)

        return (d_loss__dw_out, d_loss__db_out,
                d_loss__d_w_h, d_loss__d_b_h)


model = NeuralNetMLP(num_features=28*28,
                     num_hidden=50,
                     num_classes=10)
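
# A quick shape check (not in the original code): a forward pass on a small
# slice of the training data should yield hidden activations of shape
# [n_examples, n_hidden] and output activations of shape [n_examples, n_classes].
# The `_tmp` variable names below are only for this illustration:

a_h_tmp, a_out_tmp = model.forward(X_train[:5])
print(a_h_tmp.shape)    # (5, 50)
print(a_out_tmp.shape)  # (5, 10)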

# ## Coding the neural network training loop

# Defining data loaders:

num_epochs = 50
minibatch_size = 100


def minibatch_generator(X, y, minibatch_size):
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)

    for start_idx in range(0, indices.shape[0] - minibatch_size
                           + 1, minibatch_size):
        batch_idx = indices[start_idx:start_idx + minibatch_size]

        yield X[batch_idx], y[batch_idx]


# Confirm that the minibatches have the expected dimensions by fetching
# the first minibatch of the first epoch:

# iterate over training epochs
for i in range(num_epochs):

    # iterate over minibatches
    minibatch_gen = minibatch_generator(
        X_train, y_train, minibatch_size)

    for X_train_mini, y_train_mini in minibatch_gen:
        break

    break

print(X_train_mini.shape)
print(y_train_mini.shape)


# Defining a function to compute the loss and accuracy:

def mse_loss(targets, probas, num_labels=10):
    onehot_targets = int_to_onehot(targets, num_labels=num_labels)
    return np.mean((onehot_targets - probas)**2)


def accuracy(targets, predicted_labels):
    return np.mean(predicted_labels == targets)


_, probas = model.forward(X_valid)
mse = mse_loss(y_valid, probas)

predicted_labels = np.argmax(probas, axis=1)
acc = accuracy(y_valid, predicted_labels)

print(f'Initial validation MSE: {mse:.1f}')
print(f'Initial validation accuracy: {acc*100:.1f}%')


# For larger datasets, it is more memory-efficient to compute the MSE and
# accuracy incrementally, one minibatch at a time:

def compute_mse_and_acc(nnet, X, y, num_labels=10, minibatch_size=100):
    mse, correct_pred, num_examples = 0., 0, 0
    minibatch_gen = minibatch_generator(X, y, minibatch_size)

    for i, (features, targets) in enumerate(minibatch_gen):
        _, probas = nnet.forward(features)
        predicted_labels = np.argmax(probas, axis=1)

        onehot_targets = int_to_onehot(targets, num_labels=num_labels)
        loss = np.mean((onehot_targets - probas)**2)
        correct_pred += (predicted_labels == targets).sum()

        num_examples += targets.shape[0]
        mse += loss

    mse = mse/(i+1)  # average the per-minibatch losses
    acc = correct_pred/num_examples
    return mse, acc


mse, acc = compute_mse_and_acc(model, X_valid, y_valid)
print(f'Initial valid MSE: {mse:.1f}')
print(f'Initial valid accuracy: {acc*100:.1f}%')


def train(model, X_train, y_train, X_valid, y_valid, num_epochs,
          learning_rate=0.1):

    epoch_loss = []
    epoch_train_acc = []
    epoch_valid_acc = []

    for e in range(num_epochs):

        # iterate over minibatches
        minibatch_gen = minibatch_generator(
            X_train, y_train, minibatch_size)

        for X_train_mini, y_train_mini in minibatch_gen:

            #### Compute outputs ####
            a_h, a_out = model.forward(X_train_mini)

            #### Compute gradients ####
            d_loss__d_w_out, d_loss__d_b_out, d_loss__d_w_h, d_loss__d_b_h = \
                model.backward(X_train_mini, a_h, a_out, y_train_mini)

            #### Update weights ####
            model.weight_h -= learning_rate * d_loss__d_w_h
            model.bias_h -= learning_rate * d_loss__d_b_h
            model.weight_out -= learning_rate * d_loss__d_w_out
            model.bias_out -= learning_rate * d_loss__d_b_out

        #### Epoch Logging ####
        train_mse, train_acc = compute_mse_and_acc(model, X_train, y_train)
        valid_mse, valid_acc = compute_mse_and_acc(model, X_valid, y_valid)
        train_acc, valid_acc = train_acc*100, valid_acc*100
        epoch_train_acc.append(train_acc)
        epoch_valid_acc.append(valid_acc)
        epoch_loss.append(train_mse)
        print(f'Epoch: {e+1:03d}/{num_epochs:03d} '
              f'| Train MSE: {train_mse:.2f} '
              f'| Train Acc: {train_acc:.2f}% '
              f'| Valid Acc: {valid_acc:.2f}%')

    return epoch_loss, epoch_train_acc, epoch_valid_acc


np.random.seed(123)  # for the training set shuffling

epoch_loss, epoch_train_acc, epoch_valid_acc = train(
    model, X_train, y_train, X_valid, y_valid,
    num_epochs=50, learning_rate=0.1)


# ## Evaluating the neural network performance

plt.plot(range(len(epoch_loss)), epoch_loss)
plt.ylabel('Mean squared error')
plt.xlabel('Epoch')
# plt.savefig('figures/11_07.png', dpi=300)
plt.show()


plt.plot(range(len(epoch_train_acc)), epoch_train_acc,
         label='Training')
plt.plot(range(len(epoch_valid_acc)), epoch_valid_acc,
         label='Validation')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='lower right')
# plt.savefig('figures/11_08.png', dpi=300)
plt.show()


test_mse, test_acc = compute_mse_and_acc(model, X_test, y_test)
print(f'Test accuracy: {test_acc*100:.2f}%')


# Plot failure cases:

X_test_subset = X_test[:1000, :]
y_test_subset = y_test[:1000]

_, probas = model.forward(X_test_subset)
test_pred = np.argmax(probas, axis=1)

misclassified_images = X_test_subset[y_test_subset != test_pred][:25]
misclassified_labels = test_pred[y_test_subset != test_pred][:25]
correct_labels = y_test_subset[y_test_subset != test_pred][:25]


fig, ax = plt.subplots(nrows=5, ncols=5,
                       sharex=True, sharey=True, figsize=(8, 8))
ax = ax.flatten()
for i in range(25):
    img = misclassified_images[i].reshape(28, 28)
    ax[i].imshow(img, cmap='Greys', interpolation='nearest')
    ax[i].set_title(f'{i+1}) '
                    f'True: {correct_labels[i]}\n'
                    f' Predicted: {misclassified_labels[i]}')

ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
# plt.savefig('figures/11_09.png', dpi=300)
plt.show()


# # Training an artificial neural network

# ...

# ## Computing the loss function

# ## Developing your intuition for backpropagation

# ...

# ## Training neural networks via backpropagation


# # Convergence in neural networks

# ...

# # Summary

# ...

# ---
#
# Readers may ignore the next cell.