# coding: utf-8

import sys
from python_environment_check import check_packages
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import numpy as np
import scipy.stats
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv  # noqa: F401, enables HalvingRandomSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_curve, auc
from numpy import interp
from sklearn.utils import resample

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')

# Check recommended package versions:

d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2'
}
check_packages(d)

# # Chapter 6 - Learning Best Practices for Model Evaluation and Hyperparameter Tuning

# ### Overview

# - [Streamlining workflows with pipelines](#Streamlining-workflows-with-pipelines)
#   - [Loading the Breast Cancer Wisconsin dataset](#Loading-the-Breast-Cancer-Wisconsin-dataset)
#   - [Combining transformers and estimators in a pipeline](#Combining-transformers-and-estimators-in-a-pipeline)
# - [Using k-fold cross-validation to assess model performance](#Using-k-fold-cross-validation-to-assess-model-performance)
#   - [The holdout method](#The-holdout-method)
#   - [K-fold cross-validation](#K-fold-cross-validation)
# - [Debugging algorithms with learning and validation curves](#Debugging-algorithms-with-learning-and-validation-curves)
#   - [Diagnosing bias and variance problems with learning curves](#Diagnosing-bias-and-variance-problems-with-learning-curves)
#   - [Addressing overfitting and underfitting with validation curves](#Addressing-overfitting-and-underfitting-with-validation-curves)
# - [Fine-tuning machine learning models via grid search](#Fine-tuning-machine-learning-models-via-grid-search)
#   - [Tuning hyperparameters via grid search](#Tuning-hyperparameters-via-grid-search)
#   - [Exploring hyperparameter configurations more widely with randomized search](#Exploring-hyperparameter-configurations-more-widely-with-randomized-search)
#   - [More resource-efficient hyperparameter search with successive halving](#More-resource-efficient-hyperparameter-search-with-successive-halving)
#   - [Algorithm selection with nested cross-validation](#Algorithm-selection-with-nested-cross-validation)
# - [Looking at different performance evaluation metrics](#Looking-at-different-performance-evaluation-metrics)
#   - [Reading a confusion matrix](#Reading-a-confusion-matrix)
#   - [Optimizing the precision and recall of a classification model](#Optimizing-the-precision-and-recall-of-a-classification-model)
#   - [Plotting a receiver operating characteristic](#Plotting-a-receiver-operating-characteristic)
#   - [The scoring metrics for multiclass classification](#The-scoring-metrics-for-multiclass-classification)
#   - [Dealing with class imbalance](#Dealing-with-class-imbalance)
# - [Summary](#Summary)

# # Streamlining workflows with pipelines

# ...

# ## Loading the Breast Cancer Wisconsin dataset

df = pd.read_csv('https://archive.ics.uci.edu/ml/'
                 'machine-learning-databases'
                 '/breast-cancer-wisconsin/wdbc.data', header=None)

# If the Breast Cancer dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df = pd.read_csv('wdbc.data', header=None)

df.head()

df.shape

X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
le.classes_

le.transform(['M', 'B'])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    stratify=y,
                                                    random_state=1)

# ## Combining transformers and estimators in a pipeline

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression())

pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
test_acc = pipe_lr.score(X_test, y_test)
print(f'Test accuracy: {test_acc:.3f}')

# # Using k-fold cross-validation to assess model performance

# ...

# ## The holdout method

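# The holdout method splits the data into separate training, validation, and
# test sets: the validation set guides model selection, and the test set gives
# the final, unbiased performance estimate. A minimal sketch (this cell is an
# added illustration, not part of the original notebook; the split ratios are
# assumptions):

X_temp, X_test_h, y_temp, y_test_h = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
# 0.25 of the remaining 80% -> a 60/20/20 train/validation/test split
X_train_h, X_valid_h, y_train_h, y_valid_h = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=1)
print(X_train_h.shape, X_valid_h.shape, X_test_h.shape)
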
# ## K-fold cross-validation

kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)

    print(f'Fold: {k+1:02d}, '
          f'Class distr.: {np.bincount(y_train[train])}, '
          f'Acc.: {score:.3f}')

mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f'\nCV accuracy: {mean_acc:.3f} +/- {std_acc:.3f}')

scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)
print(f'CV accuracy scores: {scores}')
print(f'CV accuracy: {np.mean(scores):.3f} '
      f'+/- {np.std(scores):.3f}')

# # Debugging algorithms with learning and validation curves

# ## Diagnosing bias and variance problems with learning curves

pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(penalty='l2', max_iter=10000))

train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr,
                                                        X=X_train,
                                                        y=y_train,
                                                        train_sizes=np.linspace(0.1, 1.0, 10),
                                                        cv=10,
                                                        n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.03])
plt.tight_layout()
# plt.savefig('figures/06_05.png', dpi=300)
plt.show()

# ## Addressing over- and underfitting with validation curves

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='blue')

plt.plot(param_range, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.tight_layout()
# plt.savefig('figures/06_06.png', dpi=300)
plt.show()

# # Fine-tuning machine learning models via grid search

# ## Tuning hyperparameters via grid search

pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=1))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  refit=True,
                  cv=10)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

clf = gs.best_estimator_

# clf.fit(X_train, y_train)
# Note that we do not need to refit the classifier here,
# because refitting on the whole training set is done
# automatically via refit=True.

print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')

# ## Exploring hyperparameter configurations more widely with randomized search

param_range = [0.0001, 0.001, 0.01, 0.1,
               1.0, 10.0, 100.0, 1000.0]

# Instead of a fixed grid, draw candidate values from a log-uniform
# distribution over the same range:
param_range = scipy.stats.loguniform(0.0001, 1000.0)

np.random.seed(1)
param_range.rvs(10)

pipe_svc = make_pipeline(
    StandardScaler(),
    SVC(random_state=1))

param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

rs = RandomizedSearchCV(estimator=pipe_svc,
                        param_distributions=param_grid,
                        scoring='accuracy',
                        refit=True,
                        n_iter=20,
                        cv=10,
                        random_state=1,
                        n_jobs=-1)

rs = rs.fit(X_train, y_train)
print(rs.best_score_)
print(rs.best_params_)

# ## More resource-efficient hyperparameter search with successive halving

hs = HalvingRandomSearchCV(
    pipe_svc,
    param_distributions=param_grid,
    n_candidates='exhaust',   # sample enough candidates so the last round uses the full budget
    resource='n_samples',     # the budget is the number of training examples
    factor=1.5,               # keep the top 1/1.5 of candidates (and grow the budget) each round
    random_state=1,
    n_jobs=-1)

hs = hs.fit(X_train, y_train)
print(hs.best_score_)
print(hs.best_params_)

clf = hs.best_estimator_
print(f'Test accuracy: {hs.score(X_test, y_test):.3f}')

# ## Algorithm selection with nested cross-validation

# GridSearchCV needs explicit candidate lists rather than distributions,
# so rebuild the grid from the discrete parameter range:
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2)

scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print(f'CV accuracy: {np.mean(scores):.3f} '
      f'+/- {np.std(scores):.3f}')

gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                  param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
                  scoring='accuracy',
                  cv=2)

scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print(f'CV accuracy: {np.mean(scores):.3f} '
      f'+/- {np.std(scores):.3f}')

# # Looking at different performance evaluation metrics

# ...

# ## Reading a confusion matrix

pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
ax.xaxis.set_ticks_position('bottom')

plt.xlabel('Predicted label')
plt.ylabel('True label')

plt.tight_layout()
# plt.savefig('figures/06_09.png', dpi=300)
plt.show()

# ### Additional Note

# Remember that we previously encoded the class labels so that *malignant* examples are the "positive" class (1), and *benign* examples are the "negative" class (0):

le.transform(['M', 'B'])

# Next, we printed the confusion matrix like so:

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

# Note that the (true) class 0 examples that are correctly predicted as class 0 (true negatives) are now in the upper left corner of the matrix (index 0, 0). In order to change the ordering so that the true negatives are in the lower right corner (index 1, 1) and the true positives are in the upper left, we can use the `labels` argument as shown below:

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])
print(confmat)

# We conclude:
#
# Assuming that class 1 (malignant) is the positive class in this example, our model correctly classified 71 of the examples that belong to class 0 (true negatives) and 40 examples that belong to class 1 (true positives). However, our model also misclassified 1 example from class 0 as class 1 (false positive), and it predicted that 2 examples are benign although they are malignant tumors (false negatives).

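# To pull these four counts out programmatically, the entries of the default
# (label-ordered) confusion matrix can be unravelled; a small added sketch,
# not part of the original notebook:

tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()
print(f'TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}')
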
# ## Optimizing the precision and recall of a classification model

pre_val = precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {pre_val:.3f}')

rec_val = recall_score(y_true=y_test, y_pred=y_pred)
print(f'Recall: {rec_val:.3f}')

f1_val = f1_score(y_true=y_test, y_pred=y_pred)
print(f'F1: {f1_val:.3f}')

mcc_val = matthews_corrcoef(y_true=y_test, y_pred=y_pred)
print(f'MCC: {mcc_val:.3f}')

scorer = make_scorer(f1_score, pos_label=0)

c_gamma_range = [0.01, 0.1, 1.0, 10.0]

param_grid = [{'svc__C': c_gamma_range,
               'svc__kernel': ['linear']},
              {'svc__C': c_gamma_range,
               'svc__gamma': c_gamma_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

# ## Plotting a receiver operating characteristic

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(penalty='l2',
                                           random_state=1,
                                           solver='lbfgs',
                                           C=100.0))

X_train2 = X_train[:, [4, 14]]

cv = list(StratifiedKFold(n_splits=3).split(X_train, y_train))

fig = plt.figure(figsize=(7, 5))

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])

    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],
                                     pos_label=1)
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             label=f'ROC fold {i+1} (area = {roc_auc:.2f})')

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='Random guessing (area = 0.5)')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label=f'Mean ROC (area = {mean_auc:.2f})', lw=2)
plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='Perfect performance (area = 1.0)')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')

plt.tight_layout()
# plt.savefig('figures/06_10.png', dpi=300)
plt.show()

# ## The scoring metrics for multiclass classification

pre_scorer = make_scorer(score_func=precision_score,
                         pos_label=1,
                         greater_is_better=True,
                         average='micro')

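# A brief usage sketch (added for illustration, not part of the original
# notebook): a custom scorer like pre_scorer can be passed wherever sklearn
# accepts a `scoring` argument, e.g. in cross_val_score:

scores = cross_val_score(estimator=pipe_svc,
                         X=X_train,
                         y=y_train,
                         scoring=pre_scorer,
                         cv=10)
print(f'Micro-averaged precision: {np.mean(scores):.3f}')
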
# ## Dealing with class imbalance

X_imb = np.vstack((X[y == 0], X[y == 1][:40]))
y_imb = np.hstack((y[y == 0], y[y == 1][:40]))

y_pred = np.zeros(y_imb.shape[0])
np.mean(y_pred == y_imb) * 100

print('Number of class 1 examples before:', X_imb[y_imb == 1].shape[0])

X_upsampled, y_upsampled = resample(X_imb[y_imb == 1],
                                    y_imb[y_imb == 1],
                                    replace=True,
                                    n_samples=X_imb[y_imb == 0].shape[0],
                                    random_state=123)

print('Number of class 1 examples after:', X_upsampled.shape[0])

X_bal = np.vstack((X[y == 0], X_upsampled))
y_bal = np.hstack((y[y == 0], y_upsampled))

y_pred = np.zeros(y_bal.shape[0])
np.mean(y_pred == y_bal) * 100

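# The mirror-image strategy, downsampling the majority class, works the same
# way with resample; a small added sketch (not part of the original notebook):

X_downsampled, y_downsampled = resample(X_imb[y_imb == 0],
                                        y_imb[y_imb == 0],
                                        replace=False,
                                        n_samples=X_imb[y_imb == 1].shape[0],
                                        random_state=123)
print('Number of class 0 examples after downsampling:', X_downsampled.shape[0])
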
# # Summary

# ...

# ---
#
# Readers may ignore the next cell.