# coding: utf-8

import sys
from python_environment_check import check_packages
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import numpy as np
import scipy.stats
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv  # noqa: F401, enables HalvingRandomSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_curve, auc
from numpy import interp
from sklearn.utils import resample

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')

# Check recommended package versions:

d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2'
}
check_packages(d)

# # Chapter 6 - Learning Best Practices for Model Evaluation and Hyperparameter Tuning

# ### Overview

# - [Streamlining workflows with pipelines](#Streamlining-workflows-with-pipelines)
#   - [Loading the Breast Cancer Wisconsin dataset](#Loading-the-Breast-Cancer-Wisconsin-dataset)
#   - [Combining transformers and estimators in a pipeline](#Combining-transformers-and-estimators-in-a-pipeline)
# - [Using k-fold cross-validation to assess model performance](#Using-k-fold-cross-validation-to-assess-model-performance)
#   - [The holdout method](#The-holdout-method)
#   - [K-fold cross-validation](#K-fold-cross-validation)
# - [Debugging algorithms with learning and validation curves](#Debugging-algorithms-with-learning-and-validation-curves)
#   - [Diagnosing bias and variance problems with learning curves](#Diagnosing-bias-and-variance-problems-with-learning-curves)
#   - [Addressing overfitting and underfitting with validation curves](#Addressing-overfitting-and-underfitting-with-validation-curves)
# - [Fine-tuning machine learning models via grid search](#Fine-tuning-machine-learning-models-via-grid-search)
#   - [Tuning hyperparameters via grid search](#Tuning-hyperparameters-via-grid-search)
#   - [Exploring hyperparameter configurations more widely with randomized search](#Exploring-hyperparameter-configurations-more-widely-with-randomized-search)
#   - [More resource-efficient hyperparameter search with successive halving](#More-resource-efficient-hyperparameter-search-with-successive-halving)
#   - [Algorithm selection with nested cross-validation](#Algorithm-selection-with-nested-cross-validation)
# - [Looking at different performance evaluation metrics](#Looking-at-different-performance-evaluation-metrics)
#   - [Reading a confusion matrix](#Reading-a-confusion-matrix)
#   - [Optimizing the precision and recall of a classification model](#Optimizing-the-precision-and-recall-of-a-classification-model)
#   - [Plotting a receiver operating characteristic](#Plotting-a-receiver-operating-characteristic)
#   - [The scoring metrics for multiclass classification](#The-scoring-metrics-for-multiclass-classification)
#   - [Dealing with class imbalance](#Dealing-with-class-imbalance)
# - [Summary](#Summary)

# # Streamlining workflows with pipelines

# ...

# ## Loading the Breast Cancer Wisconsin dataset

df = pd.read_csv('https://archive.ics.uci.edu/ml/'
                 'machine-learning-databases'
                 '/breast-cancer-wisconsin/wdbc.data', header=None)

# If the Breast Cancer dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df = pd.read_csv('wdbc.data', header=None)

df.head()

df.shape

X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
le.classes_

le.transform(['M', 'B'])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    stratify=y,
                                                    random_state=1)

# ## Combining transformers and estimators in a pipeline

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression())

pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
test_acc = pipe_lr.score(X_test, y_test)
print(f'Test accuracy: {test_acc:.3f}')

# # Using k-fold cross-validation to assess model performance

# ...

# ## The holdout method

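# The holdout method splits the data into separate training, validation, and
# test sets: the validation set guides model selection, and the test set gives
# the final, unbiased performance estimate. A minimal sketch (this cell is an
# added illustration, not part of the original notebook; the split ratios are
# assumptions):

X_temp, X_test_h, y_temp, y_test_h = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
# 0.25 of the remaining 80% -> a 60/20/20 train/validation/test split
X_train_h, X_valid_h, y_train_h, y_valid_h = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=1)
print(X_train_h.shape, X_valid_h.shape, X_test_h.shape)
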
# ## K-fold cross-validation

kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)

    print(f'Fold: {k+1:02d}, '
          f'Class distr.: {np.bincount(y_train[train])}, '
          f'Acc.: {score:.3f}')

mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f'\nCV accuracy: {mean_acc:.3f} +/- {std_acc:.3f}')

scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)
print(f'CV accuracy scores: {scores}')
print(f'CV accuracy: {np.mean(scores):.3f} '
      f'+/- {np.std(scores):.3f}')

# # Debugging algorithms with learning and validation curves

# ## Diagnosing bias and variance problems with learning curves

pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(penalty='l2', max_iter=10000))

train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr,
                                                        X=X_train,
                                                        y=y_train,
                                                        train_sizes=np.linspace(0.1, 1.0, 10),
                                                        cv=10,
                                                        n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.03])
plt.tight_layout()
# plt.savefig('figures/06_05.png', dpi=300)
plt.show()

# ## Addressing over- and underfitting with validation curves

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='blue')

plt.plot(param_range, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.tight_layout()
# plt.savefig('figures/06_06.png', dpi=300)
plt.show()

# # Fine-tuning machine learning models via grid search

# ## Tuning hyperparameters via grid search

pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=1))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  refit=True,
                  cv=10)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

clf = gs.best_estimator_

# clf.fit(X_train, y_train)
# Note that we do not need to refit the classifier here,
# because refitting on the whole training set is done
# automatically via refit=True.

print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')

# ## Exploring hyperparameter configurations more widely with randomized search

param_range = [0.0001, 0.001, 0.01, 0.1,
               1.0, 10.0, 100.0, 1000.0]

# Instead of a fixed grid, draw candidate values from a log-uniform
# distribution over the same range:
param_range = scipy.stats.loguniform(0.0001, 1000.0)

np.random.seed(1)
param_range.rvs(10)

pipe_svc = make_pipeline(
    StandardScaler(),
    SVC(random_state=1))

param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

rs = RandomizedSearchCV(estimator=pipe_svc,
                        param_distributions=param_grid,
                        scoring='accuracy',
                        refit=True,
                        n_iter=20,
                        cv=10,
                        random_state=1,
                        n_jobs=-1)

rs = rs.fit(X_train, y_train)
print(rs.best_score_)
print(rs.best_params_)

# ## More resource-efficient hyperparameter search with successive halving

hs = HalvingRandomSearchCV(
    pipe_svc,
    param_distributions=param_grid,
    n_candidates='exhaust',   # sample enough candidates so the last round uses the full budget
    resource='n_samples',     # the budget is the number of training examples
    factor=1.5,               # keep the top 1/1.5 of candidates (and grow the budget) each round
    random_state=1,
    n_jobs=-1)

hs = hs.fit(X_train, y_train)
print(hs.best_score_)
print(hs.best_params_)

clf = hs.best_estimator_
print(f'Test accuracy: {hs.score(X_test, y_test):.3f}')

# ## Algorithm selection with nested cross-validation

# GridSearchCV needs explicit candidate lists rather than distributions,
# so rebuild the grid from the discrete parameter range:
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2)

scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print(f'CV accuracy: {np.mean(scores):.3f} '
      f'+/- {np.std(scores):.3f}')

gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                  param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
                  scoring='accuracy',
                  cv=2)

scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print(f'CV accuracy: {np.mean(scores):.3f} '
      f'+/- {np.std(scores):.3f}')

# # Looking at different performance evaluation metrics

# ...

# ## Reading a confusion matrix

pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
ax.xaxis.set_ticks_position('bottom')

plt.xlabel('Predicted label')
plt.ylabel('True label')

plt.tight_layout()
# plt.savefig('figures/06_09.png', dpi=300)
plt.show()

# ### Additional Note

# Remember that we previously encoded the class labels so that *malignant* examples are the "positive" class (1), and *benign* examples are the "negative" class (0):

le.transform(['M', 'B'])

# Next, we printed the confusion matrix like so:

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

# Note that the (true) class 0 examples that are correctly predicted as class 0 (true negatives) are now in the upper left corner of the matrix (index 0, 0). In order to change the ordering so that the true negatives are in the lower right corner (index 1, 1) and the true positives are in the upper left, we can use the `labels` argument as shown below:

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])
print(confmat)

# We conclude:
#
# Assuming that class 1 (malignant) is the positive class in this example, our model correctly classified 71 of the examples that belong to class 0 (true negatives) and 40 examples that belong to class 1 (true positives). However, our model also misclassified 1 example from class 0 as class 1 (false positive), and it predicted that 2 examples are benign although they are malignant tumors (false negatives).

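# To pull these four counts out programmatically, the entries of the default
# (label-ordered) confusion matrix can be unravelled; a small added sketch,
# not part of the original notebook:

tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()
print(f'TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}')
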
# ## Optimizing the precision and recall of a classification model

pre_val = precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {pre_val:.3f}')

rec_val = recall_score(y_true=y_test, y_pred=y_pred)
print(f'Recall: {rec_val:.3f}')

f1_val = f1_score(y_true=y_test, y_pred=y_pred)
print(f'F1: {f1_val:.3f}')

mcc_val = matthews_corrcoef(y_true=y_test, y_pred=y_pred)
print(f'MCC: {mcc_val:.3f}')

scorer = make_scorer(f1_score, pos_label=0)

c_gamma_range = [0.01, 0.1, 1.0, 10.0]

param_grid = [{'svc__C': c_gamma_range,
               'svc__kernel': ['linear']},
              {'svc__C': c_gamma_range,
               'svc__gamma': c_gamma_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

# ## Plotting a receiver operating characteristic

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(penalty='l2',
                                           random_state=1,
                                           solver='lbfgs',
                                           C=100.0))

X_train2 = X_train[:, [4, 14]]

cv = list(StratifiedKFold(n_splits=3).split(X_train, y_train))

fig = plt.figure(figsize=(7, 5))

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])

    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],
                                     pos_label=1)
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             label=f'ROC fold {i+1} (area = {roc_auc:.2f})')

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='Random guessing (area = 0.5)')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label=f'Mean ROC (area = {mean_auc:.2f})', lw=2)
plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='Perfect performance (area = 1.0)')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')

plt.tight_layout()
# plt.savefig('figures/06_10.png', dpi=300)
plt.show()

# ## The scoring metrics for multiclass classification

pre_scorer = make_scorer(score_func=precision_score,
                         pos_label=1,
                         greater_is_better=True,
                         average='micro')

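# A brief usage sketch (added for illustration, not part of the original
# notebook): a custom scorer like pre_scorer can be passed wherever sklearn
# accepts a `scoring` argument, e.g. in cross_val_score:

scores = cross_val_score(estimator=pipe_svc,
                         X=X_train,
                         y=y_train,
                         scoring=pre_scorer,
                         cv=10)
print(f'Micro-averaged precision: {np.mean(scores):.3f}')
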
# ## Dealing with class imbalance

X_imb = np.vstack((X[y == 0], X[y == 1][:40]))
y_imb = np.hstack((y[y == 0], y[y == 1][:40]))

y_pred = np.zeros(y_imb.shape[0])
np.mean(y_pred == y_imb) * 100

print('Number of class 1 examples before:', X_imb[y_imb == 1].shape[0])

X_upsampled, y_upsampled = resample(X_imb[y_imb == 1],
                                    y_imb[y_imb == 1],
                                    replace=True,
                                    n_samples=X_imb[y_imb == 0].shape[0],
                                    random_state=123)

print('Number of class 1 examples after:', X_upsampled.shape[0])

X_bal = np.vstack((X[y == 0], X_upsampled))
y_bal = np.hstack((y[y == 0], y_upsampled))

y_pred = np.zeros(y_bal.shape[0])
np.mean(y_pred == y_bal) * 100

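# The mirror-image strategy, downsampling the majority class, works the same
# way with resample; a small added sketch (not part of the original notebook):

X_downsampled, y_downsampled = resample(X_imb[y_imb == 0],
                                        y_imb[y_imb == 0],
                                        replace=False,
                                        n_samples=X_imb[y_imb == 1].shape[0],
                                        random_state=123)
print('Number of class 0 examples after downsampling:', X_downsampled.shape[0])
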
# # Summary

# ...

# ---
#
# Readers may ignore the next cell.