GitHub Repository: rasbt/machine-learning-book
Path: blob/main/ch06/ch06.ipynb

Machine Learning with PyTorch and Scikit-Learn

-- Code Examples

Package version checks

Add the parent folder to the Python path in order to load the check_packages.py script:

import sys
sys.path.insert(0, '..')

Check recommended package versions:

from python_environment_check import check_packages

d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2'
}
check_packages(d)
[OK] Your Python version is 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:24:02) [Clang 11.1.0 ]
[OK] numpy 1.22.1
[OK] matplotlib 3.5.1
[OK] sklearn 1.0.2
[OK] pandas 1.4.0

Chapter 6 - Learning Best Practices for Model Evaluation and Hyperparameter Tuning



Overview



from IPython.display import Image
%matplotlib inline

Streamlining workflows with pipelines

...

Loading the Breast Cancer Wisconsin dataset

import pandas as pd

df = pd.read_csv('https://archive.ics.uci.edu/ml/'
                 'machine-learning-databases'
                 '/breast-cancer-wisconsin/wdbc.data',
                 header=None)

# if the Breast Cancer dataset is temporarily unavailable from the
# UCI machine learning repository, uncomment the following line
# of code to load the dataset from a local path:

# df = pd.read_csv('wdbc.data', header=None)

df.head()
df.shape
(569, 32)
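In this table, the first column stores the examples' ID numbers and the second column the diagnosis (M = malignant, B = benign); columns 2 through 31 hold the 30 real-valued features. This is why only those columns are assigned to the feature matrix X below.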

from sklearn.preprocessing import LabelEncoder

X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
le.classes_
array(['B', 'M'], dtype=object)
le.transform(['M', 'B'])
array([1, 0])
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y,
                     test_size=0.20,
                     stratify=y,
                     random_state=1)


Combining transformers and estimators in a pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression())

pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
test_acc = pipe_lr.score(X_test, y_test)
print(f'Test accuracy: {test_acc:.3f}')
Test accuracy: 0.956
Image(filename='figures/06_01.png', width=500)


Using k-fold cross-validation to assess model performance

...

The holdout method

Image(filename='figures/06_02.png', width=500)
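As a minimal sketch of this idea (the variable names below are ours, not from the chapter), the holdout split can be produced with two successive train_test_split calls, carving a validation set out of the training portion:

from sklearn.model_selection import train_test_split

# 20% test, then 25% of the remaining 80% as validation -> 60/20/20 split
X_tmp, X_tst, y_tmp, y_tst = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
X_trn, X_val, y_trn, y_val = train_test_split(
    X_tmp, y_tmp, test_size=0.25, stratify=y_tmp, random_state=1)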


K-fold cross-validation

Image(filename='figures/06_03.png', width=500)
import numpy as np
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)

    print(f'Fold: {k+1:02d}, '
          f'Class distr.: {np.bincount(y_train[train])}, '
          f'Acc.: {score:.3f}')

mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f'\nCV accuracy: {mean_acc:.3f} +/- {std_acc:.3f}')
Fold: 01, Class distr.: [256 153], Acc.: 0.935
Fold: 02, Class distr.: [256 153], Acc.: 0.935
Fold: 03, Class distr.: [256 153], Acc.: 0.957
Fold: 04, Class distr.: [256 153], Acc.: 0.957
Fold: 05, Class distr.: [256 153], Acc.: 0.935
Fold: 06, Class distr.: [257 153], Acc.: 0.956
Fold: 07, Class distr.: [257 153], Acc.: 0.978
Fold: 08, Class distr.: [257 153], Acc.: 0.933
Fold: 09, Class distr.: [257 153], Acc.: 0.956
Fold: 10, Class distr.: [257 153], Acc.: 0.956

CV accuracy: 0.950 +/- 0.014
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)
print(f'CV accuracy scores: {scores}')
print(f'CV accuracy: {np.mean(scores):.3f} '
      f'+/- {np.std(scores):.3f}')
CV accuracy scores: [0.93478261 0.93478261 0.95652174 0.95652174 0.93478261 0.95555556
 0.97777778 0.93333333 0.95555556 0.95555556]
CV accuracy: 0.950 +/- 0.014


Debugging algorithms with learning curves



Diagnosing bias and variance problems with learning curves

Image(filename='figures/06_04.png', width=600)
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(penalty='l2',
                                           max_iter=10000))

train_sizes, train_scores, test_scores = \
    learning_curve(estimator=pipe_lr,
                   X=X_train,
                   y=y_train,
                   train_sizes=np.linspace(0.1, 1.0, 10),
                   cv=10,
                   n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.03])
plt.tight_layout()
# plt.savefig('figures/06_05.png', dpi=300)
plt.show()


Addressing over- and underfitting with validation curves

from sklearn.model_selection import validation_curve

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

plt.fill_between(param_range,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(param_range, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.tight_layout()
# plt.savefig('figures/06_06.png', dpi=300)
plt.show()


Fine-tuning machine learning models via grid search



Tuning hyperparameters via grid search

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=1))

param_range = [0.0001, 0.001, 0.01, 0.1,
               1.0, 10.0, 100.0, 1000.0]

param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  refit=True,
                  cv=10)

gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
0.9846859903381642
{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
clf = gs.best_estimator_

# clf.fit(X_train, y_train)
# note that we do not need to refit the classifier
# because this is done automatically via refit=True.

print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')
Test accuracy: 0.974
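Optionally, we can inspect the complete search results via the cv_results_ attribute of the fitted GridSearchCV object. A quick sketch (loading the dict of arrays into a pandas DataFrame is our choice here, not the chapter's):

import pandas as pd

# cv_results_ is a dict of arrays; rank_test_score == 1 marks the best setting
results = pd.DataFrame(gs.cv_results_)
print(results.sort_values('rank_test_score')
             [['params', 'mean_test_score', 'std_test_score']].head())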
from sklearn.model_selection import RandomizedSearchCV

pipe_svc = make_pipeline(
    StandardScaler(),
    SVC(random_state=1))

param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

rs = RandomizedSearchCV(estimator=pipe_svc,
                        param_distributions=param_grid,
                        scoring='accuracy',
                        refit=True,
                        n_iter=20,
                        cv=10,
                        random_state=1,
                        n_jobs=-1)
rs = rs.fit(X_train, y_train)
print(rs.best_score_)
0.9737681159420291
print(rs.best_params_)
{'svc__kernel': 'rbf', 'svc__gamma': 0.001, 'svc__C': 10.0}
Image(filename='figures/06_11.png', width=600)
import scipy.stats

param_range = [0.0001, 0.001, 0.01, 0.1,
               1.0, 10.0, 100.0, 1000.0]

param_range = scipy.stats.loguniform(0.0001, 1000.0)

np.random.seed(1)
param_range.rvs(10)
array([8.30145146e-02, 1.10222804e+01, 1.00184520e-04, 1.30715777e-02,
       1.06485687e-03, 4.42965766e-04, 2.01289666e-03, 2.62376594e-02,
       5.98924832e-02, 5.91176467e-01])
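Such a distribution object can be passed to RandomizedSearchCV directly in place of the discrete lists, so that C and gamma are sampled log-uniformly. A brief sketch (the rs_dist name and n_iter=20 are our choices):

rs_dist = RandomizedSearchCV(
    estimator=pipe_svc,
    param_distributions={'svc__C': scipy.stats.loguniform(0.0001, 1000.0),
                         'svc__gamma': scipy.stats.loguniform(0.0001, 1000.0),
                         'svc__kernel': ['rbf']},
    scoring='accuracy',
    n_iter=20,
    cv=10,
    random_state=1,
    n_jobs=-1)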

More resource-efficient hyperparameter search with successive halving

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
hs = HalvingRandomSearchCV(
    pipe_svc,
    param_distributions=param_grid,
    n_candidates='exhaust',
    resource='n_samples',
    factor=1.5,
    random_state=1,
    n_jobs=-1)
hs = hs.fit(X_train, y_train)
print(hs.best_score_)
print(hs.best_params_)
0.9676470588235293
{'svc__kernel': 'rbf', 'svc__gamma': 0.0001, 'svc__C': 100.0}
clf = hs.best_estimator_
print(f'Test accuracy: {hs.score(X_test, y_test):.3f}')
Test accuracy: 0.965
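To see how the candidate pool shrinks while the per-candidate sample budget grows, the fitted search exposes its schedule via the n_candidates_ and n_resources_ attributes. A small inspection sketch:

# one entry per successive-halving iteration
for n_cand, n_res in zip(hs.n_candidates_, hs.n_resources_):
    print(f'{n_cand} candidates evaluated on {n_res} training examples')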


Algorithm selection with nested cross-validation

Image(filename='figures/06_07.png', width=500)
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2)

scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print(f'CV accuracy: {np.mean(scores):.3f} '
      f'+/- {np.std(scores):.3f}')
CV accuracy: 0.974 +/- 0.015
from sklearn.tree import DecisionTreeClassifier

gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                  param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
                  scoring='accuracy',
                  cv=2)

scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print(f'CV accuracy: {np.mean(scores):.3f} '
      f'+/- {np.std(scores):.3f}')
CV accuracy: 0.934 +/- 0.016


Looking at different performance evaluation metrics

...

Reading a confusion matrix

Image(filename='figures/06_08.png', width=300)
from sklearn.metrics import confusion_matrix

pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)
[[71  1]
 [ 2 40]]
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j],
                va='center', ha='center')
ax.xaxis.set_ticks_position('bottom')

plt.xlabel('Predicted label')
plt.ylabel('True label')

plt.tight_layout()
# plt.savefig('figures/06_09.png', dpi=300)
plt.show()

Additional Note

Remember that we previously encoded the class labels so that malignant examples are the "positive" class (1) and benign examples are the "negative" class (0):

le.transform(['M', 'B'])
array([1, 0])

Next, we printed the confusion matrix like so:

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)
[[71  1]
 [ 2 40]]

Note that the (true) class 0 examples that are correctly predicted as class 0 (true negatives) are now in the upper-left corner of the matrix (index 0, 0). In order to change the ordering so that the true negatives are in the lower-right corner (index 1, 1) and the true positives are in the upper left, we can use the labels argument as shown below:

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])
print(confmat)
[[40  2]
 [ 1 71]]

We conclude:

Assuming that class 1 (malignant) is the positive class in this example, our model correctly classified 71 of the examples that belong to class 0 (true negatives) and 40 examples that belong to class 1 (true positives). However, our model also misclassified 1 example from class 0 as class 1 (false positive), and it predicted that 2 examples are benign although they are malignant tumors (false negatives).
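To make this mapping explicit in code, we can unpack the four counts directly. With the default label order [0, 1], ravel() flattens the 2x2 matrix into (tn, fp, fn, tp); a small sketch:

tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()
print(f'TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}')
# -> TN: 71, FP: 1, FN: 2, TP: 40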



Optimizing the precision and recall of a classification model

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import matthews_corrcoef

pre_val = precision_score(y_true=y_test, y_pred=y_pred)
print(f'Precision: {pre_val:.3f}')

rec_val = recall_score(y_true=y_test, y_pred=y_pred)
print(f'Recall: {rec_val:.3f}')

f1_val = f1_score(y_true=y_test, y_pred=y_pred)
print(f'F1: {f1_val:.3f}')

mcc_val = matthews_corrcoef(y_true=y_test, y_pred=y_pred)
print(f'MCC: {mcc_val:.3f}')
Precision: 0.976
Recall: 0.952
F1: 0.964
MCC: 0.943
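These values follow directly from the confusion matrix above (TP = 40, FP = 1, FN = 2):

$$PRE = \frac{TP}{TP + FP} = \frac{40}{41} \approx 0.976, \quad REC = \frac{TP}{TP + FN} = \frac{40}{42} \approx 0.952$$

$$F1 = 2\,\frac{PRE \times REC}{PRE + REC} \approx 0.964$$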
from sklearn.metrics import make_scorer

scorer = make_scorer(f1_score, pos_label=0)

c_gamma_range = [0.01, 0.1, 1.0, 10.0]

param_grid = [{'svc__C': c_gamma_range,
               'svc__kernel': ['linear']},
              {'svc__C': c_gamma_range,
               'svc__gamma': c_gamma_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
0.9861994953378878
{'svc__C': 10.0, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}


Plotting a receiver operating characteristic

from sklearn.metrics import roc_curve, auc
from numpy import interp

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(penalty='l2',
                                           random_state=1,
                                           solver='lbfgs',
                                           C=100.0))

X_train2 = X_train[:, [4, 14]]

cv = list(StratifiedKFold(n_splits=3).split(X_train, y_train))

fig = plt.figure(figsize=(7, 5))

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])
    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],
                                     pos_label=1)
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             label=f'ROC fold {i+1} (area = {roc_auc:.2f})')

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='Random guessing (area = 0.5)')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label=f'Mean ROC (area = {mean_auc:.2f})', lw=2)
plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='Perfect performance (area = 1.0)')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')

plt.tight_layout()
# plt.savefig('figures/06_10.png', dpi=300)
plt.show()


The scoring metrics for multiclass classification

pre_scorer = make_scorer(score_func=precision_score,
                         pos_label=1,
                         greater_is_better=True,
                         average='micro')
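As a small illustration of the difference between the averaging modes (the toy labels below are made up purely for demonstration):

from sklearn.metrics import precision_score

y_true_toy = [0, 1, 2, 2, 1, 0]
y_pred_toy = [0, 2, 2, 2, 1, 0]

# micro: pool all TP and FP counts first -> 5/6, approx. 0.833
print(precision_score(y_true_toy, y_pred_toy, average='micro'))

# macro: average the per-class precisions (1.0, 1.0, 2/3) -> approx. 0.889
print(precision_score(y_true_toy, y_pred_toy, average='macro'))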

Dealing with class imbalance

X_imb = np.vstack((X[y == 0], X[y == 1][:40]))
y_imb = np.hstack((y[y == 0], y[y == 1][:40]))
y_pred = np.zeros(y_imb.shape[0])
np.mean(y_pred == y_imb) * 100
89.92443324937027
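This baseline simply reflects the class ratio: with 357 benign and 40 malignant examples in X_imb, always predicting class 0 is correct in 357/397 ≈ 89.9% of cases.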
from sklearn.utils import resample

print('Number of class 1 examples before:',
      X_imb[y_imb == 1].shape[0])

X_upsampled, y_upsampled = resample(X_imb[y_imb == 1],
                                    y_imb[y_imb == 1],
                                    replace=True,
                                    n_samples=X_imb[y_imb == 0].shape[0],
                                    random_state=123)

print('Number of class 1 examples after:',
      X_upsampled.shape[0])
Number of class 1 examples before: 40
Number of class 1 examples after: 357
X_bal = np.vstack((X[y == 0], X_upsampled))
y_bal = np.hstack((y[y == 0], y_upsampled))
y_pred = np.zeros(y_bal.shape[0])
np.mean(y_pred == y_bal) * 100
50.0


Summary

...


Readers may ignore the next cell.

! python ../.convert_notebook_to_script.py --input ch06.ipynb --output ch06.py
[NbConvertApp] Converting notebook ch06.ipynb to script
[NbConvertApp] Writing 18900 bytes to ch06.py