# coding: utf-8


import sys
from python_environment_check import check_packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE
import matplotlib.patheffects as PathEffects

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:


sys.path.insert(0, '..')


# Check recommended package versions:


d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2'
}
check_packages(d)


# # Chapter 5 - Compressing Data via Dimensionality Reduction


# ### Overview

# - [Unsupervised dimensionality reduction via principal component analysis](#Unsupervised-dimensionality-reduction-via-principal-component-analysis)
# - [The main steps behind principal component analysis](#The-main-steps-behind-principal-component-analysis)
# - [Extracting the principal components step-by-step](#Extracting-the-principal-components-step-by-step)
# - [Total and explained variance](#Total-and-explained-variance)
# - [Feature transformation](#Feature-transformation)
# - [Principal component analysis in scikit-learn](#Principal-component-analysis-in-scikit-learn)
# - [Assessing feature contributions](#Assessing-feature-contributions)
# - [Supervised data compression via linear discriminant analysis](#Supervised-data-compression-via-linear-discriminant-analysis)
# - [Principal component analysis versus linear discriminant analysis](#Principal-component-analysis-versus-linear-discriminant-analysis)
# - [The inner workings of linear discriminant analysis](#The-inner-workings-of-linear-discriminant-analysis)
# - [Computing the scatter matrices](#Computing-the-scatter-matrices)
# - [Selecting linear discriminants for the new feature subspace](#Selecting-linear-discriminants-for-the-new-feature-subspace)
# - [Projecting examples onto the new feature space](#Projecting-examples-onto-the-new-feature-space)
# - [LDA via scikit-learn](#LDA-via-scikit-learn)
# - [Nonlinear dimensionality reduction techniques](#Nonlinear-dimensionality-reduction-techniques)
# - [Visualizing data via t-distributed stochastic neighbor embedding](#Visualizing-data-via-t-distributed-stochastic-neighbor-embedding)
# - [Summary](#Summary)


# # Unsupervised dimensionality reduction via principal component analysis

# ## The main steps behind principal component analysis


# ## Extracting the principal components step-by-step


df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/'
                      'machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df_wine = pd.read_csv('wine.data', header=None)

df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']

df_wine.head()


# Splitting the data into 70% training and 30% test subsets.


X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    stratify=y,
                                                    random_state=0)


# Standardizing the data.


sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)


# ---
#
# **Note**
#
# Accidentally, I wrote `X_test_std = sc.fit_transform(X_test)` instead of `X_test_std = sc.transform(X_test)`. In this case, it wouldn't make a big difference since the mean and standard deviation of the test set should be (quite) similar to those of the training set. However, as you remember from Chapter 3, the correct way is to re-use the parameters from the training set if we are doing any kind of transformation -- the test set should basically stand for "new, unseen" data.
#
# My initial typo reflects a common mistake: some people do *not* re-use these parameters from the model training/building and standardize the new data "from scratch." Here's a simple example to explain why this is a problem.
#
# Let's assume we have a simple training set consisting of 3 examples with 1 feature (let's call this feature "length"):
#
# - train_1: 10 cm -> class_2
# - train_2: 20 cm -> class_2
# - train_3: 30 cm -> class_1
#
# mean: 20, std.: 8.2
#
# After standardization, the transformed feature values are
#
# - train_std_1: -1.21 -> class_2
# - train_std_2: 0 -> class_2
# - train_std_3: 1.21 -> class_1
#
# Next, let's assume our model has learned to classify examples with a standardized length value < 0.6 as class_2 (class_1 otherwise). So far so good. Now, let's say we have 3 unlabeled data points that we want to classify:
#
# - new_4: 5 cm -> class ?
# - new_5: 6 cm -> class ?
# - new_6: 7 cm -> class ?
#
# If we look at the unstandardized "length" values in our training dataset, it is intuitive to say that all of these examples likely belong to class_2. However, if we standardize them by re-computing the mean and standard deviation from the new data, we would get values similar to those in the training set, and our classifier would (probably incorrectly) classify only examples 4 and 5 as class_2:
#
# - new_std_4: -1.21 -> class_2
# - new_std_5: 0 -> class_2
# - new_std_6: 1.21 -> class_1
#
# However, if we re-use the parameters from our "training set standardization," we get the values:
#
# - new_std_4: -1.83 -> class_2
# - new_std_5: -1.71 -> class_2
# - new_std_6: -1.59 -> class_2
#
# The values 5 cm, 6 cm, and 7 cm are much lower than anything we have seen in the training set previously. Thus, it only makes sense that the standardized features of the "new examples" are much lower than every standardized feature in the training set.
#
# ---
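
# A minimal sketch of the point above, using the hypothetical one-feature
# "length" data from the note (not the Wine dataset); the `toy_*` variable
# names are illustrative only:

toy_train = np.array([[10.], [20.], [30.]])   # train_1 ... train_3 (cm)
toy_new = np.array([[5.], [6.], [7.]])        # new_4 ... new_6 (cm)

toy_sc = StandardScaler()
toy_train_std = toy_sc.fit_transform(toy_train)               # fit on training data only
toy_new_std_correct = toy_sc.transform(toy_new)               # re-use training mean/std
toy_new_std_wrong = StandardScaler().fit_transform(toy_new)   # standardizing "from scratch"

print('Re-using training parameters:', toy_new_std_correct.ravel())
print('Re-fitting on the new data:  ', toy_new_std_wrong.ravel())
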

# Eigendecomposition of the covariance matrix.


cov_mat = np.cov(X_train_std.T)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)

print('\nEigenvalues \n', eigen_vals)


# **Note**:
#
# Above, I used the [`numpy.linalg.eig`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html) function to decompose the symmetric covariance matrix into its eigenvalues and eigenvectors.
# <pre>>>> eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)</pre>
# This is not really a "mistake," but probably suboptimal. It would be better to use [`numpy.linalg.eigh`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html) in such cases, which has been designed for [Hermitian matrices](https://en.wikipedia.org/wiki/Hermitian_matrix) and always returns real eigenvalues. The numerically less stable `np.linalg.eig` can also decompose nonsymmetric square matrices, but it may return complex eigenvalues in certain cases. (S.R.)
#
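
# If you want to try the routine recommended above yourself, a minimal sketch
# (assuming the same `cov_mat` as above) could look like this; note that
# `numpy.linalg.eigh` returns the eigenvalues in ascending order:

eigen_vals_h, eigen_vecs_h = np.linalg.eigh(cov_mat)
print('\nEigenvalues (ascending) \n', eigen_vals_h)
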

# ## Total and explained variance


tot = sum(eigen_vals)
var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)


plt.bar(range(1, 14), var_exp, align='center',
        label='Individual explained variance')
plt.step(range(1, 14), cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('figures/05_02.png', dpi=300)
plt.show()


# ## Feature transformation


# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs.sort(key=lambda k: k[0], reverse=True)


w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
               eigen_pairs[1][1][:, np.newaxis]))
print('Matrix W:\n', w)


# **Note**
#
# Depending on which version of NumPy and LAPACK you are using, you may obtain the matrix W with its signs flipped. Please note that this is not an issue: if $v$ is an eigenvector of a matrix $\Sigma$, that is,
#
# $$\Sigma v = \lambda v,$$
#
# where $\lambda$ is the corresponding eigenvalue, then $-v$ is also an eigenvector with the same eigenvalue, since
#
# $$\Sigma \cdot (-v) = -\Sigma v = -\lambda v = \lambda \cdot (-v).$$
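
# This can be checked numerically with the quantities computed above
# (a small sanity check, not part of the book's code):

v, lam = eigen_vecs[:, 0], eigen_vals[0]
print(np.allclose(cov_mat.dot(v), lam * v))    # v is an eigenvector of cov_mat
print(np.allclose(cov_mat.dot(-v), lam * -v))  # ... and so is -v, with the same eigenvalue
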

X_train_std[0].dot(w)


X_train_pca = X_train_std.dot(w)
colors = ['r', 'b', 'g']
markers = ['o', 's', '^']

for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_pca[y_train == l, 0],
                X_train_pca[y_train == l, 1],
                c=c, label=f'Class {l}', marker=m)

plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_03.png', dpi=300)
plt.show()


# ## Principal component analysis in scikit-learn

# **NOTE**
#
# The following four code cells have been added in addition to the content of the book, to illustrate how to replicate the results from our own PCA implementation in scikit-learn:


pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)
pca.explained_variance_ratio_


plt.bar(range(1, 14), pca.explained_variance_ratio_, align='center')
plt.step(range(1, 14), np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')

plt.show()


pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)


plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1])
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('o', 's', '^', 'v', '<')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    lab = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    lab = lab.reshape(xx1.shape)
    plt.contourf(xx1, xx2, lab, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class examples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=f'Class {cl}',
                    edgecolor='black')


# Training logistic regression classifier using the first 2 principal components.


pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs')
lr = lr.fit(X_train_pca, y_train)


plot_decision_regions(X_train_pca, y_train, classifier=lr)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_04.png', dpi=300)
plt.show()


plot_decision_regions(X_test_pca, y_test, classifier=lr)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_05.png', dpi=300)
plt.show()


pca = PCA(n_components=None)
X_train_pca = pca.fit_transform(X_train_std)
pca.explained_variance_ratio_


# ## Assessing feature contributions


loadings = eigen_vecs * np.sqrt(eigen_vals)

fig, ax = plt.subplots()

ax.bar(range(13), loadings[:, 0], align='center')
ax.set_ylabel('Loadings for PC 1')
ax.set_xticks(range(13))
ax.set_xticklabels(df_wine.columns[1:], rotation=90)

plt.ylim([-1, 1])
plt.tight_layout()
plt.savefig('figures/05_05_02.png', dpi=300)
plt.show()


loadings[:, 0]


sklearn_loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

fig, ax = plt.subplots()

ax.bar(range(13), sklearn_loadings[:, 0], align='center')
ax.set_ylabel('Loadings for PC 1')
ax.set_xticks(range(13))
ax.set_xticklabels(df_wine.columns[1:], rotation=90)

plt.ylim([-1, 1])
plt.tight_layout()
plt.savefig('figures/05_05_03.png', dpi=300)
plt.show()


# # Supervised data compression via linear discriminant analysis

# ## Principal component analysis versus linear discriminant analysis


# ## The inner workings of linear discriminant analysis


# ## Computing the scatter matrices

# Calculate the mean vectors for each class:


np.set_printoptions(precision=4)

mean_vecs = []
for label in range(1, 4):
    mean_vecs.append(np.mean(X_train_std[y_train == label], axis=0))
    print(f'MV {label}: {mean_vecs[label - 1]}\n')


# Compute the within-class scatter matrix:


d = 13  # number of features
S_W = np.zeros((d, d))
for label, mv in zip(range(1, 4), mean_vecs):
    class_scatter = np.zeros((d, d))  # scatter matrix for each class
    for row in X_train_std[y_train == label]:
        row, mv = row.reshape(d, 1), mv.reshape(d, 1)  # make column vectors
        class_scatter += (row - mv).dot((row - mv).T)
    S_W += class_scatter  # sum class scatter matrices

print('Within-class scatter matrix: '
      f'{S_W.shape[0]}x{S_W.shape[1]}')

# Better: use the covariance matrix, since the classes are not equally distributed:


print('Class label distribution:',
      np.bincount(y_train)[1:])


d = 13  # number of features
S_W = np.zeros((d, d))
for label, mv in zip(range(1, 4), mean_vecs):
    class_scatter = np.cov(X_train_std[y_train == label].T)
    S_W += class_scatter

print('Scaled within-class scatter matrix: '
      f'{S_W.shape[0]}x{S_W.shape[1]}')


# Compute the between-class scatter matrix:


mean_overall = np.mean(X_train_std, axis=0)
mean_overall = mean_overall.reshape(d, 1)  # make column vector

d = 13  # number of features
S_B = np.zeros((d, d))

for i, mean_vec in enumerate(mean_vecs):
    n = X_train_std[y_train == i + 1, :].shape[0]
    mean_vec = mean_vec.reshape(d, 1)  # make column vector
    S_B += n * (mean_vec - mean_overall).dot((mean_vec - mean_overall).T)

print('Between-class scatter matrix: '
      f'{S_B.shape[0]}x{S_B.shape[1]}')


# ## Selecting linear discriminants for the new feature subspace

# Solve the generalized eigenvalue problem for the matrix $S_W^{-1}S_B$:


eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))


# **Note**:
#
# As in the PCA section above, I used the [`numpy.linalg.eig`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html) function here to obtain the eigenvalues and eigenvectors.
# <pre>>>> eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))</pre>
# Unlike the covariance matrix, $S_W^{-1}S_B$ is generally not symmetric, so [`numpy.linalg.eigh`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html), which is designed for [Hermitian matrices](https://en.wikipedia.org/wiki/Hermitian_matrix) and always returns real eigenvalues, cannot be applied to it directly. The numerically less stable `np.linalg.eig` can decompose such nonsymmetric square matrices, but it may return eigenvalues with small imaginary parts, which is why we work with their real parts below. (S.R.)
#
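
# As an aside (not from the book): since $S_B v = \lambda S_W v$ is a generalized
# symmetric eigenvalue problem with the same eigenvalues as $S_W^{-1}S_B$, it could
# also be solved without forming the inverse explicitly, e.g. via SciPy (assumed to
# be installed, as scikit-learn already requires it); `scipy.linalg.eigh` returns
# the eigenvalues in ascending order:

from scipy.linalg import eigh as scipy_eigh

gen_eigen_vals, gen_eigen_vecs = scipy_eigh(S_B, S_W)
print('Generalized eigenvalues (descending):\n', gen_eigen_vals[::-1])
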

# Sort eigenvectors in descending order of the eigenvalues:


# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues

print('Eigenvalues in descending order:\n')
for eigen_val in eigen_pairs:
    print(eigen_val[0])


tot = sum(eigen_vals.real)
discr = [(i / tot) for i in sorted(eigen_vals.real, reverse=True)]
cum_discr = np.cumsum(discr)

plt.bar(range(1, 14), discr, align='center',
        label='Individual discriminability')
plt.step(range(1, 14), cum_discr, where='mid',
         label='Cumulative discriminability')
plt.ylabel('Discriminability ratio')
plt.xlabel('Linear discriminants')
plt.ylim([-0.1, 1.1])
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('figures/05_07.png', dpi=300)
plt.show()


w = np.hstack((eigen_pairs[0][1][:, np.newaxis].real,
               eigen_pairs[1][1][:, np.newaxis].real))
print('Matrix W:\n', w)


# ## Projecting examples onto the new feature space


X_train_lda = X_train_std.dot(w)
colors = ['r', 'b', 'g']
markers = ['o', 's', '^']

for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_lda[y_train == l, 0],
                X_train_lda[y_train == l, 1] * (-1),
                c=c, label=f'Class {l}', marker=m)

plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('figures/05_08.png', dpi=300)
plt.show()


# ## LDA via scikit-learn


lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, y_train)


lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs')
lr = lr.fit(X_train_lda, y_train)

plot_decision_regions(X_train_lda, y_train, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_09.png', dpi=300)
plt.show()


X_test_lda = lda.transform(X_test_std)

plot_decision_regions(X_test_lda, y_test, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_10.png', dpi=300)
plt.show()


# # Nonlinear dimensionality reduction techniques


# ### Visualizing data via t-distributed stochastic neighbor embedding


digits = load_digits()

fig, ax = plt.subplots(1, 4)

for i in range(4):
    ax[i].imshow(digits.images[i], cmap='Greys')

# plt.savefig('figures/05_12.png', dpi=300)
plt.show()


digits.data.shape


y_digits = digits.target
X_digits = digits.data


tsne = TSNE(n_components=2,
            init='pca',
            random_state=123)
X_digits_tsne = tsne.fit_transform(X_digits)


def plot_projection(x, colors):

    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    for i in range(10):
        plt.scatter(x[colors == i, 0],
                    x[colors == i, 1])

    for i in range(10):
        xtext, ytext = np.median(x[colors == i, :], axis=0)
        txt = ax.text(xtext, ytext, str(i), fontsize=24)
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])


plot_projection(X_digits_tsne, y_digits)
# plt.savefig('figures/05_13.png', dpi=300)
plt.show()


# # Summary

# ...

# ---
#
# Readers may ignore the next cell.