# coding: utf-8


import sys
from python_environment_check import check_packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE
import matplotlib.patheffects as PathEffects

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')


# Check recommended package versions:

d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2'
}
check_packages(d)

# # Chapter 5 - Compressing Data via Dimensionality Reduction

# ### Overview

# - [Unsupervised dimensionality reduction via principal component analysis](#Unsupervised-dimensionality-reduction-via-principal-component-analysis)
# - [The main steps behind principal component analysis](#The-main-steps-behind-principal-component-analysis)
# - [Extracting the principal components step-by-step](#Extracting-the-principal-components-step-by-step)
# - [Total and explained variance](#Total-and-explained-variance)
# - [Feature transformation](#Feature-transformation)
# - [Principal component analysis in scikit-learn](#Principal-component-analysis-in-scikit-learn)
# - [Assessing feature contributions](#Assessing-feature-contributions)
# - [Supervised data compression via linear discriminant analysis](#Supervised-data-compression-via-linear-discriminant-analysis)
# - [Principal component analysis versus linear discriminant analysis](#Principal-component-analysis-versus-linear-discriminant-analysis)
# - [The inner workings of linear discriminant analysis](#The-inner-workings-of-linear-discriminant-analysis)
# - [Computing the scatter matrices](#Computing-the-scatter-matrices)
# - [Selecting linear discriminants for the new feature subspace](#Selecting-linear-discriminants-for-the-new-feature-subspace)
# - [Projecting examples onto the new feature space](#Projecting-examples-onto-the-new-feature-space)
# - [LDA via scikit-learn](#LDA-via-scikit-learn)
# - [Nonlinear dimensionality reduction techniques](#Nonlinear-dimensionality-reduction-techniques)
# - [Visualizing data via t-distributed stochastic neighbor embedding](#Visualizing-data-via-t-distributed-stochastic-neighbor-embedding)
# - [Summary](#Summary)

# # Unsupervised dimensionality reduction via principal component analysis

# ## The main steps behind principal component analysis


# ## Extracting the principal components step-by-step

df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/'
                      'machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df_wine = pd.read_csv('wine.data', header=None)

df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']

df_wine.head()


# Splitting the data into 70% training and 30% test subsets.

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    stratify=y,
                                                    random_state=0)


# Standardizing the data.

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# ---
#
# **Note**
#
# Accidentally, I wrote `X_test_std = sc.fit_transform(X_test)` instead of `X_test_std = sc.transform(X_test)`. In this case, it wouldn't make a big difference since the mean and standard deviation of the test set should be (quite) similar to those of the training set. However, as you remember from Chapter 3, the correct way is to re-use the parameters from the training set for any kind of transformation -- the test set should basically stand for "new, unseen" data.
#
# My initial typo reflects a common mistake: some people do *not* re-use these parameters from the model training/building stage and instead standardize the new data "from scratch." Here's a simple example to explain why this is a problem.
#
# Let's assume we have a simple training set consisting of 3 examples with 1 feature (let's call this feature "length"):
#
# - train_1: 10 cm -> class_2
# - train_2: 20 cm -> class_2
# - train_3: 30 cm -> class_1
#
# mean: 20, std.: 8.2
#
# After standardization, the transformed feature values are
#
# - train_std_1: -1.21 -> class_2
# - train_std_2: 0 -> class_2
# - train_std_3: 1.21 -> class_1
#
# Next, let's assume our model has learned to classify examples with a standardized length value < 0.6 as class_2 (class_1 otherwise). So far so good. Now, let's say we have 3 unlabeled data points that we want to classify:
#
# - new_4: 5 cm -> class ?
# - new_5: 6 cm -> class ?
# - new_6: 7 cm -> class ?
#
# If we look at the unstandardized "length" values in our training dataset, it is intuitive to say that all of these new examples likely belong to class_2. However, if we standardize them by re-computing the mean and standard deviation on the new data, we would get values similar to those in the training set, and the classifier would (probably incorrectly) assign class_1 to new_6:
#
# - new_std_4: -1.21 -> class_2
# - new_std_5: 0 -> class_2
# - new_std_6: 1.21 -> class_1
#
# However, if we re-use the parameters from the "training set standardization," we'd get the values:
#
# - new_std_4: -1.83 -> class_2
# - new_std_5: -1.71 -> class_2
# - new_std_6: -1.59 -> class_2
#
# The values 5 cm, 6 cm, and 7 cm are much lower than anything we have seen in the training set previously. Thus, it only makes sense that the standardized features of the "new examples" are much lower than every standardized feature in the training set.
#
# ---

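
# To make the pitfall above concrete, here is a minimal sketch (not part of the
# book's code) that reproduces the hypothetical "length" example with `StandardScaler`:

train_lengths = np.array([[10.0], [20.0], [30.0]])  # toy training feature "length" (cm)
new_lengths = np.array([[5.0], [6.0], [7.0]])       # new, unseen examples

toy_scaler = StandardScaler().fit(train_lengths)

# Correct: re-use the training mean and standard deviation for the new data
print(toy_scaler.transform(new_lengths).ravel())            # approx. [-1.84 -1.71 -1.59]

# Incorrect: re-fitting standardizes the new data "from scratch"
print(StandardScaler().fit_transform(new_lengths).ravel())  # approx. [-1.22  0.    1.22]
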
# Eigendecomposition of the covariance matrix.

cov_mat = np.cov(X_train_std.T)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)

print('\nEigenvalues \n', eigen_vals)

# **Note**:
#
# Above, I used the [`numpy.linalg.eig`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html) function to decompose the symmetric covariance matrix into its eigenvalues and eigenvectors.
# <pre>>>> eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)</pre>
# This is not really a "mistake," but probably suboptimal. It would be better to use [`numpy.linalg.eigh`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html) in such cases, which has been designed for [Hermitian matrices](https://en.wikipedia.org/wiki/Hermitian_matrix). The latter always returns real eigenvalues, whereas the numerically less stable `np.linalg.eig`, which is designed to decompose nonsymmetric square matrices, may return complex eigenvalues in certain cases. (S.R.)
#

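
# Quick check (not in the book): `np.linalg.eigh` returns the eigenvalues of the
# symmetric covariance matrix in ascending order and always as real numbers, so
# reversing them should reproduce the (sorted) `eig` result:

eigh_vals, eigh_vecs = np.linalg.eigh(cov_mat)
print(np.allclose(sorted(eigen_vals, reverse=True), eigh_vals[::-1]))
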
# ## Total and explained variance

tot = sum(eigen_vals)
var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)


plt.bar(range(1, 14), var_exp, align='center',
        label='Individual explained variance')
plt.step(range(1, 14), cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('figures/05_02.png', dpi=300)
plt.show()

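
# Side note (not in the book): the cumulative curve can also be queried directly,
# for example to find how many components are needed to explain at least 90% of
# the variance (the 0.9 threshold is just an illustrative choice):

k_90 = np.argmax(cum_var_exp >= 0.9) + 1
print(f'Components needed for >= 90% explained variance: {k_90}')
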
# ## Feature transformation

# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs.sort(key=lambda k: k[0], reverse=True)


w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
               eigen_pairs[1][1][:, np.newaxis]))
print('Matrix W:\n', w)

# **Note**
# Depending on which version of NumPy and LAPACK you are using, you may obtain the matrix W with its signs flipped. Please note that this is not an issue: if $v$ is an eigenvector of a matrix $\Sigma$, we have
#
# $$\Sigma v = \lambda v,$$
#
# where $\lambda$ is our eigenvalue; then $-v$ is also an eigenvector with the same eigenvalue, since
#
# $$\Sigma \cdot (-v) = -\Sigma v = -\lambda v = \lambda \cdot (-v).$$

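
# Numerical check of the note above (not in the book): the leading eigenpair of
# the covariance matrix satisfies the eigenvalue equation for both v and -v.

v, lam = eigen_pairs[0][1], eigen_pairs[0][0]
print(np.allclose(cov_mat.dot(v), lam * v))      # Sigma v    = lambda v
print(np.allclose(cov_mat.dot(-v), lam * (-v)))  # Sigma (-v) = lambda (-v)
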
X_train_std[0].dot(w)


X_train_pca = X_train_std.dot(w)
colors = ['r', 'b', 'g']
markers = ['o', 's', '^']

for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_pca[y_train == l, 0],
                X_train_pca[y_train == l, 1],
                c=c, label=f'Class {l}', marker=m)

plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_03.png', dpi=300)
plt.show()

# ## Principal component analysis in scikit-learn

# **NOTE**
#
# The following four code cells have been added in addition to the content of the book, to illustrate how to replicate the results from our own PCA implementation in scikit-learn:

pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)
pca.explained_variance_ratio_


plt.bar(range(1, 14), pca.explained_variance_ratio_, align='center')
plt.step(range(1, 14), np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')

plt.show()


pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)


plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1])
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()

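
# Optional sanity check (not one of the four cells mentioned above): the
# scikit-learn projection should match the manual projection `X_train_std.dot(w)`
# up to per-component sign flips, so comparing absolute values works:

print(np.allclose(np.abs(X_train_std.dot(w)), np.abs(X_train_pca)))
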
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('o', 's', '^', 'v', '<')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    lab = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    lab = lab.reshape(xx1.shape)
    plt.contourf(xx1, xx2, lab, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class examples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=f'Class {cl}',
                    edgecolor='black')

# Training logistic regression classifier using the first 2 principal components.

pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs')
lr = lr.fit(X_train_pca, y_train)


plot_decision_regions(X_train_pca, y_train, classifier=lr)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_04.png', dpi=300)
plt.show()


plot_decision_regions(X_test_pca, y_test, classifier=lr)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_05.png', dpi=300)
plt.show()


pca = PCA(n_components=None)
X_train_pca = pca.fit_transform(X_train_std)
pca.explained_variance_ratio_

# ## Assessing feature contributions

loadings = eigen_vecs * np.sqrt(eigen_vals)

fig, ax = plt.subplots()

ax.bar(range(13), loadings[:, 0], align='center')
ax.set_ylabel('Loadings for PC 1')
ax.set_xticks(range(13))
ax.set_xticklabels(df_wine.columns[1:], rotation=90)

plt.ylim([-1, 1])
plt.tight_layout()
plt.savefig('figures/05_05_02.png', dpi=300)
plt.show()


loadings[:, 0]


sklearn_loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

fig, ax = plt.subplots()

ax.bar(range(13), sklearn_loadings[:, 0], align='center')
ax.set_ylabel('Loadings for PC 1')
ax.set_xticks(range(13))
ax.set_xticklabels(df_wine.columns[1:], rotation=90)

plt.ylim([-1, 1])
plt.tight_layout()
plt.savefig('figures/05_05_03.png', dpi=300)
plt.show()

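
# Side note (not in the book): the same loadings can be ranked to see which
# features contribute most strongly to PC 1 (by magnitude, ignoring sign):

pc1_ranking = sorted(zip(df_wine.columns[1:], loadings[:, 0]),
                     key=lambda item: abs(item[1]), reverse=True)
for name, loading in pc1_ranking[:3]:
    print(f'{name}: {loading:.2f}')
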
# # Supervised data compression via linear discriminant analysis

# ## Principal component analysis versus linear discriminant analysis


# ## The inner workings of linear discriminant analysis

# ## Computing the scatter matrices

# Calculate the mean vectors for each class:

np.set_printoptions(precision=4)

mean_vecs = []
for label in range(1, 4):
    mean_vecs.append(np.mean(X_train_std[y_train == label], axis=0))
    print(f'MV {label}: {mean_vecs[label - 1]}\n')


# Compute the within-class scatter matrix:

d = 13  # number of features
S_W = np.zeros((d, d))
for label, mv in zip(range(1, 4), mean_vecs):
    class_scatter = np.zeros((d, d))  # scatter matrix for each class
    for row in X_train_std[y_train == label]:
        row, mv = row.reshape(d, 1), mv.reshape(d, 1)  # make column vectors
        class_scatter += (row - mv).dot((row - mv).T)
    S_W += class_scatter  # sum class scatter matrices

print('Within-class scatter matrix: '
      f'{S_W.shape[0]}x{S_W.shape[1]}')

# Better: use the covariance matrix, since the classes are not equally distributed:

print('Class label distribution:',
      np.bincount(y_train)[1:])


d = 13  # number of features
S_W = np.zeros((d, d))
for label, mv in zip(range(1, 4), mean_vecs):
    class_scatter = np.cov(X_train_std[y_train == label].T)
    S_W += class_scatter

print('Scaled within-class scatter matrix: '
      f'{S_W.shape[0]}x{S_W.shape[1]}')

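
# Side note (not in the book): each class covariance matrix equals that class's
# scatter matrix divided by (n_i - 1), so the "scaled" within-class scatter above
# is simply a normalized version of the unscaled one:

X_c = X_train_std[y_train == 1]  # examples of class 1
diff = X_c - X_c.mean(axis=0)
print(np.allclose(diff.T.dot(diff) / (X_c.shape[0] - 1), np.cov(X_c.T)))
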
# Compute the between-class scatter matrix:

mean_overall = np.mean(X_train_std, axis=0)
mean_overall = mean_overall.reshape(d, 1)  # make column vector

d = 13  # number of features
S_B = np.zeros((d, d))

for i, mean_vec in enumerate(mean_vecs):
    n = X_train_std[y_train == i + 1, :].shape[0]
    mean_vec = mean_vec.reshape(d, 1)  # make column vector
    S_B += n * (mean_vec - mean_overall).dot((mean_vec - mean_overall).T)

print('Between-class scatter matrix: '
      f'{S_B.shape[0]}x{S_B.shape[1]}')

# ## Selecting linear discriminants for the new feature subspace

# Solve the generalized eigenvalue problem for the matrix $S_W^{-1}S_B$:

eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))

# **Note**:
#
# Above, I used the [`numpy.linalg.eig`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html) function to decompose the matrix $S_W^{-1}S_B$ into its eigenvalues and eigenvectors.
# <pre>>>> eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))</pre>
# In contrast to the covariance matrix in the PCA section, $S_W^{-1}S_B$ is not guaranteed to be symmetric, so [`numpy.linalg.eigh`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html), which is designed for [Hermitian matrices](https://en.wikipedia.org/wiki/Hermitian_matrix) and always returns real eigenvalues, does not apply directly here. The numerically less stable `np.linalg.eig` may therefore return eigenvalues with (numerically negligible) imaginary parts, which is why the real parts are taken below. (S.R.)
#

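
# Alternative sketch (not in the book; assumes SciPy is installed): the same
# discriminants can be obtained from the generalized symmetric eigenproblem
# S_B v = lambda * S_W v, which avoids the explicit matrix inverse and always
# yields real eigenvalues:

from scipy.linalg import eigh as scipy_eigh

gen_vals, gen_vecs = scipy_eigh(S_B, S_W)  # eigenvalues in ascending order
print('Two largest generalized eigenvalues:', gen_vals[::-1][:2])
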
# Sort eigenvectors in descending order of the eigenvalues:

# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues

print('Eigenvalues in descending order:\n')
for eigen_val in eigen_pairs:
    print(eigen_val[0])

tot = sum(eigen_vals.real)
discr = [(i / tot) for i in sorted(eigen_vals.real, reverse=True)]
cum_discr = np.cumsum(discr)

plt.bar(range(1, 14), discr, align='center',
        label='Individual discriminability')
plt.step(range(1, 14), cum_discr, where='mid',
         label='Cumulative discriminability')
plt.ylabel('Discriminability ratio')
plt.xlabel('Linear discriminants')
plt.ylim([-0.1, 1.1])
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('figures/05_07.png', dpi=300)
plt.show()


w = np.hstack((eigen_pairs[0][1][:, np.newaxis].real,
               eigen_pairs[1][1][:, np.newaxis].real))
print('Matrix W:\n', w)

# ## Projecting examples onto the new feature space

X_train_lda = X_train_std.dot(w)
colors = ['r', 'b', 'g']
markers = ['o', 's', '^']

# note: LD 2 is mirrored via * (-1); eigenvector signs are arbitrary (see the sign-flip note above)
for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_lda[y_train == l, 0],
                X_train_lda[y_train == l, 1] * (-1),
                c=c, label=f'Class {l}', marker=m)

plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('figures/05_08.png', dpi=300)
plt.show()

# ## LDA via scikit-learn

lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, y_train)


lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs')
lr = lr.fit(X_train_lda, y_train)

plot_decision_regions(X_train_lda, y_train, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_09.png', dpi=300)
plt.show()


X_test_lda = lda.transform(X_test_std)

plot_decision_regions(X_test_lda, y_test, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_10.png', dpi=300)
plt.show()

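
# Quick check (not in the book): accuracy of the logistic regression model on the
# 2-dimensional LDA projections of the training and test data.

print(f'Training accuracy: {lr.score(X_train_lda, y_train):.3f}')
print(f'Test accuracy: {lr.score(X_test_lda, y_test):.3f}')
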
# # Nonlinear dimensionality reduction techniques


# ### Visualizing data via t-distributed stochastic neighbor embedding

digits = load_digits()

fig, ax = plt.subplots(1, 4)

for i in range(4):
    ax[i].imshow(digits.images[i], cmap='Greys')

# plt.savefig('figures/05_12.png', dpi=300)
plt.show()


digits.data.shape


y_digits = digits.target
X_digits = digits.data


tsne = TSNE(n_components=2,
            init='pca',
            random_state=123)
X_digits_tsne = tsne.fit_transform(X_digits)

def plot_projection(x, colors):

    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    for i in range(10):
        plt.scatter(x[colors == i, 0],
                    x[colors == i, 1])

    for i in range(10):
        xtext, ytext = np.median(x[colors == i, :], axis=0)
        txt = ax.text(xtext, ytext, str(i), fontsize=24)
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])

plot_projection(X_digits_tsne, y_digits)
# plt.savefig('figures/05_13.png', dpi=300)
plt.show()

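
# For contrast (not in the book): project the digits onto the first two principal
# components with plain PCA and reuse the same plotting helper. The nonlinear
# t-SNE embedding above typically separates the digit clusters much more clearly
# than this linear projection.

X_digits_pca = PCA(n_components=2).fit_transform(X_digits)
plot_projection(X_digits_pca, y_digits)
plt.show()
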
# # Summary

# ...

# ---
#
# Readers may ignore the next cell.
