# coding: utf-8


import sys
from python_environment_check import check_packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE
import matplotlib.patheffects as PathEffects

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:


sys.path.insert(0, '..')


# Check recommended package versions:


d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2'
}
check_packages(d)


# # Chapter 5 - Compressing Data via Dimensionality Reduction


# ### Overview

# - [Unsupervised dimensionality reduction via principal component analysis](#Unsupervised-dimensionality-reduction-via-principal-component-analysis)
# - [The main steps behind principal component analysis](#The-main-steps-behind-principal-component-analysis)
# - [Extracting the principal components step-by-step](#Extracting-the-principal-components-step-by-step)
# - [Total and explained variance](#Total-and-explained-variance)
# - [Feature transformation](#Feature-transformation)
# - [Principal component analysis in scikit-learn](#Principal-component-analysis-in-scikit-learn)
# - [Assessing feature contributions](#Assessing-feature-contributions)
# - [Supervised data compression via linear discriminant analysis](#Supervised-data-compression-via-linear-discriminant-analysis)
# - [Principal component analysis versus linear discriminant analysis](#Principal-component-analysis-versus-linear-discriminant-analysis)
# - [The inner workings of linear discriminant analysis](#The-inner-workings-of-linear-discriminant-analysis)
# - [Computing the scatter matrices](#Computing-the-scatter-matrices)
# - [Selecting linear discriminants for the new feature subspace](#Selecting-linear-discriminants-for-the-new-feature-subspace)
# - [Projecting examples onto the new feature space](#Projecting-examples-onto-the-new-feature-space)
# - [LDA via scikit-learn](#LDA-via-scikit-learn)
# - [Nonlinear dimensionality reduction techniques](#Nonlinear-dimensionality-reduction-techniques)
# - [Visualizing data via t-distributed stochastic neighbor embedding](#Visualizing-data-via-t-distributed-stochastic-neighbor-embedding)
# - [Summary](#Summary)


# # Unsupervised dimensionality reduction via principal component analysis

# ## The main steps behind principal component analysis


# ## Extracting the principal components step-by-step


df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/'
                      'machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df_wine = pd.read_csv('wine.data', header=None)

df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']

df_wine.head()


# Splitting the data into 70% training and 30% test subsets.


X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    stratify=y,
                                                    random_state=0)


# Standardizing the data.


sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)


# ---
#
# **Note**
#
# Accidentally, I wrote `X_test_std = sc.fit_transform(X_test)` instead of `X_test_std = sc.transform(X_test)`. In this case, it wouldn't make a big difference since the mean and standard deviation of the test set should be (quite) similar to those of the training set. However, as you remember from Chapter 3, the correct way is to re-use the parameters from the training set if we are doing any kind of transformation -- the test set should basically stand for "new, unseen" data.
#
# My initial typo reflects a common mistake: some people do *not* re-use these parameters from the model training/building and standardize the new data "from scratch." Here's a simple example to explain why this is a problem.
#
# Let's assume we have a simple training set consisting of 3 examples with 1 feature (let's call this feature "length"):
#
# - train_1: 10 cm -> class_2
# - train_2: 20 cm -> class_2
# - train_3: 30 cm -> class_1
#
# mean: 20, std.: 8.2
#
# After standardization, the transformed feature values are
#
# - train_std_1: -1.21 -> class_2
# - train_std_2: 0 -> class_2
# - train_std_3: 1.21 -> class_1
#
# Next, let's assume our model has learned to classify examples with a standardized length value < 0.6 as class_2 (class_1 otherwise). So far so good. Now, let's say we have 3 unlabeled data points that we want to classify:
#
# - new_4: 5 cm -> class ?
# - new_5: 6 cm -> class ?
# - new_6: 7 cm -> class ?
#
# If we look at the unstandardized "length" values in our training dataset, it is intuitive to say that all of these examples likely belong to class_2. However, if we standardize them by re-computing the mean and standard deviation from the new data, we would get values similar to those in the training set, and our classifier would (probably incorrectly) classify only examples 4 and 5 as class_2:
#
# - new_std_4: -1.21 -> class_2
# - new_std_5: 0 -> class_2
# - new_std_6: 1.21 -> class_1
#
# However, if we re-use the parameters from our "training set standardization," we get the values:
#
# - new_std_4: -1.83 -> class_2
# - new_std_5: -1.71 -> class_2
# - new_std_6: -1.59 -> class_2
#
# The values 5 cm, 6 cm, and 7 cm are much lower than anything we have seen in the training set previously. Thus, it only makes sense that the standardized features of the "new examples" are much lower than every standardized feature in the training set.
#
# ---
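
# A minimal sketch of the point above, using the hypothetical one-feature
# "length" data from the note (not the Wine dataset); the `toy_*` variable
# names are illustrative only:

toy_train = np.array([[10.], [20.], [30.]])   # train_1 ... train_3 (cm)
toy_new = np.array([[5.], [6.], [7.]])        # new_4 ... new_6 (cm)

toy_sc = StandardScaler()
toy_train_std = toy_sc.fit_transform(toy_train)               # fit on training data only
toy_new_std_correct = toy_sc.transform(toy_new)               # re-use training mean/std
toy_new_std_wrong = StandardScaler().fit_transform(toy_new)   # standardizing "from scratch"

print('Re-using training parameters:', toy_new_std_correct.ravel())
print('Re-fitting on the new data:  ', toy_new_std_wrong.ravel())
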

# Eigendecomposition of the covariance matrix.


cov_mat = np.cov(X_train_std.T)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)

print('\nEigenvalues \n', eigen_vals)


# **Note**:
#
# Above, I used the [`numpy.linalg.eig`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html) function to decompose the symmetric covariance matrix into its eigenvalues and eigenvectors.
# <pre>>>> eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)</pre>
# This is not really a "mistake," but probably suboptimal. It would be better to use [`numpy.linalg.eigh`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html) in such cases, which has been designed for [Hermitian matrices](https://en.wikipedia.org/wiki/Hermitian_matrix) and always returns real eigenvalues. The numerically less stable `np.linalg.eig` can also decompose nonsymmetric square matrices, but it may return complex eigenvalues in certain cases. (S.R.)
#
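
# If you want to try the routine recommended above yourself, a minimal sketch
# (assuming the same `cov_mat` as above) could look like this; note that
# `numpy.linalg.eigh` returns the eigenvalues in ascending order:

eigen_vals_h, eigen_vecs_h = np.linalg.eigh(cov_mat)
print('\nEigenvalues (ascending) \n', eigen_vals_h)
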

# ## Total and explained variance


tot = sum(eigen_vals)
var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)


plt.bar(range(1, 14), var_exp, align='center',
        label='Individual explained variance')
plt.step(range(1, 14), cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('figures/05_02.png', dpi=300)
plt.show()


# ## Feature transformation


# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs.sort(key=lambda k: k[0], reverse=True)


w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
               eigen_pairs[1][1][:, np.newaxis]))
print('Matrix W:\n', w)


# **Note**
#
# Depending on which version of NumPy and LAPACK you are using, you may obtain the matrix W with its signs flipped. Please note that this is not an issue: if $v$ is an eigenvector of a matrix $\Sigma$, that is,
#
# $$\Sigma v = \lambda v,$$
#
# where $\lambda$ is the corresponding eigenvalue, then $-v$ is also an eigenvector with the same eigenvalue, since
#
# $$\Sigma \cdot (-v) = -\Sigma v = -\lambda v = \lambda \cdot (-v).$$
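
# This can be checked numerically with the quantities computed above
# (a small sanity check, not part of the book's code):

v, lam = eigen_vecs[:, 0], eigen_vals[0]
print(np.allclose(cov_mat.dot(v), lam * v))    # v is an eigenvector of cov_mat
print(np.allclose(cov_mat.dot(-v), lam * -v))  # ... and so is -v, with the same eigenvalue
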

X_train_std[0].dot(w)


X_train_pca = X_train_std.dot(w)
colors = ['r', 'b', 'g']
markers = ['o', 's', '^']

for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_pca[y_train == l, 0],
                X_train_pca[y_train == l, 1],
                c=c, label=f'Class {l}', marker=m)

plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_03.png', dpi=300)
plt.show()


# ## Principal component analysis in scikit-learn

# **NOTE**
#
# The following four code cells have been added in addition to the content of the book, to illustrate how to replicate the results from our own PCA implementation in scikit-learn:


pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)
pca.explained_variance_ratio_


plt.bar(range(1, 14), pca.explained_variance_ratio_, align='center')
plt.step(range(1, 14), np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')

plt.show()


pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)


plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1])
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('o', 's', '^', 'v', '<')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    lab = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    lab = lab.reshape(xx1.shape)
    plt.contourf(xx1, xx2, lab, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class examples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=f'Class {cl}',
                    edgecolor='black')


# Training logistic regression classifier using the first 2 principal components.


pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs')
lr = lr.fit(X_train_pca, y_train)


plot_decision_regions(X_train_pca, y_train, classifier=lr)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_04.png', dpi=300)
plt.show()


plot_decision_regions(X_test_pca, y_test, classifier=lr)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_05.png', dpi=300)
plt.show()


pca = PCA(n_components=None)
X_train_pca = pca.fit_transform(X_train_std)
pca.explained_variance_ratio_


# ## Assessing feature contributions


loadings = eigen_vecs * np.sqrt(eigen_vals)

fig, ax = plt.subplots()

ax.bar(range(13), loadings[:, 0], align='center')
ax.set_ylabel('Loadings for PC 1')
ax.set_xticks(range(13))
ax.set_xticklabels(df_wine.columns[1:], rotation=90)

plt.ylim([-1, 1])
plt.tight_layout()
plt.savefig('figures/05_05_02.png', dpi=300)
plt.show()


loadings[:, 0]


sklearn_loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

fig, ax = plt.subplots()

ax.bar(range(13), sklearn_loadings[:, 0], align='center')
ax.set_ylabel('Loadings for PC 1')
ax.set_xticks(range(13))
ax.set_xticklabels(df_wine.columns[1:], rotation=90)

plt.ylim([-1, 1])
plt.tight_layout()
plt.savefig('figures/05_05_03.png', dpi=300)
plt.show()


# # Supervised data compression via linear discriminant analysis

# ## Principal component analysis versus linear discriminant analysis


# ## The inner workings of linear discriminant analysis


# ## Computing the scatter matrices

# Calculate the mean vectors for each class:


np.set_printoptions(precision=4)

mean_vecs = []
for label in range(1, 4):
    mean_vecs.append(np.mean(X_train_std[y_train == label], axis=0))
    print(f'MV {label}: {mean_vecs[label - 1]}\n')


# Compute the within-class scatter matrix:


d = 13  # number of features
S_W = np.zeros((d, d))
for label, mv in zip(range(1, 4), mean_vecs):
    class_scatter = np.zeros((d, d))  # scatter matrix for each class
    for row in X_train_std[y_train == label]:
        row, mv = row.reshape(d, 1), mv.reshape(d, 1)  # make column vectors
        class_scatter += (row - mv).dot((row - mv).T)
    S_W += class_scatter  # sum class scatter matrices

print('Within-class scatter matrix: '
      f'{S_W.shape[0]}x{S_W.shape[1]}')

# Better: use the covariance matrix, since the classes are not equally distributed:


print('Class label distribution:',
      np.bincount(y_train)[1:])


d = 13  # number of features
S_W = np.zeros((d, d))
for label, mv in zip(range(1, 4), mean_vecs):
    class_scatter = np.cov(X_train_std[y_train == label].T)
    S_W += class_scatter

print('Scaled within-class scatter matrix: '
      f'{S_W.shape[0]}x{S_W.shape[1]}')


# Compute the between-class scatter matrix:


mean_overall = np.mean(X_train_std, axis=0)
mean_overall = mean_overall.reshape(d, 1)  # make column vector

d = 13  # number of features
S_B = np.zeros((d, d))

for i, mean_vec in enumerate(mean_vecs):
    n = X_train_std[y_train == i + 1, :].shape[0]
    mean_vec = mean_vec.reshape(d, 1)  # make column vector
    S_B += n * (mean_vec - mean_overall).dot((mean_vec - mean_overall).T)

print('Between-class scatter matrix: '
      f'{S_B.shape[0]}x{S_B.shape[1]}')


# ## Selecting linear discriminants for the new feature subspace

# Solve the generalized eigenvalue problem for the matrix $S_W^{-1}S_B$:


eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))


# **Note**:
#
# As in the PCA section above, I used the [`numpy.linalg.eig`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html) function here to obtain the eigenvalues and eigenvectors.
# <pre>>>> eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))</pre>
# Unlike the covariance matrix, $S_W^{-1}S_B$ is generally not symmetric, so [`numpy.linalg.eigh`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html), which is designed for [Hermitian matrices](https://en.wikipedia.org/wiki/Hermitian_matrix) and always returns real eigenvalues, cannot be applied to it directly. The numerically less stable `np.linalg.eig` can decompose such nonsymmetric square matrices, but it may return eigenvalues with small imaginary parts, which is why we work with their real parts below. (S.R.)
#
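
# As an aside (not from the book): since $S_B v = \lambda S_W v$ is a generalized
# symmetric eigenvalue problem with the same eigenvalues as $S_W^{-1}S_B$, it could
# also be solved without forming the inverse explicitly, e.g. via SciPy (assumed to
# be installed, as scikit-learn already requires it); `scipy.linalg.eigh` returns
# the eigenvalues in ascending order:

from scipy.linalg import eigh as scipy_eigh

gen_eigen_vals, gen_eigen_vecs = scipy_eigh(S_B, S_W)
print('Generalized eigenvalues (descending):\n', gen_eigen_vals[::-1])
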

# Sort eigenvectors in descending order of the eigenvalues:


# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues

print('Eigenvalues in descending order:\n')
for eigen_val in eigen_pairs:
    print(eigen_val[0])


tot = sum(eigen_vals.real)
discr = [(i / tot) for i in sorted(eigen_vals.real, reverse=True)]
cum_discr = np.cumsum(discr)

plt.bar(range(1, 14), discr, align='center',
        label='Individual discriminability')
plt.step(range(1, 14), cum_discr, where='mid',
         label='Cumulative discriminability')
plt.ylabel('Discriminability ratio')
plt.xlabel('Linear discriminants')
plt.ylim([-0.1, 1.1])
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('figures/05_07.png', dpi=300)
plt.show()


w = np.hstack((eigen_pairs[0][1][:, np.newaxis].real,
               eigen_pairs[1][1][:, np.newaxis].real))
print('Matrix W:\n', w)


# ## Projecting examples onto the new feature space


X_train_lda = X_train_std.dot(w)
colors = ['r', 'b', 'g']
markers = ['o', 's', '^']

for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_lda[y_train == l, 0],
                X_train_lda[y_train == l, 1] * (-1),
                c=c, label=f'Class {l}', marker=m)

plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('figures/05_08.png', dpi=300)
plt.show()


# ## LDA via scikit-learn


lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, y_train)


lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs')
lr = lr.fit(X_train_lda, y_train)

plot_decision_regions(X_train_lda, y_train, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_09.png', dpi=300)
plt.show()


X_test_lda = lda.transform(X_test_std)

plot_decision_regions(X_test_lda, y_test, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('figures/05_10.png', dpi=300)
plt.show()


# # Nonlinear dimensionality reduction techniques


# ### Visualizing data via t-distributed stochastic neighbor embedding


digits = load_digits()

fig, ax = plt.subplots(1, 4)

for i in range(4):
    ax[i].imshow(digits.images[i], cmap='Greys')

# plt.savefig('figures/05_12.png', dpi=300)
plt.show()


digits.data.shape


y_digits = digits.target
X_digits = digits.data


tsne = TSNE(n_components=2,
            init='pca',
            random_state=123)
X_digits_tsne = tsne.fit_transform(X_digits)


def plot_projection(x, colors):

    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    for i in range(10):
        plt.scatter(x[colors == i, 0],
                    x[colors == i, 1])

    for i in range(10):
        xtext, ytext = np.median(x[colors == i, :], axis=0)
        txt = ax.text(xtext, ytext, str(i), fontsize=24)
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])


plot_projection(X_digits_tsne, y_digits)
# plt.savefig('figures/05_13.png', dpi=300)
plt.show()


# # Summary

# ...

# ---
#
# Readers may ignore the next cell.