CoCalc -- pca_digits.ipynb

GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book1/20/pca_digits.ipynb
¹¹⁹² views

Kernel: Unknown Kernel

In [ ]:

# PCA in 2d on digit images.
# Based on fig 14.23 of  of "Elements of statistical learning". Code is from Andrey Gaskov's site:

# Code modified from
# https://github.com/empathy87/The-Elements-of-Statistical-Learning-Python-Notebooks/blob/master/examples/ZIP%20Code.ipynb


try:
    import pandas as pd
except ModuleNotFoundError:
    %pip install -qq pandas
    import pandas as pd
from matplotlib import transforms, pyplot as plt
import numpy as np

try:
    from sklearn.decomposition import PCA
except ModuleNotFoundError:
    %pip install -qq scikit-learn
    from sklearn.decomposition import PCA
from sklearn import datasets
import requests
from io import BytesIO

try:
    import probml_utils as pml
except ModuleNotFoundError:
    %pip install -qq git+https://github.com/probml/probml-utils.git
    import probml_utils as pml

try:
    from tensorflow import keras
except ModuleNotFoundError:
    %pip install -qq tensorflow
    from tensorflow import keras
import tensorflow as tf

# define plots common properties and color constants
plt.rcParams["font.family"] = "Arial"
plt.rcParams["axes.linewidth"] = 0.5
GRAY1, GRAY4, GRAY7 = "#231F20", "#646369", "#929497"

if 1:
    mnist = keras.datasets.mnist
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()
    train_images = train_images / 255.0
    test_images = test_images / 255.0
    ndx = np.arange(1000)
    nsamples, nrows, ncols = train_images.shape
    X_train = np.reshape(train_images, (nsamples, nrows * ncols))
    X_train = X_train[ndx, :]
    y_train = train_labels[ndx]
    w = 30  # 28+2

if 0:
    digits = datasets.load_digits()
    X_train = digits.data
    y_train = digits.target  # 0..9
    w = 10  # pixels for one digit, 8+2


if 0:
    # load numpy array from the compressed file
    url = "https://github.com/probml/probml-data/blob/main/data/goog.npy?raw=true"
    response = requests.get(url)
    rawdata = BytesIO(response.content)
    arr = np.load(rawdata)["arr_0"]  #'../data/zip.npy.npz'
    # do train-test split by the last column
    train, test = arr[arr[:, -1] == 0], arr[arr[:, -1] == 1]
    X_train, X_test = train[:, 1:-1], test[:, 1:-1]
    y_train, y_test = train[:, 0].astype(int), test[:, 0].astype(int)
    w = 20  # pixels for one digit, 16+4


n_samples, n_features = X_train.shape
print(X_train.shape)
img_size = int(np.sqrt(n_features))


# idx_3 = np.where(y_train == 3)[0]
idx_3 = np.where(y_train == 9)[0]
X_train_3 = X_train[idx_3]


X_train_3_pca = PCA(n_components=2).fit_transform(X_train_3)
x_grid = np.percentile(X_train_3_pca[:, 0], [5, 25, 50, 75, 95])
y_grid = np.percentile(X_train_3_pca[:, 1], [5, 25, 50, 75, 95])
x_grid[2], y_grid[2] = 0, 0


fig, axarr = plt.subplots(1, 2, figsize=(6.7, 3.8), dpi=150, gridspec_kw=dict(width_ratios=[3, 2]))
plt.subplots_adjust(wspace=0.1)
for s in axarr[1].spines.values():
    s.set_visible(False)
axarr[1].tick_params(bottom=False, labelbottom=False, left=False, labelleft=False)
ax = axarr[0]
ax.scatter(X_train_3_pca[:, 0], X_train_3_pca[:, 1], s=1, color="#02A4A3")
ax.set_xlabel("First Principal Component", color=GRAY4, fontsize=8)
ax.set_ylabel("Second Principal Component", color=GRAY4, fontsize=8)
for i in ax.get_yticklabels() + ax.get_xticklabels():
    i.set_fontsize(7)
ax.axhline(0, linewidth=0.5, color=GRAY1)
ax.axvline(0, linewidth=0.5, color=GRAY1)
for i in range(5):
    if i != 2:
        ax.axhline(y_grid[i], linewidth=0.5, color=GRAY4, linestyle="--")
        ax.axvline(x_grid[i], linewidth=0.5, color=GRAY4, linestyle="--")

img = np.ones(shape=(4 + w * 5, 4 + w * 5))
for i in range(5):
    for j in range(5):
        v = X_train_3_pca - np.array([x_grid[i], y_grid[j]])
        v = np.sqrt(np.sum(v**2, axis=-1))
        idx = np.argmin(v)
        ax.scatter(
            X_train_3_pca[idx : idx + 1, 0],
            X_train_3_pca[idx : idx + 1, 1],
            s=14,
            facecolors="none",
            edgecolors="r",
            linewidth=1,
        )
        img[j * w + 4 : j * w + 4 + img_size, i * w + 4 : i * w + 4 + img_size] = -X_train_3[idx].reshape(
            (img_size, img_size)
        )

ax = axarr[1]
ax.imshow(img, cmap="gray")
ax.set_aspect("equal", "datalim")
plt.tight_layout()
pml.savefig("pca_digits.pdf")
plt.show()

Product

Resources

Company