GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book1/16/knn_demo.ipynb
Kernel: Python 3

Open In Colab

GitHub

Authors: Kevin P. Murphy ([email protected]) and Mahmoud Soliman ([email protected])

# Attribution
# This notebook is based on the following:
# https://github.com/probml/pyprobml/blob/master/scripts/knn_classify_demo.py
# Imports
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import pathlib
import shutil
import tempfile
from tqdm import tqdm
# In this notebook we walk through the K-nearest neighbors (KNN) classification technique.
# Here we generate isotropic Gaussian blobs using the make_blobs function from sklearn.
X, y = make_blobs(n_samples=1000, centers=3, n_features=2, cluster_std=6, random_state=42)
ntrain = 100
x_train = X[:ntrain]
y_train = y[:ntrain]
x_test = X[ntrain:]
y_test = y[ntrain:]
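A quick sanity check of the split sizes and class balance can be helpful before plotting; this small cell is an illustrative addition and not part of the original script.

# Sanity check: split sizes and class counts in the training set (illustrative addition)
print("train:", x_train.shape, " test:", x_test.shape)
print("class counts in the training set:", np.bincount(y_train))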
# Plot the generated training dataset by class in a scatter plot
plt.figure()
y_unique = np.unique(y_train)
markers = "*x+"
colors = "bgr"
for i in range(len(y_unique)):
    plt.scatter(x_train[y_train == y_unique[i], 0], x_train[y_train == y_unique[i], 1], marker=markers[i], c=colors[i])
plt.title("train")
plt.show()
[Figure: scatter plot of the training set, one marker and color per class; title "train"]
# Plot the generated test dataset by class in a scatter plot
plt.figure()
for i in range(len(y_unique)):
    plt.scatter(x_test[y_test == y_unique[i], 0], x_test[y_test == y_unique[i], 1], marker=markers[i], c=colors[i])
plt.title("test")
plt.show()
[Figure: scatter plot of the test set, one marker and color per class; title "test"]
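Before handing things to sklearn's KNeighborsClassifier below, it may help to see what KNN classification does under the hood: predict the majority label among the k closest training points. The following is a minimal NumPy sketch; the function name knn_predict and the choice of Euclidean distance are illustrative assumptions, not part of the original notebook.

# Minimal majority-vote KNN sketch (illustrative only; the notebook itself uses sklearn's KNeighborsClassifier)
import numpy as np

def knn_predict(x_train, y_train, x_query, k=5):
    # Euclidean distance from every query point to every training point
    dists = np.linalg.norm(x_query[:, None, :] - x_train[None, :, :], axis=-1)
    # Indices of the k nearest training points for each query point
    nn_idx = np.argsort(dists, axis=1)[:, :k]
    # Majority vote over the neighbors' labels
    return np.array([np.bincount(y_train[idx]).argmax() for idx in nn_idx])

y_hat = knn_predict(x_train, y_train, x_test, k=5)
print("accuracy of the hand-rolled KNN on the test set:", np.mean(y_hat == y_test))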
x = np.linspace(np.min(x_test[:, 0]), np.max(x_test[:, 0]), 200)
y = np.linspace(np.min(x_test[:, 1]), np.max(x_test[:, 1]), 200)
xx, yy = np.meshgrid(x, y)
xy = np.c_[xx.ravel(), yy.ravel()]

# Train a KNN model for several values of k and plot its predictions over the grid
for k in [1, 2, 5]:
    knn = KNN(n_neighbors=k)
    knn.fit(x_train, y_train)
    plt.figure()
    y_predicted = knn.predict(xy)
    plt.pcolormesh(xx, yy, y_predicted.reshape(200, 200), cmap="jet", alpha=0.2)
    for i in range(len(y_unique)):
        plt.scatter(x_train[y_train == y_unique[i], 0], x_train[y_train == y_unique[i], 1], marker=markers[i], c=colors[i])
    plt.title("k=%s" % (k))
    plt.show()
[Figures: KNN decision regions over the grid with the training points overlaid, for k=1, k=2, and k=5]
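In newer scikit-learn releases (1.1 and later, which may be newer than the environment this notebook was run with), the manual meshgrid and pcolormesh steps above can be automated with DecisionBoundaryDisplay. A minimal sketch, assuming such a version is installed:

# Decision regions via DecisionBoundaryDisplay (requires scikit-learn >= 1.1; illustrative alternative to the manual grid above)
from sklearn.inspection import DecisionBoundaryDisplay

knn = KNN(n_neighbors=5)
knn.fit(x_train, y_train)
DecisionBoundaryDisplay.from_estimator(knn, x_train, cmap="jet", alpha=0.2)
for i in range(len(y_unique)):
    plt.scatter(x_train[y_train == y_unique[i], 0], x_train[y_train == y_unique[i], 1], marker=markers[i], c=colors[i])
plt.title("k=5")
plt.show()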
# Plot the train and test error for a range of k values
# ks = [int(n) for n in np.linspace(1, ntrain, 10)]
ks = [1, 5, 10, 20, 50, 70, 79]
train_errs = []
test_errs = []
for k in ks:
    knn = KNN(n_neighbors=k)
    knn.fit(x_train, y_train)
    train_errs.append(1 - knn.score(x_train, y_train))
    test_errs.append(1 - knn.score(x_test, y_test))

plt.figure()
plt.plot(ks, train_errs, "bs:", label="train")
plt.plot(ks, test_errs, "rx-", label="test")
plt.legend()
plt.xlabel("k")
plt.ylabel("misclassification rate")
plt.show()
[Figure: train and test misclassification rate as a function of k]
# Choose k by 5-fold cross-validation on the training set
scores = []
for k in ks:
    knn = KNN(n_neighbors=k)
    score = cross_val_score(knn, x_train, y_train, cv=5)
    scores.append(1 - score.mean())

plt.figure()
plt.plot(ks, scores, "ko-")
min_k = ks[np.argmin(scores)]
plt.plot([min_k, min_k], [0, 1.0], "b-")
plt.xlabel("k")
plt.ylabel("misclassification rate")
plt.title("5-fold cross validation, n-train = 100")

Text(0.5, 1.0, '5-fold cross validation, n-train = 100')
[Figure: 5-fold cross-validated misclassification rate vs. k, with a vertical line at the k with the lowest error]
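The manual cross_val_score loop above can equivalently be automated with GridSearchCV; here is a minimal sketch over the same ks grid (the final accuracy on the held-out test set is an illustrative addition, not part of the original demo).

# Select k with GridSearchCV (automates the cross_val_score loop above)
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(KNN(), param_grid={"n_neighbors": ks}, cv=5)
grid.fit(x_train, y_train)
print("best k:", grid.best_params_["n_neighbors"])
print("test accuracy with best k:", grid.score(x_test, y_test))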
# Plot heat maps of the predicted probability of each class, for k=10
knn = KNN(n_neighbors=10)
knn.fit(x_train, y_train)
xy_predic = knn.predict_proba(xy)
levels = np.arange(0, 1.01, 0.1)
for i in range(3):
    plt.figure()
    plt.contourf(xy_predic[:, i].reshape(200, 200), levels)
    plt.colorbar()
    plt.title("p(y=%s | data, k=10)" % (i))
    plt.show()
[Figures: contour heat maps of p(y=0 | data), p(y=1 | data), and p(y=2 | data) for k=10]
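With uniform weights (the KNeighborsClassifier default), these probabilities are simply the fraction of the 10 nearest training points that belong to each class. A quick check for a single query point; the variable names point, nn_idx, and fractions are illustrative additions.

# Check: predicted probabilities equal the neighbor label fractions (uniform weights)
point = x_test[:1]                  # a single query point
_, nn_idx = knn.kneighbors(point)   # indices of its 10 nearest training points
fractions = np.bincount(y_train[nn_idx[0]], minlength=3) / knn.n_neighbors
print("neighbor label fractions:", fractions)
print("knn.predict_proba       :", knn.predict_proba(point)[0])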