Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
YStrano
GitHub Repository: YStrano/DataScience_GA
Path: blob/master/lessons/lesson_07/code/starter-code/starter-code-7 - (done) (KNN and then fed into Train, Test, Split modell).ipynb
1904 views
Kernel: Python 3
from sklearn import datasets, neighbors, metrics import pandas as pd import seaborn as sns
%matplotlib inline

Load in the Data

## Load in the data iris = datasets.load_iris() irisdf = pd.DataFrame(iris.data, columns=iris.feature_names) irisdf['target'] = iris.target ## Apply a 'color map' for plotting purposes cmap = {'0': 'r', '1': 'g', '2': 'b' } irisdf['ctarget'] = irisdf.target.apply(lambda x: cmap[str(x)]) ## Do some plotting to illustrate the data irisdf.plot('petal length (cm)', 'petal width (cm)', kind='scatter', c=irisdf.ctarget) irisdf.head()
Image in a Jupyter notebook
irisdf.describe()
def my_classifier(row): if row['petal length (cm)'] < 2: return 0 else: return 1 predictions = irisdf.apply(my_classifier, axis=1)
irisdf['predictions'] = predictions print(float(len(irisdf[irisdf.target == irisdf.predictions])) / len(irisdf))
0.6666666666666666

Starter Code

Work on improving the classifier below.

def my_classifier(row): if row['petal length (cm)'] < 2: return 0 else: return 2 ## Fill in other if then statements here by looking at the plot and data above predictions = irisdf.apply(my_classifier, axis=1) irisdf['predictions'] = predictions print(float(len(irisdf[irisdf.target == irisdf.predictions])) / len(irisdf))
0.6666666666666666

Using distance: KNN implementation

#zip: list(zip(first, second)) x = [1,2,3] y = [a, b, c] list(zip(x, y)) for i in zip(x, y): print (i)
from sklearn import datasets, neighbors, metrics import pandas as pd iris = datasets.load_iris() X = iris.data y = iris.target # n_neighbors is our option in KNN. We'll tune this value to attempt to improve our prediction. knn = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform') knn.fit(X, y) print(pd.DataFrame(list(zip(knn.predict(X), y)), columns = ['predicted','actual'])) print('Accuracy = {}'.format(knn.score(X, iris.target)))
predicted actual 0 0 0 1 0 0 2 0 0 3 0 0 4 0 0 5 0 0 6 0 0 7 0 0 8 0 0 9 0 0 10 0 0 11 0 0 12 0 0 13 0 0 14 0 0 15 0 0 16 0 0 17 0 0 18 0 0 19 0 0 20 0 0 21 0 0 22 0 0 23 0 0 24 0 0 25 0 0 26 0 0 27 0 0 28 0 0 29 0 0 .. ... ... 120 2 2 121 2 2 122 2 2 123 2 2 124 2 2 125 2 2 126 2 2 127 2 2 128 2 2 129 2 2 130 2 2 131 2 2 132 2 2 133 1 2 134 2 2 135 2 2 136 2 2 137 2 2 138 2 2 139 2 2 140 2 2 141 2 2 142 2 2 143 2 2 144 2 2 145 2 2 146 2 2 147 2 2 148 2 2 149 2 2 [150 rows x 2 columns] Accuracy = 0.96

Do we see a change when using more neighbors?

knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform') knn.fit(X, y) print (pd.DataFrame(list(zip(knn.predict(X), y)), columns = ['predicted','actual'])) print ('Accuracy = {}'.format(knn.score(X, iris.target)))
predicted actual 0 0 0 1 0 0 2 0 0 3 0 0 4 0 0 5 0 0 6 0 0 7 0 0 8 0 0 9 0 0 10 0 0 11 0 0 12 0 0 13 0 0 14 0 0 15 0 0 16 0 0 17 0 0 18 0 0 19 0 0 20 0 0 21 0 0 22 0 0 23 0 0 24 0 0 25 0 0 26 0 0 27 0 0 28 0 0 29 0 0 .. ... ... 120 2 2 121 2 2 122 2 2 123 2 2 124 2 2 125 2 2 126 2 2 127 2 2 128 2 2 129 2 2 130 2 2 131 2 2 132 2 2 133 2 2 134 2 2 135 2 2 136 2 2 137 2 2 138 2 2 139 2 2 140 2 2 141 2 2 142 2 2 143 2 2 144 2 2 145 2 2 146 2 2 147 2 2 148 2 2 149 2 2 [150 rows x 2 columns] Accuracy = 0.9666666666666667

Do we see a change in performance when using the distance weight?

knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance') # add in the weights parameter here knn.fit(X, y) print (knn.score(X, iris.target))
1.0

Solution to solving K

This is only one approach to the problem, but adding in the 'distance' parameter (instead of uniform) would only be additive. Note the code would need some editing to handle it properly if done in the grid search; alternatively, make the change directly in the estimator.

from sklearn.model_selection import GridSearchCV ## Parameters to tune! tuned_parameters = [{'n_neighbors': [3, 5, 7], 'weights': ['distance','uniform']}] ## How many folds to use for validation? n_folds = 5 knn = neighbors.KNeighborsClassifier() grid_search = GridSearchCV(knn, tuned_parameters, cv = n_folds) grid_search.fit(X, iris.target)
GridSearchCV(cv=5, error_score='raise', estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform'), fit_params=None, iid=True, n_jobs=1, param_grid=[{'n_neighbors': [3, 5, 7], 'weights': ['distance', 'uniform']}], pre_dispatch='2*n_jobs', refit=True, return_train_score='warn', scoring=None, verbose=0)

What does this output?

grid_search.cv_results_
/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split3_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split4_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs)
{'mean_fit_time': array([0.00055041, 0.00060692, 0.0009181 , 0.00061622, 0.00034871, 0.00030766]), 'mean_score_time': array([0.0009201 , 0.00067348, 0.00114713, 0.00100803, 0.00071225, 0.00065012]), 'mean_test_score': array([0.96666667, 0.96666667, 0.96666667, 0.97333333, 0.98 , 0.98 ]), 'mean_train_score': array([1. , 0.96 , 1. , 0.97 , 1. , 0.97333333]), 'param_n_neighbors': masked_array(data=[3, 3, 5, 5, 7, 7], mask=[False, False, False, False, False, False], fill_value='?', dtype=object), 'param_weights': masked_array(data=['distance', 'uniform', 'distance', 'uniform', 'distance', 'uniform'], mask=[False, False, False, False, False, False], fill_value='?', dtype=object), 'params': [{'n_neighbors': 3, 'weights': 'distance'}, {'n_neighbors': 3, 'weights': 'uniform'}, {'n_neighbors': 5, 'weights': 'distance'}, {'n_neighbors': 5, 'weights': 'uniform'}, {'n_neighbors': 7, 'weights': 'distance'}, {'n_neighbors': 7, 'weights': 'uniform'}], 'rank_test_score': array([4, 4, 4, 3, 1, 1], dtype=int32), 'split0_test_score': array([0.96666667, 0.96666667, 0.96666667, 0.96666667, 0.96666667, 0.96666667]), 'split0_train_score': array([1. , 0.95833333, 1. , 0.96666667, 1. , 0.96666667]), 'split1_test_score': array([0.96666667, 0.96666667, 1. , 1. , 1. , 1. ]), 'split1_train_score': array([1. , 0.95833333, 1. , 0.96666667, 1. , 0.96666667]), 'split2_test_score': array([0.93333333, 0.93333333, 0.9 , 0.93333333, 0.96666667, 0.96666667]), 'split2_train_score': array([1. , 0.96666667, 1. , 0.975 , 1. , 0.975 ]), 'split3_test_score': array([0.96666667, 0.96666667, 0.96666667, 0.96666667, 0.96666667, 0.96666667]), 'split3_train_score': array([1. , 0.96666667, 1. , 0.975 , 1. , 0.98333333]), 'split4_test_score': array([1., 1., 1., 1., 1., 1.]), 'split4_train_score': array([1. , 0.95 , 1. , 0.96666667, 1. , 0.975 ]), 'std_fit_time': array([2.43202733e-04, 5.85316720e-04, 5.32751645e-04, 3.64376125e-04, 8.75857317e-05, 2.19125223e-05]), 'std_score_time': array([0.00033249, 0.00016714, 0.00029082, 0.00017725, 0.00014781, 0.00020538]), 'std_test_score': array([0.02108185, 0.02108185, 0.03651484, 0.02494438, 0.01632993, 0.01632993]), 'std_train_score': array([0. , 0.0062361 , 0. , 0.00408248, 0. , 0.0062361 ])}
pd.DataFrame(grid_search.cv_results_).sort_values('mean_test_score', ascending=False)
/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split3_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split4_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs) /anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True warnings.warn(*warn_args, **warn_kwargs)

What is our best test accuracy? What do we expect our out of sample performance to look like?

grid_search.best_estimator_
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=7, p=2, weights='distance')

Lets Build the model and look at it in more detail

from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(X, iris.target) knn_final = grid_search.best_estimator_ knn_final.fit(x_train, y_train) preds = knn_final.predict(x_test)
from sklearn.metrics import classification_report import pprint pp = pprint.PrettyPrinter(indent=4) results = classification_report(y_pred=preds, y_true = y_test) pp.pprint(results)
(' precision recall f1-score support\n' '\n' ' 0 1.00 1.00 1.00 13\n' ' 1 0.89 0.89 0.89 9\n' ' 2 0.94 0.94 0.94 16\n' '\n' 'avg / total 0.95 0.95 0.95 38\n')
pd.DataFrame(list(zip(preds, y_test)), columns=['predicted', 'actual'])