# coding: utf-8

import sys
from python_environment_check import check_packages
import pandas as pd
from io import StringIO
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:


sys.path.insert(0, '..')


# Check recommended package versions:


d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2'
}
check_packages(d)


# # Chapter 4 - Building Good Training Datasets – Data Preprocessing


# ### Overview

# - [Dealing with missing data](#Dealing-with-missing-data)
#   - [Identifying missing values in tabular data](#Identifying-missing-values-in-tabular-data)
#   - [Eliminating training examples or features with missing values](#Eliminating-training-examples-or-features-with-missing-values)
#   - [Imputing missing values](#Imputing-missing-values)
#   - [Understanding the scikit-learn estimator API](#Understanding-the-scikit-learn-estimator-API)
# - [Handling categorical data](#Handling-categorical-data)
#   - [Nominal and ordinal features](#Nominal-and-ordinal-features)
#   - [Mapping ordinal features](#Mapping-ordinal-features)
#   - [Encoding class labels](#Encoding-class-labels)
#   - [Performing one-hot encoding on nominal features](#Performing-one-hot-encoding-on-nominal-features)
# - [Partitioning a dataset into separate training and test sets](#Partitioning-a-dataset-into-separate-training-and-test-sets)
# - [Bringing features onto the same scale](#Bringing-features-onto-the-same-scale)
# - [Selecting meaningful features](#Selecting-meaningful-features)
#   - [L1 and L2 regularization as penalties against model complexity](#L1-and-L2-regularization-as-penalties-against-model-complexity)
#   - [A geometric interpretation of L2 regularization](#A-geometric-interpretation-of-L2-regularization)
#   - [Sparse solutions with L1 regularization](#Sparse-solutions-with-L1-regularization)
#   - [Sequential feature selection algorithms](#Sequential-feature-selection-algorithms)
# - [Assessing feature importance with Random Forests](#Assessing-feature-importance-with-Random-Forests)
# - [Summary](#Summary)


# # Dealing with missing data

# ## Identifying missing values in tabular data


csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# If you are using Python 2.7, you need
# to convert the string to unicode:

if (sys.version_info < (3, 0)):
    csv_data = unicode(csv_data)

df = pd.read_csv(StringIO(csv_data))
df


# count the missing values per column
df.isnull().sum()


# access the underlying NumPy array
# via the `values` attribute
df.values


# ## Eliminating training examples or features with missing values


# remove rows that contain missing values

df.dropna(axis=0)


# remove columns that contain missing values

df.dropna(axis=1)


# only drop rows where all columns are NaN

df.dropna(how='all')


# only keep rows with at least 4 non-NaN values
# (i.e., drop rows that have fewer than 4 real values)

df.dropna(thresh=4)


# only drop rows where NaN appear in specific columns (here: 'C')

df.dropna(subset=['C'])


# ## Imputing missing values


# again: our original array
df.values


# impute missing values via the column mean

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data


# or, more conveniently, impute via pandas' fillna
df.fillna(df.mean())
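# The mean strategy above only applies to numerical columns. For categorical
# data, SimpleImputer also supports the 'most_frequent' strategy, which replaces
# missing entries with the most common value in each column. A minimal sketch --
# the small DataFrame below is a made-up example, not part of the book's data:

df_cat = pd.DataFrame({'color': ['blue', 'green', np.nan, 'green'],
                       'size': ['M', np.nan, 'L', 'M']})

cat_imr = SimpleImputer(strategy='most_frequent')
pd.DataFrame(cat_imr.fit_transform(df_cat), columns=df_cat.columns)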
# ## Understanding the scikit-learn estimator API


# # Handling categorical data

# ## Nominal and ordinal features


df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df


# ## Mapping ordinal features


size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}

df['size'] = df['size'].map(size_mapping)
df


inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)


# ## Encoding class labels


# create a mapping dict
# to convert class labels from strings to integers
class_mapping = {label: idx for idx, label
                 in enumerate(np.unique(df['classlabel']))}
class_mapping


# convert class labels from strings to integers
df['classlabel'] = df['classlabel'].map(class_mapping)
df


# reverse the class label mapping
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df


# label encoding with sklearn's LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y


# reverse mapping
class_le.inverse_transform(y)


# ## Performing one-hot encoding on nominal features


X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X


X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()


X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer([('onehot', OneHotEncoder(), [0]),
                              ('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)


# one-hot encoding via pandas

pd.get_dummies(df[['price', 'color', 'size']])


# multicollinearity guard in get_dummies

pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)


# multicollinearity guard for the OneHotEncoder

color_ohe = OneHotEncoder(categories='auto', drop='first')
c_transf = ColumnTransformer([('onehot', color_ohe, [0]),
                              ('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)


# ## Optional: Encoding Ordinal Features

# If we are unsure about the numerical differences between the categories of
# ordinal features, or if the difference between two ordinal values is not
# defined, we can also encode them using a threshold encoding with 0/1 values.
# For example, we can split the feature "size" with values M, L, and XL into
# two new features, "x > M" and "x > L". Let's consider the original DataFrame:


df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df


# We can use the `apply` method of pandas' DataFrames to write custom lambda
# expressions in order to encode these variables using the value-threshold approach:


df['x > M'] = df['size'].apply(lambda x: 1 if x in {'L', 'XL'} else 0)
df['x > L'] = df['size'].apply(lambda x: 1 if x == 'XL' else 0)

del df['size']
df


# # Partitioning a dataset into separate training and test sets


df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df_wine = pd.read_csv('wine.data', header=None)


df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()


X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=0,
                                                    stratify=y)


# # Bringing features onto the same scale


# min-max scaling (normalization)
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)


# standardization
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)


# A visual example:


ex = np.array([0, 1, 2, 3, 4, 5])

# standardize
print('standardized:', (ex - ex.mean()) / ex.std())

# Please note that pandas uses ddof=1 (sample standard deviation)
# by default, whereas NumPy's std method and the StandardScaler
# use ddof=0 (population standard deviation)

# normalize
print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))
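# Standardization relies on the mean and standard deviation, which are themselves
# sensitive to extreme values. As a minimal sketch of an alternative (not used in
# the remainder of this chapter), scikit-learn's RobustScaler centers each feature
# on its median and scales it by the interquartile range, which can be preferable
# for small datasets that contain many outliers:

from sklearn.preprocessing import RobustScaler

rbs = RobustScaler()
X_train_robust = rbs.fit_transform(X_train)
X_test_robust = rbs.transform(X_test)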
# # Selecting meaningful features

# ...

# ## L1 and L2 regularization as penalties against model complexity

# ## A geometric interpretation of L2 regularization


# ## Sparse solutions with L1 regularization


# For regularized models in scikit-learn that support L1 regularization, we can
# simply set the `penalty` parameter to `'l1'` to obtain a sparse solution:


LogisticRegression(penalty='l1')


# Applied to the standardized Wine data ...


lr = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', multi_class='ovr')
# Note that C=1.0 is the default. You can increase
# or decrease it to make the regularization effect
# weaker or stronger, respectively.
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))


lr.intercept_


np.set_printoptions(8)


# number of non-zero weights
lr.coef_[lr.coef_ != 0].shape


lr.coef_


# plot the weight coefficients for varying regularization strengths

fig = plt.figure()
ax = plt.subplot(111)

colors = ['blue', 'green', 'red', 'cyan',
          'magenta', 'yellow', 'black',
          'pink', 'lightgreen', 'lightblue',
          'gray', 'indigo', 'orange']

weights, params = [], []
for c in np.arange(-4., 6.):
    lr = LogisticRegression(penalty='l1', C=10.**c, solver='liblinear',
                            multi_class='ovr', random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10**c)

weights = np.array(weights)

for column, color in zip(range(weights.shape[1]), colors):
    plt.plot(params, weights[:, column],
             label=df_wine.columns[column + 1],
             color=color)

plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('Weight coefficient')
plt.xlabel('C (inverse regularization strength)')
plt.xscale('log')
plt.legend(loc='upper left')
ax.legend(loc='upper center',
          bbox_to_anchor=(1.38, 1.03),
          ncol=1, fancybox=True)

#plt.savefig('figures/04_08.png', dpi=300,
#            bbox_inches='tight', pad_inches=0.2)

plt.show()


# ## Sequential feature selection algorithms


class SBS:
    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):

        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=self.test_size,
                             random_state=self.random_state)

        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train,
                                 X_test, y_test, self.indices_)
        self.scores_ = [score]

        while dim > self.k_features:
            scores = []
            subsets = []

            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train,
                                         X_test, y_test, p)
                scores.append(score)
                subsets.append(p)

            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1

            self.scores_.append(scores[best])

        self.k_score_ = self.scores_[-1]

        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score


knn = KNeighborsClassifier(n_neighbors=5)

# selecting features
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

# plotting performance of feature subsets
k_feat = [len(k) for k in sbs.subsets_]

plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.02])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.tight_layout()
# plt.savefig('figures/04_09.png', dpi=300)
plt.show()


# the subset at index 10 contains three features
k3 = list(sbs.subsets_[10])
print(df_wine.columns[1:][k3])


# performance of the KNN classifier on the full feature set
knn.fit(X_train_std, y_train)
print('Training accuracy:', knn.score(X_train_std, y_train))
print('Test accuracy:', knn.score(X_test_std, y_test))


# performance on the selected three-feature subset
knn.fit(X_train_std[:, k3], y_train)
print('Training accuracy:', knn.score(X_train_std[:, k3], y_train))
print('Test accuracy:', knn.score(X_test_std[:, k3], y_test))
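# Scikit-learn (since version 0.24) also ships a built-in greedy selector,
# SequentialFeatureSelector. Unlike the SBS class above, it evaluates candidate
# subsets via cross-validation rather than a single validation split and stops
# at a fixed target size instead of recording every subset size. A minimal
# sketch with illustrative settings (3 features, backward selection, 5-fold CV):

from sklearn.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=5),
                                n_features_to_select=3,
                                direction='backward',
                                scoring='accuracy',
                                cv=5)
sfs.fit(X_train_std, y_train)
print('Selected features:', list(df_wine.columns[1:][sfs.get_support()]))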
# # Assessing feature importance with Random Forests


feat_labels = df_wine.columns[1:]

forest = RandomForestClassifier(n_estimators=500,
                                random_state=1)

forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

plt.title('Feature importance')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        align='center')

plt.xticks(range(X_train.shape[1]),
           feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
# plt.savefig('figures/04_10.png', dpi=300)
plt.show()


# select only the features whose importance exceeds the given threshold
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion:',
      X_selected.shape[1])


# Now, let's print the features that met the threshold criterion for feature
# selection that we set earlier (note that this code snippet does not appear in
# the actual book but was added to this notebook later for illustrative purposes):


for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
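# Impurity-based importances like the ones above are computed on the training
# data and can overstate the relevance of some features. As a complementary,
# model-agnostic check, scikit-learn's permutation importance measures how much
# the test-set score drops when a feature's values are shuffled. A minimal
# sketch (n_repeats and random_state below are illustrative choices):

from sklearn.inspection import permutation_importance

perm_result = permutation_importance(forest, X_test, y_test,
                                     n_repeats=10, random_state=1)
perm_indices = np.argsort(perm_result.importances_mean)[::-1]

for f in range(X_test.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[perm_indices[f]],
                            perm_result.importances_mean[perm_indices[f]]))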
# # Summary

# ...

# ---
#
# Readers may ignore the next cell.