GitHub Repository: rasbt/machine-learning-book
Path: blob/main/ch04/ch04.py
# coding: utf-8


import sys
from python_environment_check import check_packages
import pandas as pd
from io import StringIO
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')


# Check recommended package versions:

d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2'
}
check_packages(d)

# # Chapter 4 - Building Good Training Datasets – Data Preprocessing


# ### Overview

# - [Dealing with missing data](#Dealing-with-missing-data)
#   - [Identifying missing values in tabular data](#Identifying-missing-values-in-tabular-data)
#   - [Eliminating training examples or features with missing values](#Eliminating-training-examples-or-features-with-missing-values)
#   - [Imputing missing values](#Imputing-missing-values)
#   - [Understanding the scikit-learn estimator API](#Understanding-the-scikit-learn-estimator-API)
# - [Handling categorical data](#Handling-categorical-data)
#   - [Nominal and ordinal features](#Nominal-and-ordinal-features)
#   - [Mapping ordinal features](#Mapping-ordinal-features)
#   - [Encoding class labels](#Encoding-class-labels)
#   - [Performing one-hot encoding on nominal features](#Performing-one-hot-encoding-on-nominal-features)
# - [Partitioning a dataset into separate training and test sets](#Partitioning-a-dataset-into-separate-training-and-test-sets)
# - [Bringing features onto the same scale](#Bringing-features-onto-the-same-scale)
# - [Selecting meaningful features](#Selecting-meaningful-features)
#   - [L1 and L2 regularization as penalties against model complexity](#L1-and-L2-regularization-as-penalties-against-model-complexity)
#   - [A geometric interpretation of L2 regularization](#A-geometric-interpretation-of-L2-regularization)
#   - [Sparse solutions with L1 regularization](#Sparse-solutions-with-L1-regularization)
#   - [Sequential feature selection algorithms](#Sequential-feature-selection-algorithms)
# - [Assessing feature importance with Random Forests](#Assessing-feature-importance-with-Random-Forests)
# - [Summary](#Summary)

# # Dealing with missing data

# ## Identifying missing values in tabular data


csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# If you are using Python 2.7, you need
# to convert the string to unicode:

if (sys.version_info < (3, 0)):
    csv_data = unicode(csv_data)

df = pd.read_csv(StringIO(csv_data))
df


df.isnull().sum()


# access the underlying NumPy array
# via the `values` attribute
df.values

# ## Eliminating training examples or features with missing values


# remove rows that contain missing values

df.dropna(axis=0)


# remove columns that contain missing values

df.dropna(axis=1)


# only drop rows where all columns are NaN
# (returns the whole array here, since we don't
# have a row in which all values are NaN)

df.dropna(how='all')


# drop rows that have fewer than 4 real values

df.dropna(thresh=4)


# only drop rows where NaN appear in specific columns (here: 'C')

df.dropna(subset=['C'])

# ## Imputing missing values


# again: our original array
df.values


# impute missing values via the column mean

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data


df.fillna(df.mean())


# ## Understanding the scikit-learn estimator API
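
# A minimal sketch (not part of the original notebook) of the estimator API
# pattern this section describes: `fit` learns parameters from the training
# data, and `transform` (or `predict` for classifiers) reuses those learned
# parameters on new data. The name `imr_demo` is introduced here purely for
# illustration.

imr_demo = SimpleImputer(missing_values=np.nan, strategy='mean')
imr_demo.fit(df.values)        # learn the per-column means from the "training" data
imr_demo.transform(df.values)  # fill NaNs using the means learned during fit
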
# # Handling categorical data

# ## Nominal and ordinal features


df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df


# ## Mapping ordinal features


size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}

df['size'] = df['size'].map(size_mapping)
df


inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)

# ## Encoding class labels


# create a mapping dict
# to convert class labels from strings to integers
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping


# to convert class labels from strings to integers
df['classlabel'] = df['classlabel'].map(class_mapping)
df


# reverse the class label mapping
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df


# Label encoding with sklearn's LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y


# reverse mapping
class_le.inverse_transform(y)

# ## Performing one-hot encoding on nominal features


X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X


X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()


X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer([('onehot', OneHotEncoder(), [0]),
                              ('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)


# one-hot encoding via pandas

pd.get_dummies(df[['price', 'color', 'size']])


# multicollinearity guard in get_dummies

pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)


# multicollinearity guard for the OneHotEncoder

color_ohe = OneHotEncoder(categories='auto', drop='first')
c_transf = ColumnTransformer([('onehot', color_ohe, [0]),
                              ('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)

# ## Optional: Encoding Ordinal Features

# If we are unsure about the numerical differences between the categories of ordinal features, or if the difference between two ordinal values is not defined, we can also encode them using a threshold encoding with 0/1 values. For example, we can split the feature "size" with values M, L, and XL into two new features, "x > M" and "x > L". Let's consider the original DataFrame:


df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df


# We can use the `apply` method of pandas' DataFrames to write custom lambda expressions in order to encode these variables using the value-threshold approach:


df['x > M'] = df['size'].apply(lambda x: 1 if x in {'L', 'XL'} else 0)
df['x > L'] = df['size'].apply(lambda x: 1 if x == 'XL' else 0)

del df['size']
df

# # Partitioning a dataset into separate training and test sets


df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df_wine = pd.read_csv('wine.data', header=None)


df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()


X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=0,
                                                    stratify=y)

# # Bringing features onto the same scale


mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)


stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)


# A visual example:

ex = np.array([0, 1, 2, 3, 4, 5])

print('standardized:', (ex - ex.mean()) / ex.std())

# Please note that pandas uses ddof=1 (sample standard deviation)
# by default, whereas NumPy's std method and the StandardScaler
# use ddof=0 (population standard deviation)

# normalize
print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))
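
# A small check (not in the original notebook) of the ddof difference noted
# above: pandas' Series.std() defaults to ddof=1, while NumPy's ndarray.std()
# defaults to ddof=0, so the two results differ for the same values.

print('pandas std (ddof=1):', pd.Series(ex).std())
print('NumPy std  (ddof=0):', ex.std())
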
# # Selecting meaningful features

# ...

# ## L1 and L2 regularization as penalties against model complexity

# ## A geometric interpretation of L2 regularization
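
# A minimal sketch (not from the original notebook) of the penalty terms
# discussed in the two sections above, assuming a NumPy weight vector `w`
# and a regularization strength `lam` (which plays the role of 1/C in
# scikit-learn's LogisticRegression). Both helper names are introduced here
# only for illustration.

def l2_penalty(w, lam=1.0):
    # lambda * ||w||_2^2: shrinks all weights smoothly toward zero
    return lam * np.sum(w ** 2)

def l1_penalty(w, lam=1.0):
    # lambda * ||w||_1: its constant slope pushes small weights to exactly
    # zero, which is why L1 regularization yields sparse solutions
    return lam * np.sum(np.abs(w))
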
# ## Sparse solutions with L1 regularization


# For regularized models in scikit-learn that support L1 regularization, we can simply set the `penalty` parameter to `'l1'` to obtain a sparse solution:


LogisticRegression(penalty='l1')


# Applied to the standardized Wine data ...


lr = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', multi_class='ovr')
# Note that C=1.0 is the default. You can increase
# or decrease it to make the regularization effect
# weaker or stronger, respectively.
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))


lr.intercept_


np.set_printoptions(8)


lr.coef_[lr.coef_ != 0].shape


lr.coef_

fig = plt.figure()
ax = plt.subplot(111)

colors = ['blue', 'green', 'red', 'cyan',
          'magenta', 'yellow', 'black',
          'pink', 'lightgreen', 'lightblue',
          'gray', 'indigo', 'orange']

weights, params = [], []
for c in np.arange(-4., 6.):
    lr = LogisticRegression(penalty='l1', C=10.**c, solver='liblinear',
                            multi_class='ovr', random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10**c)

weights = np.array(weights)

for column, color in zip(range(weights.shape[1]), colors):
    plt.plot(params, weights[:, column],
             label=df_wine.columns[column + 1],
             color=color)

plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('Weight coefficient')
plt.xlabel('C (inverse regularization strength)')
plt.xscale('log')
plt.legend(loc='upper left')
ax.legend(loc='upper center',
          bbox_to_anchor=(1.38, 1.03),
          ncol=1, fancybox=True)

#plt.savefig('figures/04_08.png', dpi=300,
#            bbox_inches='tight', pad_inches=0.2)

plt.show()

# ## Sequential feature selection algorithms


class SBS:
    """Sequential Backward Selection: greedily removes one feature at a time
    until only k_features remain, keeping the subset with the best score."""

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size,
                                                             random_state=self.random_state)

        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train,
                                 X_test, y_test, self.indices_)
        self.scores_ = [score]

        while dim > self.k_features:
            scores = []
            subsets = []

            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train,
                                         X_test, y_test, p)
                scores.append(score)
                subsets.append(p)

            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1

            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]

        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score

knn = KNeighborsClassifier(n_neighbors=5)

# selecting features
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

# plotting performance of feature subsets
k_feat = [len(k) for k in sbs.subsets_]

plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.02])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.tight_layout()
# plt.savefig('figures/04_09.png', dpi=300)
plt.show()


k3 = list(sbs.subsets_[10])
print(df_wine.columns[1:][k3])


knn.fit(X_train_std, y_train)
print('Training accuracy:', knn.score(X_train_std, y_train))
print('Test accuracy:', knn.score(X_test_std, y_test))


knn.fit(X_train_std[:, k3], y_train)
print('Training accuracy:', knn.score(X_train_std[:, k3], y_train))
print('Test accuracy:', knn.score(X_test_std[:, k3], y_test))

# # Assessing feature importance with Random Forests


feat_labels = df_wine.columns[1:]

forest = RandomForestClassifier(n_estimators=500,
                                random_state=1)

forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

plt.title('Feature importance')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        align='center')

plt.xticks(range(X_train.shape[1]),
           feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
# plt.savefig('figures/04_10.png', dpi=300)
plt.show()

sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion:',
      X_selected.shape[1])


# Now, let's print the 3 features that met the threshold criterion for feature selection that we set earlier (note that this code snippet does not appear in the actual book but was added to this notebook later for illustrative purposes):


for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

# # Summary

# ...

# ---
#
# Readers may ignore the next cell.