# coding: utf-8

import sys
from python_environment_check import check_packages
import pandas as pd
from io import StringIO
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:


sys.path.insert(0, '..')


# Check recommended package versions:


d = {
    'numpy': '1.21.2',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2'
}
check_packages(d)


# # Chapter 4 - Building Good Training Datasets – Data Preprocessing


# ### Overview

# - [Dealing with missing data](#Dealing-with-missing-data)
#   - [Identifying missing values in tabular data](#Identifying-missing-values-in-tabular-data)
#   - [Eliminating training examples or features with missing values](#Eliminating-training-examples-or-features-with-missing-values)
#   - [Imputing missing values](#Imputing-missing-values)
#   - [Understanding the scikit-learn estimator API](#Understanding-the-scikit-learn-estimator-API)
# - [Handling categorical data](#Handling-categorical-data)
#   - [Nominal and ordinal features](#Nominal-and-ordinal-features)
#   - [Mapping ordinal features](#Mapping-ordinal-features)
#   - [Encoding class labels](#Encoding-class-labels)
#   - [Performing one-hot encoding on nominal features](#Performing-one-hot-encoding-on-nominal-features)
# - [Partitioning a dataset into separate training and test sets](#Partitioning-a-dataset-into-separate-training-and-test-sets)
# - [Bringing features onto the same scale](#Bringing-features-onto-the-same-scale)
# - [Selecting meaningful features](#Selecting-meaningful-features)
#   - [L1 and L2 regularization as penalties against model complexity](#L1-and-L2-regularization-as-penalties-against-model-complexity)
#   - [A geometric interpretation of L2 regularization](#A-geometric-interpretation-of-L2-regularization)
#   - [Sparse solutions with L1 regularization](#Sparse-solutions-with-L1-regularization)
#   - [Sequential feature selection algorithms](#Sequential-feature-selection-algorithms)
# - [Assessing feature importance with Random Forests](#Assessing-feature-importance-with-Random-Forests)
# - [Summary](#Summary)


# # Dealing with missing data

# ## Identifying missing values in tabular data


csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# If you are using Python 2.7, you need
# to convert the string to unicode:

if (sys.version_info < (3, 0)):
    csv_data = unicode(csv_data)

df = pd.read_csv(StringIO(csv_data))
df


# count the missing values per column
df.isnull().sum()


# access the underlying NumPy array
# via the `values` attribute
df.values


# ## Eliminating training examples or features with missing values


# remove rows that contain missing values

df.dropna(axis=0)


# remove columns that contain missing values

df.dropna(axis=1)


# only drop rows where all columns are NaN

df.dropna(how='all')


# only keep rows with at least 4 non-NaN values
# (i.e., drop rows that have fewer than 4 real values)

df.dropna(thresh=4)


# only drop rows where NaN appear in specific columns (here: 'C')

df.dropna(subset=['C'])


# ## Imputing missing values


# again: our original array
df.values


# impute missing values via the column mean

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data


# or, more conveniently, impute via pandas' fillna
df.fillna(df.mean())
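# The mean strategy above only applies to numerical columns. For categorical
# data, SimpleImputer also supports the 'most_frequent' strategy, which replaces
# missing entries with the most common value in each column. A minimal sketch --
# the small DataFrame below is a made-up example, not part of the book's data:

df_cat = pd.DataFrame({'color': ['blue', 'green', np.nan, 'green'],
                       'size': ['M', np.nan, 'L', 'M']})

cat_imr = SimpleImputer(strategy='most_frequent')
pd.DataFrame(cat_imr.fit_transform(df_cat), columns=df_cat.columns)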
# ## Understanding the scikit-learn estimator API


# # Handling categorical data

# ## Nominal and ordinal features


df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df


# ## Mapping ordinal features


size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}

df['size'] = df['size'].map(size_mapping)
df


inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)


# ## Encoding class labels


# create a mapping dict
# to convert class labels from strings to integers
class_mapping = {label: idx for idx, label
                 in enumerate(np.unique(df['classlabel']))}
class_mapping


# convert class labels from strings to integers
df['classlabel'] = df['classlabel'].map(class_mapping)
df


# reverse the class label mapping
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df


# label encoding with sklearn's LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y


# reverse mapping
class_le.inverse_transform(y)


# ## Performing one-hot encoding on nominal features


X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X


X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()


X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer([('onehot', OneHotEncoder(), [0]),
                              ('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)


# one-hot encoding via pandas

pd.get_dummies(df[['price', 'color', 'size']])


# multicollinearity guard in get_dummies

pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)


# multicollinearity guard for the OneHotEncoder

color_ohe = OneHotEncoder(categories='auto', drop='first')
c_transf = ColumnTransformer([('onehot', color_ohe, [0]),
                              ('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)


# ## Optional: Encoding Ordinal Features

# If we are unsure about the numerical differences between the categories of
# ordinal features, or if the difference between two ordinal values is not
# defined, we can also encode them using a threshold encoding with 0/1 values.
# For example, we can split the feature "size" with values M, L, and XL into
# two new features, "x > M" and "x > L". Let's consider the original DataFrame:


df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df


# We can use the `apply` method of pandas' DataFrames to write custom lambda
# expressions in order to encode these variables using the value-threshold approach:


df['x > M'] = df['size'].apply(lambda x: 1 if x in {'L', 'XL'} else 0)
df['x > L'] = df['size'].apply(lambda x: 1 if x == 'XL' else 0)

del df['size']
df


# # Partitioning a dataset into separate training and test sets


df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df_wine = pd.read_csv('wine.data', header=None)


df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()


X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=0,
                                                    stratify=y)


# # Bringing features onto the same scale


# min-max scaling (normalization)
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)


# standardization
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)


# A visual example:


ex = np.array([0, 1, 2, 3, 4, 5])

# standardize
print('standardized:', (ex - ex.mean()) / ex.std())

# Please note that pandas uses ddof=1 (sample standard deviation)
# by default, whereas NumPy's std method and the StandardScaler
# use ddof=0 (population standard deviation)

# normalize
print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))
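# Standardization relies on the mean and standard deviation, which are themselves
# sensitive to extreme values. As a minimal sketch of an alternative (not used in
# the remainder of this chapter), scikit-learn's RobustScaler centers each feature
# on its median and scales it by the interquartile range, which can be preferable
# for small datasets that contain many outliers:

from sklearn.preprocessing import RobustScaler

rbs = RobustScaler()
X_train_robust = rbs.fit_transform(X_train)
X_test_robust = rbs.transform(X_test)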
# # Selecting meaningful features

# ...

# ## L1 and L2 regularization as penalties against model complexity

# ## A geometric interpretation of L2 regularization


# ## Sparse solutions with L1 regularization


# For regularized models in scikit-learn that support L1 regularization, we can
# simply set the `penalty` parameter to `'l1'` to obtain a sparse solution:


LogisticRegression(penalty='l1')


# Applied to the standardized Wine data ...


lr = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', multi_class='ovr')
# Note that C=1.0 is the default. You can increase
# or decrease it to make the regularization effect
# weaker or stronger, respectively.
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))


lr.intercept_


np.set_printoptions(8)


# number of non-zero weights
lr.coef_[lr.coef_ != 0].shape


lr.coef_


# plot the weight coefficients for varying regularization strengths

fig = plt.figure()
ax = plt.subplot(111)

colors = ['blue', 'green', 'red', 'cyan',
          'magenta', 'yellow', 'black',
          'pink', 'lightgreen', 'lightblue',
          'gray', 'indigo', 'orange']

weights, params = [], []
for c in np.arange(-4., 6.):
    lr = LogisticRegression(penalty='l1', C=10.**c, solver='liblinear',
                            multi_class='ovr', random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10**c)

weights = np.array(weights)

for column, color in zip(range(weights.shape[1]), colors):
    plt.plot(params, weights[:, column],
             label=df_wine.columns[column + 1],
             color=color)

plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('Weight coefficient')
plt.xlabel('C (inverse regularization strength)')
plt.xscale('log')
plt.legend(loc='upper left')
ax.legend(loc='upper center',
          bbox_to_anchor=(1.38, 1.03),
          ncol=1, fancybox=True)

#plt.savefig('figures/04_08.png', dpi=300,
#            bbox_inches='tight', pad_inches=0.2)

plt.show()


# ## Sequential feature selection algorithms


class SBS:
    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):

        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=self.test_size,
                             random_state=self.random_state)

        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train,
                                 X_test, y_test, self.indices_)
        self.scores_ = [score]

        while dim > self.k_features:
            scores = []
            subsets = []

            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train,
                                         X_test, y_test, p)
                scores.append(score)
                subsets.append(p)

            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1

            self.scores_.append(scores[best])

        self.k_score_ = self.scores_[-1]

        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score


knn = KNeighborsClassifier(n_neighbors=5)

# selecting features
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

# plotting performance of feature subsets
k_feat = [len(k) for k in sbs.subsets_]

plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.02])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.tight_layout()
# plt.savefig('figures/04_09.png', dpi=300)
plt.show()


# the subset at index 10 contains three features
k3 = list(sbs.subsets_[10])
print(df_wine.columns[1:][k3])


# performance of the KNN classifier on the full feature set
knn.fit(X_train_std, y_train)
print('Training accuracy:', knn.score(X_train_std, y_train))
print('Test accuracy:', knn.score(X_test_std, y_test))


# performance on the selected three-feature subset
knn.fit(X_train_std[:, k3], y_train)
print('Training accuracy:', knn.score(X_train_std[:, k3], y_train))
print('Test accuracy:', knn.score(X_test_std[:, k3], y_test))
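# Scikit-learn (since version 0.24) also ships a built-in greedy selector,
# SequentialFeatureSelector. Unlike the SBS class above, it evaluates candidate
# subsets via cross-validation rather than a single validation split and stops
# at a fixed target size instead of recording every subset size. A minimal
# sketch with illustrative settings (3 features, backward selection, 5-fold CV):

from sklearn.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=5),
                                n_features_to_select=3,
                                direction='backward',
                                scoring='accuracy',
                                cv=5)
sfs.fit(X_train_std, y_train)
print('Selected features:', list(df_wine.columns[1:][sfs.get_support()]))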
# # Assessing feature importance with Random Forests


feat_labels = df_wine.columns[1:]

forest = RandomForestClassifier(n_estimators=500,
                                random_state=1)

forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

plt.title('Feature importance')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        align='center')

plt.xticks(range(X_train.shape[1]),
           feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
# plt.savefig('figures/04_10.png', dpi=300)
plt.show()


# select only the features whose importance exceeds the given threshold
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion:',
      X_selected.shape[1])


# Now, let's print the features that met the threshold criterion for feature
# selection that we set earlier (note that this code snippet does not appear in
# the actual book but was added to this notebook later for illustrative purposes):


for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
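# Impurity-based importances like the ones above are computed on the training
# data and can overstate the relevance of some features. As a complementary,
# model-agnostic check, scikit-learn's permutation importance measures how much
# the test-set score drops when a feature's values are shuffled. A minimal
# sketch (n_repeats and random_state below are illustrative choices):

from sklearn.inspection import permutation_importance

perm_result = permutation_importance(forest, X_test, y_test,
                                     n_repeats=10, random_state=1)
perm_indices = np.argsort(perm_result.importances_mean)[::-1]

for f in range(X_test.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[perm_indices[f]],
                            perm_result.importances_mean[perm_indices[f]]))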
# # Summary

# ...

# ---
#
# Readers may ignore the next cell.