CoCalc -- ch09.py

GitHub Repository: rasbt/machine-learning-book
Path: blob/main/ch09/ch09.py
¹²⁴⁵ views
1
# coding: utf-8
2

3

4
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mlxtend.plotting import heatmap
from mlxtend.plotting import scatterplotmatrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add the parent folder to the module search path so that the
# python_environment_check.py helper can be imported:
sys.path.insert(0, '..')

# NOTE: this import has to come *after* the sys.path tweak above; importing
# it at the very top of the file would run before '..' is on the path.
from python_environment_check import check_packages  # noqa: E402

# Check recommended package versions:
d = {
    'numpy': '1.21.2',
    'mlxtend': '0.19.0',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2',
}
check_packages(d)
51

52

53
# # Chapter 09 - Predicting Continuous Target Variables with Regression Analysis
54

55

56
# ### Overview
57

58
# - [Introducing linear regression](#Introducing-linear-regression)
59
#   - [Simple linear regression](#Simple-linear-regression)
60
# - [Exploring the Ames Housing dataset](#Exploring-the-Ames-Housing-dataset)
61
#   - [Loading the Ames Housing dataset into a data frame](#Loading-the-Ames-Housing-dataset-into-a-data-frame)
62
#   - [Visualizing the important characteristics of a dataset](#Visualizing-the-important-characteristics-of-a-dataset)
63
# - [Implementing an ordinary least squares linear regression model](#Implementing-an-ordinary-least-squares-linear-regression-model)
64
#   - [Solving regression for regression parameters with gradient descent](#Solving-regression-for-regression-parameters-with-gradient-descent)
65
#   - [Estimating the coefficient of a regression model via scikit-learn](#Estimating-the-coefficient-of-a-regression-model-via-scikit-learn)
66
# - [Fitting a robust regression model using RANSAC](#Fitting-a-robust-regression-model-using-RANSAC)
67
# - [Evaluating the performance of linear regression models](#Evaluating-the-performance-of-linear-regression-models)
68
# - [Using regularized methods for regression](#Using-regularized-methods-for-regression)
69
# - [Turning a linear regression model into a curve - polynomial regression](#Turning-a-linear-regression-model-into-a-curve---polynomial-regression)
70
#   - [Modeling nonlinear relationships in the Ames Housing dataset](#Modeling-nonlinear-relationships-in-the-Ames-Housing-dataset)
71
#   - [Dealing with nonlinear relationships using random forests](#Dealing-with-nonlinear-relationships-using-random-forests)
72
#     - [Decision tree regression](#Decision-tree-regression)
73
#     - [Random forest regression](#Random-forest-regression)
74
# - [Summary](#Summary)
75

76

77

78

79

80

81
# # Introducing linear regression
82

83
# ## Simple linear regression
84

85

86

87

88

89
# ## Multiple linear regression
90

91

92

93

94

95

96
# # Exploring the Ames Housing dataset
97

98
# ## Loading the Ames Housing dataset into a data frame
99

100
# - Dataset source: http://jse.amstat.org/v19n3/decock/AmesHousing.txt
101
# - Dataset documentation: http://jse.amstat.org/v19n3/decock/DataDocumentation.txt
102
# - Dataset write-up: http://jse.amstat.org/v19n3/decock.pdf
103

104
# - `'Overall Qual'`: Rates the overall material and finish of the house
105
# 
106
#        10	Very Excellent
107
#        9	Excellent
108
#        8	Very Good
109
#        7	Good
110
#        6	Above Average
111
#        5	Average
112
#        4	Below Average
113
#        3	Fair
114
#        2	Poor
115
#        1	Very Poor
116
# 	
117
# - `'Overall Cond'`: Rates the overall condition of the house
118
# 
119
#        10	Very Excellent
120
#        9	Excellent
121
#        8	Very Good
122
#        7	Good
123
#        6	Above Average	
124
#        5	Average
125
#        4	Below Average	
126
#        3	Fair
127
#        2	Poor
128
#        1	Very Poor
129
# - `'Gr Liv Area'`: Above grade (ground) living area square feet
130
# - `'Central Air'`: Central air conditioning
131
# 
132
#        N	No
133
#        Y	Yes
134
# 
135
# - `'Total Bsmt SF'`: Total square feet of basement area
136
# - `'SalePrice'`: Sale price $$
137

138

139

140

141

142
columns = ['Overall Qual', 'Overall Cond', 'Gr Liv Area',
143
           'Central Air', 'Total Bsmt SF', 'SalePrice']
144

145
df = pd.read_csv('http://jse.amstat.org/v19n3/decock/AmesHousing.txt', 
146
                 sep='\t',
147
                 usecols=columns)
148

149
df.head()
150

151

152

153

154
df.shape
155

156

157

158

159
df['Central Air'] = df['Central Air'].map({'N': 0, 'Y': 1})
160

161

162

163

164
df.isnull().sum()
165

166

167

168

169
# remove rows that contain missing values
170

171
df = df.dropna(axis=0)
172
df.isnull().sum()
173

174

175

176
# ## Visualizing the important characteristics of a dataset
177

178

179

180

181

182

183

184
scatterplotmatrix(df.values, figsize=(12, 10), 
185
                  names=df.columns, alpha=0.5)
186
plt.tight_layout()
187
#plt.savefig('figures/09_04.png', dpi=300)
188
plt.show()
189

190

191

192

193

194

195
cm = np.corrcoef(df.values.T)
196
hm = heatmap(cm, row_names=df.columns, column_names=df.columns)
197

198
plt.tight_layout()
199
#plt.savefig('figures/09_05.png', dpi=300)
200
plt.show()
201

202

203

204
# # Implementing an ordinary least squares linear regression model
205

206
# ...
207

208
# ## Solving regression for regression parameters with gradient descent
209

210

211

212
class LinearRegressionGD:
    """Linear regression fitted with full-batch gradient descent on the MSE.

    Parameters
    ----------
    eta : float
        Learning rate (step-size multiplier applied to each update).
    n_iter : int
        Number of passes over the whole training set.
    random_state : int
        Seed for the random generator that draws the initial weights.

    Attributes
    ----------
    w_ : 1d-array
        Weights, one per feature column of the training matrix.
    b_ : 1d-array with a single element
        Bias (intercept) term.
    losses_ : list
        Mean squared error recorded once per pass. Each value is computed
        from the errors measured *before* that pass's parameter update.
    """

    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        """Fit weights and bias to the training data; returns ``self``.

        X is indexed as (n_samples, n_features) and y must be compatible
        with the 1d output of ``net_input``.
        """
        rgen = np.random.RandomState(self.random_state)
        # Small random starting weights; the bias starts at exactly zero.
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=X.shape[1])
        self.b_ = np.array([0.])
        self.losses_ = []

        for i in range(self.n_iter):
            output = self.net_input(X)
            errors = (y - output)
            # Step along the negative gradient of the MSE w.r.t. w and b.
            self.w_ += self.eta * 2.0 * X.T.dot(errors) / X.shape[0]
            self.b_ += self.eta * 2.0 * errors.mean()
            loss = (errors**2).mean()
            self.losses_.append(loss)
        return self

    def net_input(self, X):
        """Return the linear combination ``X @ w_ + b_``."""
        return np.dot(X, self.w_) + self.b_

    def predict(self, X):
        """Return predictions for X (identical to ``net_input``)."""
        return self.net_input(X)
238

239

240

241

242
X = df[['Gr Liv Area']].values
243
y = df['SalePrice'].values
244

245

246

247

248

249

250
sc_x = StandardScaler()
251
sc_y = StandardScaler()
252
X_std = sc_x.fit_transform(X)
253
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
254

255

256

257

258
lr = LinearRegressionGD(eta=0.1)
259
lr.fit(X_std, y_std)
260

261

262

263

264
plt.plot(range(1, lr.n_iter+1), lr.losses_)
265
plt.ylabel('MSE')
266
plt.xlabel('Epoch')
267
plt.tight_layout()
268
#plt.savefig('figures/09_06.png', dpi=300)
269
plt.show()
270

271

272

273

274
def lin_regplot(X, y, model):
    """Draw a scatter plot of (X, y) and overlay ``model.predict(X)`` as a line.

    Plots onto the current matplotlib figure; the caller is responsible for
    axis labels and for calling ``plt.show()``. Returns None.
    """
    plt.scatter(X, y, c='steelblue', edgecolor='white', s=70)
    plt.plot(X, model.predict(X), color='black', lw=2)
278

279

280

281

282
lin_regplot(X_std, y_std, lr)
283
plt.xlabel('Living area above ground (standardized)')
284
plt.ylabel('Sale price (standardized)')
285

286
#plt.savefig('figures/09_07.png', dpi=300)
287
plt.show()
288

289

290

291

292
feature_std = sc_x.transform(np.array([[2500]]))
293
target_std = lr.predict(feature_std)
294
target_reverted = sc_y.inverse_transform(target_std.reshape(-1, 1))
295
print(f'Sale price: ${target_reverted.flatten()[0]:.2f}')
296

297

298

299

300
print(f'Slope: {lr.w_[0]:.3f}')
301
print(f'Intercept: {lr.b_[0]:.3f}')
302

303

304

305
# ## Estimating the coefficient of a regression model via scikit-learn
306

307

308

309

310

311

312

313
slr = LinearRegression()
314
slr.fit(X, y)
315
y_pred = slr.predict(X)
316
print(f'Slope: {slr.coef_[0]:.3f}')
317
print(f'Intercept: {slr.intercept_:.3f}')
318

319

320

321

322
lin_regplot(X, y, slr)
323
plt.xlabel('Living area above ground in square feet')
324
plt.ylabel('Sale price in U.S. dollars')
325

326
plt.tight_layout()
327
#plt.savefig('figures/09_08.png', dpi=300)
328
plt.show()
329

330

331
# **Normal Equations** alternative:
332

333

334

335
# adding a column vector of "ones" so that w[0] acts as the intercept
Xb = np.hstack((np.ones((X.shape[0], 1)), X))
# closed-form least-squares solution: w = (Xb^T Xb)^-1 Xb^T y
z = np.linalg.inv(np.dot(Xb.T, Xb))
w = np.dot(z, np.dot(Xb.T, y))

print(f'Slope: {w[1]:.3f}')
print(f'Intercept: {w[0]:.3f}')
343

344

345

346
# # Fitting a robust regression model using RANSAC
347

348

349

350

351

352
ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,  # default
                         min_samples=0.95,
                         loss='absolute_error',  # default
                         residual_threshold=None,  # default
                         random_state=123)


ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Evaluate the fitted line over the observed feature range so that it is
# drawn across the scatter plot (a fixed 3..9 range lies far outside
# living-area values measured in square feet).
line_X = np.arange(X.min(), X.max() + 1, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])
plt.scatter(X[inlier_mask], y[inlier_mask],
            c='steelblue', edgecolor='white',
            marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='limegreen', edgecolor='white',
            marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='black', lw=2)
plt.xlabel('Living area above ground in square feet')
plt.ylabel('Sale price in U.S. dollars')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('figures/09_09.png', dpi=300)
plt.show()


print(f'Slope: {ransac.estimator_.coef_[0]:.3f}')
print(f'Intercept: {ransac.estimator_.intercept_:.3f}')
387

388

389

390

391
def mean_absolute_deviation(data):
    """Return the mean of the absolute deviations of `data` from its mean.

    `data` may be a NumPy array or any sequence of numbers; it is converted
    with ``np.asarray`` before the computation.
    """
    data = np.asarray(data)
    return np.mean(np.abs(data - np.mean(data)))
393
    
394
mean_absolute_deviation(y)
395

396

397

398

399
ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,  # default
                         min_samples=0.95,
                         loss='absolute_error',  # default
                         residual_threshold=65000,  # explicit, NOT the default (None)
                         random_state=123)

ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Evaluate the fitted line over the observed feature range so that it is
# drawn across the scatter plot (a fixed 3..9 range lies far outside
# living-area values measured in square feet).
line_X = np.arange(X.min(), X.max() + 1, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])
plt.scatter(X[inlier_mask], y[inlier_mask],
            c='steelblue', edgecolor='white',
            marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='limegreen', edgecolor='white',
            marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='black', lw=2)
plt.xlabel('Living area above ground in square feet')
plt.ylabel('Sale price in U.S. dollars')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('figures/09_10.png', dpi=300)
plt.show()


print(f'Slope: {ransac.estimator_.coef_[0]:.3f}')
print(f'Intercept: {ransac.estimator_.intercept_:.3f}')
433

434

435

436
# # Evaluating the performance of linear regression models
437

438

439

440

441

442
target = 'SalePrice'
443
features = df.columns[df.columns != target]
444

445
X = df[features].values
446
y = df[target].values
447

448
X_train, X_test, y_train, y_test = train_test_split(
449
    X, y, test_size=0.3, random_state=123)
450

451

452

453

454
slr = LinearRegression()
455

456
slr.fit(X_train, y_train)
457
y_train_pred = slr.predict(X_train)
458
y_test_pred = slr.predict(X_test)
459

460

461

462

463
x_max = np.max([np.max(y_train_pred), np.max(y_test_pred)])
464
x_min = np.min([np.min(y_train_pred), np.min(y_test_pred)])
465

466
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 3), sharey=True)
467

468
ax1.scatter(y_test_pred, y_test_pred - y_test,
469
            c='limegreen', marker='s', edgecolor='white',
470
            label='Test data')
471
ax2.scatter(y_train_pred, y_train_pred - y_train,
472
            c='steelblue', marker='o', edgecolor='white',
473
            label='Training data')
474
ax1.set_ylabel('Residuals')
475

476
for ax in (ax1, ax2):
477
    ax.set_xlabel('Predicted values')
478
    ax.legend(loc='upper left')
479
    ax.hlines(y=0, xmin=x_min-100, xmax=x_max+100, color='black', lw=2)
480

481
plt.tight_layout()
482

483
#plt.savefig('figures/09_11.png', dpi=300)
484
plt.show()
485

486

487

488

489

490

491
mse_train = mean_squared_error(y_train, y_train_pred)
492
mse_test = mean_squared_error(y_test, y_test_pred)
493
print(f'MSE train: {mse_train:.2f}')
494
print(f'MSE test: {mse_test:.2f}')
495

496

497

498

499

500

501
mae_train = mean_absolute_error(y_train, y_train_pred)
502
mae_test = mean_absolute_error(y_test, y_test_pred)
503
print(f'MAE train: {mae_train:.2f}')
504
print(f'MAE test: {mae_test:.2f}')
505

506

507

508

509

510

511
r2_train = r2_score(y_train, y_train_pred)
512
r2_test =r2_score(y_test, y_test_pred)
513
print(f'R^2 train: {r2_train:.2f}')
514
print(f'R^2 test: {r2_test:.2f}')
515

516

517

518
# # Using regularized methods for regression
519

520

521

522

523

524
# Fit an L1-regularized linear model and predict on both splits.
lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
print(lasso.coef_)


train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
print(f'MSE train: {train_mse:.3f}, test: {test_mse:.3f}')

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
# Label the second value like the MSE line does, so the output is unambiguous.
print(f'R^2 train: {train_r2:.3f}, test: {test_r2:.3f}')
540

541

542
# Ridge regression:
543

544

545

546

547

548
ridge = Ridge(alpha=1.0)
549

550

551
# LASSO regression:
552

553

554

555

556

557
lasso = Lasso(alpha=1.0)
558

559

560
# Elastic Net regression:
561

562

563

564

565

566
elanet = ElasticNet(alpha=1.0, l1_ratio=0.5)
567

568

569

570
# # Turning a linear regression model into a curve - polynomial regression
571

572

573

574
X = np.array([258.0, 270.0, 294.0, 
575
              320.0, 342.0, 368.0, 
576
              396.0, 446.0, 480.0, 586.0])\
577
             [:, np.newaxis]
578

579
y = np.array([236.4, 234.4, 252.8, 
580
              298.6, 314.2, 342.2, 
581
              360.8, 368.0, 391.2,
582
              390.8])
583

584

585

586

587

588

589
lr = LinearRegression()
pr = LinearRegression()
quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)


# fit linear features
lr.fit(X, y)
X_fit = np.arange(250, 600, 10)[:, np.newaxis]
y_lin_fit = lr.predict(X_fit)

# fit quadratic features
pr.fit(X_quad, y)
# The transformer was already fitted on the training data above; only
# transform the plotting grid instead of refitting on it.
y_quad_fit = pr.predict(quadratic.transform(X_fit))

# plot results
plt.scatter(X, y, label='Training points')
plt.plot(X_fit, y_lin_fit, label='Linear fit', linestyle='--')
plt.plot(X_fit, y_quad_fit, label='Quadratic fit')
plt.xlabel('Explanatory variable')
plt.ylabel('Predicted or known target values')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('figures/09_12.png', dpi=300)
plt.show()
617

618

619

620

621
y_lin_pred = lr.predict(X)
622
y_quad_pred = pr.predict(X_quad)
623

624

625

626

627
mse_lin = mean_squared_error(y, y_lin_pred)
628
mse_quad = mean_squared_error(y, y_quad_pred)
629
print(f'Training MSE linear: {mse_lin:.3f}'
630
      f', quadratic: {mse_quad:.3f}')
631

632

633
r2_lin = r2_score(y, y_lin_pred)
634
r2_quad = r2_score(y, y_quad_pred)
635
print(f'Training R^2 linear: {r2_lin:.3f}'
636
      f', quadratic: {r2_quad:.3f}')
637

638

639

640
# ## Modeling nonlinear relationships in the Ames Housing dataset
641

642

643

644
X = df[['Gr Liv Area']].values
y = df['SalePrice'].values

# Keep only the rows whose living area is below 4000 square feet.
keep = (df['Gr Liv Area'] < 4000).values
X = X[keep]
y = y[keep]


regr = LinearRegression()

# create quadratic and cubic features
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# grid of feature values used to draw the fitted curves
X_fit = np.arange(X.min()-1, X.max()+2, 1)[:, np.newaxis]

# The same estimator object is refitted for each degree; predictions and
# scores are taken before it is refitted for the next one.
regr = regr.fit(X, y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

# The transformers are already fitted on X, so the grid is only transformed.
regr = regr.fit(X_quad, y)
y_quad_fit = regr.predict(quadratic.transform(X_fit))
quadratic_r2 = r2_score(y, regr.predict(X_quad))

regr = regr.fit(X_cubic, y)
y_cubic_fit = regr.predict(cubic.transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))


# plot results
plt.scatter(X, y, label='Training points', color='lightgray')

plt.plot(X_fit, y_lin_fit,
         label=f'Linear (d=1), $R^2$={linear_r2:.2f}',
         color='blue',
         lw=2,
         linestyle=':')

plt.plot(X_fit, y_quad_fit,
         label=f'Quadratic (d=2), $R^2$={quadratic_r2:.2f}',
         color='red',
         lw=2,
         linestyle='-')

plt.plot(X_fit, y_cubic_fit,
         label=f'Cubic (d=3), $R^2$={cubic_r2:.2f}',
         color='green',
         lw=2,
         linestyle='--')


plt.xlabel('Living area above ground in square feet')
plt.ylabel('Sale price in U.S. dollars')
plt.legend(loc='upper left')

plt.tight_layout()
# Disabled like every other savefig call in this script; enabling it
# requires an existing 'figures/' directory.
#plt.savefig('figures/09_13.png', dpi=300)
plt.show()
704

705

706

707

708
X = df[['Overall Qual']].values
y = df['SalePrice'].values


regr = LinearRegression()

# create quadratic and cubic features
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# grid of feature values used to draw the fitted curves
X_fit = np.arange(X.min()-1, X.max()+2, 1)[:, np.newaxis]

# The same estimator object is refitted for each degree; predictions and
# scores are taken before it is refitted for the next one.
regr = regr.fit(X, y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

# The transformers are already fitted on X, so the grid is only transformed.
regr = regr.fit(X_quad, y)
y_quad_fit = regr.predict(quadratic.transform(X_fit))
quadratic_r2 = r2_score(y, regr.predict(X_quad))

regr = regr.fit(X_cubic, y)
y_cubic_fit = regr.predict(cubic.transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))


# plot results
plt.scatter(X, y, label='Training points', color='lightgray')

plt.plot(X_fit, y_lin_fit,
         label=f'Linear (d=1), $R^2$={linear_r2:.2f}',
         color='blue',
         lw=2,
         linestyle=':')

plt.plot(X_fit, y_quad_fit,
         label=f'Quadratic (d=2), $R^2$={quadratic_r2:.2f}',
         color='red',
         lw=2,
         linestyle='-')

plt.plot(X_fit, y_cubic_fit,
         label=f'Cubic (d=3), $R^2$={cubic_r2:.2f}',
         color='green',
         lw=2,
         linestyle='--')


plt.xlabel('Overall quality of the house')
plt.ylabel('Sale price in U.S. dollars')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('figures/09_14.png', dpi=300)
plt.show()
765

766

767

768
# # Dealing with nonlinear relationships using random forests
769

770
# ...
771

772
# ## Decision tree regression
773

774

775

776

777

778
X = df[['Gr Liv Area']].values
779
y = df['SalePrice'].values
780

781

782

783
tree = DecisionTreeRegressor(max_depth=3)
784
tree.fit(X, y)
785
sort_idx = X.flatten().argsort()
786

787
lin_regplot(X[sort_idx], y[sort_idx], tree)
788
plt.xlabel('Living area above ground in square feet')
789
plt.ylabel('Sale price in U.S. dollars')
790

791
plt.tight_layout()
792
#plt.savefig('figures/09_15.png', dpi=300)
793
plt.show()
794

795

796

797

798
tree_r2 = r2_score(y, tree.predict(X))
799
tree_r2
800

801

802

803
# ## Random forest regression
804

805

806

807
target = 'SalePrice'
808
features = df.columns[df.columns != target]
809

810
X = df[features].values
811
y = df[target].values
812

813
X_train, X_test, y_train, y_test = train_test_split(
814
    X, y, test_size=0.3, random_state=123)
815

816

817

818

819

820

821
forest = RandomForestRegressor(n_estimators=1000, 
822
                               criterion='squared_error', 
823
                               random_state=1, 
824
                               n_jobs=-1)
825
forest.fit(X_train, y_train)
826
y_train_pred = forest.predict(X_train)
827
y_test_pred = forest.predict(X_test)
828

829

830
mae_train = mean_absolute_error(y_train, y_train_pred)
831
mae_test = mean_absolute_error(y_test, y_test_pred)
832
print(f'MAE train: {mae_train:.2f}')
833
print(f'MAE test: {mae_test:.2f}')
834

835

836
r2_train = r2_score(y_train, y_train_pred)
837
r2_test =r2_score(y_test, y_test_pred)
838
print(f'R^2 train: {r2_train:.2f}')
839
print(f'R^2 test: {r2_test:.2f}')
840

841

842

843

844
x_max = np.max([np.max(y_train_pred), np.max(y_test_pred)])
845
x_min = np.min([np.min(y_train_pred), np.min(y_test_pred)])
846

847
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 3), sharey=True)
848

849
ax1.scatter(y_test_pred, y_test_pred - y_test,
850
            c='limegreen', marker='s', edgecolor='white',
851
            label='Test data')
852
ax2.scatter(y_train_pred, y_train_pred - y_train,
853
            c='steelblue', marker='o', edgecolor='white',
854
            label='Training data')
855
ax1.set_ylabel('Residuals')
856

857
for ax in (ax1, ax2):
858
    ax.set_xlabel('Predicted values')
859
    ax.legend(loc='upper left')
860
    ax.hlines(y=0, xmin=x_min-100, xmax=x_max+100, color='black', lw=2)
861

862
plt.tight_layout()
863

864
#plt.savefig('figures/09_16.png', dpi=300)
865
plt.show()
866

867

868

869
# # Summary
870

871
# ...
872

873
# ---
874
# 
875
# Readers may ignore the next cell.
876

877

878

879

880

881

882

883

884

885

886
Product

Resources

Company