# coding: utf-8


import sys
from python_environment_check import check_packages
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.plotting import scatterplotmatrix
import numpy as np
from mlxtend.plotting import heatmap
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')

# Check recommended package versions:

d = {
    'numpy': '1.21.2',
    'mlxtend': '0.19.0',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2',
}

check_packages(d)

# # Chapter 09 - Predicting Continuous Target Variables with Regression Analysis

# ### Overview

# - [Introducing linear regression](#Introducing-linear-regression)
# - [Simple linear regression](#Simple-linear-regression)
# - [Exploring the Ames Housing dataset](#Exploring-the-Ames-Housing-dataset)
# - [Loading the Ames Housing dataset into a data frame](#Loading-the-Ames-Housing-dataset-into-a-data-frame)
# - [Visualizing the important characteristics of a dataset](#Visualizing-the-important-characteristics-of-a-dataset)
# - [Implementing an ordinary least squares linear regression model](#Implementing-an-ordinary-least-squares-linear-regression-model)
# - [Solving regression for regression parameters with gradient descent](#Solving-regression-for-regression-parameters-with-gradient-descent)
# - [Estimating the coefficient of a regression model via scikit-learn](#Estimating-the-coefficient-of-a-regression-model-via-scikit-learn)
# - [Fitting a robust regression model using RANSAC](#Fitting-a-robust-regression-model-using-RANSAC)
# - [Evaluating the performance of linear regression models](#Evaluating-the-performance-of-linear-regression-models)
# - [Using regularized methods for regression](#Using-regularized-methods-for-regression)
# - [Turning a linear regression model into a curve - polynomial regression](#Turning-a-linear-regression-model-into-a-curve---polynomial-regression)
# - [Modeling nonlinear relationships in the Ames Housing dataset](#Modeling-nonlinear-relationships-in-the-Ames-Housing-dataset)
# - [Dealing with nonlinear relationships using random forests](#Dealing-with-nonlinear-relationships-using-random-forests)
# - [Decision tree regression](#Decision-tree-regression)
# - [Random forest regression](#Random-forest-regression)
# - [Summary](#Summary)

# # Introducing linear regression

# ## Simple linear regression

# ## Multiple linear regression

# # Exploring the Ames Housing dataset

# ## Loading the Ames Housing dataset into a data frame

# - Dataset source: http://jse.amstat.org/v19n3/decock/AmesHousing.txt
# - Dataset documentation: http://jse.amstat.org/v19n3/decock/DataDocumentation.txt
# - Dataset write-up: http://jse.amstat.org/v19n3/decock.pdf

# - `'Overall Qual'`: Rates the overall material and finish of the house
#
#   10 Very Excellent
#   9 Excellent
#   8 Very Good
#   7 Good
#   6 Above Average
#   5 Average
#   4 Below Average
#   3 Fair
#   2 Poor
#   1 Very Poor
#
# - `'Overall Cond'`: Rates the overall condition of the house
#
#   10 Very Excellent
#   9 Excellent
#   8 Very Good
#   7 Good
#   6 Above Average
#   5 Average
#   4 Below Average
#   3 Fair
#   2 Poor
#   1 Very Poor
# - `'Gr Liv Area'`: Above grade (ground) living area square feet
# - `'Central Air'`: Central air conditioning
#
#   N No
#   Y Yes
#
# - `'Total Bsmt SF'`: Total square feet of basement area
# - `'SalePrice'`: Sale price in U.S. dollars

columns = ['Overall Qual', 'Overall Cond', 'Gr Liv Area',
           'Central Air', 'Total Bsmt SF', 'SalePrice']

df = pd.read_csv('http://jse.amstat.org/v19n3/decock/AmesHousing.txt',
                 sep='\t',
                 usecols=columns)

df.head()

df.shape

df['Central Air'] = df['Central Air'].map({'N': 0, 'Y': 1})

df.isnull().sum()

# remove rows that contain missing values

df = df.dropna(axis=0)
df.isnull().sum()

# ## Visualizing the important characteristics of a dataset

scatterplotmatrix(df.values, figsize=(12, 10),
                  names=df.columns, alpha=0.5)
plt.tight_layout()
#plt.savefig('figures/09_04.png', dpi=300)
plt.show()

cm = np.corrcoef(df.values.T)
hm = heatmap(cm, row_names=df.columns, column_names=df.columns)

plt.tight_layout()
#plt.savefig('figures/09_05.png', dpi=300)
plt.show()

# # Implementing an ordinary least squares linear regression model

# ...

# ## Solving regression for regression parameters with gradient descent

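# For reference, the `LinearRegressionGD` class below minimizes the mean squared
# error loss L(w, b) = (1/n) * sum_i (y_i - (w^T x_i + b))^2 by full-batch
# gradient descent, using the update rules
#
#     w := w + eta * (2/n) * X^T (y - y_hat)
#     b := b + eta * (2/n) * sum_i (y_i - y_hat_i)
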
class LinearRegressionGD:
    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        rgen = np.random.RandomState(self.random_state)
        # initialize with small random weights and a zero bias
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=X.shape[1])
        self.b_ = np.array([0.])
        self.losses_ = []

        for i in range(self.n_iter):
            output = self.net_input(X)
            errors = (y - output)
            # full-batch gradient descent step on the MSE loss
            self.w_ += self.eta * 2.0 * X.T.dot(errors) / X.shape[0]
            self.b_ += self.eta * 2.0 * errors.mean()
            loss = (errors**2).mean()
            self.losses_.append(loss)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_) + self.b_

    def predict(self, X):
        return self.net_input(X)

X = df[['Gr Liv Area']].values
y = df['SalePrice'].values

sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()

lr = LinearRegressionGD(eta=0.1)
lr.fit(X_std, y_std)

plt.plot(range(1, lr.n_iter+1), lr.losses_)
plt.ylabel('MSE')
plt.xlabel('Epoch')
plt.tight_layout()
#plt.savefig('figures/09_06.png', dpi=300)
plt.show()

def lin_regplot(X, y, model):
    plt.scatter(X, y, c='steelblue', edgecolor='white', s=70)
    plt.plot(X, model.predict(X), color='black', lw=2)
    return

lin_regplot(X_std, y_std, lr)
plt.xlabel('Living area above ground (standardized)')
plt.ylabel('Sale price (standardized)')

#plt.savefig('figures/09_07.png', dpi=300)
plt.show()

feature_std = sc_x.transform(np.array([[2500]]))
target_std = lr.predict(feature_std)
target_reverted = sc_y.inverse_transform(target_std.reshape(-1, 1))
print(f'Sale price: ${target_reverted.flatten()[0]:.2f}')

print(f'Slope: {lr.w_[0]:.3f}')
print(f'Intercept: {lr.b_[0]:.3f}')

# ## Estimating the coefficient of a regression model via scikit-learn

slr = LinearRegression()
slr.fit(X, y)
y_pred = slr.predict(X)
print(f'Slope: {slr.coef_[0]:.3f}')
print(f'Intercept: {slr.intercept_:.3f}')

lin_regplot(X, y, slr)
plt.xlabel('Living area above ground in square feet')
plt.ylabel('Sale price in U.S. dollars')

plt.tight_layout()
#plt.savefig('figures/09_08.png', dpi=300)
plt.show()

# **Normal Equations** alternative:

# closed-form solution: w = (X^T X)^{-1} X^T y

# adding a column vector of "ones"
Xb = np.hstack((np.ones((X.shape[0], 1)), X))
z = np.linalg.inv(np.dot(Xb.T, Xb))
w = np.dot(z, np.dot(Xb.T, y))

print(f'Slope: {w[1]:.3f}')
print(f'Intercept: {w[0]:.3f}')

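# As a side note (not part of the original script), the same solution can be
# obtained without an explicit matrix inverse via NumPy's least-squares solver,
# which is numerically more stable:

w_lstsq, *_ = np.linalg.lstsq(Xb, y, rcond=None)
print(f'Slope: {w_lstsq[1]:.3f}')
print(f'Intercept: {w_lstsq[0]:.3f}')
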
# # Fitting a robust regression model using RANSAC

ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,  # default
                         min_samples=0.95,
                         loss='absolute_error',  # default
                         residual_threshold=None,  # default
                         random_state=123)

ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# evaluate the fitted model over the observed living-area range
line_X = np.arange(X.min(), X.max(), 10)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])

plt.scatter(X[inlier_mask], y[inlier_mask],
            c='steelblue', edgecolor='white',
            marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='limegreen', edgecolor='white',
            marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='black', lw=2)
plt.xlabel('Living area above ground in square feet')
plt.ylabel('Sale price in U.S. dollars')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('figures/09_09.png', dpi=300)
plt.show()

print(f'Slope: {ransac.estimator_.coef_[0]:.3f}')
print(f'Intercept: {ransac.estimator_.intercept_:.3f}')

def mean_absolute_deviation(data):
    # mean absolute deviation around the mean of the data
    return np.mean(np.abs(data - np.mean(data)))

mean_absolute_deviation(y)

ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,  # default
                         min_samples=0.95,
                         loss='absolute_error',  # default
                         residual_threshold=65000,  # non-default value
                         random_state=123)

ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# evaluate the fitted model over the observed living-area range
line_X = np.arange(X.min(), X.max(), 10)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])

plt.scatter(X[inlier_mask], y[inlier_mask],
            c='steelblue', edgecolor='white',
            marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='limegreen', edgecolor='white',
            marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='black', lw=2)
plt.xlabel('Living area above ground in square feet')
plt.ylabel('Sale price in U.S. dollars')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('figures/09_10.png', dpi=300)
plt.show()

print(f'Slope: {ransac.estimator_.coef_[0]:.3f}')
print(f'Intercept: {ransac.estimator_.intercept_:.3f}')

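# As a quick check (not part of the original script), we can count how many
# samples fall inside the 65,000-dollar residual threshold:

print(f'Inliers: {inlier_mask.sum()}, outliers: {outlier_mask.sum()}')
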
# # Evaluating the performance of linear regression models

target = 'SalePrice'
features = df.columns[df.columns != target]

X = df[features].values
y = df[target].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)

slr = LinearRegression()

slr.fit(X_train, y_train)
y_train_pred = slr.predict(X_train)
y_test_pred = slr.predict(X_test)

x_max = np.max([np.max(y_train_pred), np.max(y_test_pred)])
x_min = np.min([np.min(y_train_pred), np.min(y_test_pred)])

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 3), sharey=True)

ax1.scatter(y_test_pred, y_test_pred - y_test,
            c='limegreen', marker='s', edgecolor='white',
            label='Test data')
ax2.scatter(y_train_pred, y_train_pred - y_train,
            c='steelblue', marker='o', edgecolor='white',
            label='Training data')
ax1.set_ylabel('Residuals')

for ax in (ax1, ax2):
    ax.set_xlabel('Predicted values')
    ax.legend(loc='upper left')
    ax.hlines(y=0, xmin=x_min-100, xmax=x_max+100, color='black', lw=2)

plt.tight_layout()

#plt.savefig('figures/09_11.png', dpi=300)
plt.show()

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'MSE train: {mse_train:.2f}')
print(f'MSE test: {mse_test:.2f}')

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f'MAE train: {mae_train:.2f}')
print(f'MAE test: {mae_test:.2f}')

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print(f'R^2 train: {r2_train:.2f}')
print(f'R^2 test: {r2_test:.2f}')

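# As a quick cross-check (not part of the original script), R^2 can be
# recovered from the MSE and the variance of the target values,
# R^2 = 1 - MSE / Var(y):

print(f'R^2 test (manual): {1 - mse_test / np.var(y_test):.2f}')
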
# # Using regularized methods for regression

lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
print(lasso.coef_)

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
print(f'MSE train: {train_mse:.3f}, test: {test_mse:.3f}')

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
print(f'R^2 train: {train_r2:.3f}, test: {test_r2:.3f}')

# Ridge regression:

ridge = Ridge(alpha=1.0)

# LASSO regression:

lasso = Lasso(alpha=1.0)

# Elastic Net regression:

elanet = ElasticNet(alpha=1.0, l1_ratio=0.5)

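# The estimators above are only instantiated; as a minimal sketch (the alpha
# values are illustrative, not tuned, and this is not part of the original
# script), they can be fitted and compared on the same train/test split used
# for the Lasso example:

for name, model in [('Ridge', ridge), ('ElasticNet', elanet)]:
    model.fit(X_train, y_train)
    score = r2_score(y_test, model.predict(X_test))
    print(f'{name} test R^2: {score:.3f}')
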
# # Turning a linear regression model into a curve - polynomial regression

X = np.array([258.0, 270.0, 294.0,
              320.0, 342.0, 368.0,
              396.0, 446.0, 480.0, 586.0])\
              [:, np.newaxis]

y = np.array([236.4, 234.4, 252.8,
              298.6, 314.2, 342.2,
              360.8, 368.0, 391.2,
              390.8])

lr = LinearRegression()
pr = LinearRegression()
quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)

# fit linear features
lr.fit(X, y)
X_fit = np.arange(250, 600, 10)[:, np.newaxis]
y_lin_fit = lr.predict(X_fit)

# fit quadratic features
pr.fit(X_quad, y)
y_quad_fit = pr.predict(quadratic.fit_transform(X_fit))

# plot results
plt.scatter(X, y, label='Training points')
plt.plot(X_fit, y_lin_fit, label='Linear fit', linestyle='--')
plt.plot(X_fit, y_quad_fit, label='Quadratic fit')
plt.xlabel('Explanatory variable')
plt.ylabel('Predicted or known target values')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('figures/09_12.png', dpi=300)
plt.show()

y_lin_pred = lr.predict(X)
y_quad_pred = pr.predict(X_quad)

mse_lin = mean_squared_error(y, y_lin_pred)
mse_quad = mean_squared_error(y, y_quad_pred)
print(f'Training MSE linear: {mse_lin:.3f}'
      f', quadratic: {mse_quad:.3f}')

r2_lin = r2_score(y, y_lin_pred)
r2_quad = r2_score(y, y_quad_pred)
print(f'Training R^2 linear: {r2_lin:.3f}'
      f', quadratic: {r2_quad:.3f}')

# ## Modeling nonlinear relationships in the Ames Housing dataset

X = df[['Gr Liv Area']].values
y = df['SalePrice'].values

X = X[(df['Gr Liv Area'] < 4000)]
y = y[(df['Gr Liv Area'] < 4000)]

regr = LinearRegression()

# create quadratic features
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# fit features
X_fit = np.arange(X.min()-1, X.max()+2, 1)[:, np.newaxis]

regr = regr.fit(X, y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

regr = regr.fit(X_quad, y)
y_quad_fit = regr.predict(quadratic.fit_transform(X_fit))
quadratic_r2 = r2_score(y, regr.predict(X_quad))

regr = regr.fit(X_cubic, y)
y_cubic_fit = regr.predict(cubic.fit_transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))

# plot results
plt.scatter(X, y, label='Training points', color='lightgray')

plt.plot(X_fit, y_lin_fit,
         label=f'Linear (d=1), $R^2$={linear_r2:.2f}',
         color='blue',
         lw=2,
         linestyle=':')

plt.plot(X_fit, y_quad_fit,
         label=f'Quadratic (d=2), $R^2$={quadratic_r2:.2f}',
         color='red',
         lw=2,
         linestyle='-')

plt.plot(X_fit, y_cubic_fit,
         label=f'Cubic (d=3), $R^2$={cubic_r2:.2f}',
         color='green',
         lw=2,
         linestyle='--')

plt.xlabel('Living area above ground in square feet')
plt.ylabel('Sale price in U.S. dollars')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('figures/09_13.png', dpi=300)
plt.show()

X = df[['Overall Qual']].values
y = df['SalePrice'].values

regr = LinearRegression()

# create quadratic features
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# fit features
X_fit = np.arange(X.min()-1, X.max()+2, 1)[:, np.newaxis]

regr = regr.fit(X, y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

regr = regr.fit(X_quad, y)
y_quad_fit = regr.predict(quadratic.fit_transform(X_fit))
quadratic_r2 = r2_score(y, regr.predict(X_quad))

regr = regr.fit(X_cubic, y)
y_cubic_fit = regr.predict(cubic.fit_transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))

# plot results
plt.scatter(X, y, label='Training points', color='lightgray')

plt.plot(X_fit, y_lin_fit,
         label=f'Linear (d=1), $R^2$={linear_r2:.2f}',
         color='blue',
         lw=2,
         linestyle=':')

plt.plot(X_fit, y_quad_fit,
         label=f'Quadratic (d=2), $R^2$={quadratic_r2:.2f}',
         color='red',
         lw=2,
         linestyle='-')

plt.plot(X_fit, y_cubic_fit,
         label=f'Cubic (d=3), $R^2$={cubic_r2:.2f}',
         color='green',
         lw=2,
         linestyle='--')

plt.xlabel('Overall quality of the house')
plt.ylabel('Sale price in U.S. dollars')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('figures/09_14.png', dpi=300)
plt.show()

# # Dealing with nonlinear relationships using random forests

# ...

# ## Decision tree regression

X = df[['Gr Liv Area']].values
y = df['SalePrice'].values

tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X, y)

# sort the training points by living area so the step function plots cleanly
sort_idx = X.flatten().argsort()

lin_regplot(X[sort_idx], y[sort_idx], tree)
plt.xlabel('Living area above ground in square feet')
plt.ylabel('Sale price in U.S. dollars')

plt.tight_layout()
#plt.savefig('figures/09_15.png', dpi=300)
plt.show()

tree_r2 = r2_score(y, tree.predict(X))
tree_r2

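# As an aside (not part of the original script), the piecewise-constant nature
# of the depth-3 tree can be seen by counting its distinct predictions, which
# is bounded by the number of leaf nodes (at most 2**3 = 8):

print(f'Distinct predicted values: {np.unique(tree.predict(X)).shape[0]}')
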
# ## Random forest regression

target = 'SalePrice'
features = df.columns[df.columns != target]

X = df[features].values
y = df[target].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)

forest = RandomForestRegressor(n_estimators=1000,
                               criterion='squared_error',
                               random_state=1,
                               n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f'MAE train: {mae_train:.2f}')
print(f'MAE test: {mae_test:.2f}')

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print(f'R^2 train: {r2_train:.2f}')
print(f'R^2 test: {r2_test:.2f}')

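# As an optional follow-up (not part of the original script), the fitted forest
# exposes impurity-based feature importances that can be matched back to the
# column names:

for name, importance in zip(features, forest.feature_importances_):
    print(f'{name}: {importance:.3f}')
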
x_max = np.max([np.max(y_train_pred), np.max(y_test_pred)])
x_min = np.min([np.min(y_train_pred), np.min(y_test_pred)])

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 3), sharey=True)

ax1.scatter(y_test_pred, y_test_pred - y_test,
            c='limegreen', marker='s', edgecolor='white',
            label='Test data')
ax2.scatter(y_train_pred, y_train_pred - y_train,
            c='steelblue', marker='o', edgecolor='white',
            label='Training data')
ax1.set_ylabel('Residuals')

for ax in (ax1, ax2):
    ax.set_xlabel('Predicted values')
    ax.legend(loc='upper left')
    ax.hlines(y=0, xmin=x_min-100, xmax=x_max+100, color='black', lw=2)

plt.tight_layout()

#plt.savefig('figures/09_16.png', dpi=300)
plt.show()

# # Summary

# ...

# ---
#
# Readers may ignore the next cell.