# coding: utf-8


from packaging import version
import sys
from python_environment_check import check_packages
import sklearn  # needed for the sklearn.__version__ checks further below
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
# from scipy.cluster.hierarchy import set_link_color_palette
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN


# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:


sys.path.insert(0, '..')


# Check recommended package versions:


d = {
    'numpy': '1.21.2',
    'scipy': '1.7.0',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2',
}
check_packages(d)


# # Chapter 10 - Working with Unlabeled Data – Clustering Analysis

# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).

# *The use of `watermark` is optional. You can install this Jupyter extension via*
#
#     conda install watermark -c conda-forge
#
# or
#
#     pip install watermark
#
# *For more information, please see: https://github.com/rasbt/watermark.*
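
# *A minimal, optional sketch (not the author's original call; the package list below is just an illustration): the `watermark` package can also be invoked programmatically to record versions.*

try:
    from watermark import watermark
    # print the Python version and the versions of the main packages used in this chapter
    print(watermark(python=True, packages='numpy,scipy,matplotlib,sklearn,pandas'))
except ImportError:
    # watermark is optional; skip silently if it is not installed
    pass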


# ### Overview

# - [Grouping objects by similarity using k-means](#Grouping-objects-by-similarity-using-k-means)
#   - [K-means clustering using scikit-learn](#K-means-clustering-using-scikit-learn)
#   - [A smarter way of placing the initial cluster centroids using k-means++](#A-smarter-way-of-placing-the-initial-cluster-centroids-using-k-means++)
#   - [Hard versus soft clustering](#Hard-versus-soft-clustering)
#   - [Using the elbow method to find the optimal number of clusters](#Using-the-elbow-method-to-find-the-optimal-number-of-clusters)
#   - [Quantifying the quality of clustering via silhouette plots](#Quantifying-the-quality-of-clustering-via-silhouette-plots)
# - [Organizing clusters as a hierarchical tree](#Organizing-clusters-as-a-hierarchical-tree)
#   - [Grouping clusters in bottom-up fashion](#Grouping-clusters-in-bottom-up-fashion)
#   - [Performing hierarchical clustering on a distance matrix](#Performing-hierarchical-clustering-on-a-distance-matrix)
#   - [Attaching dendrograms to a heat map](#Attaching-dendrograms-to-a-heat-map)
#   - [Applying agglomerative clustering via scikit-learn](#Applying-agglomerative-clustering-via-scikit-learn)
# - [Locating regions of high density via DBSCAN](#Locating-regions-of-high-density-via-DBSCAN)
# - [Summary](#Summary)


# # Grouping objects by similarity using k-means

# ## K-means clustering using scikit-learn


X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)


plt.scatter(X[:, 0], X[:, 1],
            c='white', marker='o', edgecolor='black', s=50)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.grid()
plt.tight_layout()
#plt.savefig('figures/10_01.png', dpi=300)
plt.show()


km = KMeans(n_clusters=3,
            init='random',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)

y_km = km.fit_predict(X)


plt.scatter(X[y_km == 0, 0],
            X[y_km == 0, 1],
            s=50, c='lightgreen',
            marker='s', edgecolor='black',
            label='Cluster 1')
plt.scatter(X[y_km == 1, 0],
            X[y_km == 1, 1],
            s=50, c='orange',
            marker='o', edgecolor='black',
            label='Cluster 2')
plt.scatter(X[y_km == 2, 0],
            X[y_km == 2, 1],
            s=50, c='lightblue',
            marker='v', edgecolor='black',
            label='Cluster 3')
plt.scatter(km.cluster_centers_[:, 0],
            km.cluster_centers_[:, 1],
            s=250, marker='*',
            c='red', edgecolor='black',
            label='Centroids')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
#plt.savefig('figures/10_02.png', dpi=300)
plt.show()


# ## A smarter way of placing the initial cluster centroids using k-means++

# ...
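
# *A minimal illustration (not part of the original text, which is elided above): in scikit-learn, k-means++ seeding is selected via the `init` parameter of `KMeans` (it is also the default). The variable name `km_pp` is used here only for illustration.*

km_pp = KMeans(n_clusters=3,
               init='k-means++',  # spread the initial centroids apart instead of picking them purely at random
               n_init=10,
               random_state=0)
y_km_pp = km_pp.fit_predict(X)
print(f'k-means++ inertia: {km_pp.inertia_:.2f}')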


# ## Hard versus soft clustering

# ...
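
# *Hard clustering assigns each sample to exactly one cluster (as `fit_predict` does above), whereas soft (fuzzy) clustering assigns membership weights. The following is a minimal sketch, not the book's implementation: fuzzy-c-means-style membership weights computed for the fixed centroids found by k-means above, with the common fuzzifier m=2. The names `dist`, `m`, and `memberships` are illustrative.*

# pairwise Euclidean distances between each sample and each centroid, shape (n_samples, n_clusters)
dist = np.linalg.norm(X[:, np.newaxis, :] - km.cluster_centers_[np.newaxis, :, :], axis=2)
dist = np.maximum(dist, 1e-12)  # avoid division by zero if a sample coincides with a centroid

m = 2.0  # fuzzifier; larger values give softer (more uniform) memberships
ratios = dist[:, :, np.newaxis] / dist[:, np.newaxis, :]
memberships = 1.0 / (ratios ** (2.0 / (m - 1.0))).sum(axis=2)  # rows sum to 1

print('Soft membership weights of the first 5 samples:')
print(np.round(memberships[:5], 3))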


# ## Using the elbow method to find the optimal number of clusters


print(f'Distortion: {km.inertia_:.2f}')


distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)
plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
#plt.savefig('figures/10_03.png', dpi=300)
plt.show()


# ## Quantifying the quality of clustering via silhouette plots


km = KMeans(n_clusters=3,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    # plot the sorted silhouette coefficients of each cluster as a stacked horizontal bar
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
             edgecolor='none', color=color)

    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)

# draw the average silhouette coefficient as a reference line
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")

plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')

plt.tight_layout()
#plt.savefig('figures/10_04.png', dpi=300)
plt.show()


# Comparison to "bad" clustering:


km = KMeans(n_clusters=2,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km == 0, 0],
            X[y_km == 0, 1],
            s=50,
            c='lightgreen',
            edgecolor='black',
            marker='s',
            label='Cluster 1')
plt.scatter(X[y_km == 1, 0],
            X[y_km == 1, 1],
            s=50,
            c='orange',
            edgecolor='black',
            marker='o',
            label='Cluster 2')

plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            s=250, marker='*', c='red', label='Centroids')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.legend()
plt.grid()
plt.tight_layout()
#plt.savefig('figures/10_05.png', dpi=300)
plt.show()


cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
             edgecolor='none', color=color)

    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)

silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")

plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')

plt.tight_layout()
#plt.savefig('figures/10_06.png', dpi=300)
plt.show()


# # Organizing clusters as a hierarchical tree

# ## Grouping clusters in bottom-up fashion


np.random.seed(123)

variables = ['X', 'Y', 'Z']
labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']

X = np.random.random_sample([5, 3])*10
df = pd.DataFrame(X, columns=variables, index=labels)
df


# ## Performing hierarchical clustering on a distance matrix


row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')),
                        columns=labels,
                        index=labels)
row_dist


# We can either pass a condensed distance matrix (upper triangular) from the `pdist` function, or we can pass the "original" data array and define the `metric='euclidean'` argument in `linkage`. However, we should not pass the squareform distance matrix, because `linkage` would then treat its rows as observations and compute different distance values, even though the overall clustering may turn out the same.


# 1. Incorrect approach: squareform distance matrix


row_clusters = linkage(row_dist, method='complete', metric='euclidean')
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2',
                      'distance', 'no. of items in clust.'],
             index=[f'cluster {(i + 1)}'
                    for i in range(row_clusters.shape[0])])


# 2. Correct approach: condensed distance matrix

row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2',
                      'distance', 'no. of items in clust.'],
             index=[f'cluster {(i + 1)}'
                    for i in range(row_clusters.shape[0])])


# 3. Correct approach: input matrix

row_clusters = linkage(df.values, method='complete', metric='euclidean')
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2',
                      'distance', 'no. of items in clust.'],
             index=[f'cluster {(i + 1)}'
                    for i in range(row_clusters.shape[0])])


# make dendrogram black (part 1/2)
# set_link_color_palette(['black'])

row_dendr = dendrogram(row_clusters,
                       labels=labels,
                       # make dendrogram black (part 2/2)
                       # color_threshold=np.inf
                       )
plt.tight_layout()
plt.ylabel('Euclidean distance')
#plt.savefig('figures/10_11.png', dpi=300,
#            bbox_inches='tight')
plt.show()


# ## Attaching dendrograms to a heat map


# plot row dendrogram
fig = plt.figure(figsize=(8, 8), facecolor='white')
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])

# note: for matplotlib < v1.5.1, please use orientation='right'
row_dendr = dendrogram(row_clusters, orientation='left')

# reorder data with respect to clustering
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]

axd.set_xticks([])
axd.set_yticks([])

# remove axes spines from dendrogram
for i in axd.spines.values():
    i.set_visible(False)

# plot heatmap
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])  # x-pos, y-pos, width, height
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')
fig.colorbar(cax)

axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))

#plt.savefig('figures/10_12.png', dpi=300)
plt.show()


# ## Applying agglomerative clustering via scikit-learn


# the `affinity` parameter was renamed to `metric` in scikit-learn 1.2
if version.parse(sklearn.__version__) > version.parse("1.2"):
    ac = AgglomerativeClustering(n_clusters=3,
                                 metric="euclidean",
                                 linkage="complete")
else:
    ac = AgglomerativeClustering(n_clusters=3,
                                 affinity="euclidean",
                                 linkage="complete")

labels = ac.fit_predict(X)
print(f'Cluster labels: {labels}')


if version.parse(sklearn.__version__) > version.parse("1.2"):
    ac = AgglomerativeClustering(n_clusters=2,
                                 metric="euclidean",
                                 linkage="complete")
else:
    ac = AgglomerativeClustering(n_clusters=2,
                                 affinity="euclidean",
                                 linkage="complete")

labels = ac.fit_predict(X)
print(f'Cluster labels: {labels}')


# # Locating regions of high density via DBSCAN


X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
plt.scatter(X[:, 0], X[:, 1])

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.tight_layout()
#plt.savefig('figures/10_14.png', dpi=300)
plt.show()


# K-means and hierarchical clustering:


f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))

km = KMeans(n_clusters=2, random_state=0)
y_km = km.fit_predict(X)
ax1.scatter(X[y_km == 0, 0], X[y_km == 0, 1],
            edgecolor='black',
            c='lightblue', marker='o', s=40, label='Cluster 1')
ax1.scatter(X[y_km == 1, 0], X[y_km == 1, 1],
            edgecolor='black',
            c='red', marker='s', s=40, label='Cluster 2')
ax1.set_title('K-means clustering')

ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')

# use `metric` instead of the deprecated `affinity` on newer scikit-learn versions
if version.parse(sklearn.__version__) > version.parse("1.2"):
    ac = AgglomerativeClustering(n_clusters=2,
                                 metric='euclidean',
                                 linkage='complete')
else:
    ac = AgglomerativeClustering(n_clusters=2,
                                 affinity='euclidean',
                                 linkage='complete')
y_ac = ac.fit_predict(X)
ax2.scatter(X[y_ac == 0, 0], X[y_ac == 0, 1], c='lightblue',
            edgecolor='black',
            marker='o', s=40, label='Cluster 1')
ax2.scatter(X[y_ac == 1, 0], X[y_ac == 1, 1], c='red',
            edgecolor='black',
            marker='s', s=40, label='Cluster 2')
ax2.set_title('Agglomerative clustering')

ax2.set_xlabel('Feature 1')
ax2.set_ylabel('Feature 2')

plt.legend()
plt.tight_layout()
#plt.savefig('figures/10_15.png', dpi=300)
plt.show()


# Density-based clustering:


db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)
plt.scatter(X[y_db == 0, 0], X[y_db == 0, 1],
            c='lightblue', marker='o', s=40,
            edgecolor='black',
            label='Cluster 1')
plt.scatter(X[y_db == 1, 0], X[y_db == 1, 1],
            c='red', marker='s', s=40,
            edgecolor='black',
            label='Cluster 2')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.legend()
plt.tight_layout()
#plt.savefig('figures/10_16.png', dpi=300)
plt.show()


# # Summary

# ...

# ---
#
# Readers may ignore the next cell.