# coding: utf-8


from packaging import version
import sys
from python_environment_check import check_packages
import sklearn  # needed for the sklearn.__version__ checks further below
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
# from scipy.cluster.hierarchy import set_link_color_palette
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN


# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:


sys.path.insert(0, '..')


# Check recommended package versions:


d = {
    'numpy': '1.21.2',
    'scipy': '1.7.0',
    'matplotlib': '3.4.3',
    'sklearn': '1.0',
    'pandas': '1.3.2',
}
check_packages(d)


# # Chapter 10 - Working with Unlabeled Data – Clustering Analysis

# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).

# *The use of `watermark` is optional. You can install this Jupyter extension via*
#
#     conda install watermark -c conda-forge
#
# or
#
#     pip install watermark
#
# *For more information, please see: https://github.com/rasbt/watermark.*
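
# *A minimal, optional sketch (not the author's original call; the package list below is just an illustration): the `watermark` package can also be invoked programmatically to record versions.*

try:
    from watermark import watermark
    # print the Python version and the versions of the main packages used in this chapter
    print(watermark(python=True, packages='numpy,scipy,matplotlib,sklearn,pandas'))
except ImportError:
    # watermark is optional; skip silently if it is not installed
    pass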


# ### Overview

# - [Grouping objects by similarity using k-means](#Grouping-objects-by-similarity-using-k-means)
#   - [K-means clustering using scikit-learn](#K-means-clustering-using-scikit-learn)
#   - [A smarter way of placing the initial cluster centroids using k-means++](#A-smarter-way-of-placing-the-initial-cluster-centroids-using-k-means++)
#   - [Hard versus soft clustering](#Hard-versus-soft-clustering)
#   - [Using the elbow method to find the optimal number of clusters](#Using-the-elbow-method-to-find-the-optimal-number-of-clusters)
#   - [Quantifying the quality of clustering via silhouette plots](#Quantifying-the-quality-of-clustering-via-silhouette-plots)
# - [Organizing clusters as a hierarchical tree](#Organizing-clusters-as-a-hierarchical-tree)
#   - [Grouping clusters in bottom-up fashion](#Grouping-clusters-in-bottom-up-fashion)
#   - [Performing hierarchical clustering on a distance matrix](#Performing-hierarchical-clustering-on-a-distance-matrix)
#   - [Attaching dendrograms to a heat map](#Attaching-dendrograms-to-a-heat-map)
#   - [Applying agglomerative clustering via scikit-learn](#Applying-agglomerative-clustering-via-scikit-learn)
# - [Locating regions of high density via DBSCAN](#Locating-regions-of-high-density-via-DBSCAN)
# - [Summary](#Summary)


# # Grouping objects by similarity using k-means

# ## K-means clustering using scikit-learn


X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)


plt.scatter(X[:, 0], X[:, 1],
            c='white', marker='o', edgecolor='black', s=50)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.grid()
plt.tight_layout()
#plt.savefig('figures/10_01.png', dpi=300)
plt.show()


km = KMeans(n_clusters=3,
            init='random',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)

y_km = km.fit_predict(X)


plt.scatter(X[y_km == 0, 0],
            X[y_km == 0, 1],
            s=50, c='lightgreen',
            marker='s', edgecolor='black',
            label='Cluster 1')
plt.scatter(X[y_km == 1, 0],
            X[y_km == 1, 1],
            s=50, c='orange',
            marker='o', edgecolor='black',
            label='Cluster 2')
plt.scatter(X[y_km == 2, 0],
            X[y_km == 2, 1],
            s=50, c='lightblue',
            marker='v', edgecolor='black',
            label='Cluster 3')
plt.scatter(km.cluster_centers_[:, 0],
            km.cluster_centers_[:, 1],
            s=250, marker='*',
            c='red', edgecolor='black',
            label='Centroids')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
#plt.savefig('figures/10_02.png', dpi=300)
plt.show()


# ## A smarter way of placing the initial cluster centroids using k-means++

# ...
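
# *A minimal illustration (not part of the original text, which is elided above): in scikit-learn, k-means++ seeding is selected via the `init` parameter of `KMeans` (it is also the default). The variable name `km_pp` is used here only for illustration.*

km_pp = KMeans(n_clusters=3,
               init='k-means++',  # spread the initial centroids apart instead of picking them purely at random
               n_init=10,
               random_state=0)
y_km_pp = km_pp.fit_predict(X)
print(f'k-means++ inertia: {km_pp.inertia_:.2f}')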


# ## Hard versus soft clustering

# ...
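
# *Hard clustering assigns each sample to exactly one cluster (as `fit_predict` does above), whereas soft (fuzzy) clustering assigns membership weights. The following is a minimal sketch, not the book's implementation: fuzzy-c-means-style membership weights computed for the fixed centroids found by k-means above, with the common fuzzifier m=2. The names `dist`, `m`, and `memberships` are illustrative.*

# pairwise Euclidean distances between each sample and each centroid, shape (n_samples, n_clusters)
dist = np.linalg.norm(X[:, np.newaxis, :] - km.cluster_centers_[np.newaxis, :, :], axis=2)
dist = np.maximum(dist, 1e-12)  # avoid division by zero if a sample coincides with a centroid

m = 2.0  # fuzzifier; larger values give softer (more uniform) memberships
ratios = dist[:, :, np.newaxis] / dist[:, np.newaxis, :]
memberships = 1.0 / (ratios ** (2.0 / (m - 1.0))).sum(axis=2)  # rows sum to 1

print('Soft membership weights of the first 5 samples:')
print(np.round(memberships[:5], 3))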


# ## Using the elbow method to find the optimal number of clusters


print(f'Distortion: {km.inertia_:.2f}')


distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)
plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
#plt.savefig('figures/10_03.png', dpi=300)
plt.show()


# ## Quantifying the quality of clustering via silhouette plots


km = KMeans(n_clusters=3,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    # plot the sorted silhouette coefficients of each cluster as a stacked horizontal bar
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
             edgecolor='none', color=color)

    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)

# draw the average silhouette coefficient as a reference line
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")

plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')

plt.tight_layout()
#plt.savefig('figures/10_04.png', dpi=300)
plt.show()


# Comparison to "bad" clustering:


km = KMeans(n_clusters=2,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km == 0, 0],
            X[y_km == 0, 1],
            s=50,
            c='lightgreen',
            edgecolor='black',
            marker='s',
            label='Cluster 1')
plt.scatter(X[y_km == 1, 0],
            X[y_km == 1, 1],
            s=50,
            c='orange',
            edgecolor='black',
            marker='o',
            label='Cluster 2')

plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            s=250, marker='*', c='red', label='Centroids')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.legend()
plt.grid()
plt.tight_layout()
#plt.savefig('figures/10_05.png', dpi=300)
plt.show()


cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
             edgecolor='none', color=color)

    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)

silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")

plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')

plt.tight_layout()
#plt.savefig('figures/10_06.png', dpi=300)
plt.show()


# # Organizing clusters as a hierarchical tree

# ## Grouping clusters in bottom-up fashion


np.random.seed(123)

variables = ['X', 'Y', 'Z']
labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']

X = np.random.random_sample([5, 3])*10
df = pd.DataFrame(X, columns=variables, index=labels)
df


# ## Performing hierarchical clustering on a distance matrix


row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')),
                        columns=labels,
                        index=labels)
row_dist


# We can either pass a condensed distance matrix (upper triangular) from the `pdist` function, or we can pass the "original" data array and define the `metric='euclidean'` argument in `linkage`. However, we should not pass the squareform distance matrix, because `linkage` would then treat its rows as observations and compute different distance values, even though the overall clustering may turn out the same.


# 1. Incorrect approach: squareform distance matrix


row_clusters = linkage(row_dist, method='complete', metric='euclidean')
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2',
                      'distance', 'no. of items in clust.'],
             index=[f'cluster {(i + 1)}'
                    for i in range(row_clusters.shape[0])])


# 2. Correct approach: condensed distance matrix

row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2',
                      'distance', 'no. of items in clust.'],
             index=[f'cluster {(i + 1)}'
                    for i in range(row_clusters.shape[0])])


# 3. Correct approach: input matrix

row_clusters = linkage(df.values, method='complete', metric='euclidean')
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2',
                      'distance', 'no. of items in clust.'],
             index=[f'cluster {(i + 1)}'
                    for i in range(row_clusters.shape[0])])


# make dendrogram black (part 1/2)
# set_link_color_palette(['black'])

row_dendr = dendrogram(row_clusters,
                       labels=labels,
                       # make dendrogram black (part 2/2)
                       # color_threshold=np.inf
                       )
plt.tight_layout()
plt.ylabel('Euclidean distance')
#plt.savefig('figures/10_11.png', dpi=300,
#            bbox_inches='tight')
plt.show()


# ## Attaching dendrograms to a heat map


# plot row dendrogram
fig = plt.figure(figsize=(8, 8), facecolor='white')
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])

# note: for matplotlib < v1.5.1, please use orientation='right'
row_dendr = dendrogram(row_clusters, orientation='left')

# reorder data with respect to clustering
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]

axd.set_xticks([])
axd.set_yticks([])

# remove axes spines from dendrogram
for i in axd.spines.values():
    i.set_visible(False)

# plot heatmap
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])  # x-pos, y-pos, width, height
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')
fig.colorbar(cax)

axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))

#plt.savefig('figures/10_12.png', dpi=300)
plt.show()


# ## Applying agglomerative clustering via scikit-learn


# the `affinity` parameter was renamed to `metric` in scikit-learn 1.2
if version.parse(sklearn.__version__) > version.parse("1.2"):
    ac = AgglomerativeClustering(n_clusters=3,
                                 metric="euclidean",
                                 linkage="complete")
else:
    ac = AgglomerativeClustering(n_clusters=3,
                                 affinity="euclidean",
                                 linkage="complete")

labels = ac.fit_predict(X)
print(f'Cluster labels: {labels}')


if version.parse(sklearn.__version__) > version.parse("1.2"):
    ac = AgglomerativeClustering(n_clusters=2,
                                 metric="euclidean",
                                 linkage="complete")
else:
    ac = AgglomerativeClustering(n_clusters=2,
                                 affinity="euclidean",
                                 linkage="complete")

labels = ac.fit_predict(X)
print(f'Cluster labels: {labels}')


# # Locating regions of high density via DBSCAN


X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
plt.scatter(X[:, 0], X[:, 1])

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.tight_layout()
#plt.savefig('figures/10_14.png', dpi=300)
plt.show()


# K-means and hierarchical clustering:


f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))

km = KMeans(n_clusters=2, random_state=0)
y_km = km.fit_predict(X)
ax1.scatter(X[y_km == 0, 0], X[y_km == 0, 1],
            edgecolor='black',
            c='lightblue', marker='o', s=40, label='Cluster 1')
ax1.scatter(X[y_km == 1, 0], X[y_km == 1, 1],
            edgecolor='black',
            c='red', marker='s', s=40, label='Cluster 2')
ax1.set_title('K-means clustering')

ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')

# use `metric` instead of the deprecated `affinity` on newer scikit-learn versions
if version.parse(sklearn.__version__) > version.parse("1.2"):
    ac = AgglomerativeClustering(n_clusters=2,
                                 metric='euclidean',
                                 linkage='complete')
else:
    ac = AgglomerativeClustering(n_clusters=2,
                                 affinity='euclidean',
                                 linkage='complete')
y_ac = ac.fit_predict(X)
ax2.scatter(X[y_ac == 0, 0], X[y_ac == 0, 1], c='lightblue',
            edgecolor='black',
            marker='o', s=40, label='Cluster 1')
ax2.scatter(X[y_ac == 1, 0], X[y_ac == 1, 1], c='red',
            edgecolor='black',
            marker='s', s=40, label='Cluster 2')
ax2.set_title('Agglomerative clustering')

ax2.set_xlabel('Feature 1')
ax2.set_ylabel('Feature 2')

plt.legend()
plt.tight_layout()
#plt.savefig('figures/10_15.png', dpi=300)
plt.show()


# Density-based clustering:


db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)
plt.scatter(X[y_db == 0, 0], X[y_db == 0, 1],
            c='lightblue', marker='o', s=40,
            edgecolor='black',
            label='Cluster 1')
plt.scatter(X[y_db == 1, 0], X[y_db == 1, 1],
            c='red', marker='s', s=40,
            edgecolor='black',
            label='Cluster 2')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.legend()
plt.tight_layout()
#plt.savefig('figures/10_16.png', dpi=300)
plt.show()


# # Summary

# ...

# ---
#
# Readers may ignore the next cell.