CoCalc -- ch08.py

GitHub Repository: rasbt/machine-learning-book
Path: blob/main/ch08/ch08.py
¹²⁴⁷ views
1
# coding: utf-8
2

3

4
import sys
5
from python_environment_check import check_packages
6
import os
7
import tarfile
8
import time
9
import urllib.request
10
import pyprind
11
import pandas as pd
12
import numpy as np
13
from sklearn.feature_extraction.text import CountVectorizer
14
from sklearn.feature_extraction.text import TfidfTransformer
15
import re
16
from nltk.stem.porter import PorterStemmer
17
import nltk
18
from nltk.corpus import stopwords
19
from packaging import version
20
from sklearn.pipeline import Pipeline
21
from sklearn.linear_model import LogisticRegression
22
from sklearn.feature_extraction.text import TfidfVectorizer
23
from sklearn.model_selection import GridSearchCV
24
from sklearn.model_selection import StratifiedKFold
25
from sklearn.model_selection import cross_val_score
26
import gzip
27
from sklearn.feature_extraction.text import HashingVectorizer
28
from sklearn.linear_model import SGDClassifier
29
from distutils.version import LooseVersion as Version
30
from sklearn import __version__ as sklearn_version
31
from sklearn.decomposition import LatentDirichletAllocation
32

33

34
# # Machine Learning with PyTorch and Scikit-Learn  
35
# # -- Code Examples
36

37
# ## Package version checks
38

39
# Add folder to path in order to load from the check_packages.py script:
40

41

42

43
sys.path.insert(0, '..')
44

45

46
# Check recommended package versions:
47

48

49

50

51

52
# Recommended version of each package that is checked for this chapter.
d = dict(
    numpy='1.21.2',
    pandas='1.3.2',
    sklearn='1.0',
    pyprind='2.11.3',
    nltk='3.6',
)
check_packages(d)
60

61

62
# # Chapter 8 - Applying Machine Learning To Sentiment Analysis
63

64
# ### Overview
65

66
# - [Preparing the IMDb movie review data for text processing](#Preparing-the-IMDb-movie-review-data-for-text-processing)
67
#   - [Obtaining the IMDb movie review dataset](#Obtaining-the-IMDb-movie-review-dataset)
68
#   - [Preprocessing the movie dataset into more convenient format](#Preprocessing-the-movie-dataset-into-more-convenient-format)
69
# - [Introducing the bag-of-words model](#Introducing-the-bag-of-words-model)
70
#   - [Transforming words into feature vectors](#Transforming-words-into-feature-vectors)
71
#   - [Assessing word relevancy via term frequency-inverse document frequency](#Assessing-word-relevancy-via-term-frequency-inverse-document-frequency)
72
#   - [Cleaning text data](#Cleaning-text-data)
73
#   - [Processing documents into tokens](#Processing-documents-into-tokens)
74
# - [Training a logistic regression model for document classification](#Training-a-logistic-regression-model-for-document-classification)
75
# - [Working with bigger data – online algorithms and out-of-core learning](#Working-with-bigger-data-–-online-algorithms-and-out-of-core-learning)
76
# - [Topic modeling](#Topic-modeling)
77
#   - [Decomposing text documents with Latent Dirichlet Allocation](#Decomposing-text-documents-with-Latent-Dirichlet-Allocation)
78
#   - [Latent Dirichlet Allocation with scikit-learn](#Latent-Dirichlet-Allocation-with-scikit-learn)
79
# - [Summary](#Summary)
80

81

82
# # Preparing the IMDb movie review data for text processing 
83

84
# ## Obtaining the IMDb movie review dataset
85

86
# The IMDB movie review set can be downloaded from [http://ai.stanford.edu/~amaas/data/sentiment/](http://ai.stanford.edu/~amaas/data/sentiment/).
87
# After downloading the dataset, decompress the files.
88
# 
89
# A) If you are working with Linux or MacOS X, open a new terminal window, `cd` into the download directory, and execute
90
# 
91
# `tar -zxf aclImdb_v1.tar.gz`
92
# 
93
# B) If you are working with Windows, download an archiver such as [7Zip](http://www.7-zip.org) to extract the files from the download archive.
94

95
# **Optional code to download and unzip the dataset via Python:**
96

97

98

99

100
# Remote location of the review archive and the local file name it is saved as.
target = 'aclImdb_v1.tar.gz'
source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# Remove a previously downloaded archive, if there is one.
archive_present = os.path.exists(target)
if archive_present:
    os.remove(target)
105

106
def reporthook(count, block_size, total_size):
    """Write a one-line download progress report to stdout.

    Intended to be passed as the ``reporthook`` callback of
    ``urllib.request.urlretrieve``.

    Parameters
    ----------
    count : int
        Number of blocks transferred so far. A value of 0 only starts
        the timer and prints nothing.
    block_size : int
        Size of one block in bytes.
    total_size : int
        Total size of the download in bytes. Values <= 0 are treated as
        "size unknown" and shown as ``?%``.
    """
    global start_time
    if count == 0:
        # First call: just remember when the transfer started.
        start_time = time.time()
        return
    duration = time.time() - start_time
    progress_size = int(count * block_size)
    # A coarse clock can report no elapsed time right after the start;
    # avoid dividing by zero in that case.
    speed = progress_size / (1024.**2 * duration) if duration > 0 else 0.
    if total_size > 0:
        # count * block_size can overshoot the real size on the last
        # block, so the displayed percentage is capped at 100.
        percent = f'{min(int(progress_size * 100. / total_size), 100)}%'
    else:
        percent = '?%'

    sys.stdout.write(f'\r{percent} | {progress_size / (1024.**2):.2f} MB '
                     f'| {speed:.2f} MB/s | {duration:.2f} sec elapsed')
    sys.stdout.flush()
119

120

121
# Download the archive only if neither the extracted folder nor the
# archive file is present yet.
if not os.path.isdir('aclImdb') and not os.path.isfile(target):
    urllib.request.urlretrieve(source, target, reporthook)


# Unpack the archive unless the extracted folder already exists.
if not os.path.isdir('aclImdb'):

    with tarfile.open(target, 'r:gz') as tar:
        # The archive is fetched over plain HTTP, so do not trust its member
        # paths blindly: use the 'data' extraction filter on Python versions
        # that provide it and fall back to the old behaviour otherwise.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(filter='data')
        else:
            tar.extractall()
131

132

133
# ## Preprocessing the movie dataset into more convenient format
134

135
# Install pyprind by uncommenting the next code cell.
136

137

138

139
#!pip install pyprind
140

141

142

143

144

145
# change the `basepath` to the directory of the
146
# unzipped movie dataset
147

148
basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000, stream=sys.stdout)

# Gather all (review, sentiment) rows first and build the DataFrame once.
# Growing a DataFrame one row at a time copies it on every step, relies on
# `DataFrame.append` (removed in pandas 2.0) on older versions, and with
# `pd.concat(..., ignore_index=False)` leaves every row with the index
# label 0 -- a non-unique index that `DataFrame.reindex` cannot handle.
rows = []
for split in ('test', 'train'):
    for sentiment in ('pos', 'neg'):
        path = os.path.join(basepath, split, sentiment)
        # sorted() keeps the row order independent of the file system.
        for file_name in sorted(os.listdir(path)):
            with open(os.path.join(path, file_name),
                      'r', encoding='utf-8') as infile:
                rows.append((infile.read(), labels[sentiment]))
            pbar.update()

# The default RangeIndex (0..n-1) gives every review a unique label.
df = pd.DataFrame(rows, columns=['review', 'sentiment'])
169

170

171
# Shuffling the DataFrame:
172

173

174

175

176
# Reorder the rows with a seeded random permutation of the index.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)


# Optional: Saving the assembled data as CSV file:
csv_path = 'movie_data.csv'
df.to_csv(csv_path, index=False, encoding='utf-8')


# Load the data back from the CSV file that was just written.
df = pd.read_csv(csv_path, encoding='utf-8')

# the following is necessary on some computers:
column_names = {"0": "review", "1": "sentiment"}
df = df.rename(columns=column_names)
194

195
df.head(3)
196

197

198

199

200
df.shape
201

202

203
# ---
204
# 
205
# ### Note
206
# 
207
# If you have problems with creating the `movie_data.csv`, you can find a zip archive to download at
208
# https://github.com/rasbt/machine-learning-book/tree/main/ch08/
209
# 
210
# ---
211

212

213
# # Introducing the bag-of-words model
214

215
# ...
216

217
# ## Transforming documents into feature vectors
218

219
# By calling the fit_transform method on CountVectorizer, we just constructed the vocabulary of the bag-of-words model and transformed the following three sentences into sparse feature vectors:
220
# 1. The sun is shining
221
# 2. The weather is sweet
222
# 3. The sun is shining, the weather is sweet, and one and one is two
223
# 
224

225

226

227

228
# Three toy documents used to illustrate the bag-of-words model.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary and turn the documents into sparse count vectors.
count = CountVectorizer()
bag = count.fit_transform(docs)
234

235

236
# Now let us print the contents of the vocabulary to get a better understanding of the underlying concepts:
237

238

239

240
print(count.vocabulary_)
241

242

243
# As we can see from executing the preceding command, the vocabulary is stored in a Python dictionary that maps the unique words to integer indices. Next, let us print the feature vectors that we just created:
244

245
# Each index position in the feature vectors shown here corresponds to the integer values that are stored as dictionary items in the CountVectorizer vocabulary. For example, the first feature at index position 0 resembles the count of the word "and", which only occurs in the last document, and the word "is" at index position 1 (the 2nd feature in the document vectors) occurs in all three sentences. Those values in the feature vectors are also called the raw term frequencies: *tf (t,d)*—the number of times a term t occurs in a document *d*.
246

247

248

249
print(bag.toarray())
250

251

252

253
# ## Assessing word relevancy via term frequency-inverse document frequency
254

255

256

257
np.set_printoptions(precision=2)
258

259

260
# When we are analyzing text data, we often encounter words that occur across multiple documents from both classes. Those frequently occurring words typically don't contain useful or discriminatory information. In this subsection, we will learn about a useful technique called term frequency-inverse document frequency (tf-idf) that can be used to downweight those frequently occurring words in the feature vectors. The tf-idf can be defined as the product of the term frequency and the inverse document frequency:
261
# 
262
# $$\text{tf-idf}(t,d)=\text{tf (t,d)}\times \text{idf}(t,d)$$
263
# 
264
# Here the tf(t, d) is the term frequency that we introduced in the previous section,
265
# and the inverse document frequency *idf(t, d)* can be calculated as:
266
# 
267
# $$\text{idf}(t,d) = \text{log}\frac{n_d}{1+\text{df}(d, t)},$$
268
# 
269
# where $n_d$ is the total number of documents, and *df(d, t)* is the number of documents *d* that contain the term *t*. Note that adding the constant 1 to the denominator is optional and serves the purpose of assigning a non-zero value to terms that occur in all training examples; the log is used to ensure that low document frequencies are not given too much weight.
270
# 
271
# Scikit-learn implements yet another transformer, the `TfidfTransformer`, that takes the raw term frequencies from `CountVectorizer` as input and transforms them into tf-idfs:
272

273

274

275

276
# Convert the raw term counts of `docs` into L2-normalized tf-idf values.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
term_counts = count.fit_transform(docs)
print(tfidf.fit_transform(term_counts).toarray())
281

282

283
# As we saw in the previous subsection, the word is had the largest term frequency in the 3rd document, being the most frequently occurring word. However, after transforming the same feature vector into tf-idfs, we see that the word is is
284
# now associated with a relatively small tf-idf (0.45) in document 3 since it is
285
# also contained in documents 1 and 2 and thus is unlikely to contain any useful, discriminatory information.
286
# 
287

288
# However, if we'd manually calculated the tf-idfs of the individual terms in our feature vectors, we'd have noticed that the `TfidfTransformer` calculates the tf-idfs slightly differently compared to the standard textbook equations that we defined earlier. The equations for the idf and tf-idf that were implemented in scikit-learn are:
289

290
# $$\text{idf} (t,d) = log\frac{1 + n_d}{1 + \text{df}(d, t)}$$
291
# 
292
# The tf-idf equation that was implemented in scikit-learn is as follows:
293
# 
294
# $$\text{tf-idf}(t,d) = \text{tf}(t,d) \times (\text{idf}(t,d)+1)$$
295
# 
296
# While it is also more typical to normalize the raw term frequencies before calculating the tf-idfs, the `TfidfTransformer` normalizes the tf-idfs directly.
297
# 
298
# By default (`norm='l2'`), scikit-learn's TfidfTransformer applies the L2-normalization, which returns a vector of length 1 by dividing an un-normalized feature vector *v* by its L2-norm:
299
# 
300
# $$v_{\text{norm}} = \frac{v}{||v||_2} = \frac{v}{\sqrt{v_{1}^{2} + v_{2}^{2} + \dots + v_{n}^{2}}} = \frac{v}{\big (\sum_{i=1}^{n} v_{i}^{2}\big)^\frac{1}{2}}$$
301
# 
302
# To make sure that we understand how TfidfTransformer works, let us walk
303
# through an example and calculate the tf-idf of the word is in the 3rd document.
304
# 
305
# The word is has a term frequency of 3 (tf = 3) in document 3 ($d_3$), and the document frequency of this term is 3 since the term is occurs in all three documents (df = 3). Thus, we can calculate the idf as follows:
306
# 
307
# $$\text{idf}("is", d_3) = log \frac{1+3}{1+3} = 0$$
308
# 
309
# Now in order to calculate the tf-idf, we simply need to add 1 to the inverse document frequency and multiply it by the term frequency:
310
# 
311
# $$\text{tf-idf}("is", d_3)= 3 \times (0+1) = 3$$
312

313

314

315
# Manual tf-idf computation for the term "is" in the third document.
n_docs = 3   # number of documents
tf_is = 3    # term frequency of "is" in document 3
df_is = 3    # document frequency of "is"
idf_is = np.log((n_docs + 1) / (df_is + 1))
tfidf_is = tf_is * (idf_is + 1)
print(f'tf-idf of term "is" = {tfidf_is:.2f}')
320

321

322
# If we repeated these calculations for all terms in the 3rd document, we'd obtain the following tf-idf vector: [3.39, 3.0, 3.39, 1.29, 1.29, 1.29, 2.0 , 1.69, 1.29]. However, we notice that the values in this feature vector are different from the values that we obtained from the TfidfTransformer that we used previously. The final step that we are missing in this tf-idf calculation is the L2-normalization, which can be applied as follows:
323

324
# $$\text{tf-idf}_{norm} = \frac{[3.39, 3.0, 3.39, 1.29, 1.29, 1.29, 2.0 , 1.69, 1.29]}{\sqrt{3.39^2 + 3.0^2 + 3.39^2 + 1.29^2 + 1.29^2 + 1.29^2 + 2.0^2 + 1.69^2 + 1.29^2}}$$
325
# 
326
# $$=[0.5, 0.45, 0.5, 0.19, 0.19, 0.19, 0.3, 0.25, 0.19]$$
327
# 
328
# $$\Rightarrow \text{tf-idf}_{norm}("is", d3) = 0.45$$
329

330
# As we can see, the results match the results returned by scikit-learn's `TfidfTransformer` (below). Since we now understand how tf-idfs are calculated, let us proceed to the next sections and apply those concepts to the movie review dataset.
331

332

333

334
# tf-idf values of the last document without any normalization (norm=None).
tfidf = TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)
unnormalized = tfidf.fit_transform(count.fit_transform(docs))
raw_tfidf = unnormalized.toarray()[-1]
raw_tfidf


# Divide by the Euclidean length of the vector to L2-normalize it by hand.
l2_norm = np.sqrt((raw_tfidf ** 2).sum())
l2_tfidf = raw_tfidf / l2_norm
l2_tfidf
343

344

345

346
# ## Cleaning text data
347

348

349

350
df.loc[0, 'review'][-50:]
351

352

353

354

355
def preprocessor(text):
    """Strip markup and punctuation from ``text`` while keeping emoticons.

    HTML-like tags are removed, the text is lower-cased, every run of
    non-word characters is replaced by a single space, and the emoticons
    found in the text are appended at the end with their "-" removed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text followed by the space-separated emoticons.
    """
    # Raw string literals keep the regex escapes from being read as
    # (invalid) string escape sequences, which newer Python versions
    # warn about.
    text = re.sub(r'<[^>]*>', '', text)
    # Collect the emoticons before the punctuation is stripped below.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text
362

363

364

365

366
preprocessor(df.loc[0, 'review'][-50:])
367

368

369

370

371
preprocessor("</a>This :) is :( a test :-)!")
372

373

374

375

376
df['review'] = df['review'].apply(preprocessor)
377

378

379

380
# ## Processing documents into tokens
381

382

383

384

385
porter = PorterStemmer()
386

387
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens
389

390

391
def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with ``porter``."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems
393

394

395

396

397
tokenizer('runners like running and thus they run')
398

399

400

401

402
tokenizer_porter('runners like running and thus they run')
403

404

405

406

407

408
# Download the NLTK 'stopwords' resource and load the English word list.
nltk.download('stopwords')
stop = stopwords.words('english')

# Demo: stem a sentence and keep only the stems that are not stop words.
stemmed_demo = tokenizer_porter('a runner likes running and runs a lot')
[w for w in stemmed_demo if w not in stop]
417

418

419

420
# # Training a logistic regression model for document classification
421

422
# Strip HTML and punctuation to speed up the GridSearch later:
423

424

425

426
# Split by position: the first 25,000 reviews are used for training and the
# remaining ones for testing. Label-based `.loc[:25000]` slicing includes its
# end label, which put row 25000 into both the training and the test set;
# positional slices keep the two sets disjoint.
n_train = 25000
X_train = df['review'].values[:n_train]
y_train = df['sentiment'].values[:n_train]
X_test = df['review'].values[n_train:]
y_test = df['sentiment'].values[n_train:]
430

431

432

433

434

435
tfidf = TfidfVectorizer(strip_accents=None,
436
                        lowercase=False,
437
                        preprocessor=None)
438

439
"""
440
param_grid = [{'vect__ngram_range': [(1, 1)],
441
               'vect__stop_words': [stop, None],
442
               'vect__tokenizer': [tokenizer, tokenizer_porter],
443
               'clf__penalty': ['l1', 'l2'],
444
               'clf__C': [1.0, 10.0, 100.0]},
445
              {'vect__ngram_range': [(1, 1)],
446
               'vect__stop_words': [stop, None],
447
               'vect__tokenizer': [tokenizer, tokenizer_porter],
448
               'vect__use_idf':[False],
449
               'vect__norm':[None],
450
               'clf__penalty': ['l1', 'l2'],
451
               'clf__C': [1.0, 10.0, 100.0]},
452
              ]
453
"""
454

455
# First sub-grid: default tf-idf weighting, comparing both tokenizers.
tfidf_grid = {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l2'],
    'clf__C': [1.0, 10.0],
}

# Second sub-grid: idf weighting and normalization switched off,
# with and without stop-word removal.
raw_tf_grid = {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer],
    'vect__use_idf': [False],
    'vect__norm': [None],
    'clf__penalty': ['l2'],
    'clf__C': [1.0, 10.0],
}

small_param_grid = [tfidf_grid, raw_tf_grid]

# Vectorizer followed by a logistic regression classifier.
lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='liblinear')),
])

# 5-fold cross-validated grid search, scored by accuracy, on all cores.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=small_param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
477

478

479
# **Important Note about `n_jobs`**
480
# 
481
# Please note that it is highly recommended to use `n_jobs=-1` (instead of `n_jobs=1`) in the previous code example to utilize all available cores on your machine and speed up the grid search. However, some Windows users reported issues when running the previous code with the `n_jobs=-1` setting related to pickling the tokenizer and tokenizer_porter functions for multiprocessing on Windows. Another workaround would be to replace those two functions, `[tokenizer, tokenizer_porter]`, with `[str.split]`. However, note that the replacement by the simple `str.split` would not support stemming.
482

483

484

485
# Run the grid search on the training data.
gs_lr_tfidf.fit(X_train, y_train)


# Report the winning parameter combination and its cross-validation score.
best_params = gs_lr_tfidf.best_params_
cv_accuracy = gs_lr_tfidf.best_score_
print(f'Best parameter set: {best_params}')
print(f'CV Accuracy: {cv_accuracy:.3f}')


# Score the best estimator on the held-out test data.
clf = gs_lr_tfidf.best_estimator_
test_accuracy = clf.score(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.3f}')
498

499

500

501
# ####  Start comment:
502
#     
503
# Please note that `gs_lr_tfidf.best_score_` is the average k-fold cross-validation score. I.e., if we have a `GridSearchCV` object with 5-fold cross-validation (like the one above), the `best_score_` attribute returns the average score over the 5-folds of the best model. To illustrate this with an example:
504

505

506

507

508

509
np.random.seed(0)
np.set_printoptions(precision=6)

# 25 random class labels in {0, 1, 2} and one noisy feature derived from them.
y = [np.random.randint(3) for _ in range(25)]
noise = np.random.randn(25)
X = (y + noise).reshape(-1, 1)

# Materialize the five train/test index pairs so they can be reused.
splitter = StratifiedKFold(n_splits=5, shuffle=False)
cv5_idx = list(splitter.split(X, y))

lr = LogisticRegression()
cross_val_score(lr, X, y, cv=cv5_idx)
518

519

520
# By executing the code above, we created a simple data set of random integers that shall represent our class labels. Next, we fed the indices of 5 cross-validation folds (`cv5_idx`) to the `cross_val_score` scorer, which returned 5 accuracy scores -- these are the 5 accuracy values for the 5 test folds.  
521
# 
522
# Next, let us use the `GridSearchCV` object and feed it the same 5 cross-validation sets (via the pre-generated `cv5_idx` indices):
523

524

525

526

527
lr = LogisticRegression()
528
gs = GridSearchCV(lr, {}, cv=cv5_idx, verbose=3).fit(X, y) 
529

530

531
# As we can see, the scores for the 5 folds are exactly the same as the ones from `cross_val_score` earlier.
532

533
# Now, the best_score_ attribute of the `GridSearchCV` object, which becomes available after `fit`ting, returns the average accuracy score of the best model:
534

535

536

537
gs.best_score_
538

539

540
# As we can see, the result above is consistent with the average score computed by `cross_val_score`.
541

542

543

544
lr = LogisticRegression()
545
cross_val_score(lr, X, y, cv=cv5_idx).mean()
546

547

548
# #### End comment.
549
# 
550

551

552
# # Working with bigger data - online algorithms and out-of-core learning
553

554

555

556
# This cell is not contained in the book but
557
# added for convenience so that the notebook
558
# can be executed starting here, without
559
# executing prior code in this notebook
560

561

562

563
# Make sure movie_data.csv exists: unpack movie_data.csv.gz if it is
# available, otherwise tell the user how to obtain it.
if not os.path.isfile('movie_data.csv'):
    if not os.path.isfile('movie_data.csv.gz'):
        # Adjacent string literals are joined as-is, so every fragment of
        # a sentence needs its own trailing space.
        print('Please place a copy of the movie_data.csv.gz '
              'in this directory. You can obtain it by '
              'a) executing the code in the beginning of this '
              'notebook or b) by downloading it from GitHub: '
              'https://github.com/rasbt/machine-learning-book/'
              'blob/main/ch08/movie_data.csv.gz')
    else:
        with gzip.open('movie_data.csv.gz', 'rb') as in_f, \
                open('movie_data.csv', 'wb') as out_f:
            out_f.write(in_f.read())
574

575

576

577

578

579

580
# The `stop` is defined as earlier in this chapter
581
# Added it here for convenience, so that this section
582
# can be run as standalone without executing prior code
583
# in the directory
584
stop = stopwords.words('english')
585

586

587
def tokenizer(text):
    """Clean ``text`` and split it into tokens, dropping stop words.

    HTML-like tags are removed, the text is lower-cased, runs of non-word
    characters become single spaces, and the emoticons found in the text
    are appended (with "-" removed) before splitting on whitespace. Tokens
    contained in the module-level ``stop`` collection are left out.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    # Raw string literals keep the regex escapes from being read as
    # (invalid) string escape sequences.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    # A set makes each membership test constant-time instead of a scan
    # through the whole stop-word list.
    stop_words = set(stop)
    tokenized = [w for w in text.split() if w not in stop_words]
    return tokenized
593

594

595
def stream_docs(path):
    """Yield one ``(text, label)`` pair per line of the CSV file at ``path``.

    The first line is skipped as the header. Each following line is split
    at its last comma: everything before it is returned unchanged as the
    text (including any CSV quoting), everything after it is converted to
    an int label. Blank lines are skipped.

    NOTE: this assumes that every record occupies exactly one physical
    line; records with embedded line breaks are not supported.

    Parameters
    ----------
    path : str
        Path of the CSV file to read (UTF-8 encoded).

    Yields
    ------
    tuple of (str, int)
        The text and its label.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # Skip the header; the default keeps an empty file from raising.
        next(csv_file, None)
        for line in csv_file:
            # Do not rely on fixed offsets from the end of the line: the
            # last line of a file may lack the trailing newline.
            line = line.rstrip('\n')
            if not line:
                continue
            text, _, label = line.rpartition(',')
            yield text, int(label)
601

602

603

604

605
next(stream_docs(path='movie_data.csv'))
606

607

608

609

610
def get_minibatch(doc_stream, size):
    """Take up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` with the collected texts and labels. If the stream
        ends before ``size`` documents were read, the documents gathered
        so far are returned; ``(None, None)`` is returned only when the
        stream had nothing left at all.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # The stream ran dry. Keep a partially filled batch instead of
        # throwing its documents away.
        if not docs:
            return None, None
    return docs, y
620

621

622

623

624

625

626
# Hashing-based vectorizer that uses the custom `tokenizer`.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)


# scikit-learn 1.1 renamed the logistic-regression loss from 'log' to
# 'log_loss' and 1.3 removed the old name, so pick the spelling that the
# installed version understands.
sgd_loss = ('log_loss'
            if version.parse(sklearn_version) >= version.parse('1.1')
            else 'log')
clf = SGDClassifier(loss=sgd_loss, random_state=1)


doc_stream = stream_docs(path='movie_data.csv')
639

640

641

642

643
# Train incrementally on up to 45 mini-batches of 1,000 documents each.
n_batches = 45
classes = np.array([0, 1])
pbar = pyprind.ProgBar(n_batches)

for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        # No documents were returned: stop training.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
653

654

655

656

657
# Take the next 5,000 documents from the stream and score the model on them.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
accuracy = clf.score(X_test, y_test)
print(f'Accuracy: {accuracy:.3f}')


# Finally, update the model with these documents as well.
clf = clf.partial_fit(X_test, y_test)
665

666

667
# ## Topic modeling
668

669
# ### Decomposing text documents with Latent Dirichlet Allocation
670

671
# ### Latent Dirichlet Allocation with scikit-learn
672

673

674

675

676
df = pd.read_csv('movie_data.csv', encoding='utf-8')
677

678
# the following is necessary on some computers:
679
df = df.rename(columns={"0": "review", "1": "sentiment"})
680

681
df.head(3)
682

683

684

685

686

687
# Bag-of-words matrix: English stop words removed, max_df=.1, and at most
# 5,000 features.
count = CountVectorizer(max_features=5000,
                        max_df=.1,
                        stop_words='english')
reviews = df['review'].values
X = count.fit_transform(reviews)


# Fit a 10-component LDA model in batch mode and get the per-document
# topic matrix.
lda = LatentDirichletAllocation(learning_method='batch',
                                n_components=10,
                                random_state=123)
X_topics = lda.fit_transform(X)
700

701

702

703

704
lda.components_.shape
705

706

707

708

709
n_top_words = 5
feature_names = count.get_feature_names_out()

# For every topic, print the words with the largest component values.
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {(topic_idx + 1)}:')
    # Walk the ascending argsort backwards to get the top entries first.
    top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
    top_words = [feature_names[i] for i in top_word_ids]
    print(' '.join(top_words))
717

718

719
# Based on reading the 5 most important words for each topic, we may guess that the LDA identified the following topics:
720
#     
721
# 1. Generally bad movies (not really a topic category)
722
# 2. Movies about families
723
# 3. War movies
724
# 4. Art movies
725
# 5. Crime movies
726
# 6. Horror movies
727
# 7. Comedies
728
# 8. Movies somehow related to TV shows
729
# 9. Movies based on books
730
# 10. Action movies
731

732
# To confirm that the categories make sense based on the reviews, let's print 3 movies from the horror movie category (category 6 at index position 5):
733

734

735

736
# Document indices ordered by decreasing weight of the topic at column 5.
horror = X_topics[:, 5].argsort()[::-1]

top_horror = horror[:3]
for iter_idx, movie_idx in enumerate(top_horror):
    print(f'\nHorror movie #{(iter_idx + 1)}:')
    # Show only the first 300 characters of the review.
    excerpt = df['review'][movie_idx][:300]
    print(excerpt, '...')
741

742

743
# Using the preceding code example, we printed the first 300 characters from the top 3 horror movies, and we can see that the reviews -- even though we don't know which exact movie they belong to -- indeed sound like reviews of horror movies. (However, one might argue that movie #2 could also belong to topic category 1.)
744

745

746
# # Summary
747

748
# ...
749

750
# ---
751
# 
752
# Readers may ignore the next cell.
753

754

755

756

757

758
Product

Resources

Company