# coding: utf-8


import sys
from python_environment_check import check_packages
import os
import tarfile
import time
import urllib.request
import pyprind
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from packaging import version
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import gzip
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import LatentDirichletAllocation


# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:


sys.path.insert(0, '..')


# Check recommended package versions:


d = {
    'numpy': '1.21.2',
    'pandas': '1.3.2',
    'sklearn': '1.0',
    'pyprind': '2.11.3',
    'nltk': '3.6',
}
check_packages(d)


# # Chapter 8 - Applying Machine Learning To Sentiment Analysis

# ### Overview

# - [Preparing the IMDb movie review data for text processing](#Preparing-the-IMDb-movie-review-data-for-text-processing)
#   - [Obtaining the IMDb movie review dataset](#Obtaining-the-IMDb-movie-review-dataset)
#   - [Preprocessing the movie dataset into more convenient format](#Preprocessing-the-movie-dataset-into-more-convenient-format)
# - [Introducing the bag-of-words model](#Introducing-the-bag-of-words-model)
#   - [Transforming words into feature vectors](#Transforming-words-into-feature-vectors)
#   - [Assessing word relevancy via term frequency-inverse document frequency](#Assessing-word-relevancy-via-term-frequency-inverse-document-frequency)
#   - [Cleaning text data](#Cleaning-text-data)
#   - [Processing documents into tokens](#Processing-documents-into-tokens)
# - [Training a logistic regression model for document classification](#Training-a-logistic-regression-model-for-document-classification)
# - [Working with bigger data – online algorithms and out-of-core learning](#Working-with-bigger-data-–-online-algorithms-and-out-of-core-learning)
# - [Topic modeling](#Topic-modeling)
#   - [Decomposing text documents with Latent Dirichlet Allocation](#Decomposing-text-documents-with-Latent-Dirichlet-Allocation)
#   - [Latent Dirichlet Allocation with scikit-learn](#Latent-Dirichlet-Allocation-with-scikit-learn)
# - [Summary](#Summary)


# # Preparing the IMDb movie review data for text processing

# ## Obtaining the IMDb movie review dataset

# The IMDb movie review set can be downloaded from [http://ai.stanford.edu/~amaas/data/sentiment/](http://ai.stanford.edu/~amaas/data/sentiment/).
# After downloading the dataset, decompress the files.
#
# A) If you are working with Linux or macOS, open a new terminal window, `cd` into the download directory, and execute
#
# `tar -zxf aclImdb_v1.tar.gz`
#
# B) If you are working with Windows, download an archiver such as [7-Zip](http://www.7-zip.org) to extract the files from the download archive.

# **Optional code to download and unzip the dataset via Python:**


source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = 'aclImdb_v1.tar.gz'

if os.path.exists(target):
    os.remove(target)


def reporthook(count, block_size, total_size):
    # print a simple download progress report (percent, size, speed, elapsed time)
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    progress_size = int(count * block_size)
    speed = progress_size / (1024.**2 * duration)
    percent = count * block_size * 100. / total_size

    sys.stdout.write(f'\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB '
                     f'| {speed:.2f} MB/s | {duration:.2f} sec elapsed')
    sys.stdout.flush()


if not os.path.isdir('aclImdb') and not os.path.isfile('aclImdb_v1.tar.gz'):
    urllib.request.urlretrieve(source, target, reporthook)


if not os.path.isdir('aclImdb'):

    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall()


# ## Preprocessing the movie dataset into more convenient format

# Install pyprind by uncommenting the next code cell.


#!pip install pyprind


# change the `basepath` to the directory of the
# unzipped movie dataset

basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000, stream=sys.stdout)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            if version.parse(pd.__version__) >= version.parse("1.3.2"):
                x = pd.DataFrame([[txt, labels[l]]], columns=['review', 'sentiment'])
                # use ignore_index=True so that the assembled DataFrame gets a
                # unique 0..49999 index (required for the shuffling step below)
                df = pd.concat([df, x], ignore_index=True)
            else:
                df = df.append([[txt, labels[l]]],
                               ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']


# Shuffling the DataFrame:


np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))


# Optional: Saving the assembled data as CSV file:


df.to_csv('movie_data.csv', index=False, encoding='utf-8')



df = pd.read_csv('movie_data.csv', encoding='utf-8')

# the following is necessary on some computers:
df = df.rename(columns={"0": "review", "1": "sentiment"})

df.head(3)



df.shape


# ---
#
# ### Note
#
# If you have problems with creating the `movie_data.csv`, you can download a zip archive of it from
# https://github.com/rasbt/machine-learning-book/tree/main/ch08/
#
# ---
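
# If you downloaded the compressed `movie_data.csv.gz` from the repository above instead, pandas can read the gzipped CSV directly. The following is a small optional sketch (not part of the book code); it assumes the file sits in the current working directory:


if not os.path.isfile('movie_data.csv') and os.path.isfile('movie_data.csv.gz'):
    # `compression='gzip'` lets pandas decompress the file on the fly
    df = pd.read_csv('movie_data.csv.gz', compression='gzip', encoding='utf-8')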


# # Introducing the bag-of-words model

# ...

# ## Transforming words into feature vectors

# By calling the fit_transform method on CountVectorizer, we just constructed the vocabulary of the bag-of-words model and transformed the following three sentences into sparse feature vectors:
# 1. The sun is shining
# 2. The weather is sweet
# 3. The sun is shining, the weather is sweet, and one and one is two
#



count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)


# Now let us print the contents of the vocabulary to get a better understanding of the underlying concepts:


print(count.vocabulary_)


# As we can see from executing the preceding command, the vocabulary is stored in a Python dictionary that maps the unique words to integer indices. Next, let us print the feature vectors that we just created:

# Each index position in the feature vectors shown here corresponds to the integer values that are stored as dictionary items in the CountVectorizer vocabulary. For example, the first feature at index position 0 represents the count of the word 'and', which only occurs in the last document, and the word 'is', at index position 1 (the 2nd feature in the document vectors), occurs in all three sentences. Those values in the feature vectors are also called the raw term frequencies: *tf(t, d)*, the number of times a term *t* occurs in a document *d*.



print(bag.toarray())
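
# To make the mapping between columns and tokens easier to see, we can pair each raw count with its corresponding token. This is a small optional sketch (not part of the book code); `get_feature_names_out` is the same scikit-learn method used in the topic-modeling section later in this chapter:


for doc, row in zip(docs, bag.toarray()):
    # print each sentence followed by its token -> raw term frequency mapping
    print(doc)
    print(dict(zip(count.get_feature_names_out(), row)))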


# ## Assessing word relevancy via term frequency-inverse document frequency


np.set_printoptions(precision=2)


# When we are analyzing text data, we often encounter words that occur across multiple documents from both classes. Those frequently occurring words typically don't contain useful or discriminatory information. In this subsection, we will learn about a useful technique called term frequency-inverse document frequency (tf-idf) that can be used to downweight those frequently occurring words in the feature vectors. The tf-idf can be defined as the product of the term frequency and the inverse document frequency:
#
# $$\text{tf-idf}(t,d)=\text{tf}(t,d)\times \text{idf}(t,d)$$
#
# Here, tf(t, d) is the term frequency that we introduced in the previous section,
# and the inverse document frequency *idf(t, d)* can be calculated as:
#
# $$\text{idf}(t,d) = \log\frac{n_d}{1+\text{df}(d, t)},$$
#
# where $n_d$ is the total number of documents, and *df(d, t)* is the number of documents *d* that contain the term *t*. Note that adding the constant 1 to the denominator is optional and serves the purpose of assigning a non-zero value to terms that occur in all training examples; the log is used to ensure that low document frequencies are not given too much weight.
#
# Scikit-learn implements yet another transformer, the `TfidfTransformer`, that takes the raw term frequencies from `CountVectorizer` as input and transforms them into tf-idfs:



tfidf = TfidfTransformer(use_idf=True,
                         norm='l2',
                         smooth_idf=True)
print(tfidf.fit_transform(count.fit_transform(docs))
      .toarray())


# As we saw in the previous subsection, the word 'is' had the largest term frequency in the 3rd document, being the most frequently occurring word. However, after transforming the same feature vector into tf-idfs, we see that the word 'is' is
# now associated with a relatively small tf-idf (0.45) in document 3, since it is
# also contained in documents 1 and 2 and thus is unlikely to contain any useful, discriminatory information.
#
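
# Before looking at how scikit-learn computes tf-idfs exactly, here is a small optional sketch (not part of the book code) that applies the textbook equations above to the same raw counts; the resulting values differ from the `TfidfTransformer` output, which is what the following discussion explains:


tf = count.fit_transform(docs).toarray()  # raw term frequencies tf(t, d)
df_t = np.sum(tf > 0, axis=0)             # document frequencies df(d, t)
n_d = tf.shape[0]                         # total number of documents
idf = np.log(n_d / (1. + df_t))           # textbook idf(t, d)
print(tf * idf)                           # textbook tf-idf(t, d)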

# However, if we'd manually calculated the tf-idfs of the individual terms in our feature vectors, we'd have noticed that the `TfidfTransformer` calculates the tf-idfs slightly differently compared to the standard textbook equations that we defined earlier. The equations for the idf and tf-idf that were implemented in scikit-learn are:

# $$\text{idf}(t,d) = \log\frac{1 + n_d}{1 + \text{df}(d, t)}$$
#
# The tf-idf equation that was implemented in scikit-learn is as follows:
#
# $$\text{tf-idf}(t,d) = \text{tf}(t,d) \times (\text{idf}(t,d)+1)$$
#
# While it is also more typical to normalize the raw term frequencies before calculating the tf-idfs, the `TfidfTransformer` normalizes the tf-idfs directly.
#
# By default (`norm='l2'`), scikit-learn's TfidfTransformer applies the L2-normalization, which returns a vector of length 1 by dividing an un-normalized feature vector *v* by its L2-norm:
#
# $$v_{\text{norm}} = \frac{v}{||v||_2} = \frac{v}{\sqrt{v_{1}^{2} + v_{2}^{2} + \dots + v_{n}^{2}}} = \frac{v}{\big (\sum_{i=1}^{n} v_{i}^{2}\big)^\frac{1}{2}}$$
#
# To make sure that we understand how TfidfTransformer works, let us walk
# through an example and calculate the tf-idf of the word 'is' in the 3rd document.
#
# The word 'is' has a term frequency of 3 (tf = 3) in document 3 ($d_3$), and the document frequency of this term is 3 since the term 'is' occurs in all three documents (df = 3). Thus, we can calculate the idf as follows:
#
# $$\text{idf}("is", d_3) = \log \frac{1+3}{1+3} = 0$$
#
# Now in order to calculate the tf-idf, we simply need to add 1 to the inverse document frequency and multiply it by the term frequency:
#
# $$\text{tf-idf}("is", d_3)= 3 \times (0+1) = 3$$



tf_is = 3
n_docs = 3
idf_is = np.log((n_docs+1) / (3+1))
tfidf_is = tf_is * (idf_is + 1)
print(f'tf-idf of term "is" = {tfidf_is:.2f}')


# If we repeated these calculations for all terms in the 3rd document, we'd obtain the following tf-idf vector: [3.39, 3.0, 3.39, 1.29, 1.29, 1.29, 2.0, 1.69, 1.29]. However, we notice that the values in this feature vector are different from the values that we obtained from the TfidfTransformer that we used previously. The final step that we are missing in this tf-idf calculation is the L2-normalization, which can be applied as follows:

# $$\text{tf-idf}_{\text{norm}} = \frac{[3.39, 3.0, 3.39, 1.29, 1.29, 1.29, 2.0, 1.69, 1.29]}{\sqrt{3.39^2 + 3.0^2 + 3.39^2 + 1.29^2 + 1.29^2 + 1.29^2 + 2.0^2 + 1.69^2 + 1.29^2}}$$
#
# $$=[0.5, 0.45, 0.5, 0.19, 0.19, 0.19, 0.3, 0.25, 0.19]$$
#
# $$\Rightarrow \text{tf-idf}_{\text{norm}}("is", d_3) = 0.45$$

# As we can see, these values match the results returned by scikit-learn's `TfidfTransformer` (below). Since we now understand how tf-idfs are calculated, let us proceed to the next sections and apply those concepts to the movie review dataset.



tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
raw_tfidf = tfidf.fit_transform(count.fit_transform(docs)).toarray()[-1]
raw_tfidf




l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
l2_tfidf


# ## Cleaning text data


df.loc[0, 'review'][-50:]


def preprocessor(text):
    # remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # temporarily store emoticons such as :), :(, or :-P
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    # remove non-word characters, lowercase the text, and re-append the
    # emoticons (with the "nose" character '-' removed)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text



preprocessor(df.loc[0, 'review'][-50:])



preprocessor("</a>This :) is :( a test :-)!")



df['review'] = df['review'].apply(preprocessor)

# ## Processing documents into tokens



porter = PorterStemmer()

def tokenizer(text):
    return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]



tokenizer('runners like running and thus they run')



tokenizer_porter('runners like running and thus they run')




nltk.download('stopwords')




stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')
 if w not in stop]


# # Training a logistic regression model for document classification

# Strip HTML and punctuation to speed up the GridSearch later:

# use the first 25,000 reviews for training and the remaining 25,000 for testing;
# .iloc avoids the one-row overlap that .loc's inclusive endpoints would introduce
X_train = df.iloc[:25000]['review'].values
y_train = df.iloc[:25000]['sentiment'].values
X_test = df.iloc[25000:]['review'].values
y_test = df.iloc[25000:]['sentiment'].values



tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

"""
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]
"""

small_param_grid = [{'vect__ngram_range': [(1, 1)],
                     'vect__stop_words': [None],
                     'vect__tokenizer': [tokenizer, tokenizer_porter],
                     'clf__penalty': ['l2'],
                     'clf__C': [1.0, 10.0]},
                    {'vect__ngram_range': [(1, 1)],
                     'vect__stop_words': [stop, None],
                     'vect__tokenizer': [tokenizer],
                     'vect__use_idf': [False],
                     'vect__norm': [None],
                     'clf__penalty': ['l2'],
                     'clf__C': [1.0, 10.0]},
                    ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)


# **Important Note about `n_jobs`**
#
# Please note that it is highly recommended to use `n_jobs=-1` (instead of `n_jobs=1`) in the previous code example to utilize all available cores on your machine and speed up the grid search. However, some Windows users reported issues when running the previous code with the `n_jobs=-1` setting, related to pickling the `tokenizer` and `tokenizer_porter` functions for multiprocessing on Windows. If you encounter this problem, you can fall back to `n_jobs=1`. Another workaround would be to replace those two functions, `[tokenizer, tokenizer_porter]`, with `[str.split]`. However, note that the replacement by the simple `str.split` would not support stemming.
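
# As a minimal sketch of that workaround (optional, not part of the book code), the grid below swaps the custom tokenizers for the picklable built-in `str.split`; the variable name `small_param_grid_str_split` is only used for illustration:


small_param_grid_str_split = [{'vect__ngram_range': [(1, 1)],
                               'vect__stop_words': [None],
                               'vect__tokenizer': [str.split],  # no stemming support
                               'clf__penalty': ['l2'],
                               'clf__C': [1.0, 10.0]}]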
482
483
484
485
gs_lr_tfidf.fit(X_train, y_train)
486
487
488
489
490
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')
491
print(f'CV Accuracy: {gs_lr_tfidf.best_score_:.3f}')
492
493
494
495
496
clf = gs_lr_tfidf.best_estimator_
497
print(f'Test Accuracy: {clf.score(X_test, y_test):.3f}')
498
499
500
501
# #### Start comment:
502
#
503
# Please note that `gs_lr_tfidf.best_score_` is the average k-fold cross-validation score. I.e., if we have a `GridSearchCV` object with 5-fold cross-validation (like the one above), the `best_score_` attribute returns the average score over the 5-folds of the best model. To illustrate this with an example:
504
505
506
507
508
509
np.random.seed(0)
510
np.set_printoptions(precision=6)
511
y = [np.random.randint(3) for i in range(25)]
512
X = (y + np.random.randn(25)).reshape(-1, 1)
513
514
cv5_idx = list(StratifiedKFold(n_splits=5, shuffle=False).split(X, y))
515
516
lr = LogisticRegression()
517
cross_val_score(lr, X, y, cv=cv5_idx)
518
519
520
# By executing the code above, we created a simple data set of random integers to represent our class labels. Next, we fed the indices of 5 cross-validation folds (`cv5_idx`) to the `cross_val_score` scorer, which returned 5 accuracy scores -- these are the 5 accuracy values for the 5 test folds.
#
# Next, let us use the `GridSearchCV` object and feed it the same 5 cross-validation sets (via the pre-generated `cv5_idx` indices):



lr = LogisticRegression()
gs = GridSearchCV(lr, {}, cv=cv5_idx, verbose=3).fit(X, y)


# As we can see, the scores for the 5 folds are exactly the same as the ones from `cross_val_score` earlier.

# Now, the `best_score_` attribute of the `GridSearchCV` object, which becomes available after fitting, returns the average accuracy score of the best model:



gs.best_score_


# As we can see, the result above is consistent with the average score computed by `cross_val_score`.



lr = LogisticRegression()
cross_val_score(lr, X, y, cv=cv5_idx).mean()


# #### End comment.
#


# # Working with bigger data - online algorithms and out-of-core learning



# This cell is not contained in the book but
# added for convenience so that the notebook
# can be executed starting here, without
# executing prior code in this notebook


if not os.path.isfile('movie_data.csv'):
    if not os.path.isfile('movie_data.csv.gz'):
        print('Please place a copy of the movie_data.csv.gz '
              'in this directory. You can obtain it by '
              'a) executing the code in the beginning of this '
              'notebook or b) by downloading it from GitHub: '
              'https://github.com/rasbt/machine-learning-book/'
              'blob/main/ch08/movie_data.csv.gz')
    else:
        with gzip.open('movie_data.csv.gz', 'rb') as in_f, open('movie_data.csv', 'wb') as out_f:
            out_f.write(in_f.read())




# `stop` is defined as earlier in this chapter.
# It is added here for convenience, so that this section
# can be run as standalone without executing prior code
# in this notebook.
stop = stopwords.words('english')


def tokenizer(text):
    # same cleaning steps as the `preprocessor`/`tokenizer` functions defined
    # earlier, combined into one function and with stop-word removal added
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


def stream_docs(path):
    # read and yield one review (plus its label) at a time
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            # each line ends with ',<label>\n', so the second-to-last
            # character is the sentiment label (0 or 1)
            text, label = line[:-3], int(line[-2])
            yield text, label



next(stream_docs(path='movie_data.csv'))



def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y




vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)




# note: newer scikit-learn versions (>= 1.3) require loss='log_loss' instead of 'log'
clf = SGDClassifier(loss='log', random_state=1)

doc_stream = stream_docs(path='movie_data.csv')



pbar = pyprind.ProgBar(45)

classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()



X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print(f'Accuracy: {clf.score(X_test, y_test):.3f}')



clf = clf.partial_fit(X_test, y_test)


# # Topic modeling

# ## Decomposing text documents with Latent Dirichlet Allocation

# ## Latent Dirichlet Allocation with scikit-learn


df = pd.read_csv('movie_data.csv', encoding='utf-8')

# the following is necessary on some computers:
df = df.rename(columns={"0": "review", "1": "sentiment"})

df.head(3)




count = CountVectorizer(stop_words='english',
                        max_df=.1,
                        max_features=5000)
X = count.fit_transform(df['review'].values)




lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')
X_topics = lda.fit_transform(X)




lda.components_.shape




n_top_words = 5
feature_names = count.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {(topic_idx + 1)}:')
    print(' '.join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))


# Based on reading the 5 most important words for each topic, we may guess that the LDA identified the following topics:
#
# 1. Generally bad movies (not really a topic category)
# 2. Movies about families
# 3. War movies
# 4. Art movies
# 5. Crime movies
# 6. Horror movies
# 7. Comedies
# 8. Movies somehow related to TV shows
# 9. Movies based on books
# 10. Action movies

# To confirm that the categories make sense based on the reviews, let's print excerpts from the top three movies in the horror movie category (category 6 at index position 5):


horror = X_topics[:, 5].argsort()[::-1]

for iter_idx, movie_idx in enumerate(horror[:3]):
    print(f'\nHorror movie #{(iter_idx + 1)}:')
    print(df['review'][movie_idx][:300], '...')


# Using the preceding code example, we printed the first 300 characters from the top 3 horror movies. Even though we don't know which exact movies these reviews belong to, they indeed sound like reviews of horror movies (however, one might argue that movie #2 could also belong to topic category 1).


# # Summary

# ...

# ---
#
# Readers may ignore the next cell.