# coding: utf-8


import sys
from python_environment_check import check_packages
import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split
import re
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
from torch.utils.data import DataLoader

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')


# Check recommended package versions:

d = {
    'torch': '1.8.0',
    'torchtext': '0.10.0'
}
check_packages(d)

# # Chapter 15: Modeling Sequential Data Using Recurrent Neural Networks (Part 2/3)

# **Outline**
#
# - [Implementing RNNs for sequence modeling in PyTorch](#Implementing-RNNs-for-sequence-modeling-in-PyTorch)
#   - [Project one -- predicting the sentiment of IMDb movie reviews](#Project-one----predicting-the-sentiment-of-IMDb-movie-reviews)
#     - [Preparing the movie review data](#Preparing-the-movie-review-data)
#     - [Embedding layers for sentence encoding](#Embedding-layers-for-sentence-encoding)
#     - [Building an RNN model](#Building-an-RNN-model)
#     - [Building an RNN model for the sentiment analysis task](#Building-an-RNN-model-for-the-sentiment-analysis-task)
#       - [More on the bidirectional RNN](#More-on-the-bidirectional-RNN)


# # Implementing RNNs for sequence modeling in PyTorch
#
# ## Project one: predicting the sentiment of IMDb movie reviews
#
# ### Preparing the movie review data
#
#


# !pip install torchtext


# Step 1: load and create the datasets

train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(
    list(train_dataset), [20000, 5000])

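
## A quick peek at the data format (a small check added here for illustration):
## each example is a (label, text) pair, where the label is the string 'pos'
## or 'neg' in torchtext 0.10:
first_label, first_text = train_dataset[0]
print(first_label, first_text[:50])
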
## Step 2: find unique tokens (words)

token_counts = Counter()


def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)


print('Vocab-size:', len(token_counts))

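
## A quick illustration of the tokenizer on a made-up snippet: HTML tags are
## stripped, punctuation is removed, and emoticons are appended at the end:
print(tokenizer('<br />This movie is :) GREAT -- a test!'))
# -> ['this', 'movie', 'is', 'great', 'a', 'test', ':)']
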
## Step 3: encoding each unique token into integers

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

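
## A quick check of the special tokens: <pad> maps to 0, <unk> maps to 1, and
## any token not seen during training falls back to the default index 1:
print(vocab['<pad>'], vocab['<unk>'], vocab['some-token-not-in-the-corpus'])
# expected: 0 1 1
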
## Step 3-A: define the functions for transformation

device = torch.device("cuda:0")
# device = 'cpu'

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 'pos' else 0.


## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text),
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)


## Take a small batch

dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)


## Step 4: batching the datasets

batch_size = 32

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)


# ### Embedding layers for sentence encoding
#
#
#  * `input_dim` (`num_embeddings` in PyTorch): the number of words, i.e., the maximum integer index + 1.
#  * `output_dim` (`embedding_dim` in PyTorch): the size of each embedding vector.
#  * `input_length`: the length of the (padded) sequence
#     * for example, `'This is an example' -> [0, 0, 0, 0, 0, 0, 3, 1, 8, 9]`
#       => `input_length` is 10
#
#
#  * When calling the layer, it takes integer values as input, and the
#    embedding layer converts each integer into a float vector of size `[output_dim]`
#  * If the input shape is `[BATCH_SIZE]`, the output shape will be `[BATCH_SIZE, output_dim]`
#  * If the input shape is `[BATCH_SIZE, 10]`, the output shape will be `[BATCH_SIZE, 10, output_dim]`


embedding = nn.Embedding(num_embeddings=10,
                         embedding_dim=3,
                         padding_idx=0)

# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 0]])
print(embedding(text_encoded_input))
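
# A quick shape check: each of the 2 x 4 indices is mapped to a vector of
# size embedding_dim=3, so the output has shape [2, 4, 3]:
print(embedding(text_encoded_input).shape)   # torch.Size([2, 4, 3])
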


# ### Building an RNN model
#
# * **RNN layers:**
#   * `nn.RNN(input_size, hidden_size, num_layers=1)`
#   * `nn.LSTM(..)`
#   * `nn.GRU(..)`
#   * `nn.RNN(input_size, hidden_size, num_layers=1, bidirectional=True)`
#
#


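## A minimal sketch (on random data) of what these layers return: nn.RNN and
## nn.GRU return (output, h_n), while nn.LSTM additionally returns a cell state.

rnn_layer = nn.RNN(input_size=8, hidden_size=16, batch_first=True)
lstm_layer = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)

x = torch.randn(2, 5, 8)                    # 2 sequences, 5 time steps, 8 features each
out_rnn, h_n = rnn_layer(x)                 # hidden state per time step + final hidden state
out_lstm, (h_n_lstm, c_n) = lstm_layer(x)   # LSTM also returns the final cell state
print(out_rnn.shape, h_n.shape)             # torch.Size([2, 5, 16]) torch.Size([1, 2, 16])
print(out_lstm.shape, h_n_lstm.shape, c_n.shape)

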
## An example of building an RNN model
## with a simple RNN layer

# A simple RNN with two stacked recurrent layers and a
# fully connected output layer
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size,
                          hidden_size,
                          num_layers=2,
                          batch_first=True)
        # self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]
        out = self.fc(out)
        return out


model = RNN(64, 32)

print(model)

model(torch.randn(5, 3, 64))
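
# The random input above stands for a batch of 5 sequences with 3 time steps
# and 64 features each; the model returns one output value per sequence:
print(model(torch.randn(5, 3, 64)).shape)   # torch.Size([5, 1])
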


# ### Building an RNN model for the sentiment analysis task


class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)


def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)


def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)


loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')


acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

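
## A minimal sketch of scoring a new (made-up) review with the trained model,
## reusing text_pipeline from above; the output is the probability of the
## positive class:

sample_review = "This movie was surprisingly good, I really enjoyed it!"
sample_encoded = torch.tensor(text_pipeline(sample_review), dtype=torch.int64)
sample_text = sample_encoded.unsqueeze(0).to(device)   # shape: [1, seq_len]
sample_length = torch.tensor([sample_encoded.size(0)])

with torch.no_grad():
    proba = model(sample_text, sample_length)[:, 0]
print(f'Probability of positive sentiment: {proba.item():.4f}')

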
# #### More on the bidirectional RNN

# * **Trying a bidirectional recurrent layer**


class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        # concatenate the final hidden states of the forward and backward passes
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)


loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

num_epochs = 10

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')


test_dataset = IMDB(split='test')
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)


acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')


# ## Optional exercise:
#
# ### Uni-directional SimpleRNN with full-length sequences

#
# ---
#
#
# Readers may ignore the next cell.
#