# coding: utf-8


import sys
from python_environment_check import check_packages
import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split
import re
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
from torch.utils.data import DataLoader

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')


# Check recommended package versions:

d = {
    'torch': '1.8.0',
    'torchtext': '0.10.0'
}
check_packages(d)

# # Chapter 15: Modeling Sequential Data Using Recurrent Neural Networks (Part 2/3)

# **Outline**
#
# - [Implementing RNNs for sequence modeling in PyTorch](#Implementing-RNNs-for-sequence-modeling-in-PyTorch)
#   - [Project one -- predicting the sentiment of IMDb movie reviews](#Project-one----predicting-the-sentiment-of-IMDb-movie-reviews)
#     - [Preparing the movie review data](#Preparing-the-movie-review-data)
#     - [Embedding layers for sentence encoding](#Embedding-layers-for-sentence-encoding)
#     - [Building an RNN model](#Building-an-RNN-model)
#     - [Building an RNN model for the sentiment analysis task](#Building-an-RNN-model-for-the-sentiment-analysis-task)
#       - [More on the bidirectional RNN](#More-on-the-bidirectional-RNN)


# # Implementing RNNs for sequence modeling in PyTorch
#
# ## Project one: predicting the sentiment of IMDb movie reviews
#
# ### Preparing the movie review data
#
#


# !pip install torchtext


# Step 1: load and create the datasets

train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(
    list(train_dataset), [20000, 5000])

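
## A quick peek at the data format (a small check added here for illustration):
## each example is a (label, text) pair, where the label is the string 'pos'
## or 'neg' in torchtext 0.10:
first_label, first_text = train_dataset[0]
print(first_label, first_text[:50])
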
## Step 2: find unique tokens (words)

token_counts = Counter()


def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)


print('Vocab-size:', len(token_counts))

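
## A quick illustration of the tokenizer on a made-up snippet: HTML tags are
## stripped, punctuation is removed, and emoticons are appended at the end:
print(tokenizer('<br />This movie is :) GREAT -- a test!'))
# -> ['this', 'movie', 'is', 'great', 'a', 'test', ':)']
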
## Step 3: encoding each unique token into integers

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

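
## A quick check of the special tokens: <pad> maps to 0, <unk> maps to 1, and
## any token not seen during training falls back to the default index 1:
print(vocab['<pad>'], vocab['<unk>'], vocab['some-token-not-in-the-corpus'])
# expected: 0 1 1
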
## Step 3-A: define the functions for transformation

device = torch.device("cuda:0")
# device = 'cpu'

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 'pos' else 0.


## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text),
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)


## Take a small batch

dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)


## Step 4: batching the datasets

batch_size = 32

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)


# ### Embedding layers for sentence encoding
#
#
#  * `input_dim` (`num_embeddings` in PyTorch): the number of words, i.e., the maximum integer index + 1.
#  * `output_dim` (`embedding_dim` in PyTorch): the size of each embedding vector.
#  * `input_length`: the length of the (padded) sequence
#     * for example, `'This is an example' -> [0, 0, 0, 0, 0, 0, 3, 1, 8, 9]`
#       => `input_length` is 10
#
#
#  * When calling the layer, it takes integer values as input, and the
#    embedding layer converts each integer into a float vector of size `[output_dim]`
#  * If the input shape is `[BATCH_SIZE]`, the output shape will be `[BATCH_SIZE, output_dim]`
#  * If the input shape is `[BATCH_SIZE, 10]`, the output shape will be `[BATCH_SIZE, 10, output_dim]`


embedding = nn.Embedding(num_embeddings=10,
                         embedding_dim=3,
                         padding_idx=0)

# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 0]])
print(embedding(text_encoded_input))
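
# A quick shape check: each of the 2 x 4 indices is mapped to a vector of
# size embedding_dim=3, so the output has shape [2, 4, 3]:
print(embedding(text_encoded_input).shape)   # torch.Size([2, 4, 3])
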


# ### Building an RNN model
#
# * **RNN layers:**
#   * `nn.RNN(input_size, hidden_size, num_layers=1)`
#   * `nn.LSTM(..)`
#   * `nn.GRU(..)`
#   * `nn.RNN(input_size, hidden_size, num_layers=1, bidirectional=True)`
#
#


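## A minimal sketch (on random data) of what these layers return: nn.RNN and
## nn.GRU return (output, h_n), while nn.LSTM additionally returns a cell state.

rnn_layer = nn.RNN(input_size=8, hidden_size=16, batch_first=True)
lstm_layer = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)

x = torch.randn(2, 5, 8)                    # 2 sequences, 5 time steps, 8 features each
out_rnn, h_n = rnn_layer(x)                 # hidden state per time step + final hidden state
out_lstm, (h_n_lstm, c_n) = lstm_layer(x)   # LSTM also returns the final cell state
print(out_rnn.shape, h_n.shape)             # torch.Size([2, 5, 16]) torch.Size([1, 2, 16])
print(out_lstm.shape, h_n_lstm.shape, c_n.shape)

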
## An example of building an RNN model
## with a simple RNN layer

# A simple RNN with two stacked recurrent layers and a
# fully connected output layer
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size,
                          hidden_size,
                          num_layers=2,
                          batch_first=True)
        # self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]
        out = self.fc(out)
        return out


model = RNN(64, 32)

print(model)

model(torch.randn(5, 3, 64))
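
# The random input above stands for a batch of 5 sequences with 3 time steps
# and 64 features each; the model returns one output value per sequence:
print(model(torch.randn(5, 3, 64)).shape)   # torch.Size([5, 1])
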


# ### Building an RNN model for the sentiment analysis task


class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)


def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)


def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)


loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')


acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

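
## A minimal sketch of scoring a new (made-up) review with the trained model,
## reusing text_pipeline from above; the output is the probability of the
## positive class:

sample_review = "This movie was surprisingly good, I really enjoyed it!"
sample_encoded = torch.tensor(text_pipeline(sample_review), dtype=torch.int64)
sample_text = sample_encoded.unsqueeze(0).to(device)   # shape: [1, seq_len]
sample_length = torch.tensor([sample_encoded.size(0)])

with torch.no_grad():
    proba = model(sample_text, sample_length)[:, 0]
print(f'Probability of positive sentiment: {proba.item():.4f}')

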
# #### More on the bidirectional RNN

# * **Trying a bidirectional recurrent layer**


class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        # concatenate the final hidden states of the forward and backward passes
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)


loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

num_epochs = 10

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')


test_dataset = IMDB(split='test')
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)


acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')


# ## Optional exercise:
#
# ### Uni-directional SimpleRNN with full-length sequences

#
# ---
#
#
# Readers may ignore the next cell.
#