GitHub Repository: rasbt/machine-learning-book
Path: blob/main/ch15/ch15_part2.ipynb
Kernel: Python 3 (ipykernel)

Machine Learning with PyTorch and Scikit-Learn

-- Code Examples

Package version checks

Add the parent folder to the path in order to load the check_packages.py script:

import sys
sys.path.insert(0, '..')

Check recommended package versions:

from python_environment_check import check_packages

d = {
    'torch': '1.8.0',
    'torchtext': '0.10.0',
}
check_packages(d)
[OK] Your Python version is 3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:22:27) [GCC 9.3.0]
[OK] torch 1.10.0
[OK] torchtext 0.11.0

Chapter 15: Modeling Sequential Data Using Recurrent Neural Networks (Part 2/3)

from IPython.display import Image
%matplotlib inline

Implementing RNNs for sequence modeling in PyTorch

Project one: predicting the sentiment of IMDb movie reviews

Preparing the movie review data

import torch
import torch.nn as nn

# pip install torchtext==0.10.0

Attention: To reproduce the code in the book, please make sure to use torchtext 0.10.0 (see https://pypi.org/project/torchtext/0.10.0/), which is the version I used for this chapter.

There are a few adjustments in this notebook that also make it compatible with newer versions of torchtext.

For newer versions of torchtext, installing portalocker may be necessary:

# pip install torchtext
# pip install portalocker
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split

# Step 1: load and create the datasets
train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')
test_dataset = list(test_dataset)  # datapipe to list

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(
    list(train_dataset), [20000, 5000])
## Step 2: find unique tokens (words)
import re
from collections import Counter, OrderedDict

token_counts = Counter()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print('Vocab-size:', len(token_counts))
Vocab-size: 69023
## Step 3: encoding each unique token into integers
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])
[11, 7, 35, 457]
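Because set_default_index(1) was called above, any token that is not in the vocabulary is mapped to the index of the <unk> token. A minimal check (the out-of-vocabulary token below is made up purely for illustration):

print(vocab['this'])                    # 11, as in the output above
print(vocab['some-made-up-token-xyz'])  # not in the vocabulary -> default index 1 (<unk>)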
if not torch.cuda.is_available():
    print("Warning: this code may be very slow on CPU")
## Step 3-A: define the functions for transformation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

from torchtext import __version__ as torchtext_version
from pkg_resources import parse_version

if parse_version(torchtext_version) > parse_version("0.10"):
    label_pipeline = lambda x: 1. if x == 2 else 0.  # 1 ~ negative, 2 ~ positive review
else:
    label_pipeline = lambda x: 1. if x == 'pos' else 0.


## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)
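collate_batch pads every minibatch to the length of its longest sequence via nn.utils.rnn.pad_sequence. A minimal sketch of the padding behavior on toy tensors (values chosen only for illustration):

# pad_sequence appends the padding value (0 by default) so all sequences in the batch have equal length
a = torch.tensor([1, 2, 3], dtype=torch.int64)
b = torch.tensor([4, 5], dtype=torch.int64)
print(nn.utils.rnn.pad_sequence([a, b], batch_first=True))
# tensor([[1, 2, 3],
#         [4, 5, 0]])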
## Take a small batch
from torch.utils.data import DataLoader

dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)
tensor([[ 35, 1739, 7, 449, 721, 6, 301, 4, 787, 9, 4, 18, 44, 2, 1705, 2460, 186, 25, 7, 24, 100, 1874, 1739, 25, 7, 34415, 3568, 1103, 7517, 787, 5, 2, 4991, 12401, 36, 7, 148, 111, 939, 6, 11598, 2, 172, 135, 62, 25, 3199, 1602, 3, 928, 1500, 9, 6, 4601, 2, 155, 36, 14, 274, 4, 42945, 9, 4991, 3, 14, 10296, 34, 3568, 8, 51, 148, 30, 2, 58, 16, 11, 1893, 125, 6, 420, 1214, 27, 14542, 940, 11, 7, 29, 951, 18, 17, 15994, 459, 34, 2480, 15211, 3713, 2, 840, 3200, 9, 3568, 13, 107, 9, 175, 94, 25, 51, 10297, 1796, 27, 712, 16, 2, 220, 17, 4, 54, 722, 238, 395, 2, 787, 32, 27, 5236, 3, 32, 27, 7252, 5118, 2461, 6390, 4, 2873, 1495, 15, 2, 1054, 2874, 155, 3, 7015, 7, 409, 9, 41, 220, 17, 41, 390, 3, 3925, 807, 37, 74, 2858, 15, 10297, 115, 31, 189, 3506, 667, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 216, 175, 724, 5, 11, 18, 10, 226, 110, 14, 182, 78, 8, 13, 24, 182, 78, 8, 13, 166, 182, 50, 150, 24, 85, 2, 4031, 5935, 107, 96, 28, 1867, 602, 19, 52, 162, 21, 1698, 8, 6, 1181, 367, 2, 351, 10, 140, 419, 4, 333, 5, 6022, 7136, 5055, 1209, 10892, 32, 219, 9, 2, 405, 1413, 13, 4031, 13, 1099, 7, 85, 19, 2, 20, 1018, 4, 85, 565, 34, 24, 807, 55, 5, 68, 658, 10, 507, 8, 4, 668, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 10, 121, 24, 28, 98, 74, 589, 9, 149, 2, 7372, 3030, 14543, 1012, 520, 2, 985, 2327, 5, 16847, 5479, 19, 25, 67, 76, 3478, 38, 2, 7372, 3, 25, 67, 76, 2951, 34, 35, 10893, 155, 449, 29495, 23725, 10, 67, 2, 554, 12, 14543, 67, 91, 4, 50, 20, 19, 8, 67, 24, 4228, 2, 2142, 37, 33, 3478, 87, 3, 2564, 160, 155, 11, 634, 126, 24, 158, 72, 286, 13, 373, 2, 4804, 19, 2, 7372, 6794, 6, 30, 128, 73, 48, 10, 886, 8, 13, 24, 4, 85, 20, 19, 8, 13, 35, 218, 3, 428, 710, 2, 107, 936, 7, 54, 72, 223, 3, 10, 96, 122, 2, 103, 54, 72, 82, 2, 658, 202, 2, 106, 293, 103, 7, 1193, 3, 3031, 708, 5760, 3, 2918, 3991, 706, 3327, 349, 148, 286, 13, 139, 6, 2, 1501, 750, 29, 1407, 62, 65, 2612, 71, 40, 14, 4, 547, 9, 62, 8, 7943, 71, 14, 2, 5687, 5, 4868, 3111, 6, 205, 2, 18, 55, 2075, 3, 403, 12, 3111, 231, 45, 5, 271, 3, 68, 1400, 7, 9774, 932, 10, 102, 2, 20, 143, 28, 76, 55, 3810, 9, 2723, 5, 12, 10, 379, 2, 7372, 15, 4, 50, 710, 8, 13, 24, 887, 32, 31, 19, 8, 13, 428], [18923, 7, 4, 4753, 1669, 12, 3019, 6, 4, 13906, 502, 40, 25, 77, 1588, 9, 115, 6, 21713, 2, 90, 305, 237, 9, 502, 33, 77, 376, 4, 16848, 847, 62, 77, 131, 9, 2, 1580, 338, 5, 18923, 32, 2, 1980, 49, 157, 306, 21713, 46, 981, 6, 10298, 2, 18924, 125, 9, 502, 3, 453, 4, 1852, 630, 407, 3407, 34, 277, 29, 242, 2, 20200, 5, 18923, 77, 95, 41, 1833, 6, 2105, 56, 3, 495, 214, 528, 2, 3479, 2, 112, 7, 181, 1813, 3, 597, 5, 2, 156, 294, 4, 543, 173, 9, 1562, 289, 10038, 5, 2, 20, 26, 841, 1392, 62, 130, 111, 72, 832, 26, 181, 12402, 15, 69, 183, 6, 66, 55, 936, 5, 2, 63, 8, 7, 43, 4, 78, 23726, 15995, 13, 20, 17, 800, 5, 392, 59, 3992, 3, 371, 103, 2596, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 
       device='cuda:0')
tensor([1., 1., 1., 0.], device='cuda:0')
tensor([165,  86, 218, 145], device='cuda:0')
torch.Size([4, 218])
## Step 4: batching the datasets
batch_size = 32

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

Embedding layers for sentence encoding

  • input_dim: the number of words, i.e., the maximum integer index + 1 (num_embeddings in nn.Embedding).

  • output_dim: the size of each embedding vector (embedding_dim in nn.Embedding).

  • input_length: the length of the (padded) sequences.

    • For example, 'This is an example' -> [0, 0, 0, 0, 0, 0, 3, 1, 8, 9] => input_length is 10.

  • When the layer is called with integer values as input, the embedding layer converts each integer into a float vector of size [output_dim]:

    • If the input shape is [BATCH_SIZE], the output shape will be [BATCH_SIZE, output_dim].

    • If the input shape is [BATCH_SIZE, 10], the output shape will be [BATCH_SIZE, 10, output_dim].

Image(filename='figures/15_10.png', width=600)
(figures/15_10.png is displayed here)
embedding = nn.Embedding(num_embeddings=10,
                         embedding_dim=3,
                         padding_idx=0)

# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 0]])
print(embedding(text_encoded_input))
tensor([[[ 0.7039, -0.8321, -0.4651],
         [-0.3203,  2.2408,  0.5566],
         [-0.4643,  0.3046,  0.7046],
         [-0.7106, -0.2959,  0.8356]],

        [[-0.4643,  0.3046,  0.7046],
         [ 0.0946, -0.3531,  0.9124],
         [-0.3203,  2.2408,  0.5566],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)

Building an RNN model

  • RNN layers:

    • nn.RNN(input_size, hidden_size, num_layers=1)

    • nn.LSTM(..)

    • nn.GRU(..)

    • nn.RNN(input_size, hidden_size, num_layers=1, bidirectional=True)

## An example of building an RNN model
## with a simple RNN layer

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=2,
                          batch_first=True)
        # self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]   # use the final hidden state of the last RNN layer
        out = self.fc(out)
        return out

model = RNN(64, 32)
print(model)
model(torch.randn(5, 3, 64))
RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)
tensor([[ 0.3183],
        [ 0.1230],
        [ 0.1772],
        [-0.1052],
        [-0.1259]], grad_fn=<AddmmBackward0>)
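In the forward method, hidden[-1, :, :] selects the final hidden state of the topmost RNN layer. For a uni-directional RNN this equals the last time step of the output sequence, which a quick sketch can confirm (shapes chosen to match the example above):

rnn = nn.RNN(64, 32, num_layers=2, batch_first=True)
x = torch.randn(5, 3, 64)            # batch of 5 sequences, 3 time steps, 64 features
output, hidden = rnn(x)
print(output.shape)                  # torch.Size([5, 3, 32]) -- per-time-step outputs of the top layer
print(hidden.shape)                  # torch.Size([2, 5, 32]) -- final hidden state of each layer
print(torch.allclose(output[:, -1, :], hidden[-1]))  # True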

Building an RNN model for the sentiment analysis task

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(),
                                                enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)
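The forward method packs the padded embeddings with nn.utils.rnn.pack_padded_sequence so that the LSTM does not waste computation on, and is not influenced by, the <pad> positions. A minimal sketch of what packing does to a toy padded batch (toy values, for illustration only):

seqs = [torch.tensor([1., 2., 3.]), torch.tensor([4., 5.])]
lengths = torch.tensor([3, 2])
padded = nn.utils.rnn.pad_sequence(seqs, batch_first=True)
packed = nn.utils.rnn.pack_padded_sequence(
    padded, lengths, enforce_sorted=False, batch_first=True)
print(packed.data)          # tensor([1., 4., 2., 5., 3.]) -- padding entries are dropped
print(packed.batch_sizes)   # tensor([2, 2, 1]) -- active sequences per time step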
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)


def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item() * label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')
Epoch 0 accuracy: 0.5605 val_accuracy: 0.5928
Epoch 1 accuracy: 0.6868 val_accuracy: 0.7134
Epoch 2 accuracy: 0.8153 val_accuracy: 0.8168
Epoch 3 accuracy: 0.8735 val_accuracy: 0.8434
Epoch 4 accuracy: 0.9075 val_accuracy: 0.8514
Epoch 5 accuracy: 0.9306 val_accuracy: 0.8624
Epoch 6 accuracy: 0.9482 val_accuracy: 0.8426
Epoch 7 accuracy: 0.9636 val_accuracy: 0.8524
Epoch 8 accuracy: 0.9720 val_accuracy: 0.8664
Epoch 9 accuracy: 0.9832 val_accuracy: 0.8658
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')
test_accuracy: 0.8598

More on the bidirectional RNN

  • Trying a bidirectional recurrent layer

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True,
                           bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(),
                                                enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)
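In the bidirectional model, hidden has shape [num_layers * 2, batch_size, rnn_hidden_size]: hidden[-2] holds the final forward-direction state and hidden[-1] the final backward-direction state of the top layer, which is why the forward method concatenates the two. A small shape check (toy input, for illustration only):

bi_lstm = nn.LSTM(input_size=20, hidden_size=64,
                  batch_first=True, bidirectional=True)
x = torch.randn(8, 5, 20)            # 8 sequences of length 5 with 20 features each
out, (hidden, cell) = bi_lstm(x)
print(hidden.shape)                  # torch.Size([2, 8, 64]) -- [forward, backward]
print(out.shape)                     # torch.Size([8, 5, 128]) -- both directions concatenated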
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

num_epochs = 10

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')
Epoch 0 accuracy: 0.6067 val_accuracy: 0.5418
Epoch 1 accuracy: 0.7104 val_accuracy: 0.7012
Epoch 2 accuracy: 0.8063 val_accuracy: 0.8240
Epoch 3 accuracy: 0.8924 val_accuracy: 0.8626
Epoch 4 accuracy: 0.9365 val_accuracy: 0.8658
Epoch 5 accuracy: 0.9594 val_accuracy: 0.8670
Epoch 6 accuracy: 0.9767 val_accuracy: 0.8692
Epoch 7 accuracy: 0.9863 val_accuracy: 0.8684
Epoch 8 accuracy: 0.9914 val_accuracy: 0.8430
Epoch 9 accuracy: 0.9931 val_accuracy: 0.8782
test_dataset = IMDB(split='test')
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')
test_accuracy: 0.8566

Optional exercise:

Uni-directional SimpleRNN with full-length sequences
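One possible starting point for this exercise (a sketch, not a worked solution): reuse the sentiment model above, but swap the LSTM for nn.RNN and feed the full padded sequences directly, without packing.

class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        _, hidden = self.rnn(out)        # full padded sequence, no packing
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        return self.sigmoid(out)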




Readers may ignore the next cell.

! python ../.convert_notebook_to_script.py --input ch15_part2.ipynb --output ch15_part2.py
[NbConvertApp] WARNING | Config option `kernel_spec_manager_class` not recognized by `NbConvertApp`.
[NbConvertApp] Converting notebook ch15_part2.ipynb to script
[NbConvertApp] Writing 11376 bytes to ch15_part2.py