GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/tutorials/text_preproc_torch.ipynb

Open In Colab

Text preprocessing

We discuss how to convert a sequence of words or characters into numeric form, which can then be fed into an ML model.

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(seed=1)
import math

try:
    import torch
except ModuleNotFoundError:
    %pip install -qq torch
    import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data

!mkdir figures  # for saving plots
import collections
import re
import random
import os
import requests
import zipfile
import hashlib
import tarfile  # needed by download_extract below
# Required functions for downloading data


def download(name, cache_dir=os.path.join("..", "data")):
    """Download a file inserted into DATA_HUB, return the local filename."""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split("/")[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, "rb") as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # Hit cache
    print(f"Downloading {fname} from {url}...")
    r = requests.get(url, stream=True, verify=True)
    with open(fname, "wb") as f:
        f.write(r.content)
    return fname


def download_extract(name, folder=None):
    """Download and extract a zip/tar file."""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == ".zip":
        fp = zipfile.ZipFile(fname, "r")
    elif ext in (".tar", ".gz"):
        fp = tarfile.open(fname, "r")
    else:
        assert False, "Only zip/tar files can be extracted."
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

Data

As a simple example, we use the book "The Time Machine" by H. G. Wells, since it is short (about 30k words) and in the public domain.

DATA_HUB = dict()
DATA_URL = "http://d2l-data.s3-accelerate.amazonaws.com/"

DATA_HUB["time_machine"] = (DATA_URL + "timemachine.txt", "090b5e7e70c295757f55df93cb0a180b9691891a")


def read_time_machine():
    """Load the time machine dataset into a list of text lines."""
    with open(download("time_machine"), "r") as f:
        lines = f.readlines()
    return [re.sub("[^A-Za-z]+", " ", line).strip().lower() for line in lines]


lines = read_time_machine()
print(f"number of lines: {len(lines)}")
number of lines: 3221
for i in range(11):
    print(i, lines[i])
0 the time machine by h g wells
1
2
3
4
5 i
6
7
8 the time traveller for so it will be convenient to speak of him
9 was expounding a recondite matter to us his grey eyes shone and
10 twinkled and his usually pale face was flushed and animated the
nchars = 0
nwords = 0
for i in range(len(lines)):
    nchars += len(lines[i])
    words = lines[i].split()
    nwords += len(words)
print("total num characters ", nchars)
print("total num words ", nwords)
total num characters 170580
total num words 32775
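The same counts can be computed more compactly with generator expressions. This is just an equivalent sketch using the `lines` list built above.

nchars = sum(len(line) for line in lines)
nwords = sum(len(line.split()) for line in lines)
print("total num characters ", nchars)
print("total num words ", nwords)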

Tokenization

def tokenize(lines, token="word"):
    """Split text lines into word or character tokens."""
    if token == "word":
        return [line.split() for line in lines]
    elif token == "char":
        return [list(line) for line in lines]
    else:
        print("ERROR: unknown token type: " + token)


tokens = tokenize(lines)
for i in range(11):
    print(tokens[i])
['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
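The same helper can also tokenize at the character level; here is a quick illustration on the first line.

# Character-level tokenization of the first line, using the `tokenize` helper above.
print(tokenize(lines[:1], token="char")[0][:10])
# ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm']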

Vocabulary

We map each word to a unique integer id, assigning ids in order of decreasing frequency. We reserve the special id 0 for the unknown token, "<unk>". We also allow for a list of reserved tokens, such as "<pad>" for padding, "<bos>" to represent the beginning of a sequence, and "<eos>" for the end of a sequence.

class Vocab:
    """Vocabulary for text."""

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort according to frequencies
        counter = count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # The index for the unknown token is 0
        self.unk, uniq_tokens = 0, ["<unk>"] + reserved_tokens
        uniq_tokens += [token for token, freq in self.token_freqs if freq >= min_freq and token not in uniq_tokens]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]


def count_corpus(tokens):
    """Count token frequencies."""
    # Here `tokens` is a 1D list or 2D list
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

Here are the first 10 entries of the vocabulary (token and integer id): the unknown token, followed by the most frequent words in our corpus.

vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])
[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]
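As a quick sanity check, we can round-trip a few tokens through the vocabulary; "xyzzy" below is just a made-up out-of-vocabulary word.

# Round-trip a few tokens through the vocabulary; the unknown word maps to id 0.
ids = vocab[["the", "time", "machine", "xyzzy"]]
print(ids)                    # [1, 19, 50, 0]
print(vocab.to_tokens(ids))   # ['the', 'time', 'machine', '<unk>']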

Here is the integer encoding of a couple of lines.

for i in [0, 10]:
    print("words:", tokens[i])
    print("indices:", vocab[tokens[i]])
words: ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
indices: [1, 19, 50, 40, 2183, 2184, 400]
words: ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
indices: [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]

Putting it all together

We tokenize the corpus at the character level, and return the sequence of integers, as well as the corresponding Vocab object.

def load_corpus_time_machine(max_tokens=-1):
    """Return token indices and the vocabulary of the time machine dataset."""
    lines = read_time_machine()
    tokens = tokenize(lines, "char")
    vocab = Vocab(tokens)
    # Since each text line in the time machine dataset is not necessarily a
    # sentence or a paragraph, flatten all the text lines into a single list
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
corpus, vocab = load_corpus_time_machine()
len(corpus), len(vocab)
(170580, 28)
print(corpus[:20])
[3, 9, 2, 1, 3, 5, 13, 2, 1, 13, 4, 15, 9, 5, 6, 2, 1, 21, 19, 1]
print(list(vocab.token_to_idx.items())[:10])
[('<unk>', 0), (' ', 1), ('e', 2), ('t', 3), ('a', 4), ('i', 5), ('n', 6), ('o', 7), ('s', 8), ('h', 9)]
print([vocab.idx_to_token[i] for i in corpus[:20]])
['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm', 'a', 'c', 'h', 'i', 'n', 'e', ' ', 'b', 'y', ' ']

One-hot encodings

We can convert a sequence of N integers into an N x V one-hot matrix, where V is the vocabulary size.

x = torch.tensor(corpus[:3])
print(x)
X = F.one_hot(x, len(vocab))
print(X.shape)
print(X)
tensor([3, 9, 2])
torch.Size([3, 28])
tensor([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
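The encoding is invertible: taking the argmax along the last dimension recovers the original integer ids. A quick check using the tensors defined above:

# Invert the one-hot encoding by taking the argmax over the vocabulary dimension.
print(X.argmax(dim=-1))  # tensor([3, 9, 2])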

Language modeling

When fitting language models, we often need to chop up a long sequence into a set of short sequences, which may be overlapping, as shown below, where we extract subsequences of length $n=5$.

Below we show how to do this.

This section is based on sec 8.3.4 of http://d2l.ai/chapter_recurrent-neural-networks/language-models-and-dataset.html#reading-long-sequence-data

Random ordering

To increase the variety of the data, we can start the extraction at a random offset. We can thus create a random-sampling data iterator, as follows.

def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using random sampling."""
    # Start with a random offset (inclusive of `num_steps - 1`) to partition a
    # sequence
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 since we need to account for labels
    num_subseqs = (len(corpus) - 1) // num_steps
    # The starting indices for subsequences of length `num_steps`
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # In random sampling, the subsequences from two adjacent random
    # minibatches during iteration are not necessarily adjacent on the
    # original sequence
    random.shuffle(initial_indices)

    def data(pos):
        # Return a sequence of length `num_steps` starting from `pos`
        return corpus[pos : pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        # Here, `initial_indices` contains randomized starting indices for
        # subsequences
        initial_indices_per_batch = initial_indices[i : i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

For example, let us generate a sequence 0, 1, ..., 34, and then extract subsequences of length 5. Each minibatch will have 2 such subsequences, starting at random offsets. There is no ordering between the subsequences, either within or across minibatches. There are $\lfloor (35-1)/5 \rfloor = 6$ such subsequences, so the iterator will generate 3 minibatches, each of size 2.

For language modeling, the goal is to predict the next token at every position, so we define the labels $Y$ to be the input subsequence $X$ shifted by one token.

my_seq = list(range(35))
b = 0
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print("batch: ", b)
    print("X: ", X, "\nY:", Y)
    b += 1
batch: 0
X: tensor([[27, 28, 29, 30, 31],
           [ 2,  3,  4,  5,  6]])
Y: tensor([[28, 29, 30, 31, 32],
           [ 3,  4,  5,  6,  7]])
batch: 1
X: tensor([[22, 23, 24, 25, 26],
           [ 7,  8,  9, 10, 11]])
Y: tensor([[23, 24, 25, 26, 27],
           [ 8,  9, 10, 11, 12]])
batch: 2
X: tensor([[17, 18, 19, 20, 21],
           [12, 13, 14, 15, 16]])
Y: tensor([[18, 19, 20, 21, 22],
           [13, 14, 15, 16, 17]])
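As a quick check, every label minibatch equals the input minibatch shifted by one; on this particular toy sequence that means Y == X + 1 elementwise (this identity holds only because my_seq is the arithmetic sequence 0, 1, ..., 34).

# Sanity check on the toy sequence: labels are the inputs shifted by one position.
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    assert torch.equal(Y, X + 1)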

Sequential ordering

We can also require that the $i$'th subsequence in minibatch $b$ follows the $i$'th subsequence in minibatch $b-1$. This is useful when training RNNs, since when the model encounters batch $b$, the hidden state of the model will already have been initialized by the last token in sequence $i$ of batch $b-1$.

def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using sequential partitioning."""
    # Start with a random offset to partition a sequence
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = torch.tensor(corpus[offset : offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1 : offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i : i + num_steps]
        Y = Ys[:, i : i + num_steps]
        yield X, Y

Below we give an example. With the random offset drawn in this run, the first subsequence of the first minibatch is [1, 2, 3, 4, 5], and the first subsequence of the second minibatch is [6, 7, 8, 9, 10], which continues it in the original sequence, as desired.

for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print("X: ", X, "\nY:", Y)
X: tensor([[ 1,  2,  3,  4,  5],
           [17, 18, 19, 20, 21]])
Y: tensor([[ 2,  3,  4,  5,  6],
           [18, 19, 20, 21, 22]])
X: tensor([[ 6,  7,  8,  9, 10],
           [22, 23, 24, 25, 26]])
Y: tensor([[ 7,  8,  9, 10, 11],
           [23, 24, 25, 26, 27]])
X: tensor([[11, 12, 13, 14, 15],
           [27, 28, 29, 30, 31]])
Y: tensor([[12, 13, 14, 15, 16],
           [28, 29, 30, 31, 32]])
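We can also verify the sequential property programmatically. The check below is a small sketch that relies on my_seq being the arithmetic sequence 0, 1, ..., 34.

# Within each row, successive minibatches are contiguous: the first column of
# batch b+1 continues where the last column of batch b left off.
prev_X = None
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    if prev_X is not None:
        assert torch.equal(X[:, 0], prev_X[:, -1] + 1)
    prev_X = X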

Data iterator

class SeqDataLoader:  # @save
    """An iterator to load sequence data."""

    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else:
            self.data_iter_fn = seq_data_iter_sequential
        self.corpus, self.vocab = load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
def load_data_time_machine(batch_size, num_steps, use_random_iter=False, max_tokens=10000):  # @save
    """Return the iterator and the vocabulary of the time machine dataset."""
    data_iter = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab
data_iter, vocab = load_data_time_machine(2, 5)
print(list(vocab.token_to_idx.items())[:10])
[('<unk>', 0), (' ', 1), ('e', 2), ('t', 3), ('a', 4), ('i', 5), ('n', 6), ('o', 7), ('s', 8), ('h', 9)]
b = 0
for X, Y in data_iter:
    print("batch: ", b)
    print("X: ", X, "\nY:", Y)
    b += 1
    if b > 2:
        break
batch: 0
X: tensor([[ 5, 13,  2,  1, 13],
           [11,  2,  1, 17,  4]])
Y: tensor([[13,  2,  1, 13,  4],
           [ 2,  1, 17,  4,  8]])
batch: 1
X: tensor([[ 4, 15,  9,  5,  6],
           [ 8,  1,  4, 12,  7]])
Y: tensor([[15,  9,  5,  6,  2],
           [ 1,  4, 12,  7,  6]])
batch: 2
X: tensor([[ 2,  1, 21, 19,  1],
           [ 6, 18,  3,  9,  2]])
Y: tensor([[ 1, 21, 19,  1,  9],
           [18,  3,  9,  2,  1]])
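To make such a minibatch human-readable, we can decode it back into characters with the vocabulary. This is a small sketch using the data_iter and vocab built above; the exact strings it prints depend on the random offset drawn when iterating.

# Decode one minibatch of character ids back into strings.
X, Y = next(iter(data_iter))
print(["".join(vocab.to_tokens(row)) for row in X.tolist()])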

Machine translation

When dealing with sequence-to-sequence tasks, such as neural machine translation (NMT), we need to create a vocabulary for both the source and the target language. In addition, the input and output sequences may have different lengths, so we use padding to create fixed-size minibatches. We show how to do this below.

This is based on sec 9.5 of http://d2l.ai/chapter_recurrent-modern/machine-translation-and-dataset.html

Data

We use an English-French dataset that consists of bilingual sentence pairs from the Tatoeba Project. Each line in the dataset is a tab-delimited pair of an English text sequence (source) and the translated French text sequence (target).

DATA_HUB["fra-eng"] = (DATA_URL + "fra-eng.zip", "94646ad1522d915e7b0f9296181140edcf86a4f5") def read_data_nmt(): """Load the English-French dataset.""" data_dir = download_extract("fra-eng") with open(os.path.join(data_dir, "fra.txt"), "r") as f: return f.read() raw_text = read_data_nmt() print(raw_text[:100])
Go. Va !
Hi. Salut !
Run! Cours !
Run! Courez !
Who? Qui ?
Wow! Ça alors !
Fire! Au feu !
Help! À l'

Preprocessing

We apply several preprocessing steps: we replace non-breaking spaces with ordinary spaces, convert uppercase letters to lowercase, and insert a space between words and punctuation marks.

def preprocess_nmt(text):
    """Preprocess the English-French dataset."""

    def no_space(char, prev_char):
        return char in set(",.!?") and prev_char != " "

    # Replace non-breaking space with space, and convert uppercase letters to
    # lowercase ones
    text = text.replace("\u202f", " ").replace("\xa0", " ").lower()
    # Insert space between words and punctuation marks
    out = [" " + char if i > 0 and no_space(char, text[i - 1]) else char for i, char in enumerate(text)]
    return "".join(out)


text = preprocess_nmt(raw_text)
print(text[:110])
go . va !
hi . salut !
run ! cours !
run ! courez !
who ? qui ?
wow ! ça alors !
fire ! au feu !
help ! à l'ai
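Here is a small illustration of the punctuation rule on a made-up string (not part of the dataset): punctuation that directly follows a word gets a space inserted before it, and everything is lowercased.

print(preprocess_nmt("Hello, world!"))  # -> 'hello , world !'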

We tokenize at the word level. The following tokenize_nmt function tokenizes the first num_examples text sequence pairs, where each token is either a word or a punctuation mark.

def tokenize_nmt(text, num_examples=None):
    """Tokenize the English-French dataset."""
    source, target = [], []
    for i, line in enumerate(text.split("\n")):
        if num_examples and i > num_examples:
            break
        parts = line.split("\t")
        if len(parts) == 2:
            source.append(parts[0].split(" "))
            target.append(parts[1].split(" "))
    return source, target
source, target = tokenize_nmt(text)
source[:10], target[:10]
([['go', '.'], ['hi', '.'], ['run', '!'], ['run', '!'], ['who', '?'], ['wow', '!'], ['fire', '!'], ['help', '!'], ['jump', '.'], ['stop', '!']],
 [['va', '!'], ['salut', '!'], ['cours', '!'], ['courez', '!'], ['qui', '?'], ['ça', 'alors', '!'], ['au', 'feu', '!'], ['à', "l'aide", '!'], ['saute', '.'], ['ça', 'suffit', '!']])

Vocabulary

We can make a source and a target vocabulary. To avoid having too many unique tokens, we specify a minimum frequency of 2; any token that occurs fewer than 2 times is replaced by "<unk>". We also add special tags for padding ("<pad>"), beginning of sequence ("<bos>"), and end of sequence ("<eos>").

src_vocab = Vocab(source, min_freq=2, reserved_tokens=["<pad>", "<bos>", "<eos>"])
len(src_vocab)
10012
# French has more high-frequency words than English
target_vocab = Vocab(target, min_freq=2, reserved_tokens=["<pad>", "<bos>", "<eos>"])
len(target_vocab)
17851
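By construction, the reserved tokens sit immediately after <unk> in the vocabulary, so their ids are fixed. A quick check on the source vocabulary built above:

# Ids of the special tokens (expected: 0, 1, 2, 3 given the reserved-token order).
print(src_vocab["<unk>"], src_vocab["<pad>"], src_vocab["<bos>"], src_vocab["<eos>"])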

Truncation and padding

To create minibatches of sequences, all of the same length, we truncate sentences that are too long, and pad ones that are too short.

def truncate_pad(line, num_steps, padding_token):
    """Truncate or pad sequences."""
    if len(line) > num_steps:
        return line[:num_steps]  # Truncate
    return line + [padding_token] * (num_steps - len(line))  # Pad


print(truncate_pad(source[0], 10, "pad"))
print(truncate_pad(src_vocab[source[0]], 10, src_vocab["<pad>"]))
['go', '.', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad']
[47, 4, 1, 1, 1, 1, 1, 1, 1, 1]
def build_array_nmt(lines, vocab, num_steps):
    """Transform text sequences of machine translation into minibatches."""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab["<eos>"]] for l in lines]
    array = torch.tensor([truncate_pad(l, num_steps, vocab["<pad>"]) for l in lines])
    valid_len = (array != vocab["<pad>"]).type(torch.int32).sum(1)
    return array, valid_len
num_steps = 10
src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
print(src_array.shape)
print(src_valid_len.shape)
torch.Size([167130, 10])
torch.Size([167130])
print(src_array[0, :])  # go, ., eos, pad, ..., pad
print(src_valid_len[0])
tensor([47,  4,  3,  1,  1,  1,  1,  1,  1,  1])
tensor(3)
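We can also decode the first padded source sequence back into tokens to confirm the layout, using the to_tokens method of the vocabulary built above.

# Map the ids of the first padded sequence back to tokens.
print(src_vocab.to_tokens(src_array[0, :].tolist()))
# expected: ['go', '.', '<eos>', '<pad>', ..., '<pad>']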

Data iterator

Below we combine all of the above pieces into a handy function.

def load_array(data_arrays, batch_size, is_train=True):
    """Construct a PyTorch data iterator."""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)


def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Return the iterator and the vocabularies of the translation dataset."""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = Vocab(source, min_freq=2, reserved_tokens=["<pad>", "<bos>", "<eos>"])
    tgt_vocab = Vocab(target, min_freq=2, reserved_tokens=["<pad>", "<bos>", "<eos>"])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab

Show the first minibatch.

train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size=2, num_steps=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print("X:", X.type(torch.int32))
    print("valid lengths for X:", X_valid_len)
    print("Y:", Y.type(torch.int32))
    print("valid lengths for Y:", Y_valid_len)
    break
X: tensor([[29, 20,  4,  3,  1,  1,  1,  1],
           [44, 12,  4,  3,  1,  1,  1,  1]], dtype=torch.int32)
valid lengths for X: tensor([4, 4])
Y: tensor([[147,   0,  48,   4,   3,   1,   1,   1],
           [  0,   5,   3,   1,   1,   1,   1,   1]], dtype=torch.int32)
valid lengths for Y: tensor([5, 3])
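Finally, we can decode this minibatch back into tokens, using the valid lengths to drop the padding. This is a small sketch using the variables left in scope by the loop above; the exact sentence pairs shown depend on the shuffling of the data loader.

# Map the integer ids back to tokens; the trailing <eos> is kept, padding is dropped.
for row, n in zip(X.tolist(), X_valid_len.tolist()):
    print(src_vocab.to_tokens(row[:n]))
for row, n in zip(Y.tolist(), Y_valid_len.tolist()):
    print(tgt_vocab.to_tokens(row[:n]))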