You can find the PyTorch implementation of this notebook here: https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/text_preproc_torch.ipynb
Text preprocessing
We discuss how to convert a sequence of words or characters into numeric form, which can then be fed into an ML model.
Basics
This section is based on sec 8.2 of http://d2l.ai/chapter_recurrent-neural-networks/text-preprocessing.html
Data
As a simple example, we use the book "The Time Machine" by H. G. Wells, since it is short (30k words) and in the public domain.
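A minimal loading sketch, assuming the raw text has been downloaded to a local file `timemachine.txt` (the notebook's actual download helper may differ):

```python
import re

def read_time_machine(path='timemachine.txt'):
    """Load 'The Time Machine' and reduce each line to lowercase letters."""
    with open(path) as f:
        lines = f.readlines()
    # Strip anything that is not a letter, and lowercase the rest.
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

lines = read_time_machine()
print(f'# text lines: {len(lines)}')
print(lines[0])
```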
Tokenization
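A simple tokenizer in the spirit of the d2l.ai helper splits each line into either words or characters. A sketch, reusing `lines` from above:

```python
def tokenize(lines, token='word'):
    """Split each text line into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    raise ValueError(f'unknown token type: {token}')

tokens = tokenize(lines, token='word')
for line in tokens[:3]:
    print(line)
```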
Vocabulary
We map each word to a unique integer id, sorted by decreasing frequency. We reserve the special id of 0 for the "unknown word". We also allow for a list of reserved tokens, such as "pad" for padding, "bos" to represent the beginning of a sequence, and "eos" for the end of a sequence.
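A sketch of such a `Vocab` class, modeled on the d2l.ai design (details may differ from the notebook's actual implementation):

```python
import collections

def count_corpus(tokens):
    """Count token frequencies over a flat or nested list of tokens."""
    if tokens and isinstance(tokens[0], list):
        tokens = [tok for line in tokens for tok in line]
    return collections.Counter(tokens)

class Vocab:
    """Map tokens to integer ids, sorted by decreasing frequency."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        tokens = tokens or []
        reserved_tokens = reserved_tokens or []
        counter = count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # Id 0 is reserved for the unknown token; reserved tokens come next.
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {tok: i for i, tok in enumerate(self.idx_to_token)}
        for token, freq in self.token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(tok) for tok in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[i] for i in indices]

    @property
    def unk(self):
        return 0
```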
Here are the top 10 words (and their codes) in our corpus.
Here is a tokenization of a few sentences.
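With the `tokenize` and `Vocab` sketches above, code along these lines would produce such output:

```python
vocab = Vocab(tokens)

# Top 10 most frequent words and their integer codes.
print([(tok, vocab[tok]) for tok, _ in vocab.token_freqs[:10]])

# Encode a couple of lines of the corpus.
for i in [0, 10]:
    print('words:', tokens[i])
    print('indices:', vocab[tokens[i]])
```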
Putting it all together
We tokenize the corpus at the character level, and return the sequence of integers, as well as the corresponding Vocab object.
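A sketch following the d2l.ai `load_corpus_time_machine` helper, combining the loading, tokenization, and vocabulary steps above:

```python
def load_corpus_time_machine(max_tokens=-1):
    """Return character-level token ids of 'The Time Machine' and its Vocab."""
    lines = read_time_machine()
    tokens = tokenize(lines, token='char')
    vocab = Vocab(tokens)
    # Flatten into a single stream of ids, since the book is one long sequence.
    corpus = [vocab[tok] for line in tokens for tok in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab

corpus, vocab = load_corpus_time_machine()
len(corpus), len(vocab)
```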
One-hot encodings
We can convert a sequence of $N$ integers into an $N \times V$ one-hot matrix, where $V$ is the vocabulary size.
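In JAX this can be done with `jax.nn.one_hot`. A small sketch, reusing `corpus` and `vocab` from above:

```python
import jax.numpy as jnp
from jax import nn

V = len(vocab)                  # vocabulary size
seq = jnp.array(corpus[:5])     # first 5 token ids
onehot = nn.one_hot(seq, V)     # shape (N, V)
print(onehot.shape)
```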
Language modeling
When fitting language models, we often need to chop up a long sequence into a set of short sequences, which may be overlapping, as shown below, where we extract subsequences of length $n$.
Below we show how to do this.
This section is based on sec 8.3.4 of http://d2l.ai/chapter_recurrent-neural-networks/language-models-and-dataset.html#reading-long-sequence-data
Random ordering
To increase the variety of the data, we can start the extraction at a random offset. We can thus create a random sequence data iterator, as follows.
For example, let us generate the sequence 0, 1, ..., 34, and then extract subsequences of length 5. Each minibatch will have 2 such subsequences, starting at random offsets. There is no ordering between the subsequences, either within or across minibatches. There are 6 such subsequences, so the iterator will generate 3 minibatches, each of size 2.
For language modeling tasks, we define $\mathbf{x}$ to be the first $n$ tokens of a subsequence, and $\mathbf{y}$ to be the input shifted by one position, so that $y_t$ is the $(t+1)$'th token, which is the one to be predicted.
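A sketch of such an iterator, modeled on the d2l.ai `seq_data_iter_random` (the exact code in the notebook may differ):

```python
import random
import jax.numpy as jnp

def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate minibatches of subsequences using random sampling."""
    # Start at a random offset so different epochs see different subsequences.
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 because the targets are the inputs shifted by one position.
    num_subseqs = (len(corpus) - 1) // num_steps
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos: pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        batch_indices = initial_indices[i: i + batch_size]
        X = [data(j) for j in batch_indices]
        Y = [data(j + 1) for j in batch_indices]
        yield jnp.array(X), jnp.array(Y)

my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X:', X, '\nY:', Y)
```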
Sequential ordering
We can also require that the $i$'th subsequence in minibatch $b$ follows the $i$'th subsequence in minibatch $b-1$. This is useful when training RNNs, since when the model encounters batch $b$, the hidden state of the model will already be initialized by the last token of sequence $i$ in batch $b-1$.
Below we give an example. We see that the first subsequence in batch 1 is [0,1,2,3,4], and the first subsequence in batch 2 is [5,6,7,8,9], as desired.
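A sketch modeled on the d2l.ai `seq_data_iter_sequential`; note that because of the random starting offset, the printed subsequences only start at 0 when the offset happens to be 0:

```python
import random
import jax.numpy as jnp

def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Generate minibatches of subsequences that are contiguous across batches."""
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = jnp.array(corpus[offset: offset + num_tokens]).reshape(batch_size, -1)
    Ys = jnp.array(corpus[offset + 1: offset + 1 + num_tokens]).reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        yield Xs[:, i: i + num_steps], Ys[:, i: i + num_steps]

for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X:', X, '\nY:', Y)
```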
Data iterator
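A minimal wrapper, modeled on the d2l.ai `SeqDataLoader`, that selects between the two sampling schemes (reusing the sketches above):

```python
class SeqDataLoader:
    """Iterate over the Time Machine corpus with random or sequential sampling."""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        self.data_iter_fn = (seq_data_iter_random if use_random_iter
                             else seq_data_iter_sequential)
        self.corpus, self.vocab = load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

def load_data_time_machine(batch_size, num_steps, use_random_iter=False, max_tokens=10000):
    """Return the data iterator and the vocabulary of the Time Machine corpus."""
    data_iter = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab
```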
Machine translation
When dealing with sequence-to-sequence tasks, such as neural machine translation (NMT), we need to create a vocabulary for the source and target languages. In addition, the input and output sequences may have different lengths, so we need to use padding to ensure that we can create fixed-size minibatches. We show how to do this below.
This is based on sec 9.5 of http://d2l.ai/chapter_recurrent-modern/machine-translation-and-dataset.html
Data
We use an English-French dataset that consists of bilingual sentence pairs from the Tatoeba Project. Each line in the dataset is a tab-delimited pair of an English text sequence (source) and the translated French text sequence (target).
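A loading sketch, assuming the extracted dataset file `fra.txt` (tab-separated English/French pairs) is available locally:

```python
def read_data_nmt(path='fra.txt'):
    """Load the English-French dataset as one raw string."""
    with open(path, encoding='utf-8') as f:
        return f.read()

raw_text = read_data_nmt()
print(raw_text[:75])
```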
Preprocessing
We apply several preprocessing steps: we replace non-breaking spaces with regular spaces, convert uppercase letters to lowercase, and insert a space between words and punctuation marks.
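A sketch of such a preprocessing function, in the style of the d2l.ai `preprocess_nmt`:

```python
def preprocess_nmt(text):
    """Lowercase, normalize spaces, and put a space before punctuation."""
    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '

    # Replace non-breaking spaces with ordinary spaces and lowercase everything.
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    # Insert a space between words and the punctuation marks , . ! ?
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
           for i, char in enumerate(text)]
    return ''.join(out)

text = preprocess_nmt(raw_text)
print(text[:80])
```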
We tokenize at the word level. The following tokenize_nmt function tokenizes the first num_examples text sequence pairs, where each token is either a word or a punctuation mark.
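A sketch of `tokenize_nmt` along those lines:

```python
def tokenize_nmt(text, num_examples=None):
    """Tokenize the first num_examples tab-separated (source, target) pairs."""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target

source, target = tokenize_nmt(text)
source[:3], target[:3]
```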
Vocabulary
We can build a source and a target vocabulary. To avoid having too many unique tokens, we specify a minimum frequency of 2; any rarer token gets replaced by "unk". We also add special tokens for padding, beginning of sentence, and end of sentence.
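With the `Vocab` sketch from earlier and the tokenized `source`/`target` lists above, this might look like:

```python
src_vocab = Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
tgt_vocab = Vocab(target, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
len(src_vocab), len(tgt_vocab)
```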
Truncation and padding
To create minibatches of sequences, all of the same length, we truncate sentences that are too long, and pad ones that are too short.
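A `truncate_pad` sketch (reusing `src_vocab` and `source` from above):

```python
def truncate_pad(line, num_steps, padding_token):
    """Truncate or pad a token-id sequence to exactly num_steps entries."""
    if len(line) > num_steps:
        return line[:num_steps]                                 # truncate
    return line + [padding_token] * (num_steps - len(line))     # pad

truncate_pad(src_vocab[source[0]], 10, src_vocab['<pad>'])
```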
Data iterator
Below we combine all of the above pieces into a handy function.
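A sketch that chains the pieces above; unlike the notebook's version it uses a simple non-shuffling iterator, purely for illustration:

```python
import jax.numpy as jnp

def build_array_nmt(lines, vocab, num_steps):
    """Convert token lists into a padded id array plus the valid lengths."""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]
    array = jnp.array([truncate_pad(l, num_steps, vocab['<pad>']) for l in lines])
    valid_len = (array != vocab['<pad>']).astype(jnp.int32).sum(axis=1)
    return array, valid_len

def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Return a minibatch iterator and the source/target vocabularies."""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = Vocab(target, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)

    def data_iter():
        n = src_array.shape[0]
        for i in range(0, n - batch_size + 1, batch_size):
            yield (src_array[i:i + batch_size], src_valid_len[i:i + batch_size],
                   tgt_array[i:i + batch_size], tgt_valid_len[i:i + batch_size])

    return data_iter(), src_vocab, tgt_vocab
```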
Show the first minibatch.
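For instance, using the sketch above:

```python
train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size=2, num_steps=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X)
    print('valid lengths for X:', X_valid_len)
    print('Y:', Y)
    print('valid lengths for Y:', Y_valid_len)
    break
```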