GitHub Repository: fchollet/deep-learning-with-python-notebooks
Path: blob/master/chapter15_language-models-and-the-transformer.ipynb

This is a companion notebook for the book Deep Learning with Python, Third Edition. For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.

If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.

The book's contents are available online at deeplearningwithpython.io.

!pip install keras keras-hub --upgrade -q
import os

os.environ["KERAS_BACKEND"] = "jax"

# @title
import os
from IPython.core.magic import register_cell_magic

@register_cell_magic
def backend(line, cell):
    current, required = os.environ.get("KERAS_BACKEND", ""), line.split()[-1]
    if current == required:
        get_ipython().run_cell(cell)
    else:
        print(
            f"This cell requires the {required} backend. To run it, change KERAS_BACKEND to "
            f"\"{required}\" at the top of the notebook, restart the runtime, and rerun the notebook."
        )

Language models and the Transformer

The language model

Training a Shakespeare language model

import keras

filename = keras.utils.get_file(
    origin=(
        "https://storage.googleapis.com/download.tensorflow.org/"
        "data/shakespeare.txt"
    ),
)
shakespeare = open(filename, "r").read()

shakespeare[:250]

import tensorflow as tf

sequence_length = 100

def split_input(input, sequence_length):
    for i in range(0, len(input), sequence_length):
        yield input[i : i + sequence_length]

features = list(split_input(shakespeare[:-1], sequence_length))
labels = list(split_input(shakespeare[1:], sequence_length))
dataset = tf.data.Dataset.from_tensor_slices((features, labels))

x, y = next(dataset.as_numpy_iterator())
x[:50], y[:50]

from keras import layers

tokenizer = layers.TextVectorization(
    standardize=None,
    split="character",
    output_sequence_length=sequence_length,
)
tokenizer.adapt(dataset.map(lambda text, labels: text))

vocabulary_size = tokenizer.vocabulary_size()
vocabulary_size

dataset = dataset.map(
    lambda features, labels: (tokenizer(features), tokenizer(labels)),
    num_parallel_calls=8,
)
training_data = dataset.shuffle(10_000).batch(64).cache()

embedding_dim = 256
hidden_dim = 1024

inputs = layers.Input(shape=(sequence_length,), dtype="int", name="token_ids")
x = layers.Embedding(vocabulary_size, embedding_dim)(inputs)
x = layers.GRU(hidden_dim, return_sequences=True)(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(vocabulary_size, activation="softmax")(x)
model = keras.Model(inputs, outputs)

model.summary(line_length=80)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)
model.fit(training_data, epochs=20)

Generating Shakespeare

inputs = keras.Input(shape=(1,), dtype="int", name="token_ids")
input_state = keras.Input(shape=(hidden_dim,), name="state")
x = layers.Embedding(vocabulary_size, embedding_dim)(inputs)
x, output_state = layers.GRU(hidden_dim, return_state=True)(
    x, initial_state=input_state
)
outputs = layers.Dense(vocabulary_size, activation="softmax")(x)
generation_model = keras.Model(
    inputs=(inputs, input_state),
    outputs=(outputs, output_state),
)
generation_model.set_weights(model.get_weights())

tokens = tokenizer.get_vocabulary()
token_ids = range(vocabulary_size)
char_to_id = dict(zip(tokens, token_ids))
id_to_char = dict(zip(token_ids, tokens))

prompt = """
KING RICHARD III:
"""

input_ids = [char_to_id[c] for c in prompt]
state = keras.ops.zeros(shape=(1, hidden_dim))
for token_id in input_ids:
    inputs = keras.ops.expand_dims([token_id], axis=0)
    predictions, state = generation_model.predict((inputs, state), verbose=0)

import numpy as np

generated_ids = []
max_length = 250
for i in range(max_length):
    next_char = int(np.argmax(predictions, axis=-1)[0])
    generated_ids.append(next_char)
    inputs = keras.ops.expand_dims([next_char], axis=0)
    predictions, state = generation_model.predict((inputs, state), verbose=0)

output = "".join([id_to_char[token_id] for token_id in generated_ids])
print(prompt + output)

Sequence-to-sequence learning

English-to-Spanish Translation

import pathlib

zip_path = keras.utils.get_file(
    origin=(
        "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
    ),
    fname="spa-eng",
    extract=True,
)
text_path = pathlib.Path(zip_path) / "spa-eng" / "spa.txt"

with open(text_path) as f:
    lines = f.read().split("\n")[:-1]

text_pairs = []
for line in lines:
    english, spanish = line.split("\t")
    spanish = "[start] " + spanish + " [end]"
    text_pairs.append((english, spanish))

import random

random.choice(text_pairs)

import random

random.shuffle(text_pairs)
val_samples = int(0.15 * len(text_pairs))
train_samples = len(text_pairs) - 2 * val_samples
train_pairs = text_pairs[:train_samples]
val_pairs = text_pairs[train_samples : train_samples + val_samples]
test_pairs = text_pairs[train_samples + val_samples :]

import string
import re

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", ""
    )

vocab_size = 15000
sequence_length = 20

english_tokenizer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
spanish_tokenizer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
english_tokenizer.adapt(train_english_texts)
spanish_tokenizer.adapt(train_spanish_texts)

batch_size = 64

def format_dataset(eng, spa):
    eng = english_tokenizer(eng)
    spa = spanish_tokenizer(spa)
    # Decoder inputs are the Spanish sequence shifted right; labels are the
    # same sequence shifted left (next-token prediction). Padding positions
    # get a sample weight of zero.
    features = {"english": eng, "spanish": spa[:, :-1]}
    labels = spa[:, 1:]
    sample_weights = labels != 0
    return features, labels, sample_weights

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

inputs, targets, sample_weights = next(iter(train_ds))
print(inputs["english"].shape)

print(inputs["spanish"].shape)

print(targets.shape)

print(sample_weights.shape)

Sequence-to-sequence learning with RNNs

embed_dim = 256
hidden_dim = 1024

source = keras.Input(shape=(None,), dtype="int32", name="english")
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source)
rnn_layer = layers.GRU(hidden_dim)
rnn_layer = layers.Bidirectional(rnn_layer, merge_mode="sum")
encoder_output = rnn_layer(x)

target = keras.Input(shape=(None,), dtype="int32", name="spanish")
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(target)
rnn_layer = layers.GRU(hidden_dim, return_sequences=True)
x = rnn_layer(x, initial_state=encoder_output)
x = layers.Dropout(0.5)(x)
target_predictions = layers.Dense(vocab_size, activation="softmax")(x)
seq2seq_rnn = keras.Model([source, target], target_predictions)

seq2seq_rnn.summary(line_length=80)

seq2seq_rnn.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    weighted_metrics=["accuracy"],
)
seq2seq_rnn.fit(train_ds, epochs=15, validation_data=val_ds)

import numpy as np

spa_vocab = spanish_tokenizer.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))

def generate_translation(input_sentence):
    tokenized_input_sentence = english_tokenizer([input_sentence])
    decoded_sentence = "[start]"
    for i in range(sequence_length):
        tokenized_target_sentence = spanish_tokenizer([decoded_sentence])
        inputs = [tokenized_input_sentence, tokenized_target_sentence]
        next_token_predictions = seq2seq_rnn.predict(inputs, verbose=0)
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(generate_translation(input_sentence))

The Transformer architecture

Dot-product attention
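The notebook has no code cell for this section, since the book covers dot-product attention in prose. Purely as an illustration (not part of the book's code), the following minimal sketch computes scaled dot-product attention with keras.ops; the toy shapes, seeds, and variable names are made up for this example.

import keras
from keras import ops

batch_size, seq_len, dim = 1, 4, 8
query = keras.random.normal((batch_size, seq_len, dim), seed=1)
key = keras.random.normal((batch_size, seq_len, dim), seed=2)
value = keras.random.normal((batch_size, seq_len, dim), seed=3)

# Pairwise similarity of every query position with every key position,
# scaled by the square root of the feature dimension.
scores = ops.matmul(query, ops.transpose(key, axes=(0, 2, 1))) / (dim ** 0.5)
# Softmax over the key axis turns the scores into attention weights.
weights = ops.softmax(scores, axis=-1)
# Each output vector is a weighted average of the value vectors.
attention_output = ops.matmul(weights, value)
attention_output.shape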

Transformer encoder block

class TransformerEncoder(keras.Layer):
    def __init__(self, hidden_dim, intermediate_dim, num_heads):
        super().__init__()
        key_dim = hidden_dim // num_heads
        self.self_attention = layers.MultiHeadAttention(num_heads, key_dim)
        self.self_attention_layernorm = layers.LayerNormalization()
        self.feed_forward_1 = layers.Dense(intermediate_dim, activation="relu")
        self.feed_forward_2 = layers.Dense(hidden_dim)
        self.feed_forward_layernorm = layers.LayerNormalization()

    def call(self, source, source_mask):
        # Self-attention sublayer with a residual connection and layer norm.
        residual = x = source
        mask = source_mask[:, None, :]
        x = self.self_attention(query=x, key=x, value=x, attention_mask=mask)
        x = x + residual
        x = self.self_attention_layernorm(x)
        # Feed-forward sublayer with a residual connection and layer norm.
        residual = x
        x = self.feed_forward_1(x)
        x = self.feed_forward_2(x)
        x = x + residual
        x = self.feed_forward_layernorm(x)
        return x

Transformer decoder block

class TransformerDecoder(keras.Layer):
    def __init__(self, hidden_dim, intermediate_dim, num_heads):
        super().__init__()
        key_dim = hidden_dim // num_heads
        self.self_attention = layers.MultiHeadAttention(num_heads, key_dim)
        self.self_attention_layernorm = layers.LayerNormalization()
        self.cross_attention = layers.MultiHeadAttention(num_heads, key_dim)
        self.cross_attention_layernorm = layers.LayerNormalization()
        self.feed_forward_1 = layers.Dense(intermediate_dim, activation="relu")
        self.feed_forward_2 = layers.Dense(hidden_dim)
        self.feed_forward_layernorm = layers.LayerNormalization()

    def call(self, target, source, source_mask):
        # Causal self-attention over the target sequence.
        residual = x = target
        x = self.self_attention(query=x, key=x, value=x, use_causal_mask=True)
        x = x + residual
        x = self.self_attention_layernorm(x)
        # Cross-attention from the target to the encoder output.
        residual = x
        mask = source_mask[:, None, :]
        x = self.cross_attention(
            query=x, key=source, value=source, attention_mask=mask
        )
        x = x + residual
        x = self.cross_attention_layernorm(x)
        # Feed-forward sublayer with a residual connection and layer norm.
        residual = x
        x = self.feed_forward_1(x)
        x = self.feed_forward_2(x)
        x = x + residual
        x = self.feed_forward_layernorm(x)
        return x

Sequence-to-sequence learning with a Transformer

hidden_dim = 256
intermediate_dim = 2048
num_heads = 8

source = keras.Input(shape=(None,), dtype="int32", name="english")
x = layers.Embedding(vocab_size, hidden_dim)(source)
encoder_output = TransformerEncoder(hidden_dim, intermediate_dim, num_heads)(
    source=x,
    source_mask=source != 0,
)

target = keras.Input(shape=(None,), dtype="int32", name="spanish")
x = layers.Embedding(vocab_size, hidden_dim)(target)
x = TransformerDecoder(hidden_dim, intermediate_dim, num_heads)(
    target=x,
    source=encoder_output,
    source_mask=source != 0,
)
x = layers.Dropout(0.5)(x)
target_predictions = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model([source, target], target_predictions)

transformer.summary(line_length=80)

transformer.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    weighted_metrics=["accuracy"],
)
transformer.fit(train_ds, epochs=15, validation_data=val_ds)

Embedding positional information

from keras import ops

class PositionalEmbedding(keras.Layer):
    def __init__(self, sequence_length, input_dim, output_dim):
        super().__init__()
        self.token_embeddings = layers.Embedding(input_dim, output_dim)
        self.position_embeddings = layers.Embedding(sequence_length, output_dim)

    def call(self, inputs):
        # Position indices 0, 1, 2, ... for each timestep in the sequence.
        positions = ops.cumsum(ops.ones_like(inputs), axis=-1) - 1
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

hidden_dim = 256
intermediate_dim = 2056
num_heads = 8

source = keras.Input(shape=(None,), dtype="int32", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, hidden_dim)(source)
encoder_output = TransformerEncoder(hidden_dim, intermediate_dim, num_heads)(
    source=x,
    source_mask=source != 0,
)

target = keras.Input(shape=(None,), dtype="int32", name="spanish")
x = PositionalEmbedding(sequence_length, vocab_size, hidden_dim)(target)
x = TransformerDecoder(hidden_dim, intermediate_dim, num_heads)(
    target=x,
    source=encoder_output,
    source_mask=source != 0,
)
x = layers.Dropout(0.5)(x)
target_predictions = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model([source, target], target_predictions)

transformer.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    weighted_metrics=["accuracy"],
)
transformer.fit(train_ds, epochs=30, validation_data=val_ds)

import numpy as np

spa_vocab = spanish_tokenizer.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))

def generate_translation(input_sentence):
    tokenized_input_sentence = english_tokenizer([input_sentence])
    decoded_sentence = "[start]"
    for i in range(sequence_length):
        tokenized_target_sentence = spanish_tokenizer([decoded_sentence])
        tokenized_target_sentence = tokenized_target_sentence[:, :-1]
        inputs = [tokenized_input_sentence, tokenized_target_sentence]
        next_token_predictions = transformer.predict(inputs, verbose=0)
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(generate_translation(input_sentence))

Classification with a pretrained Transformer

Pretraining a Transformer encoder
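This section is descriptive in the book and has no code cell in the notebook. As a rough, hypothetical illustration only (not the book's code, and not how the pretrained weights loaded below were actually produced), the snippet sketches the masked-language-modeling setup commonly used to pretrain Transformer encoders such as RoBERTa: a fraction of token IDs is hidden, and the model is trained to recover them. The IDs, mask token, and mask rate are invented for this example.

import numpy as np

rng = np.random.default_rng(42)
mask_token_id = 0  # stand-in for a tokenizer's mask token id
token_ids = np.array([12, 45, 7, 89, 23, 56, 31, 78])

# Randomly choose roughly 15% of positions to hide from the model.
masked_positions = rng.random(token_ids.shape) < 0.15
inputs = np.where(masked_positions, mask_token_id, token_ids)
# The encoder sees `inputs`; the training labels are the original IDs,
# with unmasked positions ignored (marked here with -1).
labels = np.where(masked_positions, token_ids, -1)
print(inputs)
print(labels)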

Loading a pretrained Transformer

import keras_hub

tokenizer = keras_hub.models.Tokenizer.from_preset("roberta_base_en")
backbone = keras_hub.models.Backbone.from_preset("roberta_base_en")

tokenizer("The quick brown fox")

backbone.summary(line_length=80)

Preprocessing IMDb movie reviews

import os, pathlib, shutil, random

zip_path = keras.utils.get_file(
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    fname="imdb",
    extract=True,
)
imdb_extract_dir = pathlib.Path(zip_path) / "aclImdb"
train_dir = pathlib.Path("imdb_train")
test_dir = pathlib.Path("imdb_test")
val_dir = pathlib.Path("imdb_val")

shutil.copytree(imdb_extract_dir / "test", test_dir, dirs_exist_ok=True)

val_percentage = 0.2
for category in ("neg", "pos"):
    src_dir = imdb_extract_dir / "train" / category
    src_files = os.listdir(src_dir)
    random.Random(1337).shuffle(src_files)
    num_val_samples = int(len(src_files) * val_percentage)
    os.makedirs(train_dir / category, exist_ok=True)
    os.makedirs(val_dir / category, exist_ok=True)
    for index, file in enumerate(src_files):
        if index < num_val_samples:
            shutil.copy(src_dir / file, val_dir / category / file)
        else:
            shutil.copy(src_dir / file, train_dir / category / file)

from keras.utils import text_dataset_from_directory

batch_size = 16
train_ds = text_dataset_from_directory(train_dir, batch_size=batch_size)
val_ds = text_dataset_from_directory(val_dir, batch_size=batch_size)
test_ds = text_dataset_from_directory(test_dir, batch_size=batch_size)

def preprocess(text, label):
    packer = keras_hub.layers.StartEndPacker(
        sequence_length=512,
        start_value=tokenizer.start_token_id,
        end_value=tokenizer.end_token_id,
        pad_value=tokenizer.pad_token_id,
        return_padding_mask=True,
    )
    token_ids, padding_mask = packer(tokenizer(text))
    return {"token_ids": token_ids, "padding_mask": padding_mask}, label

preprocessed_train_ds = train_ds.map(preprocess)
preprocessed_val_ds = val_ds.map(preprocess)
preprocessed_test_ds = test_ds.map(preprocess)

next(iter(preprocessed_train_ds))

Fine-tuning a pretrained Transformer

inputs = backbone.input
x = backbone(inputs)
# Use the output vector at the first position (the start token) as a
# summary of the whole review.
x = x[:, 0, :]
x = layers.Dropout(0.1)(x)
x = layers.Dense(768, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
classifier = keras.Model(inputs, outputs)

classifier.compile(
    optimizer=keras.optimizers.Adam(5e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
classifier.fit(
    preprocessed_train_ds,
    validation_data=preprocessed_val_ds,
)

classifier.evaluate(preprocessed_test_ds)

What makes the Transformer effective?