"""1Title: Text classification with Transformer2Author: [Apoorv Nandan](https://twitter.com/NandanApoorv)3Date created: 2020/05/104Last modified: 2024/01/185Description: Implement a Transformer block as a Keras layer and use it for text classification.6Accelerator: GPU7Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT)8"""910"""11## Setup12"""1314import keras15from keras import ops16from keras import layers1718"""19## Implement a Transformer block as a layer20"""212223class TransformerBlock(layers.Layer):24def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):25super().__init__()26self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)27self.ffn = keras.Sequential(28[29layers.Dense(ff_dim, activation="relu"),30layers.Dense(embed_dim),31]32)33self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)34self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)35self.dropout1 = layers.Dropout(rate)36self.dropout2 = layers.Dropout(rate)3738def call(self, inputs):39attn_output = self.att(inputs, inputs)40attn_output = self.dropout1(attn_output)41out1 = self.layernorm1(inputs + attn_output)42ffn_output = self.ffn(out1)43ffn_output = self.dropout2(ffn_output)44return self.layernorm2(out1 + ffn_output)454647"""48## Implement embedding layer4950Two separate embedding layers, one for tokens, one for token index (positions).51"""525354class TokenAndPositionEmbedding(layers.Layer):55def __init__(self, maxlen, vocab_size, embed_dim):56super().__init__()57self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)58self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)5960def call(self, x):61maxlen = ops.shape(x)[-1]62positions = ops.arange(start=0, stop=maxlen, step=1)63positions = self.pos_emb(positions)64x = self.token_emb(x)65return x + positions666768"""69## Download and prepare dataset70"""7172vocab_size = 20000 # Only consider the top 20k words73maxlen = 200 # Only consider the first 200 words of each movie review74(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)75print(len(x_train), "Training sequences")76print(len(x_val), "Validation sequences")77x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen)78x_val = keras.utils.pad_sequences(x_val, maxlen=maxlen)7980"""81## Create classifier model using transformer layer8283Transformer layer outputs one vector for each time step of our input sequence.84Here, we take the mean across all time steps and85use a feed forward network on top of it to classify text.86"""878889embed_dim = 32 # Embedding size for each token90num_heads = 2 # Number of attention heads91ff_dim = 32 # Hidden layer size in feed forward network inside transformer9293inputs = layers.Input(shape=(maxlen,))94embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)95x = embedding_layer(inputs)96transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)97x = transformer_block(x)98x = layers.GlobalAveragePooling1D()(x)99x = layers.Dropout(0.1)(x)100x = layers.Dense(20, activation="relu")(x)101x = layers.Dropout(0.1)(x)102outputs = layers.Dense(2, activation="softmax")(x)103104model = keras.Model(inputs=inputs, outputs=outputs)105106107"""108## Train and Evaluate109"""110111model.compile(112optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]113)114history = model.fit(115x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)116)117118"""119## Relevant Chapters from Deep Learning with Python120- 

"""
## Relevant Chapters from Deep Learning with Python

- [Chapter 14: Text classification](https://deeplearningwithpython.io/chapters/chapter14_text-classification)
- [Chapter 15: Language models and the Transformer](https://deeplearningwithpython.io/chapters/chapter15_language-models-and-the-transformer)
"""