Path: blob/master/labml_nn/activations/fta/experiment.py
"""1---2title: Fuzzy Tiling Activation Experiment3summary: >4Training a transformer with FTA in FFN on Tiny Shakespeare.5---67# [Fuzzy Tiling Activation](index.html) Experiment89[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/activations/fta/experiment.ipynb)1011Here we train a transformer that uses [Fuzzy Tiling Activation](index.html) in the12[Feed-Forward Network](../../transformers/feed_forward.html).13We use it for a language model and train it on Tiny Shakespeare dataset14for demonstration.1516However, this is probably not the ideal task for FTA, and we17believe FTA is more suitable for modeling data with continuous variables.18"""1920import copy2122import torch23import torch.nn as nn2425from labml import experiment26from labml.configs import option27from labml_nn.activations.fta import FTA28from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs29from labml_nn.transformers import MultiHeadAttention, TransformerLayer30from labml_nn.transformers.utils import subsequent_mask313233class FeedForwardFTA(nn.Module):34"""35## FFN module with [FTA](index.html) activation36"""3738def __init__(self, d_model: int, d_ff: int,39activation: FTA,40dropout: float = 0.1):41"""42* `d_model` is the number of features in a token embedding43* `d_ff` is the number of features in the hidden layer of the FFN44* `activation` is FTA activation module45* `dropout` is dropout probability for the hidden layer46"""47super().__init__()48# Layer one parameterized by weight $W_1$ and bias $b_1$49self.layer1 = nn.Linear(d_model, d_ff)50# Layer two parameterized by weight $W_1$ and bias $b_1$51self.layer2 = nn.Linear(d_ff * activation.expansion_factor, d_model)52# Hidden layer dropout53self.dropout = nn.Dropout(dropout)54# Activation function $f$55self.activation = activation5657def forward(self, x: torch.Tensor):58# $f(x W_1 + b_1)$59x = self.activation(self.layer1(x))60# Apply dropout61x = self.dropout(x)62#63return self.layer2(x)646566class AutoregressiveTransformer(nn.Module):67"""68## Auto-Regressive model6970This is an autoregressive transformer model that uses Feed-Forward Networks with71(Fuzzy Tiling Activations)(index.html).72"""7374def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: TransformerLayer):75"""76:param n_tokens: is the number of tokens in the vocabulary77:param d_model: is the embedding size78:param n_layers: is the number of transformer layers79:param layer: is the layer. 
We use `n_layers` copies of this for the transformer.80"""81super().__init__()82# Transformer with `n_layers` layers83self.transformer_layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(n_layers)])8485# Token embedding layer86self.emb = nn.Embedding(n_tokens, d_model)87# Readout layer88self.readout = nn.Linear(d_model, n_tokens)8990# The mask will be initialized on the first call91self.mask = None9293def forward(self, x: torch.Tensor):94"""95:param x: are the input tokens of shape `[seq_len, batch_size]`96"""97# Create auto-regressive mask98if self.mask is None or self.mask.size(0) != len(x):99# Subsequent mask, will mask out tokens from seeing future tokens100self.mask = subsequent_mask(len(x)).to(x.device)101102# Get the token embeddings103x = self.emb(x)104# Transformer encoder105for layer in self.transformer_layers:106x = layer(x=x, mask=self.mask)107# Get logits108x = self.readout(x)109110# Return results111return x, None112113114class Configs(NLPAutoRegressionConfigs):115"""116## Configurations117118This inherits from119[`NLPAutoRegressionConfigs`](../../experiments/nlp_autoregression.html#NLPAutoRegressionConfigs)120"""121122# Model123model: AutoregressiveTransformer124125# Number of layers126n_layers: int = 4127128# $\alpha$ and $\beta$ for DeepNorm129deep_norm_alpha: float130deep_norm_beta: float131132# Number of heads in the attention133n_heads: int = 4134# Embedding size135d_model: int = 256136# Size of each attention head137d_k: int = 16138# Feed forward layer size139d_ff: int = 256140141# FTA142fta_lower_limit: float = -1.143fta_upper_limit: float = +1.144fta_delta: float = 0.2145fta_eta: float = 0.05146147148@option(Configs.model)149def _model(c: Configs):150"""151#### Initialize the model152"""153154# Create FTA activation module155fta = FTA(c.fta_lower_limit, c.fta_upper_limit, c.fta_delta, c.fta_eta)156# Create the transformer.157# We re-use [`TransformerLayer`](../../transformers/models.html#TransformerLayer) and158# [`MultiHeadAttention`](../../transformers/mha.html) implementations.159m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,160TransformerLayer(d_model=c.d_model,161feed_forward=FeedForwardFTA(d_model=c.d_model,162d_ff=c.d_ff,163activation=fta,164dropout=0.1),165self_attn=MultiHeadAttention(c.n_heads, c.d_model,166dropout_prob=0.0),167dropout_prob=0.0))168169# Move to the device170return m.to(c.device)171172173def main():174"""175#### Create and run the experiment176"""177# Create experiment178experiment.create(name="fta", writers={'screen', 'labml'})179# Create configs180conf = Configs()181# Override configurations182experiment.configs(conf, {183# Use character level tokenizer184'tokenizer': 'character',185# Prompt separator is blank186'prompt_separator': '',187# Starting prompt for sampling188'prompt': 'It is ',189# Use Tiny Shakespeare dataset190'text': 'tiny_shakespeare',191192# Use a context size of $256$193'seq_len': 256,194# Train for 32 epochs195'epochs': 32,196# Batch size $16$197'batch_size': 16,198# Switch between training and validation for $10$ times per epoch199'inner_iterations': 10,200201# Adam optimizer with no warmup202'optimizer.optimizer': 'Adam',203'optimizer.learning_rate': 3e-4,204})205206# Set model(s) for saving and loading207experiment.add_pytorch_models({'model': conf.model})208209# Start the experiment210with experiment.start():211# Run training212conf.run()213214215#216if __name__ == '__main__':217main()218219220
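

# A minimal shape-check sketch, not part of the original experiment, illustrating how the
# FTA activation widens the FFN hidden layer. The helper name `_fta_ffn_shape_check` and the
# example dimensions are assumptions for illustration only; the `FTA` constructor arguments
# and the `expansion_factor` attribute are the ones used by `FeedForwardFTA` above. With the
# default limits $[-1, +1]$ and $\delta = 0.2$ the expansion factor should work out to
# $2 / 0.2 = 10$, so the hidden layer grows from `d_ff` to `d_ff * 10` features before
# `layer2` projects back to `d_model`.
def _fta_ffn_shape_check():
    # FTA with the same settings as the default configurations above
    fta = FTA(-1., +1., 0.2, 0.05)
    # FFN block with FTA activation, using the default `d_model` and `d_ff` sizes
    ffn = FeedForwardFTA(d_model=256, d_ff=256, activation=fta, dropout=0.1)
    # Dummy input of shape `[seq_len, batch_size, d_model]`
    x = torch.randn(8, 4, 256)
    # FTA expands the last dimension by `expansion_factor`
    assert fta(torch.randn(2, 256)).shape[-1] == 256 * fta.expansion_factor
    # The FFN maps back to `d_model` features
    assert ffn(x).shape == (8, 4, 256)

# Call `_fta_ffn_shape_check()` manually to verify the shapes without running the experiment.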