Path: blob/master/labml_nn/activations/fta/experiment.py
"""1---2title: Fuzzy Tiling Activation Experiment3summary: >4Training a transformer with FTA in FFN on Tiny Shakespeare.5---67# [Fuzzy Tiling Activation](index.html) Experiment89[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/activations/fta/experiment.ipynb)1011Here we train a transformer that uses [Fuzzy Tiling Activation](index.html) in the12[Feed-Forward Network](../../transformers/feed_forward.html).13We use it for a language model and train it on Tiny Shakespeare dataset14for demonstration.1516However, this is probably not the ideal task for FTA, and we17believe FTA is more suitable for modeling data with continuous variables.18"""1920import copy2122import torch23import torch.nn as nn2425from labml import experiment26from labml.configs import option27from labml_nn.activations.fta import FTA28from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs29from labml_nn.transformers import MultiHeadAttention, TransformerLayer30from labml_nn.transformers.utils import subsequent_mask313233class FeedForwardFTA(nn.Module):34"""35## FFN module with [FTA](index.html) activation36"""3738def __init__(self, d_model: int, d_ff: int,39activation: FTA,40dropout: float = 0.1):41"""42* `d_model` is the number of features in a token embedding43* `d_ff` is the number of features in the hidden layer of the FFN44* `activation` is FTA activation module45* `dropout` is dropout probability for the hidden layer46"""47super().__init__()48# Layer one parameterized by weight $W_1$ and bias $b_1$49self.layer1 = nn.Linear(d_model, d_ff)50# Layer two parameterized by weight $W_1$ and bias $b_1$51self.layer2 = nn.Linear(d_ff * activation.expansion_factor, d_model)52# Hidden layer dropout53self.dropout = nn.Dropout(dropout)54# Activation function $f$55self.activation = activation5657def forward(self, x: torch.Tensor):58# $f(x W_1 + b_1)$59x = self.activation(self.layer1(x))60# Apply dropout61x = self.dropout(x)62#63return self.layer2(x)646566class AutoregressiveTransformer(nn.Module):67"""68## Auto-Regressive model6970This is an autoregressive transformer model that uses Feed-Forward Networks with71(Fuzzy Tiling Activations)(index.html).72"""7374def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: TransformerLayer):75"""76:param n_tokens: is the number of tokens in the vocabulary77:param d_model: is the embedding size78:param n_layers: is the number of transformer layers79:param layer: is the layer. 
We use `n_layers` copies of this for the transformer.80"""81super().__init__()82# Transformer with `n_layers` layers83self.transformer_layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(n_layers)])8485# Token embedding layer86self.emb = nn.Embedding(n_tokens, d_model)87# Readout layer88self.readout = nn.Linear(d_model, n_tokens)8990# The mask will be initialized on the first call91self.mask = None9293def forward(self, x: torch.Tensor):94"""95:param x: are the input tokens of shape `[seq_len, batch_size]`96"""97# Create auto-regressive mask98if self.mask is None or self.mask.size(0) != len(x):99# Subsequent mask, will mask out tokens from seeing future tokens100self.mask = subsequent_mask(len(x)).to(x.device)101102# Get the token embeddings103x = self.emb(x)104# Transformer encoder105for layer in self.transformer_layers:106x = layer(x=x, mask=self.mask)107# Get logits108x = self.readout(x)109110# Return results111return x, None112113114class Configs(NLPAutoRegressionConfigs):115"""116## Configurations117118This inherits from119[`NLPAutoRegressionConfigs`](../../experiments/nlp_autoregression.html#NLPAutoRegressionConfigs)120"""121122# Model123model: AutoregressiveTransformer124125# Number of layers126n_layers: int = 4127128# $\alpha$ and $\beta$ for DeepNorm129deep_norm_alpha: float130deep_norm_beta: float131132# Number of heads in the attention133n_heads: int = 4134# Embedding size135d_model: int = 256136# Size of each attention head137d_k: int = 16138# Feed forward layer size139d_ff: int = 256140141# FTA142fta_lower_limit: float = -1.143fta_upper_limit: float = +1.144fta_delta: float = 0.2145fta_eta: float = 0.05146147148@option(Configs.model)149def _model(c: Configs):150"""151#### Initialize the model152"""153154# Create FTA activation module155fta = FTA(c.fta_lower_limit, c.fta_upper_limit, c.fta_delta, c.fta_eta)156# Create the transformer.157# We re-use [`TransformerLayer`](../../transformers/models.html#TransformerLayer) and158# [`MultiHeadAttention`](../../transformers/mha.html) implementations.159m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,160TransformerLayer(d_model=c.d_model,161feed_forward=FeedForwardFTA(d_model=c.d_model,162d_ff=c.d_ff,163activation=fta,164dropout=0.1),165self_attn=MultiHeadAttention(c.n_heads, c.d_model,166dropout_prob=0.0),167dropout_prob=0.0))168169# Move to the device170return m.to(c.device)171172173def main():174"""175#### Create and run the experiment176"""177# Create experiment178experiment.create(name="fta", writers={'screen', 'labml'})179# Create configs180conf = Configs()181# Override configurations182experiment.configs(conf, {183# Use character level tokenizer184'tokenizer': 'character',185# Prompt separator is blank186'prompt_separator': '',187# Starting prompt for sampling188'prompt': 'It is ',189# Use Tiny Shakespeare dataset190'text': 'tiny_shakespeare',191192# Use a context size of $256$193'seq_len': 256,194# Train for 32 epochs195'epochs': 32,196# Batch size $16$197'batch_size': 16,198# Switch between training and validation for $10$ times per epoch199'inner_iterations': 10,200201# Adam optimizer with no warmup202'optimizer.optimizer': 'Adam',203'optimizer.learning_rate': 3e-4,204})205206# Set model(s) for saving and loading207experiment.add_pytorch_models({'model': conf.model})208209# Start the experiment210with experiment.start():211# Run training212conf.run()213214215#216if __name__ == '__main__':217main()218219220
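

# A minimal shape-check sketch, not part of the original experiment, illustrating how the
# FTA activation widens the FFN hidden layer. The helper name `_fta_ffn_shape_check` and the
# example dimensions are assumptions for illustration only; the `FTA` constructor arguments
# and the `expansion_factor` attribute are the ones used by `FeedForwardFTA` above. With the
# default limits $[-1, +1]$ and $\delta = 0.2$ the expansion factor should work out to
# $2 / 0.2 = 10$, so the hidden layer grows from `d_ff` to `d_ff * 10` features before
# `layer2` projects back to `d_model`.
def _fta_ffn_shape_check():
    # FTA with the same settings as the default configurations above
    fta = FTA(-1., +1., 0.2, 0.05)
    # FFN block with FTA activation, using the default `d_model` and `d_ff` sizes
    ffn = FeedForwardFTA(d_model=256, d_ff=256, activation=fta, dropout=0.1)
    # Dummy input of shape `[seq_len, batch_size, d_model]`
    x = torch.randn(8, 4, 256)
    # FTA expands the last dimension by `expansion_factor`
    assert fta(torch.randn(2, 256)).shape[-1] == 256 * fta.expansion_factor
    # The FFN maps back to `d_model` features
    assert ffn(x).shape == (8, 4, 256)

# Call `_fta_ffn_shape_check()` manually to verify the shapes without running the experiment.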