GitHub Repository: labmlai/annotated_deep_learning_paper_implementations
Path: blob/master/labml_nn/activations/fta/experiment.py
"""
---
title: Fuzzy Tiling Activation Experiment
summary: >
 Training a transformer with FTA in FFN on Tiny Shakespeare.
---

# [Fuzzy Tiling Activation](index.html) Experiment

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/activations/fta/experiment.ipynb)

Here we train a transformer that uses [Fuzzy Tiling Activation](index.html) in the
[Feed-Forward Network](../../transformers/feed_forward.html).
We use it for a language model and train it on the Tiny Shakespeare dataset
for demonstration.

However, this is probably not the ideal task for FTA, and we
believe FTA is more suitable for modeling data with continuous variables.
"""

import copy

import torch
import torch.nn as nn

from labml import experiment
from labml.configs import option
from labml_nn.activations.fta import FTA
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.transformers import MultiHeadAttention, TransformerLayer
from labml_nn.transformers.utils import subsequent_mask


class FeedForwardFTA(nn.Module):
    """
    ## FFN module with [FTA](index.html) activation
    """

    def __init__(self, d_model: int, d_ff: int,
                 activation: FTA,
                 dropout: float = 0.1):
        """
        * `d_model` is the number of features in a token embedding
        * `d_ff` is the number of features in the hidden layer of the FFN
        * `activation` is the FTA activation module
        * `dropout` is the dropout probability for the hidden layer
        """
        super().__init__()
        # Layer one parameterized by weight $W_1$ and bias $b_1$
        self.layer1 = nn.Linear(d_model, d_ff)
        # Layer two parameterized by weight $W_2$ and bias $b_2$;
        # it takes the FTA-expanded features (`d_ff * expansion_factor`) as input
        self.layer2 = nn.Linear(d_ff * activation.expansion_factor, d_model)
        # Hidden layer dropout
        self.dropout = nn.Dropout(dropout)
        # Activation function $f$
        self.activation = activation

    def forward(self, x: torch.Tensor):
        # $f(x W_1 + b_1)$
        x = self.activation(self.layer1(x))
        # Apply dropout
        x = self.dropout(x)
        # Apply the second linear layer to project back to `d_model`
        return self.layer2(x)


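# The following is an illustrative sketch, not part of the original
# implementation: it just traces the tensor shapes through `FeedForwardFTA`.
# FTA expands each hidden feature into `expansion_factor` bins, so the hidden
# representation grows before `layer2` projects it back to `d_model`.
def _feed_forward_fta_shape_sketch():
    """
    ### Hypothetical shape check for `FeedForwardFTA`
    """
    # FTA over $[-1, +1]$ with $\delta = 0.2$ and $\eta = 0.05$ (the defaults used in `Configs` below)
    fta = FTA(-1., +1., 0.2, 0.05)
    # FFN with embedding size $256$ and hidden size $256$
    ffn = FeedForwardFTA(d_model=256, d_ff=256, activation=fta, dropout=0.1)
    # A batch of $16$ token embeddings
    x = torch.randn(16, 256)
    # Internally the hidden layer has `d_ff * fta.expansion_factor` features,
    # but the output keeps the embedding size: `[16, 256]`
    return ffn(x).shape

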
class AutoregressiveTransformer(nn.Module):
    """
    ## Auto-Regressive model

    This is an autoregressive transformer model that uses Feed-Forward Networks with
    [Fuzzy Tiling Activations](index.html).
    """

    def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: TransformerLayer):
        """
        :param n_tokens: is the number of tokens in the vocabulary
        :param d_model: is the embedding size
        :param n_layers: is the number of transformer layers
        :param layer: is the layer. We use `n_layers` copies of this for the transformer.
        """
        super().__init__()
        # Transformer with `n_layers` layers
        self.transformer_layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(n_layers)])

        # Token embedding layer
        self.emb = nn.Embedding(n_tokens, d_model)
        # Readout layer
        self.readout = nn.Linear(d_model, n_tokens)

        # The mask will be initialized on the first call
        self.mask = None

    def forward(self, x: torch.Tensor):
        """
        :param x: are the input tokens of shape `[seq_len, batch_size]`
        """
        # Create the auto-regressive mask if it hasn't been created yet,
        # or if the sequence length has changed
        if self.mask is None or self.mask.size(0) != len(x):
            # Subsequent mask, will mask out tokens from seeing future tokens
            self.mask = subsequent_mask(len(x)).to(x.device)

        # Get the token embeddings
        x = self.emb(x)
        # Transformer encoder
        for layer in self.transformer_layers:
            x = layer(x=x, mask=self.mask)
        # Get logits
        x = self.readout(x)

        # Return results (the second value is a placeholder for state,
        # which this model doesn't use)
        return x, None


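# Another illustrative sketch (not part of the original code): a tiny forward
# pass through `AutoregressiveTransformer` to show the expected shapes.
# It assumes `TransformerLayer` and `MultiHeadAttention` are constructed
# exactly as in the model builder `_model` below; the sizes are arbitrary.
def _autoregressive_transformer_shape_sketch():
    """
    ### Hypothetical shape check for `AutoregressiveTransformer`
    """
    # A small FFN with FTA
    ffn = FeedForwardFTA(d_model=64, d_ff=64,
                         activation=FTA(-1., +1., 0.2, 0.05),
                         dropout=0.1)
    # A single transformer layer, configured like in `_model` below
    layer = TransformerLayer(d_model=64,
                             feed_forward=ffn,
                             self_attn=MultiHeadAttention(4, 64, dropout_prob=0.0),
                             dropout_prob=0.0)
    # Model with a small vocabulary of $65$ tokens and $2$ layers
    model = AutoregressiveTransformer(n_tokens=65, d_model=64, n_layers=2, layer=layer)
    # Token indices of shape `[seq_len, batch_size]`
    tokens = torch.randint(0, 65, (32, 4))
    # Logits of shape `[seq_len, batch_size, n_tokens]` and a `None` state
    logits, _ = model(tokens)
    return logits.shape

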
class Configs(NLPAutoRegressionConfigs):
    """
    ## Configurations

    This inherits from
    [`NLPAutoRegressionConfigs`](../../experiments/nlp_autoregression.html#NLPAutoRegressionConfigs)
    """

    # Model
    model: AutoregressiveTransformer

    # Number of layers
    n_layers: int = 4

    # $\alpha$ and $\beta$ for DeepNorm
    deep_norm_alpha: float
    deep_norm_beta: float

    # Number of heads in the attention
    n_heads: int = 4
    # Embedding size
    d_model: int = 256
    # Size of each attention head
    d_k: int = 16
    # Feed-forward layer size
    d_ff: int = 256

    # FTA
    fta_lower_limit: float = -1.
    fta_upper_limit: float = +1.
    fta_delta: float = 0.2
    fta_eta: float = 0.05


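# A rough note on the FTA defaults above (the exact value comes from
# `FTA.expansion_factor`): tiling $[-1, +1]$ with $\delta = 0.2$ presumably
# gives about $\frac{1 - (-1)}{0.2} = 10$ bins per feature, so inside
# `FeedForwardFTA` the hidden representation grows to roughly
# `d_ff * 10 = 2560` features before `layer2` projects it back to `d_model`.

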
@option(Configs.model)
def _model(c: Configs):
    """
    #### Initialize the model
    """

    # Create FTA activation module
    fta = FTA(c.fta_lower_limit, c.fta_upper_limit, c.fta_delta, c.fta_eta)
    # Create the transformer.
    # We re-use [`TransformerLayer`](../../transformers/models.html#TransformerLayer) and
    # [`MultiHeadAttention`](../../transformers/mha.html) implementations.
    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,
                                  TransformerLayer(d_model=c.d_model,
                                                   feed_forward=FeedForwardFTA(d_model=c.d_model,
                                                                               d_ff=c.d_ff,
                                                                               activation=fta,
                                                                               dropout=0.1),
                                                   self_attn=MultiHeadAttention(c.n_heads, c.d_model,
                                                                                dropout_prob=0.0),
                                                   dropout_prob=0.0))

    # Move to the device
    return m.to(c.device)


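# Note (an assumption about labml's config mechanism, not stated in this file):
# because `_model` is registered with `@option(Configs.model)`, the configs
# system presumably calls it to build the model the first time `conf.model`
# is accessed, for example by `experiment.add_pytorch_models` in `main` below.

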
def main():
    """
    #### Create and run the experiment
    """
    # Create experiment
    experiment.create(name="fta", writers={'screen', 'labml'})
    # Create configs
    conf = Configs()
    # Override configurations
    experiment.configs(conf, {
        # Use character level tokenizer
        'tokenizer': 'character',
        # Prompt separator is blank
        'prompt_separator': '',
        # Starting prompt for sampling
        'prompt': 'It is ',
        # Use Tiny Shakespeare dataset
        'text': 'tiny_shakespeare',

        # Use a context size of $256$
        'seq_len': 256,
        # Train for $32$ epochs
        'epochs': 32,
        # Batch size $16$
        'batch_size': 16,
        # Switch between training and validation $10$ times per epoch
        'inner_iterations': 10,

        # Adam optimizer with no warmup
        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 3e-4,
    })

    # Set model(s) for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # Run training
        conf.run()


#
if __name__ == '__main__':
    main()