Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
labmlai
GitHub Repository: labmlai/annotated_deep_learning_paper_implementations
Path: blob/master/labml_nn/optimizers/adam_warmup.py
4925 views
1
"""
---
title: Adam optimizer with warm-up
summary: A simple PyTorch implementation/tutorial of Adam optimizer with warm-up.
---

# Adam Optimizer with Warmup

This extends [AMSGrad optimizer](amsgrad.html) and adds a warmup stage.
"""
11
12
from typing import Dict
13
14
from labml_nn.optimizers import WeightDecay
15
from labml_nn.optimizers.amsgrad import AMSGrad
16
17
18
class AdamWarmup(AMSGrad):
    """
    ## Adam Optimizer with Warmup

    This class extends from AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html).
    It overrides `get_lr` so that the learning rate grows linearly during the
    first `warmup` steps and stays constant afterwards.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
                 weight_decay: WeightDecay = WeightDecay(),
                 optimized_update: bool = True,
                 amsgrad=False, warmup=0, defaults=None):
        r"""
        ### Initialize the optimizer

        * `params` is the list of parameters
        * `lr` is the learning rate $\alpha$
        * `betas` is a tuple of ($\beta_1$, $\beta_2$)
        * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update`
        * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html)
        * `optimized_update` is a flag whether to optimize the bias correction of the second moment
          by doing it after adding $\epsilon$
        * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam
        * `warmup` number of warmup steps
        * `defaults` is a dictionary of default for group values.
          This is useful when you want to extend the class `AdamWarmup`.
          The dictionary passed in is copied and is not modified.
        """

        # Copy the caller's dictionary so that adding `warmup` below does not
        # leak into an object the caller may reuse.
        defaults = {} if defaults is None else dict(defaults)
        defaults.update(dict(warmup=warmup))
        super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)

    def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
        r"""
        ### Get learning-rate

        $$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$

        where $w$ is the number of warmup steps.

        * `state` is a dictionary holding the current step count $t$ under the key `'step'`
        * `group` is a dictionary holding the learning rate $\alpha$ under `'lr'`
          and the number of warmup steps $w$ under `'warmup'`

        While $t < w$ a small constant ($10^{-8}$) is added to the scaled value,
        so the returned learning rate is not exactly zero at $t = 0$.
        With `warmup` set to $0$ the warmup branch is never taken.
        """
        # If we are in warmup stage
        if group['warmup'] > state['step']:
            # A linearly increasing learning rate from $0$ to $\alpha$
            return 1e-8 + state['step'] * group['lr'] / group['warmup']
        else:
            # Constant learning rate $\alpha$
            return group['lr']
62
63