"""
2
---
3
title: Fine Tune GPT-NeoX
4
summary: >
5
Fine tune GPT-NeoX biases with Fairscale pipeline parallel module
6
---
7
8
# Fine Tune GPT-NeoX
9
10
This shows how to fine tune GPT-NeoX with pipeline parallelism.
11
"""
12
13
import fairscale
14
import torch
15
import torch.nn as nn
16
import torch.utils.data
17
import torch.utils.data
18
import typing
19
from torch.utils.data import DataLoader, RandomSampler
20
21
from labml import experiment, monit, tracker, lab
22
from labml.configs import option
23
from labml.logger import inspect
24
from labml_nn.neox.utils.text_dataset import get_training_data
25
from labml_nn.neox.utils.finetune import FineTuneBiases
26
from labml_nn.neox.model import LayerGenerator, NeoXModule
27
from labml_nn.neox.utils import balance_layers_simple
28
from labml_nn.neox.utils.trainer import PipelineParallelTrainerConf
29
30
31
@option(PipelineParallelTrainerConf.layers, 'PipelineBiases')
def neox_layers(c: PipelineParallelTrainerConf):
    """
    ### Load GPT-NeoX layers
    """
    return list(LayerGenerator(is_clone_layers=c.is_clone_layers,
                               filter_layers=c.filter_layers,
                               dtype=c.dtype,
                               ).load())


@option(PipelineParallelTrainerConf.fine_tuner, 'PipelineBiases')
def fine_tune_biases(c: PipelineParallelTrainerConf):
    """
    ### Create fine tuner for biases
    """

    fine_tuner = FineTuneBiases(typing.cast(typing.List[NeoXModule], c.layers))
    # Mark biases as trainable
    fine_tuner.set_trainable_params()

    #
    return fine_tuner


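# `FineTuneBiases` implements bias-only fine-tuning. As a rough, self-contained
# sketch of the idea in plain PyTorch (an illustration only; the function below
# is hypothetical and the actual helper in `labml_nn.neox.utils.finetune` may
# work differently), marking just the bias parameters as trainable looks like:
def _mark_biases_trainable_sketch(layers: typing.List[nn.Module]):
    for layer in layers:
        for name, param in layer.named_parameters():
            # Keep gradients only for bias terms; freeze every other parameter
            param.requires_grad = name.endswith('bias')

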
@option(PipelineParallelTrainerConf.model, 'PipelineBiases')
def pipe_model(c: PipelineParallelTrainerConf):
    """
    ### Create pipeline parallel model
    """

    if c.is_checkpointing:
        raise NotImplementedError()
    else:
        layers = c.layers

    # Make sure the fine-tuner is initialized
    _ = c.fine_tuner

    # Create the Pipe module
    with monit.section('Pipe'):
        # Get the layer distribution across GPUs
        balance = balance_layers_simple(len(layers), c.n_gpus)
        inspect(balance=balance)
        # Devices for each GPU
        devices = [torch.device(f'cuda:{i}') for i in range(c.n_gpus)]
        # Create the Fairscale Pipe module
        pipe_model = fairscale.nn.Pipe(nn.Sequential(*layers),
                                       balance=balance,
                                       devices=devices,
                                       chunks=c.chunks)

    #
    return pipe_model


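# `balance_layers_simple` decides how many layers go on each GPU. As a rough
# illustration of the idea (not necessarily the exact algorithm used by the
# helper in `labml_nn.neox.utils`), an even split that hands the remainder to
# the earlier devices could look like this:
def _even_balance_sketch(n_layers: int, n_gpus: int) -> typing.List[int]:
    base, rem = divmod(n_layers, n_gpus)
    # The first `rem` devices get one extra layer each
    return [base + (1 if i < rem else 0) for i in range(n_gpus)]

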
@option(PipelineParallelTrainerConf.train_loader)
def tiny_shakespeare(c: PipelineParallelTrainerConf):
    """
    #### Tiny Shakespeare dataset
    """
    dataset = get_training_data(c.max_seq_len)

    return DataLoader(dataset,
                      batch_size=c.batch_size,
                      sampler=RandomSampler(dataset, replacement=True))


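# Note: `RandomSampler(dataset, replacement=True)` draws `len(dataset)` indices
# independently and with replacement, so an "epoch" of the loader above is that
# many random samples rather than exactly one pass over the dataset.

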
def main():
    # Create experiment
    experiment.create(name='pipe_neox_biases',
                      writers={'screen', 'web_api'})

    # Initialize configs
    conf = PipelineParallelTrainerConf()
    experiment.configs(conf, {
        'learning_rate': 3e-4,
        'is_checkpointing': False,
        'max_seq_len': 128,
        'batch_size': 64,
        'chunks': 8,
    })

    # Start the experiment
    with experiment.start():
        # Initialize the model. Do this before the loop for cleaner logs.
        _ = conf.model

        # Train
        for epoch in monit.loop(conf.epochs):
            conf.train_epoch()
            tracker.new_line()
            torch.save(conf.fine_tuner.state_dict(), str(lab.get_data_path() / 'fine_tune.pt'))


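# A possible way to restore the saved biases later (a sketch only; it assumes
# `FineTuneBiases` exposes a `load_state_dict` counterpart to the `state_dict()`
# call used above, which is not shown in this file):
#
#     fine_tuner = FineTuneBiases(typing.cast(typing.List[NeoXModule], layers))
#     fine_tuner.load_state_dict(torch.load(str(lab.get_data_path() / 'fine_tune.pt')))

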
#
if __name__ == '__main__':
    main()