# coding: utf-8

import sys
from python_environment_check import check_packages
import gzip
import shutil
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
import numpy as np

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')

# Check recommended package versions:

d = {
    'pandas': '1.3.2',
    'torch': '1.9.0',
    'torchtext': '0.11.0',
    'datasets': '1.11.0',
    'transformers': '4.9.1',
}
check_packages(d)

# # Chapter 16: Transformers – Improving Natural Language Processing with Attention Mechanisms (Part 3/3)

# **Outline**
#
# - [Fine-tuning a BERT model in PyTorch](#Fine-tuning-a-BERT-model-in-PyTorch)
#   - [Loading the IMDb movie review dataset](#Loading-the-IMDb-movie-review-dataset)
#   - [Tokenizing the dataset](#Tokenizing-the-dataset)
#   - [Loading and fine-tuning a pre-trained BERT model](#Loading-and-fine-tuning-a-pre-trained-BERT-model)
#   - [Fine-tuning a transformer more conveniently using the Trainer API](#Fine-tuning-a-transformer-more-conveniently-using-the-Trainer-API)
# - [Summary](#Summary)

# ---
#
# Quote from https://huggingface.co/transformers/custom_datasets.html:
#
# > DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark.
#
# ---

# ## Fine-tuning a BERT model in PyTorch

# ### Loading the IMDb movie review dataset
#

# **General Settings**

torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NUM_EPOCHS = 3

# **Download Dataset**

# The following cells will download the IMDb movie review dataset (http://ai.stanford.edu/~amaas/data/sentiment/) for positive-negative sentiment classification as a CSV-formatted file:

url = "https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz"
filename = url.split("/")[-1]

with open(filename, "wb") as f:
    r = requests.get(url)
    f.write(r.content)

with gzip.open('movie_data.csv.gz', 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Check that the dataset looks okay:

df = pd.read_csv('movie_data.csv')
df.head()

df.shape

# **Split Dataset into Train/Validation/Test**

train_texts = df.iloc[:35000]['review'].values
train_labels = df.iloc[:35000]['sentiment'].values

valid_texts = df.iloc[35000:40000]['review'].values
valid_labels = df.iloc[35000:40000]['sentiment'].values

test_texts = df.iloc[40000:]['review'].values
test_labels = df.iloc[40000:]['sentiment'].values

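# Optional sanity check (not in the original script): assuming the CSV contains
# the full 50,000 reviews, the splits above should have
# 35,000 / 5,000 / 10,000 examples, respectively.
print(len(train_texts), len(valid_texts), len(test_texts))
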
# ### Tokenizing the dataset

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

train_encodings[0]

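# Optional sanity check (not in the original script): decode the first encoded
# training example back into text to verify that truncation and padding behave
# as expected; skip_special_tokens=True drops the [CLS], [SEP], and [PAD] markers.
print(tokenizer.decode(train_encodings['input_ids'][0], skip_special_tokens=True))
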
# **Dataset Class and Loaders**

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = IMDbDataset(train_encodings, train_labels)
valid_dataset = IMDbDataset(valid_encodings, valid_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)


train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

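# Optional sanity check (not in the original script): pull one mini-batch from
# the training loader and inspect its tensor shapes; with batch_size=16, the
# input_ids and attention_mask tensors should both be [16, padded_seq_length].
batch = next(iter(train_loader))
print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['labels'].shape)
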
# ### Loading and fine-tuning a pre-trained BERT model

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

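# Optional check (not in the original script): count the trainable parameters.
# For DistilBERT plus the sequence classification head this should come out to
# roughly 66-67 million, consistent with the "40% less parameters than
# bert-base-uncased" claim quoted above.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of trainable parameters: {num_params:,}')
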
# **Train Model -- Manual Training Loop**

def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch in enumerate(data_loader):

            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

        return correct_pred.float()/num_examples * 100

start_time = time.time()

for epoch in range(NUM_EPOCHS):

    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']

        ### Backward
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        if not batch_idx % 250:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                  f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                  f'Loss: {loss:.4f}')

    model.eval()

    with torch.set_grad_enabled(False):
        print(f'Training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nValid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')


del model  # free memory

# ### Fine-tuning a transformer more conveniently using the Trainer API

# Reload pretrained model:

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train();


optim = torch.optim.Adam(model.parameters(), lr=5e-5)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Note: install the `datasets` package via `pip install datasets`

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # logits are a NumPy array, not a PyTorch tensor
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(
        predictions=predictions, references=labels)

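# Optional sanity check (not in the original script): call compute_metrics on a
# small dummy (logits, labels) tuple, mimicking what the Trainer passes in.
# Both argmax predictions match the labels here, so the expected output is
# {'accuracy': 1.0}.
dummy_logits = np.array([[0.1, 0.9], [2.0, -1.0]])
dummy_labels = np.array([1, 0])
print(compute_metrics((dummy_logits, dummy_labels)))
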
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(optim, None)  # optimizer and learning rate scheduler
)


# Force the model to use only 1 GPU (even if multiple are available)
# to compare more fairly to the previous code:

trainer.args._n_gpu = 1

start_time = time.time()
trainer.train()
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')


trainer.evaluate()

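# Optional alternative (not in the original script): Trainer.predict returns the
# raw test-set logits along with the evaluation metrics computed by
# compute_metrics, which should match the compute_accuracy check below.
pred_output = trainer.predict(test_dataset)
print(pred_output.metrics)
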
model.eval()
model.to(DEVICE)
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')


# ...

# ---
#
# Readers may ignore the next cell.