# coding: utf-8

import sys
from python_environment_check import check_packages
import gzip
import shutil
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
import numpy as np

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')

# Check recommended package versions:

d = {
    'pandas': '1.3.2',
    'torch': '1.9.0',
    'torchtext': '0.11.0',
    'datasets': '1.11.0',
    'transformers': '4.9.1',
}
check_packages(d)

# # Chapter 16: Transformers – Improving Natural Language Processing with Attention Mechanisms (Part 3/3)

# **Outline**
#
# - [Fine-tuning a BERT model in PyTorch](#Fine-tuning-a-BERT-model-in-PyTorch)
#   - [Loading the IMDb movie review dataset](#Loading-the-IMDb-movie-review-dataset)
#   - [Tokenizing the dataset](#Tokenizing-the-dataset)
#   - [Loading and fine-tuning a pre-trained BERT model](#Loading-and-fine-tuning-a-pre-trained-BERT-model)
#   - [Fine-tuning a transformer more conveniently using the Trainer API](#Fine-tuning-a-transformer-more-conveniently-using-the-Trainer-API)
# - [Summary](#Summary)

# ---
#
# Quote from https://huggingface.co/transformers/custom_datasets.html:
#
# > DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark.
#
# ---

# ## Fine-tuning a BERT model in PyTorch

# ### Loading the IMDb movie review dataset
#

# **General Settings**

torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NUM_EPOCHS = 3

# **Download Dataset**

# The following cells will download the IMDb movie review dataset (http://ai.stanford.edu/~amaas/data/sentiment/) for positive-negative sentiment classification as a CSV-formatted file:

url = "https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz"
filename = url.split("/")[-1]

with open(filename, "wb") as f:
    r = requests.get(url)
    f.write(r.content)

with gzip.open('movie_data.csv.gz', 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Check that the dataset looks okay:

df = pd.read_csv('movie_data.csv')
df.head()

df.shape

# **Split Dataset into Train/Validation/Test**

train_texts = df.iloc[:35000]['review'].values
train_labels = df.iloc[:35000]['sentiment'].values

valid_texts = df.iloc[35000:40000]['review'].values
valid_labels = df.iloc[35000:40000]['sentiment'].values

test_texts = df.iloc[40000:]['review'].values
test_labels = df.iloc[40000:]['sentiment'].values

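# Optional sanity check (not in the original script): assuming the CSV contains
# the full 50,000 reviews, the splits above should have
# 35,000 / 5,000 / 10,000 examples, respectively.
print(len(train_texts), len(valid_texts), len(test_texts))
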
# ### Tokenizing the dataset

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

train_encodings[0]

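# Optional sanity check (not in the original script): decode the first encoded
# training example back into text to verify that truncation and padding behave
# as expected; skip_special_tokens=True drops the [CLS], [SEP], and [PAD] markers.
print(tokenizer.decode(train_encodings['input_ids'][0], skip_special_tokens=True))
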
# **Dataset Class and Loaders**

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = IMDbDataset(train_encodings, train_labels)
valid_dataset = IMDbDataset(valid_encodings, valid_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)


train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

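# Optional sanity check (not in the original script): pull one mini-batch from
# the training loader and inspect its tensor shapes; with batch_size=16, the
# input_ids and attention_mask tensors should both be [16, padded_seq_length].
batch = next(iter(train_loader))
print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['labels'].shape)
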
# ### Loading and fine-tuning a pre-trained BERT model

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

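# Optional check (not in the original script): count the trainable parameters.
# For DistilBERT plus the sequence classification head this should come out to
# roughly 66-67 million, consistent with the "40% less parameters than
# bert-base-uncased" claim quoted above.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of trainable parameters: {num_params:,}')
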
# **Train Model -- Manual Training Loop**

def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch in enumerate(data_loader):

            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

        return correct_pred.float()/num_examples * 100

start_time = time.time()

for epoch in range(NUM_EPOCHS):

    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']

        ### Backward
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        if not batch_idx % 250:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                  f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                  f'Loss: {loss:.4f}')

    model.eval()

    with torch.set_grad_enabled(False):
        print(f'Training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nValid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')


del model  # free memory

# ### Fine-tuning a transformer more conveniently using the Trainer API

# Reload pretrained model:

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train();


optim = torch.optim.Adam(model.parameters(), lr=5e-5)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Note: install the `datasets` package via `pip install datasets`

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # logits are a NumPy array, not a PyTorch tensor
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(
        predictions=predictions, references=labels)

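# Optional sanity check (not in the original script): call compute_metrics on a
# small dummy (logits, labels) tuple, mimicking what the Trainer passes in.
# Both argmax predictions match the labels here, so the expected output is
# {'accuracy': 1.0}.
dummy_logits = np.array([[0.1, 0.9], [2.0, -1.0]])
dummy_labels = np.array([1, 0])
print(compute_metrics((dummy_logits, dummy_labels)))
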
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(optim, None)  # optimizer and learning rate scheduler
)


# Force the model to use only 1 GPU (even if multiple are available)
# to compare more fairly to the previous code:

trainer.args._n_gpu = 1

start_time = time.time()
trainer.train()
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')


trainer.evaluate()

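# Optional alternative (not in the original script): Trainer.predict returns the
# raw test-set logits along with the evaluation metrics computed by
# compute_metrics, which should match the compute_accuracy check below.
pred_output = trainer.predict(test_dataset)
print(pred_output.metrics)
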
model.eval()
model.to(DEVICE)
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')


# ...

# ---
#
# Readers may ignore the next cell.