GitHub Repository: rasbt/machine-learning-book
Path: blob/main/ch15/ch15_part3.py
# coding: utf-8


import sys
from python_environment_check import check_packages
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.distributions.categorical import Categorical

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')

# Check recommended package versions:

d = {
    'torch': '1.8.0',
}
check_packages(d)


# Chapter 15: Modeling Sequential Data Using Recurrent Neural Networks (part 3/3)
# ========
#
#
# **Outline**
#
# - Implementing RNNs for sequence modeling in PyTorch
#   - [Project two -- character-level language modeling in PyTorch](#Project-two----character-level-language-modeling-in-PyTorch)
#     - [Preprocessing the dataset](#Preprocessing-the-dataset)
#     - [Evaluation phase -- generating new text passages](#Evaluation-phase----generating-new-text-passages)
# - [Summary](#Summary)

# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).
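
# (Illustrative addition, not part of this script.) In the notebook version, the
# "following line(s)" referred to above load the watermark extension; a typical
# invocation looks roughly like the lines below. IPython magics do not run in a
# plain .py script, so they are shown here only as comments:
#
#     %load_ext watermark
#     %watermark -v -p torch,numpy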


# ## Project two: character-level language modeling in PyTorch
#

# ### Preprocessing the dataset

## Reading and processing text
# ('1268-0.txt' is the plain-text Project Gutenberg edition of Jules Verne's
# "The Mysterious Island"; the slicing below keeps only the body of the novel.)
with open('1268-0.txt', 'r', encoding="utf8") as fp:
    text = fp.read()

start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')

text = text[start_indx:end_indx]
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))


chars_sorted = sorted(char_set)
char2int = {ch: i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)

text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], ' == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse ==> ', ''.join(char_array[text_encoded[15:21]]))
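
# (Hypothetical helpers, added for illustration; they are not part of the original
# code.) The char2int dictionary and char_array defined above can be wrapped into
# small encode/decode functions for round-tripping between text and integer indices:
def encode(s):
    return np.array([char2int[ch] for ch in s], dtype=np.int32)

def decode(ids):
    return ''.join(char_array[ids])

print(decode(encode('ISLAND')))  # round trip: prints 'ISLAND' again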


for ex in text_encoded[:5]:
    print('{} -> {}'.format(ex, char_array[ex]))


seq_length = 40
chunk_size = seq_length + 1

# Each chunk holds seq_length + 1 characters, so that input and target
# sequences can later be created by shifting the chunk by one position.
text_chunks = [text_encoded[i:i+chunk_size]
               for i in range(len(text_encoded)-chunk_size+1)]

## inspection:
for seq in text_chunks[:1]:
    input_seq = seq[:seq_length]
    target = seq[seq_length]
    print(input_seq, ' -> ', target)
    print(repr(''.join(char_array[input_seq])),
          ' -> ', repr(''.join(char_array[target])))


class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, idx):
        # input: all but the last character; target: all but the first (shifted by one)
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()

# Stacking the chunks into a single NumPy array first avoids the slow
# (and, in recent PyTorch versions, warned-about) conversion of a list of arrays.
seq_dataset = TextDataset(torch.tensor(np.array(text_chunks)))


for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x):', repr(''.join(char_array[seq])))
    print('Target (y):', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break


# Use the GPU if one is available, otherwise fall back to the CPU:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


batch_size = 64

torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
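
# (Optional sanity check, not in the original notebook.) Each batch drawn from
# seq_dl should contain batch_size input sequences and batch_size target
# sequences, each of length seq_length:
seq_batch, target_batch = next(iter(seq_dl))
print('Input batch shape: ', seq_batch.shape)     # expected: torch.Size([64, 40])
print('Target batch shape:', target_batch.shape)  # expected: torch.Size([64, 40])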


# ### Building a character-level RNN model


class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        # x holds one character index per example, i.e. shape (batch_size,)
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)

vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model = model.to(device)
print(model)
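
# (Added check, not in the original notebook.) Counting the trainable parameters
# gives a rough sense of the model size:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of trainable parameters: {num_params:,}')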


loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 10000

torch.manual_seed(1)

for epoch in range(num_epochs):
    # Each "epoch" here processes one randomly drawn batch, not a full pass over the dataset.
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')
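
# (Optional addition, not part of the original notebook.) The trained weights can
# be saved and restored later; the file name below is only an example:
torch.save(model.state_dict(), 'ch15_char_rnn.pt')
# model.load_state_dict(torch.load('ch15_char_rnn.pt'))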


# ### Evaluation phase: generating new text passages


# With three equal logits, the three categories are equally likely:
torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 1.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))

print(samples.numpy())


# Raising one logit (here, the third) makes that category dominate the samples:
torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))

print(samples.numpy())
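
# (Added illustration, not in the original notebook.) Drawing many samples from the
# same distribution shows how strongly category 2 (logit 3.0) dominates:
many_samples = m.sample((1000,)).view(-1)
print('Sample counts per category:', torch.bincount(many_samples, minlength=3).numpy())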


def sample(model, starting_str,
           len_generated_text=500,
           scale_factor=1.0):

    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to('cpu')
    cell = cell.to('cpu')
    # Warm up the hidden state on the starting string (all but its last character):
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

    # Generate new characters one at a time, feeding each sampled character back in:
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell)
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])

    return generated_str

torch.manual_seed(1)
model.to('cpu')
print(sample(model, starting_str='The island'))

# * **Predictability vs. randomness**
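#
# (Added note, not part of the original notebook text.) Multiplying the logits by a
# factor alpha before the softmax acts like an inverse temperature:
#
#     p_i = exp(alpha * z_i) / sum_j exp(alpha * z_j)
#
# With alpha > 1 the distribution becomes more peaked (more predictable text);
# with alpha < 1 it becomes closer to uniform (more random text). This is exactly
# what the scale_factor argument of sample() controls, as the probabilities
# printed below illustrate.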

logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities before scaling: ', nn.functional.softmax(logits, dim=1).numpy()[0])
print('Probabilities after scaling with 0.5:', nn.functional.softmax(0.5*logits, dim=1).numpy()[0])
print('Probabilities after scaling with 0.1:', nn.functional.softmax(0.1*logits, dim=1).numpy()[0])


# scale_factor > 1: more predictable (less random) text
torch.manual_seed(1)
print(sample(model, starting_str='The island',
             scale_factor=2.0))

# scale_factor < 1: more random text
torch.manual_seed(1)
print(sample(model, starting_str='The island',
             scale_factor=0.5))


#
# ...
#
#
# # Summary
#
# ...
#

#
#
# Readers may ignore the next cell.
#