GitHub Repository: rasbt/machine-learning-book
Path: blob/main/ch15/ch15_part3.py
# coding: utf-8


import sys
from python_environment_check import check_packages
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.distributions.categorical import Categorical

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples

# ## Package version checks

# Add folder to path in order to load from the check_packages.py script:

sys.path.insert(0, '..')

# Check recommended package versions:

d = {
    'torch': '1.8.0',
}
check_packages(d)


# Chapter 15: Modeling Sequential Data Using Recurrent Neural Networks (part 3/3)
# ========
#
#
# **Outline**
#
# - Implementing RNNs for sequence modeling in PyTorch
#   - [Project two -- character-level language modeling in PyTorch](#Project-two----character-level-language-modeling-in-PyTorch)
#     - [Preprocessing the dataset](#Preprocessing-the-dataset)
#     - [Evaluation phase -- generating new text passages](#Evaluation-phase----generating-new-text-passages)
# - [Summary](#Summary)

# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).
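
# (Illustrative addition, not part of this script.) In the notebook version, the
# "following line(s)" referred to above load the watermark extension; a typical
# invocation looks roughly like the lines below. IPython magics do not run in a
# plain .py script, so they are shown here only as comments:
#
#     %load_ext watermark
#     %watermark -v -p torch,numpy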


# ## Project two: character-level language modeling in PyTorch
#

# ### Preprocessing the dataset

## Reading and processing text
# ('1268-0.txt' is the plain-text Project Gutenberg edition of Jules Verne's
# "The Mysterious Island"; the slicing below keeps only the body of the novel.)
with open('1268-0.txt', 'r', encoding="utf8") as fp:
    text = fp.read()

start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')

text = text[start_indx:end_indx]
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))


chars_sorted = sorted(char_set)
char2int = {ch: i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)

text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], ' == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse ==> ', ''.join(char_array[text_encoded[15:21]]))
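
# (Hypothetical helpers, added for illustration; they are not part of the original
# code.) The char2int dictionary and char_array defined above can be wrapped into
# small encode/decode functions for round-tripping between text and integer indices:
def encode(s):
    return np.array([char2int[ch] for ch in s], dtype=np.int32)

def decode(ids):
    return ''.join(char_array[ids])

print(decode(encode('ISLAND')))  # round trip: prints 'ISLAND' again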


for ex in text_encoded[:5]:
    print('{} -> {}'.format(ex, char_array[ex]))


seq_length = 40
chunk_size = seq_length + 1

# Each chunk holds seq_length + 1 characters, so that input and target
# sequences can later be created by shifting the chunk by one position.
text_chunks = [text_encoded[i:i+chunk_size]
               for i in range(len(text_encoded)-chunk_size+1)]

## inspection:
for seq in text_chunks[:1]:
    input_seq = seq[:seq_length]
    target = seq[seq_length]
    print(input_seq, ' -> ', target)
    print(repr(''.join(char_array[input_seq])),
          ' -> ', repr(''.join(char_array[target])))


class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, idx):
        # input: all but the last character; target: all but the first (shifted by one)
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()

# Stacking the chunks into a single NumPy array first avoids the slow
# (and, in recent PyTorch versions, warned-about) conversion of a list of arrays.
seq_dataset = TextDataset(torch.tensor(np.array(text_chunks)))


for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x):', repr(''.join(char_array[seq])))
    print('Target (y):', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break


# Use the GPU if one is available, otherwise fall back to the CPU:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


batch_size = 64

torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
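
# (Optional sanity check, not in the original notebook.) Each batch drawn from
# seq_dl should contain batch_size input sequences and batch_size target
# sequences, each of length seq_length:
seq_batch, target_batch = next(iter(seq_dl))
print('Input batch shape: ', seq_batch.shape)     # expected: torch.Size([64, 40])
print('Target batch shape:', target_batch.shape)  # expected: torch.Size([64, 40])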


# ### Building a character-level RNN model


class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        # x holds one character index per example, i.e. shape (batch_size,)
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)

vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model = model.to(device)
print(model)
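
# (Added check, not in the original notebook.) Counting the trainable parameters
# gives a rough sense of the model size:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of trainable parameters: {num_params:,}')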


loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 10000

torch.manual_seed(1)

for epoch in range(num_epochs):
    # Each "epoch" here processes one randomly drawn batch, not a full pass over the dataset.
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')
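
# (Optional addition, not part of the original notebook.) The trained weights can
# be saved and restored later; the file name below is only an example:
torch.save(model.state_dict(), 'ch15_char_rnn.pt')
# model.load_state_dict(torch.load('ch15_char_rnn.pt'))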


# ### Evaluation phase: generating new text passages


# With three equal logits, the three categories are equally likely:
torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 1.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))

print(samples.numpy())


# Raising one logit (here, the third) makes that category dominate the samples:
torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))

print(samples.numpy())
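
# (Added illustration, not in the original notebook.) Drawing many samples from the
# same distribution shows how strongly category 2 (logit 3.0) dominates:
many_samples = m.sample((1000,)).view(-1)
print('Sample counts per category:', torch.bincount(many_samples, minlength=3).numpy())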


def sample(model, starting_str,
           len_generated_text=500,
           scale_factor=1.0):

    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to('cpu')
    cell = cell.to('cpu')
    # Warm up the hidden state on the starting string (all but its last character):
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

    # Generate new characters one at a time, feeding each sampled character back in:
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell)
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])

    return generated_str

torch.manual_seed(1)
model.to('cpu')
print(sample(model, starting_str='The island'))

# * **Predictability vs. randomness**
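#
# (Added note, not part of the original notebook text.) Multiplying the logits by a
# factor alpha before the softmax acts like an inverse temperature:
#
#     p_i = exp(alpha * z_i) / sum_j exp(alpha * z_j)
#
# With alpha > 1 the distribution becomes more peaked (more predictable text);
# with alpha < 1 it becomes closer to uniform (more random text). This is exactly
# what the scale_factor argument of sample() controls, as the probabilities
# printed below illustrate.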

logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities before scaling: ', nn.functional.softmax(logits, dim=1).numpy()[0])
print('Probabilities after scaling with 0.5:', nn.functional.softmax(0.5*logits, dim=1).numpy()[0])
print('Probabilities after scaling with 0.1:', nn.functional.softmax(0.1*logits, dim=1).numpy()[0])


# scale_factor > 1: more predictable (less random) text
torch.manual_seed(1)
print(sample(model, starting_str='The island',
             scale_factor=2.0))

# scale_factor < 1: more random text
torch.manual_seed(1)
print(sample(model, starting_str='The island',
             scale_factor=0.5))


#
# ...
#
#
# # Summary
#
# ...
#

#
#
# Readers may ignore the next cell.
#