GitHub Repository: yiming-wange/cs224n-2023-solution
Path: blob/main/a2/word2vec.py
#!/usr/bin/env python

import argparse
import numpy as np
import random

from utils.gradcheck import gradcheck_naive, grad_tests_softmax, grad_tests_negsamp
from utils.utils import normalizeRows, softmax


def sigmoid(x):
    """
    Compute the sigmoid function for the input here.
    Arguments:
    x -- A scalar or numpy array.
    Return:
    s -- sigmoid(x)
    """

    # YOUR CODE HERE (~1 Line)
    s = 1 / (1 + np.exp(-x))
    # END YOUR CODE

    return s


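# Side note (illustrative, not part of the original scaffold): the gradient code
# below leans on the identities sigmoid(-x) = 1 - sigmoid(x) and
# sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)). A quick numerical spot check,
# assuming sigmoid is implemented as above:
#
#     x = np.linspace(-5.0, 5.0, 11)
#     assert np.allclose(sigmoid(-x), 1 - sigmoid(x))
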
def naiveSoftmaxLossAndGradient(
    centerWordVec,
    outsideWordIdx,
    outsideVectors,
    dataset
):
    """ Naive Softmax loss & gradient function for word2vec models

    Implement the naive softmax loss and gradients between a center word's
    embedding and an outside word's embedding. This will be the building block
    for our word2vec models. For those unfamiliar with numpy notation, note
    that a numpy ndarray with a shape of (x, ) is a one-dimensional array, which
    you can effectively treat as a vector with length x.

    Arguments:
    centerWordVec -- numpy ndarray, center word's embedding
                     in shape (word vector length, )
                     (v_c in the pdf handout)
    outsideWordIdx -- integer, the index of the outside word
                      (o of u_o in the pdf handout)
    outsideVectors -- outside vectors in shape
                      (num words in vocab, word vector length)
                      for all words in vocab (transpose of U in the pdf handout)
    dataset -- needed for negative sampling, unused here.

    Return:
    loss -- naive softmax loss
    gradCenterVec -- the gradient with respect to the center word vector
                     in shape (word vector length, )
                     (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors
                       in shape (num words in vocab, word vector length)
                       (dJ / dU)
    """

    # YOUR CODE HERE (~6-8 Lines)

    # Please use the provided softmax function (imported earlier in this file).
    # This numerically stable implementation helps you avoid issues pertaining
    # to numerical overflow.
    y_hat = softmax(outsideVectors @ centerWordVec)  # shape (num words in vocab, )
    y = np.zeros_like(y_hat)                         # one-hot label for the true outside word
    y[outsideWordIdx] = 1

    loss = -np.log(y_hat[outsideWordIdx])
    gradCenterVec = (y_hat - y) @ outsideVectors          # dJ/dv_c = U^T (y_hat - y), shape (word vector length, )
    gradOutsideVecs = np.outer(y_hat - y, centerWordVec)  # dJ/dU = (y_hat - y) v_c^T
    # END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs


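# Illustrative sketch (not part of the original scaffold; nothing calls this
# function): a tiny shape check for naiveSoftmaxLossAndGradient on a random
# 5-word vocabulary with 3-dimensional vectors. The function name and sizes
# here are arbitrary choices for the demo.
def _demo_naive_softmax_shapes():
    U = np.random.randn(5, 3)   # outside vectors, (num words in vocab, word vector length)
    v_c = np.random.randn(3)    # center word vector, (word vector length, )
    loss, dv_c, dU = naiveSoftmaxLossAndGradient(v_c, 2, U, dataset=None)
    assert loss >= 0
    assert dv_c.shape == v_c.shape and dU.shape == U.shape
    return loss

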
def getNegativeSamples(outsideWordIdx, dataset, K):
    """ Samples K indexes which are not the outsideWordIdx """

    negSampleWordIndices = [None] * K
    for k in range(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == outsideWordIdx:
            newidx = dataset.sampleTokenIdx()
        negSampleWordIndices[k] = newidx
    return negSampleWordIndices


def negSamplingLossAndGradient(
    centerWordVec,
    outsideWordIdx,
    outsideVectors,
    dataset,
    K=10
):
    """ Negative sampling loss function for word2vec models

    Implement the negative sampling loss and gradients for a centerWordVec
    and an outsideWordIdx word vector as a building block for word2vec
    models. K is the number of negative samples to take.

    Note: The same word may be negatively sampled multiple times. For
    example, if an outside word is sampled twice, you shall have to
    double count the gradient with respect to this word. Thrice if
    it was sampled three times, and so forth.

    Arguments/Return Specifications: same as naiveSoftmaxLossAndGradient
    """

    # Negative sampling of words is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    negSampleWordIndices = getNegativeSamples(outsideWordIdx, dataset, K)
    indices = [outsideWordIdx] + negSampleWordIndices

    # YOUR CODE HERE (~10 Lines)

    # Please use your implementation of sigmoid in here.
    # Stack u_o and the K sampled u_k's; negating the negative-sample rows lets
    # one sigmoid and one sum compute the whole loss. Fancy indexing returns a
    # copy, so outsideVectors itself is not modified.
    negative = outsideVectors[indices, :]  # (K + 1, word vector length)
    negative[1:] *= -1
    score = sigmoid(negative @ centerWordVec)  # (K + 1, )
    loss = -np.sum(np.log(score))
    dout = score - 1  # (K + 1, )
    gradCenterVec = dout @ negative
    grad_sample = np.outer(dout, centerWordVec)
    grad_sample[1:] *= -1
    gradOutsideVecs = np.zeros_like(outsideVectors)
    # np.add.at accumulates contributions from repeated negative samples.
    np.add.at(gradOutsideVecs, indices, grad_sample)
    # END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs


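# Illustrative note (not part of the original scaffold): the np.add.at call
# above is what accumulates gradients when the same word is negatively sampled
# more than once. An explicit-loop version of the same update would look
# roughly like
#
#     for j, idx in enumerate(indices):
#         gradOutsideVecs[idx] += grad_sample[j]
#
# whereas a plain fancy-indexed `gradOutsideVecs[indices] += grad_sample`
# would silently keep only one contribution per repeated index.
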
def skipgram(currentCenterWord, windowSize, outsideWords, word2Ind,
             centerWordVectors, outsideVectors, dataset,
             word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    """ Skip-gram model in word2vec

    Implement the skip-gram model in this function.

    Arguments:
    currentCenterWord -- a string of the current center word
    windowSize -- integer, context window size
    outsideWords -- list of no more than 2*windowSize strings, the outside words
    word2Ind -- a dictionary that maps words to their indices in
                the word vector list
    centerWordVectors -- center word vectors (as rows) in shape
                         (num words in vocab, word vector length)
                         for all words in vocab (V in pdf handout)
    outsideVectors -- outside vectors in shape
                      (num words in vocab, word vector length)
                      for all words in vocab (transpose of U in the pdf handout)
    word2vecLossAndGradient -- the loss and gradient function for
                               a prediction vector given the outsideWordIdx
                               word vectors, could be one of the two
                               loss functions you implemented above.

    Return:
    loss -- the loss function value for the skip-gram model
            (J in the pdf handout)
    gradCenterVecs -- the gradient with respect to the center word vector
                      in shape (num words in vocab, word vector length)
                      (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors
                       in shape (num words in vocab, word vector length)
                       (dJ / dU)
    """

    loss = 0.0
    gradCenterVecs = np.zeros(centerWordVectors.shape)
    gradOutsideVectors = np.zeros(outsideVectors.shape)

    # YOUR CODE HERE (~8 Lines)
    center_idx = word2Ind[currentCenterWord]
    centerWordVec = centerWordVectors[center_idx, :]
    outsideWordIndices = [word2Ind[word] for word in outsideWords]
    # Sum the loss and gradients over every outside word in the context window.
    # Only the center word's row of gradCenterVecs receives a nonzero gradient.
    for idx in outsideWordIndices:
        loss_, gradCenter, gradOutside = word2vecLossAndGradient(
            centerWordVec, idx, outsideVectors, dataset)
        loss += loss_
        gradCenterVecs[center_idx] += gradCenter
        gradOutsideVectors += gradOutside
    # END YOUR CODE

    return loss, gradCenterVecs, gradOutsideVectors


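# Illustrative note (not part of the original scaffold): the skip-gram loss for
# one window is the sum of per-outside-word losses,
#
#     J_skipgram(v_c, window) = sum over outside words o of J(v_c, o, U),
#
# which is why the loop above simply accumulates the loss and gradients
# returned by word2vecLossAndGradient for each outside word index.
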
#############################################
# Testing functions below. DO NOT MODIFY!   #
#############################################

def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset,
                         windowSize,
                         word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    batchsize = 50
    loss = 0.0
    grad = np.zeros(wordVectors.shape)
    N = wordVectors.shape[0]
    centerWordVectors = wordVectors[:int(N/2), :]
    outsideVectors = wordVectors[int(N/2):, :]
    for i in range(batchsize):
        windowSize1 = random.randint(1, windowSize)
        centerWord, context = dataset.getRandomContext(windowSize1)

        c, gin, gout = word2vecModel(
            centerWord, windowSize1, context, word2Ind, centerWordVectors,
            outsideVectors, dataset, word2vecLossAndGradient
        )
        loss += c / batchsize
        grad[:int(N/2), :] += gin / batchsize
        grad[int(N/2):, :] += gout / batchsize

    return loss, grad


def test_sigmoid():
    """ Test sigmoid function """
    print("=== Sanity check for sigmoid ===")
    assert sigmoid(0) == 0.5
    assert np.allclose(sigmoid(np.array([0])), np.array([0.5]))
    assert np.allclose(sigmoid(np.array([1, 2, 3])), np.array(
        [0.73105858, 0.88079708, 0.95257413]))
    print("Tests for sigmoid passed!")


def getDummyObjects():
    """ Helper method for naiveSoftmaxLossAndGradient and negSamplingLossAndGradient tests """

    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0, 4)], \
            [tokens[random.randint(0, 4)] for i in range(2*C)]

    dataset = type('dummy', (), {})()
    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    dummy_tokens = dict([("a", 0), ("b", 1), ("c", 2), ("d", 3), ("e", 4)])

    return dataset, dummy_vectors, dummy_tokens


def test_naiveSoftmaxLossAndGradient():
    """ Test naiveSoftmaxLossAndGradient """
    dataset, dummy_vectors, dummy_tokens = getDummyObjects()

    print("==== Gradient check for naiveSoftmaxLossAndGradient ====")

    def temp(vec):
        loss, gradCenterVec, gradOutsideVecs = naiveSoftmaxLossAndGradient(
            vec, 1, dummy_vectors, dataset)
        return loss, gradCenterVec
    gradcheck_naive(temp, np.random.randn(
        3), "naiveSoftmaxLossAndGradient gradCenterVec")

    centerVec = np.random.randn(3)

    def temp(vec):
        loss, gradCenterVec, gradOutsideVecs = naiveSoftmaxLossAndGradient(
            centerVec, 1, vec, dataset)
        return loss, gradOutsideVecs
    gradcheck_naive(temp, dummy_vectors,
                    "naiveSoftmaxLossAndGradient gradOutsideVecs")


def test_negSamplingLossAndGradient():
    """ Test negSamplingLossAndGradient """
    dataset, dummy_vectors, dummy_tokens = getDummyObjects()

    print("==== Gradient check for negSamplingLossAndGradient ====")

    def temp(vec):
        loss, gradCenterVec, gradOutsideVecs = negSamplingLossAndGradient(
            vec, 1, dummy_vectors, dataset)
        return loss, gradCenterVec
    gradcheck_naive(temp, np.random.randn(
        3), "negSamplingLossAndGradient gradCenterVec")

    centerVec = np.random.randn(3)

    def temp(vec):
        loss, gradCenterVec, gradOutsideVecs = negSamplingLossAndGradient(
            centerVec, 1, vec, dataset)
        return loss, gradOutsideVecs
    gradcheck_naive(temp, dummy_vectors,
                    "negSamplingLossAndGradient gradOutsideVecs")


def test_skipgram():
    """ Test skip-gram with naiveSoftmaxLossAndGradient """
    dataset, dummy_vectors, dummy_tokens = getDummyObjects()

    print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")
    grad_tests_softmax(skipgram, dummy_tokens, dummy_vectors, dataset)

    print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, negSamplingLossAndGradient),
        dummy_vectors, "negSamplingLossAndGradient Gradient")
    grad_tests_negsamp(skipgram, dummy_tokens, dummy_vectors,
                       dataset, negSamplingLossAndGradient)


def test_word2vec():
    """ Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
    test_sigmoid()
    test_naiveSoftmaxLossAndGradient()
    test_negSamplingLossAndGradient()
    test_skipgram()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Test your implementations.')
    parser.add_argument('function', nargs='?', type=str, default='all',
                        help='Name of the function you would like to test.')

    args = parser.parse_args()
    if args.function == 'sigmoid':
        test_sigmoid()
    elif args.function == 'naiveSoftmaxLossAndGradient':
        test_naiveSoftmaxLossAndGradient()
    elif args.function == 'negSamplingLossAndGradient':
        test_negSamplingLossAndGradient()
    elif args.function == 'skipgram':
        test_skipgram()
    elif args.function == 'all':
        test_word2vec()
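
# Usage note (illustrative; assumes the file is run from the assignment's a2
# directory so that the `utils` package is importable):
#
#     python word2vec.py                              # run all sanity checks
#     python word2vec.py sigmoid                      # only the sigmoid check
#     python word2vec.py naiveSoftmaxLossAndGradient  # only that gradient check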