CoCalc -- word2vec.py

GitHub Repository: yiming-wange/cs224n-2023-solution
Path: blob/main/a2/word2vec.py
⁹⁹⁵ views
1
#!/usr/bin/env python
2

3
import argparse
4
import numpy as np
5
import random
6

7
from utils.gradcheck import gradcheck_naive, grad_tests_softmax, grad_tests_negsamp
8
from utils.utils import normalizeRows, softmax
9

10

11
def sigmoid(x):
    """
    Compute the sigmoid function for the input here.

    The exponential is only ever taken of a non-positive number, so inputs
    of large magnitude do not overflow.

    Arguments:
    x -- A scalar or numpy array.
    Return:
    s -- sigmoid(x); a numpy scalar for scalar input, otherwise an array
         with the same shape as x.
    """

    # YOUR CODE HERE (~1 Line)
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so neither form below can overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
    s = np.where(x >= 0, 1.0, decay) / (1.0 + decay)
    # END YOUR CODE

    # Unwrap 0-d results so that scalar input still yields a scalar.
    return s[()] if np.ndim(s) == 0 else s
25

26

27
def naiveSoftmaxLossAndGradient(
    centerWordVec,
    outsideWordIdx,
    outsideVectors,
    dataset
):
    """ Naive Softmax loss & gradient function for word2vec models

    Computes the naive softmax loss between a center word's embedding and
    an outside word's embedding, together with the gradients of that loss.
    A numpy ndarray of shape (x, ) is a one-dimensional array and can be
    treated as a vector of length x.

    Arguments:
    centerWordVec -- numpy ndarray, center word's embedding
                    in shape (word vector length, )
                    (v_c in the pdf handout)
    outsideWordIdx -- integer, the index of the outside word
                    (o of u_o in the pdf handout)
    outsideVectors -- outside vectors, in shape
                    (num words in vocab, word vector length),
                    for all words in vocab (transpose of U in the pdf handout)
    dataset -- needed for negative sampling, unused here.

    Return:
    loss -- naive softmax loss
    gradCenterVec -- the gradient with respect to the center word vector
                     in shape (word vector length, )
                     (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors
                    in shape (num words in vocab, word vector length)
                    (dJ / dU)
    """

    # YOUR CODE HERE (~6-8 Lines)

    # The provided softmax (imported earlier in this file) is used for the
    # predicted distribution over the vocabulary.
    probs = softmax(outsideVectors @ centerWordVec)  # (num words in vocab, )
    oneHot = np.zeros_like(probs)
    oneHot[outsideWordIdx] = 1

    # Prediction error (y_hat - y), shared by both gradients.
    delta = probs - oneHot

    loss = -np.log(probs[outsideWordIdx])
    gradCenterVec = delta @ outsideVectors  # (N,) @ (N, W) -> (W,)
    gradOutsideVecs = np.outer(delta, centerWordVec)  # (N, W)
    # END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs
77

78

79
def getNegativeSamples(outsideWordIdx, dataset, K):
    """ Samples K indexes which are not the outsideWordIdx

    Arguments:
    outsideWordIdx -- index that must never appear among the samples
    dataset -- object whose sampleTokenIdx() method returns a token index
    K -- number of negative samples to draw

    Return:
    negSampleWordIndices -- list of K indices in draw order; nothing is
                            de-duplicated, so an index can appear more
                            than once.
    """

    negSampleWordIndices = []
    for _ in range(K):
        newidx = dataset.sampleTokenIdx()
        # Redraw until the sample differs from the true outside word.
        while newidx == outsideWordIdx:
            newidx = dataset.sampleTokenIdx()
        negSampleWordIndices.append(newidx)
    return negSampleWordIndices
89

90

91
def negSamplingLossAndGradient(
    centerWordVec,
    outsideWordIdx,
    outsideVectors,
    dataset,
    K=10
):
    """ Negative sampling loss function for word2vec models

    Computes the negative sampling loss and its gradients for a
    centerWordVec and the word vector at outsideWordIdx. K is the number
    of negative samples to take.

    Note: The same word may be negatively sampled multiple times. Its
    gradient contribution is accumulated once per occurrence (twice if
    sampled twice, thrice if sampled three times, and so forth).

    Arguments/Return Specifications: same as naiveSoftmaxLossAndGradient
    """

    # Negative sampling of words is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    negSampleWordIndices = getNegativeSamples(outsideWordIdx, dataset, K)
    indices = [outsideWordIdx] + negSampleWordIndices

    # YOUR CODE HERE (~10 Lines)

    # Please use your implementation of sigmoid in here.
    # Fancy indexing copies the rows, so outsideVectors itself is untouched.
    picked = outsideVectors[indices, :]  # (K+1, D)
    # Row 0 is the true outside word; the K sampled rows are negated so a
    # single sigmoid call covers both kinds of loss term.
    signedVecs = np.concatenate((picked[:1], -picked[1:]))
    probs = sigmoid(signedVecs @ centerWordVec)  # (K+1, )
    loss = -np.sum(np.log(probs))

    residual = probs - 1  # (K+1, )
    gradCenterVec = residual @ signedVecs

    # Undo the sign flip for the sampled rows before scattering them back.
    rowCoeff = np.concatenate((residual[:1], -residual[1:]))
    perSampleGrad = np.outer(rowCoeff, centerWordVec)
    gradOutsideVecs = np.zeros_like(outsideVectors)
    # Unbuffered add: an index that occurs several times is accumulated
    # once per occurrence.
    np.add.at(gradOutsideVecs, indices, perSampleGrad)
    # END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs
134

135

136
def skipgram(currentCenterWord, windowSize, outsideWords, word2Ind,
             centerWordVectors, outsideVectors, dataset,
             word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    """ Skip-gram model in word2vec

    Sums the loss and gradients of word2vecLossAndGradient over every
    outside word of one center word.

    Arguments:
    currentCenterWord -- a string of the current center word
    windowSize -- integer, context window size (not read by this function)
    outsideWords -- list of no more than 2*windowSize strings, the outside words
    word2Ind -- a dictionary that maps words to their indices in
              the word vector list
    centerWordVectors -- center word vectors (as rows), in shape
                        (num words in vocab, word vector length),
                        for all words in vocab (V in pdf handout)
    outsideVectors -- outside vectors, in shape
                        (num words in vocab, word vector length),
                        for all words in vocab (transpose of U in the pdf handout)
    dataset -- passed through unchanged to word2vecLossAndGradient
    word2vecLossAndGradient -- the loss and gradient function for
                               a prediction vector given the outsideWordIdx
                               word vectors, could be one of the two
                               loss functions implemented above.

    Return:
    loss -- the loss function value for the skip-gram model
            (J in the pdf handout)
    gradCenterVecs -- the gradient with respect to the center word vectors
                     in shape (num words in vocab, word vector length);
                     only the row of the current center word is written
                     (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors
                    in shape (num words in vocab, word vector length)
                    (dJ / dU)
    """

    loss = 0.0
    gradCenterVecs = np.zeros(centerWordVectors.shape)
    gradOutsideVectors = np.zeros(outsideVectors.shape)

    # YOUR CODE HERE (~8 Lines)
    centerRow = word2Ind[currentCenterWord]
    v_c = centerWordVectors[centerRow, :]
    # Resolve every outside word up front, before any loss is evaluated.
    targetRows = tuple(word2Ind[token] for token in outsideWords)
    for targetRow in targetRows:
        pairLoss, pairGradCenter, pairGradOutside = word2vecLossAndGradient(
            v_c, targetRow, outsideVectors, dataset)
        loss += pairLoss
        gradCenterVecs[centerRow] += pairGradCenter
        gradOutsideVectors += pairGradOutside
    # END YOUR CODE

    return loss, gradCenterVecs, gradOutsideVectors
187

188

189
#############################################
190
# Testing functions below. DO NOT MODIFY!   #
191
#############################################
192

193
def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset,
                         windowSize,
                         word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    """ Average the loss and gradient of word2vecModel over 50 random contexts

    wordVectors is split in half by rows: the first half is handed to the
    model as center word vectors, the second half as outside vectors.
    Returns the averaged loss and a gradient with the shape of wordVectors.
    """
    batchsize = 50
    totalLoss = 0.0
    totalGrad = np.zeros(wordVectors.shape)
    half = int(wordVectors.shape[0] / 2)
    centerWordVectors = wordVectors[:half, :]
    outsideVectors = wordVectors[half:, :]
    for _ in range(batchsize):
        # Each sample gets its own window size in [1, windowSize].
        sampledWindow = random.randint(1, windowSize)
        centerWord, context = dataset.getRandomContext(sampledWindow)

        sampleLoss, centerGrad, outsideGrad = word2vecModel(
            centerWord, sampledWindow, context, word2Ind, centerWordVectors,
            outsideVectors, dataset, word2vecLossAndGradient
        )
        totalLoss += sampleLoss / batchsize
        totalGrad[:half, :] += centerGrad / batchsize
        totalGrad[half:, :] += outsideGrad / batchsize

    return totalLoss, totalGrad
215

216

217
def test_sigmoid():
    """ Test sigmoid function """
    print("=== Sanity check for sigmoid ===")
    # A scalar zero must map to exactly one half.
    assert sigmoid(0) == 0.5
    atZero = sigmoid(np.array([0]))
    assert np.allclose(atZero, np.array([0.5]))
    expected = np.array([0.73105858, 0.88079708, 0.95257413])
    observed = sigmoid(np.array([1, 2, 3]))
    assert np.allclose(observed, expected)
    print("Tests for sigmoid passed!")
225

226

227
def getDummyObjects():
    """ Helper method for naiveSoftmaxLossAndGradient and negSamplingLossAndGradient tests

    Returns a stub dataset, a (10, 3) array of row-normalized random
    vectors, and a word -> index dictionary for the five tokens a..e.
    Both random number generators are reseeded on every call.
    """
    vocabulary = ["a", "b", "c", "d", "e"]

    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        # The center word is drawn first, then the 2*C context words.
        center = vocabulary[random.randint(0, 4)]
        context = [vocabulary[random.randint(0, 4)] for _ in range(2*C)]
        return center, context

    # Bare object that only carries the two sampling callables.
    dataset = type('dummy', (), {})()
    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    dummy_tokens = {word: position for position, word in enumerate(vocabulary)}

    return dataset, dummy_vectors, dummy_tokens
248

249

250
def test_naiveSoftmaxLossAndGradient():
    """ Test naiveSoftmaxLossAndGradient

    Runs gradcheck_naive on the center-vector gradient and then on the
    outside-vector gradient, both with outside word index 1.
    """
    dataset, dummy_vectors, dummy_tokens = getDummyObjects()

    print("==== Gradient check for naiveSoftmaxLossAndGradient ====")

    def lossAndCenterGrad(vec):
        result = naiveSoftmaxLossAndGradient(vec, 1, dummy_vectors, dataset)
        return result[0], result[1]

    startPoint = np.random.randn(3)
    gradcheck_naive(lossAndCenterGrad, startPoint,
                    "naiveSoftmaxLossAndGradient gradCenterVec")

    # Drawn after the first check so the random sequence is unchanged.
    centerVec = np.random.randn(3)

    def lossAndOutsideGrad(vec):
        result = naiveSoftmaxLossAndGradient(centerVec, 1, vec, dataset)
        return result[0], result[2]

    gradcheck_naive(lossAndOutsideGrad, dummy_vectors,
                    "naiveSoftmaxLossAndGradient gradOutsideVecs")
271

272

273
def test_negSamplingLossAndGradient():
    """ Test negSamplingLossAndGradient

    Runs gradcheck_naive on the center-vector gradient and then on the
    outside-vector gradient, both with outside word index 1.
    """
    dataset, dummy_vectors, dummy_tokens = getDummyObjects()

    print("==== Gradient check for negSamplingLossAndGradient ====")

    def lossAndCenterGrad(vec):
        result = negSamplingLossAndGradient(vec, 1, dummy_vectors, dataset)
        return result[0], result[1]

    startPoint = np.random.randn(3)
    gradcheck_naive(lossAndCenterGrad, startPoint,
                    "negSamplingLossAndGradient gradCenterVec")

    # Drawn after the first check so the random sequence is unchanged.
    centerVec = np.random.randn(3)

    def lossAndOutsideGrad(vec):
        result = negSamplingLossAndGradient(centerVec, 1, vec, dataset)
        return result[0], result[2]

    gradcheck_naive(lossAndOutsideGrad, dummy_vectors,
                    "negSamplingLossAndGradient gradOutsideVecs")
294

295

296
def test_skipgram():
    """ Test skip-gram with naiveSoftmaxLossAndGradient and negSamplingLossAndGradient """
    dataset, dummy_vectors, dummy_tokens = getDummyObjects()

    def softmaxObjective(vec):
        return word2vec_sgd_wrapper(
            skipgram, dummy_tokens, vec, dataset, 5,
            naiveSoftmaxLossAndGradient)

    def negSamplingObjective(vec):
        return word2vec_sgd_wrapper(
            skipgram, dummy_tokens, vec, dataset, 5,
            negSamplingLossAndGradient)

    print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
    gradcheck_naive(softmaxObjective, dummy_vectors,
                    "naiveSoftmaxLossAndGradient Gradient")
    grad_tests_softmax(skipgram, dummy_tokens, dummy_vectors, dataset)

    print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
    gradcheck_naive(negSamplingObjective, dummy_vectors,
                    "negSamplingLossAndGradient Gradient")
    grad_tests_negsamp(skipgram, dummy_tokens, dummy_vectors,
                       dataset, negSamplingLossAndGradient)
312

313

314
def test_word2vec():
    """ Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
    # Run in this order: sigmoid, both loss functions, then skip-gram.
    checks = (
        test_sigmoid,
        test_naiveSoftmaxLossAndGradient,
        test_negSamplingLossAndGradient,
        test_skipgram,
    )
    for runCheck in checks:
        runCheck()
320

321

322
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Test your implementations.')
    parser.add_argument('function', nargs='?', type=str, default='all',
                        help='Name of the function you would like to test.')
    args = parser.parse_args()

    # Accepted command-line values and the test each one triggers.
    testsByName = {
        'sigmoid': test_sigmoid,
        'naiveSoftmaxLossAndGradient': test_naiveSoftmaxLossAndGradient,
        'negSamplingLossAndGradient': test_negSamplingLossAndGradient,
        'skipgram': test_skipgram,
        'all': test_word2vec,
    }
    chosenTest = testsByName.get(args.function)
    # Any other value runs nothing.
    if chosenTest is not None:
        chosenTest()
338

339
Product

Resources

Company