import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
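# Use a non-interactive backend so the figure can be saved without a display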
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time
from word2vec import *
from sgd import *
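# Check the Python version (the script assumes Python 3.5 or newer)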
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5
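# Reset the random seed to make sure that everyone gets the same results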
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)
# Dimensionality of the word vectors to train
dimVectors = 10
# Context window size
C = 5
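# Reset the random seeds again before initialization for reproducibility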
random.seed(31415)
np.random.seed(9265)
startTime = time.time()
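# Initialize word vectors: the first nWords rows (input/center vectors) get small
# random values, the remaining nWords rows (output/outside vectors) start at zero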
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) /
     dimVectors, np.zeros((nWords, dimVectors))),
    axis=0)
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
                                     negSamplingLossAndGradient),
    wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
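# Note: the vectors are intentionally not normalized during training;
# normalizing here would lose the notion of vector length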
print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))
wordVectors = np.concatenate(
    (wordVectors[:nWords,:], wordVectors[nWords:,:]),
    axis=0)
visualizeWords = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "dumb",
    "annoying", "female", "male", "queen", "king", "man", "woman",
    "rain", "snow", "hail", "coffee", "tea"]
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
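# Project the selected vectors onto their top two principal components
# (PCA via SVD of the covariance matrix) for a 2-D visualization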
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])
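# Plot each selected word at its 2-D PCA coordinates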
for i in range(len(visualizeWords)):
    plt.text(coord[i,0], coord[i,1], visualizeWords[i],
             bbox=dict(facecolor='green', alpha=0.1))
plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
plt.savefig('word_vectors.png')