# word2vec.py — train a gensim Word2Vec model on gzipped review text.
# Source: GitHub repository kavgan/nlp-in-practice,
# path: word2vec/scripts/word2vec.py
1
# Standard-library imports first, third-party second.
import gzip
import logging
import os

import gensim

# Timestamped INFO-level log output for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s')
def show_file_contents(input_file):
    """Print the first line of a gzip-compressed file as a sanity check.

    The file is opened in binary mode, so the printed line is a raw
    ``bytes`` object including its trailing newline. An empty file
    prints nothing.

    :param input_file: path to a gzip-compressed text file
    """
    # The original looped with enumerate() and broke immediately,
    # leaving the index `i` unused; only the first line is wanted.
    with gzip.open(input_file, 'rb') as f:
        for line in f:
            print(line)
            break
def read_input(input_file):
    """Lazily yield one token list per review from a gzipped text file.

    Each line of the binary-mode gzip stream is treated as a single
    review and passed through ``gensim.utils.simple_preprocess``, so the
    generator produces one list of word tokens per review.
    """
    logging.info("reading file {0}...this may take a while".format(input_file))

    with gzip.open(input_file, 'rb') as f:
        for review_number, raw_review in enumerate(f):
            # Periodic progress report for large corpora.
            if review_number % 10000 == 0:
                logging.info("read {0} reviews".format(review_number))
            # Tokenize the raw review text into a list of words.
            yield gensim.utils.simple_preprocess(raw_review)
if __name__ == '__main__':

    # Resolve the data file relative to this script's directory.
    abspath = os.path.dirname(os.path.abspath(__file__))
    data_file = os.path.join(abspath, "../reviews_data.txt.gz")

    # read the tokenized reviews into a list
    # each review item becomes a series of words
    # so this becomes a list of lists
    documents = list(read_input(data_file))
    logging.info("Done reading data file")

    # Build the vocabulary and train the model for 10 epochs exactly once.
    # BUG FIX: the original passed `documents` to the Word2Vec constructor
    # (which already trains for the default number of epochs) and then
    # called train() for 10 more, so the corpus was trained on twice.
    # NOTE(review): gensim 4.x renamed `size` to `vector_size`; the
    # keyword below targets the gensim 3.x API this script was written
    # against — confirm the installed gensim version.
    model = gensim.models.Word2Vec(
        size=150,      # dimensionality of the word vectors
        window=10,     # max distance between target and context word
        min_count=2,   # drop words seen fewer than 2 times
        workers=10)    # parallel worker threads
    model.build_vocab(documents)
    model.train(documents, total_examples=len(documents), epochs=10)

    # save only the word vectors
    model.wv.save(os.path.join(abspath, "../vectors/default"))

    w1 = "dirty"
    print("Most similar to {0}".format(w1), model.wv.most_similar(positive=w1))

    # look up top 6 words similar to each probe word
    # (the original repeated this block verbatim four times, and the
    # comment above 'beautiful' mistakenly said 'shocked')
    for w1 in (["polite"], ["france"], ["shocked"], ["beautiful"]):
        print(
            "Most similar to {0}".format(w1),
            model.wv.most_similar(
                positive=w1,
                topn=6))

    # get everything related to stuff on the bed
    w1 = ["bed", 'sheet', 'pillow']
    w2 = ['couch']
    print(
        "Most similar to {0}".format(w1),
        model.wv.most_similar(
            positive=w1,
            negative=w2,
            topn=10))

    # similarity between two different words
    print("Similarity between 'dirty' and 'smelly'",
          model.wv.similarity(w1="dirty", w2="smelly"))

    # similarity between two identical words
    print("Similarity between 'dirty' and 'dirty'",
          model.wv.similarity(w1="dirty", w2="dirty"))

    # similarity between two unrelated words
    print("Similarity between 'dirty' and 'clean'",
          model.wv.similarity(w1="dirty", w2="clean"))