Path: blob/master/Sequence Models/Week 2/Word Vector Representation/w2v_utils.py
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import urllib.request
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

window_size = 3
vector_dim = 300
epochs = 1000

valid_size = 16      # Random set of words to evaluate similarity on.
valid_window = 100   # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)


def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def collect_data(vocabulary_size=10000):
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', url, 31344016)
    vocabulary = read_data(filename)
    print(vocabulary[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary


class SimilarityCallback:
    # Note: `reverse_dictionary`, `vocab_size` and `validation_model` are not
    # defined in this file; they are expected to be set at module level by the
    # training script before this callback is used.
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim


def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}

        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

    return words, word_to_vec_map


def relu(x):
    """
    Compute the relu of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)

    return s


def initialize_parameters(vocab_size, n_h):
    """
    Arguments:
    vocab_size -- size of the vocabulary (input and output dimension)
    n_h -- number of units in the hidden layer

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2":
                    W1 -- weight matrix of shape (n_h, vocab_size)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (vocab_size, n_h)
                    b2 -- bias vector of shape (vocab_size, 1)
    """

    np.random.seed(3)
    parameters = {}

    parameters['W1'] = np.random.randn(n_h, vocab_size) / np.sqrt(vocab_size)
    parameters['b1'] = np.zeros((n_h, 1))
    parameters['W2'] = np.random.randn(vocab_size, n_h) / np.sqrt(n_h)
    parameters['b2'] = np.zeros((vocab_size, 1))

    return parameters


def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
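

# --- Illustrative usage sketch (not part of the original utilities) ---
# A minimal, self-contained example of how relu, softmax and
# initialize_parameters could fit together in a tiny forward pass over a
# one-hot word input. The toy dimensions and the word index below are
# hypothetical placeholders chosen only for illustration.
if __name__ == '__main__':
    vocab_size_demo, n_h_demo = 10, 4                  # assumed toy sizes
    params = initialize_parameters(vocab_size_demo, n_h_demo)

    x = np.zeros((vocab_size_demo, 1))                 # one-hot input vector
    x[3, 0] = 1.0                                      # arbitrary word index 3

    # Hidden layer with relu, then a softmax over the whole vocabulary.
    a1 = relu(np.dot(params['W1'], x) + params['b1'])
    y_hat = softmax(np.dot(params['W2'], a1) + params['b2'])
    print('predicted distribution sums to', float(y_hat.sum()))  # ~1.0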