Path: blob/master/Sequence Models/Week 2/Word Vector Representation/w2v_utils.py
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import urllib.request
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

window_size = 3
vector_dim = 300
epochs = 1000

valid_size = 16      # Random set of words to evaluate similarity on.
valid_window = 100   # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)


def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def collect_data(vocabulary_size=10000):
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', url, 31344016)
    vocabulary = read_data(filename)
    print(vocabulary[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary


class SimilarityCallback:
    # Note: `reverse_dictionary`, `vocab_size` and `validation_model` are not
    # defined in this file; they are expected to be set at module level by the
    # training script before this callback is used.
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim


def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}

        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

    return words, word_to_vec_map


def relu(x):
    """
    Compute the relu of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)

    return s


def initialize_parameters(vocab_size, n_h):
    """
    Arguments:
    vocab_size -- size of the vocabulary (input and output dimension)
    n_h -- number of units in the hidden layer

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2":
                    W1 -- weight matrix of shape (n_h, vocab_size)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (vocab_size, n_h)
                    b2 -- bias vector of shape (vocab_size, 1)
    """

    np.random.seed(3)
    parameters = {}

    parameters['W1'] = np.random.randn(n_h, vocab_size) / np.sqrt(vocab_size)
    parameters['b1'] = np.zeros((n_h, 1))
    parameters['W2'] = np.random.randn(vocab_size, n_h) / np.sqrt(n_h)
    parameters['b2'] = np.zeros((vocab_size, 1))

    return parameters


def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
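

# --- Illustrative usage sketch (not part of the original utilities) ---
# A minimal, self-contained example of how relu, softmax and
# initialize_parameters could fit together in a tiny forward pass over a
# one-hot word input. The toy dimensions and the word index below are
# hypothetical placeholders chosen only for illustration.
if __name__ == '__main__':
    vocab_size_demo, n_h_demo = 10, 4                  # assumed toy sizes
    params = initialize_parameters(vocab_size_demo, n_h_demo)

    x = np.zeros((vocab_size_demo, 1))                 # one-hot input vector
    x[3, 0] = 1.0                                      # arbitrary word index 3

    # Hidden layer with relu, then a softmax over the whole vocabulary.
    a1 = relu(np.dot(params['W1'], x) + params['b1'])
    y_hat = softmax(np.dot(params['W2'], a1) + params['b2'])
    print('predicted distribution sums to', float(y_hat.sum()))  # ~1.0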