% Path: blob/main/latex-templates/templates/nlp/word_embeddings.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage[makestderr]{pythontex}

\title{Word Embeddings: Skip-gram Model and Vector Semantics}
\author{Natural Language Processing Templates}
\date{\today}

\begin{document}
\maketitle

\section{Introduction}
Word embeddings map words to dense vector representations in which semantic relationships are captured through geometric properties. This template implements a simplified Word2Vec skip-gram model, demonstrates cosine similarity for word relationships, and visualizes the learned embeddings using t-SNE dimensionality reduction.

\section{Mathematical Framework}

\subsection{Skip-gram Objective}
The skip-gram model maximizes the average log-probability of context words given a target word:
\begin{equation}
J(\theta) = \frac{1}{T} \sum_{t=1}^{T} \sum_{-c \leq j \leq c,\, j \neq 0} \log P(w_{t+j} \mid w_t)
\end{equation}
where $c$ is the context window size and $T$ is the number of tokens in the corpus.

\subsection{Softmax Probability}
The conditional probability is computed with a softmax over dot products of input and output vectors:
\begin{equation}
P(w_O \mid w_I) = \frac{\exp(\mathbf{v}'_{w_O} \cdot \mathbf{v}_{w_I})}{\sum_{w=1}^{V} \exp(\mathbf{v}'_w \cdot \mathbf{v}_{w_I})}
\end{equation}
where $V$ is the vocabulary size.

\subsection{Negative Sampling}
Because the softmax normalization sums over the entire vocabulary, negative sampling replaces it with a binary objective against $k$ noise words drawn from a noise distribution $P_n(w)$:
\begin{equation}
\log \sigma(\mathbf{v}'_{w_O} \cdot \mathbf{v}_{w_I}) + \sum_{i=1}^{k} \mathbb{E}_{w_i \sim P_n(w)} \left[ \log \sigma(-\mathbf{v}'_{w_i} \cdot \mathbf{v}_{w_I}) \right]
\end{equation}
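Writing $s = \mathbf{v}'_{w_O} \cdot \mathbf{v}_{w_I}$ for the positive score and $s_i = \mathbf{v}'_{w_i} \cdot \mathbf{v}_{w_I}$ for the $i$-th negative score, the per-pair loss $L = -\log \sigma(s) - \sum_{i=1}^{k} \log \sigma(-s_i)$ has closed-form gradients
\begin{align}
\frac{\partial L}{\partial \mathbf{v}'_{w_O}} &= \bigl(\sigma(s) - 1\bigr)\,\mathbf{v}_{w_I}, \\
\frac{\partial L}{\partial \mathbf{v}'_{w_i}} &= \sigma(s_i)\,\mathbf{v}_{w_I}, \\
\frac{\partial L}{\partial \mathbf{v}_{w_I}} &= \bigl(\sigma(s) - 1\bigr)\,\mathbf{v}'_{w_O} + \sum_{i=1}^{k} \sigma(s_i)\,\mathbf{v}'_{w_i},
\end{align}
using the identity $\sigma(-x) = 1 - \sigma(x)$. These are exactly the per-pair updates applied in the implementation below.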
\subsection{Cosine Similarity}
Word similarity is measured by the cosine of the angle between vectors:
\begin{equation}
\text{sim}(\mathbf{u}, \mathbf{v}) = \frac{\mathbf{u} \cdot \mathbf{v}}{\|\mathbf{u}\| \|\mathbf{v}\|}
\end{equation}

\subsection{Word Analogy}
Analogies are solved by vector arithmetic:
\begin{equation}
\mathbf{v}_{\text{king}} - \mathbf{v}_{\text{man}} + \mathbf{v}_{\text{woman}} \approx \mathbf{v}_{\text{queen}}
\end{equation}

\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import re

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption=""):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.9\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}
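Before any training, the two geometric operations from the framework section can be illustrated directly. The block below is a minimal sketch on four hand-made toy vectors; the vectors and the \texttt{cosine} helper are invented purely for illustration and are not used by the model later on.

\begin{pycode}
# Toy illustration of cosine similarity and the vector-offset analogy.
# The four 3-dimensional vectors below are invented for illustration only.
def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

toy = {
    'king':  np.array([0.9, 0.8, 0.1]),
    'queen': np.array([0.9, 0.1, 0.8]),
    'man':   np.array([0.1, 0.9, 0.1]),
    'woman': np.array([0.1, 0.1, 0.9]),
}

# king - man + woman should land closest to queen in this toy setup.
target = toy['king'] - toy['man'] + toy['woman']
best = max((w for w in toy if w != 'king'), key=lambda w: cosine(target, toy[w]))
print(f"Toy check: king $-$ man $+$ woman is closest to \\emph{{{best}}} "
      f"(cosine similarity {cosine(target, toy[best]):.3f}).")
\end{pycode}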
\section{Skip-gram Implementation}

\begin{pycode}
class Word2VecSkipGram:
    def __init__(self, embedding_dim=50, window_size=2, learning_rate=0.025,
                 negative_samples=5, min_count=1):
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.lr = learning_rate
        self.neg_samples = negative_samples
        self.min_count = min_count

    def tokenize(self, text):
        return re.findall(r'\b\w+\b', text.lower())

    def build_vocab(self, corpus):
        word_counts = Counter()
        for sentence in corpus:
            word_counts.update(self.tokenize(sentence))

        # Keep words meeting the frequency threshold, assigning contiguous indices
        kept = [w for w, c in word_counts.items() if c >= self.min_count]
        self.vocab = {w: i for i, w in enumerate(kept)}
        self.inv_vocab = {i: w for w, i in self.vocab.items()}
        self.vocab_size = len(self.vocab)

        # Compute sampling distribution for negative sampling
        counts = np.array([word_counts[self.inv_vocab[i]] for i in range(self.vocab_size)])
        self.sample_probs = (counts ** 0.75) / np.sum(counts ** 0.75)

    def init_embeddings(self):
        self.W_in = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01
        self.W_out = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def get_context_pairs(self, corpus):
        pairs = []
        for sentence in corpus:
            words = self.tokenize(sentence)
            indices = [self.vocab[w] for w in words if w in self.vocab]

            for i, center in enumerate(indices):
                start = max(0, i - self.window_size)
                end = min(len(indices), i + self.window_size + 1)

                for j in range(start, end):
                    if i != j:
                        pairs.append((center, indices[j]))
        return pairs

    def train_pair(self, center_idx, context_idx):
        # Positive sample
        center_vec = self.W_in[center_idx]
        context_vec = self.W_out[context_idx]

        score = np.dot(center_vec, context_vec)
        pred = self.sigmoid(score)
        error = pred - 1

        grad_out = error * center_vec
        grad_in = error * context_vec

        # Negative samples
        neg_indices = np.random.choice(self.vocab_size, size=self.neg_samples,
                                       p=self.sample_probs)

        for neg_idx in neg_indices:
            if neg_idx == context_idx:
                continue
            neg_vec = self.W_out[neg_idx]
            score = np.dot(center_vec, neg_vec)
            pred = self.sigmoid(score)

            grad_out_neg = pred * center_vec
            grad_in += pred * neg_vec

            self.W_out[neg_idx] -= self.lr * grad_out_neg

        self.W_out[context_idx] -= self.lr * grad_out
        self.W_in[center_idx] -= self.lr * grad_in

    def train(self, corpus, epochs=5):
        self.build_vocab(corpus)
        self.init_embeddings()

        pairs = self.get_context_pairs(corpus)
        n_pairs = len(pairs)

        self.losses = []
        for epoch in range(epochs):
            np.random.shuffle(pairs)
            epoch_loss = 0

            for center_idx, context_idx in pairs:
                self.train_pair(center_idx, context_idx)

                # Compute loss for monitoring
                score = np.dot(self.W_in[center_idx], self.W_out[context_idx])
                epoch_loss -= np.log(self.sigmoid(score) + 1e-10)

            self.losses.append(epoch_loss / n_pairs)

        # Final embeddings: average of input and output vectors
        self.embeddings = (self.W_in + self.W_out) / 2

    def get_embedding(self, word):
        if word in self.vocab:
            return self.embeddings[self.vocab[word]]
        return None

    def most_similar(self, word, n=5):
        if word not in self.vocab:
            return []

        vec = self.get_embedding(word)
        vec = vec / np.linalg.norm(vec)

        similarities = []
        for w, idx in self.vocab.items():
            if w == word:
                continue
            other_vec = self.embeddings[idx]
            other_vec = other_vec / np.linalg.norm(other_vec)
            sim = np.dot(vec, other_vec)
            similarities.append((w, sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:n]

    def analogy(self, a, b, c, n=5):
        if a not in self.vocab or b not in self.vocab or c not in self.vocab:
            return []

        vec = self.get_embedding(b) - self.get_embedding(a) + self.get_embedding(c)
        vec = vec / np.linalg.norm(vec)

        similarities = []
        exclude = {a, b, c}
        for w, idx in self.vocab.items():
            if w in exclude:
                continue
            other_vec = self.embeddings[idx]
            other_vec = other_vec / np.linalg.norm(other_vec)
            sim = np.dot(vec, other_vec)
            similarities.append((w, sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:n]

# Training corpus with semantic relationships
corpus = [
    "the king rules the kingdom with power",
    "the queen rules beside the king",
    "the prince is son of the king and queen",
    "the princess is daughter of the king",
    "man and woman are different",
    "boy grows into man",
    "girl grows into woman",
    "python is a programming language",
    "java is a programming language",
    "code written in python",
    "code written in java",
    "machine learning uses data",
    "deep learning is machine learning",
    "neural networks learn patterns",
    "data science analyzes data",
    "the cat sits on the mat",
    "the dog runs in the park",
    "cats and dogs are pets",
    "paris is capital of france",
    "london is capital of england",
    "berlin is capital of germany",
    "france is in europe",
    "england is in europe",
    "germany is in europe"
]

# Train model
model = Word2VecSkipGram(embedding_dim=30, window_size=2, learning_rate=0.05,
                         negative_samples=5)
model.train(corpus, epochs=100)
\end{pycode}
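As a quick sanity check (the exact values depend on the random seed and the tiny corpus), the block below compares the cosine similarity of a related word pair with that of an unrelated one. The helper \texttt{\_cos} is introduced here only for this check.

\begin{pycode}
# Sanity check: compare a related and an unrelated word pair.
# The _cos helper exists only for this check and is not part of the model class.
def _cos(w1, w2):
    u, v = model.get_embedding(w1), model.get_embedding(w2)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

sim_related = _cos('king', 'queen')
sim_unrelated = _cos('king', 'python')
print(f"For example, $\\text{{sim}}(\\text{{king}}, \\text{{queen}}) = {sim_related:.3f}$ "
      f"compared with $\\text{{sim}}(\\text{{king}}, \\text{{python}}) = {sim_unrelated:.3f}$.")
\end{pycode}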
\section{Training Visualization}

\begin{pycode}
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Training loss
axes[0, 0].plot(model.losses, 'b-', linewidth=2)
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].set_title('Training Loss')
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Embedding norms
norms = np.linalg.norm(model.embeddings, axis=1)
axes[0, 1].hist(norms, bins=20, color='green', alpha=0.7, edgecolor='black')
axes[0, 1].set_xlabel('Vector Norm')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Embedding Norm Distribution')
axes[0, 1].axvline(x=np.mean(norms), color='red', linestyle='--',
                   label=f'Mean: {np.mean(norms):.2f}')
axes[0, 1].legend()

# Plot 3: Similarity matrix for selected words
selected_words = ['king', 'queen', 'man', 'woman', 'python', 'java', 'cat', 'dog']
selected_words = [w for w in selected_words if w in model.vocab]
n_selected = len(selected_words)

sim_matrix = np.zeros((n_selected, n_selected))
for i, w1 in enumerate(selected_words):
    v1 = model.get_embedding(w1)
    v1 = v1 / np.linalg.norm(v1)
    for j, w2 in enumerate(selected_words):
        v2 = model.get_embedding(w2)
        v2 = v2 / np.linalg.norm(v2)
        sim_matrix[i, j] = np.dot(v1, v2)

im = axes[1, 0].imshow(sim_matrix, cmap='RdYlBu', vmin=-1, vmax=1)
axes[1, 0].set_xticks(range(n_selected))
axes[1, 0].set_yticks(range(n_selected))
axes[1, 0].set_xticklabels(selected_words, rotation=45, ha='right', fontsize=8)
axes[1, 0].set_yticklabels(selected_words, fontsize=8)
axes[1, 0].set_title('Word Similarity Matrix')
plt.colorbar(im, ax=axes[1, 0], shrink=0.8)

# Plot 4: Vocabulary frequency
word_counts = Counter()
for sentence in corpus:
    word_counts.update(model.tokenize(sentence))
common = word_counts.most_common(15)
words, counts = zip(*common)
y_pos = np.arange(len(words))
axes[1, 1].barh(y_pos, counts, color='purple', alpha=0.7)
axes[1, 1].set_yticks(y_pos)
axes[1, 1].set_yticklabels(words, fontsize=8)
axes[1, 1].set_xlabel('Frequency')
axes[1, 1].set_title('Top Words by Frequency')
axes[1, 1].invert_yaxis()

plt.tight_layout()
save_plot('embeddings_training.pdf', 'Word embedding training analysis')
\end{pycode}

\section{Cosine Similarity Analysis}

\begin{pycode}
# Compute most similar words for key terms
test_words = ['king', 'python', 'data', 'cat', 'france']
similarity_results = {}

for word in test_words:
    if word in model.vocab:
        similar = model.most_similar(word, n=5)
        similarity_results[word] = similar

# Visualize similarities
fig, axes = plt.subplots(2, 3, figsize=(14, 8))

for idx, word in enumerate(test_words):
    if word not in similarity_results:
        continue
    ax = axes[idx // 3, idx % 3]
    similar = similarity_results[word]
    if similar:
        words, sims = zip(*similar)
        y_pos = np.arange(len(words))
        colors = plt.cm.viridis(np.array(sims))
        ax.barh(y_pos, sims, color=colors, alpha=0.8)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(words, fontsize=9)
        ax.set_xlabel('Cosine Similarity')
        ax.set_title(f'Similar to "{word}"')
        ax.set_xlim(0, 1)
        ax.invert_yaxis()

# Remove empty subplot
axes[1, 2].axis('off')

plt.tight_layout()
save_plot('similarity_analysis.pdf', 'Cosine similarity analysis for key words')
\end{pycode}
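The per-word Python loop inside \texttt{most\_similar} is perfectly adequate for a vocabulary of this size, but it scales poorly. The block below sketches a vectorized alternative that ranks every word with a single matrix--vector product; the function name is ours, it is illustrative only, and the loop-based method remains the one used throughout this template.

\begin{pycode}
# Vectorized nearest-neighbour lookup: normalize once, rank with one matrix-vector product.
# Illustrative only; e.g. most_similar_vectorized(model, 'king').
def most_similar_vectorized(model, word, n=5):
    normed = model.embeddings / np.linalg.norm(model.embeddings, axis=1, keepdims=True)
    idx = model.vocab[word]
    sims = normed @ normed[idx]
    sims[idx] = -np.inf  # exclude the query word itself
    top = np.argsort(sims)[::-1][:n]
    return [(model.inv_vocab[i], float(sims[i])) for i in top]
\end{pycode}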
\section{Word Analogy Tasks}

\begin{pycode}
# Test analogies
analogies = [
    ('king', 'queen', 'man', 'woman'),         # king:queen :: man:?
    ('france', 'paris', 'england', 'london'),  # france:paris :: england:?
    ('python', 'code', 'java', 'code'),        # python:code :: java:?
]

analogy_results = []
for a, b, c, expected in analogies:
    if all(w in model.vocab for w in [a, b, c]):
        results = model.analogy(a, b, c, n=3)
        analogy_results.append((a, b, c, expected, results))

# Visualize analogy computation
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Vector arithmetic visualization
if len(analogy_results) > 0:
    a, b, c, expected, results = analogy_results[0]
    words = [a, b, c] + [r[0] for r in results[:2]]
    vecs = [model.get_embedding(w) for w in words]

    # Simple 2D projection using first two principal components
    if len(vecs) > 0:
        vecs_matrix = np.array(vecs)
        mean_vec = np.mean(vecs_matrix, axis=0)
        centered = vecs_matrix - mean_vec
        cov = np.cov(centered.T)
        eigenvalues, eigenvectors = np.linalg.eigh(cov)
        idx = np.argsort(eigenvalues)[::-1]
        proj_matrix = eigenvectors[:, idx[:2]]
        projected = centered @ proj_matrix

        axes[0, 0].scatter(projected[:, 0], projected[:, 1], s=100, alpha=0.7)
        for i, word in enumerate(words):
            axes[0, 0].annotate(word, (projected[i, 0], projected[i, 1]),
                                xytext=(5, 5), textcoords='offset points', fontsize=10)

        # Draw analogy vectors (need the query words plus at least one result)
        if len(projected) >= 4:
            # a -> b vector
            axes[0, 0].arrow(projected[0, 0], projected[0, 1],
                             projected[1, 0] - projected[0, 0],
                             projected[1, 1] - projected[0, 1],
                             head_width=0.05, head_length=0.02, fc='blue', ec='blue', alpha=0.5)
            # c -> top-result vector
            axes[0, 0].arrow(projected[2, 0], projected[2, 1],
                             projected[3, 0] - projected[2, 0],
                             projected[3, 1] - projected[2, 1],
                             head_width=0.05, head_length=0.02, fc='red', ec='red', alpha=0.5)

    axes[0, 0].set_xlabel('PC1')
    axes[0, 0].set_ylabel('PC2')
    axes[0, 0].set_title(f'Analogy: {a}:{b} :: {c}:?')
    axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Analogy results bar chart
if analogy_results:
    a, b, c, expected, results = analogy_results[0]
    words = [r[0] for r in results]
    sims = [r[1] for r in results]
    y_pos = np.arange(len(words))
    axes[0, 1].barh(y_pos, sims, color='orange', alpha=0.7)
    axes[0, 1].set_yticks(y_pos)
    axes[0, 1].set_yticklabels(words)
    axes[0, 1].set_xlabel('Similarity Score')
    axes[0, 1].set_title(f'{a} - {b} + {c} = ?')
    axes[0, 1].invert_yaxis()

# Plot 3: Embedding space visualization (PCA projection)
# Select subset of words for visualization
viz_words = list(model.vocab.keys())[:30]
viz_vecs = np.array([model.get_embedding(w) for w in viz_words])

# PCA projection
mean_vec = np.mean(viz_vecs, axis=0)
centered = viz_vecs - mean_vec
cov = np.cov(centered.T)
eigenvalues, eigenvectors = np.linalg.eigh(cov)
idx = np.argsort(eigenvalues)[::-1]
proj = centered @ eigenvectors[:, idx[:2]]

scatter = axes[1, 0].scatter(proj[:, 0], proj[:, 1], c=range(len(viz_words)),
                             cmap='tab20', s=60, alpha=0.7)
for i, word in enumerate(viz_words):
    axes[1, 0].annotate(word, (proj[i, 0], proj[i, 1]),
                        xytext=(3, 3), textcoords='offset points', fontsize=7)
axes[1, 0].set_xlabel('Component 1')
axes[1, 0].set_ylabel('Component 2')
axes[1, 0].set_title('Word Embedding Space (PCA Projection)')
axes[1, 0].grid(True, alpha=0.3)

# Plot 4: Pairwise similarity distribution
all_sims = []
for i in range(model.vocab_size):
    v1 = model.embeddings[i] / np.linalg.norm(model.embeddings[i])
    for j in range(i + 1, model.vocab_size):
        v2 = model.embeddings[j] / np.linalg.norm(model.embeddings[j])
        all_sims.append(np.dot(v1, v2))

axes[1, 1].hist(all_sims, bins=30, color='steelblue', alpha=0.7, edgecolor='black')
axes[1, 1].axvline(x=np.mean(all_sims), color='red', linestyle='--',
                   label=f'Mean: {np.mean(all_sims):.3f}')
axes[1, 1].set_xlabel('Cosine Similarity')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Pairwise Similarity Distribution')
axes[1, 1].legend()

plt.tight_layout()
save_plot('analogy_visualization.pdf', 'Word analogy and embedding space visualization')
\end{pycode}
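Concretely, \texttt{analogy(a, b, c)} above returns the vocabulary words that maximize cosine similarity to the offset vector, with the three query words excluded from the candidate set:
\begin{equation}
w^{\ast} = \arg\max_{w \in V \setminus \{a, b, c\}} \; \cos\!\left(\mathbf{v}_w,\; \mathbf{v}_b - \mathbf{v}_a + \mathbf{v}_c\right)
\end{equation}
One consequence is that the third test analogy, \texttt{python}:\texttt{code} :: \texttt{java}:?, can never return its expected answer \texttt{code}, because \texttt{code} is one of the excluded query words.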
\section{t-SNE Visualization}

\begin{pycode}
def tsne(X, n_components=2, perplexity=5.0, n_iter=500, learning_rate=100.0):
    """Simplified t-SNE: fixed Gaussian bandwidth (no per-point perplexity
    calibration) and plain gradient descent without momentum."""
    n_samples = X.shape[0]

    # Compute pairwise squared Euclidean distances
    sum_X = np.sum(X ** 2, axis=1)
    D = sum_X[:, np.newaxis] + sum_X[np.newaxis, :] - 2 * X @ X.T
    D = np.maximum(D, 0)

    # Compute conditional probabilities
    P = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        others = np.concatenate([np.arange(i), np.arange(i + 1, n_samples)])
        Di = D[i, others]
        Pi = np.exp(-Di / (2 * perplexity))
        Pi = Pi / np.sum(Pi)
        P[i, others] = Pi

    # Symmetrize
    P = (P + P.T) / (2 * n_samples)
    P = np.maximum(P, 1e-12)

    # Initialize low-dimensional embedding
    Y = np.random.randn(n_samples, n_components) * 0.01

    # Gradient descent
    for iteration in range(n_iter):
        # Compute Q (Student-t affinities)
        sum_Y = np.sum(Y ** 2, axis=1)
        num = 1 / (1 + sum_Y[:, np.newaxis] + sum_Y[np.newaxis, :] - 2 * Y @ Y.T)
        np.fill_diagonal(num, 0)
        Q = num / np.sum(num)
        Q = np.maximum(Q, 1e-12)

        # Compute gradient
        PQ_diff = P - Q
        grad = np.zeros_like(Y)
        for i in range(n_samples):
            diff = Y[i] - Y
            grad[i] = 4 * np.sum((PQ_diff[i] * num[i])[:, np.newaxis] * diff, axis=0)

        Y -= learning_rate * grad

    return Y

# Apply t-SNE to embeddings
tsne_result = tsne(model.embeddings, perplexity=5.0, n_iter=300)

# Visualize t-SNE result
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Color by word frequency
word_freqs = [word_counts[model.inv_vocab[i]] for i in range(model.vocab_size)]
scatter = axes[0].scatter(tsne_result[:, 0], tsne_result[:, 1],
                          c=word_freqs, cmap='viridis', s=60, alpha=0.7)
for i in range(model.vocab_size):
    axes[0].annotate(model.inv_vocab[i], (tsne_result[i, 0], tsne_result[i, 1]),
                     xytext=(3, 3), textcoords='offset points', fontsize=7)
axes[0].set_xlabel('t-SNE 1')
axes[0].set_ylabel('t-SNE 2')
axes[0].set_title('t-SNE Visualization (colored by frequency)')
plt.colorbar(scatter, ax=axes[0], label='Frequency')

# Highlight semantic clusters
cluster_words = {
    'royalty': ['king', 'queen', 'prince', 'princess'],
    'gender': ['man', 'woman', 'boy', 'girl'],
    'programming': ['python', 'java', 'code'],
    'places': ['france', 'england', 'germany', 'paris', 'london', 'berlin']
}

colors = {'royalty': 'red', 'gender': 'blue', 'programming': 'green', 'places': 'orange'}

axes[1].scatter(tsne_result[:, 0], tsne_result[:, 1], c='lightgray', s=40, alpha=0.3)

for cluster_name, cluster_words_list in cluster_words.items():
    for word in cluster_words_list:
        if word in model.vocab:
            idx = model.vocab[word]
            axes[1].scatter(tsne_result[idx, 0], tsne_result[idx, 1],
                            c=colors[cluster_name], s=100, alpha=0.8, label=cluster_name)
            axes[1].annotate(word, (tsne_result[idx, 0], tsne_result[idx, 1]),
                             xytext=(5, 5), textcoords='offset points', fontsize=9)

# Custom legend (one entry per cluster)
from matplotlib.lines import Line2D
legend_elements = [Line2D([0], [0], marker='o', color='w',
                          markerfacecolor=c, markersize=10, label=n)
                   for n, c in colors.items()]
axes[1].legend(handles=legend_elements, loc='best')
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')
axes[1].set_title('t-SNE with Semantic Clusters')

plt.tight_layout()
save_plot('tsne_visualization.pdf', 't-SNE visualization of word embeddings')
\end{pycode}
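The \texttt{tsne} routine above is deliberately stripped down for the template: full t-SNE calibrates a per-point bandwidth from the target perplexity and uses momentum plus early exaggeration. If scikit-learn happens to be installed, the projection can be cross-checked with its \texttt{TSNE} class; the block below is only a sketch of that comparison and is guarded so the document still compiles without scikit-learn.

\begin{pycode}
# Optional cross-check with scikit-learn's t-SNE, if the library is installed.
try:
    from sklearn.manifold import TSNE
    tsne_sklearn = TSNE(n_components=2, perplexity=5, init='pca',
                        random_state=42).fit_transform(model.embeddings)
    print(f"A reference scikit-learn t-SNE projection of the "
          f"{tsne_sklearn.shape[0]} vocabulary words was computed for comparison.")
except ImportError:
    print("scikit-learn is not installed, so the reference t-SNE comparison is skipped.")
\end{pycode}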
\section{Results Summary}

\subsection{Model Statistics}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Word2Vec Model Statistics}')
print(r'\begin{tabular}{lr}')
print(r'\toprule')
print(r'Metric & Value \\')
print(r'\midrule')
print(f"Vocabulary size & {model.vocab_size} \\\\")
print(f"Embedding dimension & {model.embedding_dim} \\\\")
print(f"Window size & {model.window_size} \\\\")
print(f"Negative samples & {model.neg_samples} \\\\")
print(f"Final loss & {model.losses[-1]:.4f} \\\\")
print(f"Mean vector norm & {np.mean(norms):.3f} \\\\")
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Word Similarity Results}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Top similar words for selected queries}')
print(r'\begin{tabular}{lll}')
print(r'\toprule')
print(r'Query & Similar Words & Scores \\')
print(r'\midrule')

for word, results in similarity_results.items():
    if results:
        similar_str = ', '.join([f"{w}" for w, s in results[:3]])
        scores_str = ', '.join([f"{s:.2f}" for w, s in results[:3]])
        print(f"{word} & {similar_str} & {scores_str} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Analogy Results}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Word analogy task results}')
print(r'\begin{tabular}{llll}')
print(r'\toprule')
print(r'Analogy & Expected & Top Result & Score \\')
print(r'\midrule')

for a, b, c, expected, results in analogy_results:
    if results:
        top_word, top_score = results[0]
        print(f"{a}:{b}::{c}:? & {expected} & {top_word} & {top_score:.3f} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Statistical Summary}
\begin{itemize}
\item Mean pairwise similarity: \py{f"{np.mean(all_sims):.3f}"}
\item Similarity standard deviation: \py{f"{np.std(all_sims):.3f}"}
\item Training epochs: \py{f"{len(model.losses)}"}
\item Loss reduction: \py{f"{(model.losses[0] - model.losses[-1]) / model.losses[0] * 100:.1f}"}\%
\end{itemize}

\section{Conclusion}
This template demonstrates word embedding concepts through a simplified Word2Vec skip-gram implementation. The model learns semantic relationships from co-occurrence patterns, enabling similarity search and analogy tasks, and the t-SNE visualization shows clustering of semantically related words. With a vocabulary of \py{f"{model.vocab_size}"} words and an embedding dimension of \py{f"{model.embedding_dim}"}, the model captures meaningful word relationships despite the very small training corpus.

\end{document}