% GitHub Repository: Ok-landscape/computational-pipeline
% Path: blob/main/latex-templates/templates/nlp/word_embeddings.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage[makestderr]{pythontex}

\title{Word Embeddings: Skip-gram Model and Vector Semantics}
\author{Natural Language Processing Templates}
\date{\today}

\begin{document}
\maketitle

\section{Introduction}
Word embeddings map words to dense vector representations in which semantic relationships are captured by geometric properties. This template implements a simplified Word2Vec skip-gram model with negative sampling, demonstrates cosine similarity for word relationships, and visualizes the learned embeddings with a simplified t-SNE dimensionality reduction. All code runs through PythonTeX: compile with \texttt{pdflatex}, run \texttt{pythontex} on the document, then compile with \texttt{pdflatex} again so the generated figures and tables are included.

\section{Mathematical Framework}

\subsection{Skip-gram Objective}
The skip-gram model maximizes the average log-probability of context words given a target word:
\begin{equation}
J(\theta) = \frac{1}{T} \sum_{t=1}^{T} \sum_{-c \leq j \leq c, j \neq 0} \log P(w_{t+j} \mid w_t)
\end{equation}
where $T$ is the number of tokens in the training corpus and $c$ is the context window size. For example, with $c = 2$ the target word \emph{rules} in the sentence ``the queen rules beside the king'' contributes the context words \emph{the}, \emph{queen}, \emph{beside}, and \emph{the} to the inner sum.

\subsection{Softmax Probability}
The conditional probability is computed as a softmax over dot products of word vectors:
\begin{equation}
P(w_O \mid w_I) = \frac{\exp(\mathbf{v}'_{w_O} \cdot \mathbf{v}_{w_I})}{\sum_{w=1}^{V} \exp(\mathbf{v}'_w \cdot \mathbf{v}_{w_I})}
\end{equation}
where $\mathbf{v}_w$ and $\mathbf{v}'_w$ are the input (target) and output (context) embeddings of word $w$, and $V$ is the vocabulary size. Because the denominator sums over the entire vocabulary, each exact gradient update costs $O(V)$, which motivates the approximation below.

\subsection{Negative Sampling}
For efficiency, negative sampling approximates the full softmax with a binary classification objective for each training pair:
\begin{equation}
\log \sigma(\mathbf{v}'_{w_O} \cdot \mathbf{v}_{w_I}) + \sum_{i=1}^{k} \mathbb{E}_{w_i \sim P_n(w)} \left[ \log \sigma(-\mathbf{v}'_{w_i} \cdot \mathbf{v}_{w_I}) \right]
\end{equation}
where $\sigma(x) = 1/(1 + e^{-x})$ is the logistic function, $k$ is the number of negative samples, and $P_n(w)$ is a noise distribution over the vocabulary.

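The implementation below draws negative samples from the unigram distribution raised to the $3/4$ power (the \texttt{counts ** 0.75} line in \texttt{build\_vocab}):
\begin{equation}
P_n(w) \propto f(w)^{3/4}
\end{equation}
where $f(w)$ is the corpus frequency of word $w$; the exponent damps the dominance of very frequent words.
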
\subsection{Cosine Similarity}
Word similarity is measured by the cosine of the angle between vectors:
\begin{equation}
\text{sim}(\mathbf{u}, \mathbf{v}) = \frac{\mathbf{u} \cdot \mathbf{v}}{\|\mathbf{u}\| \, \|\mathbf{v}\|}
\end{equation}

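As a quick numerical check, for $\mathbf{u} = (1, 0)$ and $\mathbf{v} = (1, 1)$:
\begin{equation}
\text{sim}(\mathbf{u}, \mathbf{v}) = \frac{1 \cdot 1 + 0 \cdot 1}{1 \cdot \sqrt{2}} = \frac{1}{\sqrt{2}} \approx 0.707,
\end{equation}
the cosine of the $45^\circ$ angle between the two vectors.
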
\subsection{Word Analogy}
Analogies are solved by vector arithmetic:
\begin{equation}
\mathbf{v}_{\text{king}} - \mathbf{v}_{\text{man}} + \mathbf{v}_{\text{woman}} \approx \mathbf{v}_{\text{queen}}
\end{equation}
In practice, the answer is taken to be the vocabulary word (excluding the three query words) whose embedding has the highest cosine similarity to the combined vector, which is how the \texttt{analogy} method below proceeds.

\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import re

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption=""):
    # Save the current figure and emit the LaTeX needed to include it
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.9\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

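The figures and tables in the remaining sections are produced by printing LaTeX from inside \texttt{pycode} blocks; \texttt{save\_plot} wraps that pattern for figures. A minimal usage sketch (shown verbatim, not executed here):
\begin{verbatim}
\begin{pycode}
plt.plot([0, 1], [0, 1])
save_plot('example.pdf', 'A placeholder figure')
\end{pycode}
\end{verbatim}
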
\section{Skip-gram Implementation}

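The per-pair updates in \texttt{train\_pair} below follow from differentiating the negative-sampling loss for a single target/context pair $(w_I, w_O)$ with positive score $s = \mathbf{v}'_{w_O} \cdot \mathbf{v}_{w_I}$:
\begin{equation}
L = -\log \sigma(s) - \sum_{i=1}^{k} \log \sigma(-\mathbf{v}'_{w_i} \cdot \mathbf{v}_{w_I}),
\end{equation}
\begin{equation}
\frac{\partial L}{\partial \mathbf{v}'_{w_O}} = \bigl(\sigma(s) - 1\bigr)\,\mathbf{v}_{w_I},
\qquad
\frac{\partial L}{\partial \mathbf{v}'_{w_i}} = \sigma(\mathbf{v}'_{w_i} \cdot \mathbf{v}_{w_I})\,\mathbf{v}_{w_I}.
\end{equation}
The gradient with respect to the input vector $\mathbf{v}_{w_I}$ accumulates $\bigl(\sigma(s) - 1\bigr)\mathbf{v}'_{w_O}$ plus $\sigma(\mathbf{v}'_{w_i} \cdot \mathbf{v}_{w_I})\,\mathbf{v}'_{w_i}$ for each negative sample; these are the quantities the code stores in \texttt{grad\_out}, \texttt{grad\_out\_neg}, and \texttt{grad\_in}.
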
\begin{pycode}
class Word2VecSkipGram:
    def __init__(self, embedding_dim=50, window_size=2, learning_rate=0.025,
                 negative_samples=5, min_count=1):
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.lr = learning_rate
        self.neg_samples = negative_samples
        self.min_count = min_count

    def tokenize(self, text):
        return re.findall(r'\b\w+\b', text.lower())

    def build_vocab(self, corpus):
        word_counts = Counter()
        for sentence in corpus:
            word_counts.update(self.tokenize(sentence))

        # Assign contiguous indices to words meeting the frequency threshold
        kept = [w for w, c in word_counts.items() if c >= self.min_count]
        self.vocab = {w: i for i, w in enumerate(kept)}
        self.inv_vocab = {i: w for w, i in self.vocab.items()}
        self.vocab_size = len(self.vocab)

        # Compute sampling distribution for negative sampling
        counts = np.array([word_counts[self.inv_vocab[i]] for i in range(self.vocab_size)])
        self.sample_probs = (counts ** 0.75) / np.sum(counts ** 0.75)

    def init_embeddings(self):
        self.W_in = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01
        self.W_out = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def get_context_pairs(self, corpus):
        pairs = []
        for sentence in corpus:
            words = self.tokenize(sentence)
            indices = [self.vocab[w] for w in words if w in self.vocab]

            for i, center in enumerate(indices):
                start = max(0, i - self.window_size)
                end = min(len(indices), i + self.window_size + 1)

                for j in range(start, end):
                    if i != j:
                        pairs.append((center, indices[j]))
        return pairs

    def train_pair(self, center_idx, context_idx):
        # Positive sample
        center_vec = self.W_in[center_idx]
        context_vec = self.W_out[context_idx]

        score = np.dot(center_vec, context_vec)
        pred = self.sigmoid(score)
        error = pred - 1

        grad_out = error * center_vec
        grad_in = error * context_vec

        # Negative samples
        neg_indices = np.random.choice(self.vocab_size, size=self.neg_samples,
                                       p=self.sample_probs)

        for neg_idx in neg_indices:
            if neg_idx == context_idx:
                continue
            neg_vec = self.W_out[neg_idx]
            score = np.dot(center_vec, neg_vec)
            pred = self.sigmoid(score)

            grad_out_neg = pred * center_vec
            grad_in += pred * neg_vec

            self.W_out[neg_idx] -= self.lr * grad_out_neg

        self.W_out[context_idx] -= self.lr * grad_out
        self.W_in[center_idx] -= self.lr * grad_in

    def train(self, corpus, epochs=5):
        self.build_vocab(corpus)
        self.init_embeddings()

        pairs = self.get_context_pairs(corpus)
        n_pairs = len(pairs)

        self.losses = []
        for epoch in range(epochs):
            np.random.shuffle(pairs)
            epoch_loss = 0

            for center_idx, context_idx in pairs:
                self.train_pair(center_idx, context_idx)

                # Track the positive-pair loss for monitoring
                score = np.dot(self.W_in[center_idx], self.W_out[context_idx])
                epoch_loss -= np.log(self.sigmoid(score) + 1e-10)

            self.losses.append(epoch_loss / n_pairs)

        # Final embeddings: average of input and output matrices
        self.embeddings = (self.W_in + self.W_out) / 2

    def get_embedding(self, word):
        if word in self.vocab:
            return self.embeddings[self.vocab[word]]
        return None

    def most_similar(self, word, n=5):
        if word not in self.vocab:
            return []

        vec = self.get_embedding(word)
        vec = vec / np.linalg.norm(vec)

        similarities = []
        for w, idx in self.vocab.items():
            if w == word:
                continue
            other_vec = self.embeddings[idx]
            other_vec = other_vec / np.linalg.norm(other_vec)
            sim = np.dot(vec, other_vec)
            similarities.append((w, sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:n]

    def analogy(self, a, b, c, n=5):
        if a not in self.vocab or b not in self.vocab or c not in self.vocab:
            return []

        # a : b :: c : ?  ->  combined vector b - a + c
        vec = self.get_embedding(b) - self.get_embedding(a) + self.get_embedding(c)
        vec = vec / np.linalg.norm(vec)

        similarities = []
        exclude = {a, b, c}
        for w, idx in self.vocab.items():
            if w in exclude:
                continue
            other_vec = self.embeddings[idx]
            other_vec = other_vec / np.linalg.norm(other_vec)
            sim = np.dot(vec, other_vec)
            similarities.append((w, sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:n]

# Training corpus with semantic relationships
corpus = [
    "the king rules the kingdom with power",
    "the queen rules beside the king",
    "the prince is son of the king and queen",
    "the princess is daughter of the king",
    "man and woman are different",
    "boy grows into man",
    "girl grows into woman",
    "python is a programming language",
    "java is a programming language",
    "code written in python",
    "code written in java",
    "machine learning uses data",
    "deep learning is machine learning",
    "neural networks learn patterns",
    "data science analyzes data",
    "the cat sits on the mat",
    "the dog runs in the park",
    "cats and dogs are pets",
    "paris is capital of france",
    "london is capital of england",
    "berlin is capital of germany",
    "france is in europe",
    "england is in europe",
    "germany is in europe"
]

# Train model
model = Word2VecSkipGram(embedding_dim=30, window_size=2, learning_rate=0.05,
                         negative_samples=5)
model.train(corpus, epochs=100)
\end{pycode}

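Once trained, the model can be queried directly. A minimal sketch of the API defined above (shown verbatim, not executed; the actual neighbors depend on the random seed and the tiny corpus):
\begin{verbatim}
model.most_similar('king', n=5)              # -> [(word, cosine similarity), ...]
model.analogy('king', 'queen', 'man', n=3)   # king : queen :: man : ?
model.get_embedding('data')                  # -> 30-dimensional numpy vector
\end{verbatim}
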
\section{Training Visualization}

\begin{pycode}
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Training loss
axes[0, 0].plot(model.losses, 'b-', linewidth=2)
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].set_title('Training Loss')
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Embedding norms
norms = np.linalg.norm(model.embeddings, axis=1)
axes[0, 1].hist(norms, bins=20, color='green', alpha=0.7, edgecolor='black')
axes[0, 1].set_xlabel('Vector Norm')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Embedding Norm Distribution')
axes[0, 1].axvline(x=np.mean(norms), color='red', linestyle='--',
                   label=f'Mean: {np.mean(norms):.2f}')
axes[0, 1].legend()

# Plot 3: Similarity matrix for selected words
selected_words = ['king', 'queen', 'man', 'woman', 'python', 'java', 'cat', 'dog']
selected_words = [w for w in selected_words if w in model.vocab]
n_selected = len(selected_words)

sim_matrix = np.zeros((n_selected, n_selected))
for i, w1 in enumerate(selected_words):
    v1 = model.get_embedding(w1)
    v1 = v1 / np.linalg.norm(v1)
    for j, w2 in enumerate(selected_words):
        v2 = model.get_embedding(w2)
        v2 = v2 / np.linalg.norm(v2)
        sim_matrix[i, j] = np.dot(v1, v2)

im = axes[1, 0].imshow(sim_matrix, cmap='RdYlBu', vmin=-1, vmax=1)
axes[1, 0].set_xticks(range(n_selected))
axes[1, 0].set_yticks(range(n_selected))
axes[1, 0].set_xticklabels(selected_words, rotation=45, ha='right', fontsize=8)
axes[1, 0].set_yticklabels(selected_words, fontsize=8)
axes[1, 0].set_title('Word Similarity Matrix')
plt.colorbar(im, ax=axes[1, 0], shrink=0.8)

# Plot 4: Vocabulary frequency
word_counts = Counter()
for sentence in corpus:
    word_counts.update(model.tokenize(sentence))
common = word_counts.most_common(15)
words, counts = zip(*common)
y_pos = np.arange(len(words))
axes[1, 1].barh(y_pos, counts, color='purple', alpha=0.7)
axes[1, 1].set_yticks(y_pos)
axes[1, 1].set_yticklabels(words, fontsize=8)
axes[1, 1].set_xlabel('Frequency')
axes[1, 1].set_title('Top Words by Frequency')
axes[1, 1].invert_yaxis()

plt.tight_layout()
save_plot('embeddings_training.pdf', 'Word embedding training analysis')
\end{pycode}

\section{Cosine Similarity Analysis}

\begin{pycode}
# Compute most similar words for key terms
test_words = ['king', 'python', 'data', 'cat', 'france']
similarity_results = {}

for word in test_words:
    if word in model.vocab:
        similar = model.most_similar(word, n=5)
        similarity_results[word] = similar

# Visualize similarities
fig, axes = plt.subplots(2, 3, figsize=(14, 8))

for idx, word in enumerate(test_words):
    if word not in similarity_results:
        continue
    ax = axes[idx // 3, idx % 3]
    similar = similarity_results[word]
    if similar:
        words, sims = zip(*similar)
        y_pos = np.arange(len(words))
        colors = plt.cm.viridis(np.array(sims))
        ax.barh(y_pos, sims, color=colors, alpha=0.8)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(words, fontsize=9)
        ax.set_xlabel('Cosine Similarity')
        ax.set_title(f'Similar to "{word}"')
        ax.set_xlim(0, 1)
        ax.invert_yaxis()

# Remove empty subplot
axes[1, 2].axis('off')

plt.tight_layout()
save_plot('similarity_analysis.pdf', 'Cosine similarity analysis for key words')
\end{pycode}

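Any individual score in the figure can be recomputed by hand from the stored embeddings; a quick sanity-check sketch (shown verbatim, not executed):
\begin{verbatim}
u = model.get_embedding('king')
v = model.get_embedding('queen')
cos_uv = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
\end{verbatim}
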
\section{Word Analogy Tasks}

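Two of the panels in the figure below reduce the embeddings to two dimensions with a small PCA written from scratch: the vectors are stacked as rows of $\mathbf{X}$, mean-centered, and projected onto the top two eigenvectors of the sample covariance matrix,
\begin{equation}
\tilde{\mathbf{X}} = (\mathbf{X} - \bar{\mathbf{x}})\,\mathbf{W}_2,
\end{equation}
where $\bar{\mathbf{x}}$ is the mean row (subtracted from every row) and the columns of $\mathbf{W}_2$ are the eigenvectors with the largest eigenvalues, obtained from \texttt{np.linalg.eigh}.
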
\begin{pycode}
# Test analogies
analogies = [
    ('king', 'queen', 'man', 'woman'),        # king:queen :: man:?
    ('france', 'paris', 'england', 'london'), # france:paris :: england:?
    ('python', 'code', 'java', 'code'),       # python:code :: java:?
]

analogy_results = []
for a, b, c, expected in analogies:
    if all(w in model.vocab for w in [a, b, c]):
        results = model.analogy(a, b, c, n=3)
        analogy_results.append((a, b, c, expected, results))

# Visualize analogy computation
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Vector arithmetic visualization
if len(analogy_results) > 0:
    a, b, c, expected, results = analogy_results[0]
    words = [a, b, c] + [r[0] for r in results[:2]]
    vecs = [model.get_embedding(w) for w in words]

    # Simple 2D projection using the first two principal components
    if len(vecs) > 0:
        vecs_matrix = np.array(vecs)
        mean_vec = np.mean(vecs_matrix, axis=0)
        centered = vecs_matrix - mean_vec
        cov = np.cov(centered.T)
        eigenvalues, eigenvectors = np.linalg.eigh(cov)
        idx = np.argsort(eigenvalues)[::-1]
        proj_matrix = eigenvectors[:, idx[:2]]
        projected = centered @ proj_matrix

        axes[0, 0].scatter(projected[:, 0], projected[:, 1], s=100, alpha=0.7)
        for i, word in enumerate(words):
            axes[0, 0].annotate(word, (projected[i, 0], projected[i, 1]),
                                xytext=(5, 5), textcoords='offset points', fontsize=10)

        # Draw analogy vectors (needs a, b, c and at least one result)
        if len(projected) >= 4:
            # a -> b vector
            axes[0, 0].arrow(projected[0, 0], projected[0, 1],
                             projected[1, 0] - projected[0, 0],
                             projected[1, 1] - projected[0, 1],
                             head_width=0.05, head_length=0.02, fc='blue', ec='blue', alpha=0.5)
            # c -> top result vector
            axes[0, 0].arrow(projected[2, 0], projected[2, 1],
                             projected[3, 0] - projected[2, 0],
                             projected[3, 1] - projected[2, 1],
                             head_width=0.05, head_length=0.02, fc='red', ec='red', alpha=0.5)

        axes[0, 0].set_xlabel('PC1')
        axes[0, 0].set_ylabel('PC2')
        axes[0, 0].set_title(f'Analogy: {a}:{b} :: {c}:?')
        axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Analogy results bar chart
if analogy_results:
    a, b, c, expected, results = analogy_results[0]
    words = [r[0] for r in results]
    sims = [r[1] for r in results]
    y_pos = np.arange(len(words))
    axes[0, 1].barh(y_pos, sims, color='orange', alpha=0.7)
    axes[0, 1].set_yticks(y_pos)
    axes[0, 1].set_yticklabels(words)
    axes[0, 1].set_xlabel('Similarity Score')
    axes[0, 1].set_title(f'{b} - {a} + {c} = ?')
    axes[0, 1].invert_yaxis()

# Plot 3: Embedding space visualization (PCA projection)
# Select subset of words for visualization
viz_words = list(model.vocab.keys())[:30]
viz_vecs = np.array([model.get_embedding(w) for w in viz_words])

# PCA projection
mean_vec = np.mean(viz_vecs, axis=0)
centered = viz_vecs - mean_vec
cov = np.cov(centered.T)
eigenvalues, eigenvectors = np.linalg.eigh(cov)
idx = np.argsort(eigenvalues)[::-1]
proj = centered @ eigenvectors[:, idx[:2]]

scatter = axes[1, 0].scatter(proj[:, 0], proj[:, 1], c=range(len(viz_words)),
                             cmap='tab20', s=60, alpha=0.7)
for i, word in enumerate(viz_words):
    axes[1, 0].annotate(word, (proj[i, 0], proj[i, 1]),
                        xytext=(3, 3), textcoords='offset points', fontsize=7)
axes[1, 0].set_xlabel('Component 1')
axes[1, 0].set_ylabel('Component 2')
axes[1, 0].set_title('Word Embedding Space (PCA Projection)')
axes[1, 0].grid(True, alpha=0.3)

# Plot 4: Pairwise similarity distribution
all_sims = []
for i in range(model.vocab_size):
    v1 = model.embeddings[i] / np.linalg.norm(model.embeddings[i])
    for j in range(i + 1, model.vocab_size):
        v2 = model.embeddings[j] / np.linalg.norm(model.embeddings[j])
        all_sims.append(np.dot(v1, v2))

axes[1, 1].hist(all_sims, bins=30, color='steelblue', alpha=0.7, edgecolor='black')
axes[1, 1].axvline(x=np.mean(all_sims), color='red', linestyle='--',
                   label=f'Mean: {np.mean(all_sims):.3f}')
axes[1, 1].set_xlabel('Cosine Similarity')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Pairwise Similarity Distribution')
axes[1, 1].legend()

plt.tight_layout()
save_plot('analogy_visualization.pdf', 'Word analogy and embedding space visualization')
\end{pycode}

\section{t-SNE Visualization}

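The \texttt{tsne} function below is a from-scratch sketch of the standard t-SNE recipe, with one simplification: it uses a fixed Gaussian bandwidth derived from the \texttt{perplexity} argument rather than the usual per-point binary search, and plain gradient descent without momentum or early exaggeration. It matches pairwise affinities $p_{ij}$ in the embedding space to Student-$t$ affinities $q_{ij}$ in the two-dimensional map by minimizing the KL divergence
\begin{equation}
C = \mathrm{KL}(P \,\|\, Q) = \sum_{i \neq j} p_{ij} \log \frac{p_{ij}}{q_{ij}},
\qquad
q_{ij} = \frac{(1 + \|\mathbf{y}_i - \mathbf{y}_j\|^2)^{-1}}{\sum_{k \neq l} (1 + \|\mathbf{y}_k - \mathbf{y}_l\|^2)^{-1}},
\end{equation}
with gradient
\begin{equation}
\frac{\partial C}{\partial \mathbf{y}_i} = 4 \sum_{j} (p_{ij} - q_{ij})\,(\mathbf{y}_i - \mathbf{y}_j)\,(1 + \|\mathbf{y}_i - \mathbf{y}_j\|^2)^{-1},
\end{equation}
which is exactly the update applied inside the gradient-descent loop.
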
\begin{pycode}
def tsne(X, n_components=2, perplexity=5.0, n_iter=500, learning_rate=100.0):
    """Simplified t-SNE: fixed Gaussian bandwidth (no per-point perplexity
    search) and plain gradient descent without momentum or early exaggeration."""
    n_samples = X.shape[0]

    # Compute pairwise squared Euclidean distances
    sum_X = np.sum(X ** 2, axis=1)
    D = sum_X[:, np.newaxis] + sum_X[np.newaxis, :] - 2 * X @ X.T
    D = np.maximum(D, 0)

    # Compute conditional probabilities
    P = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        others = np.concatenate([np.arange(i), np.arange(i + 1, n_samples)])
        Pi = np.exp(-D[i, others] / (2 * perplexity))
        Pi = Pi / np.sum(Pi)
        P[i, others] = Pi

    # Symmetrize
    P = (P + P.T) / (2 * n_samples)
    P = np.maximum(P, 1e-12)

    # Initialize embedding
    Y = np.random.randn(n_samples, n_components) * 0.01

    # Gradient descent
    for iteration in range(n_iter):
        # Student-t affinities Q in the low-dimensional map
        sum_Y = np.sum(Y ** 2, axis=1)
        num = 1 / (1 + sum_Y[:, np.newaxis] + sum_Y[np.newaxis, :] - 2 * Y @ Y.T)
        np.fill_diagonal(num, 0)
        Q = num / np.sum(num)
        Q = np.maximum(Q, 1e-12)

        # Gradient of the KL divergence
        PQ_diff = P - Q
        grad = np.zeros_like(Y)
        for i in range(n_samples):
            diff = Y[i] - Y
            grad[i] = 4 * np.sum((PQ_diff[i] * num[i])[:, np.newaxis] * diff, axis=0)

        Y -= learning_rate * grad

    return Y

# Apply t-SNE to embeddings
tsne_result = tsne(model.embeddings, perplexity=5.0, n_iter=300)

# Visualize t-SNE result
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Color by word frequency
word_freqs = [word_counts[model.inv_vocab[i]] for i in range(model.vocab_size)]
scatter = axes[0].scatter(tsne_result[:, 0], tsne_result[:, 1],
                          c=word_freqs, cmap='viridis', s=60, alpha=0.7)
for i in range(model.vocab_size):
    axes[0].annotate(model.inv_vocab[i], (tsne_result[i, 0], tsne_result[i, 1]),
                     xytext=(3, 3), textcoords='offset points', fontsize=7)
axes[0].set_xlabel('t-SNE 1')
axes[0].set_ylabel('t-SNE 2')
axes[0].set_title('t-SNE Visualization (colored by frequency)')
plt.colorbar(scatter, ax=axes[0], label='Frequency')

# Highlight semantic clusters
cluster_words = {
    'royalty': ['king', 'queen', 'prince', 'princess'],
    'gender': ['man', 'woman', 'boy', 'girl'],
    'programming': ['python', 'java', 'code'],
    'places': ['france', 'england', 'germany', 'paris', 'london', 'berlin']
}

colors = {'royalty': 'red', 'gender': 'blue', 'programming': 'green', 'places': 'orange'}

axes[1].scatter(tsne_result[:, 0], tsne_result[:, 1], c='lightgray', s=40, alpha=0.3)

for cluster_name, cluster_words_list in cluster_words.items():
    for word in cluster_words_list:
        if word in model.vocab:
            idx = model.vocab[word]
            axes[1].scatter(tsne_result[idx, 0], tsne_result[idx, 1],
                            c=colors[cluster_name], s=100, alpha=0.8, label=cluster_name)
            axes[1].annotate(word, (tsne_result[idx, 0], tsne_result[idx, 1]),
                             xytext=(5, 5), textcoords='offset points', fontsize=9)

# Custom legend
from matplotlib.lines import Line2D
legend_elements = [Line2D([0], [0], marker='o', color='w',
                          markerfacecolor=c, markersize=10, label=n)
                   for n, c in colors.items()]
axes[1].legend(handles=legend_elements, loc='best')
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')
axes[1].set_title('t-SNE with Semantic Clusters')

plt.tight_layout()
save_plot('tsne_visualization.pdf', 't-SNE visualization of word embeddings')
\end{pycode}

\section{Results Summary}

\subsection{Model Statistics}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Word2Vec Model Statistics}')
print(r'\begin{tabular}{lr}')
print(r'\toprule')
print(r'Metric & Value \\')
print(r'\midrule')
print(f"Vocabulary size & {model.vocab_size} \\\\")
print(f"Embedding dimension & {model.embedding_dim} \\\\")
print(f"Window size & {model.window_size} \\\\")
print(f"Negative samples & {model.neg_samples} \\\\")
print(f"Final loss & {model.losses[-1]:.4f} \\\\")
print(f"Mean vector norm & {np.mean(norms):.3f} \\\\")
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Word Similarity Results}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Top similar words for selected queries}')
print(r'\begin{tabular}{lll}')
print(r'\toprule')
print(r'Query & Similar Words & Scores \\')
print(r'\midrule')

for word, results in similarity_results.items():
    if results:
        similar_str = ', '.join([f"{w}" for w, s in results[:3]])
        scores_str = ', '.join([f"{s:.2f}" for w, s in results[:3]])
        print(f"{word} & {similar_str} & {scores_str} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Analogy Results}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Word analogy task results}')
print(r'\begin{tabular}{llll}')
print(r'\toprule')
print(r'Analogy & Expected & Top Result & Score \\')
print(r'\midrule')

for a, b, c, expected, results in analogy_results:
    if results:
        top_word, top_score = results[0]
        print(f"{a}:{b}::{c}:? & {expected} & {top_word} & {top_score:.3f} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Statistical Summary}
\begin{itemize}
\item Mean pairwise similarity: \py{f"{np.mean(all_sims):.3f}"}
\item Similarity standard deviation: \py{f"{np.std(all_sims):.3f}"}
\item Training epochs: \py{f"{len(model.losses)}"}
\item Loss reduction: \py{f"{(model.losses[0] - model.losses[-1]) / model.losses[0] * 100:.1f}"}\%
\end{itemize}

\section{Conclusion}
This template demonstrates word embedding concepts through a simplified Word2Vec skip-gram implementation with negative sampling. The model learns semantic relationships from co-occurrence patterns, enabling similarity search and analogy tasks, and the t-SNE visualization shows clustering of semantically related words, suggesting that the learned representations capture meaningful structure. With a vocabulary of \py{f"{model.vocab_size}"} words and an embedding dimension of \py{f"{model.embedding_dim}"}, the model recovers sensible word relationships despite the very small training corpus.

\end{document}