% Path: blob/main/latex-templates/templates/nlp/word_embeddings.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage[makestderr]{pythontex}

\title{Word Embeddings: Skip-gram Model and Vector Semantics}
\author{Natural Language Processing Templates}
\date{\today}

\begin{document}
\maketitle

\section{Introduction}
Word embeddings map words to dense vector representations in which semantic relationships are captured through geometric properties. This template implements a simplified Word2Vec skip-gram model, demonstrates cosine similarity for word relationships, and visualizes the learned embeddings using t-SNE dimensionality reduction.

\section{Mathematical Framework}

\subsection{Skip-gram Objective}
The skip-gram model maximizes the average log-probability of context words given a target word:
\begin{equation}
J(\theta) = \frac{1}{T} \sum_{t=1}^{T} \sum_{-c \leq j \leq c,\, j \neq 0} \log P(w_{t+j} \mid w_t)
\end{equation}
where $c$ is the context window size and $T$ is the number of tokens in the corpus.

\subsection{Softmax Probability}
The conditional probability is computed with a softmax over dot products of input and output vectors:
\begin{equation}
P(w_O \mid w_I) = \frac{\exp(\mathbf{v}'_{w_O} \cdot \mathbf{v}_{w_I})}{\sum_{w=1}^{V} \exp(\mathbf{v}'_w \cdot \mathbf{v}_{w_I})}
\end{equation}
where $V$ is the vocabulary size.

\subsection{Negative Sampling}
Because the softmax normalization sums over the entire vocabulary, negative sampling replaces it with a binary objective against $k$ noise words drawn from a noise distribution $P_n(w)$:
\begin{equation}
\log \sigma(\mathbf{v}'_{w_O} \cdot \mathbf{v}_{w_I}) + \sum_{i=1}^{k} \mathbb{E}_{w_i \sim P_n(w)} \left[ \log \sigma(-\mathbf{v}'_{w_i} \cdot \mathbf{v}_{w_I}) \right]
\end{equation}
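Writing $s = \mathbf{v}'_{w_O} \cdot \mathbf{v}_{w_I}$ for the positive score and $s_i = \mathbf{v}'_{w_i} \cdot \mathbf{v}_{w_I}$ for the $i$-th negative score, the per-pair loss $L = -\log \sigma(s) - \sum_{i=1}^{k} \log \sigma(-s_i)$ has closed-form gradients
\begin{align}
\frac{\partial L}{\partial \mathbf{v}'_{w_O}} &= \bigl(\sigma(s) - 1\bigr)\,\mathbf{v}_{w_I}, \\
\frac{\partial L}{\partial \mathbf{v}'_{w_i}} &= \sigma(s_i)\,\mathbf{v}_{w_I}, \\
\frac{\partial L}{\partial \mathbf{v}_{w_I}} &= \bigl(\sigma(s) - 1\bigr)\,\mathbf{v}'_{w_O} + \sum_{i=1}^{k} \sigma(s_i)\,\mathbf{v}'_{w_i},
\end{align}
using the identity $\sigma(-x) = 1 - \sigma(x)$. These are exactly the per-pair updates applied in the implementation below.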
\subsection{Cosine Similarity}
Word similarity is measured by the cosine of the angle between vectors:
\begin{equation}
\text{sim}(\mathbf{u}, \mathbf{v}) = \frac{\mathbf{u} \cdot \mathbf{v}}{\|\mathbf{u}\| \|\mathbf{v}\|}
\end{equation}

\subsection{Word Analogy}
Analogies are solved by vector arithmetic:
\begin{equation}
\mathbf{v}_{\text{king}} - \mathbf{v}_{\text{man}} + \mathbf{v}_{\text{woman}} \approx \mathbf{v}_{\text{queen}}
\end{equation}

\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import re

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption=""):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.9\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}
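Before any training, the two geometric operations from the framework section can be illustrated directly. The block below is a minimal sketch on four hand-made toy vectors; the vectors and the \texttt{cosine} helper are invented purely for illustration and are not used by the model later on.

\begin{pycode}
# Toy illustration of cosine similarity and the vector-offset analogy.
# The four 3-dimensional vectors below are invented for illustration only.
def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

toy = {
    'king':  np.array([0.9, 0.8, 0.1]),
    'queen': np.array([0.9, 0.1, 0.8]),
    'man':   np.array([0.1, 0.9, 0.1]),
    'woman': np.array([0.1, 0.1, 0.9]),
}

# king - man + woman should land closest to queen in this toy setup.
target = toy['king'] - toy['man'] + toy['woman']
best = max((w for w in toy if w != 'king'), key=lambda w: cosine(target, toy[w]))
print(f"Toy check: king $-$ man $+$ woman is closest to \\emph{{{best}}} "
      f"(cosine similarity {cosine(target, toy[best]):.3f}).")
\end{pycode}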
\section{Skip-gram Implementation}

\begin{pycode}
class Word2VecSkipGram:
    def __init__(self, embedding_dim=50, window_size=2, learning_rate=0.025,
                 negative_samples=5, min_count=1):
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.lr = learning_rate
        self.neg_samples = negative_samples
        self.min_count = min_count

    def tokenize(self, text):
        return re.findall(r'\b\w+\b', text.lower())

    def build_vocab(self, corpus):
        word_counts = Counter()
        for sentence in corpus:
            word_counts.update(self.tokenize(sentence))

        # Keep words meeting the frequency threshold, assigning contiguous indices
        kept = [w for w, c in word_counts.items() if c >= self.min_count]
        self.vocab = {w: i for i, w in enumerate(kept)}
        self.inv_vocab = {i: w for w, i in self.vocab.items()}
        self.vocab_size = len(self.vocab)

        # Compute sampling distribution for negative sampling
        counts = np.array([word_counts[self.inv_vocab[i]] for i in range(self.vocab_size)])
        self.sample_probs = (counts ** 0.75) / np.sum(counts ** 0.75)

    def init_embeddings(self):
        self.W_in = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01
        self.W_out = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def get_context_pairs(self, corpus):
        pairs = []
        for sentence in corpus:
            words = self.tokenize(sentence)
            indices = [self.vocab[w] for w in words if w in self.vocab]

            for i, center in enumerate(indices):
                start = max(0, i - self.window_size)
                end = min(len(indices), i + self.window_size + 1)

                for j in range(start, end):
                    if i != j:
                        pairs.append((center, indices[j]))
        return pairs

    def train_pair(self, center_idx, context_idx):
        # Positive sample
        center_vec = self.W_in[center_idx]
        context_vec = self.W_out[context_idx]

        score = np.dot(center_vec, context_vec)
        pred = self.sigmoid(score)
        error = pred - 1

        grad_out = error * center_vec
        grad_in = error * context_vec

        # Negative samples
        neg_indices = np.random.choice(self.vocab_size, size=self.neg_samples,
                                       p=self.sample_probs)

        for neg_idx in neg_indices:
            if neg_idx == context_idx:
                continue
            neg_vec = self.W_out[neg_idx]
            score = np.dot(center_vec, neg_vec)
            pred = self.sigmoid(score)

            grad_out_neg = pred * center_vec
            grad_in += pred * neg_vec

            self.W_out[neg_idx] -= self.lr * grad_out_neg

        self.W_out[context_idx] -= self.lr * grad_out
        self.W_in[center_idx] -= self.lr * grad_in

    def train(self, corpus, epochs=5):
        self.build_vocab(corpus)
        self.init_embeddings()

        pairs = self.get_context_pairs(corpus)
        n_pairs = len(pairs)

        self.losses = []
        for epoch in range(epochs):
            np.random.shuffle(pairs)
            epoch_loss = 0

            for center_idx, context_idx in pairs:
                self.train_pair(center_idx, context_idx)

                # Compute loss for monitoring
                score = np.dot(self.W_in[center_idx], self.W_out[context_idx])
                epoch_loss -= np.log(self.sigmoid(score) + 1e-10)

            self.losses.append(epoch_loss / n_pairs)

        # Final embeddings: average of input and output vectors
        self.embeddings = (self.W_in + self.W_out) / 2

    def get_embedding(self, word):
        if word in self.vocab:
            return self.embeddings[self.vocab[word]]
        return None

    def most_similar(self, word, n=5):
        if word not in self.vocab:
            return []

        vec = self.get_embedding(word)
        vec = vec / np.linalg.norm(vec)

        similarities = []
        for w, idx in self.vocab.items():
            if w == word:
                continue
            other_vec = self.embeddings[idx]
            other_vec = other_vec / np.linalg.norm(other_vec)
            sim = np.dot(vec, other_vec)
            similarities.append((w, sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:n]

    def analogy(self, a, b, c, n=5):
        if a not in self.vocab or b not in self.vocab or c not in self.vocab:
            return []

        vec = self.get_embedding(b) - self.get_embedding(a) + self.get_embedding(c)
        vec = vec / np.linalg.norm(vec)

        similarities = []
        exclude = {a, b, c}
        for w, idx in self.vocab.items():
            if w in exclude:
                continue
            other_vec = self.embeddings[idx]
            other_vec = other_vec / np.linalg.norm(other_vec)
            sim = np.dot(vec, other_vec)
            similarities.append((w, sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:n]

# Training corpus with semantic relationships
corpus = [
    "the king rules the kingdom with power",
    "the queen rules beside the king",
    "the prince is son of the king and queen",
    "the princess is daughter of the king",
    "man and woman are different",
    "boy grows into man",
    "girl grows into woman",
    "python is a programming language",
    "java is a programming language",
    "code written in python",
    "code written in java",
    "machine learning uses data",
    "deep learning is machine learning",
    "neural networks learn patterns",
    "data science analyzes data",
    "the cat sits on the mat",
    "the dog runs in the park",
    "cats and dogs are pets",
    "paris is capital of france",
    "london is capital of england",
    "berlin is capital of germany",
    "france is in europe",
    "england is in europe",
    "germany is in europe"
]

# Train model
model = Word2VecSkipGram(embedding_dim=30, window_size=2, learning_rate=0.05,
                         negative_samples=5)
model.train(corpus, epochs=100)
\end{pycode}
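As a quick sanity check (the exact values depend on the random seed and the tiny corpus), the block below compares the cosine similarity of a related word pair with that of an unrelated one. The helper \texttt{\_cos} is introduced here only for this check.

\begin{pycode}
# Sanity check: compare a related and an unrelated word pair.
# The _cos helper exists only for this check and is not part of the model class.
def _cos(w1, w2):
    u, v = model.get_embedding(w1), model.get_embedding(w2)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

sim_related = _cos('king', 'queen')
sim_unrelated = _cos('king', 'python')
print(f"For example, $\\text{{sim}}(\\text{{king}}, \\text{{queen}}) = {sim_related:.3f}$ "
      f"compared with $\\text{{sim}}(\\text{{king}}, \\text{{python}}) = {sim_unrelated:.3f}$.")
\end{pycode}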
\section{Training Visualization}

\begin{pycode}
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Training loss
axes[0, 0].plot(model.losses, 'b-', linewidth=2)
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].set_title('Training Loss')
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Embedding norms
norms = np.linalg.norm(model.embeddings, axis=1)
axes[0, 1].hist(norms, bins=20, color='green', alpha=0.7, edgecolor='black')
axes[0, 1].set_xlabel('Vector Norm')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Embedding Norm Distribution')
axes[0, 1].axvline(x=np.mean(norms), color='red', linestyle='--',
                   label=f'Mean: {np.mean(norms):.2f}')
axes[0, 1].legend()

# Plot 3: Similarity matrix for selected words
selected_words = ['king', 'queen', 'man', 'woman', 'python', 'java', 'cat', 'dog']
selected_words = [w for w in selected_words if w in model.vocab]
n_selected = len(selected_words)

sim_matrix = np.zeros((n_selected, n_selected))
for i, w1 in enumerate(selected_words):
    v1 = model.get_embedding(w1)
    v1 = v1 / np.linalg.norm(v1)
    for j, w2 in enumerate(selected_words):
        v2 = model.get_embedding(w2)
        v2 = v2 / np.linalg.norm(v2)
        sim_matrix[i, j] = np.dot(v1, v2)

im = axes[1, 0].imshow(sim_matrix, cmap='RdYlBu', vmin=-1, vmax=1)
axes[1, 0].set_xticks(range(n_selected))
axes[1, 0].set_yticks(range(n_selected))
axes[1, 0].set_xticklabels(selected_words, rotation=45, ha='right', fontsize=8)
axes[1, 0].set_yticklabels(selected_words, fontsize=8)
axes[1, 0].set_title('Word Similarity Matrix')
plt.colorbar(im, ax=axes[1, 0], shrink=0.8)

# Plot 4: Vocabulary frequency
word_counts = Counter()
for sentence in corpus:
    word_counts.update(model.tokenize(sentence))
common = word_counts.most_common(15)
words, counts = zip(*common)
y_pos = np.arange(len(words))
axes[1, 1].barh(y_pos, counts, color='purple', alpha=0.7)
axes[1, 1].set_yticks(y_pos)
axes[1, 1].set_yticklabels(words, fontsize=8)
axes[1, 1].set_xlabel('Frequency')
axes[1, 1].set_title('Top Words by Frequency')
axes[1, 1].invert_yaxis()

plt.tight_layout()
save_plot('embeddings_training.pdf', 'Word embedding training analysis')
\end{pycode}

\section{Cosine Similarity Analysis}

\begin{pycode}
# Compute most similar words for key terms
test_words = ['king', 'python', 'data', 'cat', 'france']
similarity_results = {}

for word in test_words:
    if word in model.vocab:
        similar = model.most_similar(word, n=5)
        similarity_results[word] = similar

# Visualize similarities
fig, axes = plt.subplots(2, 3, figsize=(14, 8))

for idx, word in enumerate(test_words):
    if word not in similarity_results:
        continue
    ax = axes[idx // 3, idx % 3]
    similar = similarity_results[word]
    if similar:
        words, sims = zip(*similar)
        y_pos = np.arange(len(words))
        colors = plt.cm.viridis(np.array(sims))
        ax.barh(y_pos, sims, color=colors, alpha=0.8)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(words, fontsize=9)
        ax.set_xlabel('Cosine Similarity')
        ax.set_title(f'Similar to "{word}"')
        ax.set_xlim(0, 1)
        ax.invert_yaxis()

# Remove empty subplot
axes[1, 2].axis('off')

plt.tight_layout()
save_plot('similarity_analysis.pdf', 'Cosine similarity analysis for key words')
\end{pycode}
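The per-word Python loop inside \texttt{most\_similar} is perfectly adequate for a vocabulary of this size, but it scales poorly. The block below sketches a vectorized alternative that ranks every word with a single matrix--vector product; the function name is ours, it is illustrative only, and the loop-based method remains the one used throughout this template.

\begin{pycode}
# Vectorized nearest-neighbour lookup: normalize once, rank with one matrix-vector product.
# Illustrative only; e.g. most_similar_vectorized(model, 'king').
def most_similar_vectorized(model, word, n=5):
    normed = model.embeddings / np.linalg.norm(model.embeddings, axis=1, keepdims=True)
    idx = model.vocab[word]
    sims = normed @ normed[idx]
    sims[idx] = -np.inf  # exclude the query word itself
    top = np.argsort(sims)[::-1][:n]
    return [(model.inv_vocab[i], float(sims[i])) for i in top]
\end{pycode}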
\section{Word Analogy Tasks}

\begin{pycode}
# Test analogies
analogies = [
    ('king', 'queen', 'man', 'woman'),         # king:queen :: man:?
    ('france', 'paris', 'england', 'london'),  # france:paris :: england:?
    ('python', 'code', 'java', 'code'),        # python:code :: java:?
]

analogy_results = []
for a, b, c, expected in analogies:
    if all(w in model.vocab for w in [a, b, c]):
        results = model.analogy(a, b, c, n=3)
        analogy_results.append((a, b, c, expected, results))

# Visualize analogy computation
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Vector arithmetic visualization
if len(analogy_results) > 0:
    a, b, c, expected, results = analogy_results[0]
    words = [a, b, c] + [r[0] for r in results[:2]]
    vecs = [model.get_embedding(w) for w in words]

    # Simple 2D projection using first two principal components
    if len(vecs) > 0:
        vecs_matrix = np.array(vecs)
        mean_vec = np.mean(vecs_matrix, axis=0)
        centered = vecs_matrix - mean_vec
        cov = np.cov(centered.T)
        eigenvalues, eigenvectors = np.linalg.eigh(cov)
        idx = np.argsort(eigenvalues)[::-1]
        proj_matrix = eigenvectors[:, idx[:2]]
        projected = centered @ proj_matrix

        axes[0, 0].scatter(projected[:, 0], projected[:, 1], s=100, alpha=0.7)
        for i, word in enumerate(words):
            axes[0, 0].annotate(word, (projected[i, 0], projected[i, 1]),
                                xytext=(5, 5), textcoords='offset points', fontsize=10)

        # Draw analogy vectors (need the query words plus at least one result)
        if len(projected) >= 4:
            # a -> b vector
            axes[0, 0].arrow(projected[0, 0], projected[0, 1],
                             projected[1, 0] - projected[0, 0],
                             projected[1, 1] - projected[0, 1],
                             head_width=0.05, head_length=0.02, fc='blue', ec='blue', alpha=0.5)
            # c -> top-result vector
            axes[0, 0].arrow(projected[2, 0], projected[2, 1],
                             projected[3, 0] - projected[2, 0],
                             projected[3, 1] - projected[2, 1],
                             head_width=0.05, head_length=0.02, fc='red', ec='red', alpha=0.5)

    axes[0, 0].set_xlabel('PC1')
    axes[0, 0].set_ylabel('PC2')
    axes[0, 0].set_title(f'Analogy: {a}:{b} :: {c}:?')
    axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Analogy results bar chart
if analogy_results:
    a, b, c, expected, results = analogy_results[0]
    words = [r[0] for r in results]
    sims = [r[1] for r in results]
    y_pos = np.arange(len(words))
    axes[0, 1].barh(y_pos, sims, color='orange', alpha=0.7)
    axes[0, 1].set_yticks(y_pos)
    axes[0, 1].set_yticklabels(words)
    axes[0, 1].set_xlabel('Similarity Score')
    axes[0, 1].set_title(f'{a} - {b} + {c} = ?')
    axes[0, 1].invert_yaxis()

# Plot 3: Embedding space visualization (PCA projection)
# Select subset of words for visualization
viz_words = list(model.vocab.keys())[:30]
viz_vecs = np.array([model.get_embedding(w) for w in viz_words])

# PCA projection
mean_vec = np.mean(viz_vecs, axis=0)
centered = viz_vecs - mean_vec
cov = np.cov(centered.T)
eigenvalues, eigenvectors = np.linalg.eigh(cov)
idx = np.argsort(eigenvalues)[::-1]
proj = centered @ eigenvectors[:, idx[:2]]

scatter = axes[1, 0].scatter(proj[:, 0], proj[:, 1], c=range(len(viz_words)),
                             cmap='tab20', s=60, alpha=0.7)
for i, word in enumerate(viz_words):
    axes[1, 0].annotate(word, (proj[i, 0], proj[i, 1]),
                        xytext=(3, 3), textcoords='offset points', fontsize=7)
axes[1, 0].set_xlabel('Component 1')
axes[1, 0].set_ylabel('Component 2')
axes[1, 0].set_title('Word Embedding Space (PCA Projection)')
axes[1, 0].grid(True, alpha=0.3)

# Plot 4: Pairwise similarity distribution
all_sims = []
for i in range(model.vocab_size):
    v1 = model.embeddings[i] / np.linalg.norm(model.embeddings[i])
    for j in range(i + 1, model.vocab_size):
        v2 = model.embeddings[j] / np.linalg.norm(model.embeddings[j])
        all_sims.append(np.dot(v1, v2))

axes[1, 1].hist(all_sims, bins=30, color='steelblue', alpha=0.7, edgecolor='black')
axes[1, 1].axvline(x=np.mean(all_sims), color='red', linestyle='--',
                   label=f'Mean: {np.mean(all_sims):.3f}')
axes[1, 1].set_xlabel('Cosine Similarity')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Pairwise Similarity Distribution')
axes[1, 1].legend()

plt.tight_layout()
save_plot('analogy_visualization.pdf', 'Word analogy and embedding space visualization')
\end{pycode}
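Concretely, \texttt{analogy(a, b, c)} above returns the vocabulary words that maximize cosine similarity to the offset vector, with the three query words excluded from the candidate set:
\begin{equation}
w^{\ast} = \arg\max_{w \in V \setminus \{a, b, c\}} \; \cos\!\left(\mathbf{v}_w,\; \mathbf{v}_b - \mathbf{v}_a + \mathbf{v}_c\right)
\end{equation}
One consequence is that the third test analogy, \texttt{python}:\texttt{code} :: \texttt{java}:?, can never return its expected answer \texttt{code}, because \texttt{code} is one of the excluded query words.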
\section{t-SNE Visualization}

\begin{pycode}
def tsne(X, n_components=2, perplexity=5.0, n_iter=500, learning_rate=100.0):
    """Simplified t-SNE: fixed Gaussian bandwidth (no per-point perplexity
    calibration) and plain gradient descent without momentum."""
    n_samples = X.shape[0]

    # Compute pairwise squared Euclidean distances
    sum_X = np.sum(X ** 2, axis=1)
    D = sum_X[:, np.newaxis] + sum_X[np.newaxis, :] - 2 * X @ X.T
    D = np.maximum(D, 0)

    # Compute conditional probabilities
    P = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        others = np.concatenate([np.arange(i), np.arange(i + 1, n_samples)])
        Di = D[i, others]
        Pi = np.exp(-Di / (2 * perplexity))
        Pi = Pi / np.sum(Pi)
        P[i, others] = Pi

    # Symmetrize
    P = (P + P.T) / (2 * n_samples)
    P = np.maximum(P, 1e-12)

    # Initialize low-dimensional embedding
    Y = np.random.randn(n_samples, n_components) * 0.01

    # Gradient descent
    for iteration in range(n_iter):
        # Compute Q (Student-t affinities)
        sum_Y = np.sum(Y ** 2, axis=1)
        num = 1 / (1 + sum_Y[:, np.newaxis] + sum_Y[np.newaxis, :] - 2 * Y @ Y.T)
        np.fill_diagonal(num, 0)
        Q = num / np.sum(num)
        Q = np.maximum(Q, 1e-12)

        # Compute gradient
        PQ_diff = P - Q
        grad = np.zeros_like(Y)
        for i in range(n_samples):
            diff = Y[i] - Y
            grad[i] = 4 * np.sum((PQ_diff[i] * num[i])[:, np.newaxis] * diff, axis=0)

        Y -= learning_rate * grad

    return Y

# Apply t-SNE to embeddings
tsne_result = tsne(model.embeddings, perplexity=5.0, n_iter=300)

# Visualize t-SNE result
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Color by word frequency
word_freqs = [word_counts[model.inv_vocab[i]] for i in range(model.vocab_size)]
scatter = axes[0].scatter(tsne_result[:, 0], tsne_result[:, 1],
                          c=word_freqs, cmap='viridis', s=60, alpha=0.7)
for i in range(model.vocab_size):
    axes[0].annotate(model.inv_vocab[i], (tsne_result[i, 0], tsne_result[i, 1]),
                     xytext=(3, 3), textcoords='offset points', fontsize=7)
axes[0].set_xlabel('t-SNE 1')
axes[0].set_ylabel('t-SNE 2')
axes[0].set_title('t-SNE Visualization (colored by frequency)')
plt.colorbar(scatter, ax=axes[0], label='Frequency')

# Highlight semantic clusters
cluster_words = {
    'royalty': ['king', 'queen', 'prince', 'princess'],
    'gender': ['man', 'woman', 'boy', 'girl'],
    'programming': ['python', 'java', 'code'],
    'places': ['france', 'england', 'germany', 'paris', 'london', 'berlin']
}

colors = {'royalty': 'red', 'gender': 'blue', 'programming': 'green', 'places': 'orange'}

axes[1].scatter(tsne_result[:, 0], tsne_result[:, 1], c='lightgray', s=40, alpha=0.3)

for cluster_name, cluster_words_list in cluster_words.items():
    for word in cluster_words_list:
        if word in model.vocab:
            idx = model.vocab[word]
            axes[1].scatter(tsne_result[idx, 0], tsne_result[idx, 1],
                            c=colors[cluster_name], s=100, alpha=0.8, label=cluster_name)
            axes[1].annotate(word, (tsne_result[idx, 0], tsne_result[idx, 1]),
                             xytext=(5, 5), textcoords='offset points', fontsize=9)

# Custom legend (one entry per cluster)
from matplotlib.lines import Line2D
legend_elements = [Line2D([0], [0], marker='o', color='w',
                          markerfacecolor=c, markersize=10, label=n)
                   for n, c in colors.items()]
axes[1].legend(handles=legend_elements, loc='best')
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')
axes[1].set_title('t-SNE with Semantic Clusters')

plt.tight_layout()
save_plot('tsne_visualization.pdf', 't-SNE visualization of word embeddings')
\end{pycode}
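The \texttt{tsne} routine above is deliberately stripped down for the template: full t-SNE calibrates a per-point bandwidth from the target perplexity and uses momentum plus early exaggeration. If scikit-learn happens to be installed, the projection can be cross-checked with its \texttt{TSNE} class; the block below is only a sketch of that comparison and is guarded so the document still compiles without scikit-learn.

\begin{pycode}
# Optional cross-check with scikit-learn's t-SNE, if the library is installed.
try:
    from sklearn.manifold import TSNE
    tsne_sklearn = TSNE(n_components=2, perplexity=5, init='pca',
                        random_state=42).fit_transform(model.embeddings)
    print(f"A reference scikit-learn t-SNE projection of the "
          f"{tsne_sklearn.shape[0]} vocabulary words was computed for comparison.")
except ImportError:
    print("scikit-learn is not installed, so the reference t-SNE comparison is skipped.")
\end{pycode}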
\section{Results Summary}

\subsection{Model Statistics}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Word2Vec Model Statistics}')
print(r'\begin{tabular}{lr}')
print(r'\toprule')
print(r'Metric & Value \\')
print(r'\midrule')
print(f"Vocabulary size & {model.vocab_size} \\\\")
print(f"Embedding dimension & {model.embedding_dim} \\\\")
print(f"Window size & {model.window_size} \\\\")
print(f"Negative samples & {model.neg_samples} \\\\")
print(f"Final loss & {model.losses[-1]:.4f} \\\\")
print(f"Mean vector norm & {np.mean(norms):.3f} \\\\")
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Word Similarity Results}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Top similar words for selected queries}')
print(r'\begin{tabular}{lll}')
print(r'\toprule')
print(r'Query & Similar Words & Scores \\')
print(r'\midrule')

for word, results in similarity_results.items():
    if results:
        similar_str = ', '.join([f"{w}" for w, s in results[:3]])
        scores_str = ', '.join([f"{s:.2f}" for w, s in results[:3]])
        print(f"{word} & {similar_str} & {scores_str} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Analogy Results}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Word analogy task results}')
print(r'\begin{tabular}{llll}')
print(r'\toprule')
print(r'Analogy & Expected & Top Result & Score \\')
print(r'\midrule')

for a, b, c, expected, results in analogy_results:
    if results:
        top_word, top_score = results[0]
        print(f"{a}:{b}::{c}:? & {expected} & {top_word} & {top_score:.3f} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Statistical Summary}
\begin{itemize}
\item Mean pairwise similarity: \py{f"{np.mean(all_sims):.3f}"}
\item Similarity standard deviation: \py{f"{np.std(all_sims):.3f}"}
\item Training epochs: \py{f"{len(model.losses)}"}
\item Loss reduction: \py{f"{(model.losses[0] - model.losses[-1]) / model.losses[0] * 100:.1f}"}\%
\end{itemize}

\section{Conclusion}
This template demonstrates word embedding concepts through a simplified Word2Vec skip-gram implementation. The model learns semantic relationships from co-occurrence patterns, enabling similarity search and analogy tasks, and the t-SNE visualization shows clustering of semantically related words. With a vocabulary of \py{f"{model.vocab_size}"} words and an embedding dimension of \py{f"{model.embedding_dim}"}, the model captures meaningful word relationships despite the very small training corpus.

\end{document}