\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage[makestderr]{pythontex}

\title{Text Analysis: TF-IDF Vectorization and Document Similarity}
\author{Natural Language Processing Templates}
\date{\today}

\begin{document}
\maketitle

\section{Introduction}
This template covers fundamental text analysis techniques: Term Frequency--Inverse Document Frequency (TF-IDF) vectorization, document similarity computation, topic modeling with non-negative matrix factorization (NMF), and word frequency analysis.

\section{Mathematical Framework}

\subsection{Term Frequency (TF)}
The raw count of a term is normalized by the document length:
\begin{equation}
\text{TF}(t, d) = \frac{f_{t,d}}{\sum_{t' \in d} f_{t',d}}
\end{equation}
where $f_{t,d}$ is the count of term $t$ in document $d$.

\subsection{Inverse Document Frequency (IDF)}
IDF measures the importance of a term across the corpus:
\begin{equation}
\text{IDF}(t, D) = \log\left(\frac{|D|}{|\{d \in D : t \in d\}|}\right)
\end{equation}
where $|D|$ is the total number of documents and the denominator counts the documents containing $t$.

\subsection{TF-IDF Score}
The combined TF-IDF weight:
\begin{equation}
\text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D)
\end{equation}

\subsection{Cosine Similarity}
Document similarity in vector space:
\begin{equation}
\text{sim}(\mathbf{d}_1, \mathbf{d}_2) = \frac{\mathbf{d}_1 \cdot \mathbf{d}_2}{\|\mathbf{d}_1\| \|\mathbf{d}_2\|}
\end{equation}
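
As a quick sanity check on these definitions, the short block below works them out by hand for a two-document toy corpus. It is a minimal sketch for illustration only: the two mini-documents and the \texttt{toy\_}-prefixed names are not used anywhere else in this template.

\begin{pycode}
# Worked example of the TF, IDF, TF-IDF, and cosine-similarity definitions above.
import math

toy_docs = [["data", "science", "data"], ["data", "mining"]]
toy_vocab = sorted({t for d in toy_docs for t in d})

def toy_tf(term, doc):
    # term count divided by document length
    return doc.count(term) / len(doc)

def toy_idf(term, docs):
    # log of (number of documents / number of documents containing the term)
    df = sum(1 for d in docs if term in d)
    return math.log(len(docs) / df)

toy_vectors = [[toy_tf(t, d) * toy_idf(t, toy_docs) for t in toy_vocab]
               for d in toy_docs]

def toy_cosine(u, v):
    # dot product divided by the product of Euclidean norms
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v) if norm_u > 0 and norm_v > 0 else 0.0

toy_sim = toy_cosine(*toy_vectors)
\end{pycode}

\noindent
Here the shared term \texttt{data} occurs in both toy documents, so its IDF is $\log(2/2) = 0$ and it drops out of both vectors; the remaining terms do not overlap, and the resulting cosine similarity is \py{f"{toy_sim:.1f}"}.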

\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import re

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption=""):
    # Save the current figure and emit the LaTeX needed to include it.
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.9\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{TF-IDF Implementation}

\begin{pycode}
class TFIDFVectorizer:
    def __init__(self, min_df=1, max_df=1.0):
        self.min_df = min_df
        self.max_df = max_df
        self.vocabulary_ = {}
        self.idf_ = None

    def tokenize(self, text):
        # Lowercase, split on word boundaries, drop stopwords and short tokens.
        words = re.findall(r'\b\w+\b', text.lower())
        stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
                     'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                     'would', 'could', 'should', 'may', 'might', 'must', 'shall',
                     'can', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by',
                     'from', 'as', 'into', 'through', 'and', 'but', 'or', 'nor',
                     'so', 'yet', 'both', 'either', 'neither', 'not', 'only',
                     'that', 'this', 'these', 'those', 'it', 'its'}
        return [w for w in words if w not in stopwords and len(w) > 2]

    def fit(self, documents):
        doc_freq = defaultdict(int)
        all_terms = set()
        n_docs = len(documents)

        for doc in documents:
            terms = set(self.tokenize(doc))
            for term in terms:
                doc_freq[term] += 1
                all_terms.add(term)

        # Interpret min_df/max_df as absolute counts or as corpus fractions.
        min_count = self.min_df if isinstance(self.min_df, int) else int(self.min_df * n_docs)
        max_count = int(self.max_df * n_docs) if isinstance(self.max_df, float) else self.max_df

        filtered_terms = [t for t in all_terms if min_count <= doc_freq[t] <= max_count]
        self.vocabulary_ = {term: idx for idx, term in enumerate(sorted(filtered_terms))}

        # Smoothed variant of the IDF definition above (adds 1 to the
        # document frequency and to the final score).
        self.idf_ = np.zeros(len(self.vocabulary_))
        for term, idx in self.vocabulary_.items():
            self.idf_[idx] = np.log(n_docs / (doc_freq[term] + 1)) + 1

        return self

    def transform(self, documents):
        n_docs = len(documents)
        n_terms = len(self.vocabulary_)
        tfidf_matrix = np.zeros((n_docs, n_terms))

        for doc_idx, doc in enumerate(documents):
            terms = self.tokenize(doc)
            term_counts = Counter(terms)
            total_terms = len(terms)

            for term, count in term_counts.items():
                if term in self.vocabulary_:
                    term_idx = self.vocabulary_[term]
                    tf = count / total_terms if total_terms > 0 else 0
                    tfidf_matrix[doc_idx, term_idx] = tf * self.idf_[term_idx]

        # L2-normalize rows so cosine similarity reduces to a dot product.
        norms = np.linalg.norm(tfidf_matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1
        return tfidf_matrix / norms

    def fit_transform(self, documents):
        self.fit(documents)
        return self.transform(documents)

    def get_feature_names(self):
        return sorted(self.vocabulary_.keys(), key=lambda x: self.vocabulary_[x])

corpus = [
    "Machine learning algorithms can analyze large datasets efficiently.",
    "Deep learning neural networks excel at image recognition tasks.",
    "Natural language processing enables computers to understand text.",
    "Data science combines statistics and programming for insights.",
    "Computer vision uses deep learning for object detection.",
    "Text mining extracts information from unstructured documents.",
    "Artificial intelligence transforms healthcare diagnostics.",
    "Big data analytics requires distributed computing systems.",
    "Supervised learning needs labeled training data.",
    "Unsupervised learning discovers patterns without labels."
]

vectorizer = TFIDFVectorizer(min_df=1, max_df=0.9)
tfidf_matrix = vectorizer.fit_transform(corpus)
vocab_size = len(vectorizer.vocabulary_)
feature_names = vectorizer.get_feature_names()
\end{pycode}
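
As a usage illustration, the fitted vectorizer can also embed a document that is not part of the corpus and score it against every corpus document; here the (arbitrary, template-only) query is ``Neural networks learn image features from labeled data.'' The block reuses \texttt{vectorizer}, \texttt{tfidf\_matrix}, and \texttt{np} from the blocks above, since all \texttt{pycode} blocks run in one shared PythonTeX session.

\begin{pycode}
# Illustrative query: project an unseen document into the fitted TF-IDF space.
query_doc = "Neural networks learn image features from labeled data."
query_vec = vectorizer.transform([query_doc])   # shape (1, vocab_size), L2-normalized
query_sims = tfidf_matrix @ query_vec[0]        # cosine similarity to each corpus document
best_doc_idx = int(np.argmax(query_sims))
print(f"For this query, the closest corpus document is D{best_doc_idx + 1}, "
      f"with cosine similarity {query_sims[best_doc_idx]:.3f}.")
\end{pycode}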

\section{TF-IDF Analysis Visualization}

\begin{pycode}
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Heatmap of the 15 terms with the largest total TF-IDF mass.
top_terms_idx = np.argsort(np.sum(tfidf_matrix, axis=0))[-15:]
subset_matrix = tfidf_matrix[:, top_terms_idx]
top_terms = [feature_names[i] for i in top_terms_idx]

im = axes[0, 0].imshow(subset_matrix, aspect='auto', cmap='YlOrRd')
axes[0, 0].set_xlabel('Terms')
axes[0, 0].set_ylabel('Documents')
axes[0, 0].set_title('TF-IDF Heatmap (Top 15 Terms)')
axes[0, 0].set_xticks(range(len(top_terms)))
axes[0, 0].set_xticklabels(top_terms, rotation=45, ha='right', fontsize=7)
plt.colorbar(im, ax=axes[0, 0], shrink=0.8)

idf_values = vectorizer.idf_[top_terms_idx]
y_pos = np.arange(len(top_terms))
axes[0, 1].barh(y_pos, idf_values, color='steelblue', alpha=0.7)
axes[0, 1].set_yticks(y_pos)
axes[0, 1].set_yticklabels(top_terms, fontsize=7)
axes[0, 1].set_xlabel('IDF Score')
axes[0, 1].set_title('Inverse Document Frequency')

# Corpus-wide term counts (reused later for the Zipf analysis).
all_terms = []
for doc in corpus:
    all_terms.extend(vectorizer.tokenize(doc))
term_counts = Counter(all_terms)
most_common = term_counts.most_common(15)
terms, counts = zip(*most_common)

y_pos = np.arange(len(terms))
axes[1, 0].barh(y_pos, counts, color='green', alpha=0.7)
axes[1, 0].set_yticks(y_pos)
axes[1, 0].set_yticklabels(terms, fontsize=7)
axes[1, 0].set_xlabel('Frequency')
axes[1, 0].set_title('Term Frequency Distribution')

doc_nonzero = np.sum(tfidf_matrix > 0, axis=1)
axes[1, 1].bar(range(1, len(corpus)+1), doc_nonzero, color='purple', alpha=0.7)
axes[1, 1].set_xlabel('Document')
axes[1, 1].set_ylabel('Unique Terms')
axes[1, 1].set_title('Document Term Coverage')

plt.tight_layout()
save_plot('tfidf_analysis.pdf', 'TF-IDF vectorization analysis')
\end{pycode}

\section{Document Similarity}
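Since \texttt{transform()} returns L2-normalized row vectors, the cosine similarity of two documents reduces to a dot product, and the full similarity matrix is simply the Gram matrix of the TF-IDF matrix. For the two-dimensional map in the last panel, the block below applies classical (metric) multidimensional scaling to the cosine distance matrix $D = 1 - S$, where $S$ is the similarity matrix; the matrices $J$ and $B$ are standard MDS notation introduced here for exposition:
\begin{equation}
B = -\tfrac{1}{2}\, J \, D^{\circ 2} \, J, \qquad J = I - \tfrac{1}{n}\mathbf{1}\mathbf{1}^{\mathsf{T}},
\end{equation}
where $D^{\circ 2}$ is the element-wise square of $D$. The embedding coordinates are the leading eigenvectors of $B$, scaled by the square roots of the corresponding eigenvalues.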
\begin{pycode}
def cosine_similarity(matrix):
    # Rows are already L2-normalized, so the Gram matrix gives cosine similarities.
    return np.dot(matrix, matrix.T)

similarity_matrix = cosine_similarity(tfidf_matrix)

n_docs = len(corpus)
similar_pairs = []
for i in range(n_docs):
    for j in range(i+1, n_docs):
        similar_pairs.append((i, j, similarity_matrix[i, j]))

similar_pairs.sort(key=lambda x: x[2], reverse=True)
top_pairs = similar_pairs[:5]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

im = axes[0, 0].imshow(similarity_matrix, cmap='coolwarm', vmin=0, vmax=1)
axes[0, 0].set_xlabel('Document')
axes[0, 0].set_ylabel('Document')
axes[0, 0].set_title('Document Similarity Matrix')
axes[0, 0].set_xticks(range(n_docs))
axes[0, 0].set_yticks(range(n_docs))
axes[0, 0].set_xticklabels([f'D{i+1}' for i in range(n_docs)], fontsize=8)
axes[0, 0].set_yticklabels([f'D{i+1}' for i in range(n_docs)], fontsize=8)
plt.colorbar(im, ax=axes[0, 0], shrink=0.8)

upper_tri = similarity_matrix[np.triu_indices(n_docs, k=1)]
axes[0, 1].hist(upper_tri, bins=20, color='purple', alpha=0.7, edgecolor='black')
axes[0, 1].axvline(x=np.mean(upper_tri), color='red', linestyle='--',
                   label=f'Mean: {np.mean(upper_tri):.3f}')
axes[0, 1].set_xlabel('Cosine Similarity')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Similarity Distribution')
axes[0, 1].legend()

pair_labels = [f'D{p[0]+1}-D{p[1]+1}' for p in top_pairs]
pair_sims = [p[2] for p in top_pairs]
y_pos = np.arange(len(top_pairs))
axes[1, 0].barh(y_pos, pair_sims, color='orange', alpha=0.7)
axes[1, 0].set_yticks(y_pos)
axes[1, 0].set_yticklabels(pair_labels)
axes[1, 0].set_xlabel('Cosine Similarity')
axes[1, 0].set_title('Top 5 Most Similar Document Pairs')
axes[1, 0].set_xlim(0, 1)

def mds_projection(sim_matrix, n_components=2):
    # Classical (metric) MDS: double-center the squared cosine distances
    # and project onto the leading eigenvectors.
    n = sim_matrix.shape[0]
    dist_matrix = 1 - sim_matrix
    J = np.eye(n) - np.ones((n, n)) / n
    B = -0.5 * J @ (dist_matrix ** 2) @ J
    eigenvalues, eigenvectors = np.linalg.eigh(B)
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    coords = eigenvectors[:, :n_components] * np.sqrt(np.abs(eigenvalues[:n_components]))
    return coords

coords = mds_projection(similarity_matrix)
axes[1, 1].scatter(coords[:, 0], coords[:, 1], s=100, alpha=0.7,
                   c=range(n_docs), cmap='tab10', edgecolors='black')
for i in range(n_docs):
    axes[1, 1].annotate(f'D{i+1}', (coords[i, 0], coords[i, 1]),
                        xytext=(5, 5), textcoords='offset points', fontsize=9)
axes[1, 1].set_xlabel('Component 1')
axes[1, 1].set_ylabel('Component 2')
axes[1, 1].set_title('Document Clustering (MDS)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('similarity_analysis.pdf', 'Document similarity analysis')
\end{pycode}

\section{Topic Modeling with NMF}
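The block below factorizes the TF-IDF matrix as $V \approx WH$ with non-negative factors, where each row of $W$ gives a document's topic weights and each row of $H$ gives a topic's term weights. It iterates the standard multiplicative update rules (here $\odot$ and the fractions denote element-wise multiplication and division, juxtaposition is the usual matrix product, and the small constant added in the code guards against division by zero):
\begin{equation}
H \leftarrow H \odot \frac{W^{\mathsf{T}} V}{W^{\mathsf{T}} W H},
\qquad
W \leftarrow W \odot \frac{V H^{\mathsf{T}}}{W H H^{\mathsf{T}}},
\end{equation}
stopping once the Frobenius reconstruction error $\lVert V - WH \rVert_F$ changes by less than the tolerance.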
\begin{pycode}
def simple_nmf(V, n_topics=3, max_iter=100, tol=1e-4):
    # Multiplicative-update NMF; the 1e-10 terms avoid division by zero.
    n_docs, n_terms = V.shape
    W = np.random.rand(n_docs, n_topics) + 0.1
    H = np.random.rand(n_topics, n_terms) + 0.1
    prev_error = np.inf

    for iteration in range(max_iter):
        H = H * (W.T @ V) / (W.T @ W @ H + 1e-10)
        W = W * (V @ H.T) / (W @ H @ H.T + 1e-10)
        error = np.linalg.norm(V - W @ H, 'fro')
        if abs(prev_error - error) < tol:
            break
        prev_error = error

    return W, H

n_topics = 3
W, H = simple_nmf(tfidf_matrix, n_topics=n_topics)

def get_top_terms(H, feature_names, n_top=8):
    topics = []
    for topic_idx in range(H.shape[0]):
        top_indices = H[topic_idx].argsort()[::-1][:n_top]
        top_terms = [(feature_names[i], H[topic_idx, i]) for i in top_indices]
        topics.append(top_terms)
    return topics

topics = get_top_terms(H, feature_names)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

doc_topics = W / W.sum(axis=1, keepdims=True)
im = axes[0, 0].imshow(doc_topics, aspect='auto', cmap='Blues')
axes[0, 0].set_xlabel('Topic')
axes[0, 0].set_ylabel('Document')
axes[0, 0].set_title('Document-Topic Distribution')
axes[0, 0].set_xticks(range(n_topics))
axes[0, 0].set_xticklabels([f'Topic {i+1}' for i in range(n_topics)])
plt.colorbar(im, ax=axes[0, 0], shrink=0.8)

for topic_idx in range(n_topics):
    ax = axes[(topic_idx + 1) // 2, (topic_idx + 1) % 2]
    terms, weights = zip(*topics[topic_idx])
    y_pos = np.arange(len(terms))
    colors = plt.cm.Set2(topic_idx / n_topics)
    ax.barh(y_pos, weights, color=colors, alpha=0.7)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(terms, fontsize=8)
    ax.set_xlabel('Weight')
    ax.set_title(f'Topic {topic_idx + 1} Top Terms')
    ax.invert_yaxis()

plt.tight_layout()
save_plot('topic_modeling.pdf', 'Topic modeling using NMF')
\end{pycode}

\section{Word Frequency Analysis}
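The first panel below checks Zipf's law, which says that the frequency of the $r$-th most common term decays roughly as a power of its rank,
\begin{equation}
f(r) \propto r^{-s}, \qquad \log f(r) = \text{const} - s \log r,
\end{equation}
so fitting a straight line to the rank--frequency data in log--log space estimates the exponent $s$ from the slope. The remaining panels show term frequency versus document frequency, vocabulary growth across the corpus, and document lengths.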
\begin{pycode}
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Zipf's law: rank-frequency plot on log-log axes with a power-law fit.
# term_counts holds the corpus-wide counts computed in the visualization block.
sorted_counts = sorted(term_counts.values(), reverse=True)
ranks = np.arange(1, len(sorted_counts) + 1)
axes[0, 0].loglog(ranks, sorted_counts, 'b-', marker='o', markersize=4)
log_ranks = np.log(ranks)
log_counts = np.log(sorted_counts)
slope, intercept = np.polyfit(log_ranks, log_counts, 1)
fit_line = np.exp(intercept) * ranks ** slope
axes[0, 0].loglog(ranks, fit_line, 'r--', label=f'Slope: {slope:.2f}')
axes[0, 0].set_xlabel('Rank')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title("Zipf's Law Analysis")
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

term_df = defaultdict(int)
for doc in corpus:
    for term in set(vectorizer.tokenize(doc)):
        term_df[term] += 1

tf_vals = [term_counts[t] for t in term_counts.keys()]
df_vals = [term_df[t] for t in term_counts.keys()]

axes[0, 1].scatter(tf_vals, df_vals, alpha=0.6, s=50)
axes[0, 1].set_xlabel('Term Frequency')
axes[0, 1].set_ylabel('Document Frequency')
axes[0, 1].set_title('TF vs DF')
axes[0, 1].grid(True, alpha=0.3)

vocab_growth = []
seen_terms = set()
for i, doc in enumerate(corpus):
    terms = vectorizer.tokenize(doc)
    seen_terms.update(terms)
    vocab_growth.append(len(seen_terms))

axes[1, 0].plot(range(1, len(corpus)+1), vocab_growth, 'g-', marker='o', linewidth=2)
axes[1, 0].set_xlabel('Number of Documents')
axes[1, 0].set_ylabel('Vocabulary Size')
axes[1, 0].set_title('Vocabulary Growth Curve')
axes[1, 0].grid(True, alpha=0.3)

doc_lengths = [len(vectorizer.tokenize(doc)) for doc in corpus]
axes[1, 1].bar(range(1, len(corpus)+1), doc_lengths, color='purple', alpha=0.7)
axes[1, 1].axhline(y=np.mean(doc_lengths), color='red', linestyle='--',
                   label=f'Mean: {np.mean(doc_lengths):.1f}')
axes[1, 1].set_xlabel('Document')
axes[1, 1].set_ylabel('Number of Terms')
axes[1, 1].set_title('Document Length Distribution')
axes[1, 1].legend()

plt.tight_layout()
save_plot('word_frequency.pdf', 'Word frequency analysis')

# Frobenius reconstruction error of the NMF factorization, reported in the summary.
reconstruction_error = np.linalg.norm(tfidf_matrix - W @ H, 'fro')
\end{pycode}

\section{Results Summary}

\subsection{Vectorization Statistics}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{TF-IDF Vectorization Statistics}')
print(r'\begin{tabular}{lr}')
print(r'\toprule')
print(r'Metric & Value \\')
print(r'\midrule')
print(f"Number of documents & {len(corpus)} \\\\")
print(f"Vocabulary size & {vocab_size} \\\\")
print(f"Total tokens in corpus (after filtering) & {len(all_terms)} \\\\")
print(f"Mean document length & {np.mean(doc_lengths):.1f} \\\\")
print(f"Sparsity & {100 * (1 - np.count_nonzero(tfidf_matrix) / tfidf_matrix.size):.1f}\\% \\\\")
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Top Similar Document Pairs}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Most similar document pairs by cosine similarity}')
print(r'\begin{tabular}{cc}')
print(r'\toprule')
print(r'Document Pair & Similarity \\')
print(r'\midrule')

for d1, d2, sim in top_pairs:
    print(f"D{d1+1} -- D{d2+1} & {sim:.3f} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Topic Summary}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Extracted topics and top terms}')
print(r'\begin{tabular}{cl}')
print(r'\toprule')
print(r'Topic & Top Terms \\')
print(r'\midrule')

for i, topic_terms in enumerate(topics):
    terms_str = ', '.join([t[0] for t in topic_terms[:5]])
    print(f"Topic {i+1} & {terms_str} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Statistical Summary}
\begin{itemize}
  \item Mean similarity score: \py{f"{np.mean(upper_tri):.3f}"}
  \item Max similarity score: \py{f"{np.max(upper_tri):.3f}"}
  \item Zipf's law exponent: \py{f"{abs(slope):.2f}"}
  \item NMF reconstruction error: \py{f"{reconstruction_error:.3f}"}
\end{itemize}

\section{Conclusion}
This template demonstrates core text analysis techniques. TF-IDF vectorization transforms text into numerical representations suitable for similarity computation and topic modeling. The similarity analysis reveals document clusters that reflect shared vocabulary, while the NMF-based topic extraction identifies latent themes. Word frequency analysis yields a Zipf exponent of \py{f"{abs(slope):.2f}"} for this small corpus.

\end{document}