\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage[makestderr]{pythontex}

\title{Text Analysis: TF-IDF Vectorization and Document Similarity}
\author{Natural Language Processing Templates}
\date{\today}

\begin{document}
\maketitle

\section{Introduction}
This template covers fundamental text analysis techniques: Term Frequency--Inverse Document Frequency (TF-IDF) vectorization, document similarity computation, topic modeling with non-negative matrix factorization (NMF), and word frequency analysis.

\section{Mathematical Framework}

\subsection{Term Frequency (TF)}
The raw count of a term is normalized by the document length:
\begin{equation}
\text{TF}(t, d) = \frac{f_{t,d}}{\sum_{t' \in d} f_{t',d}}
\end{equation}
where $f_{t,d}$ is the count of term $t$ in document $d$.

\subsection{Inverse Document Frequency (IDF)}
IDF measures the importance of a term across the corpus:
\begin{equation}
\text{IDF}(t, D) = \log\left(\frac{|D|}{|\{d \in D : t \in d\}|}\right)
\end{equation}
where $|D|$ is the total number of documents and the denominator counts the documents containing $t$.

\subsection{TF-IDF Score}
The combined TF-IDF weight:
\begin{equation}
\text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D)
\end{equation}

\subsection{Cosine Similarity}
Document similarity in vector space:
\begin{equation}
\text{sim}(\mathbf{d}_1, \mathbf{d}_2) = \frac{\mathbf{d}_1 \cdot \mathbf{d}_2}{\|\mathbf{d}_1\| \|\mathbf{d}_2\|}
\end{equation}
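
As a quick sanity check on these definitions, the short block below works them out by hand for a two-document toy corpus. It is a minimal sketch for illustration only: the two mini-documents and the \texttt{toy\_}-prefixed names are not used anywhere else in this template.

\begin{pycode}
# Worked example of the TF, IDF, TF-IDF, and cosine-similarity definitions above.
import math

toy_docs = [["data", "science", "data"], ["data", "mining"]]
toy_vocab = sorted({t for d in toy_docs for t in d})

def toy_tf(term, doc):
    # term count divided by document length
    return doc.count(term) / len(doc)

def toy_idf(term, docs):
    # log of (number of documents / number of documents containing the term)
    df = sum(1 for d in docs if term in d)
    return math.log(len(docs) / df)

toy_vectors = [[toy_tf(t, d) * toy_idf(t, toy_docs) for t in toy_vocab]
               for d in toy_docs]

def toy_cosine(u, v):
    # dot product divided by the product of Euclidean norms
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v) if norm_u > 0 and norm_v > 0 else 0.0

toy_sim = toy_cosine(*toy_vectors)
\end{pycode}

\noindent
Here the shared term \texttt{data} occurs in both toy documents, so its IDF is $\log(2/2) = 0$ and it drops out of both vectors; the remaining terms do not overlap, and the resulting cosine similarity is \py{f"{toy_sim:.1f}"}.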

\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import re

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption=""):
    # Save the current figure and emit the LaTeX needed to include it.
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.9\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{TF-IDF Implementation}

\begin{pycode}
class TFIDFVectorizer:
    def __init__(self, min_df=1, max_df=1.0):
        self.min_df = min_df
        self.max_df = max_df
        self.vocabulary_ = {}
        self.idf_ = None

    def tokenize(self, text):
        # Lowercase, split on word boundaries, drop stopwords and short tokens.
        words = re.findall(r'\b\w+\b', text.lower())
        stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
                     'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                     'would', 'could', 'should', 'may', 'might', 'must', 'shall',
                     'can', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by',
                     'from', 'as', 'into', 'through', 'and', 'but', 'or', 'nor',
                     'so', 'yet', 'both', 'either', 'neither', 'not', 'only',
                     'that', 'this', 'these', 'those', 'it', 'its'}
        return [w for w in words if w not in stopwords and len(w) > 2]

    def fit(self, documents):
        doc_freq = defaultdict(int)
        all_terms = set()
        n_docs = len(documents)

        for doc in documents:
            terms = set(self.tokenize(doc))
            for term in terms:
                doc_freq[term] += 1
                all_terms.add(term)

        # Interpret min_df/max_df as absolute counts or as corpus fractions.
        min_count = self.min_df if isinstance(self.min_df, int) else int(self.min_df * n_docs)
        max_count = int(self.max_df * n_docs) if isinstance(self.max_df, float) else self.max_df

        filtered_terms = [t for t in all_terms if min_count <= doc_freq[t] <= max_count]
        self.vocabulary_ = {term: idx for idx, term in enumerate(sorted(filtered_terms))}

        # Smoothed variant of the IDF definition above (adds 1 to the
        # document frequency and to the final score).
        self.idf_ = np.zeros(len(self.vocabulary_))
        for term, idx in self.vocabulary_.items():
            self.idf_[idx] = np.log(n_docs / (doc_freq[term] + 1)) + 1

        return self

    def transform(self, documents):
        n_docs = len(documents)
        n_terms = len(self.vocabulary_)
        tfidf_matrix = np.zeros((n_docs, n_terms))

        for doc_idx, doc in enumerate(documents):
            terms = self.tokenize(doc)
            term_counts = Counter(terms)
            total_terms = len(terms)

            for term, count in term_counts.items():
                if term in self.vocabulary_:
                    term_idx = self.vocabulary_[term]
                    tf = count / total_terms if total_terms > 0 else 0
                    tfidf_matrix[doc_idx, term_idx] = tf * self.idf_[term_idx]

        # L2-normalize rows so cosine similarity reduces to a dot product.
        norms = np.linalg.norm(tfidf_matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1
        return tfidf_matrix / norms

    def fit_transform(self, documents):
        self.fit(documents)
        return self.transform(documents)

    def get_feature_names(self):
        return sorted(self.vocabulary_.keys(), key=lambda x: self.vocabulary_[x])

corpus = [
    "Machine learning algorithms can analyze large datasets efficiently.",
    "Deep learning neural networks excel at image recognition tasks.",
    "Natural language processing enables computers to understand text.",
    "Data science combines statistics and programming for insights.",
    "Computer vision uses deep learning for object detection.",
    "Text mining extracts information from unstructured documents.",
    "Artificial intelligence transforms healthcare diagnostics.",
    "Big data analytics requires distributed computing systems.",
    "Supervised learning needs labeled training data.",
    "Unsupervised learning discovers patterns without labels."
]

vectorizer = TFIDFVectorizer(min_df=1, max_df=0.9)
tfidf_matrix = vectorizer.fit_transform(corpus)
vocab_size = len(vectorizer.vocabulary_)
feature_names = vectorizer.get_feature_names()
\end{pycode}
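
As a usage illustration, the fitted vectorizer can also embed a document that is not part of the corpus and score it against every corpus document; here the (arbitrary, template-only) query is ``Neural networks learn image features from labeled data.'' The block reuses \texttt{vectorizer}, \texttt{tfidf\_matrix}, and \texttt{np} from the blocks above, since all \texttt{pycode} blocks run in one shared PythonTeX session.

\begin{pycode}
# Illustrative query: project an unseen document into the fitted TF-IDF space.
query_doc = "Neural networks learn image features from labeled data."
query_vec = vectorizer.transform([query_doc])   # shape (1, vocab_size), L2-normalized
query_sims = tfidf_matrix @ query_vec[0]        # cosine similarity to each corpus document
best_doc_idx = int(np.argmax(query_sims))
print(f"For this query, the closest corpus document is D{best_doc_idx + 1}, "
      f"with cosine similarity {query_sims[best_doc_idx]:.3f}.")
\end{pycode}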

\section{TF-IDF Analysis Visualization}

\begin{pycode}
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Heatmap of the 15 terms with the largest total TF-IDF mass.
top_terms_idx = np.argsort(np.sum(tfidf_matrix, axis=0))[-15:]
subset_matrix = tfidf_matrix[:, top_terms_idx]
top_terms = [feature_names[i] for i in top_terms_idx]

im = axes[0, 0].imshow(subset_matrix, aspect='auto', cmap='YlOrRd')
axes[0, 0].set_xlabel('Terms')
axes[0, 0].set_ylabel('Documents')
axes[0, 0].set_title('TF-IDF Heatmap (Top 15 Terms)')
axes[0, 0].set_xticks(range(len(top_terms)))
axes[0, 0].set_xticklabels(top_terms, rotation=45, ha='right', fontsize=7)
plt.colorbar(im, ax=axes[0, 0], shrink=0.8)

idf_values = vectorizer.idf_[top_terms_idx]
y_pos = np.arange(len(top_terms))
axes[0, 1].barh(y_pos, idf_values, color='steelblue', alpha=0.7)
axes[0, 1].set_yticks(y_pos)
axes[0, 1].set_yticklabels(top_terms, fontsize=7)
axes[0, 1].set_xlabel('IDF Score')
axes[0, 1].set_title('Inverse Document Frequency')

# Corpus-wide term counts (reused later for the Zipf analysis).
all_terms = []
for doc in corpus:
    all_terms.extend(vectorizer.tokenize(doc))
term_counts = Counter(all_terms)
most_common = term_counts.most_common(15)
terms, counts = zip(*most_common)

y_pos = np.arange(len(terms))
axes[1, 0].barh(y_pos, counts, color='green', alpha=0.7)
axes[1, 0].set_yticks(y_pos)
axes[1, 0].set_yticklabels(terms, fontsize=7)
axes[1, 0].set_xlabel('Frequency')
axes[1, 0].set_title('Term Frequency Distribution')

doc_nonzero = np.sum(tfidf_matrix > 0, axis=1)
axes[1, 1].bar(range(1, len(corpus)+1), doc_nonzero, color='purple', alpha=0.7)
axes[1, 1].set_xlabel('Document')
axes[1, 1].set_ylabel('Unique Terms')
axes[1, 1].set_title('Document Term Coverage')

plt.tight_layout()
save_plot('tfidf_analysis.pdf', 'TF-IDF vectorization analysis')
\end{pycode}

\section{Document Similarity}
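Since \texttt{transform()} returns L2-normalized row vectors, the cosine similarity of two documents reduces to a dot product, and the full similarity matrix is simply the Gram matrix of the TF-IDF matrix. For the two-dimensional map in the last panel, the block below applies classical (metric) multidimensional scaling to the cosine distance matrix $D = 1 - S$, where $S$ is the similarity matrix; the matrices $J$ and $B$ are standard MDS notation introduced here for exposition:
\begin{equation}
B = -\tfrac{1}{2}\, J \, D^{\circ 2} \, J, \qquad J = I - \tfrac{1}{n}\mathbf{1}\mathbf{1}^{\mathsf{T}},
\end{equation}
where $D^{\circ 2}$ is the element-wise square of $D$. The embedding coordinates are the leading eigenvectors of $B$, scaled by the square roots of the corresponding eigenvalues.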
\begin{pycode}
def cosine_similarity(matrix):
    # Rows are already L2-normalized, so the Gram matrix gives cosine similarities.
    return np.dot(matrix, matrix.T)

similarity_matrix = cosine_similarity(tfidf_matrix)

n_docs = len(corpus)
similar_pairs = []
for i in range(n_docs):
    for j in range(i+1, n_docs):
        similar_pairs.append((i, j, similarity_matrix[i, j]))

similar_pairs.sort(key=lambda x: x[2], reverse=True)
top_pairs = similar_pairs[:5]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

im = axes[0, 0].imshow(similarity_matrix, cmap='coolwarm', vmin=0, vmax=1)
axes[0, 0].set_xlabel('Document')
axes[0, 0].set_ylabel('Document')
axes[0, 0].set_title('Document Similarity Matrix')
axes[0, 0].set_xticks(range(n_docs))
axes[0, 0].set_yticks(range(n_docs))
axes[0, 0].set_xticklabels([f'D{i+1}' for i in range(n_docs)], fontsize=8)
axes[0, 0].set_yticklabels([f'D{i+1}' for i in range(n_docs)], fontsize=8)
plt.colorbar(im, ax=axes[0, 0], shrink=0.8)

upper_tri = similarity_matrix[np.triu_indices(n_docs, k=1)]
axes[0, 1].hist(upper_tri, bins=20, color='purple', alpha=0.7, edgecolor='black')
axes[0, 1].axvline(x=np.mean(upper_tri), color='red', linestyle='--',
                   label=f'Mean: {np.mean(upper_tri):.3f}')
axes[0, 1].set_xlabel('Cosine Similarity')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Similarity Distribution')
axes[0, 1].legend()

pair_labels = [f'D{p[0]+1}-D{p[1]+1}' for p in top_pairs]
pair_sims = [p[2] for p in top_pairs]
y_pos = np.arange(len(top_pairs))
axes[1, 0].barh(y_pos, pair_sims, color='orange', alpha=0.7)
axes[1, 0].set_yticks(y_pos)
axes[1, 0].set_yticklabels(pair_labels)
axes[1, 0].set_xlabel('Cosine Similarity')
axes[1, 0].set_title('Top 5 Most Similar Document Pairs')
axes[1, 0].set_xlim(0, 1)

def mds_projection(sim_matrix, n_components=2):
    # Classical (metric) MDS: double-center the squared cosine distances
    # and project onto the leading eigenvectors.
    n = sim_matrix.shape[0]
    dist_matrix = 1 - sim_matrix
    J = np.eye(n) - np.ones((n, n)) / n
    B = -0.5 * J @ (dist_matrix ** 2) @ J
    eigenvalues, eigenvectors = np.linalg.eigh(B)
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    coords = eigenvectors[:, :n_components] * np.sqrt(np.abs(eigenvalues[:n_components]))
    return coords

coords = mds_projection(similarity_matrix)
axes[1, 1].scatter(coords[:, 0], coords[:, 1], s=100, alpha=0.7,
                   c=range(n_docs), cmap='tab10', edgecolors='black')
for i in range(n_docs):
    axes[1, 1].annotate(f'D{i+1}', (coords[i, 0], coords[i, 1]),
                        xytext=(5, 5), textcoords='offset points', fontsize=9)
axes[1, 1].set_xlabel('Component 1')
axes[1, 1].set_ylabel('Component 2')
axes[1, 1].set_title('Document Clustering (MDS)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('similarity_analysis.pdf', 'Document similarity analysis')
\end{pycode}

\section{Topic Modeling with NMF}
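The block below factorizes the TF-IDF matrix as $V \approx WH$ with non-negative factors, where each row of $W$ gives a document's topic weights and each row of $H$ gives a topic's term weights. It iterates the standard multiplicative update rules (here $\odot$ and the fractions denote element-wise multiplication and division, juxtaposition is the usual matrix product, and the small constant added in the code guards against division by zero):
\begin{equation}
H \leftarrow H \odot \frac{W^{\mathsf{T}} V}{W^{\mathsf{T}} W H},
\qquad
W \leftarrow W \odot \frac{V H^{\mathsf{T}}}{W H H^{\mathsf{T}}},
\end{equation}
stopping once the Frobenius reconstruction error $\lVert V - WH \rVert_F$ changes by less than the tolerance.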
\begin{pycode}
def simple_nmf(V, n_topics=3, max_iter=100, tol=1e-4):
    # Multiplicative-update NMF; the 1e-10 terms avoid division by zero.
    n_docs, n_terms = V.shape
    W = np.random.rand(n_docs, n_topics) + 0.1
    H = np.random.rand(n_topics, n_terms) + 0.1
    prev_error = np.inf

    for iteration in range(max_iter):
        H = H * (W.T @ V) / (W.T @ W @ H + 1e-10)
        W = W * (V @ H.T) / (W @ H @ H.T + 1e-10)
        error = np.linalg.norm(V - W @ H, 'fro')
        if abs(prev_error - error) < tol:
            break
        prev_error = error

    return W, H

n_topics = 3
W, H = simple_nmf(tfidf_matrix, n_topics=n_topics)

def get_top_terms(H, feature_names, n_top=8):
    topics = []
    for topic_idx in range(H.shape[0]):
        top_indices = H[topic_idx].argsort()[::-1][:n_top]
        top_terms = [(feature_names[i], H[topic_idx, i]) for i in top_indices]
        topics.append(top_terms)
    return topics

topics = get_top_terms(H, feature_names)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

doc_topics = W / W.sum(axis=1, keepdims=True)
im = axes[0, 0].imshow(doc_topics, aspect='auto', cmap='Blues')
axes[0, 0].set_xlabel('Topic')
axes[0, 0].set_ylabel('Document')
axes[0, 0].set_title('Document-Topic Distribution')
axes[0, 0].set_xticks(range(n_topics))
axes[0, 0].set_xticklabels([f'Topic {i+1}' for i in range(n_topics)])
plt.colorbar(im, ax=axes[0, 0], shrink=0.8)

for topic_idx in range(n_topics):
    ax = axes[(topic_idx + 1) // 2, (topic_idx + 1) % 2]
    terms, weights = zip(*topics[topic_idx])
    y_pos = np.arange(len(terms))
    colors = plt.cm.Set2(topic_idx / n_topics)
    ax.barh(y_pos, weights, color=colors, alpha=0.7)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(terms, fontsize=8)
    ax.set_xlabel('Weight')
    ax.set_title(f'Topic {topic_idx + 1} Top Terms')
    ax.invert_yaxis()

plt.tight_layout()
save_plot('topic_modeling.pdf', 'Topic modeling using NMF')
\end{pycode}

\section{Word Frequency Analysis}
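The first panel below checks Zipf's law, which says that the frequency of the $r$-th most common term decays roughly as a power of its rank,
\begin{equation}
f(r) \propto r^{-s}, \qquad \log f(r) = \text{const} - s \log r,
\end{equation}
so fitting a straight line to the rank--frequency data in log--log space estimates the exponent $s$ from the slope. The remaining panels show term frequency versus document frequency, vocabulary growth across the corpus, and document lengths.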
\begin{pycode}
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Zipf's law: rank-frequency plot on log-log axes with a power-law fit.
# term_counts holds the corpus-wide counts computed in the visualization block.
sorted_counts = sorted(term_counts.values(), reverse=True)
ranks = np.arange(1, len(sorted_counts) + 1)
axes[0, 0].loglog(ranks, sorted_counts, 'b-', marker='o', markersize=4)
log_ranks = np.log(ranks)
log_counts = np.log(sorted_counts)
slope, intercept = np.polyfit(log_ranks, log_counts, 1)
fit_line = np.exp(intercept) * ranks ** slope
axes[0, 0].loglog(ranks, fit_line, 'r--', label=f'Slope: {slope:.2f}')
axes[0, 0].set_xlabel('Rank')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title("Zipf's Law Analysis")
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

term_df = defaultdict(int)
for doc in corpus:
    for term in set(vectorizer.tokenize(doc)):
        term_df[term] += 1

tf_vals = [term_counts[t] for t in term_counts.keys()]
df_vals = [term_df[t] for t in term_counts.keys()]

axes[0, 1].scatter(tf_vals, df_vals, alpha=0.6, s=50)
axes[0, 1].set_xlabel('Term Frequency')
axes[0, 1].set_ylabel('Document Frequency')
axes[0, 1].set_title('TF vs DF')
axes[0, 1].grid(True, alpha=0.3)

vocab_growth = []
seen_terms = set()
for i, doc in enumerate(corpus):
    terms = vectorizer.tokenize(doc)
    seen_terms.update(terms)
    vocab_growth.append(len(seen_terms))

axes[1, 0].plot(range(1, len(corpus)+1), vocab_growth, 'g-', marker='o', linewidth=2)
axes[1, 0].set_xlabel('Number of Documents')
axes[1, 0].set_ylabel('Vocabulary Size')
axes[1, 0].set_title('Vocabulary Growth Curve')
axes[1, 0].grid(True, alpha=0.3)

doc_lengths = [len(vectorizer.tokenize(doc)) for doc in corpus]
axes[1, 1].bar(range(1, len(corpus)+1), doc_lengths, color='purple', alpha=0.7)
axes[1, 1].axhline(y=np.mean(doc_lengths), color='red', linestyle='--',
                   label=f'Mean: {np.mean(doc_lengths):.1f}')
axes[1, 1].set_xlabel('Document')
axes[1, 1].set_ylabel('Number of Terms')
axes[1, 1].set_title('Document Length Distribution')
axes[1, 1].legend()

plt.tight_layout()
save_plot('word_frequency.pdf', 'Word frequency analysis')

# Frobenius reconstruction error of the NMF factorization, reported in the summary.
reconstruction_error = np.linalg.norm(tfidf_matrix - W @ H, 'fro')
\end{pycode}

\section{Results Summary}

\subsection{Vectorization Statistics}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{TF-IDF Vectorization Statistics}')
print(r'\begin{tabular}{lr}')
print(r'\toprule')
print(r'Metric & Value \\')
print(r'\midrule')
print(f"Number of documents & {len(corpus)} \\\\")
print(f"Vocabulary size & {vocab_size} \\\\")
print(f"Total tokens in corpus (after filtering) & {len(all_terms)} \\\\")
print(f"Mean document length & {np.mean(doc_lengths):.1f} \\\\")
print(f"Sparsity & {100 * (1 - np.count_nonzero(tfidf_matrix) / tfidf_matrix.size):.1f}\\% \\\\")
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Top Similar Document Pairs}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Most similar document pairs by cosine similarity}')
print(r'\begin{tabular}{cc}')
print(r'\toprule')
print(r'Document Pair & Similarity \\')
print(r'\midrule')

for d1, d2, sim in top_pairs:
    print(f"D{d1+1} -- D{d2+1} & {sim:.3f} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Topic Summary}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Extracted topics and top terms}')
print(r'\begin{tabular}{cl}')
print(r'\toprule')
print(r'Topic & Top Terms \\')
print(r'\midrule')

for i, topic_terms in enumerate(topics):
    terms_str = ', '.join([t[0] for t in topic_terms[:5]])
    print(f"Topic {i+1} & {terms_str} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Statistical Summary}
\begin{itemize}
  \item Mean similarity score: \py{f"{np.mean(upper_tri):.3f}"}
  \item Max similarity score: \py{f"{np.max(upper_tri):.3f}"}
  \item Zipf's law exponent: \py{f"{abs(slope):.2f}"}
  \item NMF reconstruction error: \py{f"{reconstruction_error:.3f}"}
\end{itemize}

\section{Conclusion}
This template demonstrates core text analysis techniques. TF-IDF vectorization transforms text into numerical representations suitable for similarity computation and topic modeling. The similarity analysis reveals document clusters that reflect shared vocabulary, while the NMF-based topic extraction identifies latent themes. Word frequency analysis yields a Zipf exponent of \py{f"{abs(slope):.2f}"} for this small corpus.

\end{document}