Path: blob/main/latex-templates/templates/nlp/text_analysis.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage[makestderr]{pythontex}

\title{Text Analysis: TF-IDF Vectorization and Document Similarity}
\author{Natural Language Processing Templates}
\date{\today}

\begin{document}
\maketitle

\section{Introduction}
This template explores fundamental text analysis techniques, including Term Frequency--Inverse Document Frequency (TF-IDF) vectorization, document similarity computation, topic modeling, and word frequency analysis.

\section{Mathematical Framework}

\subsection{Term Frequency (TF)}
Raw term counts are normalized by document length:
\begin{equation}
\text{TF}(t, d) = \frac{f_{t,d}}{\sum_{t' \in d} f_{t',d}}
\end{equation}
where $f_{t,d}$ is the count of term $t$ in document $d$.

\subsection{Inverse Document Frequency (IDF)}
IDF measures the importance of a term across the corpus:
\begin{equation}
\text{IDF}(t, D) = \log\left(\frac{|D|}{|\{d \in D : t \in d\}|}\right)
\end{equation}
where $|D|$ is the total number of documents.

\subsection{TF-IDF Score}
The combined TF-IDF weight:
\begin{equation}
\text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D)
\end{equation}

\subsection{Cosine Similarity}
Document similarity in vector space:
\begin{equation}
\text{sim}(\mathbf{d}_1, \mathbf{d}_2) = \frac{\mathbf{d}_1 \cdot \mathbf{d}_2}{\|\mathbf{d}_1\| \|\mathbf{d}_2\|}
\end{equation}
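
As a quick worked example (a toy corpus, separate from the one analyzed below), take two documents $d_1$ = ``deep learning models'' and $d_2$ = ``deep networks''. For the term \emph{learning},
\begin{equation*}
\text{TF}(\text{learning}, d_1) = \tfrac{1}{3}, \qquad
\text{IDF}(\text{learning}, D) = \log\tfrac{2}{1} \approx 0.693, \qquad
\text{TF-IDF} \approx 0.231,
\end{equation*}
whereas \emph{deep} appears in both documents, so $\text{IDF}(\text{deep}, D) = \log\tfrac{2}{2} = 0$ and its weight vanishes: terms shared by every document carry no discriminative information. (The implementation below uses a smoothed IDF, $\log\bigl(|D|/(n_t+1)\bigr) + 1$ with $n_t$ the number of documents containing $t$, so the weights of ubiquitous terms are damped rather than zeroed out.)
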
\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import re

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption=""):
    """Save the current figure and print the LaTeX needed to include it."""
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.9\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{TF-IDF Implementation}

\begin{pycode}
class TFIDFVectorizer:
    def __init__(self, min_df=1, max_df=1.0):
        self.min_df = min_df
        self.max_df = max_df
        self.vocabulary_ = {}
        self.idf_ = None

    def tokenize(self, text):
        words = re.findall(r'\b\w+\b', text.lower())
        stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
                     'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                     'would', 'could', 'should', 'may', 'might', 'must', 'shall',
                     'can', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by',
                     'from', 'as', 'into', 'through', 'and', 'but', 'or', 'nor',
                     'so', 'yet', 'both', 'either', 'neither', 'not', 'only',
                     'that', 'this', 'these', 'those', 'it', 'its'}
        return [w for w in words if w not in stopwords and len(w) > 2]

    def fit(self, documents):
        doc_freq = defaultdict(int)
        all_terms = set()
        n_docs = len(documents)

        for doc in documents:
            terms = set(self.tokenize(doc))
            for term in terms:
                doc_freq[term] += 1
                all_terms.add(term)

        # min_df/max_df may be given as absolute counts (int) or proportions (float)
        min_count = self.min_df if isinstance(self.min_df, int) else int(self.min_df * n_docs)
        max_count = int(self.max_df * n_docs) if isinstance(self.max_df, float) else self.max_df

        filtered_terms = [t for t in all_terms if min_count <= doc_freq[t] <= max_count]
        self.vocabulary_ = {term: idx for idx, term in enumerate(sorted(filtered_terms))}

        # Smoothed IDF, log(n/(df+1)) + 1: stays strictly positive and differs
        # slightly from the plain IDF defined in Section 2.
        self.idf_ = np.zeros(len(self.vocabulary_))
        for term, idx in self.vocabulary_.items():
            self.idf_[idx] = np.log(n_docs / (doc_freq[term] + 1)) + 1

        return self

    def transform(self, documents):
        n_docs = len(documents)
        n_terms = len(self.vocabulary_)
        tfidf_matrix = np.zeros((n_docs, n_terms))

        for doc_idx, doc in enumerate(documents):
            terms = self.tokenize(doc)
            term_counts = Counter(terms)
            total_terms = len(terms)

            for term, count in term_counts.items():
                if term in self.vocabulary_:
                    term_idx = self.vocabulary_[term]
                    tf = count / total_terms if total_terms > 0 else 0
                    tfidf_matrix[doc_idx, term_idx] = tf * self.idf_[term_idx]

        # L2-normalize each row so that dot products are cosine similarities
        norms = np.linalg.norm(tfidf_matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1
        return tfidf_matrix / norms

    def fit_transform(self, documents):
        self.fit(documents)
        return self.transform(documents)

    def get_feature_names(self):
        return sorted(self.vocabulary_.keys(), key=lambda x: self.vocabulary_[x])

corpus = [
    "Machine learning algorithms can analyze large datasets efficiently.",
    "Deep learning neural networks excel at image recognition tasks.",
    "Natural language processing enables computers to understand text.",
    "Data science combines statistics and programming for insights.",
    "Computer vision uses deep learning for object detection.",
    "Text mining extracts information from unstructured documents.",
    "Artificial intelligence transforms healthcare diagnostics.",
    "Big data analytics requires distributed computing systems.",
    "Supervised learning needs labeled training data.",
    "Unsupervised learning discovers patterns without labels."
]

vectorizer = TFIDFVectorizer(min_df=1, max_df=0.9)
tfidf_matrix = vectorizer.fit_transform(corpus)
vocab_size = len(vectorizer.vocabulary_)
feature_names = vectorizer.get_feature_names()
\end{pycode}
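
The class follows the familiar \texttt{fit}/\texttt{transform} interface of scikit-learn's \texttt{TfidfVectorizer}; for production work the library implementation is usually preferable. As a minimal sanity check (a sketch that assumes the default shared PythonTeX session, so the objects defined above are still in scope), the block below verifies the row normalization and reports the most heavily weighted term of the first document.

\begin{pycode}
# Sanity check (assumes the default shared PythonTeX session, so tfidf_matrix
# and feature_names from the previous block are still available).
# After transform(), every non-empty row has unit L2 norm, and the argmax of a
# row picks out the most distinctive term of that document.
row_norms = np.linalg.norm(tfidf_matrix, axis=1)
top_term_doc1 = feature_names[int(np.argmax(tfidf_matrix[0]))]
print(r'Row norms of the TF-IDF matrix lie in $[%.3f, %.3f]$, '
      % (row_norms.min(), row_norms.max()))
print(r'and the highest-weighted term of document~1 is \emph{%s}.' % top_term_doc1)
\end{pycode}
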
\section{TF-IDF Analysis Visualization}

\begin{pycode}
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

top_terms_idx = np.argsort(np.sum(tfidf_matrix, axis=0))[-15:]
subset_matrix = tfidf_matrix[:, top_terms_idx]
top_terms = [feature_names[i] for i in top_terms_idx]

im = axes[0, 0].imshow(subset_matrix, aspect='auto', cmap='YlOrRd')
axes[0, 0].set_xlabel('Terms')
axes[0, 0].set_ylabel('Documents')
axes[0, 0].set_title('TF-IDF Heatmap (Top 15 Terms)')
axes[0, 0].set_xticks(range(len(top_terms)))
axes[0, 0].set_xticklabels(top_terms, rotation=45, ha='right', fontsize=7)
plt.colorbar(im, ax=axes[0, 0], shrink=0.8)

idf_values = vectorizer.idf_[top_terms_idx]
y_pos = np.arange(len(top_terms))
axes[0, 1].barh(y_pos, idf_values, color='steelblue', alpha=0.7)
axes[0, 1].set_yticks(y_pos)
axes[0, 1].set_yticklabels(top_terms, fontsize=7)
axes[0, 1].set_xlabel('IDF Score')
axes[0, 1].set_title('Inverse Document Frequency')

all_terms = []
for doc in corpus:
    all_terms.extend(vectorizer.tokenize(doc))
term_counts = Counter(all_terms)
most_common = term_counts.most_common(15)
terms, counts = zip(*most_common)

y_pos = np.arange(len(terms))
axes[1, 0].barh(y_pos, counts, color='green', alpha=0.7)
axes[1, 0].set_yticks(y_pos)
axes[1, 0].set_yticklabels(terms, fontsize=7)
axes[1, 0].set_xlabel('Frequency')
axes[1, 0].set_title('Term Frequency Distribution')

doc_nonzero = np.sum(tfidf_matrix > 0, axis=1)
axes[1, 1].bar(range(1, len(corpus)+1), doc_nonzero, color='purple', alpha=0.7)
axes[1, 1].set_xlabel('Document')
axes[1, 1].set_ylabel('Unique Terms')
axes[1, 1].set_title('Document Term Coverage')

plt.tight_layout()
save_plot('tfidf_analysis.pdf', 'TF-IDF vectorization analysis')
\end{pycode}

\section{Document Similarity}

\begin{pycode}
def cosine_similarity(matrix):
    # Rows are already L2-normalized, so the Gram matrix equals cosine similarity.
    return np.dot(matrix, matrix.T)

similarity_matrix = cosine_similarity(tfidf_matrix)

n_docs = len(corpus)
similar_pairs = []
for i in range(n_docs):
    for j in range(i+1, n_docs):
        similar_pairs.append((i, j, similarity_matrix[i, j]))

similar_pairs.sort(key=lambda x: x[2], reverse=True)
top_pairs = similar_pairs[:5]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

im = axes[0, 0].imshow(similarity_matrix, cmap='coolwarm', vmin=0, vmax=1)
axes[0, 0].set_xlabel('Document')
axes[0, 0].set_ylabel('Document')
axes[0, 0].set_title('Document Similarity Matrix')
axes[0, 0].set_xticks(range(n_docs))
axes[0, 0].set_yticks(range(n_docs))
axes[0, 0].set_xticklabels([f'D{i+1}' for i in range(n_docs)], fontsize=8)
axes[0, 0].set_yticklabels([f'D{i+1}' for i in range(n_docs)], fontsize=8)
plt.colorbar(im, ax=axes[0, 0], shrink=0.8)

upper_tri = similarity_matrix[np.triu_indices(n_docs, k=1)]
axes[0, 1].hist(upper_tri, bins=20, color='purple', alpha=0.7, edgecolor='black')
axes[0, 1].axvline(x=np.mean(upper_tri), color='red', linestyle='--',
                   label=f'Mean: {np.mean(upper_tri):.3f}')
axes[0, 1].set_xlabel('Cosine Similarity')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Similarity Distribution')
axes[0, 1].legend()

pair_labels = [f'D{p[0]+1}-D{p[1]+1}' for p in top_pairs]
pair_sims = [p[2] for p in top_pairs]
y_pos = np.arange(len(top_pairs))
axes[1, 0].barh(y_pos, pair_sims, color='orange', alpha=0.7)
axes[1, 0].set_yticks(y_pos)
axes[1, 0].set_yticklabels(pair_labels)
axes[1, 0].set_xlabel('Cosine Similarity')
axes[1, 0].set_title('Top 5 Most Similar Document Pairs')
axes[1, 0].set_xlim(0, 1)

def mds_projection(sim_matrix, n_components=2):
    # Classical multidimensional scaling on the dissimilarity matrix 1 - sim.
    n = sim_matrix.shape[0]
    dist_matrix = 1 - sim_matrix
    J = np.eye(n) - np.ones((n, n)) / n
    B = -0.5 * J @ (dist_matrix ** 2) @ J
    eigenvalues, eigenvectors = np.linalg.eigh(B)
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    coords = eigenvectors[:, :n_components] * np.sqrt(np.abs(eigenvalues[:n_components]))
    return coords

coords = mds_projection(similarity_matrix)
axes[1, 1].scatter(coords[:, 0], coords[:, 1], s=100, alpha=0.7,
                   c=range(n_docs), cmap='tab10', edgecolors='black')
for i in range(n_docs):
    axes[1, 1].annotate(f'D{i+1}', (coords[i, 0], coords[i, 1]),
                        xytext=(5, 5), textcoords='offset points', fontsize=9)
axes[1, 1].set_xlabel('Component 1')
axes[1, 1].set_ylabel('Component 2')
axes[1, 1].set_title('Document Clustering (MDS)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('similarity_analysis.pdf', 'Document similarity analysis')
\end{pycode}
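
The shortcut in \texttt{cosine\_similarity} relies on the rows of the TF-IDF matrix already having unit $L_2$ norm. The brief check below (again a sketch assuming the shared PythonTeX session) confirms that explicitly re-normalizing the rows and forming the Gram matrix reproduces \texttt{similarity\_matrix} up to floating-point error.

\begin{pycode}
# Cross-check of the cosine-similarity shortcut: explicit re-normalization
# followed by a matrix product should match similarity_matrix.
norms_check = np.linalg.norm(tfidf_matrix, axis=1, keepdims=True)
unit_rows = tfidf_matrix / norms_check
max_dev = np.max(np.abs(unit_rows @ unit_rows.T - similarity_matrix))
print(r'Maximum deviation between the two formulations: \num{%.1e}.' % max_dev)
\end{pycode}
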
\section{Topic Modeling with NMF}

\begin{pycode}
def simple_nmf(V, n_topics=3, max_iter=100, tol=1e-4):
    n_docs, n_terms = V.shape
    W = np.random.rand(n_docs, n_topics) + 0.1
    H = np.random.rand(n_topics, n_terms) + 0.1

    for iteration in range(max_iter):
        H = H * (W.T @ V) / (W.T @ W @ H + 1e-10)
        W = W * (V @ H.T) / (W @ H @ H.T + 1e-10)
        error = np.linalg.norm(V - W @ H, 'fro')
        if iteration > 0 and abs(prev_error - error) < tol:
            break
        prev_error = error

    return W, H

n_topics = 3
W, H = simple_nmf(tfidf_matrix, n_topics=n_topics)

def get_top_terms(H, feature_names, n_top=8):
    topics = []
    for topic_idx in range(H.shape[0]):
        top_indices = H[topic_idx].argsort()[::-1][:n_top]
        top_terms = [(feature_names[i], H[topic_idx, i]) for i in top_indices]
        topics.append(top_terms)
    return topics

topics = get_top_terms(H, feature_names)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

doc_topics = W / W.sum(axis=1, keepdims=True)
im = axes[0, 0].imshow(doc_topics, aspect='auto', cmap='Blues')
axes[0, 0].set_xlabel('Topic')
axes[0, 0].set_ylabel('Document')
axes[0, 0].set_title('Document-Topic Distribution')
axes[0, 0].set_xticks(range(n_topics))
axes[0, 0].set_xticklabels([f'Topic {i+1}' for i in range(n_topics)])
plt.colorbar(im, ax=axes[0, 0], shrink=0.8)

for topic_idx in range(n_topics):
    ax = axes[(topic_idx + 1) // 2, (topic_idx + 1) % 2]
    terms, weights = zip(*topics[topic_idx])
    y_pos = np.arange(len(terms))
    colors = plt.cm.Set2(topic_idx / n_topics)
    ax.barh(y_pos, weights, color=colors, alpha=0.7)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(terms, fontsize=8)
    ax.set_xlabel('Weight')
    ax.set_title(f'Topic {topic_idx + 1} Top Terms')
    ax.invert_yaxis()

plt.tight_layout()
save_plot('topic_modeling.pdf', 'Topic modeling using NMF')
\end{pycode}
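
The update rules inside \texttt{simple\_nmf} are the standard multiplicative updates of Lee and Seung for the Frobenius objective $\lVert V - WH \rVert_F^2$:
\begin{equation}
H \leftarrow H \odot \frac{W^{\top} V}{W^{\top} W H}, \qquad
W \leftarrow W \odot \frac{V H^{\top}}{W H H^{\top}},
\end{equation}
where $\odot$ and the fractions denote element-wise operations; the small constant $10^{-10}$ in the code guards against division by zero. Because all factors remain non-negative, the reconstruction error is non-increasing under these updates (up to the regularization constant), which is why the loop simply stops once successive errors differ by less than \texttt{tol}.
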
\section{Word Frequency Analysis}

\begin{pycode}
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# term_counts (built in the visualization section) holds corpus-wide token counts.
sorted_counts = sorted(term_counts.values(), reverse=True)
ranks = np.arange(1, len(sorted_counts) + 1)
axes[0, 0].loglog(ranks, sorted_counts, 'b-', marker='o', markersize=4)
log_ranks = np.log(ranks)
log_counts = np.log(sorted_counts)
slope, intercept = np.polyfit(log_ranks, log_counts, 1)
fit_line = np.exp(intercept) * ranks ** slope
axes[0, 0].loglog(ranks, fit_line, 'r--', label=f'Slope: {slope:.2f}')
axes[0, 0].set_xlabel('Rank')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title("Zipf's Law Analysis")
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

term_df = defaultdict(int)
for doc in corpus:
    for term in set(vectorizer.tokenize(doc)):
        term_df[term] += 1

tf_vals = [term_counts[t] for t in term_counts.keys()]
df_vals = [term_df[t] for t in term_counts.keys()]

axes[0, 1].scatter(tf_vals, df_vals, alpha=0.6, s=50)
axes[0, 1].set_xlabel('Term Frequency')
axes[0, 1].set_ylabel('Document Frequency')
axes[0, 1].set_title('TF vs DF')
axes[0, 1].grid(True, alpha=0.3)

vocab_growth = []
seen_terms = set()
for doc in corpus:
    terms = vectorizer.tokenize(doc)
    seen_terms.update(terms)
    vocab_growth.append(len(seen_terms))

axes[1, 0].plot(range(1, len(corpus)+1), vocab_growth, 'g-', marker='o', linewidth=2)
axes[1, 0].set_xlabel('Number of Documents')
axes[1, 0].set_ylabel('Vocabulary Size')
axes[1, 0].set_title('Vocabulary Growth Curve')
axes[1, 0].grid(True, alpha=0.3)

doc_lengths = [len(vectorizer.tokenize(doc)) for doc in corpus]
axes[1, 1].bar(range(1, len(corpus)+1), doc_lengths, color='purple', alpha=0.7)
axes[1, 1].axhline(y=np.mean(doc_lengths), color='red', linestyle='--',
                   label=f'Mean: {np.mean(doc_lengths):.1f}')
axes[1, 1].set_xlabel('Document')
axes[1, 1].set_ylabel('Number of Terms')
axes[1, 1].set_title('Document Length Distribution')
axes[1, 1].legend()

plt.tight_layout()
save_plot('word_frequency.pdf', 'Word frequency analysis')

reconstruction_error = np.linalg.norm(tfidf_matrix - W @ H, 'fro')
\end{pycode}

\section{Results Summary}

\subsection{Vectorization Statistics}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{TF-IDF Vectorization Statistics}')
print(r'\begin{tabular}{lr}')
print(r'\toprule')
print(r'Metric & Value \\')
print(r'\midrule')
print(f"Number of documents & {len(corpus)} \\\\")
print(f"Vocabulary size & {vocab_size} \\\\")
print(f"Total tokens (after stopword filtering) & {len(all_terms)} \\\\")
print(f"Mean document length & {np.mean(doc_lengths):.1f} \\\\")
print(f"Sparsity & {100 * (1 - np.count_nonzero(tfidf_matrix) / tfidf_matrix.size):.1f}\\% \\\\")
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Top Similar Document Pairs}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Most similar document pairs by cosine similarity}')
print(r'\begin{tabular}{cc}')
print(r'\toprule')
print(r'Document Pair & Similarity \\')
print(r'\midrule')

for d1, d2, sim in top_pairs:
    print(f"D{d1+1} -- D{d2+1} & {sim:.3f} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Topic Summary}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Extracted topics and top terms}')
print(r'\begin{tabular}{cl}')
print(r'\toprule')
print(r'Topic & Top Terms \\')
print(r'\midrule')

for i, topic_terms in enumerate(topics):
    terms_str = ', '.join([t[0] for t in topic_terms[:5]])
    print(f"Topic {i+1} & {terms_str} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Statistical Summary}
\begin{itemize}
\item Mean similarity score: \py{f"{np.mean(upper_tri):.3f}"}
\item Maximum similarity score: \py{f"{np.max(upper_tri):.3f}"}
\item Zipf's law exponent: \py{f"{abs(slope):.2f}"}
\item NMF reconstruction error: \py{f"{reconstruction_error:.3f}"}
\end{itemize}

\section{Conclusion}
This template demonstrates core text analysis techniques. TF-IDF vectorization transforms text into numerical representations suitable for similarity computation and topic modeling. The analysis reveals document clusters based on shared vocabulary, while the NMF-based topic extraction identifies latent themes. Word frequency analysis shows an approximately Zipfian rank--frequency relationship with a fitted exponent of \py{f"{abs(slope):.2f}"}.

\end{document}