% Path: blob/main/latex-templates/templates/nlp/sentiment.tex
% 51 views
% unlisted
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage[makestderr]{pythontex}

\title{Sentiment Analysis: Lexicon-Based and Machine Learning Approaches}
\author{Natural Language Processing Templates}
\date{\today}

\begin{document}
\maketitle

\section{Introduction}
Sentiment analysis determines the emotional tone of text, classifying it as positive, negative, or neutral.
This template implements two complementary approaches: lexicon-based scoring (similar to VADER) and a Naive Bayes classifier.

\section{Mathematical Framework}

\subsection{Lexicon-Based Sentiment Scoring}
The compound sentiment score aggregates individual word scores:
\begin{equation}
S_{\text{compound}} = \frac{\sum_{i=1}^{n} v_i}{\sqrt{\left(\sum_{i=1}^{n} v_i\right)^2 + \alpha}},
\end{equation}
where $v_i$ is the valence score for word $i$ and $\alpha$ is a normalization constant (here $\alpha = 15$).

\subsection{Naive Bayes Classification}
For text classification, we use Bayes' theorem:
\begin{equation}
P(c \mid d) = \frac{P(d \mid c)\,P(c)}{P(d)}.
\end{equation}
Under the naive independence assumption:
\begin{equation}
P(d \mid c) = \prod_{i=1}^{n} P(w_i \mid c).
\end{equation}

Using log-probabilities to avoid underflow:
\begin{equation}
\log P(c \mid d) = \log P(c) + \sum_{i=1}^{n} \log P(w_i \mid c) - \log P(d),
\end{equation}
where $\log P(d)$ is the same for every class and can therefore be ignored when comparing classes.

\section{Environment Setup}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import re

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption=""):
    """Save the current matplotlib figure and print a LaTeX figure that includes it.

    filename -- output path passed to plt.savefig and to \\includegraphics
    caption  -- optional caption text; no \\caption is emitted when empty

    The printed LaTeX is picked up by PythonTeX and inserted at the position
    of the enclosing pycode environment. The current figure is closed afterwards.
    """
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[htbp]')
    print(r'\centering')
    print(r'\includegraphics[width=0.9\textwidth]{' + filename + '}')
    if caption:
        print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Lexicon-Based Sentiment Analysis}

\begin{pycode}
# VADER-like sentiment lexicon (simplified)
sentiment_lexicon = {
    # Positive words
    'good': 1.9, 'great': 3.1, 'excellent': 3.4, 'amazing': 3.6,
    'wonderful': 3.2, 'fantastic': 3.4, 'love': 3.2, 'happy': 2.7,
    'best': 3.0, 'beautiful': 2.9, 'perfect': 3.4, 'awesome': 3.3,
    'nice': 1.8, 'pleasant': 2.1, 'helpful': 2.0, 'recommend': 2.4,
    # Negative words
    'bad': -2.5, 'terrible': -3.4, 'awful': -3.5, 'horrible': -3.6,
    'hate': -3.3, 'worst': -3.4, 'poor': -2.6, 'disappointing': -2.7,
    'broken': -2.3, 'waste': -2.8, 'useless': -2.9, 'avoid': -2.5,
    'problem': -1.8, 'fail': -2.8, 'wrong': -2.1, 'annoying': -2.3,
    # Intensifiers and negations
    # NOTE(review): these words also modify the following word in
    # lexicon_sentiment (as boosters / negations), so they contribute twice.
    # Confirm this is intended.
    'very': 0.3, 'really': 0.3, 'extremely': 0.4, 'not': -0.74,
    'never': -0.74, 'barely': -0.4, 'hardly': -0.4
}

# Booster words that modify intensity
boosters = {'very': 0.293, 'really': 0.293, 'extremely': 0.326,
            'absolutely': 0.312, 'completely': 0.296, 'totally': 0.287}

# Words that flip (and dampen) the valence of the word that follows them
negations = ('not', 'never')

def normalize_score(score, alpha=15):
    """Normalize score to the open interval (-1, 1) using VADER-like normalization"""
    return score / np.sqrt(score**2 + alpha)

def lexicon_sentiment(text):
    """Calculate sentiment scores using a lexicon-based approach.

    Returns a dict with keys:
      'compound' -- normalized sum of the adjusted word valences (0 if no
                    lexicon word occurs in the text)
      'pos', 'neg', 'neu' -- proportions of positive, negative and neutral
                    evidence; they sum to 1
    """
    # Keep contractions such as "isn't" as one token so that the "n't"
    # negation check below can actually match.
    words = re.findall(r"\b\w+(?:'\w+)?\b", text.lower())

    pos_sum = 0.0
    neg_sum = 0.0
    neu_count = 0
    scores = []

    for i, word in enumerate(words):
        if word not in sentiment_lexicon:
            # Tokens without a valence count as neutral evidence
            neu_count += 1
            continue

        score = sentiment_lexicon[word]
        prev = words[i - 1] if i > 0 else None

        # A preceding booster pushes the score away from zero
        if prev in boosters:
            if score > 0:
                score += boosters[prev]
            else:
                score -= boosters[prev]

        # A preceding negation flips the sign and halves the magnitude
        if prev is not None and (prev in negations or prev.endswith("n't")):
            score *= -0.5

        scores.append(score)
        # As in VADER, add 1 to the magnitude of each sentiment-bearing word
        # so that it is comparable with neutral words, which count as 1 each.
        if score > 0:
            pos_sum += score + 1
        elif score < 0:
            neg_sum += score - 1
        else:
            neu_count += 1

    compound = normalize_score(sum(scores)) if scores else 0

    # Proportions of positive / negative / neutral evidence
    total = pos_sum + abs(neg_sum) + neu_count
    if total > 0:
        pos = pos_sum / total
        neg = abs(neg_sum) / total
        neu = neu_count / total
    else:
        # Empty text: nothing to score
        pos = neg = 0
        neu = 1.0

    return {'compound': compound, 'pos': pos, 'neg': neg, 'neu': neu}
\end{pycode}

\section{Sample Text Analysis}

\begin{pycode}
# Sample reviews for analysis
sample_texts = [
    "This product is absolutely amazing! Best purchase I've ever made. Highly recommend!",
    "Terrible quality, completely broken on arrival. Worst experience ever. Avoid!",
    "It's okay, nothing special. Does what it's supposed to do.",
    "Really love this! Great quality and very helpful customer service.",
    "Very disappointing. The product failed after one week. Poor design.",
    "Fantastic! Exceeded all my expectations. Wonderful purchase!",
    "Not bad, but not great either. Some minor problems.",
    "Absolutely horrible. Total waste of money. Never buying again.",
    "Good value for the price. Nice quality overall.",
    "Perfect! Beautiful design and excellent functionality. Amazing product!"
]

# Analyze all samples
results = [lexicon_sentiment(text) for text in sample_texts]

# Extract compound scores
compound_scores = [r['compound'] for r in results]

# Create sentiment distribution plot
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Compound scores bar chart
colors = ['green' if s > 0.05 else 'red' if s < -0.05 else 'gray' for s in compound_scores]
axes[0, 0].barh(range(len(compound_scores)), compound_scores, color=colors, alpha=0.7)
axes[0, 0].axvline(x=0, color='black', linestyle='-', linewidth=0.5)
axes[0, 0].axvline(x=0.05, color='green', linestyle='--', alpha=0.5)
axes[0, 0].axvline(x=-0.05, color='red', linestyle='--', alpha=0.5)
axes[0, 0].set_xlabel('Compound Score')
axes[0, 0].set_ylabel('Document Index')
axes[0, 0].set_title('Sentiment Compound Scores')
axes[0, 0].set_xlim(-1, 1)

# Plot 2: Sentiment component breakdown
pos_scores = [r['pos'] for r in results]
neg_scores = [r['neg'] for r in results]
neu_scores = [r['neu'] for r in results]

x = np.arange(len(results))
width = 0.25
axes[0, 1].bar(x - width, pos_scores, width, label='Positive', color='green', alpha=0.7)
axes[0, 1].bar(x, neu_scores, width, label='Neutral', color='gray', alpha=0.7)
axes[0, 1].bar(x + width, neg_scores, width, label='Negative', color='red', alpha=0.7)
axes[0, 1].set_xlabel('Document Index')
axes[0, 1].set_ylabel('Score')
axes[0, 1].set_title('Sentiment Components')
axes[0, 1].legend()
axes[0, 1].set_xticks(x)

# Plot 3: Score distribution histogram
axes[1, 0].hist(compound_scores, bins=15, range=(-1, 1), color='steelblue',
                edgecolor='black', alpha=0.7)
axes[1, 0].axvline(x=np.mean(compound_scores), color='red', linestyle='--',
                   label=f'Mean: {np.mean(compound_scores):.2f}')
axes[1, 0].set_xlabel('Compound Score')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Score Distribution')
axes[1, 0].legend()

# Plot 4: Polarity scatter plot
axes[1, 1].scatter(pos_scores, neg_scores, c=compound_scores, cmap='RdYlGn',
                   s=100, alpha=0.7, edgecolors='black')
axes[1, 1].set_xlabel('Positive Score')
axes[1, 1].set_ylabel('Negative Score')
axes[1, 1].set_title('Polarity Distribution')
cbar = plt.colorbar(axes[1, 1].collections[0], ax=axes[1, 1])
cbar.set_label('Compound')

plt.tight_layout()
save_plot('sentiment_lexicon.pdf', 'Lexicon-based sentiment analysis results')
\end{pycode}

\section{Naive Bayes Classifier Implementation}

\begin{pycode}
class NaiveBayesSentiment:
    """Multinomial Naive Bayes text classifier with Laplace smoothing."""

    def __init__(self, alpha=1.0):
        """Initialize with Laplace smoothing parameter"""
        self.alpha = alpha
        self.class_priors = {}
        self.word_probs = {}
        self.vocab = set()
        # Populated by fit(); read by predict_proba() for unknown words
        self.class_word_totals = {}
        self.vocab_size = 0

    def tokenize(self, text):
        """Simple tokenization: lowercase, then split into runs of word characters"""
        return re.findall(r'\b\w+\b', text.lower())

    def fit(self, texts, labels):
        """Train the classifier on parallel sequences of texts and class labels.

        Any state from a previous call to fit() is discarded.
        """
        # Count documents per class
        class_counts = Counter(labels)
        total_docs = len(labels)

        # Calculate class priors
        self.class_priors = {c: n / total_docs for c, n in class_counts.items()}

        # Count words per class
        self.vocab = set()
        word_counts = defaultdict(lambda: defaultdict(int))
        class_word_totals = defaultdict(int)

        for text, label in zip(texts, labels):
            for word in self.tokenize(text):
                self.vocab.add(word)
                word_counts[label][word] += 1
                class_word_totals[label] += 1

        # Calculate word probabilities with Laplace smoothing
        vocab_size = len(self.vocab)
        self.word_probs = {}

        for c in class_counts:
            denominator = class_word_totals[c] + self.alpha * vocab_size
            self.word_probs[c] = {
                word: (word_counts[c][word] + self.alpha) / denominator
                for word in self.vocab
            }

        self.class_word_totals = class_word_totals
        self.vocab_size = vocab_size

    def predict_proba(self, text):
        """Return a dict mapping each class to its normalized posterior probability.

        Scores are accumulated as log-probabilities and converted back to
        probabilities that sum to 1 over the classes.
        """
        words = self.tokenize(text)
        log_probs = {}

        for c in self.class_priors:
            log_prob = np.log(self.class_priors[c])

            for word in words:
                if word in self.vocab:
                    log_prob += np.log(self.word_probs[c][word])
                else:
                    # Unknown word: smoothed probability of a zero-count word
                    log_prob += np.log(self.alpha /
                                       (self.class_word_totals[c] + self.alpha * self.vocab_size))

            log_probs[c] = log_prob

        # Convert to probabilities; subtracting the maximum keeps exp() from underflowing
        max_log = max(log_probs.values())
        probs = {c: np.exp(lp - max_log) for c, lp in log_probs.items()}
        total = sum(probs.values())
        probs = {c: p/total for c, p in probs.items()}

        return probs

    def predict(self, text):
        """Predict the class label with the highest posterior probability"""
        probs = self.predict_proba(text)
        return max(probs, key=probs.get)

# Training data
train_texts = [
    "I love this movie so much", "Great film excellent acting",
    "Best movie I have ever seen", "Wonderful story beautiful cinematography",
    "Amazing performance highly recommend", "Fantastic movie loved every minute",
    "Perfect film masterpiece", "Brilliant acting great direction",
    "Terrible movie waste of time", "Horrible film awful acting",
    "Worst movie ever made", "Boring story bad direction",
    "Disappointing film poor quality", "Dreadful movie avoid at all costs",
    "Bad acting terrible script", "Awful waste of money"
]

train_labels = ['positive'] * 8 + ['negative'] * 8

# Train classifier
nb_classifier = NaiveBayesSentiment(alpha=1.0)
nb_classifier.fit(train_texts, train_labels)

# Test predictions
test_texts = [
    "This movie was absolutely wonderful and amazing",
    "Terrible film with horrible acting",
    "Good movie but some boring parts",
    "I really loved the great performances",
    "Worst experience ever very disappointing"
]

predictions = []
probabilities = []
for text in test_texts:
    # Compute the posterior once and derive the label from it
    prob = nb_classifier.predict_proba(text)
    predictions.append(max(prob, key=prob.get))
    probabilities.append(prob)
\end{pycode}

\section{Word Cloud Visualization}

\begin{pycode}
# Create word frequency analysis for visualization
from collections import Counter

# Combine all positive and negative words
positive_texts = ' '.join([t for t, l in zip(train_texts, train_labels) if l == 'positive'])
negative_texts = ' '.join([t for t, l in zip(train_texts, train_labels) if l == 'negative'])

def get_word_freq(text):
    """Return a Counter of the words in text, excluding stopwords and words shorter than 3 characters"""
    words = re.findall(r'\b\w+\b', text.lower())
    # Remove stopwords
    stopwords = {'i', 'the', 'a', 'an', 'is', 'was', 'of', 'to', 'and', 'this', 'that', 'it', 'so'}
    words = [w for w in words if w not in stopwords and len(w) > 2]
    return Counter(words)

pos_freq = get_word_freq(positive_texts)
neg_freq = get_word_freq(negative_texts)

# Create word cloud-like visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Positive word frequencies
pos_words, pos_counts = zip(*pos_freq.most_common(10))
y_pos = np.arange(len(pos_words))
axes[0, 0].barh(y_pos, pos_counts, color='green', alpha=0.7)
axes[0, 0].set_yticks(y_pos)
axes[0, 0].set_yticklabels(pos_words)
axes[0, 0].set_xlabel('Frequency')
axes[0, 0].set_title('Top Positive Words')
axes[0, 0].invert_yaxis()

# Plot 2: Negative word frequencies
neg_words, neg_counts = zip(*neg_freq.most_common(10))
y_neg = np.arange(len(neg_words))
axes[0, 1].barh(y_neg, neg_counts, color='red', alpha=0.7)
axes[0, 1].set_yticks(y_neg)
axes[0, 1].set_yticklabels(neg_words)
axes[0, 1].set_xlabel('Frequency')
axes[0, 1].set_title('Top Negative Words')
axes[0, 1].invert_yaxis()

# Plot 3: Word importance (log probability ratios)
# Iterate over the sorted vocabulary so that ties in the ranking below are
# broken the same way on every run (set iteration order is not stable).
word_scores = []
for word in sorted(nb_classifier.vocab):
    log_ratio = np.log(nb_classifier.word_probs['positive'][word] /
                       nb_classifier.word_probs['negative'][word])
    word_scores.append((word, log_ratio))

if word_scores:
    # Sort by absolute value and get top words
    word_scores.sort(key=lambda x: abs(x[1]), reverse=True)
    top_words = word_scores[:15]

    words, scores = zip(*top_words)
    colors = ['green' if s > 0 else 'red' for s in scores]
    y_pos = np.arange(len(words))
    axes[1, 0].barh(y_pos, scores, color=colors, alpha=0.7)
    axes[1, 0].axvline(x=0, color='black', linestyle='-', linewidth=0.5)
    axes[1, 0].set_yticks(y_pos)
    axes[1, 0].set_yticklabels(words)
    axes[1, 0].set_xlabel('Log Probability Ratio (Pos/Neg)')
    axes[1, 0].set_title('Word Sentiment Scores')

# Plot 4: Classification probabilities for test samples
test_labels = [f'Test {i+1}' for i in range(len(test_texts))]
pos_probs = [p['positive'] for p in probabilities]
neg_probs = [p['negative'] for p in probabilities]

x = np.arange(len(test_texts))
width = 0.35
axes[1, 1].bar(x - width/2, pos_probs, width, label='Positive', color='green', alpha=0.7)
axes[1, 1].bar(x + width/2, neg_probs, width, label='Negative', color='red', alpha=0.7)
axes[1, 1].set_xlabel('Test Sample')
axes[1, 1].set_ylabel('Probability')
axes[1, 1].set_title('Classification Probabilities')
axes[1, 1].set_xticks(x)
axes[1, 1].set_xticklabels(test_labels, rotation=45)
axes[1, 1].legend()
axes[1, 1].axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)

plt.tight_layout()
save_plot('sentiment_wordcloud.pdf', 'Word frequency analysis and classification results')
\end{pycode}

\section{Comparative Analysis}

\begin{pycode}
# Compare lexicon-based and Naive Bayes results
comparison_texts = [
    "Absolutely fantastic movie with brilliant performances",
    "Terrible waste of time boring and dull",
    "Good film but nothing special",
    "Loved it wonderful experience highly recommend"
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Get both scores
lexicon_scores = [lexicon_sentiment(t)['compound'] for t in comparison_texts]
nb_probs = [nb_classifier.predict_proba(t)['positive'] * 2 - 1 for t in comparison_texts]  # Scale to [-1, 1]

# Scatter plot comparison
axes[0].scatter(lexicon_scores, nb_probs, s=100, alpha=0.7, edgecolors='black')
axes[0].plot([-1, 1], [-1, 1], 'r--', alpha=0.5, label='Perfect agreement')
axes[0].set_xlabel('Lexicon Score')
axes[0].set_ylabel('Naive Bayes Score (scaled)')
axes[0].set_title('Method Comparison')
axes[0].set_xlim(-1.1, 1.1)
axes[0].set_ylim(-1.1, 1.1)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Bar comparison
x = np.arange(len(comparison_texts))
width = 0.35
axes[1].bar(x - width/2, lexicon_scores, width, label='Lexicon', alpha=0.7)
axes[1].bar(x + width/2, nb_probs, width, label='Naive Bayes', alpha=0.7)
axes[1].set_xlabel('Text Index')
axes[1].set_ylabel('Sentiment Score')
axes[1].set_title('Side-by-Side Comparison')
axes[1].set_xticks(x)
axes[1].legend()
axes[1].axhline(y=0, color='black', linestyle='-', linewidth=0.5)

plt.tight_layout()
save_plot('sentiment_comparison.pdf', 'Comparison of lexicon-based and Naive Bayes approaches')

# Calculate correlation
correlation = np.corrcoef(lexicon_scores, nb_probs)[0, 1]
\end{pycode}

\section{Results Summary}

\subsection{Lexicon Analysis Results}
\begin{pycode}
# Generate results table
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Lexicon-based sentiment scores for sample texts}')
print(r'\begin{tabular}{ccccc}')
print(r'\toprule')
print(r'Doc & Compound & Positive & Neutral & Negative \\')
print(r'\midrule')

for i, r in enumerate(results):
    print(f"{i+1} & {r['compound']:.3f} & {r['pos']:.3f} & {r['neu']:.3f} & {r['neg']:.3f} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Naive Bayes Classification Results}
\begin{pycode}
print(r'\begin{table}[htbp]')
print(r'\centering')
print(r'\caption{Naive Bayes classification results}')
print(r'\begin{tabular}{clcc}')
print(r'\toprule')
print(r'Test & Prediction & P(Positive) & P(Negative) \\')
print(r'\midrule')

for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
    print(f"{i+1} & {pred} & {prob['positive']:.3f} & {prob['negative']:.3f} \\\\")

print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

\subsection{Statistical Summary}
\begin{itemize}
\item Mean compound score: \py{f"{np.mean(compound_scores):.3f}"}
\item Standard deviation: \py{f"{np.std(compound_scores):.3f}"}
\item Positive documents: \py{f"{sum(1 for s in compound_scores if s > 0.05)}"}
\item Negative documents: \py{f"{sum(1 for s in compound_scores if s < -0.05)}"}
\item Vocabulary size: \py{f"{len(nb_classifier.vocab)}"}
\item Method correlation: \py{f"{correlation:.3f}"}
\end{itemize}

\section{Conclusion}
This template demonstrates two fundamental approaches to sentiment analysis.
The lexicon-based method provides interpretable scores based on word valence, while Naive Bayes learns from labeled examples using probabilistic principles.
Their agreement on the comparison texts is summarized by the correlation coefficient between the two scores (\py{f"{correlation:.2f}"}); disagreements are expected mainly on neutral or mixed-sentiment text.

\end{document}