Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Ok-landscape
GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/nlp/sentiment.tex
51 views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{booktabs}
7
\usepackage{siunitx}
8
\usepackage[makestderr]{pythontex}
9
10
\title{Sentiment Analysis: Lexicon-Based and Machine Learning Approaches}
11
\author{Natural Language Processing Templates}
12
\date{\today}
13
14
\begin{document}
15
\maketitle
16
17
\section{Introduction}
18
Sentiment analysis determines the emotional tone of text, classifying it as positive, negative, or neutral. This template implements two complementary approaches: lexicon-based scoring (similar to VADER) and a Naive Bayes classifier.
19
20
\section{Mathematical Framework}
21
22
\subsection{Lexicon-Based Sentiment Scoring}
23
The compound sentiment score aggregates individual word scores:
24
\begin{equation}
25
S_{\text{compound}} = \frac{\sum_{i=1}^{n} v_i}{\sqrt{\left(\sum_{i=1}^{n} v_i\right)^2 + \alpha}}
26
\end{equation}
27
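where $v_i$ is the valence score for word $i$ and $\alpha$ is a normalization constant ($\alpha = 15$ by default in the implementation below). The score always lies strictly between $-1$ and $1$.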
28
29
\subsection{Naive Bayes Classification}
30
For text classification, we use Bayes' theorem:
31
\begin{equation}
32
P(c|d) = \frac{P(d|c)P(c)}{P(d)}
33
\end{equation}
34
Under the naive independence assumption:
35
\begin{equation}
36
P(d|c) = \prod_{i=1}^{n} P(w_i|c)
37
\end{equation}
38
39
Using log-probabilities to avoid underflow:
40
\begin{equation}
41
\log P(c|d) \propto \log P(c) + \sum_{i=1}^{n} \log P(w_i|c)
42
\end{equation}
43
44
\section{Environment Setup}
45
46
\begin{pycode}
47
import numpy as np
48
import matplotlib.pyplot as plt
49
from collections import Counter, defaultdict
50
import re
51
52
# Render figure text with LaTeX and use a serif font family for all plots.
for _rc_group, _rc_options in (('text', {'usetex': True}), ('font', {'family': 'serif'})):
    plt.rc(_rc_group, **_rc_options)

# Seed NumPy's global random generator with a fixed value.
np.random.seed(42)
55
56
def save_plot(filename, caption=""):
    """Save the current matplotlib figure and print a LaTeX figure environment for it.

    The figure is written to ``filename`` (tight bounding box, 150 dpi), the
    LaTeX code that includes it is printed to stdout, and the current figure
    is then closed.

    Args:
        filename: Output path; also used verbatim in ``\\includegraphics``.
        caption: Optional caption text. No ``\\caption`` line is printed when
            it is empty.
    """
    plt.savefig(filename, bbox_inches='tight', dpi=150)

    latex_lines = [
        r'\begin{figure}[htbp]',
        r'\centering',
        r'\includegraphics[width=0.9\textwidth]{' + filename + '}',
    ]
    if caption:
        latex_lines.append(r'\caption{' + caption + '}')
    latex_lines.append(r'\end{figure}')

    # One print call emits the same newline-terminated lines as separate prints.
    print('\n'.join(latex_lines))
    plt.close()
65
\end{pycode}
66
67
\section{Lexicon-Based Sentiment Analysis}
68
69
\begin{pycode}
70
# VADER-like sentiment lexicon (simplified): lowercase word -> valence score.
# NOTE(review): 'not'/'never' and the intensifiers below carry their own valence,
# so they are scored as sentiment words in addition to modifying the word that
# follows them -- confirm this double counting is intended.
sentiment_lexicon = {
    # Positive words
    'good': 1.9, 'great': 3.1, 'excellent': 3.4, 'amazing': 3.6,
    'wonderful': 3.2, 'fantastic': 3.4, 'love': 3.2, 'happy': 2.7,
    'best': 3.0, 'beautiful': 2.9, 'perfect': 3.4, 'awesome': 3.3,
    'nice': 1.8, 'pleasant': 2.1, 'helpful': 2.0, 'recommend': 2.4,
    # Negative words
    'bad': -2.5, 'terrible': -3.4, 'awful': -3.5, 'horrible': -3.6,
    'hate': -3.3, 'worst': -3.4, 'poor': -2.6, 'disappointing': -2.7,
    'broken': -2.3, 'waste': -2.8, 'useless': -2.9, 'avoid': -2.5,
    'problem': -1.8, 'fail': -2.8, 'wrong': -2.1, 'annoying': -2.3,
    # Intensifiers and negations
    'very': 0.3, 'really': 0.3, 'extremely': 0.4, 'not': -0.74,
    'never': -0.74, 'barely': -0.4, 'hardly': -0.4
}

# Booster words that modify intensity: the value is added to the magnitude of
# the sentiment word that immediately follows.
boosters = {'very': 0.293, 'really': 0.293, 'extremely': 0.326,
            'absolutely': 0.312, 'completely': 0.296, 'totally': 0.287}

# Standalone negation words; contractions ending in "n't" are handled separately.
_NEGATIONS = frozenset({'not', 'never'})

# A token is a run of word characters, optionally followed by an apostrophe
# suffix, so contractions such as "don't" stay in one piece.
_TOKEN_PATTERN = re.compile(r"\w+(?:'\w+)?")


def normalize_score(score, alpha=15):
    """Normalize score to the open interval (-1, 1) using VADER-like normalization.

    Computes ``score / sqrt(score**2 + alpha)``.

    Args:
        score: Raw summed valence.
        alpha: Normalization constant added under the square root.

    Returns:
        The normalized score.
    """
    return score / np.sqrt(score**2 + alpha)


def _is_negation(token):
    """Return True if ``token`` is a negation word or an "n't" contraction."""
    return token in _NEGATIONS or token.endswith("n't")


def lexicon_sentiment(text):
    """Calculate sentiment scores using a lexicon-based approach.

    Each token found in ``sentiment_lexicon`` contributes its valence. A
    booster directly before it increases the magnitude, and a negation
    directly before it multiplies the valence by -0.5.

    Args:
        text: Input string; it is lowercased before tokenization.

    Returns:
        dict with keys ``'compound'`` (normalized sum of valences, 0 when no
        sentiment word is present), and ``'pos'``, ``'neg'``, ``'neu'``
        (non-negative proportions that sum to 1).
    """
    # Map the typographic apostrophe to ASCII so contractions tokenize the same way.
    words = _TOKEN_PATTERN.findall(text.lower().replace('\u2019', "'"))

    pos_sum = 0.0
    neg_sum = 0.0
    neutral_count = 0
    scores = []

    for i, word in enumerate(words):
        if word not in sentiment_lexicon:
            neutral_count += 1
            continue

        score = sentiment_lexicon[word]
        previous = words[i - 1] if i > 0 else None

        # A preceding booster pushes the score further from zero.
        if previous in boosters:
            if score > 0:
                score += boosters[previous]
            else:
                score -= boosters[previous]

        # A preceding negation flips and dampens the score.
        if previous is not None and _is_negation(previous):
            score *= -0.5

        scores.append(score)

        # As in VADER, each sentiment word gets +/-1 on top of its valence so
        # that it is weighted on the same scale as a neutral word (counted as 1).
        if score > 0:
            pos_sum += score + 1
        elif score < 0:
            neg_sum += score - 1
        else:
            neutral_count += 1

    compound = normalize_score(sum(scores)) if scores else 0

    # Proportions of positive, negative and neutral mass; they sum to 1.
    total = pos_sum + abs(neg_sum) + neutral_count
    if total > 0:
        pos = pos_sum / total
        neg = abs(neg_sum) / total
        neu = neutral_count / total
    else:
        pos = neg = 0
        neu = 1.0

    return {'compound': compound, 'pos': pos, 'neg': neg, 'neu': neu}
137
\end{pycode}
138
139
\section{Sample Text Analysis}
140
141
\begin{pycode}
142
# Sample reviews for analysis
sample_texts = [
    "This product is absolutely amazing! Best purchase I've ever made. Highly recommend!",
    "Terrible quality, completely broken on arrival. Worst experience ever. Avoid!",
    "It's okay, nothing special. Does what it's supposed to do.",
    "Really love this! Great quality and very helpful customer service.",
    "Very disappointing. The product failed after one week. Poor design.",
    "Fantastic! Exceeded all my expectations. Wonderful purchase!",
    "Not bad, but not great either. Some minor problems.",
    "Absolutely horrible. Total waste of money. Never buying again.",
    "Good value for the price. Nice quality overall.",
    "Perfect! Beautiful design and excellent functionality. Amazing product!"
]

# Score every sample, then pull out each component as its own list.
results = [lexicon_sentiment(sample) for sample in sample_texts]
compound_scores = [entry['compound'] for entry in results]
pos_scores = [entry['pos'] for entry in results]
neg_scores = [entry['neg'] for entry in results]
neu_scores = [entry['neu'] for entry in results]


def _compound_color(score):
    """Colour a compound score: green above 0.05, red below -0.05, gray otherwise."""
    if score > 0.05:
        return 'green'
    if score < -0.05:
        return 'red'
    return 'gray'


# Create sentiment distribution plot (2x2 grid of panels)
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_compound, ax_parts), (ax_hist, ax_polarity) = axes

# Plot 1: Compound scores bar chart, with the +/-0.05 thresholds marked
colors = [_compound_color(s) for s in compound_scores]
ax_compound.barh(range(len(compound_scores)), compound_scores, color=colors, alpha=0.7)
ax_compound.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax_compound.axvline(x=0.05, color='green', linestyle='--', alpha=0.5)
ax_compound.axvline(x=-0.05, color='red', linestyle='--', alpha=0.5)
ax_compound.set_xlabel('Compound Score')
ax_compound.set_ylabel('Document Index')
ax_compound.set_title('Sentiment Compound Scores')
ax_compound.set_xlim(-1, 1)

# Plot 2: Sentiment component breakdown (grouped bars per document)
x = np.arange(len(results))
width = 0.25
ax_parts.bar(x - width, pos_scores, width, label='Positive', color='green', alpha=0.7)
ax_parts.bar(x, neu_scores, width, label='Neutral', color='gray', alpha=0.7)
ax_parts.bar(x + width, neg_scores, width, label='Negative', color='red', alpha=0.7)
ax_parts.set_xlabel('Document Index')
ax_parts.set_ylabel('Score')
ax_parts.set_title('Sentiment Components')
ax_parts.legend()
ax_parts.set_xticks(x)

# Plot 3: Score distribution histogram with the mean marked
mean_compound = np.mean(compound_scores)
ax_hist.hist(compound_scores, bins=15, range=(-1, 1), color='steelblue',
             edgecolor='black', alpha=0.7)
ax_hist.axvline(x=mean_compound, color='red', linestyle='--',
                label=f'Mean: {mean_compound:.2f}')
ax_hist.set_xlabel('Compound Score')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Score Distribution')
ax_hist.legend()

# Plot 4: Polarity scatter plot, coloured by compound score
polarity_points = ax_polarity.scatter(pos_scores, neg_scores, c=compound_scores,
                                      cmap='RdYlGn', s=100, alpha=0.7,
                                      edgecolors='black')
ax_polarity.set_xlabel('Positive Score')
ax_polarity.set_ylabel('Negative Score')
ax_polarity.set_title('Polarity Distribution')
cbar = plt.colorbar(polarity_points, ax=ax_polarity)
cbar.set_label('Compound')

plt.tight_layout()
save_plot('sentiment_lexicon.pdf', 'Lexicon-based sentiment analysis results')
216
\end{pycode}
217
218
\section{Naive Bayes Classifier Implementation}
219
220
\begin{pycode}
221
class NaiveBayesSentiment:
    """Multinomial Naive Bayes text classifier with additive (Laplace) smoothing.

    Attributes set by ``fit``:
        class_priors: dict mapping class label -> prior probability.
        word_probs: dict mapping class label -> {word: smoothed P(word | class)}.
        vocab: set of all training tokens.
        class_word_totals: mapping class label -> number of training tokens.
        vocab_size: ``len(vocab)``.
    """

    def __init__(self, alpha=1.0):
        """Initialize with Laplace smoothing parameter ``alpha``."""
        self.alpha = alpha
        self.class_priors = {}
        self.word_probs = {}
        self.vocab = set()
        # Defined up front so an unfitted instance has every attribute.
        self.class_word_totals = defaultdict(int)
        self.vocab_size = 0

    def tokenize(self, text):
        """Simple tokenization: lowercase, then split into runs of word characters."""
        return re.findall(r'\b\w+\b', text.lower())

    def fit(self, texts, labels):
        """Train the classifier, replacing any previously fitted state.

        Args:
            texts: Iterable of training strings.
            labels: Iterable of class labels, one per text.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            # zip() would silently drop the surplus while the priors still
            # counted every label, giving an inconsistent model.
            raise ValueError(
                f"texts and labels must have the same length "
                f"({len(texts)} != {len(labels)})"
            )

        # Count documents per class
        class_counts = Counter(labels)
        total_docs = len(labels)

        # Rebuild priors and vocabulary from scratch so a refit never keeps
        # classes or words from an earlier training set.
        self.class_priors = {c: n / total_docs for c, n in class_counts.items()}
        self.vocab = set()

        # Count words per class
        word_counts = defaultdict(lambda: defaultdict(int))
        class_word_totals = defaultdict(int)

        for text, label in zip(texts, labels):
            for word in self.tokenize(text):
                self.vocab.add(word)
                word_counts[label][word] += 1
                class_word_totals[label] += 1

        # Calculate word probabilities with Laplace smoothing
        vocab_size = len(self.vocab)
        self.word_probs = {}

        for c in class_counts:
            denominator = class_word_totals[c] + self.alpha * vocab_size
            self.word_probs[c] = {
                word: (word_counts[c][word] + self.alpha) / denominator
                for word in self.vocab
            }

        self.class_word_totals = class_word_totals
        self.vocab_size = vocab_size

    def predict_proba(self, text):
        """Return the posterior probability of each class for ``text``.

        Scores are accumulated in log space and then normalized, so the
        returned values are probabilities (not log-probabilities) summing to 1.

        Args:
            text: String to classify.

        Returns:
            dict mapping class label -> probability.

        Raises:
            ValueError: If the classifier has not been fitted.
        """
        if not self.class_priors:
            raise ValueError("classifier has not been fitted")

        words = self.tokenize(text)
        log_probs = {}

        for c in self.class_priors:
            log_prob = np.log(self.class_priors[c])

            for word in words:
                if word in self.vocab:
                    log_prob += np.log(self.word_probs[c][word])
                else:
                    # Handle unknown words: smoothed probability of a zero-count word
                    log_prob += np.log(
                        self.alpha /
                        (self.class_word_totals[c] + self.alpha * self.vocab_size))

            log_probs[c] = log_prob

        # Convert to probabilities; subtracting the max first keeps exp() from underflowing.
        max_log = max(log_probs.values())
        probs = {c: np.exp(lp - max_log) for c, lp in log_probs.items()}
        total = sum(probs.values())
        probs = {c: p / total for c, p in probs.items()}

        return probs

    def predict(self, text):
        """Predict the class label with the highest posterior probability."""
        probs = self.predict_proba(text)
        return max(probs, key=probs.get)
298
299
# Training data, grouped by class: eight positive reviews, then eight negative ones.
positive_reviews = [
    "I love this movie so much", "Great film excellent acting",
    "Best movie I have ever seen", "Wonderful story beautiful cinematography",
    "Amazing performance highly recommend", "Fantastic movie loved every minute",
    "Perfect film masterpiece", "Brilliant acting great direction",
]
negative_reviews = [
    "Terrible movie waste of time", "Horrible film awful acting",
    "Worst movie ever made", "Boring story bad direction",
    "Disappointing film poor quality", "Dreadful movie avoid at all costs",
    "Bad acting terrible script", "Awful waste of money",
]

train_texts = positive_reviews + negative_reviews
train_labels = ['positive'] * len(positive_reviews) + ['negative'] * len(negative_reviews)

# Train classifier
nb_classifier = NaiveBayesSentiment(alpha=1.0)
nb_classifier.fit(train_texts, train_labels)

# Test predictions
test_texts = [
    "This movie was absolutely wonderful and amazing",
    "Terrible film with horrible acting",
    "Good movie but some boring parts",
    "I really loved the great performances",
    "Worst experience ever very disappointing"
]

# Predicted label and class probabilities for each test text, in input order.
predictions = [nb_classifier.predict(sample) for sample in test_texts]
probabilities = [nb_classifier.predict_proba(sample) for sample in test_texts]
333
\end{pycode}
334
335
\section{Word Cloud Visualization}
336
337
\begin{pycode}
338
# Create word frequency analysis for visualization
339
from collections import Counter
340
341
# Group the training texts by label, then join each group into one string.
texts_by_label = defaultdict(list)
for _sample, _label in zip(train_texts, train_labels):
    texts_by_label[_label].append(_sample)

positive_texts = ' '.join(texts_by_label['positive'])
negative_texts = ' '.join(texts_by_label['negative'])
344
345
def get_word_freq(text):
    """Count word occurrences in ``text``, ignoring stopwords and short tokens.

    The text is lowercased and split into runs of word characters. Tokens of
    two characters or fewer, and tokens in the stopword set, are dropped.

    Args:
        text: Input string.

    Returns:
        collections.Counter mapping each remaining word to its count.
    """
    stopwords = {'i', 'the', 'a', 'an', 'is', 'was', 'of', 'to', 'and', 'this', 'that', 'it', 'so'}

    frequencies = Counter()
    for token in re.findall(r'\b\w+\b', text.lower()):
        if len(token) <= 2 or token in stopwords:
            continue
        frequencies[token] += 1
    return frequencies
351
352
pos_freq = get_word_freq(positive_texts)
neg_freq = get_word_freq(negative_texts)

# Create word cloud-like visualization (2x2 grid of panels)
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Positive word frequencies
pos_words, pos_counts = zip(*pos_freq.most_common(10))
y_pos = np.arange(len(pos_words))
axes[0, 0].barh(y_pos, pos_counts, color='green', alpha=0.7)
axes[0, 0].set_yticks(y_pos)
axes[0, 0].set_yticklabels(pos_words)
axes[0, 0].set_xlabel('Frequency')
axes[0, 0].set_title('Top Positive Words')
axes[0, 0].invert_yaxis()

# Plot 2: Negative word frequencies
neg_words, neg_counts = zip(*neg_freq.most_common(10))
y_neg = np.arange(len(neg_words))
axes[0, 1].barh(y_neg, neg_counts, color='red', alpha=0.7)
axes[0, 1].set_yticks(y_neg)
axes[0, 1].set_yticklabels(neg_words)
axes[0, 1].set_xlabel('Frequency')
axes[0, 1].set_title('Top Negative Words')
axes[0, 1].invert_yaxis()

# Plot 3: Word importance (log probability ratios)
# Log-odds ratio log(P(word | positive) / P(word | negative)) for every
# vocabulary word that has a probability under both classes.
word_scores = [
    (word, np.log(nb_classifier.word_probs['positive'][word] /
                  nb_classifier.word_probs['negative'][word]))
    for word in nb_classifier.vocab
    if word in nb_classifier.word_probs['positive']
    and word in nb_classifier.word_probs['negative']
]

# Sort by absolute value, breaking ties alphabetically. The vocabulary is a
# set whose iteration order can change between interpreter runs, and many
# words share the same ratio, so without the tie-break the selected top words
# (and hence the figure) would differ from build to build.
word_scores.sort(key=lambda item: (-abs(item[1]), item[0]))
top_words = word_scores[:15]

# Skip the panel when there is nothing to show; zip(*[]) cannot be unpacked.
if top_words:
    words, scores = zip(*top_words)
    colors = ['green' if s > 0 else 'red' for s in scores]
    y_pos = np.arange(len(words))
    axes[1, 0].barh(y_pos, scores, color=colors, alpha=0.7)
    axes[1, 0].axvline(x=0, color='black', linestyle='-', linewidth=0.5)
    axes[1, 0].set_yticks(y_pos)
    axes[1, 0].set_yticklabels(words)
    axes[1, 0].set_xlabel('Log Probability Ratio (Pos/Neg)')
    axes[1, 0].set_title('Word Sentiment Scores')

# Plot 4: Classification probabilities for test samples
test_labels = [f'Test {i+1}' for i in range(len(test_texts))]
pos_probs = [p['positive'] for p in probabilities]
neg_probs = [p['negative'] for p in probabilities]

x = np.arange(len(test_texts))
width = 0.35
axes[1, 1].bar(x - width/2, pos_probs, width, label='Positive', color='green', alpha=0.7)
axes[1, 1].bar(x + width/2, neg_probs, width, label='Negative', color='red', alpha=0.7)
axes[1, 1].set_xlabel('Test Sample')
axes[1, 1].set_ylabel('Probability')
axes[1, 1].set_title('Classification Probabilities')
axes[1, 1].set_xticks(x)
axes[1, 1].set_xticklabels(test_labels, rotation=45)
axes[1, 1].legend()
# Dashed line at 0.5 marks the decision boundary between the two classes.
axes[1, 1].axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)

plt.tight_layout()
save_plot('sentiment_wordcloud.pdf', 'Word frequency analysis and classification results')
424
\end{pycode}
425
426
\section{Comparative Analysis}
427
428
\begin{pycode}
429
# Compare lexicon-based and Naive Bayes results
comparison_texts = [
    "Absolutely fantastic movie with brilliant performances",
    "Terrible waste of time boring and dull",
    "Good film but nothing special",
    "Loved it wonderful experience highly recommend"
]

# Get both scores for every comparison text.
lexicon_scores = []
nb_probs = []
for sample in comparison_texts:
    lexicon_scores.append(lexicon_sentiment(sample)['compound'])
    # Scale to [-1, 1] so the probability is comparable with the compound score.
    nb_probs.append(nb_classifier.predict_proba(sample)['positive'] * 2 - 1)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_scatter, ax_bars = axes

# Scatter plot comparison; the dashed diagonal marks identical scores.
ax_scatter.scatter(lexicon_scores, nb_probs, s=100, alpha=0.7, edgecolors='black')
ax_scatter.plot([-1, 1], [-1, 1], 'r--', alpha=0.5, label='Perfect agreement')
ax_scatter.set_xlabel('Lexicon Score')
ax_scatter.set_ylabel('Naive Bayes Score (scaled)')
ax_scatter.set_title('Method Comparison')
ax_scatter.set_xlim(-1.1, 1.1)
ax_scatter.set_ylim(-1.1, 1.1)
ax_scatter.legend()
ax_scatter.grid(True, alpha=0.3)

# Bar comparison: the two methods side by side for each text.
x = np.arange(len(comparison_texts))
width = 0.35
ax_bars.bar(x - width/2, lexicon_scores, width, label='Lexicon', alpha=0.7)
ax_bars.bar(x + width/2, nb_probs, width, label='Naive Bayes', alpha=0.7)
ax_bars.set_xlabel('Text Index')
ax_bars.set_ylabel('Sentiment Score')
ax_bars.set_title('Side-by-Side Comparison')
ax_bars.set_xticks(x)
ax_bars.legend()
ax_bars.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

plt.tight_layout()
save_plot('sentiment_comparison.pdf', 'Comparison of lexicon-based and Naive Bayes approaches')

# Calculate correlation (Pearson coefficient between the two score lists)
correlation = np.corrcoef(lexicon_scores, nb_probs)[0, 1]
471
\end{pycode}
472
473
\section{Results Summary}
474
475
\subsection{Lexicon Analysis Results}
476
\begin{pycode}
477
# Generate results table: one row per analysed sample text.
for _line in (
    r'\begin{table}[htbp]',
    r'\centering',
    r'\caption{Lexicon-based sentiment scores for sample texts}',
    r'\begin{tabular}{ccccc}',
    r'\toprule',
    r'Doc & Compound & Positive & Neutral & Negative \\',
    r'\midrule',
):
    print(_line)

# Rows are numbered from 1; every score is printed with three decimals.
for doc_number, entry in enumerate(results, start=1):
    print(f"{doc_number} & {entry['compound']:.3f} & {entry['pos']:.3f} & {entry['neu']:.3f} & {entry['neg']:.3f} \\\\")

for _line in (r'\bottomrule', r'\end{tabular}', r'\end{table}'):
    print(_line)
492
\end{pycode}
493
494
\subsection{Naive Bayes Classification Results}
495
\begin{pycode}
496
# Print the Naive Bayes results table: one row per test text.
for _line in (
    r'\begin{table}[htbp]',
    r'\centering',
    r'\caption{Naive Bayes classification results}',
    r'\begin{tabular}{clcc}',
    r'\toprule',
    r'Test & Prediction & P(Positive) & P(Negative) \\',
    r'\midrule',
):
    print(_line)

# Rows are numbered from 1; probabilities are printed with three decimals.
for test_number, (label, class_probs) in enumerate(zip(predictions, probabilities), start=1):
    print(f"{test_number} & {label} & {class_probs['positive']:.3f} & {class_probs['negative']:.3f} \\\\")

for _line in (r'\bottomrule', r'\end{tabular}', r'\end{table}'):
    print(_line)
510
\end{pycode}
511
512
\subsection{Statistical Summary}
513
\begin{itemize}
514
\item Mean compound score: \py{f"{np.mean(compound_scores):.3f}"}
515
\item Standard deviation: \py{f"{np.std(compound_scores):.3f}"}
516
\item Positive documents: \py{f"{sum(1 for s in compound_scores if s > 0.05)}"}
517
\item Negative documents: \py{f"{sum(1 for s in compound_scores if s < -0.05)}"}
518
\item Vocabulary size: \py{f"{len(nb_classifier.vocab)}"}
519
\item Method correlation: \py{f"{correlation:.3f}"}
520
\end{itemize}
521
522
\section{Conclusion}
523
This template demonstrates two fundamental approaches to sentiment analysis. The lexicon-based method provides interpretable scores based on word valence, while Naive Bayes learns from labeled examples using probabilistic principles. On the comparison texts above, the correlation between the two methods' scores is \py{f"{correlation:.2f}"}; the methods tend to agree on texts with clear sentiment and differ mainly on neutral or mixed-sentiment text.
524
525
\end{document}
526
527