CoCalc -- sentiment.tex

GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/nlp/sentiment.tex
⁵¹ views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{booktabs}
7
\usepackage{siunitx}
8
\usepackage[makestderr]{pythontex}
9

10
\title{Sentiment Analysis: Lexicon-Based and Machine Learning Approaches}
11
\author{Natural Language Processing Templates}
12
\date{\today}
13

14
\begin{document}
15
\maketitle
16

17
\section{Introduction}
18
Sentiment analysis determines the emotional tone of text, classifying it as positive, negative, or neutral. This template implements two complementary approaches: lexicon-based scoring (similar to VADER) and a Naive Bayes classifier.
19

20
\section{Mathematical Framework}
21

22
\subsection{Lexicon-Based Sentiment Scoring}
23
The compound sentiment score aggregates individual word scores:
24
\begin{equation}
25
S_{\text{compound}} = \frac{\sum_{i=1}^{n} v_i}{\sqrt{\left(\sum_{i=1}^{n} v_i\right)^2 + \alpha}}
26
\end{equation}
27
where $v_i$ is the valence score for word $i$ and $\alpha$ is a normalization constant.
28

29
\subsection{Naive Bayes Classification}
30
For text classification, we use Bayes' theorem:
31
\begin{equation}
32
P(c|d) = \frac{P(d|c)P(c)}{P(d)}
33
\end{equation}
34
Under the naive independence assumption:
35
\begin{equation}
36
P(d|c) = \prod_{i=1}^{n} P(w_i|c)
37
\end{equation}
38

39
Using log-probabilities to avoid underflow:
40
\begin{equation}
41
\log P(c|d) = \log P(c) + \sum_{i=1}^{n} \log P(w_i|c) - \log P(d)
42
\end{equation}
43

44
\section{Environment Setup}
45

46
\begin{pycode}
47
import numpy as np
48
import matplotlib.pyplot as plt
49
from collections import Counter, defaultdict
50
import re
51

52
plt.rc('text', usetex=True)
53
plt.rc('font', family='serif')
54
np.random.seed(42)
55

56
def save_plot(filename, caption=""):
    """Write the current figure to disk and print a LaTeX figure float for it.

    Parameters
    ----------
    filename : str
        Output path handed to ``plt.savefig`` and to ``\\includegraphics``.
    caption : str, optional
        Caption text; the ``\\caption`` line is left out when this is empty.

    The printed text is inserted verbatim into the document, so ``caption``
    must already be valid LaTeX. The figure is closed afterwards.
    """
    plt.savefig(filename, dpi=150, bbox_inches='tight')

    latex_lines = [
        r'\begin{figure}[htbp]',
        r'\centering',
        r'\includegraphics[width=0.9\textwidth]{' + filename + '}',
    ]
    if caption:
        latex_lines.append(r'\caption{' + caption + '}')
    latex_lines.append(r'\end{figure}')

    for latex_line in latex_lines:
        print(latex_line)

    plt.close()
65
\end{pycode}
66

67
\section{Lexicon-Based Sentiment Analysis}
68

69
\begin{pycode}
70
# VADER-like sentiment lexicon (simplified)
# Maps a lowercase word to its valence: positive values for positive words,
# negative values for negative words.
sentiment_lexicon = {
    # Positive words
    'good': 1.9, 'great': 3.1, 'excellent': 3.4, 'amazing': 3.6,
    'wonderful': 3.2, 'fantastic': 3.4, 'love': 3.2, 'happy': 2.7,
    'best': 3.0, 'beautiful': 2.9, 'perfect': 3.4, 'awesome': 3.3,
    'nice': 1.8, 'pleasant': 2.1, 'helpful': 2.0, 'recommend': 2.4,
    # Negative words
    'bad': -2.5, 'terrible': -3.4, 'awful': -3.5, 'horrible': -3.6,
    'hate': -3.3, 'worst': -3.4, 'poor': -2.6, 'disappointing': -2.7,
    'broken': -2.3, 'waste': -2.8, 'useless': -2.9, 'avoid': -2.5,
    'problem': -1.8, 'fail': -2.8, 'wrong': -2.1, 'annoying': -2.3,
    # Intensifiers and negations
    # These carry a small valence of their own because they are ordinary
    # entries of this table. 'very', 'really' and 'extremely' are also listed
    # in `boosters` below.
    # NOTE(review): confirm that scoring them both ways is intended.
    'very': 0.3, 'really': 0.3, 'extremely': 0.4, 'not': -0.74,
    'never': -0.74, 'barely': -0.4, 'hardly': -0.4
}

# Booster words that modify intensity
# Maps a booster word to the amount it contributes to the valence of the
# word that follows it.
boosters = {'very': 0.293, 'really': 0.293, 'extremely': 0.326,
            'absolutely': 0.312, 'completely': 0.296, 'totally': 0.287}
90

91
def normalize_score(score, alpha=15):
    """Squash a raw valence sum into the open interval (-1, 1).

    Computes ``score / sqrt(score**2 + alpha)``. A larger ``alpha`` makes the
    result approach the limits -1 and 1 more slowly.
    """
    denominator = np.sqrt(alpha + score**2)
    return score / denominator
94

95
def lexicon_sentiment(text):
    """Score ``text`` against ``sentiment_lexicon`` and summarise the result.

    Every token found in ``sentiment_lexicon`` contributes its valence. If
    the token directly before it is in ``boosters``, the valence is pushed
    further from zero by the booster amount. If the token directly before it
    is a negation ("not", "never", or the "n't" of a contraction), the valence
    is multiplied by -0.5.

    Parameters
    ----------
    text : str
        Text to analyse; it is lower-cased before tokenisation.

    Returns
    -------
    dict
        ``'compound'``: ``normalize_score`` applied to the sum of adjusted
        valences, or 0 when no lexicon word occurs.
        ``'pos'``, ``'neg'``, ``'neu'``: non-negative shares that sum to 1.
        Positive and negative words weigh in with their absolute valence;
        every token without a lexicon entry weighs in with 1. For text
        without any token the result is ``pos = neg = 0`` and ``neu = 1.0``.
    """
    # Split contractions into stem + "n't" ("isn't" -> "is", "n't"). A plain
    # \w+ split yields "isn" and "t", so the "n't" negation below could never
    # fire. All other tokens are the same maximal \w+ runs as before.
    words = re.findall(r"\w+?(?=n't)|n't|\w+", text.lower())

    pos_sum = 0
    neg_sum = 0
    neutral_count = 0
    scores = []

    for i, word in enumerate(words):
        if word not in sentiment_lexicon:
            neutral_count += 1
            continue

        score = sentiment_lexicon[word]
        previous = words[i - 1] if i > 0 else None

        # A preceding booster increases the magnitude, keeping the sign
        if previous in boosters:
            if score > 0:
                score += boosters[previous]
            else:
                score -= boosters[previous]

        # A preceding negation flips the sign and halves the magnitude
        if previous in ('not', 'never', "n't"):
            score *= -0.5

        scores.append(score)
        if score > 0:
            pos_sum += score
        else:
            neg_sum += score

    compound = normalize_score(sum(scores)) if scores else 0

    # Share of positive, negative and neutral evidence. The neutral share is
    # measured from the tokens that carry no valence; deriving it as
    # 1 - (pos + neg) from a total that contains only pos and neg always
    # gives 0 (or a rounding artefact such as -0.000).
    total = pos_sum + abs(neg_sum) + neutral_count
    if total > 0:
        pos = pos_sum / total
        neg = abs(neg_sum) / total
        neu = neutral_count / total
    else:
        pos = neg = 0
        neu = 1.0

    return {'compound': compound, 'pos': pos, 'neg': neg, 'neu': neu}
137
\end{pycode}
138

139
\section{Sample Text Analysis}
140

141
\begin{pycode}
142
# Ten short product reviews used as input for the lexicon model
sample_texts = [
    "This product is absolutely amazing! Best purchase I've ever made. Highly recommend!",
    "Terrible quality, completely broken on arrival. Worst experience ever. Avoid!",
    "It's okay, nothing special. Does what it's supposed to do.",
    "Really love this! Great quality and very helpful customer service.",
    "Very disappointing. The product failed after one week. Poor design.",
    "Fantastic! Exceeded all my expectations. Wonderful purchase!",
    "Not bad, but not great either. Some minor problems.",
    "Absolutely horrible. Total waste of money. Never buying again.",
    "Good value for the price. Nice quality overall.",
    "Perfect! Beautiful design and excellent functionality. Amazing product!"
]

# Score every review, then pull each component out into its own list
results = [lexicon_sentiment(review) for review in sample_texts]
compound_scores = [entry['compound'] for entry in results]
pos_scores = [entry['pos'] for entry in results]
neg_scores = [entry['neg'] for entry in results]
neu_scores = [entry['neu'] for entry in results]

# 2x2 grid: compound bars, component bars, histogram, polarity scatter
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
ax_compound, ax_parts = axes[0]
ax_hist, ax_polarity = axes[1]

# Panel 1: one horizontal bar per document, coloured by the +/-0.05 cut-offs
colors = []
for value in compound_scores:
    if value > 0.05:
        colors.append('green')
    elif value < -0.05:
        colors.append('red')
    else:
        colors.append('gray')

ax_compound.barh(range(len(compound_scores)), compound_scores, color=colors, alpha=0.7)
ax_compound.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax_compound.axvline(x=0.05, color='green', linestyle='--', alpha=0.5)
ax_compound.axvline(x=-0.05, color='red', linestyle='--', alpha=0.5)
ax_compound.set_xlabel('Compound Score')
ax_compound.set_ylabel('Document Index')
ax_compound.set_title('Sentiment Compound Scores')
ax_compound.set_xlim(-1, 1)

# Panel 2: grouped bars with the positive / neutral / negative share per document
x = np.arange(len(results))
width = 0.25
ax_parts.bar(x - width, pos_scores, width, label='Positive', color='green', alpha=0.7)
ax_parts.bar(x, neu_scores, width, label='Neutral', color='gray', alpha=0.7)
ax_parts.bar(x + width, neg_scores, width, label='Negative', color='red', alpha=0.7)
ax_parts.set_xlabel('Document Index')
ax_parts.set_ylabel('Score')
ax_parts.set_title('Sentiment Components')
ax_parts.legend()
ax_parts.set_xticks(x)

# Panel 3: histogram of compound scores with the mean marked
mean_compound = np.mean(compound_scores)
ax_hist.hist(compound_scores, bins=15, range=(-1, 1), color='steelblue',
             edgecolor='black', alpha=0.7)
ax_hist.axvline(x=mean_compound, color='red', linestyle='--',
                label=f'Mean: {mean_compound:.2f}')
ax_hist.set_xlabel('Compound Score')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Score Distribution')
ax_hist.legend()

# Panel 4: positive share against negative share, coloured by compound score
polarity_points = ax_polarity.scatter(pos_scores, neg_scores, c=compound_scores,
                                      cmap='RdYlGn', s=100, alpha=0.7,
                                      edgecolors='black')
ax_polarity.set_xlabel('Positive Score')
ax_polarity.set_ylabel('Negative Score')
ax_polarity.set_title('Polarity Distribution')
cbar = plt.colorbar(polarity_points, ax=ax_polarity)
cbar.set_label('Compound')

plt.tight_layout()
save_plot('sentiment_lexicon.pdf', 'Lexicon-based sentiment analysis results')
216
\end{pycode}
217

218
\section{Naive Bayes Classifier Implementation}
219

220
\begin{pycode}
221
class NaiveBayesSentiment:
    """Bag-of-words Naive Bayes text classifier with additive smoothing.

    Attributes set by ``fit``:
        class_priors      -- {label: fraction of training documents}
        word_probs        -- {label: {word: smoothed P(word | label)}}
        vocab             -- set of all training tokens
        class_word_totals -- {label: number of training tokens}
        vocab_size        -- len(vocab)
    """

    def __init__(self, alpha=1.0):
        """Create an unfitted classifier.

        ``alpha`` is the additive (Laplace) smoothing constant added to every
        word count.
        """
        self.alpha = alpha
        self.class_priors = {}
        self.word_probs = {}
        self.vocab = set()
        self.class_word_totals = defaultdict(int)
        self.vocab_size = 0

    def tokenize(self, text):
        """Lower-case ``text`` and return its runs of word characters."""
        return re.findall(r'\b\w+\b', text.lower())

    def fit(self, texts, labels):
        """Estimate class priors and smoothed word probabilities.

        ``texts`` and ``labels`` are parallel sequences: one label per
        document. Any state from an earlier call is discarded, so the
        classifier only reflects the data passed to the latest call.
        """
        class_counts = Counter(labels)
        total_docs = len(labels)

        # Rebuild priors and vocabulary from scratch. Previously a second
        # fit() kept the old vocabulary and old classes around, which skewed
        # the smoothing denominator and the set of predicted labels.
        self.class_priors = {c: n / total_docs for c, n in class_counts.items()}
        self.vocab = set()

        # Per-class token counts and per-class token totals
        word_counts = defaultdict(Counter)
        class_word_totals = defaultdict(int)
        for text, label in zip(texts, labels):
            words = self.tokenize(text)
            word_counts[label].update(words)
            class_word_totals[label] += len(words)
            self.vocab.update(words)

        # P(word | class) = (count + alpha) / (class total + alpha * |vocab|)
        vocab_size = len(self.vocab)
        self.word_probs = {}
        for c in class_counts:
            counts = word_counts[c]
            denominator = class_word_totals[c] + self.alpha * vocab_size
            self.word_probs[c] = {
                word: (counts[word] + self.alpha) / denominator
                for word in self.vocab
            }

        self.class_word_totals = class_word_totals
        self.vocab_size = vocab_size

    def predict_proba(self, text):
        """Return ``{label: probability}`` for ``text``.

        Scores are accumulated as log-probabilities and then normalised so
        that the returned probabilities sum to 1.

        Raises
        ------
        ValueError
            If the classifier has not been fitted.
        """
        if not self.class_priors:
            raise ValueError("classifier has not been fitted; call fit() first")

        words = self.tokenize(text)
        log_probs = {}

        for c, prior in self.class_priors.items():
            log_prob = np.log(prior)

            for word in words:
                if word in self.vocab:
                    log_prob += np.log(self.word_probs[c][word])
                else:
                    # A word never seen in training gets the smoothed
                    # probability of a zero count.
                    log_prob += np.log(self.alpha /
                                       (self.class_word_totals[c] + self.alpha * self.vocab_size))

            log_probs[c] = log_prob

        # Subtract the largest log score before exponentiating so that the
        # largest term becomes exp(0) = 1 and the sum cannot underflow to 0.
        max_log = max(log_probs.values())
        probs = {c: np.exp(lp - max_log) for c, lp in log_probs.items()}
        total = sum(probs.values())
        return {c: p / total for c, p in probs.items()}

    def predict(self, text):
        """Return the label with the highest probability for ``text``."""
        probs = self.predict_proba(text)
        return max(probs, key=probs.get)
298

299
# Labelled training corpus: eight positive reviews followed by eight negative ones
positive_reviews = [
    "I love this movie so much", "Great film excellent acting",
    "Best movie I have ever seen", "Wonderful story beautiful cinematography",
    "Amazing performance highly recommend", "Fantastic movie loved every minute",
    "Perfect film masterpiece", "Brilliant acting great direction",
]
negative_reviews = [
    "Terrible movie waste of time", "Horrible film awful acting",
    "Worst movie ever made", "Boring story bad direction",
    "Disappointing film poor quality", "Dreadful movie avoid at all costs",
    "Bad acting terrible script", "Awful waste of money",
]

train_texts = positive_reviews + negative_reviews
train_labels = ['positive'] * len(positive_reviews) + ['negative'] * len(negative_reviews)

# Fit the classifier with add-one smoothing
nb_classifier = NaiveBayesSentiment(alpha=1.0)
nb_classifier.fit(train_texts, train_labels)

# Held-out sentences to classify
test_texts = [
    "This movie was absolutely wonderful and amazing",
    "Terrible film with horrible acting",
    "Good movie but some boring parts",
    "I really loved the great performances",
    "Worst experience ever very disappointing"
]

# Predicted label and class probabilities for each held-out sentence
predictions = [nb_classifier.predict(sentence) for sentence in test_texts]
probabilities = [nb_classifier.predict_proba(sentence) for sentence in test_texts]
333
\end{pycode}
334

335
\section{Word Cloud Visualization}
336

337
\begin{pycode}
338
# Create word frequency analysis for visualization
339
from collections import Counter
340

341
# Join the training reviews of each class into one space-separated string
texts_by_label = {'positive': [], 'negative': []}
for review, label in zip(train_texts, train_labels):
    if label in texts_by_label:
        texts_by_label[label].append(review)

positive_texts = ' '.join(texts_by_label['positive'])
negative_texts = ' '.join(texts_by_label['negative'])
344

345
def get_word_freq(text):
    """Count the content words of ``text``.

    The text is lower-cased and split into runs of word characters. Tokens
    of one or two characters and the stopwords listed below are skipped.
    Returns a ``Counter`` mapping each remaining token to its frequency.
    """
    # Remove stopwords
    stopwords = {'i', 'the', 'a', 'an', 'is', 'was', 'of', 'to', 'and', 'this', 'that', 'it', 'so'}

    freq = Counter()
    for token in re.findall(r'\b\w+\b', text.lower()):
        if len(token) > 2 and token not in stopwords:
            freq[token] += 1
    return freq
351

352
pos_freq = get_word_freq(positive_texts)
neg_freq = get_word_freq(negative_texts)

# Create word cloud-like visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Positive word frequencies (most frequent word at the top)
pos_words, pos_counts = zip(*pos_freq.most_common(10))
y_pos = np.arange(len(pos_words))
axes[0, 0].barh(y_pos, pos_counts, color='green', alpha=0.7)
axes[0, 0].set_yticks(y_pos)
axes[0, 0].set_yticklabels(pos_words)
axes[0, 0].set_xlabel('Frequency')
axes[0, 0].set_title('Top Positive Words')
axes[0, 0].invert_yaxis()

# Plot 2: Negative word frequencies (most frequent word at the top)
neg_words, neg_counts = zip(*neg_freq.most_common(10))
y_neg = np.arange(len(neg_words))
axes[0, 1].barh(y_neg, neg_counts, color='red', alpha=0.7)
axes[0, 1].set_yticks(y_neg)
axes[0, 1].set_yticklabels(neg_words)
axes[0, 1].set_xlabel('Frequency')
axes[0, 1].set_title('Top Negative Words')
axes[0, 1].invert_yaxis()

# Plot 3: Word importance (log probability ratios)
# log(P(word | positive) / P(word | negative)) for every vocabulary word.
word_scores = [
    (word, np.log(nb_classifier.word_probs['positive'][word] /
                  nb_classifier.word_probs['negative'][word]))
    for word in nb_classifier.vocab
]

# Strongest words first. The vocabulary is a set of strings, whose iteration
# order can change from one interpreter run to the next, and words with equal
# counts share the same ratio. Ties are therefore broken alphabetically so
# the same 15 words are picked on every build.
word_scores.sort(key=lambda item: (-abs(item[1]), item[0]))
top_words = word_scores[:15]

if top_words:
    words, scores = zip(*top_words)
    colors = ['green' if s > 0 else 'red' for s in scores]
    y_pos = np.arange(len(words))
    axes[1, 0].barh(y_pos, scores, color=colors, alpha=0.7)
    axes[1, 0].axvline(x=0, color='black', linestyle='-', linewidth=0.5)
    axes[1, 0].set_yticks(y_pos)
    axes[1, 0].set_yticklabels(words)
    axes[1, 0].set_xlabel('Log Probability Ratio (Pos/Neg)')
    axes[1, 0].set_title('Word Sentiment Scores')
    # Strongest word at the top, matching the two frequency panels
    axes[1, 0].invert_yaxis()

# Plot 4: Classification probabilities for test samples
test_labels = [f'Test {i+1}' for i in range(len(test_texts))]
pos_probs = [p['positive'] for p in probabilities]
neg_probs = [p['negative'] for p in probabilities]

x = np.arange(len(test_texts))
width = 0.35
axes[1, 1].bar(x - width/2, pos_probs, width, label='Positive', color='green', alpha=0.7)
axes[1, 1].bar(x + width/2, neg_probs, width, label='Negative', color='red', alpha=0.7)
axes[1, 1].set_xlabel('Test Sample')
axes[1, 1].set_ylabel('Probability')
axes[1, 1].set_title('Classification Probabilities')
axes[1, 1].set_xticks(x)
axes[1, 1].set_xticklabels(test_labels, rotation=45)
axes[1, 1].legend()
# Dashed line at 0.5: the decision boundary between the two classes
axes[1, 1].axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)

plt.tight_layout()
save_plot('sentiment_wordcloud.pdf', 'Word frequency analysis and classification results')
424
\end{pycode}
425

426
\section{Comparative Analysis}
427

428
\begin{pycode}
429
# Four short reviews that are scored by both the lexicon model and Naive Bayes
comparison_texts = [
    "Absolutely fantastic movie with brilliant performances",
    "Terrible waste of time boring and dull",
    "Good film but nothing special",
    "Loved it wonderful experience highly recommend"
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_scatter, ax_bars = axes

# One score per text and method. P(positive) lies in [0, 1] and is mapped
# linearly onto [-1, 1] so it shares a scale with the compound score.
lexicon_scores = []
nb_probs = []
for review in comparison_texts:
    lexicon_scores.append(lexicon_sentiment(review)['compound'])
    nb_probs.append(nb_classifier.predict_proba(review)['positive'] * 2 - 1)

# Left panel: one point per text; the dashed diagonal marks identical scores
ax_scatter.scatter(lexicon_scores, nb_probs, s=100, alpha=0.7, edgecolors='black')
ax_scatter.plot([-1, 1], [-1, 1], 'r--', alpha=0.5, label='Perfect agreement')
ax_scatter.set_xlabel('Lexicon Score')
ax_scatter.set_ylabel('Naive Bayes Score (scaled)')
ax_scatter.set_title('Method Comparison')
ax_scatter.set_xlim(-1.1, 1.1)
ax_scatter.set_ylim(-1.1, 1.1)
ax_scatter.legend()
ax_scatter.grid(True, alpha=0.3)

# Right panel: the two scores of each text as adjacent bars
x = np.arange(len(comparison_texts))
width = 0.35
half_width = width / 2
ax_bars.bar(x - half_width, lexicon_scores, width, label='Lexicon', alpha=0.7)
ax_bars.bar(x + half_width, nb_probs, width, label='Naive Bayes', alpha=0.7)
ax_bars.set_xlabel('Text Index')
ax_bars.set_ylabel('Sentiment Score')
ax_bars.set_title('Side-by-Side Comparison')
ax_bars.set_xticks(x)
ax_bars.legend()
ax_bars.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

plt.tight_layout()
save_plot('sentiment_comparison.pdf', 'Comparison of lexicon-based and Naive Bayes approaches')

# Pearson correlation between the two score lists (off-diagonal matrix entry)
correlation = np.corrcoef(lexicon_scores, nb_probs)[0, 1]
471
\end{pycode}
472

473
\section{Results Summary}
474

475
\subsection{Lexicon Analysis Results}
476
\begin{pycode}
477
# Print the lexicon scores of every sample text as a booktabs table
table_head = [
    r'\begin{table}[htbp]',
    r'\centering',
    r'\caption{Lexicon-based sentiment scores for sample texts}',
    r'\begin{tabular}{ccccc}',
    r'\toprule',
    r'Doc & Compound & Positive & Neutral & Negative \\',
    r'\midrule',
]
for head_line in table_head:
    print(head_line)

# One row per document, numbered from 1, every score with three decimals
for doc_number, entry in enumerate(results, start=1):
    cells = [str(doc_number)]
    cells += [f"{entry[key]:.3f}" for key in ('compound', 'pos', 'neu', 'neg')]
    print(' & '.join(cells) + r' \\')

for foot_line in (r'\bottomrule', r'\end{tabular}', r'\end{table}'):
    print(foot_line)
492
\end{pycode}
493

494
\subsection{Naive Bayes Classification Results}
495
\begin{pycode}
496
# Print the Naive Bayes prediction and class probabilities of every test text
table_head = [
    r'\begin{table}[htbp]',
    r'\centering',
    r'\caption{Naive Bayes classification results}',
    r'\begin{tabular}{clcc}',
    r'\toprule',
    r'Test & Prediction & P(Positive) & P(Negative) \\',
    r'\midrule',
]
for head_line in table_head:
    print(head_line)

# One row per test text, numbered from 1
for test_number, (pred, prob) in enumerate(zip(predictions, probabilities), start=1):
    cells = [
        str(test_number),
        f"{pred}",
        f"{prob['positive']:.3f}",
        f"{prob['negative']:.3f}",
    ]
    print(' & '.join(cells) + r' \\')

for foot_line in (r'\bottomrule', r'\end{tabular}', r'\end{table}'):
    print(foot_line)
510
\end{pycode}
511

512
\subsection{Statistical Summary}
513
\begin{itemize}
514
    \item Mean compound score: \py{f"{np.mean(compound_scores):.3f}"}
515
    \item Standard deviation: \py{f"{np.std(compound_scores):.3f}"}
516
    \item Positive documents: \py{f"{sum(1 for s in compound_scores if s > 0.05)}"}
517
    \item Negative documents: \py{f"{sum(1 for s in compound_scores if s < -0.05)}"}
518
    \item Vocabulary size: \py{f"{len(nb_classifier.vocab)}"}
519
    \item Method correlation: \py{f"{correlation:.3f}"}
520
\end{itemize}
521

522
\section{Conclusion}
523
This template demonstrates two fundamental approaches to sentiment analysis. The lexicon-based method provides interpretable scores based on word valence, while Naive Bayes learns from labeled examples using probabilistic principles. On the comparison texts, the correlation between the scores of the two methods is \py{f"{correlation:.2f}"}; the methods are expected to agree most on clear sentiment cases and to differ primarily on neutral or mixed-sentiment text.
524

525
\end{document}
526

527
Product

Resources

Company