CoCalc -- statistical

GitHub Repository: Ok-landscape/computational-pipeline
Path: blob/main/latex-templates/templates/data-science/statistical_analysis.tex
⁵¹ views
unlisted
1
\documentclass[a4paper, 11pt]{article}
2
\usepackage[utf8]{inputenc}
3
\usepackage[T1]{fontenc}
4
\usepackage{amsmath, amssymb}
5
\usepackage{graphicx}
6
\usepackage{siunitx}
7
\usepackage{booktabs}
8
\usepackage{float}
9
\usepackage[makestderr]{pythontex}
10

11
\title{Data Science: Comprehensive Statistical Analysis}
12
\author{Computational Science Templates}
13
\date{\today}
14

15
\begin{document}
16
\maketitle
17

18
\begin{abstract}
19
This document presents a comprehensive statistical analysis workflow including descriptive statistics, hypothesis testing, confidence intervals, ANOVA, correlation analysis, and regression diagnostics. We demonstrate parametric and non-parametric tests, effect size calculations, and multiple testing corrections using Python's scipy and statsmodels libraries.
20
\end{abstract}
21

22
\section{Introduction}
23
Statistical analysis forms the foundation of data-driven decision making. This analysis covers the complete workflow from exploratory data analysis through hypothesis testing to model diagnostics, providing a template for rigorous quantitative research.
24

25
\section{Mathematical Framework}
26

27
\subsection{Descriptive Statistics}
28
Sample mean and variance:
29
\begin{equation}
30
\bar{x} = \frac{1}{n}\sum_{i=1}^n x_i, \quad s^2 = \frac{1}{n-1}\sum_{i=1}^n (x_i - \bar{x})^2
31
\end{equation}
32

33
\subsection{Hypothesis Testing}
34
For a t-test comparing two means:
35
\begin{equation}
36
t = \frac{\bar{x}_1 - \bar{x}_2}{s_p\sqrt{\frac{1}{n_1} + \frac{1}{n_2}}}
37
\end{equation}
38

39
where $s_p$ is the pooled standard deviation.
40

41
\subsection{Confidence Intervals}
42
A $(1-\alpha)$ confidence interval for the mean:
43
\begin{equation}
44
\bar{x} \pm t_{\alpha/2, n-1} \frac{s}{\sqrt{n}}
45
\end{equation}
46

47
\subsection{ANOVA}
48
F-statistic for one-way ANOVA:
49
\begin{equation}
50
F = \frac{\text{MS}_{\text{between}}}{\text{MS}_{\text{within}}} = \frac{\sum_j n_j(\bar{x}_j - \bar{x})^2 / (k-1)}{\sum_j\sum_i (x_{ij} - \bar{x}_j)^2 / (N-k)}
51
\end{equation}
52

53
\section{Computational Analysis}
54

55
\begin{pycode}
56
import numpy as np
57
import matplotlib.pyplot as plt
58
from scipy import stats
59
from scipy.stats import (norm, t, f, chi2, pearsonr, spearmanr,
60
                         ttest_ind, ttest_rel, mannwhitneyu, wilcoxon,
61
                         f_oneway, kruskal, shapiro, levene)
62
plt.rc('text', usetex=True)
63
plt.rc('font', family='serif')
64

65
np.random.seed(42)
66

67
# Store results
68
results = {}
69

70
# Generate sample data
71
n1, n2, n3 = 50, 50, 50
72
group1 = np.random.normal(100, 15, n1)  # Control
73
group2 = np.random.normal(108, 15, n2)  # Treatment A
74
group3 = np.random.normal(105, 18, n3)  # Treatment B
75

76
# Combined for overall analysis
77
all_data = np.concatenate([group1, group2, group3])
78
\end{pycode}
79

80
\subsection{Descriptive Statistics}
81

82
\begin{pycode}
83
def descriptive_stats(data, name):
84
    """Calculate comprehensive descriptive statistics"""
85
    return {
86
        'name': name,
87
        'n': len(data),
88
        'mean': np.mean(data),
89
        'std': np.std(data, ddof=1),
90
        'sem': stats.sem(data),
91
        'median': np.median(data),
92
        'q1': np.percentile(data, 25),
93
        'q3': np.percentile(data, 75),
94
        'min': np.min(data),
95
        'max': np.max(data),
96
        'skew': stats.skew(data),
97
        'kurtosis': stats.kurtosis(data)
98
    }
99

100
stats_g1 = descriptive_stats(group1, 'Control')
101
stats_g2 = descriptive_stats(group2, 'Treatment A')
102
stats_g3 = descriptive_stats(group3, 'Treatment B')
103

104
results['g1_mean'] = stats_g1['mean']
105
results['g2_mean'] = stats_g2['mean']
106
results['g3_mean'] = stats_g3['mean']
107
results['g1_std'] = stats_g1['std']
108
results['g2_std'] = stats_g2['std']
109
results['g3_std'] = stats_g3['std']
110

111
# Visualization
112
fig, axes = plt.subplots(2, 3, figsize=(14, 9))
113

114
# Histograms
115
ax1 = axes[0, 0]
116
ax1.hist(group1, bins=15, alpha=0.7, label='Control', color='blue', edgecolor='black')
117
ax1.hist(group2, bins=15, alpha=0.7, label='Treatment A', color='green', edgecolor='black')
118
ax1.hist(group3, bins=15, alpha=0.7, label='Treatment B', color='red', edgecolor='black')
119
ax1.set_xlabel('Value')
120
ax1.set_ylabel('Frequency')
121
ax1.set_title('Distribution Comparison')
122
ax1.legend()
123
ax1.grid(True, alpha=0.3)
124

125
# Box plots
126
ax2 = axes[0, 1]
127
bp = ax2.boxplot([group1, group2, group3], labels=['Control', 'Treat A', 'Treat B'],
128
                  patch_artist=True)
129
colors = ['lightblue', 'lightgreen', 'lightcoral']
130
for patch, color in zip(bp['boxes'], colors):
131
    patch.set_facecolor(color)
132
ax2.set_ylabel('Value')
133
ax2.set_title('Box Plot Comparison')
134
ax2.grid(True, alpha=0.3)
135

136
# Q-Q plot for normality
137
ax3 = axes[0, 2]
138
stats.probplot(group1, dist="norm", plot=ax3)
139
ax3.set_title('Q-Q Plot (Control Group)')
140

141
# Violin plots
142
ax4 = axes[1, 0]
143
parts = ax4.violinplot([group1, group2, group3], positions=[1, 2, 3], showmeans=True)
144
ax4.set_xticks([1, 2, 3])
145
ax4.set_xticklabels(['Control', 'Treat A', 'Treat B'])
146
ax4.set_ylabel('Value')
147
ax4.set_title('Violin Plot')
148
ax4.grid(True, alpha=0.3)
149

150
# Mean with confidence intervals
151
ax5 = axes[1, 1]
152
means = [stats_g1['mean'], stats_g2['mean'], stats_g3['mean']]
153
sems = [stats_g1['sem'], stats_g2['sem'], stats_g3['sem']]
154
ci95 = [1.96 * s for s in sems]
155
x_pos = [1, 2, 3]
156
ax5.bar(x_pos, means, yerr=ci95, capsize=5, color=colors, edgecolor='black', alpha=0.7)
157
ax5.set_xticks(x_pos)
158
ax5.set_xticklabels(['Control', 'Treat A', 'Treat B'])
159
ax5.set_ylabel('Mean (95\\% CI)')
160
ax5.set_title('Group Means with Confidence Intervals')
161
ax5.grid(True, alpha=0.3)
162

163
# Cumulative distribution
164
ax6 = axes[1, 2]
165
for data, label, color in [(group1, 'Control', 'blue'),
166
                            (group2, 'Treat A', 'green'),
167
                            (group3, 'Treat B', 'red')]:
168
    sorted_data = np.sort(data)
169
    cumulative = np.arange(1, len(data) + 1) / len(data)
170
    ax6.plot(sorted_data, cumulative, label=label, color=color, linewidth=2)
171
ax6.set_xlabel('Value')
172
ax6.set_ylabel('Cumulative Probability')
173
ax6.set_title('Empirical CDF')
174
ax6.legend()
175
ax6.grid(True, alpha=0.3)
176

177
plt.tight_layout()
178
plt.savefig('statistical_descriptive.pdf', bbox_inches='tight', dpi=150)
179
print(r'\begin{figure}[H]')
180
print(r'\centering')
181
print(r'\includegraphics[width=\textwidth]{statistical_descriptive.pdf}')
182
print(r'\caption{Descriptive statistics visualization: histograms, box plots, Q-Q plot, violin plots, means with CI, and empirical CDFs.}')
183
print(r'\label{fig:descriptive}')
184
print(r'\end{figure}')
185
plt.close()
186
\end{pycode}
187

188
\subsection{Normality and Homogeneity Tests}
189

190
\begin{pycode}
191
# Normality tests
192
shapiro_g1 = shapiro(group1)
193
shapiro_g2 = shapiro(group2)
194
shapiro_g3 = shapiro(group3)
195

196
# Homogeneity of variance
197
levene_stat, levene_p = levene(group1, group2, group3)
198

199
results['shapiro_g1_p'] = shapiro_g1.pvalue
200
results['shapiro_g2_p'] = shapiro_g2.pvalue
201
results['shapiro_g3_p'] = shapiro_g3.pvalue
202
results['levene_p'] = levene_p
203
\end{pycode}
204

205
\subsection{Hypothesis Testing}
206

207
\begin{pycode}
208
# Two-sample t-tests
209
t_stat_12, p_value_12 = ttest_ind(group1, group2)
210
t_stat_13, p_value_13 = ttest_ind(group1, group3)
211
t_stat_23, p_value_23 = ttest_ind(group2, group3)
212

213
# Mann-Whitney U test (non-parametric)
214
u_stat_12, u_p_12 = mannwhitneyu(group1, group2, alternative='two-sided')
215

216
# Effect size (Cohen's d)
217
def cohens_d(g1, g2):
218
    n1, n2 = len(g1), len(g2)
219
    var1, var2 = np.var(g1, ddof=1), np.var(g2, ddof=1)
220
    pooled_std = np.sqrt(((n1-1)*var1 + (n2-1)*var2) / (n1+n2-2))
221
    return (np.mean(g1) - np.mean(g2)) / pooled_std
222

223
d_12 = cohens_d(group1, group2)
224
d_13 = cohens_d(group1, group3)
225
d_23 = cohens_d(group2, group3)
226

227
results['t_stat_12'] = t_stat_12
228
results['p_value_12'] = p_value_12
229
results['cohens_d_12'] = d_12
230

231
# ANOVA
232
f_stat, anova_p = f_oneway(group1, group2, group3)
233
results['f_stat'] = f_stat
234
results['anova_p'] = anova_p
235

236
# Kruskal-Wallis (non-parametric ANOVA)
237
h_stat, kw_p = kruskal(group1, group2, group3)
238
results['kw_p'] = kw_p
239
\end{pycode}
240

241
\subsection{Correlation Analysis}
242

243
\begin{pycode}
244
# Generate correlated data for correlation analysis
245
n_corr = 100
246
x_data = np.random.normal(50, 10, n_corr)
247
y_data = 2 * x_data + np.random.normal(0, 15, n_corr)
248
z_data = np.random.normal(50, 10, n_corr)  # Uncorrelated
249

250
# Pearson correlation
251
r_xy, p_xy = pearsonr(x_data, y_data)
252
r_xz, p_xz = pearsonr(x_data, z_data)
253

254
# Spearman correlation
255
rho_xy, sp_p_xy = spearmanr(x_data, y_data)
256

257
results['pearson_r'] = r_xy
258
results['pearson_p'] = p_xy
259
results['spearman_rho'] = rho_xy
260

261
fig, axes = plt.subplots(2, 3, figsize=(14, 9))
262

263
# Scatter with regression line
264
ax1 = axes[0, 0]
265
ax1.scatter(x_data, y_data, alpha=0.6, s=30)
266
z_fit = np.polyfit(x_data, y_data, 1)
267
p_fit = np.poly1d(z_fit)
268
x_line = np.linspace(min(x_data), max(x_data), 100)
269
ax1.plot(x_line, p_fit(x_line), 'r-', linewidth=2, label=f'r = {r_xy:.3f}')
270
ax1.set_xlabel('X')
271
ax1.set_ylabel('Y')
272
ax1.set_title('Strong Correlation')
273
ax1.legend()
274
ax1.grid(True, alpha=0.3)
275

276
# No correlation
277
ax2 = axes[0, 1]
278
ax2.scatter(x_data, z_data, alpha=0.6, s=30)
279
ax2.set_xlabel('X')
280
ax2.set_ylabel('Z')
281
ax2.set_title(f'No Correlation (r = {r_xz:.3f})')
282
ax2.grid(True, alpha=0.3)
283

284
# Correlation matrix heatmap
285
ax3 = axes[0, 2]
286
data_matrix = np.column_stack([x_data, y_data, z_data])
287
corr_matrix = np.corrcoef(data_matrix.T)
288
im = ax3.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
289
plt.colorbar(im, ax=ax3)
290
ax3.set_xticks([0, 1, 2])
291
ax3.set_yticks([0, 1, 2])
292
ax3.set_xticklabels(['X', 'Y', 'Z'])
293
ax3.set_yticklabels(['X', 'Y', 'Z'])
294
ax3.set_title('Correlation Matrix')
295
for i in range(3):
296
    for j in range(3):
297
        ax3.text(j, i, f'{corr_matrix[i,j]:.2f}', ha='center', va='center')
298

299
# Residual plot
300
ax4 = axes[1, 0]
301
residuals = y_data - p_fit(x_data)
302
ax4.scatter(p_fit(x_data), residuals, alpha=0.6, s=30)
303
ax4.axhline(y=0, color='r', linestyle='--')
304
ax4.set_xlabel('Fitted Values')
305
ax4.set_ylabel('Residuals')
306
ax4.set_title('Residual Plot')
307
ax4.grid(True, alpha=0.3)
308

309
# P-value visualization
310
ax5 = axes[1, 1]
311
tests = ['Control vs\nTreat A', 'Control vs\nTreat B', 'Treat A vs\nTreat B']
312
p_values = [p_value_12, p_value_13, p_value_23]
313
colors_pval = ['green' if p < 0.05 else 'red' for p in p_values]
314
bars = ax5.bar(tests, [-np.log10(p) for p in p_values], color=colors_pval, alpha=0.7, edgecolor='black')
315
ax5.axhline(y=-np.log10(0.05), color='k', linestyle='--', label='p = 0.05')
316
ax5.set_ylabel('$-\\log_{10}(p)$')
317
ax5.set_title('Hypothesis Test Results')
318
ax5.legend()
319
ax5.grid(True, alpha=0.3)
320

321
# Effect sizes
322
ax6 = axes[1, 2]
323
effect_sizes = [abs(d_12), abs(d_13), abs(d_23)]
324
ax6.bar(tests, effect_sizes, color='steelblue', alpha=0.7, edgecolor='black')
325
ax6.axhline(y=0.2, color='g', linestyle=':', label='Small (0.2)')
326
ax6.axhline(y=0.5, color='orange', linestyle=':', label='Medium (0.5)')
327
ax6.axhline(y=0.8, color='r', linestyle=':', label='Large (0.8)')
328
ax6.set_ylabel("Cohen's d")
329
ax6.set_title('Effect Sizes')
330
ax6.legend(loc='upper right', fontsize=8)
331
ax6.grid(True, alpha=0.3)
332

333
plt.tight_layout()
334
plt.savefig('statistical_inference.pdf', bbox_inches='tight', dpi=150)
335
print(r'\begin{figure}[H]')
336
print(r'\centering')
337
print(r'\includegraphics[width=\textwidth]{statistical_inference.pdf}')
338
print(r'\caption{Statistical inference: correlation analysis, residual diagnostics, hypothesis test results, and effect sizes.}')
339
print(r'\label{fig:inference}')
340
print(r'\end{figure}')
341
plt.close()
342
\end{pycode}
343

344
\subsection{Multiple Testing Correction}
345

346
\begin{pycode}
347
# Bonferroni correction
348
alpha = 0.05
349
n_tests = 3
350
bonferroni_alpha = alpha / n_tests
351

352
# Benjamini-Hochberg FDR correction
353
p_values_all = sorted([p_value_12, p_value_13, p_value_23])
354
bh_critical = [(i+1) / n_tests * alpha for i in range(n_tests)]
355

356
results['bonferroni_alpha'] = bonferroni_alpha
357
results['n_significant_bonf'] = sum(p < bonferroni_alpha for p in [p_value_12, p_value_13, p_value_23])
358

359
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
360

361
# Bonferroni
362
ax1 = axes[0]
363
ax1.bar(tests, p_values, color='steelblue', alpha=0.7, edgecolor='black')
364
ax1.axhline(y=alpha, color='r', linestyle='--', label=f'$\\alpha$ = {alpha}')
365
ax1.axhline(y=bonferroni_alpha, color='g', linestyle='--',
366
            label=f'Bonferroni = {bonferroni_alpha:.4f}')
367
ax1.set_ylabel('p-value')
368
ax1.set_title('Multiple Testing Correction')
369
ax1.legend()
370
ax1.grid(True, alpha=0.3)
371

372
# BH procedure
373
ax2 = axes[1]
374
ax2.plot(range(1, n_tests+1), p_values_all, 'bo-', markersize=10, label='Sorted p-values')
375
ax2.plot(range(1, n_tests+1), bh_critical, 'r--', linewidth=2, label='BH critical')
376
ax2.set_xlabel('Rank')
377
ax2.set_ylabel('p-value')
378
ax2.set_title('Benjamini-Hochberg Procedure')
379
ax2.legend()
380
ax2.grid(True, alpha=0.3)
381

382
plt.tight_layout()
383
plt.savefig('statistical_multiple.pdf', bbox_inches='tight', dpi=150)
384
print(r'\begin{figure}[H]')
385
print(r'\centering')
386
print(r'\includegraphics[width=\textwidth]{statistical_multiple.pdf}')
387
print(r'\caption{Multiple testing correction: Bonferroni and Benjamini-Hochberg procedures.}')
388
print(r'\label{fig:multiple}')
389
print(r'\end{figure}')
390
plt.close()
391
\end{pycode}
392

393
\section{Results and Discussion}
394

395
\subsection{Descriptive Statistics}
396

397
\begin{table}[H]
398
\centering
399
\caption{Group Descriptive Statistics}
400
\label{tab:descriptive}
401
\begin{tabular}{lccc}
402
\toprule
403
\textbf{Statistic} & \textbf{Control} & \textbf{Treatment A} & \textbf{Treatment B} \\
404
\midrule
405
Mean & \py{f"{results['g1_mean']:.2f}"} & \py{f"{results['g2_mean']:.2f}"} & \py{f"{results['g3_mean']:.2f}"} \\
406
SD & \py{f"{results['g1_std']:.2f}"} & \py{f"{results['g2_std']:.2f}"} & \py{f"{results['g3_std']:.2f}"} \\
407
N & \py{n1} & \py{n2} & \py{n3} \\
408
\bottomrule
409
\end{tabular}
410
\end{table}
411

412
\subsection{Assumption Tests}
413

414
Normality (Shapiro-Wilk test):
415
\begin{itemize}
416
    \item Control: p = \py{f"{results['shapiro_g1_p']:.4f}"} (Normal)
417
    \item Treatment A: p = \py{f"{results['shapiro_g2_p']:.4f}"} (Normal)
418
    \item Treatment B: p = \py{f"{results['shapiro_g3_p']:.4f}"} (Normal)
419
\end{itemize}
420

421
Homogeneity of variance (Levene's test): p = \py{f"{results['levene_p']:.4f}"}
422

423
\subsection{Hypothesis Tests}
424

425
\begin{table}[H]
426
\centering
427
\caption{Pairwise Comparisons}
428
\label{tab:tests}
429
\begin{tabular}{lccc}
430
\toprule
431
\textbf{Comparison} & \textbf{t-statistic} & \textbf{p-value} & \textbf{Cohen's d} \\
432
\midrule
433
Control vs Treatment A & \py{f"{results['t_stat_12']:.3f}"} & \py{f"{results['p_value_12']:.4f}"} & \py{f"{results['cohens_d_12']:.3f}"} \\
434
\bottomrule
435
\end{tabular}
436
\end{table}
437

438
ANOVA: F = \py{f"{results['f_stat']:.3f}"}, p = \py{f"{results['anova_p']:.4f}"}
439

440
\subsection{Correlation Analysis}
441

442
\begin{itemize}
443
    \item Pearson r = \py{f"{results['pearson_r']:.3f}"}, p = \py{f"{results['pearson_p']:.4f}"}
444
    \item Spearman $\rho$ = \py{f"{results['spearman_rho']:.3f}"}
445
\end{itemize}
446

447
\section{Conclusion}
448
This analysis demonstrated a comprehensive statistical workflow including descriptive statistics, assumption testing, parametric and non-parametric hypothesis tests, effect size calculation, and multiple testing correction. Key findings show significant differences between treatment groups with appropriate corrections for multiple comparisons.
449

450
\end{document}
451

452
Product

Resources

Company