Path: blob/main/latex-templates/templates/data-science/statistical_analysis.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{siunitx}
\usepackage{booktabs}
\usepackage{float}
\usepackage[makestderr]{pythontex}

\title{Data Science: Comprehensive Statistical Analysis}
\author{Computational Science Templates}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents a comprehensive statistical analysis workflow including descriptive statistics, hypothesis testing, confidence intervals, ANOVA, correlation analysis, and regression diagnostics. We demonstrate parametric and non-parametric tests, effect size calculations, and multiple testing corrections using Python's scipy library via PythonTeX.
\end{abstract}

\section{Introduction}
Statistical analysis forms the foundation of data-driven decision making. This template covers the complete workflow from exploratory data analysis through hypothesis testing to model diagnostics, providing a starting point for rigorous quantitative research.

\section{Mathematical Framework}

\subsection{Descriptive Statistics}
Sample mean and variance:
\begin{equation}
\bar{x} = \frac{1}{n}\sum_{i=1}^n x_i, \quad s^2 = \frac{1}{n-1}\sum_{i=1}^n (x_i - \bar{x})^2
\end{equation}

\subsection{Hypothesis Testing}
For a t-test comparing two means:
\begin{equation}
t = \frac{\bar{x}_1 - \bar{x}_2}{s_p\sqrt{\frac{1}{n_1} + \frac{1}{n_2}}}
\end{equation}

where $s_p$ is the pooled standard deviation.
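The formula is easy to verify numerically. The following sketch computes $s_p$ and $t$ by hand on two small simulated samples and checks the result against \texttt{scipy.stats.ttest\_ind}, whose default is exactly this equal-variance test; the samples here are illustrative and independent of the analysis session below.

\begin{pycode}
import numpy as np
from scipy.stats import ttest_ind

rng = np.random.default_rng(0)
a = rng.normal(100, 15, 30)  # illustrative sample 1
b = rng.normal(108, 15, 30)  # illustrative sample 2

# Pooled standard deviation s_p and t-statistic, as in the equation above
na, nb = len(a), len(b)
sp = np.sqrt(((na - 1) * a.var(ddof=1) + (nb - 1) * b.var(ddof=1)) / (na + nb - 2))
t_manual = (a.mean() - b.mean()) / (sp * np.sqrt(1 / na + 1 / nb))

# scipy's ttest_ind defaults to the same pooled-variance test
t_scipy, _ = ttest_ind(a, b)
assert np.isclose(t_manual, t_scipy)
\end{pycode}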
\subsection{Confidence Intervals}
A $(1-\alpha)$ confidence interval for the mean:
\begin{equation}
\bar{x} \pm t_{\alpha/2, n-1} \frac{s}{\sqrt{n}}
\end{equation}

\subsection{ANOVA}
F-statistic for one-way ANOVA:
\begin{equation}
F = \frac{\text{MS}_{\text{between}}}{\text{MS}_{\text{within}}} = \frac{\sum_j n_j(\bar{x}_j - \bar{x})^2 / (k-1)}{\sum_j\sum_i (x_{ij} - \bar{x}_j)^2 / (N-k)}
\end{equation}

where $k$ is the number of groups, $N$ is the total sample size, and $\bar{x}_j$ is the mean of group $j$.

\section{Computational Analysis}

\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import (norm, t, f, chi2, pearsonr, spearmanr,
                         ttest_ind, ttest_rel, mannwhitneyu, wilcoxon,
                         f_oneway, kruskal, shapiro, levene)
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

np.random.seed(42)

# Store results for inline reporting with \py
results = {}

# Generate sample data
n1, n2, n3 = 50, 50, 50
group1 = np.random.normal(100, 15, n1)  # Control
group2 = np.random.normal(108, 15, n2)  # Treatment A
group3 = np.random.normal(105, 18, n3)  # Treatment B

# Combined for overall analysis
all_data = np.concatenate([group1, group2, group3])
\end{pycode}

\subsection{Descriptive Statistics}

\begin{pycode}
def descriptive_stats(data, name):
    """Calculate comprehensive descriptive statistics."""
    return {
        'name': name,
        'n': len(data),
        'mean': np.mean(data),
        'std': np.std(data, ddof=1),
        'sem': stats.sem(data),
        'median': np.median(data),
        'q1': np.percentile(data, 25),
        'q3': np.percentile(data, 75),
        'min': np.min(data),
        'max': np.max(data),
        'skew': stats.skew(data),
        'kurtosis': stats.kurtosis(data)
    }

stats_g1 = descriptive_stats(group1, 'Control')
stats_g2 = descriptive_stats(group2, 'Treatment A')
stats_g3 = descriptive_stats(group3, 'Treatment B')

results['g1_mean'] = stats_g1['mean']
results['g2_mean'] = stats_g2['mean']
results['g3_mean'] = stats_g3['mean']
results['g1_std'] = stats_g1['std']
results['g2_std'] = stats_g2['std']
results['g3_std'] = stats_g3['std']

# Visualization
fig, axes = plt.subplots(2, 3, figsize=(14, 9))

# Histograms
ax1 = axes[0, 0]
ax1.hist(group1, bins=15, alpha=0.7, label='Control', color='blue', edgecolor='black')
ax1.hist(group2, bins=15, alpha=0.7, label='Treatment A', color='green', edgecolor='black')
ax1.hist(group3, bins=15, alpha=0.7, label='Treatment B', color='red', edgecolor='black')
ax1.set_xlabel('Value')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution Comparison')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Box plots
ax2 = axes[0, 1]
bp = ax2.boxplot([group1, group2, group3], labels=['Control', 'Treat A', 'Treat B'],
                 patch_artist=True)
colors = ['lightblue', 'lightgreen', 'lightcoral']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
ax2.set_ylabel('Value')
ax2.set_title('Box Plot Comparison')
ax2.grid(True, alpha=0.3)

# Q-Q plot for normality
ax3 = axes[0, 2]
stats.probplot(group1, dist="norm", plot=ax3)
ax3.set_title('Q-Q Plot (Control Group)')

# Violin plots
ax4 = axes[1, 0]
parts = ax4.violinplot([group1, group2, group3], positions=[1, 2, 3], showmeans=True)
ax4.set_xticks([1, 2, 3])
ax4.set_xticklabels(['Control', 'Treat A', 'Treat B'])
ax4.set_ylabel('Value')
ax4.set_title('Violin Plot')
ax4.grid(True, alpha=0.3)

# Group means with 95% CI error bars (normal approximation, n = 50 per group)
ax5 = axes[1, 1]
means = [stats_g1['mean'], stats_g2['mean'], stats_g3['mean']]
sems = [stats_g1['sem'], stats_g2['sem'], stats_g3['sem']]
ci95 = [1.96 * s for s in sems]
x_pos = [1, 2, 3]
ax5.bar(x_pos, means, yerr=ci95, capsize=5, color=colors, edgecolor='black', alpha=0.7)
ax5.set_xticks(x_pos)
ax5.set_xticklabels(['Control', 'Treat A', 'Treat B'])
ax5.set_ylabel('Mean (95\\% CI)')
ax5.set_title('Group Means with Confidence Intervals')
ax5.grid(True, alpha=0.3)

# Cumulative distribution
ax6 = axes[1, 2]
for data, label, color in [(group1, 'Control', 'blue'),
                           (group2, 'Treat A', 'green'),
                           (group3, 'Treat B', 'red')]:
    sorted_data = np.sort(data)
    cumulative = np.arange(1, len(data) + 1) / len(data)
    ax6.plot(sorted_data, cumulative, label=label, color=color, linewidth=2)
ax6.set_xlabel('Value')
ax6.set_ylabel('Cumulative Probability')
ax6.set_title('Empirical CDF')
ax6.legend()
ax6.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('statistical_descriptive.pdf', bbox_inches='tight', dpi=150)
print(r'\begin{figure}[H]')
print(r'\centering')
print(r'\includegraphics[width=\textwidth]{statistical_descriptive.pdf}')
print(r'\caption{Descriptive statistics visualization: histograms, box plots, Q-Q plot, violin plots, means with CI, and empirical CDFs.}')
print(r'\label{fig:descriptive}')
print(r'\end{figure}')
plt.close()
\end{pycode}

\subsection{Normality and Homogeneity Tests}

\begin{pycode}
# Normality tests (Shapiro-Wilk)
shapiro_g1 = shapiro(group1)
shapiro_g2 = shapiro(group2)
shapiro_g3 = shapiro(group3)

# Homogeneity of variance (Levene's test)
levene_stat, levene_p = levene(group1, group2, group3)

results['shapiro_g1_p'] = shapiro_g1.pvalue
results['shapiro_g2_p'] = shapiro_g2.pvalue
results['shapiro_g3_p'] = shapiro_g3.pvalue
results['levene_p'] = levene_p
\end{pycode}
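These checks commonly drive the choice between parametric and rank-based tests. The helper below encodes one widespread (though debated, since pre-testing assumptions affects error rates) decision rule; the function \texttt{choose\_two\_sample\_test} is an illustrative addition, not part of the reported analysis.

\begin{pycode}
def choose_two_sample_test(a, b, alpha=0.05):
    """Illustrative heuristic: pick a two-sample test from assumption checks."""
    normal = shapiro(a).pvalue > alpha and shapiro(b).pvalue > alpha
    equal_var = levene(a, b).pvalue > alpha
    if normal and equal_var:
        name, res = 'Student t', ttest_ind(a, b)
    elif normal:
        name, res = 'Welch t', ttest_ind(a, b, equal_var=False)
    else:
        name, res = 'Mann-Whitney U', mannwhitneyu(a, b, alternative='two-sided')
    return name, res.pvalue

test_name, test_p = choose_two_sample_test(group1, group2)
\end{pycode}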
\subsection{Hypothesis Testing}

\begin{pycode}
# Two-sample t-tests
t_stat_12, p_value_12 = ttest_ind(group1, group2)
t_stat_13, p_value_13 = ttest_ind(group1, group3)
t_stat_23, p_value_23 = ttest_ind(group2, group3)

# Mann-Whitney U test (non-parametric)
u_stat_12, u_p_12 = mannwhitneyu(group1, group2, alternative='two-sided')

# Effect size (Cohen's d)
def cohens_d(g1, g2):
    n1, n2 = len(g1), len(g2)
    var1, var2 = np.var(g1, ddof=1), np.var(g2, ddof=1)
    pooled_std = np.sqrt(((n1-1)*var1 + (n2-1)*var2) / (n1+n2-2))
    return (np.mean(g1) - np.mean(g2)) / pooled_std

d_12 = cohens_d(group1, group2)
d_13 = cohens_d(group1, group3)
d_23 = cohens_d(group2, group3)

results['t_stat_12'] = t_stat_12
results['p_value_12'] = p_value_12
results['cohens_d_12'] = d_12

# ANOVA
f_stat, anova_p = f_oneway(group1, group2, group3)
results['f_stat'] = f_stat
results['anova_p'] = anova_p

# Kruskal-Wallis (non-parametric ANOVA)
h_stat, kw_p = kruskal(group1, group2, group3)
results['kw_p'] = kw_p
\end{pycode}
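The $F$ value from \texttt{f\_oneway} can be reproduced directly from the decomposition given in the Mathematical Framework section. The sketch below recomputes the between- and within-group mean squares for the three session groups and confirms agreement:

\begin{pycode}
groups = [group1, group2, group3]
k = len(groups)
N = sum(len(g) for g in groups)
grand_mean = np.mean(np.concatenate(groups))

# MS_between and MS_within exactly as in the ANOVA equation
ss_between = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in groups)
ss_within = sum(((g - g.mean()) ** 2).sum() for g in groups)
f_manual = (ss_between / (k - 1)) / (ss_within / (N - k))

assert np.isclose(f_manual, f_stat)  # f_stat from f_oneway above
\end{pycode}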
\subsection{Correlation Analysis}

\begin{pycode}
# Generate correlated data for correlation analysis
n_corr = 100
x_data = np.random.normal(50, 10, n_corr)
y_data = 2 * x_data + np.random.normal(0, 15, n_corr)
z_data = np.random.normal(50, 10, n_corr)  # Uncorrelated

# Pearson correlation
r_xy, p_xy = pearsonr(x_data, y_data)
r_xz, p_xz = pearsonr(x_data, z_data)

# Spearman correlation
rho_xy, sp_p_xy = spearmanr(x_data, y_data)

results['pearson_r'] = r_xy
results['pearson_p'] = p_xy
results['spearman_rho'] = rho_xy

fig, axes = plt.subplots(2, 3, figsize=(14, 9))

# Scatter with regression line
ax1 = axes[0, 0]
ax1.scatter(x_data, y_data, alpha=0.6, s=30)
z_fit = np.polyfit(x_data, y_data, 1)
p_fit = np.poly1d(z_fit)
x_line = np.linspace(min(x_data), max(x_data), 100)
ax1.plot(x_line, p_fit(x_line), 'r-', linewidth=2, label=f'r = {r_xy:.3f}')
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Strong Correlation')
ax1.legend()
ax1.grid(True, alpha=0.3)

# No correlation
ax2 = axes[0, 1]
ax2.scatter(x_data, z_data, alpha=0.6, s=30)
ax2.set_xlabel('X')
ax2.set_ylabel('Z')
ax2.set_title(f'No Correlation (r = {r_xz:.3f})')
ax2.grid(True, alpha=0.3)

# Correlation matrix heatmap
ax3 = axes[0, 2]
data_matrix = np.column_stack([x_data, y_data, z_data])
corr_matrix = np.corrcoef(data_matrix.T)
im = ax3.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(im, ax=ax3)
ax3.set_xticks([0, 1, 2])
ax3.set_yticks([0, 1, 2])
ax3.set_xticklabels(['X', 'Y', 'Z'])
ax3.set_yticklabels(['X', 'Y', 'Z'])
ax3.set_title('Correlation Matrix')
for i in range(3):
    for j in range(3):
        ax3.text(j, i, f'{corr_matrix[i,j]:.2f}', ha='center', va='center')

# Residual plot
ax4 = axes[1, 0]
residuals = y_data - p_fit(x_data)
ax4.scatter(p_fit(x_data), residuals, alpha=0.6, s=30)
ax4.axhline(y=0, color='r', linestyle='--')
ax4.set_xlabel('Fitted Values')
ax4.set_ylabel('Residuals')
ax4.set_title('Residual Plot')
ax4.grid(True, alpha=0.3)

# P-value visualization
ax5 = axes[1, 1]
tests = ['Control vs\nTreat A', 'Control vs\nTreat B', 'Treat A vs\nTreat B']
p_values = [p_value_12, p_value_13, p_value_23]
colors_pval = ['green' if p < 0.05 else 'red' for p in p_values]
bars = ax5.bar(tests, [-np.log10(p) for p in p_values], color=colors_pval, alpha=0.7, edgecolor='black')
ax5.axhline(y=-np.log10(0.05), color='k', linestyle='--', label='p = 0.05')
ax5.set_ylabel('$-\\log_{10}(p)$')
ax5.set_title('Hypothesis Test Results')
ax5.legend()
ax5.grid(True, alpha=0.3)

# Effect sizes
ax6 = axes[1, 2]
effect_sizes = [abs(d_12), abs(d_13), abs(d_23)]
ax6.bar(tests, effect_sizes, color='steelblue', alpha=0.7, edgecolor='black')
ax6.axhline(y=0.2, color='g', linestyle=':', label='Small (0.2)')
ax6.axhline(y=0.5, color='orange', linestyle=':', label='Medium (0.5)')
ax6.axhline(y=0.8, color='r', linestyle=':', label='Large (0.8)')
ax6.set_ylabel("Cohen's d")
ax6.set_title('Effect Sizes')
ax6.legend(loc='upper right', fontsize=8)
ax6.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('statistical_inference.pdf', bbox_inches='tight', dpi=150)
print(r'\begin{figure}[H]')
print(r'\centering')
print(r'\includegraphics[width=\textwidth]{statistical_inference.pdf}')
print(r'\caption{Statistical inference: correlation analysis, residual diagnostics, hypothesis test results, and effect sizes.}')
print(r'\label{fig:inference}')
print(r'\end{figure}')
plt.close()
\end{pycode}

\subsection{Multiple Testing Correction}

\begin{pycode}
# Bonferroni correction
alpha = 0.05
n_tests = 3
bonferroni_alpha = alpha / n_tests

# Benjamini-Hochberg FDR correction
p_values_all = sorted([p_value_12, p_value_13, p_value_23])
bh_critical = [(i+1) / n_tests * alpha for i in range(n_tests)]

results['bonferroni_alpha'] = bonferroni_alpha
results['n_significant_bonf'] = sum(p < bonferroni_alpha for p in [p_value_12, p_value_13, p_value_23])

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bonferroni
ax1 = axes[0]
ax1.bar(tests, p_values, color='steelblue', alpha=0.7, edgecolor='black')
ax1.axhline(y=alpha, color='r', linestyle='--', label=f'$\\alpha$ = {alpha}')
ax1.axhline(y=bonferroni_alpha, color='g', linestyle='--',
            label=f'Bonferroni = {bonferroni_alpha:.4f}')
ax1.set_ylabel('p-value')
ax1.set_title('Multiple Testing Correction')
ax1.legend()
ax1.grid(True, alpha=0.3)

# BH procedure
ax2 = axes[1]
ax2.plot(range(1, n_tests+1), p_values_all, 'bo-', markersize=10, label='Sorted p-values')
ax2.plot(range(1, n_tests+1), bh_critical, 'r--', linewidth=2, label='BH critical')
ax2.set_xlabel('Rank')
ax2.set_ylabel('p-value')
ax2.set_title('Benjamini-Hochberg Procedure')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('statistical_multiple.pdf', bbox_inches='tight', dpi=150)
print(r'\begin{figure}[H]')
print(r'\centering')
print(r'\includegraphics[width=\textwidth]{statistical_multiple.pdf}')
print(r'\caption{Multiple testing correction: Bonferroni and Benjamini-Hochberg procedures.}')
print(r'\label{fig:multiple}')
print(r'\end{figure}')
plt.close()
\end{pycode}
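The right-hand panel plots the sorted $p$-values against the Benjamini-Hochberg critical line; the step-up rule then rejects every hypothesis up to the largest rank $i$ with $p_{(i)} \le \frac{i}{m}\alpha$. A minimal stand-alone implementation of that rule follows (the function name is an illustrative addition; \texttt{statsmodels.stats.multitest.multipletests} with \texttt{method='fdr\_bh'} provides the same procedure):

\begin{pycode}
def bh_reject(pvals, alpha=0.05):
    """Benjamini-Hochberg step-up: boolean rejection mask (illustrative)."""
    pvals = np.asarray(pvals)
    m = len(pvals)
    order = np.argsort(pvals)
    below = pvals[order] <= (np.arange(1, m + 1) / m) * alpha
    reject = np.zeros(m, dtype=bool)
    if below.any():
        cutoff = np.max(np.nonzero(below)[0])  # largest rank meeting the bound
        reject[order[:cutoff + 1]] = True
    return reject

bh_mask = bh_reject([p_value_12, p_value_13, p_value_23])
\end{pycode}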
\py{f"{results['levene_p']:.4f}"}421422\subsection{Hypothesis Tests}423424\begin{table}[H]425\centering426\caption{Pairwise Comparisons}427\label{tab:tests}428\begin{tabular}{lccc}429\toprule430\textbf{Comparison} & \textbf{t-statistic} & \textbf{p-value} & \textbf{Cohen's d} \\431\midrule432Control vs Treatment A & \py{f"{results['t_stat_12']:.3f}"} & \py{f"{results['p_value_12']:.4f}"} & \py{f"{results['cohens_d_12']:.3f}"} \\433\bottomrule434\end{tabular}435\end{table}436437ANOVA: F = \py{f"{results['f_stat']:.3f}"}, p = \py{f"{results['anova_p']:.4f}"}438439\subsection{Correlation Analysis}440441\begin{itemize}442\item Pearson r = \py{f"{results['pearson_r']:.3f}"}, p = \py{f"{results['pearson_p']:.4f}"}443\item Spearman $\rho$ = \py{f"{results['spearman_rho']:.3f}"}444\end{itemize}445446\section{Conclusion}447This analysis demonstrated a comprehensive statistical workflow including descriptive statistics, assumption testing, parametric and non-parametric hypothesis tests, effect size calculation, and multiple testing correction. Key findings show significant differences between treatment groups with appropriate corrections for multiple comparisons.448449\end{document}450451452