\documentclass[11pt,a4paper]{article}
% Statistical Computing and Data Analysis Template
% SEO Keywords: statistical computing latex template, data analysis latex, bayesian statistics latex

\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath,amsfonts,amssymb}
\usepackage{mathtools}
\usepackage{siunitx}
\usepackage{graphicx}
\usepackage{float}
\usepackage{hyperref}
\usepackage{cleveref}
\usepackage{booktabs}
\usepackage{pythontex}

\usepackage[style=numeric,backend=biber]{biblatex}
\addbibresource{references.bib}

\title{Statistical Computing and Data Analysis:\\
Modern Methods and Computational Techniques}
\author{CoCalc Scientific Templates}
\date{\today}

\begin{document}

\maketitle

\begin{abstract}
This comprehensive statistical computing template demonstrates modern data analysis techniques, including Bayesian inference, Monte Carlo methods, hypothesis testing, regression analysis, and machine learning fundamentals. It provides computational implementations of statistical algorithms with convergence diagnostics and professional visualizations for reproducible statistical research.

\textbf{Keywords:} statistical computing, data analysis, Bayesian statistics, Monte Carlo methods, hypothesis testing, regression analysis, statistical inference
\end{abstract}

\tableofcontents
\newpage

\section{Statistical Inference and Hypothesis Testing}
\label{sec:statistical-inference}

\begin{pycode}
import numpy as np
import matplotlib
matplotlib.use('Agg')  # non-interactive backend for document builds
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# Set style for statistical plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
np.random.seed(42)

# Hypothesis testing demonstration
def perform_statistical_tests():
    """Demonstrate various statistical tests"""

    # Generate sample data
    n1, n2 = 50, 45
    sample1 = np.random.normal(10, 2, n1)    # Group 1: μ=10, σ=2
    sample2 = np.random.normal(12, 2.5, n2)  # Group 2: μ=12, σ=2.5

    # One-sample t-test against a hypothesized mean of 9.5
    t_stat, p_value = stats.ttest_1samp(sample1, 9.5)

    # Two-sample t-test
    t2_stat, p2_value = stats.ttest_ind(sample1, sample2)

    # Kolmogorov-Smirnov test for normality
    # (p-value is approximate because the parameters are estimated from the data)
    ks_stat, ks_p = stats.kstest(sample1, 'norm', args=(sample1.mean(), sample1.std()))

    print("Statistical Hypothesis Testing Results:")
    print(f"One-sample t-test: t = {t_stat:.4f}, p = {p_value:.4f}")
    print(f"Two-sample t-test: t = {t2_stat:.4f}, p = {p2_value:.4f}")
    print(f"KS normality test: D = {ks_stat:.4f}, p = {ks_p:.4f}")

    return sample1, sample2, t_stat, p_value, t2_stat, p2_value

sample1, sample2, t_stat, p_val, t2_stat, p2_val = perform_statistical_tests()
\end{pycode}
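
The tests above report several p-values, and the conclusion refers to multiple-comparison procedures that the block above does not yet apply. The short block below is a minimal, self-contained sketch of a Holm--Bonferroni adjustment applied to the two t-test p-values already in scope; the helper \texttt{holm\_correction} is a name introduced here for illustration, not part of the original analysis.

\begin{pycode}
# Illustrative sketch: Holm-Bonferroni adjustment of the p-values computed above.
def holm_correction(p_values):
    """Return Holm-Bonferroni adjusted p-values in the original order."""
    p = np.asarray(p_values, dtype=float)
    m = len(p)
    order = np.argsort(p)
    adjusted = np.empty(m)
    running_max = 0.0
    for rank, idx in enumerate(order):
        step_p = min(1.0, (m - rank) * p[idx])
        running_max = max(running_max, step_p)  # enforce monotone adjusted p-values
        adjusted[idx] = running_max
    return adjusted

adjusted_p = holm_correction([p_val, p2_val])
print(f"Holm-adjusted p-values: one-sample t-test {adjusted_p[0]:.4f}, "
      f"two-sample t-test {adjusted_p[1]:.4f}")
\end{pycode}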
\section{Bayesian Statistics and MCMC}
\label{sec:bayesian-mcmc}

\begin{pycode}
# Bayesian inference using the Metropolis-Hastings algorithm
def metropolis_hastings(log_posterior, initial_theta, n_samples, proposal_cov):
    """Metropolis-Hastings sampler for Bayesian inference"""

    n_params = len(initial_theta)
    samples = np.zeros((n_samples, n_params))
    samples[0] = initial_theta

    n_accepted = 0
    current_theta = initial_theta.copy()
    current_log_prob = log_posterior(current_theta)

    for i in range(1, n_samples):
        # Propose a new state from a Gaussian random walk
        proposal = np.random.multivariate_normal(current_theta, proposal_cov)
        proposal_log_prob = log_posterior(proposal)

        # Acceptance probability (capped at 1 on the log scale to avoid overflow)
        log_alpha = proposal_log_prob - current_log_prob
        alpha = np.exp(min(0.0, log_alpha))

        # Accept or reject
        if np.random.rand() < alpha:
            current_theta = proposal
            current_log_prob = proposal_log_prob
            n_accepted += 1

        samples[i] = current_theta

    acceptance_rate = n_accepted / (n_samples - 1)
    return samples, acceptance_rate

# Bayesian linear regression example
def bayesian_linear_regression():
    """Bayesian inference for linear regression parameters"""

    # Generate synthetic data
    n_obs = 50
    true_beta = np.array([2.0, -1.5])
    sigma_true = 0.5

    X = np.column_stack([np.ones(n_obs), np.random.uniform(-2, 2, n_obs)])
    y = X @ true_beta + np.random.normal(0, sigma_true, n_obs)

    # Define log-posterior with weakly informative priors
    def log_posterior(params):
        beta = params[:2]
        sigma = params[2]

        if sigma <= 0:
            return -np.inf

        # Likelihood
        y_pred = X @ beta
        log_likelihood = -0.5 * n_obs * np.log(2 * np.pi * sigma**2) - \
                         0.5 * np.sum((y - y_pred)**2) / sigma**2

        # Weak priors
        log_prior = -0.5 * np.sum(beta**2) / 10  # N(0, 10) prior on beta
        log_prior += -np.log(sigma)              # 1/sigma prior on sigma

        return log_likelihood + log_prior

    # Run MCMC
    initial_params = np.array([0.0, 0.0, 1.0])  # [beta0, beta1, sigma]
    proposal_cov = np.diag([0.1, 0.1, 0.05])
    n_samples = 5000

    samples, accept_rate = metropolis_hastings(log_posterior, initial_params, n_samples, proposal_cov)

    # Remove burn-in
    burn_in = 1000
    samples_clean = samples[burn_in:]

    print("\nBayesian Linear Regression:")
    print(f"True parameters: beta0 = {true_beta[0]}, beta1 = {true_beta[1]}, sigma = {sigma_true}")
    print(f"Acceptance rate: {accept_rate:.3f}")

    # Posterior summaries (printed as LaTeX, so percent signs are escaped)
    for i, param_name in enumerate(['beta0', 'beta1', 'sigma']):
        posterior_mean = np.mean(samples_clean[:, i])
        posterior_std = np.std(samples_clean[:, i])
        ci_lower = np.percentile(samples_clean[:, i], 2.5)
        ci_upper = np.percentile(samples_clean[:, i], 97.5)

        print(f"{param_name}: {posterior_mean:.3f} $\\pm$ {posterior_std:.3f}, "
              f"95\\% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

    return X, y, samples_clean, true_beta, sigma_true

X, y, mcmc_samples, true_beta, sigma_true = bayesian_linear_regression()
\end{pycode}
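
The acceptance rate by itself is not a convergence check. As a minimal sketch of the convergence diagnostics mentioned in the abstract (an illustrative addition that only assumes the \texttt{mcmc\_samples} array defined above), the retained chain can be split into sub-chains and a split Gelman--Rubin statistic computed for each parameter; values close to 1 are consistent with convergence.

\begin{pycode}
# Illustrative sketch: split Gelman-Rubin diagnostic computed from a single chain.
def split_rhat(chain, n_splits=4):
    """Split one chain into equal segments and compute the Gelman-Rubin R-hat."""
    usable = (len(chain) // n_splits) * n_splits
    segments = chain[:usable].reshape(n_splits, -1)   # shape: (n_splits, segment_length)
    length = segments.shape[1]
    seg_means = segments.mean(axis=1)
    within = segments.var(axis=1, ddof=1).mean()      # W: average within-segment variance
    between = length * seg_means.var(ddof=1)          # B: between-segment variance
    var_estimate = (length - 1) / length * within + between / length
    return np.sqrt(var_estimate / within)

for j, name in enumerate(['beta0', 'beta1', 'sigma']):
    print(f"Split R-hat ({name}): {split_rhat(mcmc_samples[:, j]):.3f}")
\end{pycode}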
\section{Regression Analysis and Model Selection}
\label{sec:regression-analysis}

\begin{pycode}
# Advanced regression techniques
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress sklearn convergence warnings for cleaner output
warnings.filterwarnings('ignore', category=ConvergenceWarning)

def regression_comparison():
    """Compare different regression methods"""

    # Generate polynomial regression data with noise
    np.random.seed(42)
    n_samples = 100
    x = np.linspace(0, 1, n_samples)
    y_true = 2 * x - 3 * x**2 + 0.5 * x**3
    noise = np.random.normal(0, 0.1, n_samples)
    y = y_true + noise

    X = x.reshape(-1, 1)

    # Create polynomial features
    degree = 8
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly_features.fit_transform(X)

    # Standardize features
    scaler = StandardScaler()
    X_poly_scaled = scaler.fit_transform(X_poly)

    # Compare regularization methods
    alphas = np.logspace(-4, 2, 50)
    methods = {
        'Ridge': Ridge(),
        'Lasso': Lasso(max_iter=20000, tol=1e-6),
        'ElasticNet': ElasticNet(max_iter=20000, tol=1e-6, l1_ratio=0.5)
    }

    results = {}
    for name, model in methods.items():
        cv_scores = []
        coefficients = []

        for alpha in alphas:
            model.set_params(alpha=alpha)
            # Cross-validation score
            scores = cross_val_score(model, X_poly_scaled, y, cv=5, scoring='neg_mean_squared_error')
            cv_scores.append(-np.mean(scores))

            # Fit to get coefficients
            model.fit(X_poly_scaled, y)
            coefficients.append(model.coef_.copy())

        results[name] = {
            'cv_scores': cv_scores,
            'coefficients': np.array(coefficients)
        }

    print("\nRegression Analysis:")
    print(f" Dataset size: {n_samples} samples")
    print(f" Polynomial degree: {degree}")
    print(" True function: $y = 2x - 3x^2 + 0.5x^3$")

    return x, y, y_true, alphas, results, X_poly_scaled

x, y, y_true, alphas, reg_results, X_poly_scaled = regression_comparison()
\end{pycode}
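
The cross-validation curves computed above can be reduced to an explicit model choice. The block below (an illustrative addition using only the \texttt{alphas} and \texttt{reg\_results} objects already defined) reports the regularization strength that minimizes the cross-validation error for each method.

\begin{pycode}
# Illustrative sketch: pick the CV-optimal regularization strength for each method.
for name, res in reg_results.items():
    cv_mse = np.asarray(res['cv_scores'])
    best_idx = int(np.argmin(cv_mse))  # index of the smallest cross-validation MSE
    print(f"{name}: best alpha = {alphas[best_idx]:.4g}, CV MSE = {cv_mse[best_idx]:.4f}")
\end{pycode}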
\begin{pycode}
# Create comprehensive statistical analysis visualization
import os
os.makedirs('assets', exist_ok=True)

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Statistical Computing and Data Analysis', fontsize=16, fontweight='bold')

# Hypothesis testing visualization
ax1 = axes[0, 0]
ax1.hist(sample1, alpha=0.7, bins=15, label='Sample 1', density=True, color='skyblue')
ax1.hist(sample2, alpha=0.7, bins=15, label='Sample 2', density=True, color='salmon')
ax1.axvline(np.mean(sample1), color='blue', linestyle='--', linewidth=2, label=f'Mean 1: {np.mean(sample1):.2f}')
ax1.axvline(np.mean(sample2), color='red', linestyle='--', linewidth=2, label=f'Mean 2: {np.mean(sample2):.2f}')
ax1.set_xlabel('Value')
ax1.set_ylabel('Density')
ax1.set_title(f'Two-Sample Comparison\n(t = {t2_stat:.2f}, p = {p2_val:.4f})')
ax1.legend()
ax1.grid(True, alpha=0.3)

# MCMC trace plots
ax2 = axes[0, 1]
param_names = ['beta0', 'beta1', 'sigma']
colors = ['blue', 'red', 'green']
for i, (name, color) in enumerate(zip(param_names, colors)):
    ax2.plot(mcmc_samples[:, i], color=color, alpha=0.7, label=name, linewidth=1)
ax2.set_xlabel('MCMC Iteration')
ax2.set_ylabel('Parameter Value')
ax2.set_title('MCMC Trace Plots')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Posterior distributions
ax3 = axes[0, 2]
for i, (name, color) in enumerate(zip(param_names, colors)):
    ax3.hist(mcmc_samples[:, i], bins=30, alpha=0.6, density=True,
             label=f'{name} (mean: {np.mean(mcmc_samples[:, i]):.3f})', color=color)
ax3.set_xlabel('Parameter Value')
ax3.set_ylabel('Posterior Density')
ax3.set_title('Posterior Distributions')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Regression data and true function
ax4 = axes[1, 0]
ax4.scatter(x, y, alpha=0.6, s=20, color='lightblue', label='Noisy data')
ax4.plot(x, y_true, 'r-', linewidth=3, label='True function')
ax4.set_xlabel('x')
ax4.set_ylabel('y')
ax4.set_title('Polynomial Regression Data')
ax4.legend()
ax4.grid(True, alpha=0.3)

# Cross-validation scores
ax5 = axes[1, 1]
for name, results in reg_results.items():
    ax5.semilogx(alphas, results['cv_scores'], 'o-', linewidth=2, label=name, markersize=4)
ax5.set_xlabel('Regularization Parameter (α)')
ax5.set_ylabel('Cross-Validation MSE')
ax5.set_title('Model Selection via Cross-Validation')
ax5.legend()
ax5.grid(True, alpha=0.3)

# Regularization path (Lasso shown as an example)
ax6 = axes[1, 2]
lasso_coeffs = reg_results['Lasso']['coefficients']
for i in range(lasso_coeffs.shape[1]):
    ax6.semilogx(alphas, lasso_coeffs[:, i], linewidth=2, alpha=0.7)
ax6.set_xlabel('Regularization Parameter (α)')
ax6.set_ylabel('Coefficient Value')
ax6.set_title('Lasso Regularization Path')
ax6.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('assets/statistical-analysis.pdf', dpi=300, bbox_inches='tight')
plt.close()

print("Statistical analysis visualization saved to assets/statistical-analysis.pdf")
\end{pycode}

\begin{figure}[H]
\centering
\includegraphics[width=0.95\textwidth]{assets/statistical-analysis.pdf}
\caption{Comprehensive statistical computing and data analysis: (top row) two-sample hypothesis-testing comparison with distributions and means, MCMC trace plots showing convergence behavior, and posterior distributions from Bayesian inference; (bottom row) polynomial regression data with the true function, cross-validation model-selection curves, and Lasso regularization paths demonstrating coefficient shrinkage.}
\label{fig:statistical-analysis}
\end{figure}

\section{Time Series Analysis}
\label{sec:time-series}

\begin{pycode}
# Time series analysis and forecasting
def time_series_analysis():
    """Demonstrate time series analysis techniques"""

    # Generate synthetic time series data
    n_points = 500
    t = np.linspace(0, 10, n_points)

    # Multiple components: trend + seasonal + noise
    trend = 0.5 * t + 0.02 * t**2
    seasonal = 2 * np.sin(2 * np.pi * t) + np.sin(4 * np.pi * t)
    noise = np.random.normal(0, 0.5, n_points)

    ts_data = trend + seasonal + noise

    # Simple moving averages
    window_sizes = [5, 20, 50]
    moving_averages = {}

    for window in window_sizes:
        ma = np.convolve(ts_data, np.ones(window) / window, mode='same')
        moving_averages[window] = ma

    # Autocorrelation function
    def autocorrelation(x, max_lags=50):
        """Compute the sample autocorrelation function up to max_lags."""
        n = len(x)
        x = x - np.mean(x)
        autocorr = np.correlate(x, x, mode='full')
        autocorr = autocorr[n - 1:]        # keep non-negative lags
        autocorr = autocorr / autocorr[0]  # normalize by the lag-0 value
        return autocorr[:max_lags + 1]

    lags = range(51)
    autocorr = autocorrelation(ts_data)

    print("\nTime Series Analysis:")
    print(f" Data points: {n_points}")
    print(f" Time span: {t[-1]:.1f} units")
    print(f" Mean: {np.mean(ts_data):.3f}")
    print(f" Std: {np.std(ts_data):.3f}")

    return t, ts_data, moving_averages, lags, autocorr, trend, seasonal

t, ts_data, moving_avgs, lags, autocorr, trend, seasonal = time_series_analysis()

# Additional statistical summaries
print("\nStatistical Summary:")
# The two samples have different lengths, so truncate to the shorter one
min_len = min(len(sample1), len(sample2))
print(f" Sample correlation between samples: {np.corrcoef(sample1[:min_len], sample2[:min_len])[0, 1]:.3f}")

# Effective sample size for beta1 using normalized autocorrelations at lags 1-10
beta1_centered = mcmc_samples[:, 1] - np.mean(mcmc_samples[:, 1])
acf_full = np.correlate(beta1_centered, beta1_centered, mode='full')
acf = acf_full[len(beta1_centered) - 1:] / acf_full[len(beta1_centered) - 1]
ess_beta1 = len(beta1_centered) / (1 + 2 * np.sum(acf[1:11]))
print(f" MCMC effective sample size (beta1): {ess_beta1:.0f}")
\end{pycode}
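
The block above characterizes the series but does not yet forecast it, although forecasting is listed in the conclusion. The sketch below assumes the quadratic-trend-plus-harmonic structure used to generate the data is known, fits it by ordinary least squares, and extrapolates beyond the observed range; \texttt{harmonic\_design} and the two-unit horizon are illustrative choices, not part of the original analysis.

\begin{pycode}
# Illustrative forecasting sketch: least-squares fit of the known trend + harmonics,
# extrapolated two time units beyond the observed data.
def harmonic_design(time):
    """Design matrix with intercept, linear and quadratic trend, and two harmonics."""
    return np.column_stack([
        np.ones_like(time), time, time**2,
        np.sin(2 * np.pi * time), np.cos(2 * np.pi * time),
        np.sin(4 * np.pi * time), np.cos(4 * np.pi * time),
    ])

coef, *_ = np.linalg.lstsq(harmonic_design(t), ts_data, rcond=None)
t_future = np.linspace(10, 12, 100)  # forecast horizon beyond t = 10
forecast = harmonic_design(t_future) @ coef

print(f"Last observed value: {ts_data[-1]:.3f}")
print(f"Forecast at t = 12: {forecast[-1]:.3f}")
\end{pycode}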
\section{Conclusion}
\label{sec:conclusion}

This statistical computing template demonstrates modern computational methods for data analysis, including:

\begin{itemize}
\item Hypothesis testing with multiple-comparison procedures
\item Bayesian inference using MCMC methods
\item Regularized regression with cross-validation
\item Time series analysis and forecasting
\item Professional statistical visualization
\end{itemize}

The computational implementations provide a foundation for reproducible statistical research with rigorous uncertainty quantification.

\printbibliography

\end{document}