=>PYTHONTEX#py#default#default#0#code#####42#
import numpy as np
import matplotlib
matplotlib.use('Agg')  # select the non-interactive backend before importing pyplot
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
# Set style for statistical plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
np.random.seed(42)
# Hypothesis testing demonstration
def perform_statistical_tests():
    """Demonstrate various statistical tests"""
    # Generate sample data
    n1, n2 = 50, 45
    sample1 = np.random.normal(10, 2, n1)    # Group 1: μ=10, σ=2
    sample2 = np.random.normal(12, 2.5, n2)  # Group 2: μ=12, σ=2.5
    # One-sample t-test against a hypothesized mean of 9.5
    t_stat, p_value = stats.ttest_1samp(sample1, 9.5)
    # Two-sample t-test
    t2_stat, p2_value = stats.ttest_ind(sample1, sample2)
    # Kolmogorov-Smirnov test for normality (the p-value is only approximate
    # when the mean and std are estimated from the same sample)
    ks_stat, ks_p = stats.kstest(sample1, 'norm', args=(sample1.mean(), sample1.std()))
    print("Statistical Hypothesis Testing Results:")
    print(f"One-sample t-test: t = {t_stat:.4f}, p = {p_value:.4f}")
    print(f"Two-sample t-test: t = {t2_stat:.4f}, p = {p2_value:.4f}")
    print(f"KS normality test: D = {ks_stat:.4f}, p = {ks_p:.4f}")
    return sample1, sample2, t_stat, p_value, t2_stat, p2_value
sample1, sample2, t_stat, p_val, t2_stat, p2_val = perform_statistical_tests()
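# Supplementary sketch (added for illustration, not part of the original analysis):
# a pooled-variance Cohen's d effect size and a Welch t-test, which does not
# assume equal variances, as a complement to the pooled two-sample test above.
pooled_sd = np.sqrt(((len(sample1) - 1) * np.var(sample1, ddof=1) +
                     (len(sample2) - 1) * np.var(sample2, ddof=1)) /
                    (len(sample1) + len(sample2) - 2))
cohens_d = (np.mean(sample1) - np.mean(sample2)) / pooled_sd
tw_stat, pw_value = stats.ttest_ind(sample1, sample2, equal_var=False)
print(f"Cohen's d (pooled): {cohens_d:.3f}")
print(f"Welch's t-test: t = {tw_stat:.4f}, p = {pw_value:.4f}")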
=>PYTHONTEX#py#default#default#1#code#####86#
# Bayesian inference using Metropolis-Hastings algorithm
def metropolis_hastings(log_posterior, initial_theta, n_samples, proposal_cov):
    """Metropolis-Hastings sampler for Bayesian inference"""
    n_params = len(initial_theta)
    samples = np.zeros((n_samples, n_params))
    samples[0] = initial_theta
    n_accepted = 0
    current_theta = initial_theta.copy()
    current_log_prob = log_posterior(current_theta)
    for i in range(1, n_samples):
        # Propose new state from a Gaussian random walk
        proposal = np.random.multivariate_normal(current_theta, proposal_cov)
        proposal_log_prob = log_posterior(proposal)
        # Acceptance probability min(1, exp(log_alpha)), computed without overflow
        log_alpha = proposal_log_prob - current_log_prob
        alpha = np.exp(min(0.0, log_alpha))
        # Accept or reject
        if np.random.rand() < alpha:
            current_theta = proposal
            current_log_prob = proposal_log_prob
            n_accepted += 1
        samples[i] = current_theta
    acceptance_rate = n_accepted / (n_samples - 1)
    return samples, acceptance_rate
# Bayesian linear regression example
def bayesian_linear_regression():
    """Bayesian inference for linear regression parameters"""
    # Generate synthetic data
    n_obs = 50
    true_beta = np.array([2.0, -1.5])
    sigma_true = 0.5
    X = np.column_stack([np.ones(n_obs), np.random.uniform(-2, 2, n_obs)])
    y = X @ true_beta + np.random.normal(0, sigma_true, n_obs)
    # Define log-posterior (weakly informative priors, defined below)
    def log_posterior(params):
        beta = params[:2]
        sigma = params[2]
        if sigma <= 0:
            return -np.inf
        # Gaussian likelihood
        y_pred = X @ beta
        log_likelihood = (-0.5 * n_obs * np.log(2 * np.pi * sigma**2)
                          - 0.5 * np.sum((y - y_pred)**2) / sigma**2)
        # Weak priors
        log_prior = -0.5 * np.sum(beta**2) / 10  # N(0, 10) prior on beta
        log_prior += -np.log(sigma)              # 1/sigma prior on sigma
        return log_likelihood + log_prior
    # Run MCMC
    initial_params = np.array([0.0, 0.0, 1.0])  # [beta0, beta1, sigma]
    proposal_cov = np.diag([0.1, 0.1, 0.05])
    n_samples = 5000
    samples, accept_rate = metropolis_hastings(log_posterior, initial_params, n_samples, proposal_cov)
    # Remove burn-in
    burn_in = 1000
    samples_clean = samples[burn_in:]
    print("\nBayesian Linear Regression:")
    print(f"True parameters: beta0={true_beta[0]}, beta1={true_beta[1]}, sigma={sigma_true}")
    print(f"Acceptance rate: {accept_rate:.3f}")
    # Posterior summaries
    for i, param_name in enumerate(['beta0', 'beta1', 'sigma']):
        posterior_mean = np.mean(samples_clean[:, i])
        posterior_std = np.std(samples_clean[:, i])
        ci_lower = np.percentile(samples_clean[:, i], 2.5)
        ci_upper = np.percentile(samples_clean[:, i], 97.5)
        print(f"{param_name}: {posterior_mean:.3f} ± {posterior_std:.3f}, "
              f"95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")
    return X, y, samples_clean, true_beta, sigma_true
X, y, mcmc_samples, true_beta, sigma_true = bayesian_linear_regression()
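# Supplementary sketch (added for illustration, not part of the original analysis):
# under these weak priors the posterior means of beta should be close to the
# ordinary least-squares estimates, so a quick lstsq comparison is a useful check.
beta_ols, *_ = np.linalg.lstsq(X, y, rcond=None)
beta_post_mean = mcmc_samples[:, :2].mean(axis=0)
print(f"OLS estimates:   beta0={beta_ols[0]:.3f}, beta1={beta_ols[1]:.3f}")
print(f"Posterior means: beta0={beta_post_mean[0]:.3f}, beta1={beta_post_mean[1]:.3f}")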
=>PYTHONTEX#py#default#default#2#code#####182#
# Advanced regression techniques
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress sklearn convergence warnings for cleaner output
warnings.filterwarnings('ignore', category=ConvergenceWarning)
def regression_comparison():
    """Compare different regression methods"""
    # Generate polynomial regression data with noise
    np.random.seed(42)
    n_samples = 100
    x = np.linspace(0, 1, n_samples)
    y_true = 2 * x - 3 * x**2 + 0.5 * x**3
    noise = np.random.normal(0, 0.1, n_samples)
    y = y_true + noise
    X = x.reshape(-1, 1)
    # Create polynomial features
    degree = 8
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly_features.fit_transform(X)
    # Standardize features
    scaler = StandardScaler()
    X_poly_scaled = scaler.fit_transform(X_poly)
    # Compare regularization methods
    alphas = np.logspace(-4, 2, 50)
    methods = {
        'Ridge': Ridge(),
        'Lasso': Lasso(max_iter=20000, tol=1e-6),
        'ElasticNet': ElasticNet(max_iter=20000, tol=1e-6, l1_ratio=0.5)
    }
    results = {}
    for name, model in methods.items():
        cv_scores = []
        coefficients = []
        for alpha in alphas:
            model.set_params(alpha=alpha)
            # Cross-validation score
            scores = cross_val_score(model, X_poly_scaled, y, cv=5, scoring='neg_mean_squared_error')
            cv_scores.append(-np.mean(scores))
            # Fit to get coefficients
            model.fit(X_poly_scaled, y)
            coefficients.append(model.coef_.copy())
        results[name] = {
            'cv_scores': cv_scores,
            'coefficients': np.array(coefficients)
        }
    print("\nRegression Analysis:")
    print(f"  Dataset size: {n_samples} samples")
    print(f"  Polynomial degree: {degree}")
    print("  True function: y = 2x - 3x² + 0.5x³")
    return x, y, y_true, alphas, results, X_poly_scaled
x, y, y_true, alphas, reg_results, X_poly_scaled = regression_comparison()
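# Supplementary sketch (added for illustration, not part of the original analysis):
# report the regularization strength with the lowest cross-validated MSE for each
# method, reusing the grids computed in regression_comparison above.
for method_name, res in reg_results.items():
    best_idx = int(np.argmin(res['cv_scores']))
    print(f"  {method_name}: best alpha = {alphas[best_idx]:.4g}, "
          f"CV MSE = {res['cv_scores'][best_idx]:.5f}")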
=>PYTHONTEX#py#default#default#3#code#####254#
# Create comprehensive statistical analysis visualization
import os
os.makedirs('assets', exist_ok=True)
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Statistical Computing and Data Analysis', fontsize=16, fontweight='bold')
# Hypothesis testing visualization
ax1 = axes[0, 0]
ax1.hist(sample1, alpha=0.7, bins=15, label='Sample 1', density=True, color='skyblue')
ax1.hist(sample2, alpha=0.7, bins=15, label='Sample 2', density=True, color='salmon')
ax1.axvline(np.mean(sample1), color='blue', linestyle='--', linewidth=2, label=f'Mean 1: {np.mean(sample1):.2f}')
ax1.axvline(np.mean(sample2), color='red', linestyle='--', linewidth=2, label=f'Mean 2: {np.mean(sample2):.2f}')
ax1.set_xlabel('Value')
ax1.set_ylabel('Density')
ax1.set_title(f'Two-Sample Comparison\n(t = {t2_stat:.2f}, p = {p2_val:.4f})')
ax1.legend()
ax1.grid(True, alpha=0.3)
# MCMC trace plots
ax2 = axes[0, 1]
param_names = ['beta0', 'beta1', 'sigma']
colors = ['blue', 'red', 'green']
for i, (name, color) in enumerate(zip(param_names, colors)):
    ax2.plot(mcmc_samples[:, i], color=color, alpha=0.7, label=name, linewidth=1)
ax2.set_xlabel('MCMC Iteration')
ax2.set_ylabel('Parameter Value')
ax2.set_title('MCMC Trace Plots')
ax2.legend()
ax2.grid(True, alpha=0.3)
# Posterior distributions
ax3 = axes[0, 2]
for i, (name, color) in enumerate(zip(param_names, colors)):
    ax3.hist(mcmc_samples[:, i], bins=30, alpha=0.6, density=True,
             label=f'{name} (mean: {np.mean(mcmc_samples[:, i]):.3f})', color=color)
ax3.set_xlabel('Parameter Value')
ax3.set_ylabel('Posterior Density')
ax3.set_title('Posterior Distributions')
ax3.legend()
ax3.grid(True, alpha=0.3)
# Regression data and true function
ax4 = axes[1, 0]
ax4.scatter(x, y, alpha=0.6, s=20, color='lightblue', label='Noisy data')
ax4.plot(x, y_true, 'r-', linewidth=3, label='True function')
ax4.set_xlabel('x')
ax4.set_ylabel('y')
ax4.set_title('Polynomial Regression Data')
ax4.legend()
ax4.grid(True, alpha=0.3)
# Cross-validation scores
ax5 = axes[1, 1]
for name, results in reg_results.items():
    ax5.semilogx(alphas, results['cv_scores'], 'o-', linewidth=2, label=name, markersize=4)
ax5.set_xlabel('Regularization Parameter (α)')
ax5.set_ylabel('Cross-Validation MSE')
ax5.set_title('Model Selection via Cross-Validation')
ax5.legend()
ax5.grid(True, alpha=0.3)
# Regularization paths
ax6 = axes[1, 2]
for name, results in reg_results.items():
    if name == 'Lasso':  # Show the Lasso path as an example
        coeffs = results['coefficients']
        for i in range(coeffs.shape[1]):
            ax6.semilogx(alphas, coeffs[:, i], linewidth=2, alpha=0.7)
        break
ax6.set_xlabel('Regularization Parameter (α)')
ax6.set_ylabel('Coefficient Value')
ax6.set_title('Lasso Regularization Path')
ax6.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('assets/statistical-analysis.pdf', dpi=300, bbox_inches='tight')
plt.close()
print("Statistical analysis visualization saved to assets/statistical-analysis.pdf")
=>PYTHONTEX#py#default#default#4#code#####348#
# Time series analysis and forecasting
from scipy.signal import find_peaks
def time_series_analysis():
    """Demonstrate time series analysis techniques"""
    # Generate synthetic time series data
    n_points = 500
    t = np.linspace(0, 10, n_points)
    # Multiple components: trend + seasonal + noise
    trend = 0.5 * t + 0.02 * t**2
    seasonal = 2 * np.sin(2 * np.pi * t) + np.sin(4 * np.pi * t)
    noise = np.random.normal(0, 0.5, n_points)
    ts_data = trend + seasonal + noise
    # Simple moving averages
    window_sizes = [5, 20, 50]
    moving_averages = {}
    for window in window_sizes:
        ma = np.convolve(ts_data, np.ones(window) / window, mode='same')
        moving_averages[window] = ma
    # Autocorrelation function
    def autocorrelation(x, max_lags=50):
        """Compute the autocorrelation function up to max_lags"""
        n = len(x)
        x = x - np.mean(x)
        autocorr = np.correlate(x, x, mode='full')
        autocorr = autocorr[n - 1:]
        autocorr = autocorr / autocorr[0]
        return autocorr[:max_lags + 1]
    lags = range(51)
    autocorr = autocorrelation(ts_data)
    print("\nTime Series Analysis:")
    print(f"  Data points: {n_points}")
    print(f"  Time span: {t[-1]:.1f} units")
    print(f"  Mean: {np.mean(ts_data):.3f}")
    print(f"  Std: {np.std(ts_data):.3f}")
    return t, ts_data, moving_averages, lags, autocorr, trend, seasonal
t, ts_data, moving_avgs, lags, autocorr, trend, seasonal = time_series_analysis()
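# Supplementary sketch (added for illustration, not part of the original analysis):
# estimate the dominant seasonal period by locating peaks in the autocorrelation
# of the detrended series; find_peaks is imported above, and the known trend
# component is subtracted for simplicity. This is a rough heuristic only.
ts_detrended = ts_data - trend
ts_centered = ts_detrended - np.mean(ts_detrended)
ts_acf = np.correlate(ts_centered, ts_centered, mode='full')[len(ts_centered) - 1:]
ts_acf = ts_acf / ts_acf[0]
peak_lags, _ = find_peaks(ts_acf[:101])
if len(peak_lags) > 0:
    # 500 points span 10 time units, so one lag corresponds to ~0.02 time units
    print(f"  First autocorrelation peak at lag {peak_lags[0]} "
          f"(~{peak_lags[0] * (t[1] - t[0]):.2f} time units)")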
# Additional statistical summaries
print(f"\nStatistical Summary:")
# Fix array dimension mismatch by using minimum length
min_len = min(len(sample1), len(sample2))
print(f" Sample correlation between samples: {np.corrcoef(sample1[:min_len], sample2[:min_len])[0,1]:.3f}")
print(f" MCMC effective sample size (beta1): {len(mcmc_samples) / (1 + 2*np.sum(np.correlate(mcmc_samples[:,1] - np.mean(mcmc_samples[:,1]), mcmc_samples[:,1] - np.mean(mcmc_samples[:,1]), mode='full')[len(mcmc_samples):len(mcmc_samples)+10])):.0f}")
=>PYTHONTEX:SETTINGS#
version=0.18
outputdir=pythontex-files-main
workingdir=.
workingdirset=false
gobble=none
rerun=default
hashdependencies=default
makestderr=false
stderrfilename=full
keeptemps=none
pyfuture=default
pyconfuture=none
pygments=true
pygglobal=:GLOBAL||
fvextfile=-1
pyconbanner=none
pyconfilename=stdin
depythontex=false
pygfamily=py|python3|
pygfamily=pycon|pycon|
pygfamily=sympy|python3|
pygfamily=sympycon|pycon|
pygfamily=pylab|python3|
pygfamily=pylabcon|pycon|