=>PYTHONTEX#py#default#default#0#code#####42#
import numpy as np
import matplotlib
matplotlib.use('Agg')  # select the non-interactive backend before importing pyplot
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# Set style for statistical plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
np.random.seed(42)

# Hypothesis testing demonstration
def perform_statistical_tests():
    """Demonstrate various statistical tests"""

    # Generate sample data
    n1, n2 = 50, 45
    sample1 = np.random.normal(10, 2, n1)  # Group 1: μ=10, σ=2
    sample2 = np.random.normal(12, 2.5, n2)  # Group 2: μ=12, σ=2.5

    # One-sample t-test
    t_stat, p_value = stats.ttest_1samp(sample1, 9.5)

    # Two-sample t-test
    t2_stat, p2_value = stats.ttest_ind(sample1, sample2)

    # Kolmogorov-Smirnov test against a fitted normal; note that estimating
    # the parameters from the same data makes the reported p-value only
    # approximate (a Lilliefors correction would be needed for an exact test)
    ks_stat, ks_p = stats.kstest(sample1, 'norm', args=(sample1.mean(), sample1.std()))

    print("Statistical Hypothesis Testing Results:")
    print(f"One-sample t-test: t = {t_stat:.4f}, p = {p_value:.4f}")
    print(f"Two-sample t-test: t = {t2_stat:.4f}, p = {p2_value:.4f}")
    print(f"KS normality test: D = {ks_stat:.4f}, p = {ks_p:.4f}")

    return sample1, sample2, t_stat, p_value, t2_stat, p2_value

sample1, sample2, t_stat, p_val, t2_stat, p2_val = perform_statistical_tests()
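
# Illustrative follow-up (not part of the function above): p-values say
# nothing about magnitude, so also report Cohen's d for the two samples,
# computed with the standard pooled-standard-deviation formula.
pooled_std = np.sqrt(((len(sample1) - 1) * np.var(sample1, ddof=1) +
                      (len(sample2) - 1) * np.var(sample2, ddof=1)) /
                     (len(sample1) + len(sample2) - 2))
cohens_d = (np.mean(sample2) - np.mean(sample1)) / pooled_std
print(f"Cohen's d (sample2 - sample1): {cohens_d:.3f}")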
=>PYTHONTEX#py#default#default#1#code#####86#
# Bayesian inference using Metropolis-Hastings algorithm
def metropolis_hastings(log_posterior, initial_theta, n_samples, proposal_cov):
    """Metropolis-Hastings sampler for Bayesian inference"""

    n_params = len(initial_theta)
    samples = np.zeros((n_samples, n_params))
    samples[0] = initial_theta

    n_accepted = 0
    current_theta = initial_theta.copy()
    current_log_prob = log_posterior(current_theta)

    for i in range(1, n_samples):
        # Propose new state
        proposal = np.random.multivariate_normal(current_theta, proposal_cov)
        proposal_log_prob = log_posterior(proposal)

        # Acceptance probability min(1, exp(log_alpha)); checking
        # log_alpha >= 0 first avoids overflow in np.exp for very
        # favorable proposals
        log_alpha = proposal_log_prob - current_log_prob

        # Accept or reject
        if log_alpha >= 0 or np.random.rand() < np.exp(log_alpha):
            current_theta = proposal
            current_log_prob = proposal_log_prob
            n_accepted += 1

        samples[i] = current_theta

    acceptance_rate = n_accepted / (n_samples - 1)
    return samples, acceptance_rate
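
# Quick sanity check (illustrative, not part of the analysis below): run the
# sampler on a standard-normal target, where the draws should have mean near
# 0 and standard deviation near 1. The random state is saved and restored so
# the results below are unaffected by this extra sampling.
_saved_state = np.random.get_state()
_demo_draws, _demo_rate = metropolis_hastings(
    lambda theta: -0.5 * theta[0]**2,  # log N(0, 1) density, up to a constant
    np.array([0.0]), 2000, np.array([[1.0]]))
print(f"MH sanity check: mean = {_demo_draws[:, 0].mean():.3f}, "
      f"std = {_demo_draws[:, 0].std():.3f}, acceptance = {_demo_rate:.2f}")
np.random.set_state(_saved_state)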

# Bayesian linear regression example
def bayesian_linear_regression():
    """Bayesian inference for linear regression parameters"""

    # Generate synthetic data
    n_obs = 50
    true_beta = np.array([2.0, -1.5])
    sigma_true = 0.5

    X = np.column_stack([np.ones(n_obs), np.random.uniform(-2, 2, n_obs)])
    y = X @ true_beta + np.random.normal(0, sigma_true, n_obs)

    # Define log-posterior (assuming flat priors)
    def log_posterior(params):
        beta = params[:2]
        sigma = params[2]

        if sigma <= 0:
            return -np.inf

        # Likelihood
        y_pred = X @ beta
        log_likelihood = -0.5 * n_obs * np.log(2 * np.pi * sigma**2) - \
                        0.5 * np.sum((y - y_pred)**2) / sigma**2

        # Weak priors (log densities up to additive constants)
        log_prior = -0.5 * np.sum(beta**2) / 10  # N(0, variance 10) prior on each beta
        log_prior += -np.log(sigma)  # Jeffreys-style 1/sigma prior on sigma

        return log_likelihood + log_prior

    # Run MCMC
    initial_params = np.array([0.0, 0.0, 1.0])  # [beta0, beta1, sigma]
    proposal_cov = np.diag([0.1, 0.1, 0.05])
    n_samples = 5000

    samples, accept_rate = metropolis_hastings(log_posterior, initial_params, n_samples, proposal_cov)

    # Remove burn-in
    burn_in = 1000
    samples_clean = samples[burn_in:]

    print("\nBayesian Linear Regression:")
    print(f"True parameters: beta0={true_beta[0]}, beta1={true_beta[1]}, sigma={sigma_true}")
    print(f"Acceptance rate: {accept_rate:.3f}")

    # Posterior summaries
    for i, param_name in enumerate(['beta0', 'beta1', 'sigma']):
        posterior_mean = np.mean(samples_clean[:, i])
        posterior_std = np.std(samples_clean[:, i])
        ci_lower = np.percentile(samples_clean[:, i], 2.5)
        ci_upper = np.percentile(samples_clean[:, i], 97.5)

        print(f"{param_name}: {posterior_mean:.3f} ± {posterior_std:.3f}, 95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

    return X, y, samples_clean, true_beta, sigma_true

X, y, mcmc_samples, true_beta, sigma_true = bayesian_linear_regression()
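
# Illustrative cross-check: with these weak priors the posterior means of
# beta should sit close to the ordinary least-squares solution, which numpy
# computes in closed form.
beta_ols, *_ = np.linalg.lstsq(X, y, rcond=None)
print(f"OLS estimates for comparison: beta0 = {beta_ols[0]:.3f}, beta1 = {beta_ols[1]:.3f}")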
=>PYTHONTEX#py#default#default#2#code#####182#
# Advanced regression techniques
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress sklearn convergence warnings for cleaner output
warnings.filterwarnings('ignore', category=ConvergenceWarning)

def regression_comparison():
    """Compare different regression methods"""

    # Generate polynomial regression data with noise
    np.random.seed(42)
    n_samples = 100
    x = np.linspace(0, 1, n_samples)
    y_true = 2 * x - 3 * x**2 + 0.5 * x**3
    noise = np.random.normal(0, 0.1, n_samples)
    y = y_true + noise

    X = x.reshape(-1, 1)

    # Create polynomial features
    degree = 8
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly_features.fit_transform(X)

    # Standardize features
    scaler = StandardScaler()
    X_poly_scaled = scaler.fit_transform(X_poly)

    # Compare regularization methods
    alphas = np.logspace(-4, 2, 50)
    methods = {
        'Ridge': Ridge(),
        'Lasso': Lasso(max_iter=20000, tol=1e-6),
        'ElasticNet': ElasticNet(max_iter=20000, tol=1e-6, l1_ratio=0.5)
    }

    results = {}
    for name, model in methods.items():
        cv_scores = []
        coefficients = []

        for alpha in alphas:
            model.set_params(alpha=alpha)
            # Cross-validation score
            scores = cross_val_score(model, X_poly_scaled, y, cv=5, scoring='neg_mean_squared_error')
            cv_scores.append(-np.mean(scores))

            # Fit to get coefficients
            model.fit(X_poly_scaled, y)
            coefficients.append(model.coef_.copy())

        results[name] = {
            'cv_scores': cv_scores,
            'coefficients': np.array(coefficients)
        }

    print("\nRegression Analysis:")
    print(f"  Dataset size: {n_samples} samples")
    print(f"  Polynomial degree: {degree}")
    print(f"  True function: y = 2x - 3x² + 0.5x³")

    return x, y, y_true, alphas, results, X_poly_scaled

x, y, y_true, alphas, reg_results, X_poly_scaled = regression_comparison()
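
# Small follow-up (not printed by the function above): read off the alpha
# minimizing the cross-validation MSE for each method.
for name, res in reg_results.items():
    best_idx = int(np.argmin(res['cv_scores']))
    print(f"  {name}: best alpha = {alphas[best_idx]:.4g}, "
          f"CV MSE = {res['cv_scores'][best_idx]:.4f}")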
=>PYTHONTEX#py#default#default#3#code#####254#
# Create comprehensive statistical analysis visualization
import os
os.makedirs('assets', exist_ok=True)

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Statistical Computing and Data Analysis', fontsize=16, fontweight='bold')

# Hypothesis testing visualization
ax1 = axes[0, 0]
ax1.hist(sample1, alpha=0.7, bins=15, label='Sample 1', density=True, color='skyblue')
ax1.hist(sample2, alpha=0.7, bins=15, label='Sample 2', density=True, color='salmon')
ax1.axvline(np.mean(sample1), color='blue', linestyle='--', linewidth=2, label=f'Mean 1: {np.mean(sample1):.2f}')
ax1.axvline(np.mean(sample2), color='red', linestyle='--', linewidth=2, label=f'Mean 2: {np.mean(sample2):.2f}')
ax1.set_xlabel('Value')
ax1.set_ylabel('Density')
ax1.set_title(f'Two-Sample Comparison\n(t = {t2_stat:.2f}, p = {p2_val:.4f})')
ax1.legend()
ax1.grid(True, alpha=0.3)

# MCMC trace plots
ax2 = axes[0, 1]
param_names = ['beta0', 'beta1', 'sigma']
colors = ['blue', 'red', 'green']
for i, (name, color) in enumerate(zip(param_names, colors)):
    ax2.plot(mcmc_samples[:, i], color=color, alpha=0.7, label=name, linewidth=1)
ax2.set_xlabel('MCMC Iteration')
ax2.set_ylabel('Parameter Value')
ax2.set_title('MCMC Trace Plots')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Posterior distributions
ax3 = axes[0, 2]
for i, (name, color) in enumerate(zip(param_names, colors)):
    ax3.hist(mcmc_samples[:, i], bins=30, alpha=0.6, density=True,
             label=f'{name} (mean: {np.mean(mcmc_samples[:, i]):.3f})', color=color)
ax3.set_xlabel('Parameter Value')
ax3.set_ylabel('Posterior Density')
ax3.set_title('Posterior Distributions')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Regression data and true function
ax4 = axes[1, 0]
ax4.scatter(x, y, alpha=0.6, s=20, color='lightblue', label='Noisy data')
ax4.plot(x, y_true, 'r-', linewidth=3, label='True function')
ax4.set_xlabel('x')
ax4.set_ylabel('y')
ax4.set_title('Polynomial Regression Data')
ax4.legend()
ax4.grid(True, alpha=0.3)

# Cross-validation scores
ax5 = axes[1, 1]
for name, results in reg_results.items():
    ax5.semilogx(alphas, results['cv_scores'], 'o-', linewidth=2, label=name, markersize=4)
ax5.set_xlabel('Regularization Parameter (α)')
ax5.set_ylabel('Cross-Validation MSE')
ax5.set_title('Model Selection via Cross-Validation')
ax5.legend()
ax5.grid(True, alpha=0.3)

# Regularization paths
ax6 = axes[1, 2]
for name, results in reg_results.items():
    if name == 'Lasso':  # Show Lasso path as example
        coeffs = results['coefficients']
        for i in range(coeffs.shape[1]):
            ax6.semilogx(alphas, coeffs[:, i], linewidth=2, alpha=0.7)
        break

ax6.set_xlabel('Regularization Parameter (α)')
ax6.set_ylabel('Coefficient Value')
ax6.set_title('Lasso Regularization Path')
ax6.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('assets/statistical-analysis.pdf', dpi=300, bbox_inches='tight')
plt.close()

print("Statistical analysis visualization saved to assets/statistical-analysis.pdf")
=>PYTHONTEX#py#default#default#4#code#####348#
# Time series analysis and forecasting
from scipy.signal import find_peaks

def time_series_analysis():
    """Demonstrate time series analysis techniques"""

    # Generate synthetic time series data
    n_points = 500
    t = np.linspace(0, 10, n_points)

    # Multiple components: trend + seasonal + noise
    trend = 0.5 * t + 0.02 * t**2
    seasonal = 2 * np.sin(2 * np.pi * t) + np.sin(4 * np.pi * t)
    noise = np.random.normal(0, 0.5, n_points)

    ts_data = trend + seasonal + noise

    # Simple moving averages
    window_sizes = [5, 20, 50]
    moving_averages = {}

    for window in window_sizes:
        ma = np.convolve(ts_data, np.ones(window)/window, mode='same')
        moving_averages[window] = ma

    # Autocorrelation function
    def autocorrelation(x, max_lags=50):
        """Compute autocorrelation function"""
        n = len(x)
        x = x - np.mean(x)
        autocorr = np.correlate(x, x, mode='full')
        autocorr = autocorr[n-1:]
        autocorr = autocorr / autocorr[0]
        return autocorr[:max_lags+1]

    lags = range(51)
    autocorr = autocorrelation(ts_data)

    print("\nTime Series Analysis:")
    print(f"  Data points: {n_points}")
    print(f"  Time span: {t[-1]:.1f} units")
    print(f"  Mean: {np.mean(ts_data):.3f}")
    print(f"  Std: {np.std(ts_data):.3f}")

    return t, ts_data, moving_averages, lags, autocorr, trend, seasonal

t, ts_data, moving_avgs, lags, autocorr, trend, seasonal = time_series_analysis()
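
# Put the find_peaks import above to use (illustrative): locate seasonal
# maxima in the detrended series. The minimum distance of 20 samples between
# peaks is hand-picked for this synthetic signal, not a general rule.
detrended = ts_data - trend
peaks, _ = find_peaks(detrended, distance=20)
print(f"  Seasonal peaks detected: {len(peaks)}, "
      f"first few at t = {np.round(t[peaks[:5]], 2)}")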

# Additional statistical summaries
print("\nStatistical Summary:")
# The two samples were drawn independently, so this correlation should be
# near zero; truncate to the shorter length since the sample sizes differ
min_len = min(len(sample1), len(sample2))
print(f"  Sample correlation between samples: {np.corrcoef(sample1[:min_len], sample2[:min_len])[0,1]:.3f}")

# Effective sample size for beta1: ESS = N / (1 + 2 * sum of lag-k
# autocorrelations), with the sum truncated at lag 10; the autocovariances
# must be normalized by the lag-0 value before summing
beta1_centered = mcmc_samples[:, 1] - np.mean(mcmc_samples[:, 1])
autocov = np.correlate(beta1_centered, beta1_centered, mode='full')[len(beta1_centered) - 1:]
acf_beta1 = autocov / autocov[0]  # normalize so that lag 0 equals 1
ess = len(mcmc_samples) / (1 + 2 * np.sum(acf_beta1[1:11]))
print(f"  MCMC effective sample size (beta1): {ess:.0f}")
=>PYTHONTEX:SETTINGS#
version=0.18
outputdir=pythontex-files-main
workingdir=.
workingdirset=false
gobble=none
rerun=default
hashdependencies=default
makestderr=false
stderrfilename=full
keeptemps=none
pyfuture=default
pyconfuture=none
pygments=true
pygglobal=:GLOBAL||
fvextfile=-1
pyconbanner=none
pyconfilename=stdin
depythontex=false
pygfamily=py|python3|
pygfamily=pycon|pycon|
pygfamily=sympy|python3|
pygfamily=sympycon|pycon|
pygfamily=pylab|python3|
pygfamily=pylabcon|pycon|