ubuntu2404
=>PYTHONTEX#py#default#default#0#code#####62#
# Import comprehensive ML and data science libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
import seaborn as sns
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification, make_regression

# Try to import deep learning frameworks
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    pytorch_available = True
except ImportError:
    print("PyTorch not available - using sklearn alternatives")
    pytorch_available = False

try:
    import tensorflow as tf
    tensorflow_available = True
except ImportError:
    print("TensorFlow not available - using sklearn alternatives")
    tensorflow_available = False

# Set visualization parameters for ML plots
plt.style.use('seaborn-v0_8-whitegrid')
np.random.seed(42)
sns.set_palette("husl")

# ML-optimized figure settings
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams['savefig.pad_inches'] = 0.1
plt.rcParams['font.size'] = 11

print("Machine Learning environment initialized")
print("Available frameworks:")
print(f"  - NumPy: {np.__version__}")
print(f"  - Scikit-learn: available")
print(f"  - PyTorch: {pytorch_available}")
print(f"  - TensorFlow: {tensorflow_available}")
=>PYTHONTEX#py#default#default#1#code#####203#
# Generate synthetic image classification dataset
# In practice, this would load real image data (CIFAR-10, ImageNet, etc.)

# Create synthetic dataset with multiple classes
n_samples = 2000
n_features = 64  # Simulating flattened 8x8 images
n_classes = 5
n_informative = 40

X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=10,
    n_classes=n_classes,
    n_clusters_per_class=1,
    random_state=42
)

# Reshape to simulate image-like structure (for visualization)
image_height, image_width = 8, 8
X_images = X.reshape(-1, image_height, image_width)

# Split into train/validation/test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print(f"Dataset created:")
print(f"  Training samples: {X_train.shape[0]}")
print(f"  Validation samples: {X_val.shape[0]}")
print(f"  Test samples: {X_test.shape[0]}")
print(f"  Features per sample: {X_train.shape[1]}")
print(f"  Number of classes: {n_classes}")
print(f"  Class distribution: {np.bincount(y)}")

# Basic dataset statistics
print(f"\nDataset statistics:")
print(f"  Feature mean: {X_train.mean():.3f}")
print(f"  Feature std: {X_train.std():.3f}")
print(f"  Feature range: [{X_train.min():.3f}, {X_train.max():.3f}]")
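# --- Added sketch (not in the original run): stratification check --------
# Because both splits above pass `stratify=`, per-split class counts should
# stay roughly proportional. This reuses only variables defined above.
for split_name, split_labels in [('train', y_train), ('val', y_val), ('test', y_test)]:
    split_counts = np.bincount(split_labels, minlength=n_classes)
    print(f"  {split_name} class counts: {split_counts.tolist()}")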
=>PYTHONTEX#py#default#default#2#code#####256#
# Define and train multiple model architectures
models = {}
model_results = {}

# 1. Logistic Regression (baseline)
models['logistic'] = LogisticRegression(random_state=42, max_iter=1000)

# 2. Random Forest (tree-based baseline)
models['random_forest'] = RandomForestClassifier(n_estimators=100, random_state=42)

# 3. Support Vector Machine
models['svm'] = SVC(kernel='rbf', random_state=42, probability=True)

# 4. Multi-Layer Perceptron (Neural Network)
models['mlp'] = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate='adaptive',
    max_iter=500,
    random_state=42
)

print("Model architectures defined:")
for name, model in models.items():
    print(f"  {name.replace('_', r'\_')}: {type(model).__name__}")

# Train all models and collect results
for name, model in models.items():
    print(f"\nTraining {name.replace('_', r'\_')}...")

    # Train model
    model.fit(X_train_scaled, y_train)

    # Make predictions
    train_pred = model.predict(X_train_scaled)
    val_pred = model.predict(X_val_scaled)
    test_pred = model.predict(X_test_scaled)

    # Calculate metrics
    train_acc = accuracy_score(y_train, train_pred)
    val_acc = accuracy_score(y_val, val_pred)
    test_acc = accuracy_score(y_test, test_pred)

    # Cross-validation score
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')

    model_results[name] = {
        'model': model,
        'train_accuracy': train_acc,
        'val_accuracy': val_acc,
        'test_accuracy': test_acc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'train_pred': train_pred,
        'val_pred': val_pred,
        'test_pred': test_pred
    }

    print(f"  Training accuracy: {train_acc:.4f}")
    print(f"  Validation accuracy: {val_acc:.4f}")
    print(f"  Test accuracy: {test_acc:.4f}")
    print(f"  CV accuracy: {cv_scores.mean():.4f} $\\pm$ {cv_scores.std():.4f}")
=>PYTHONTEX#py#default#default#3#code#####327#
# Generate learning curves for the MLP model
mlp_model = models['mlp']

# Compute learning curves
train_sizes, train_scores, val_scores = learning_curve(
    mlp_model, X_train_scaled, y_train,
    cv=5, n_jobs=1,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy',
    random_state=42
)

# Calculate mean and std for plotting
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

print("Learning curve analysis completed")
print(f"Training sizes evaluated: {train_sizes}")
print(f"Final training score: {train_mean[-1]:.4f} $\\pm$ {train_std[-1]:.4f}")
print(f"Final validation score: {val_mean[-1]:.4f} $\\pm$ {val_std[-1]:.4f}")
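# --- Added sketch (not in the original run): generalization gap ----------
# The gap between mean training and validation scores is a rough
# overfitting indicator; it reuses the arrays computed just above.
gap = train_mean - val_mean
print(f"Generalization gap at largest training size: {gap[-1]:.4f}")
print(f"Largest gap across training sizes: {gap.max():.4f}")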
=>PYTHONTEX#py#default#default#4#code#####360#
# Create comprehensive model comparison visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Machine Learning Model Performance Analysis', fontsize=16, fontweight='bold')

# Extract data for plotting
model_names = list(model_results.keys())
train_accs = [model_results[name]['train_accuracy'] for name in model_names]
val_accs = [model_results[name]['val_accuracy'] for name in model_names]
test_accs = [model_results[name]['test_accuracy'] for name in model_names]
cv_means = [model_results[name]['cv_mean'] for name in model_names]
cv_stds = [model_results[name]['cv_std'] for name in model_names]

# 1. Accuracy comparison bar plot
ax1 = axes[0, 0]
x_pos = np.arange(len(model_names))
width = 0.25

bars1 = ax1.bar(x_pos - width, train_accs, width, label='Training', alpha=0.8)
bars2 = ax1.bar(x_pos, val_accs, width, label='Validation', alpha=0.8)
bars3 = ax1.bar(x_pos + width, test_accs, width, label='Test', alpha=0.8)

ax1.set_xlabel('Models')
ax1.set_ylabel('Accuracy')
ax1.set_title('Model Accuracy Comparison')
ax1.set_xticks(x_pos)
ax1.set_xticklabels([name.replace('_', ' ').title() for name in model_names], rotation=45)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Add value labels on bars
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        ax1.annotate(f'{height:.3f}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3),  # 3 points vertical offset
                     textcoords="offset points",
                     ha='center', va='bottom', fontsize=8)

# 2. Cross-validation scores with error bars
ax2 = axes[0, 1]
bars = ax2.bar(model_names, cv_means, yerr=cv_stds, capsize=5, alpha=0.8, color='skyblue')
ax2.set_xlabel('Models')
ax2.set_ylabel('Cross-Validation Accuracy')
ax2.set_title('Cross-Validation Performance')
# Fix the tick labels warning by setting ticks first
ax2.set_xticks(range(len(model_names)))
ax2.set_xticklabels([name.replace('_', ' ').title() for name in model_names], rotation=45)
ax2.grid(True, alpha=0.3)

# 3. Learning curves for MLP
ax3 = axes[1, 0]
ax3.plot(train_sizes, train_mean, 'o-', color='blue', label='Training Score')
ax3.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
ax3.plot(train_sizes, val_mean, 'o-', color='red', label='Validation Score')
ax3.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='red')
ax3.set_xlabel('Training Set Size')
ax3.set_ylabel('Accuracy Score')
ax3.set_title('Learning Curves (MLP)')
ax3.legend()
ax3.grid(True, alpha=0.3)
# 4. Confusion matrix for best model
best_model_name = max(model_results.keys(), key=lambda k: model_results[k]['test_accuracy'])
best_test_pred = model_results[best_model_name]['test_pred']
cm = confusion_matrix(y_test, best_test_pred)

ax4 = axes[1, 1]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax4)
ax4.set_xlabel('Predicted Label')
ax4.set_ylabel('True Label')
ax4.set_title(f'Confusion Matrix - {best_model_name.replace("_", " ").title()}')

import os
os.makedirs('figures', exist_ok=True)
# Apply the layout before saving (a premature duplicate savefig was removed)
plt.tight_layout()
plt.savefig('figures/model_comparison.pdf', dpi=300, bbox_inches='tight')
plt.close()

print(f"Best performing model: {best_model_name.replace('_', r'\_')} (Test accuracy: {model_results[best_model_name]['test_accuracy']:.4f})")
=>PYTHONTEX#py#default#default#5#code#####458#
# Fix incomplete import in the preamble pycode (elsewhere):
#   from sklearn.metrics import accuracy_score, classification_
# -> should be:
#   from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hyperparameter analysis for neural network
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from pprint import pformat

def verbprint(s):
    print(r'{\small')
    print(r'\begin{verbatim}')
    print(s)
    print(r'\end{verbatim}')
    print(r'}')

def format_params_compact(params_dict):
    """Format parameter dictionary in a compact, abbreviated form"""
    parts = []
    for key, value in params_dict.items():
        # Abbreviate common parameter names
        if key == 'hidden_layer_sizes':
            key_abbr = 'layers'
        elif key == 'learning_rate_init':
            key_abbr = 'lr'
        elif key == 'alpha':
            key_abbr = 'alpha'
        else:
            key_abbr = key[:8]  # Truncate long keys
        parts.append(f"{key_abbr}={value}")
    return ", ".join(parts)

print("Hyperparameter optimization for MLP:")

param_grid = {
    'hidden_layer_sizes': [(64,), (128,), (64, 32), (128, 64), (128, 64, 32)],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'alpha': [0.0001, 0.001, 0.01]
}

mlp_grid = MLPClassifier(max_iter=300, random_state=42, early_stopping=True)
grid_search = GridSearchCV(mlp_grid, param_grid, cv=3, scoring='accuracy', n_jobs=1)

subset_size = 500
X_subset = X_train_scaled[:subset_size]
y_subset = y_train[:subset_size]

grid_search.fit(X_subset, y_subset)

print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
print("Best parameters:")
verbprint(pformat(grid_search.best_params_))

results_df = pd.DataFrame(grid_search.cv_results_)
print("\nTop 5 parameter combinations:")
top_results = results_df.nlargest(5, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score']]
for _, row in top_results.iterrows():
    params_str = format_params_compact(row['params'])
    verbprint(f"{params_str} : {row['mean_test_score']:.4f} $\\pm$ {row['std_test_score']:.4f}")

rf_model = models['random_forest']
feature_importance = rf_model.feature_importances_

print("\nRandom Forest feature importance analysis:")
print(f"  Top feature importance: {feature_importance.max():.4f}")
print(f"  Mean feature importance: {feature_importance.mean():.4f}")
print(f"  Features with zero importance: {np.sum(feature_importance == 0)}")
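# --- Added sketch (not in the original run): held-out check of the tuned MLP
# GridSearchCV refits the best estimator on the fitted subset by default
# (refit=True), so it can be scored directly on the untouched test split.
tuned_test_acc = accuracy_score(y_test, grid_search.best_estimator_.predict(X_test_scaled))
print(f"Tuned MLP test accuracy (fit on {subset_size}-sample subset): {tuned_test_acc:.4f}")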
=>PYTHONTEX#py#default#default#6#code#####537#
# Create advanced analysis visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Advanced Model Analysis and Insights', fontsize=16, fontweight='bold')

# 1. Feature importance (Random Forest)
ax1 = axes[0, 0]
top_k = min(20, len(feature_importance))
top_features_idx = np.argsort(feature_importance)[-top_k:][::-1]
top_features_importance = feature_importance[top_features_idx]
y_pos = np.arange(top_k)  # renamed from `y` to avoid shadowing the label array
ax1.barh(y_pos, top_features_importance, color='tab:blue')
ax1.set_yticks(y_pos)
ax1.set_yticklabels([f'Feature {idx}' for idx in top_features_idx])
ax1.invert_yaxis()  # largest at top
ax1.set_xlabel('Feature Importance')
ax1.set_ylabel('Feature')
ax1.set_title(f'Top {top_k} Feature Importances (Random Forest)')

# 2. Model prediction confidence analysis
ax2 = axes[0, 1]
# Get prediction probabilities for models that support it
if hasattr(models['mlp'], 'predict_proba'):
    mlp_proba = models['mlp'].predict_proba(X_test_scaled)
    max_proba = np.max(mlp_proba, axis=1)
    ax2.hist(max_proba, bins=20, alpha=0.7, color='lightcoral', edgecolor='black')
    ax2.axvline(max_proba.mean(), color='red', linestyle='--', linewidth=2,
                label=f'Mean: {max_proba.mean():.3f}')
    ax2.set_xlabel('Maximum Prediction Probability')
    ax2.set_ylabel('Number of Samples')
    ax2.set_title('Prediction Confidence Distribution (MLP)')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

# 3. Performance vs model complexity
ax3 = axes[1, 0]
complexity_measures = [
    len(models['logistic'].coef_[0]),           # Number of parameters (approximation)
    models['random_forest'].n_estimators * 10,  # Rough complexity measure
    1000,                                       # SVM complexity (approximation)
    sum([layer[0] * layer[1] if isinstance(layer, tuple) else layer
         for layer in [(64, 128), (128, 64), (64, 32), 32]])  # MLP parameters
]
test_accuracies = [model_results[name]['test_accuracy'] for name in model_names]

ax3.scatter(complexity_measures, test_accuracies, s=100, alpha=0.7,
            c=range(len(model_names)), cmap='viridis')
for i, name in enumerate(model_names):
    ax3.annotate(name.replace('_', ' ').title(),
                 (complexity_measures[i], test_accuracies[i]),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)
ax3.set_xlabel('Model Complexity (approximate)')
ax3.set_ylabel('Test Accuracy')
ax3.set_title('Performance vs Model Complexity')
ax3.grid(True, alpha=0.3)

# 4. Training vs validation accuracy (overfitting analysis)
ax4 = axes[1, 1]
train_accs_plot = [model_results[name]['train_accuracy'] for name in model_names]
val_accs_plot = [model_results[name]['val_accuracy'] for name in model_names]

ax4.scatter(train_accs_plot, val_accs_plot, s=100, alpha=0.7,
            c=range(len(model_names)), cmap='plasma')

# Add diagonal line for perfect generalization
min_acc = min(min(train_accs_plot), min(val_accs_plot)) - 0.01
max_acc = max(max(train_accs_plot), max(val_accs_plot)) + 0.01
ax4.plot([min_acc, max_acc], [min_acc, max_acc], 'k--', alpha=0.5, label='Perfect Generalization')

for i, name in enumerate(model_names):
    ax4.annotate(name.replace('_', ' ').title(),
                 (train_accs_plot[i], val_accs_plot[i]),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)

ax4.set_xlabel('Training Accuracy')
ax4.set_ylabel('Validation Accuracy')
ax4.set_title('Overfitting Analysis')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('figures/advanced_analysis.pdf', dpi=300, bbox_inches='tight')
plt.close()
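# --- Added sketch (not in the original run): per-class report -----------
# `classification_report` is already imported, and `best_model_name` /
# `best_test_pred` persist from the earlier comparison block in this session.
print(f"\nPer-class metrics for the best model ({best_model_name}):")
print(classification_report(y_test, best_test_pred, digits=3))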
=>PYTHONTEX#py#default#default#7#i#####641#
f"{model_results['mlp']['test_accuracy']:.3f}"
=>PYTHONTEX:SETTINGS#
version=0.18
outputdir=pythontex-files-main
workingdir=.
workingdirset=false
gobble=none
rerun=default
hashdependencies=default
makestderr=false
stderrfilename=full
keeptemps=none
pyfuture=default
pyconfuture=none
pygments=true
pygglobal=:GLOBAL||
fvextfile=-1
pyconbanner=none
pyconfilename=stdin
depythontex=false
pygfamily=py|python3|
pygfamily=pycon|pycon|
pygfamily=sympy|python3|
pygfamily=sympycon|pycon|
pygfamily=pylab|python3|
pygfamily=pylabcon|pycon|