ubuntu2404
=>PYTHONTEX#py#default#default#0#code#####62#
# Import comprehensive ML and data science libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
import seaborn as sns
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification, make_regression

# Try to import deep learning frameworks
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    pytorch_available = True
except ImportError:
    print("PyTorch not available - using sklearn alternatives")
    pytorch_available = False

try:
    import tensorflow as tf
    tensorflow_available = True
except ImportError:
    print("TensorFlow not available - using sklearn alternatives")
    tensorflow_available = False

# Set visualization parameters for ML plots
plt.style.use('seaborn-v0_8-whitegrid')
np.random.seed(42)
sns.set_palette("husl")

# ML-optimized figure settings
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams['savefig.pad_inches'] = 0.1
plt.rcParams['font.size'] = 11

print("Machine Learning environment initialized")
print("Available frameworks:")
print(f"  - NumPy: {np.__version__}")
print(f"  - Scikit-learn: available")
print(f"  - PyTorch: {pytorch_available}")
print(f"  - TensorFlow: {tensorflow_available}")
=>PYTHONTEX#py#default#default#1#code#####203#
# Generate synthetic image classification dataset
# In practice, this would load real image data (CIFAR-10, ImageNet, etc.)

# Create synthetic dataset with multiple classes
n_samples = 2000
n_features = 64  # Simulating flattened 8x8 images
n_classes = 5
n_informative = 40

X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=10,
    n_classes=n_classes,
    n_clusters_per_class=1,
    random_state=42
)

# Reshape to simulate image-like structure (for visualization)
image_height, image_width = 8, 8
X_images = X.reshape(-1, image_height, image_width)

# Split into train/validation/test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print(f"Dataset created:")
print(f"  Training samples: {X_train.shape[0]}")
print(f"  Validation samples: {X_val.shape[0]}")
print(f"  Test samples: {X_test.shape[0]}")
print(f"  Features per sample: {X_train.shape[1]}")
print(f"  Number of classes: {n_classes}")
print(f"  Class distribution: {np.bincount(y)}")

# Basic dataset statistics
print(f"\nDataset statistics:")
print(f"  Feature mean: {X_train.mean():.3f}")
print(f"  Feature std: {X_train.std():.3f}")
print(f"  Feature range: [{X_train.min():.3f}, {X_train.max():.3f}]")
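# --- Added sketch (not in the original run): stratification check --------
# Because both splits above pass `stratify=`, per-split class counts should
# stay roughly proportional. This reuses only variables defined above.
for split_name, split_labels in [('train', y_train), ('val', y_val), ('test', y_test)]:
    split_counts = np.bincount(split_labels, minlength=n_classes)
    print(f"  {split_name} class counts: {split_counts.tolist()}")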
=>PYTHONTEX#py#default#default#2#code#####256#
# Define and train multiple model architectures
models = {}
model_results = {}

# 1. Logistic Regression (baseline)
models['logistic'] = LogisticRegression(random_state=42, max_iter=1000)

# 2. Random Forest (tree-based baseline)
models['random_forest'] = RandomForestClassifier(n_estimators=100, random_state=42)

# 3. Support Vector Machine
models['svm'] = SVC(kernel='rbf', random_state=42, probability=True)

# 4. Multi-Layer Perceptron (Neural Network)
models['mlp'] = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate='adaptive',
    max_iter=500,
    random_state=42
)

print("Model architectures defined:")
for name, model in models.items():
    print(f"  {name.replace('_', r'\_')}: {type(model).__name__}")

# Train all models and collect results
for name, model in models.items():
    print(f"\nTraining {name.replace('_', r'\_')}...")

    # Train model
    model.fit(X_train_scaled, y_train)

    # Make predictions
    train_pred = model.predict(X_train_scaled)
    val_pred = model.predict(X_val_scaled)
    test_pred = model.predict(X_test_scaled)

    # Calculate metrics
    train_acc = accuracy_score(y_train, train_pred)
    val_acc = accuracy_score(y_val, val_pred)
    test_acc = accuracy_score(y_test, test_pred)

    # Cross-validation score
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')

    model_results[name] = {
        'model': model,
        'train_accuracy': train_acc,
        'val_accuracy': val_acc,
        'test_accuracy': test_acc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'train_pred': train_pred,
        'val_pred': val_pred,
        'test_pred': test_pred
    }

    print(f"  Training accuracy: {train_acc:.4f}")
    print(f"  Validation accuracy: {val_acc:.4f}")
    print(f"  Test accuracy: {test_acc:.4f}")
    print(f"  CV accuracy: {cv_scores.mean():.4f} $\\pm$ {cv_scores.std():.4f}")
=>PYTHONTEX#py#default#default#3#code#####327#
# Generate learning curves for the MLP model
mlp_model = models['mlp']

# Compute learning curves
train_sizes, train_scores, val_scores = learning_curve(
    mlp_model, X_train_scaled, y_train,
    cv=5, n_jobs=1,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy',
    random_state=42
)

# Calculate mean and std for plotting
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

print("Learning curve analysis completed")
print(f"Training sizes evaluated: {train_sizes}")
print(f"Final training score: {train_mean[-1]:.4f} $\\pm$ {train_std[-1]:.4f}")
print(f"Final validation score: {val_mean[-1]:.4f} $\\pm$ {val_std[-1]:.4f}")
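# --- Added sketch (not in the original run): generalization gap ----------
# The gap between mean training and validation scores is a rough
# overfitting indicator; it reuses the arrays computed just above.
gap = train_mean - val_mean
print(f"Generalization gap at largest training size: {gap[-1]:.4f}")
print(f"Largest gap across training sizes: {gap.max():.4f}")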
=>PYTHONTEX#py#default#default#4#code#####360#
# Create comprehensive model comparison visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Machine Learning Model Performance Analysis', fontsize=16, fontweight='bold')

# Extract data for plotting
model_names = list(model_results.keys())
train_accs = [model_results[name]['train_accuracy'] for name in model_names]
val_accs = [model_results[name]['val_accuracy'] for name in model_names]
test_accs = [model_results[name]['test_accuracy'] for name in model_names]
cv_means = [model_results[name]['cv_mean'] for name in model_names]
cv_stds = [model_results[name]['cv_std'] for name in model_names]

# 1. Accuracy comparison bar plot
ax1 = axes[0, 0]
x_pos = np.arange(len(model_names))
width = 0.25

bars1 = ax1.bar(x_pos - width, train_accs, width, label='Training', alpha=0.8)
bars2 = ax1.bar(x_pos, val_accs, width, label='Validation', alpha=0.8)
bars3 = ax1.bar(x_pos + width, test_accs, width, label='Test', alpha=0.8)

ax1.set_xlabel('Models')
ax1.set_ylabel('Accuracy')
ax1.set_title('Model Accuracy Comparison')
ax1.set_xticks(x_pos)
ax1.set_xticklabels([name.replace('_', ' ').title() for name in model_names], rotation=45)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Add value labels on bars
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        ax1.annotate(f'{height:.3f}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3),  # 3 points vertical offset
                     textcoords="offset points",
                     ha='center', va='bottom', fontsize=8)

# 2. Cross-validation scores with error bars
ax2 = axes[0, 1]
bars = ax2.bar(model_names, cv_means, yerr=cv_stds, capsize=5, alpha=0.8, color='skyblue')
ax2.set_xlabel('Models')
ax2.set_ylabel('Cross-Validation Accuracy')
ax2.set_title('Cross-Validation Performance')
# Fix the tick labels warning by setting ticks first
ax2.set_xticks(range(len(model_names)))
ax2.set_xticklabels([name.replace('_', ' ').title() for name in model_names], rotation=45)
ax2.grid(True, alpha=0.3)

# 3. Learning curves for MLP
ax3 = axes[1, 0]
ax3.plot(train_sizes, train_mean, 'o-', color='blue', label='Training Score')
ax3.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
ax3.plot(train_sizes, val_mean, 'o-', color='red', label='Validation Score')
ax3.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='red')
ax3.set_xlabel('Training Set Size')
ax3.set_ylabel('Accuracy Score')
ax3.set_title('Learning Curves (MLP)')
ax3.legend()
ax3.grid(True, alpha=0.3)
# 4. Confusion matrix for best model
best_model_name = max(model_results.keys(), key=lambda k: model_results[k]['test_accuracy'])
best_test_pred = model_results[best_model_name]['test_pred']
cm = confusion_matrix(y_test, best_test_pred)

ax4 = axes[1, 1]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax4)
ax4.set_xlabel('Predicted Label')
ax4.set_ylabel('True Label')
ax4.set_title(f'Confusion Matrix - {best_model_name.replace("_", " ").title()}')

import os
os.makedirs('figures', exist_ok=True)
# Apply the layout before saving (a premature duplicate savefig was removed)
plt.tight_layout()
plt.savefig('figures/model_comparison.pdf', dpi=300, bbox_inches='tight')
plt.close()

print(f"Best performing model: {best_model_name.replace('_', r'\_')} (Test accuracy: {model_results[best_model_name]['test_accuracy']:.4f})")
=>PYTHONTEX#py#default#default#5#code#####458#
# Fix incomplete import in the preamble pycode (elsewhere):
#   from sklearn.metrics import accuracy_score, classification_
# -> should be:
#   from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hyperparameter analysis for neural network
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from pprint import pformat

def verbprint(s):
    print(r'{\small')
    print(r'\begin{verbatim}')
    print(s)
    print(r'\end{verbatim}')
    print(r'}')

def format_params_compact(params_dict):
    """Format parameter dictionary in a compact, abbreviated form"""
    parts = []
    for key, value in params_dict.items():
        # Abbreviate common parameter names
        if key == 'hidden_layer_sizes':
            key_abbr = 'layers'
        elif key == 'learning_rate_init':
            key_abbr = 'lr'
        elif key == 'alpha':
            key_abbr = 'alpha'
        else:
            key_abbr = key[:8]  # Truncate long keys
        parts.append(f"{key_abbr}={value}")
    return ", ".join(parts)

print("Hyperparameter optimization for MLP:")

param_grid = {
    'hidden_layer_sizes': [(64,), (128,), (64, 32), (128, 64), (128, 64, 32)],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'alpha': [0.0001, 0.001, 0.01]
}

mlp_grid = MLPClassifier(max_iter=300, random_state=42, early_stopping=True)
grid_search = GridSearchCV(mlp_grid, param_grid, cv=3, scoring='accuracy', n_jobs=1)

subset_size = 500
X_subset = X_train_scaled[:subset_size]
y_subset = y_train[:subset_size]

grid_search.fit(X_subset, y_subset)

print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
print("Best parameters:")
verbprint(pformat(grid_search.best_params_))

results_df = pd.DataFrame(grid_search.cv_results_)
print("\nTop 5 parameter combinations:")
top_results = results_df.nlargest(5, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score']]
for _, row in top_results.iterrows():
    params_str = format_params_compact(row['params'])
    verbprint(f"{params_str} : {row['mean_test_score']:.4f} $\\pm$ {row['std_test_score']:.4f}")

rf_model = models['random_forest']
feature_importance = rf_model.feature_importances_

print("\nRandom Forest feature importance analysis:")
print(f"  Top feature importance: {feature_importance.max():.4f}")
print(f"  Mean feature importance: {feature_importance.mean():.4f}")
print(f"  Features with zero importance: {np.sum(feature_importance == 0)}")
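# --- Added sketch (not in the original run): held-out check of the tuned MLP
# GridSearchCV refits the best estimator on the fitted subset by default
# (refit=True), so it can be scored directly on the untouched test split.
tuned_test_acc = accuracy_score(y_test, grid_search.best_estimator_.predict(X_test_scaled))
print(f"Tuned MLP test accuracy (fit on {subset_size}-sample subset): {tuned_test_acc:.4f}")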
=>PYTHONTEX#py#default#default#6#code#####537#
# Create advanced analysis visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Advanced Model Analysis and Insights', fontsize=16, fontweight='bold')

# 1. Feature importance (Random Forest)
ax1 = axes[0, 0]
top_k = min(20, len(feature_importance))
top_features_idx = np.argsort(feature_importance)[-top_k:][::-1]
top_features_importance = feature_importance[top_features_idx]
y_pos = np.arange(top_k)  # renamed from `y` to avoid shadowing the label array
ax1.barh(y_pos, top_features_importance, color='tab:blue')
ax1.set_yticks(y_pos)
ax1.set_yticklabels([f'Feature {idx}' for idx in top_features_idx])
ax1.invert_yaxis()  # largest at top
ax1.set_xlabel('Feature Importance')
ax1.set_ylabel('Feature')
ax1.set_title(f'Top {top_k} Feature Importances (Random Forest)')

# 2. Model prediction confidence analysis
ax2 = axes[0, 1]
# Get prediction probabilities for models that support it
if hasattr(models['mlp'], 'predict_proba'):
    mlp_proba = models['mlp'].predict_proba(X_test_scaled)
    max_proba = np.max(mlp_proba, axis=1)
    ax2.hist(max_proba, bins=20, alpha=0.7, color='lightcoral', edgecolor='black')
    ax2.axvline(max_proba.mean(), color='red', linestyle='--', linewidth=2,
                label=f'Mean: {max_proba.mean():.3f}')
    ax2.set_xlabel('Maximum Prediction Probability')
    ax2.set_ylabel('Number of Samples')
    ax2.set_title('Prediction Confidence Distribution (MLP)')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

# 3. Performance vs model complexity
ax3 = axes[1, 0]
complexity_measures = [
    len(models['logistic'].coef_[0]),           # Number of parameters (approximation)
    models['random_forest'].n_estimators * 10,  # Rough complexity measure
    1000,                                       # SVM complexity (approximation)
    sum([layer[0] * layer[1] if isinstance(layer, tuple) else layer
         for layer in [(64, 128), (128, 64), (64, 32), 32]])  # MLP parameters
]
test_accuracies = [model_results[name]['test_accuracy'] for name in model_names]

ax3.scatter(complexity_measures, test_accuracies, s=100, alpha=0.7,
            c=range(len(model_names)), cmap='viridis')
for i, name in enumerate(model_names):
    ax3.annotate(name.replace('_', ' ').title(),
                 (complexity_measures[i], test_accuracies[i]),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)
ax3.set_xlabel('Model Complexity (approximate)')
ax3.set_ylabel('Test Accuracy')
ax3.set_title('Performance vs Model Complexity')
ax3.grid(True, alpha=0.3)

# 4. Training vs validation accuracy (overfitting analysis)
ax4 = axes[1, 1]
train_accs_plot = [model_results[name]['train_accuracy'] for name in model_names]
val_accs_plot = [model_results[name]['val_accuracy'] for name in model_names]

ax4.scatter(train_accs_plot, val_accs_plot, s=100, alpha=0.7,
            c=range(len(model_names)), cmap='plasma')

# Add diagonal line for perfect generalization
min_acc = min(min(train_accs_plot), min(val_accs_plot)) - 0.01
max_acc = max(max(train_accs_plot), max(val_accs_plot)) + 0.01
ax4.plot([min_acc, max_acc], [min_acc, max_acc], 'k--', alpha=0.5, label='Perfect Generalization')

for i, name in enumerate(model_names):
    ax4.annotate(name.replace('_', ' ').title(),
                 (train_accs_plot[i], val_accs_plot[i]),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)

ax4.set_xlabel('Training Accuracy')
ax4.set_ylabel('Validation Accuracy')
ax4.set_title('Overfitting Analysis')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('figures/advanced_analysis.pdf', dpi=300, bbox_inches='tight')
plt.close()
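# --- Added sketch (not in the original run): per-class report -----------
# `classification_report` is already imported, and `best_model_name` /
# `best_test_pred` persist from the earlier comparison block in this session.
print(f"\nPer-class metrics for the best model ({best_model_name}):")
print(classification_report(y_test, best_test_pred, digits=3))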
=>PYTHONTEX#py#default#default#7#i#####641#
f"{model_results['mlp']['test_accuracy']:.3f}"
=>PYTHONTEX:SETTINGS#
version=0.18
outputdir=pythontex-files-main
workingdir=.
workingdirset=false
gobble=none
rerun=default
hashdependencies=default
makestderr=false
stderrfilename=full
keeptemps=none
pyfuture=default
pyconfuture=none
pygments=true
pygglobal=:GLOBAL||
fvextfile=-1
pyconbanner=none
pyconfilename=stdin
depythontex=false
pygfamily=py|python3|
pygfamily=pycon|pycon|
pygfamily=sympy|python3|
pygfamily=sympycon|pycon|
pygfamily=pylab|python3|
pygfamily=pylabcon|pycon|