Contact Us!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
Avatar for stephanie's main branch.

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

| Download

"Guiding Future STEM Leaders through Innovative Research Training" ~ thinkingbeyond.education

Views: 1148
Image: ubuntu2204
Kernel: Python 3
import numpy as np import pandas as pd from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.naive_bayes import GaussianNB from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score import time
# Synthetic binary classification problem: 1000 samples and 10 features, of
# which 5 are informative and 2 are redundant; flip_y=0.1 adds label noise.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_clusters_per_class=1,
    flip_y=0.1,
    random_state=42,
)

# Split BEFORE scaling so that the scaler's mean and variance are learned
# from the training rows only. Fitting the scaler on the full data set leaks
# test-set statistics into training. The split depends only on the number of
# samples and random_state, so the same rows land in each partition as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so that any later cell still referring to X_scaled keeps working; it
# is the full data set transformed with the training-set statistics.
X_scaled = scaler.transform(X)
def add_outliers(X, y, outlier_fraction=0.1, random_state=42, low=-10, high=10):
    """Append uniformly random points with random binary labels to a data set.

    Args:
        X: 2-D feature array; its rows are kept unchanged at the top of the
            returned feature array.
        y: 1-D label array with one entry per row of ``X``.
        outlier_fraction: Number of outliers to add, expressed as a fraction
            of the rows in ``X`` (the product is truncated with ``int``).
        random_state: Seed passed to ``np.random.RandomState``. The default
            of 42 reproduces the values generated by earlier versions.
        low: Lower bound (inclusive) of the uniform range for outlier features.
        high: Upper bound (exclusive) of the uniform range for outlier features.

    Returns:
        A tuple ``(X_with_outliers, y_with_outliers)`` where the outliers and
        their labels are stacked after the original rows.
    """
    n_outliers = int(outlier_fraction * X.shape[0])
    rng = np.random.RandomState(random_state)

    # Draw the features first and the labels second: the order of the draws
    # determines the generated values for a given seed.
    outliers = rng.uniform(low=low, high=high, size=(n_outliers, X.shape[1]))
    outlier_labels = rng.randint(0, 2, size=n_outliers)  # random 0/1 labels

    X_with_outliers = np.vstack([X, outliers])
    y_with_outliers = np.hstack([y, outlier_labels])
    return X_with_outliers, y_with_outliers
# Contaminated copy of the training set: the original rows followed by
# randomly generated outliers amounting to 10% of the training rows.
X_train_outliers, y_train_outliers = add_outliers(
    X_train,
    y_train,
    outlier_fraction=0.1,
)
def evaluate_classifier(name, clf, X_train, y_train, X_test, y_test):
    """Fit a classifier, score it on the test set, and report the metrics.

    Args:
        name: Label used in the printed header and in the returned record.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        A dict with the keys "Classifier", "Accuracy", "F1 Score", "Recall",
        "Precision" and "Training Time (s)".
    """
    print(f"### {name} ###")

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() follows the system wall clock,
    # which can be adjusted while the fit is running.
    start_time = time.perf_counter()
    clf.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    y_pred = clf.predict(X_test)

    # Calculate metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    print(f"Accuracy: {acc:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Training Time: {train_time:.4f} seconds\n")

    return {
        "Classifier": name,
        "Accuracy": acc,
        "F1 Score": f1,
        "Recall": recall,
        "Precision": precision,
        "Training Time (s)": train_time,
    }
# (display name, estimator) pairs to compare. Estimators that use randomness
# while fitting get a fixed random_state so that the with/without-outlier
# comparison is reproducible from run to run.
# NOTE(review): probability=True is kept as in the original, but nothing
# visible here calls predict_proba; per the scikit-learn docs it slows down
# fit, which inflates the reported SVM training time. Confirm whether it is
# needed.
classifiers = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("SVM with RBF Kernel", SVC(kernel="rbf", probability=True, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("Naive Bayes", GaussianNB()),
]
# One metrics record per classifier for each training scenario. Both lists
# follow the order of `classifiers`, so row i in one matches row i in the other.
results_no_outliers, results_with_outliers = [], []

for name, clf in classifiers:
    # Baseline: train on the clean training set.
    print(f"Evaluating {name} without outliers...")
    clean_metrics = evaluate_classifier(
        name, clf, X_train, y_train, X_test, y_test
    )
    results_no_outliers.append(clean_metrics)

    # Same estimator object, refitted on the contaminated training set and
    # scored on the same clean test set.
    print(f"Evaluating {name} with outliers...")
    contaminated_metrics = evaluate_classifier(
        name, clf, X_train_outliers, y_train_outliers, X_test, y_test
    )
    results_with_outliers.append(contaminated_metrics)
Evaluating Logistic Regression without outliers... ### Logistic Regression ### Accuracy: 0.94 F1 Score: 0.94 Recall: 0.93 Precision: 0.96 Training Time: 0.0078 seconds Evaluating Logistic Regression with outliers... ### Logistic Regression ### Accuracy: 0.87 F1 Score: 0.87 Recall: 0.79 Precision: 0.97 Training Time: 0.0053 seconds Evaluating SVM with RBF Kernel without outliers... ### SVM with RBF Kernel ### Accuracy: 0.95 F1 Score: 0.96 Recall: 0.95 Precision: 0.96 Training Time: 0.1233 seconds Evaluating SVM with RBF Kernel with outliers... ### SVM with RBF Kernel ### Accuracy: 0.94 F1 Score: 0.95 Recall: 0.93 Precision: 0.96 Training Time: 0.1549 seconds Evaluating Decision Tree without outliers... ### Decision Tree ### Accuracy: 0.83 F1 Score: 0.83 Recall: 0.78 Precision: 0.90 Training Time: 0.0156 seconds Evaluating Decision Tree with outliers... ### Decision Tree ### Accuracy: 0.86 F1 Score: 0.87 Recall: 0.85 Precision: 0.89 Training Time: 0.0197 seconds Evaluating Random Forest without outliers... ### Random Forest ### Accuracy: 0.95 F1 Score: 0.95 Recall: 0.94 Precision: 0.96 Training Time: 0.4334 seconds Evaluating Random Forest with outliers... ### Random Forest ### Accuracy: 0.95 F1 Score: 0.96 Recall: 0.95 Precision: 0.96 Training Time: 0.3818 seconds Evaluating Gradient Boosting without outliers... ### Gradient Boosting ### Accuracy: 0.94 F1 Score: 0.95 Recall: 0.93 Precision: 0.97 Training Time: 0.4897 seconds Evaluating Gradient Boosting with outliers... ### Gradient Boosting ### Accuracy: 0.94 F1 Score: 0.95 Recall: 0.93 Precision: 0.97 Training Time: 0.5542 seconds Evaluating Naive Bayes without outliers... ### Naive Bayes ### Accuracy: 0.89 F1 Score: 0.89 Recall: 0.81 Precision: 0.98 Training Time: 0.0036 seconds Evaluating Naive Bayes with outliers... ### Naive Bayes ### Accuracy: 0.72 F1 Score: 0.78 Recall: 0.95 Precision: 0.67 Training Time: 0.0029 seconds
# One row per classifier, one column per key of the metrics records.
df_no_outliers = pd.DataFrame(results_no_outliers)
df_with_outliers = pd.DataFrame(results_with_outliers)

# Outlier sensitivity = (metric with outliers) - (metric without outliers).
# Training time is differenced as well: previously it was copied unchanged
# from the with-outliers run, so the "difference" table showed a raw time
# under an unchanged column name.
value_columns = ["Accuracy", "F1 Score", "Recall", "Precision", "Training Time (s)"]

# The copy keeps the "Classifier" column; both frames are built in the same
# loop, so their rows line up by position and index.
df_difference = df_with_outliers.copy()
df_difference[value_columns] = (
    df_with_outliers[value_columns] - df_no_outliers[value_columns]
)
df_difference = df_difference.rename(
    columns={
        "Accuracy": "Accuracy Change",
        "F1 Score": "F1 Score Change",
        "Recall": "Recall Change",
        "Precision": "Precision Change",
        "Training Time (s)": "Training Time Change (s)",
    }
)
# Plain-text dump of the three tables, each preceded by its heading.
report_sections = (
    ("\n### Metrics Without Outliers ###\n", df_no_outliers),
    ("\n### Metrics With Outliers ###\n", df_with_outliers),
    ("\n### Outlier Sensitivity (Difference in Metrics) ###\n", df_difference),
)
for heading, frame in report_sections:
    print(heading)
    print(frame)
### Metrics Without Outliers ### Classifier Accuracy F1 Score Recall Precision \ 0 Logistic Regression 0.940 0.942857 0.925234 0.961165 1 SVM with RBF Kernel 0.955 0.957746 0.953271 0.962264 2 Decision Tree 0.835 0.834171 0.775701 0.902174 3 Random Forest 0.950 0.952830 0.943925 0.961905 4 Gradient Boosting 0.945 0.947368 0.925234 0.970588 5 Naive Bayes 0.890 0.887755 0.813084 0.977528 Training Time (s) 0 0.007818 1 0.123307 2 0.015565 3 0.433434 4 0.489675 5 0.003552 ### Metrics With Outliers ### Classifier Accuracy F1 Score Recall Precision \ 0 Logistic Regression 0.870 0.865979 0.785047 0.965517 1 SVM with RBF Kernel 0.945 0.947867 0.934579 0.961538 2 Decision Tree 0.865 0.870813 0.850467 0.892157 3 Random Forest 0.955 0.957746 0.953271 0.962264 4 Gradient Boosting 0.945 0.947368 0.925234 0.970588 5 Naive Bayes 0.720 0.784615 0.953271 0.666667 Training Time (s) 0 0.005347 1 0.154885 2 0.019695 3 0.381819 4 0.554216 5 0.002920 ### Outlier Sensitivity (Difference in Metrics) ### Classifier Accuracy Change F1 Score Change Recall Change \ 0 Logistic Regression -0.070 -0.076878 -0.140187 1 SVM with RBF Kernel -0.010 -0.009879 -0.018692 2 Decision Tree 0.030 0.036643 0.074766 3 Random Forest 0.005 0.004916 0.009346 4 Gradient Boosting 0.000 0.000000 0.000000 5 Naive Bayes -0.170 -0.103140 0.140187 Precision Change Training Time (s) 0 0.004352 0.005347 1 -0.000726 0.154885 2 -0.010017 0.019695 3 0.000359 0.381819 4 0.000000 0.554216 5 -0.310861 0.002920
from IPython.display import display

# Rich (HTML) rendering of the baseline table for notebook front ends.
print(
    "\n### Results in Colab-Friendly Format ###",
    "\nMetrics Without Outliers:",
    sep="\n",
)
display(df_no_outliers)
### Results in Colab-Friendly Format ### Metrics Without Outliers:
# Rich rendering of the metrics obtained after training on contaminated data.
print("\nMetrics With Outliers:")
display(df_with_outliers)
Metrics With Outliers:
# Rich rendering of the per-classifier change table.
print("\nOutlier Sensitivity (Change in Metrics):")
display(df_difference)
Outlier Sensitivity (Change in Metrics):