GitHub Repository: suyashi29/python-su
Path: blob/master/ML Classification using Python/Hiring_DecisionTree_CaseStudy.ipynb
Kernel: Python 3 (ipykernel)

Hiring Decision Tree Case Study with GridSearchCV

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

plt.style.use('ggplot')

df = pd.read_csv('hiredata.csv')
df.head()
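Before handling missing values, a quick structural check confirms what was loaded; a minimal sketch reusing the same df (the exact columns and dtypes depend on hiredata.csv, which is not shown here):

# Quick look at the shape and column dtypes of the loaded data
print(df.shape)
df.info()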

Handle Missing Values (if any)

df.isnull().sum()
df = df.fillna(df.median(numeric_only=True))
df.isnull().sum()
YearsExperience       0
EducationLevel        0
SkillsScore           0
CertificationCount    0
Hired                 0
dtype: int64
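Note that df.median(numeric_only=True) only fills numeric columns. If the categorical EducationLevel column ever contained missing values, a mode fill would be the usual companion step; a hedged sketch (not needed for this file, where nothing is missing):

# Fill missing categorical values with the most frequent category (sketch only)
df['EducationLevel'] = df['EducationLevel'].fillna(df['EducationLevel'].mode()[0])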

Encode Categorical Column

le = LabelEncoder()
df['EducationLevelEncoded'] = le.fit_transform(df['EducationLevel'])
df.head()
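It can help to print the mapping the LabelEncoder has learned, since the integer codes follow alphabetical order rather than any natural ranking of education levels; a small sketch (the exact categories depend on the dataset):

# Show which integer code each education level received
mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(mapping)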

Exploratory Data Analysis

df.describe()
plt.figure(figsize=(8,4))
sns.countplot(data=df, x='EducationLevel', hue='Hired')
plt.xticks(rotation=45)
plt.show()
[Figure: count of candidates per EducationLevel, split by Hired]
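A correlation heatmap of the numeric columns is another quick EDA view; a minimal sketch reusing the columns already in df:

# Pairwise correlations between the numeric features and the target
plt.figure(figsize=(6,4))
sns.heatmap(df[['YearsExperience','SkillsScore','CertificationCount','Hired']].corr(),
            annot=True, cmap='coolwarm')
plt.title("Correlation of Numeric Features")
plt.show()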

Train-Test Split

X = df[['YearsExperience','EducationLevelEncoded','SkillsScore','CertificationCount']]
y = df['Hired']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
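If the Hired classes were noticeably imbalanced, passing stratify=y would keep the class ratio identical in the train and test sets; a sketch of that alternative call (the results below use the unstratified split above):

# Alternative: stratified split that preserves the Hired class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)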

GridSearchCV for Best Decision Tree

params = {
    'criterion': ['gini','entropy'],
    'max_depth': [2,3,4,5,6,7,8],
    'min_samples_split': [2,4,6,8,10]
}
grid = GridSearchCV(DecisionTreeClassifier(), params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
grid.best_params_, grid.best_score_
({'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2}, 0.9458333333333332)
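GridSearchCV keeps the full cross-validation results, which is useful for checking how close the runner-up parameter combinations were; a sketch of inspecting them:

# Top parameter combinations ranked by mean CV accuracy
cv_results = pd.DataFrame(grid.cv_results_)
print(cv_results[['params', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False)
      .head())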

Train Final Model

model = grid.best_estimator_
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 0.9291666666666667
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       145
           1       0.93      0.88      0.91        95

    accuracy                           0.93       240
   macro avg       0.93      0.92      0.93       240
weighted avg       0.93      0.93      0.93       240
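Since the tree exposes class probabilities, a ROC-AUC score is a natural complement to accuracy for this binary Hired/Not Hired problem; a minimal sketch on the same test set:

# Threshold-independent view of the same test-set performance
from sklearn.metrics import roc_auc_score
proba_test = model.predict_proba(X_test)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, proba_test))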

Confusion Matrix

sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.show()
[Figure: confusion matrix heatmap for the test set]
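For a version with readable class labels on the axes, recent scikit-learn versions provide ConfusionMatrixDisplay, which builds the same plot directly from the predictions; a sketch:

# Same confusion matrix with 'Not Hired' / 'Hired' tick labels
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, display_labels=['Not Hired', 'Hired'], cmap='Blues'
)
plt.show()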

Feature Importance

plt.figure(figsize=(6,4))
plt.barh(X.columns, model.feature_importances_)
plt.title("Feature Importance")
plt.show()
[Figure: horizontal bar chart of feature importances]
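The same importances read more precisely as a sorted table; a small sketch:

# Sorted numeric view of the bar chart above
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))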

Decision Tree Visualization

from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Train the model
# Note: this cell retrains a fresh tree on the full dataset (X, y) for visualization
# and reassigns `model`, so the cells below use this tree rather than grid.best_estimator_.
model = DecisionTreeClassifier(max_depth=4, random_state=42)
model.fit(X, y)

# Plot the tree
plt.figure(figsize=(20,10))
plot_tree(model,
          feature_names=['YearsExperience','EducationLevelEncoded','SkillsScore','CertificationCount'],
          class_names=['Not Hired','Hired'],
          filled=True, rounded=True, fontsize=12)
plt.show()
[Figure: plotted decision tree, depth 4]
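For a text-only view of the same tree, export_text prints the split rules without needing a figure; a sketch using the fitted model:

# Decision rules of the fitted tree as indented text
from sklearn.tree import export_text
print(export_text(model, feature_names=list(X.columns)))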
import pickle

# Save the trained model
pickle.dump(model, open("hiring_decision_tree_model.pkl", "wb"))
print("Model saved as hiring_decision_tree_model.pkl")
Model saved as hiring_decision_tree_model.pkl
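joblib is a common alternative to pickle for persisting scikit-learn estimators, especially larger ones; a hedged sketch (the .joblib filename here is just an illustrative choice, not part of the original notebook):

# Alternative persistence with joblib
import joblib
joblib.dump(model, "hiring_decision_tree_model.joblib")
restored = joblib.load("hiring_decision_tree_model.joblib")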

Add a Prediction Function (User Input → Model Output)

  • This function takes candidate details and returns Hired / Not Hired.

def predict_hiring(years_exp, education_level, skills_score, cert_count):
    # Encode education level
    education_encoded = le.transform([education_level])[0]

    # Prepare input row
    input_data = pd.DataFrame([{
        "YearsExperience": years_exp,
        "EducationLevelEncoded": education_encoded,
        "SkillsScore": skills_score,
        "CertificationCount": cert_count
    }])

    # Predict
    pred = model.predict(input_data)[0]
    proba = model.predict_proba(input_data)[0]

    return {
        "Prediction": "Hired" if pred == 1 else "Not Hired",
        "Probability_Not_Hired": round(proba[0], 3),
        "Probability_Hired": round(proba[1], 3)
    }

# Example usage:
predict_hiring(5, "Graduate", 78, 2)
{'Prediction': 'Hired', 'Probability_Not_Hired': 0.085, 'Probability_Hired': 0.915}
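The same encoder and model can score several candidates at once by building a small DataFrame; a sketch where the candidate rows are made-up examples (the EducationLevel strings must match categories present in hiredata.csv):

# Batch-score a few hypothetical candidates
candidates = pd.DataFrame({
    "YearsExperience":    [2, 7],
    "EducationLevel":     ["Graduate", "Post-Graduate"],
    "SkillsScore":        [60, 90],
    "CertificationCount": [1, 4],
})
candidates["EducationLevelEncoded"] = le.transform(candidates["EducationLevel"])
features = candidates[["YearsExperience", "EducationLevelEncoded", "SkillsScore", "CertificationCount"]]
candidates["PredictedHired"] = model.predict(features)
print(candidates)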

Add Code to Load the Pickle File Later

loaded_model = pickle.load(open("hiring_decision_tree_model.pkl", "rb"))

# Test prediction
# Note: predict_hiring still uses the in-memory `model`; assign model = loaded_model
# to make the function predict with the unpickled estimator instead.
predict_hiring(6, "Post-Graduate", 85, 3)
{'Prediction': 'Hired', 'Probability_Not_Hired': 0.031, 'Probability_Hired': 0.969}