GitHub Repository: YStrano/DataScience_GA
Path: blob/master/lessons/lesson_10-sub-Jacob_Koehler/02-BankMarketing-solutions - done.ipynb
Kernel: Python 3
import pandas as pd
bank = pd.read_csv('data/bank_marketing.csv')
bank.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 22 columns):
Unnamed: 0        4119 non-null int64
age               4119 non-null int64
job               4119 non-null object
marital           4119 non-null object
education         4119 non-null object
default           4119 non-null object
housing           4119 non-null object
loan              4119 non-null object
contact           4119 non-null object
month             4119 non-null object
day_of_week       4119 non-null object
duration          4119 non-null int64
campaign          4119 non-null int64
pdays             4119 non-null int64
previous          4119 non-null int64
poutcome          4119 non-null object
emp.var.rate      4119 non-null float64
cons.price.idx    4119 non-null float64
cons.conf.idx     4119 non-null float64
euribor3m         4119 non-null float64
nr.employed       4119 non-null float64
y                 4119 non-null int64
dtypes: float64(5), int64(7), object(10)
memory usage: 708.0+ KB
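# Optional: the 'Unnamed: 0' column above is just the CSV's saved row index. A
# sketch (assuming the same file layout) that keeps it out of the DataFrame:
# bank = pd.read_csv('data/bank_marketing.csv', index_col=0)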
X = bank[['age', 'job', 'education', 'day_of_week']]
y = bank['y']
X.describe(include='all')  # without include='all', describe() only summarizes the numeric columns
X.job.value_counts()
admin.           1012
blue-collar       884
technician        691
services          393
management        324
retired           166
self-employed     159
entrepreneur      148
unemployed        111
housemaid         110
student            82
unknown            39
Name: job, dtype: int64
X.education.value_counts()
university.degree      1264
high.school             921
basic.9y                574
professional.course     535
basic.4y                429
basic.6y                228
unknown                 167
illiterate                1
Name: education, dtype: int64
X.education.value_counts().sum() / X.education.value_counts()
university.degree         3.258703
high.school               4.472313
basic.9y                  7.175958
professional.course       7.699065
basic.4y                  9.601399
basic.6y                 18.065789
unknown                  24.664671
illiterate             4119.000000
Name: education, dtype: float64
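# The cell above divides the total row count by each category's count, i.e. the
# inverse of each category's share. The shares themselves can be read directly
# with value_counts(normalize=True); a quick sketch:
X.education.value_counts(normalize=True)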
X.day_of_week.value_counts()
thu    860
mon    855
tue    841
wed    795
fri    768
Name: day_of_week, dtype: int64
bank_dummies = pd.get_dummies(bank[['age', 'job', 'education', 'day_of_week']], drop_first=True)  # equivalently: pd.get_dummies(X, drop_first=True), since X is already this column subset
bank_dummies.head()
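# drop_first=True drops one level of each categorical column so the dummies are
# not perfectly collinear. A quick check of what was produced:
print(bank_dummies.shape)
print(bank_dummies.columns.tolist())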
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(bank_dummies, y)
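# Because y is heavily imbalanced (~89% zeros), a stratified split keeps the 0/1
# ratio the same in train and test. A sketch of that alternative call (not what
# was run above; random_state fixed only for reproducibility):
# X_train, X_test, y_train, y_test = train_test_split(
#     bank_dummies, y, stratify=y, random_state=42)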
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
dummy = DummyClassifier()
lgr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier(n_neighbors=4)
y.value_counts()
0    3668
1     451
Name: y, dtype: int64
perc = y.value_counts()[0] / (y.value_counts()[0] + y.value_counts()[1])
perc  # the majority class (y = 0) makes up ~89% of the rows, so always predicting 0 would be right about 9 times out of 10
0.890507404709881
for model in [dummy, lgr_clf, knn_clf]:
    model.fit(X_train, y_train)
    print('The ', model, '\nhas accuracy\n', model.score(X_test, y_test), '\n')
The  DummyClassifier(constant=None, random_state=None, strategy='stratified')
has accuracy
 0.8116504854368932

The  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
has accuracy
 0.8815533980582524

The  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')
has accuracy
 0.8786407766990292
# dummy = DummyClassifier(strategy='most_frequent')  # switching from the default strategy='stratified' changes the baseline accuracy substantially (see the sketch below)
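# Sketch of that alternative baseline (dummy_mf is just a throwaway name here):
# a 'most_frequent' dummy always predicts the majority class, so its accuracy
# should land near the ~0.89 share computed earlier.
dummy_mf = DummyClassifier(strategy='most_frequent')
dummy_mf.fit(X_train, y_train)
dummy_mf.score(X_test, y_test)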
pd.DataFrame([bank_dummies.columns, lgr_clf.coef_[0]]).T
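# The cell above pairs each dummy column with its logistic-regression coefficient.
# Naming the columns and sorting makes the largest effects easier to read
# (coef_df is a new name introduced only for this sketch):
coef_df = pd.DataFrame({'feature': bank_dummies.columns, 'coef': lgr_clf.coef_[0]})
coef_df.sort_values('coef', ascending=False)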
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, lgr_clf.predict(X_test))
array([[908,   0],
       [122,   0]])
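# The matrix above shows every test row was predicted as class 0 (the right-hand
# column is all zeros), so recall on class 1 is 0 even though accuracy looks high.
# A per-class summary makes that explicit; a minimal sketch:
from sklearn.metrics import classification_report
print(classification_report(y_test, lgr_clf.predict(X_test)))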
# LogisticRegression?  # IPython help: check the docstring for the class_weight parameter
lgr_imbalanced = LogisticRegression(class_weight='balanced')
lgr_imbalanced.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight='balanced', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
lgr_imbalanced.score(X_test, y_test)
0.5640776699029126
confusion_matrix(y_test, lgr_imbalanced.predict(X_test))
array([[524, 384],
       [ 65,  57]])
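# With class_weight='balanced' the model now catches 57 of the 122 actual 1s, at
# the cost of 384 false positives among the 0s. Recall on the minority class can
# be read off directly; a minimal sketch:
from sklearn.metrics import recall_score
recall_score(y_test, lgr_imbalanced.predict(X_test))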
# Performance differs markedly between the two logistic regressions: class weights play a big role when the classes are imbalanced.
# Raw accuracy is misleading here: the unweighted model scores ~0.88 only by predicting the majority class, while class_weight='balanced' trades accuracy for actually catching some of the minority class.
# Class imbalance is a warning sign that accuracy alone will overstate how well a classifier is really performing.
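# Sketch of a fairer comparison: with imbalanced classes, a ranking metric such as
# ROC AUC (one reasonable choice, not the only one) separates the two models
# better than raw accuracy does.
from sklearn.metrics import roc_auc_score
for name, model in [('unweighted', lgr_clf), ('balanced', lgr_imbalanced)]:
    print(name, roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))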