Path: blob/master/unbalanced/unbalanced_code/unbalanced.R
# logistic regression

# environment setting
library(ROCR)
library(grid)
library(broom)
library(caret)
library(tidyr)
library(dplyr)
library(scales)
library(ggplot2)
library(ggthemr)
library(ggthemes)
library(gridExtra)
library(data.table)
setwd("/Users/mingyuliu/personal/machine-learning/unbalanced")

# read in the HR dataset
data <- fread( list.files( "data", full.names = TRUE )[2] )
str(data)

# use summary to check if any columns contain missing values such as NAs
summary(data)

# find highly correlated columns to exclude from the model
findCorrelation( cor(data), cutoff = .75, names = TRUE )

# from this probability table we can see that 16 percent of
# our employees have left
prop.table( table(data$left) )


# -------------------------------------------------------------------------
# Model Training
# -------------------------------------------------------------------------

# convert the Newborn column to a factor variable
data[ , Newborn := as.factor(Newborn) ]

# split the dataset into two parts : 80 percent of the dataset will be used to
# train the model, while the rest will be used to evaluate the accuracy of the
# model, i.e. the out of sample error
set.seed(4321)
test <- createDataPartition( data$left, p = .2, list = FALSE )
data_train <- data[ -test, ]
data_test  <- data[ test, ]
rm(data)

model_glm <- glm( left ~ . , data = data_train, family = binomial(logit) )
summary_glm <- summary(model_glm)

# p-values and pseudo r squared
list( summary_glm$coefficient,
	  1 - ( summary_glm$deviance / summary_glm$null.deviance ) )
# the p-values of all the coefficients indicate significance


# -------------------------------------------------------------------------
# Predicting and Assessing the Model
# -------------------------------------------------------------------------

# obtain the predicted probability that an employee will leave in the future
# on the train and test set, then perform a quick evaluation using a
# double density plot
data_train$prediction <- predict( model_glm, newdata = data_train, type = "response" )
data_test$prediction  <- predict( model_glm, newdata = data_test , type = "response" )

# given that our model's final objective is to classify new instances
# into one of two categories, whether the employee will leave or not,
# we want the model to give high scores to positive
# instances ( 1 : employee left ) and low scores ( 0 : employee stayed ).

# distribution of the prediction score grouped by known outcome
ggplot( data_train, aes( prediction, color = as.factor(left) ) ) +
geom_density( size = 1 ) +
ggtitle( "Training Set's Predicted Score" ) +
scale_color_economist( name = "data", labels = c( "negative", "positive" ) ) +
theme_economist()

# Ideally you want the two score distributions to be well separated,
# with the scores of the negative instances on the left and the scores of the
# positive instances on the right.
# In the current case, both distributions are slightly skewed to the left.
# Not only is the predicted probability for the negative outcomes low, but
# the probability for the positive outcomes is also lower than it should be.
# The reason is that positive instances ( employees that left ) make up only
# 16 percent of our dataset, so the predicted scores get pulled toward
# lower numbers because the majority of the data consists of negative instances.
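
# a quick numeric check of the skew described above ( a minimal sketch using
# only the columns already created in this script ) : a logistic regression
# fitted with an intercept has a mean fitted probability equal to the positive
# class proportion, so the average training score sits near the .16 base rate
mean(data_train$prediction)                             # close to the .16 base rate
tapply( data_train$prediction, data_train$left, mean )  # per-class average score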

# A slight digression : when developing models for prediction, we all know that
# we want the model to be as accurate as possible, or in other words, to do a
# good job predicting the target variable on out of sample observations.

# Our plot, however, can actually tell us a very important thing :
# accuracy will not be a suitable measurement for this model.

# We'll show why below :

# Since the prediction of a logistic regression model is a probability,
# in order to use it as a classifier we'll have to choose a cutoff value,
# also called a threshold, where scores above this value are classified as
# positive and those below as negative. We'll be using the term cutoff for
# the rest of the documentation.

# Here we'll use a function to loop through several cutoff values and
# compute the model's accuracy on both the training and testing set
source("unbalanced_code/unbalanced_functions.R")

accuracy_info <- AccuracyCutoffInfo( train = data_train, test = data_test,
									 predict = "prediction", actual = "left" )
# define the theme for the next plot
ggthemr("light")
accuracy_info$plot


# from the output, you can see that starting from a cutoff value of .6,
# the accuracy for both the training and testing set keeps growing, showing
# no sign of decreasing at all.
# we'll visualize the confusion matrix of the test set to see what's causing this
cm_info <- ConfusionMatrixInfo( data = data_test, predict = "prediction",
								actual = "left", cutoff = .6 )
ggthemr("flat")
cm_info$plot

# wiki : https://en.wikipedia.org/wiki/Sensitivity_and_specificity#Worked_example
# The above plot depicts the tradeoff we face upon choosing a reasonable cutoff :
# if we increase the cutoff value, the number of true negatives (TN) increases
# and the number of true positives (TP) decreases. Equivalently, the number of
# false positives (FP) is lowered, while the number of false negatives (FN) rises.
# Because we have very few positive instances, our model is less likely to make
# a false negative mistake, so if we keep raising the cutoff value we'll
# actually increase our model's accuracy, since we have a higher chance of
# turning false positives into true negatives.

# e.g. simply predicting every test set outcome as 0 is already quite "accurate"
prop.table( table( data_test$left ) )

# Section conclusion :
# accuracy is not a suitable indicator for a model trained on data with an
# unbalanced class distribution or unequal error costs

# -------------------------------------------------------------------------
# Choosing the Suitable Cutoff Value
# -------------------------------------------------------------------------

# use the roc curve to determine the cutoff;
# it plots the false positive rate (FPR) on the x-axis and the
# true positive rate (TPR) on the y-axis
print(cm_info$data)

# reset to the default ggplot theme
ggthemr_reset()
# different costs for false negatives and false positives
cost_fp <- 100
cost_fn <- 200
roc_info <- ROCInfo( data = cm_info$data, predict = "predict",
					 actual = "actual", cost.fp = cost_fp, cost.fn = cost_fn )
grid.draw(roc_info$plot)


# replot the confusion matrix using the chosen cutoff
cm_info <- ConfusionMatrixInfo( data = data_test, predict = "prediction",
								actual = "left", cutoff = roc_info$cutoff )
ggthemr("flat")
cm_info$plot
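
# a minimal sketch of what a cost-based cutoff search boils down to, assuming
# only the data_test columns created above ( ROCInfo's actual internals live in
# unbalanced_functions.R and may differ ) : sweep the cutoff and pick the value
# minimizing the total misclassification cost
cutoffs <- seq( .01, .99, by = .01 )
total_cost <- sapply( cutoffs, function(threshold) {
	predicted <- as.numeric( data_test$prediction >= threshold )
	fp <- sum( predicted == 1 & data_test$left == 0 )
	fn <- sum( predicted == 0 & data_test$left == 1 )
	cost_fp * fp + cost_fn * fn
})
cutoffs[ which.min(total_cost) ]  # should land near roc_info$cutoff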

# -------------------------------------------------------------------------
# Interpretation
# -------------------------------------------------------------------------

# tidy from the broom package
coefficient <- tidy(model_glm)[ , c( "term", "estimate", "statistic" ) ]

# exponentiate the coefficients to obtain odds ratios
coefficient$estimate <- exp( coefficient$estimate )

# for a one unit increase in satisfaction (S), the odds of leaving the company
# (versus not leaving) change by a factor of
coefficient[ coefficient$term == "S", "estimate" ]

# use the model to predict data with unknown outcomes, "HR_unknown.csv"
# specify the columns' classes
col_class <- sapply( data_test, class )[1:6]
data <- read.csv( list.files( "data", full.names = TRUE )[1], colClasses = col_class )
data$prediction <- predict( model_glm, newdata = data, type = "response" )

# keep only the employees whose score exceeds the chosen cutoff
data <- data[ data$prediction >= roc_info$cutoff, ]

# time spent in the company
median_tic <- data %>% group_by(TIC) %>%
			  summarise( prediction = median(prediction), count = n() )
ggthemr("fresh")
ggplot( median_tic, aes( TIC, prediction, size = count ) ) +
geom_point() + theme( legend.position = "none" ) +
labs( title = "Time and Employee Attrition", y = "Attrition Probability",
	  x = "Time Spent in the Company" )

# last project evaluation
data$LPECUT <- cut( data$LPE, breaks = quantile(data$LPE), include.lowest = TRUE )
median_lpe <- data %>% group_by(LPECUT) %>%
			  summarise( prediction = median(prediction), count = n() )

ggplot( median_lpe, aes( LPECUT, prediction ) ) +
geom_point( aes( size = count ), color = "royalblue3" ) +
theme( legend.position = "none" ) +
labs( title = "Last Project's Evaluation and Employee Attrition",
	  y = "Attrition Probability", x = "Last Project's Evaluation by Client" )

# This is probably an indication that it'll be worth trying out other
# classification algorithms, since logistic regression assumes a monotonic
# relationship ( either entirely increasing or decreasing ) between the input
# parameters and the outcome ( the same is true for linear regression ) :
# if more of a quantity is good, then much more of that quantity is better.
# This is often not the case in the real world.

# given this probability, we can prioritize our actions by factoring in how
# much we wish to retain each of these employees. Recall that our dataset
# contains the performance information of the employees ( last project
# evaluation, LPE ).
# given this table, we can easily create a visualization to tell the story
ggplot( data, aes( prediction, LPE ) ) +
geom_point() +
ggtitle( "Performance vs. Probability to Leave" )

# first we have the employees that are underperforming; we should probably
# improve their performance, or you could say we can't wait for them to leave....
# for employees that are not likely to leave, we should manage them as usual.
# in the short run, we should focus on those that perform well but
# also have a high probability of leaving.

# the next thing we can do is to quantify our priority by
# multiplying the probability of leaving with the performance.
# we'll also use the row names of the data.frame
# to serve as imaginary employee ids.
# We then obtain a priority score, where the score will be high for
# the employees we wish to act upon as soon as possible, and low for the others
result <- data %>%
		  mutate( priority = prediction * LPE ) %>%
		  mutate( id = rownames(data) ) %>%
		  arrange( desc(priority) )

# after obtaining this result, we can schedule a face to face interview with
# the employees at the top of the list.
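
# a quick look at the top of the priority list ( illustrative; the column
# subset here is chosen for readability, not taken from the source )
head( result[ , c( "id", "LPE", "prediction", "priority" ) ] )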

# using classification in this example enabled us to detect events that will
# happen in the future, namely which employees are more likely to leave the
# company. Based on this information, we can come up with a more efficient
# strategy to cope with the matter at hand.


# ----------------------------------------------------------------------
# document later, strange statistic test
# http://www.r-bloggers.com/evaluating-logistic-regression-models/
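
# as a hedged preview of the link above ( which walks through further ways of
# evaluating logistic regression models ), one such test is a likelihood ratio
# test of the fitted model against an intercept-only null model; that this is
# the test meant there is an assumption, not something stated in this script
null_glm <- glm( left ~ 1, data = data_train, family = binomial(logit) )
anova( null_glm, model_glm, test = "Chisq" )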