Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ethen8181
GitHub Repository: ethen8181/machine-learning
Path: blob/master/linear_regression/linear_regession_code/LMPlot.R
2577 views
1
library(grid)
2
library(scales)
3
library(ggplot2)
4
library(gridExtra)
5
library(data.table)
6
7
# [LMPlot] :
8
# visualizations that works with linear regression
9
# @model : linear regression model object
10
# @actual : your data's actual (original) output value
11
# returns : 1. plot : returns the four plot in one side by side plot
12
# 2. outlier : observation index of the possible outliers, if none return NULL
13
14
LMPlot <- function( model, actual )
15
{
16
if( class(model) != "lm" )
17
stop( "Must be a linear model" )
18
19
cooks_distance <- cooks.distance(model)
20
plot_data <- data.table( actual = actual,
21
predicted = model$fitted.values,
22
residuals = model$residuals,
23
cooks_distance = cooks_distance )
24
25
# cooks distance > 1 or > 4 / number of data is considered a possible outlier
26
boolean <- ( cooks_distance > 1 ) | ( cooks_distance > 4 / length(actual) )
27
outlier <- which(boolean)
28
29
if( length(outlier) > 0 )
30
plot_data[ , boolean := boolean ]
31
32
# -- plot --
33
# defines the basic layout,
34
# if there's outlier, then color the outlier plots,
35
# if not then simply add the points to the aesthetic
36
37
# 1. cooks distance v.s. predicted value
38
cooks <- ggplot( plot_data, aes( predicted, cooks_distance ) ) +
39
scale_x_continuous( labels = comma ) +
40
ggtitle( "Cooks Distance of Predicted Value" )
41
42
# 2. predicted value versus actual value :
43
# if the model is considered to a good estimate of the outcome,
44
# there should be strong correlation between the model’s predictions and its actual results.
45
pred <- ggplot( plot_data, aes( predicted, actual ) ) +
46
ggtitle( "Predicted Value v.s. Actual Value" ) +
47
scale_x_continuous( labels = comma ) +
48
scale_y_continuous( labels = comma )
49
50
# 3. residual plot :
51
# Ideally your plot of the residuals should be symmetrically distributed around the lower
52
# digits of the y-axis, with no clear patterns what so ever.
53
resid <- ggplot( plot_data, aes( predicted, residuals ) ) +
54
ggtitle("Residuals of the Predicted Value") +
55
scale_x_continuous( labels = comma ) +
56
scale_y_continuous( labels = comma )
57
58
# 4. QQ-plot of the residuals :
59
# The plot will be very close to the y = x straight line if the residuals
60
# is a close approximation to a normal distribution.
61
QQPlot <- function( plot_data )
62
{
63
# qqline draws the line between the 25% and 75% quantile by default
64
y <- quantile( plot_data$residuals, c(0.25, 0.75) )
65
x <- qnorm( c(0.25, 0.75) )
66
67
# y = slope * x + intercept
68
slope <- diff(y) / diff(x)
69
intercept <- y[1] - slope * x[1]
70
71
qqplot <- ggplot( plot_data, aes( sample = residuals ) ) +
72
scale_y_continuous( labels = comma ) +
73
geom_abline( slope = slope, intercept = intercept, color = "blue" ) +
74
ggtitle( "Residual's QQ Plot " )
75
return(qqplot)
76
}
77
qqplot <- QQPlot( plot_data = plot_data )
78
79
# color the plot to distinguish outlier and normal data point if there is in fact one
80
if( length(outlier) > 0 )
81
{
82
cooks <- cooks + geom_point( aes( color = boolean ), size = 2, shape = 1 ) +
83
guides( color = FALSE )
84
85
pred <- pred + geom_point( aes( color = boolean ), size = 2, shape = 1 ) +
86
geom_smooth( method = "lm" ) +
87
guides( color = FALSE )
88
89
resid <- resid + geom_point( aes( color = boolean ), size = 2, shape = 1 ) +
90
geom_smooth( aes( x = predicted, y = residuals ) ) +
91
guides( color = FALSE )
92
93
qqplot <- qqplot + stat_qq( aes( color = boolean ), size = 2, shape = 1 ) +
94
guides( color = FALSE )
95
96
plot <- arrangeGrob( pred, cooks, resid, qqplot )
97
return( list( plot = plot, outlier = outlier ) )
98
}else
99
{
100
cooks <- cooks + geom_point( size = 2, shape = 1 )
101
102
pred <- pred + geom_point( size = 2, shape = 1 ) +
103
geom_smooth( method = "lm" )
104
105
resid <- resid + geom_point( size = 2, shape = 1 ) +
106
geom_smooth( aes( x = predicted, y = residuals ) )
107
108
qqplot <- qqplot + stat_qq( size = 2, shape = 1 )
109
110
plot <- arrangeGrob( pred, cooks, resid, qqplot )
111
return( list( plot = plot, outlier = NULL ) )
112
}
113
}
114
115
116