Path: blob/master/linear_regression/linear_regession_code/LMPlot.R
2577 views
library(grid)1library(scales)2library(ggplot2)3library(gridExtra)4library(data.table)56# [LMPlot] :7# visualizations that works with linear regression8# @model : linear regression model object9# @actual : your data's actual (original) output value10# returns : 1. plot : returns the four plot in one side by side plot11# 2. outlier : observation index of the possible outliers, if none return NULL1213LMPlot <- function( model, actual )14{15if( class(model) != "lm" )16stop( "Must be a linear model" )1718cooks_distance <- cooks.distance(model)19plot_data <- data.table( actual = actual,20predicted = model$fitted.values,21residuals = model$residuals,22cooks_distance = cooks_distance )2324# cooks distance > 1 or > 4 / number of data is considered a possible outlier25boolean <- ( cooks_distance > 1 ) | ( cooks_distance > 4 / length(actual) )26outlier <- which(boolean)2728if( length(outlier) > 0 )29plot_data[ , boolean := boolean ]3031# -- plot --32# defines the basic layout,33# if there's outlier, then color the outlier plots,34# if not then simply add the points to the aesthetic3536# 1. cooks distance v.s. predicted value37cooks <- ggplot( plot_data, aes( predicted, cooks_distance ) ) +38scale_x_continuous( labels = comma ) +39ggtitle( "Cooks Distance of Predicted Value" )4041# 2. predicted value versus actual value :42# if the model is considered to a good estimate of the outcome,43# there should be strong correlation between the model’s predictions and its actual results.44pred <- ggplot( plot_data, aes( predicted, actual ) ) +45ggtitle( "Predicted Value v.s. Actual Value" ) +46scale_x_continuous( labels = comma ) +47scale_y_continuous( labels = comma )4849# 3. residual plot :50# Ideally your plot of the residuals should be symmetrically distributed around the lower51# digits of the y-axis, with no clear patterns what so ever.52resid <- ggplot( plot_data, aes( predicted, residuals ) ) +53ggtitle("Residuals of the Predicted Value") +54scale_x_continuous( labels = comma ) +55scale_y_continuous( labels = comma )5657# 4. QQ-plot of the residuals :58# The plot will be very close to the y = x straight line if the residuals59# is a close approximation to a normal distribution.60QQPlot <- function( plot_data )61{62# qqline draws the line between the 25% and 75% quantile by default63y <- quantile( plot_data$residuals, c(0.25, 0.75) )64x <- qnorm( c(0.25, 0.75) )6566# y = slope * x + intercept67slope <- diff(y) / diff(x)68intercept <- y[1] - slope * x[1]6970qqplot <- ggplot( plot_data, aes( sample = residuals ) ) +71scale_y_continuous( labels = comma ) +72geom_abline( slope = slope, intercept = intercept, color = "blue" ) +73ggtitle( "Residual's QQ Plot " )74return(qqplot)75}76qqplot <- QQPlot( plot_data = plot_data )7778# color the plot to distinguish outlier and normal data point if there is in fact one79if( length(outlier) > 0 )80{81cooks <- cooks + geom_point( aes( color = boolean ), size = 2, shape = 1 ) +82guides( color = FALSE )8384pred <- pred + geom_point( aes( color = boolean ), size = 2, shape = 1 ) +85geom_smooth( method = "lm" ) +86guides( color = FALSE )8788resid <- resid + geom_point( aes( color = boolean ), size = 2, shape = 1 ) +89geom_smooth( aes( x = predicted, y = residuals ) ) +90guides( color = FALSE )9192qqplot <- qqplot + stat_qq( aes( color = boolean ), size = 2, shape = 1 ) +93guides( color = FALSE )9495plot <- arrangeGrob( pred, cooks, resid, qqplot )96return( list( plot = plot, outlier = outlier ) )97}else98{99cooks <- cooks + geom_point( size = 2, shape = 1 )100101pred <- pred + geom_point( size = 2, shape = 1 ) +102geom_smooth( method = "lm" )103104resid <- resid + geom_point( size = 2, shape = 1 ) +105geom_smooth( aes( x = predicted, y = residuals ) )106107qqplot <- qqplot + stat_qq( size = 2, shape = 1 )108109plot <- arrangeGrob( pred, cooks, resid, qqplot )110return( list( plot = plot, outlier = NULL ) )111}112}113114115116