Path: blob/master/linear_regression/linear_regession_code/linear_regession.R
# linear regression
library(grid)
library(dplyr)
library(scales)
library(ggplot2)
setwd("/Users/ethen/machine-learning/linear_regression")


# original formula
Formula <- function(x) 1.2 * (x-2)^2 + 3.2

# visualize the function and its optimal solution,
# drawing the formula's curve over the given range of x coordinates
ggplot( data.frame( x = c( 0, 4 ) ), aes( x ) ) +
stat_function( fun = Formula ) +
geom_point( data = data.frame( x = 2, y = Formula(2) ), aes( x, y ),
            color = "blue", size = 3 ) +
ggtitle( expression( 1.2 * (x-2)^2 + 3.2 ) )


# ------------------------------------------------------------------------------------
# gradient descent toy example :
# keep updating the x value until the difference between this iteration's value
# and the last one is smaller than epsilon (a given small value), or until the
# number of updates surpasses the user-specified maximum iteration count

# first derivative of the formula
Derivative <- function(x) 2 * 1.2 * (x-2)

# x_new : initial guess for the x value
# x_old : any starting value that differs from x_new, so the first iteration runs
x_new <- .1
x_old <- 0

# manually assign a fixed learning rate
learning_rate <- .6

# other parameters : manually assign the epsilon value and the maximum iteration allowed
epsilon <- .05
step <- 1
iteration <- 10

# record the x and y values for visualization ; add the initial guess
xtrace <- list() ; ytrace <- list()
xtrace[[1]] <- x_new ; ytrace[[1]] <- Formula(x_new)

while( abs( x_new - x_old ) > epsilon & step <= iteration )
{
	# update the iteration count
	step <- step + 1

	# gradient descent update
	x_old <- x_new
	x_new <- x_old - learning_rate * Derivative(x_old)

	# record keeping
	xtrace[[step]] <- x_new
	ytrace[[step]] <- Formula(x_new)
}

# combine the recorded values into each data point's coordinates
record <- data.frame( x = do.call( rbind, xtrace ), y = do.call( rbind, ytrace ) )

# create the segments between consecutive points (the gradient descent steps)
segment <- data.frame( x = double(), y = double(), xend = double(), yend = double() )
for( i in 1:( nrow(record) - 1 ) )
{
	segment[ i, ] <- cbind( record[ i, ], record[ i + 1, ] )
}

# visualize the path that gradient descent took
ggplot( data.frame( x = c( 0, 4 ) ), aes( x ) ) +
stat_function( fun = Formula ) +
ggtitle( expression( 1.2 * (x-2)^2 + 3.2 ) ) +
geom_point( data = record, aes( x, y ), color = "red", size = 3, alpha = .8, shape = 2 ) +
geom_segment( data = segment, aes( x = x, y = y, xend = xend, yend = yend ),
              color = "blue", alpha = .8, arrow = arrow( length = unit( 0.25, "cm" ) ) )


# --------------------------------------------------------------------------------
# housing data

housing <- read.table( "housing.txt", header = TRUE, sep = "," )

# example :
# suppose you've already calculated that the differences (predicted minus actual)
# for the first two rows are 100 and 200 respectively, then,
# using the first two rows of the input variables :
housing[ c( 1, 2 ), -3 ]

# multiply row 1 by 100
( row1 <- 100 * housing[ 1, -3 ] )

# multiply row 2 by 200
( row2 <- 200 * housing[ 2, -3 ] )

# sum up each column
list( area = sum( row1[1] + row2[1] ), bedrooms = sum( row1[2] + row2[2] ) )


# z-score normalize
Normalize <- function(x) ( x - mean(x) ) / sd(x)


# --------------------------------------------------------------------------------------------
# gradient descent
source("linear_regession_code/gradient_descent.R")

trace_b <- GradientDescent( data = housing, target = "price",
                            learning_rate = 0.05, iteration = 500, method = "batch" )

# final parameters
parameters_b <- trace_b$theta[ nrow(trace_b$theta), ]
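
# predicting a new observation with the learned parameters ( a sketch : it
# assumes GradientDescent z-scores every input column and prepends a column of
# ones for the intercept, so a new observation must be scaled with the training
# data's mean and standard deviation ; the 1650 square feet, 3 bedrooms house
# below is purely illustrative )
new_house  <- c( area = 1650, bedrooms = 3 )
new_scaled <- ( new_house - colMeans( housing[ , -3 ] ) ) / apply( housing[ , -3 ], 2, sd )
sum( parameters_b * c( 1, new_scaled ) )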

# linear regression on the z-score normalized inputs
normed <- apply( housing[ , -3 ], 2, scale )
normed_data <- data.frame( cbind( normed, price = housing$price ) )
model <- lm( price ~ ., data = normed_data )


# visualize the cost of each iteration (scaled down by 1e+8 for readability)
costs_df <- data.frame( iteration = 1:nrow(trace_b$cost),
                        costs = trace_b$cost / 1e+8 )
ggplot( costs_df, aes( iteration, costs ) ) + geom_line()


# ---------------------------------------------------------------------------
# appendix : summary of the linear model

summary(model)

# residuals : the actual values minus the fitted ones
summary( model$residuals )
summary( normed_data$price - model$fitted.values )


# t-value : the estimated coefficient divided by its standard error
library(broom)
( coefficient <- tidy(model) )
coefficient$estimate / coefficient$std.error


# p-value : the two-sided tail probability of the t-statistic, where the
# degrees of freedom is n minus the number of estimated coefficients
summary(model)$df
( df <- nrow(normed_data) - nrow(coefficient) )
pt( abs(coefficient$statistic), df = df, lower.tail = FALSE ) * 2


# r squared

# @y  : original output value
# @py : predicted output value
RSquared <- function( y, py )
{
	rss <- sum( ( y - py )^2 )
	tss <- sum( ( y - mean(y) )^2 )
	return( 1 - rss / tss )
}

summary(model)$r.squared
RSquared( normed_data$price, model$fitted.values )


# adjusted r squared
k <- nrow(coefficient) - 1

# @y  : original output value
# @py : predicted output value
# @k  : number of the model's coefficients, excluding the intercept
AdjustedRSquared <- function( y, py, k )
{
	n  <- length(y)
	r2 <- RSquared( y, py )
	return( 1 - ( 1 - r2 ) * ( n - 1 ) / ( n - k - 1 ) )
}
summary(model)$adj.r.squared
AdjustedRSquared( normed_data$price, model$fitted.values, k )


# linear regression diagnostic plot
source("/Users/ethen/machine-learning/linear_regression/linear_regession_code/LMPlot.R")
lm_plot <- LMPlot( model = model, actual = normed_data$price )
grid.draw(lm_plot$plot)
lm_plot$outlier


# variance inflation factor
library(car)
car::vif(model)

# computing the variance inflation factor for area by hand :
# regress area on the other input variables and plug the resulting
# r squared into 1 / ( 1 - r squared )
area_model <- lm( area ~ . - price, data = normed_data )
area_r2 <- RSquared( y = normed_data$area, py = area_model$fitted.values )
1 / ( 1 - area_r2 )

# ----------------------------------------------------------------------------
# test code

# normal equation : solving ( X^T X )^-1 X^T y directly, where the input
# matrix X carries a leading column of ones for the intercept
input  <- cbind( intercept = 1, as.matrix( normed_data[ , -3 ] ) )
output <- normed_data$price
solve( t(input) %*% input ) %*% t(input) %*% output


# stochastic approach
trace_s <- GradientDescent( target = "price", data = housing,
                            learning_rate = 0.05, iteration = 3000, method = "stochastic" )
parameters_s <- trace_s$theta[ nrow(trace_s$theta), ]

# benchmark the runtime of the two methods
library(microbenchmark)
runtime <- microbenchmark(

	batch = GradientDescent( target = "price", data = housing,
	                         learning_rate = 0.05, iteration = 500, method = "batch" ),
	stochastic = GradientDescent( target = "price", data = housing,
	                              learning_rate = 0.05, iteration = 521, method = "stochastic" )

)
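
# inspecting the benchmark ( a sketch ) : print summarizes the timing
# distribution across microbenchmark's default 100 runs, and autoplot draws
# the corresponding ggplot2 violin plot
print(runtime)
autoplot(runtime)

# assuming both theta matrices share the same column order ( intercept first,
# then the normalized inputs ), the final gradient descent estimates can also
# be lined up against lm's closed-form coefficients
rbind( batch = parameters_b, stochastic = parameters_s )
coef(model)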