Path: blob/master/linear_regression/linear_regession_code/gradient_descent.R
library(dplyr)

# gradient descent for linear regression

# [GradientDescent] :
# @data          : A data frame containing both the input and the output variable.
# @target        : A character string giving the name of the column that serves as the output variable.
# @learning_rate : Learning rate for the gradient descent algorithm.
# @iteration     : Halting criterion : maximum number of iterations allowed for training the gradient descent algorithm.
# @epsilon       : Halting criterion : if every trained parameter's difference between two consecutive iterations is smaller than this value, the algorithm halts.
# @normalize     : Boolean value indicating whether to perform z-score normalization on the input variables. Defaults to TRUE.
# @method        : Specify either "batch" or "stochastic" for the gradient descent method. Use batch for now, this will be explained later.

GradientDescent <- function( data, target, learning_rate, iteration,
                             epsilon = .001, normalize = TRUE, method )
{
    # separate the input and output variables
    input  <- data %>% select( -one_of(target) ) %>% as.matrix()
    output <- data %>% select( one_of(target) ) %>% as.matrix()

    # normalize the input variables if specified and
    # record the mean and standard deviation of each column
    if(normalize)
    {
        input <- scale(input)
        input_mean <- attr( input, "scaled:center" )
        input_sd   <- attr( input, "scaled:scale" )
    }else # placeholders so the returned "norm" data.frame is still defined
    {
        input_mean <- rep( 0, ncol(input) )
        input_sd   <- rep( 1, ncol(input) )
    }

    # implementation trick : after normalizing the original input columns,
    # add a new column of all 1's as the first column, this serves as X0 (the intercept term)
    input <- cbind( theta0 = 1, input )

    # theta_new : initialize the theta values as all 1s
    # theta_old : a value whose absolute difference from the new one is
    #             larger than epsilon, so the first iteration always runs
    theta_new <- matrix( 1, ncol = ncol(input) )
    theta_old <- matrix( 2, ncol = ncol(input) )

    # cost function
    costs <- function( input, output, theta )
    {
        sum( ( input %*% t(theta) - output )^2 ) / ( 2 * nrow(output) )
    }

    # records the theta and cost values for visualization ; add the initial guess
    theta_trace <- vector( mode = "list", length = iteration )
    theta_trace[[1]] <- theta_new
    costs_trace <- numeric( length = iteration )
    costs_trace[1] <- costs( input, output, theta_new )

    # first derivative of the cost function
    if( method == "batch" )
    {
        derivative <- function( input, output, theta, step )
        {
            error <- ( input %*% t(theta) ) - output
            descent <- ( t(input) %*% error ) / nrow(output)
            return( t(descent) )
        }
    }else # stochastic gradient descent, using one training sample per update
    {
        derivative <- function( input, output, theta, step )
        {
            # cycle through the rows, one training sample per iteration
            r <- step %% nrow(input) + 1
            error <- as.numeric( input[ r, ] %*% t(theta) ) - output[ r, ]
            descent <- input[ r, ] * error
            return(descent)
        }
    }

    # keep updating as long as any of the theta differences is still larger than epsilon
    # and the maximum number of iterations allowed has not been exceeded
    step <- 1
    while( any( abs(theta_new - theta_old) > epsilon ) && step <= iteration )
    {
        step <- step + 1

        # gradient descent update
        theta_old <- theta_new
        theta_new <- theta_old - learning_rate * derivative( input, output, theta_old, step )

        # record keeping
        theta_trace[[step]] <- theta_new
        costs_trace[step] <- costs( input, output, theta_new )
    }

    # returns the mean and standard deviation used to normalize each input column
    # and the cost and theta records
    costs <- data.frame( costs = costs_trace )
    theta <- data.frame( do.call( rbind, theta_trace ), row.names = NULL )
    norm  <- data.frame( input_mean = input_mean, input_sd = input_sd )

    return( list( costs = costs, theta = theta, norm = norm ) )
}
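
# ---------------------------------------------------------------------------
# Illustrative usage (an added sketch, not part of the original script) :
# fit the model on R's built-in mtcars data, predicting mpg from wt and hp.
# The learning rate and iteration count below are arbitrary choices made
# purely for demonstration.

fit <- GradientDescent( data = mtcars[ , c( "mpg", "wt", "hp" ) ], target = "mpg",
                        learning_rate = 0.05, iteration = 5000, method = "batch" )

# the fitted coefficients (on the z-score scale) are the last recorded theta row
theta_final <- unlist( fit$theta[ nrow(fit$theta), ] )

# map the coefficients back to the original scale of the inputs,
# using the stored column means and standard deviations
slope <- theta_final[-1] / fit$norm$input_sd
intercept <- theta_final[1] - sum( theta_final[-1] * fit$norm$input_mean / fit$norm$input_sd )

# sanity check against the closed-form least squares solution; the two sets of
# coefficients should be approximately equal, since gradient descent stops
# once the parameter updates fall below epsilon
coef( lm( mpg ~ wt + hp, data = mtcars ) )
c( intercept, slope )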