Path: blob/master/linear_regression/linear_regession_code/linear_regession.R
# linear regression
library(grid)
library(dplyr)
library(scales)
library(ggplot2)
setwd("/Users/ethen/machine-learning/linear_regression")


# original formula
Formula <- function(x) 1.2 * (x-2)^2 + 3.2

# visualize the function and its optimal solution,
# drawing the formula's curve over the given range of x coordinates
ggplot( data.frame( x = c( 0, 4 ) ), aes( x ) ) +
stat_function( fun = Formula ) +
geom_point( data = data.frame( x = 2, y = Formula(2) ), aes( x, y ),
            color = "blue", size = 3 ) +
ggtitle( expression( 1.2 * (x-2)^2 + 3.2 ) )


# ------------------------------------------------------------------------------------
# gradient descent toy example :
# keep updating the x value until the difference between this iteration's value
# and the last one is smaller than epsilon (a given small value), or until the
# number of updates surpasses the user-specified maximum iteration count

# first derivative of the formula
Derivative <- function(x) 2 * 1.2 * (x-2)

# x_new : initial guess for the x value
# x_old : any starting value that differs from x_new, so the first iteration runs
x_new <- .1
x_old <- 0

# manually assign a fixed learning rate
learning_rate <- .6

# other parameters : manually assign the epsilon value and the maximum iteration allowed
epsilon <- .05
step <- 1
iteration <- 10

# record the x and y values for visualization ; add the initial guess
xtrace <- list() ; ytrace <- list()
xtrace[[1]] <- x_new ; ytrace[[1]] <- Formula(x_new)

while( abs( x_new - x_old ) > epsilon & step <= iteration )
{
	# update the iteration count
	step <- step + 1

	# gradient descent update
	x_old <- x_new
	x_new <- x_old - learning_rate * Derivative(x_old)

	# record keeping
	xtrace[[step]] <- x_new
	ytrace[[step]] <- Formula(x_new)
}

# combine the recorded values into each data point's coordinates
record <- data.frame( x = do.call( rbind, xtrace ), y = do.call( rbind, ytrace ) )

# create the segments between consecutive points (the gradient descent steps)
segment <- data.frame( x = double(), y = double(), xend = double(), yend = double() )
for( i in 1:( nrow(record) - 1 ) )
{
	segment[ i, ] <- cbind( record[ i, ], record[ i + 1, ] )
}

# visualize the path that gradient descent took
ggplot( data.frame( x = c( 0, 4 ) ), aes( x ) ) +
stat_function( fun = Formula ) +
ggtitle( expression( 1.2 * (x-2)^2 + 3.2 ) ) +
geom_point( data = record, aes( x, y ), color = "red", size = 3, alpha = .8, shape = 2 ) +
geom_segment( data = segment, aes( x = x, y = y, xend = xend, yend = yend ),
              color = "blue", alpha = .8, arrow = arrow( length = unit( 0.25, "cm" ) ) )


# --------------------------------------------------------------------------------
# housing data

housing <- read.table( "housing.txt", header = TRUE, sep = "," )

# example :
# suppose you've already calculated that the differences (predicted minus actual)
# for the first two rows are 100 and 200 respectively, then,
# using the first two rows of the input variables :
housing[ c( 1, 2 ), -3 ]

# multiply row 1 by 100
( row1 <- 100 * housing[ 1, -3 ] )

# multiply row 2 by 200
( row2 <- 200 * housing[ 2, -3 ] )

# sum up each column
list( area = sum( row1[1] + row2[1] ), bedrooms = sum( row1[2] + row2[2] ) )


# z-score normalize
Normalize <- function(x) ( x - mean(x) ) / sd(x)


# --------------------------------------------------------------------------------------------
# gradient descent
source("linear_regession_code/gradient_descent.R")

trace_b <- GradientDescent( data = housing, target = "price",
                            learning_rate = 0.05, iteration = 500, method = "batch" )

# final parameters
parameters_b <- trace_b$theta[ nrow(trace_b$theta), ]
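
# predicting a new observation with the learned parameters ( a sketch : it
# assumes GradientDescent z-scores every input column and prepends a column of
# ones for the intercept, so a new observation must be scaled with the training
# data's mean and standard deviation ; the 1650 square feet, 3 bedrooms house
# below is purely illustrative )
new_house  <- c( area = 1650, bedrooms = 3 )
new_scaled <- ( new_house - colMeans( housing[ , -3 ] ) ) / apply( housing[ , -3 ], 2, sd )
sum( parameters_b * c( 1, new_scaled ) )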

# linear regression on the z-score normalized inputs
normed <- apply( housing[ , -3 ], 2, scale )
normed_data <- data.frame( cbind( normed, price = housing$price ) )
model <- lm( price ~ ., data = normed_data )


# visualize the cost of each iteration (scaled down by 1e+8 for readability)
costs_df <- data.frame( iteration = 1:nrow(trace_b$cost),
                        costs = trace_b$cost / 1e+8 )
ggplot( costs_df, aes( iteration, costs ) ) + geom_line()


# ---------------------------------------------------------------------------
# appendix : summary of the linear model

summary(model)

# residuals : the actual values minus the fitted ones
summary( model$residuals )
summary( normed_data$price - model$fitted.values )


# t-value : the estimated coefficient divided by its standard error
library(broom)
( coefficient <- tidy(model) )
coefficient$estimate / coefficient$std.error


# p-value : the two-sided tail probability of the t-statistic, where the
# degrees of freedom is n minus the number of estimated coefficients
summary(model)$df
( df <- nrow(normed_data) - nrow(coefficient) )
pt( abs(coefficient$statistic), df = df, lower.tail = FALSE ) * 2


# r squared

# @y  : original output value
# @py : predicted output value
RSquared <- function( y, py )
{
	rss <- sum( ( y - py )^2 )
	tss <- sum( ( y - mean(y) )^2 )
	return( 1 - rss / tss )
}

summary(model)$r.squared
RSquared( normed_data$price, model$fitted.values )


# adjusted r squared
k <- nrow(coefficient) - 1

# @y  : original output value
# @py : predicted output value
# @k  : number of the model's coefficients, excluding the intercept
AdjustedRSquared <- function( y, py, k )
{
	n  <- length(y)
	r2 <- RSquared( y, py )
	return( 1 - ( 1 - r2 ) * ( n - 1 ) / ( n - k - 1 ) )
}
summary(model)$adj.r.squared
AdjustedRSquared( normed_data$price, model$fitted.values, k )


# linear regression diagnostic plot
source("/Users/ethen/machine-learning/linear_regression/linear_regession_code/LMPlot.R")
lm_plot <- LMPlot( model = model, actual = normed_data$price )
grid.draw(lm_plot$plot)
lm_plot$outlier


# variance inflation factor
library(car)
car::vif(model)

# computing the variance inflation factor for area by hand :
# regress area on the other input variables and plug the resulting
# r squared into 1 / ( 1 - r squared )
area_model <- lm( area ~ . - price, data = normed_data )
area_r2 <- RSquared( y = normed_data$area, py = area_model$fitted.values )
1 / ( 1 - area_r2 )

# ----------------------------------------------------------------------------
# test code

# normal equation : solving ( X^T X )^-1 X^T y directly, where the input
# matrix X carries a leading column of ones for the intercept
input  <- cbind( intercept = 1, as.matrix( normed_data[ , -3 ] ) )
output <- normed_data$price
solve( t(input) %*% input ) %*% t(input) %*% output


# stochastic approach
trace_s <- GradientDescent( target = "price", data = housing,
                            learning_rate = 0.05, iteration = 3000, method = "stochastic" )
parameters_s <- trace_s$theta[ nrow(trace_s$theta), ]

# benchmark the runtime of the two methods
library(microbenchmark)
runtime <- microbenchmark(

	batch = GradientDescent( target = "price", data = housing,
	                         learning_rate = 0.05, iteration = 500, method = "batch" ),
	stochastic = GradientDescent( target = "price", data = housing,
	                              learning_rate = 0.05, iteration = 521, method = "stochastic" )

)
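
# inspecting the benchmark ( a sketch ) : print summarizes the timing
# distribution across microbenchmark's default 100 runs, and autoplot draws
# the corresponding ggplot2 violin plot
print(runtime)
autoplot(runtime)

# assuming both theta matrices share the same column order ( intercept first,
# then the normalized inputs ), the final gradient descent estimates can also
# be lined up against lm's closed-form coefficients
rbind( batch = parameters_b, stochastic = parameters_s )
coef(model)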