# GitHub Repository: ethen8181/machine-learning
# Path: blob/master/linear_regression/linear_regession_code/linear_regession.R
# linear regression
library(grid)
library(dplyr)
library(scales)
library(ggplot2)
setwd("/Users/ethen/machine-learning/linear_regression")

# original formula
Formula <- function(x) 1.2 * (x - 2)^2 + 3.2

# visualize the function and its optimal solution by
# drawing the formula over the given range of x coordinates
ggplot( data.frame( x = c( 0, 4 ) ), aes( x ) ) + 
stat_function( fun = Formula ) + 
geom_point( data = data.frame( x = 2, y = Formula(2) ), aes( x, y ), 
            color = "blue", size = 3 ) + 
ggtitle( expression( 1.2 * (x - 2)^2 + 3.2 ) )

# ------------------------------------------------------------------------------------
# gradient descent toy example :
# keep updating the x value until the difference between this iteration's x and the
# last one is smaller than epsilon (a given small value), or until the number of
# updates exceeds the user-specified iteration limit

# first derivative of the formula
Derivative <- function(x) 2 * 1.2 * (x - 2)
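
# quick sanity check (added sketch) : the derivative should vanish
# at the minimum x = 2
Derivative(2)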

# x_new : initial guess for the x value
# x_old : a placeholder starting value so the first iteration can run
x_new <- .1
x_old <- 0

# manually assign a fixed learning rate
learning_rate <- .6

# other parameters : manually assigned epsilon value and maximum iteration allowed
epsilon <- .05
step <- 1
iteration <- 10

# record the x and y values for visualization ; add the initial guess
xtrace <- list() ; ytrace <- list()
xtrace[[1]] <- x_new ; ytrace[[1]] <- Formula(x_new)

while( abs( x_new - x_old ) > epsilon && step <= iteration )
{
    # update iteration count
    step <- step + 1

    # gradient descent update
    x_old <- x_new
    x_new <- x_old - learning_rate * Derivative(x_old)

    # record keeping
    xtrace[[step]] <- x_new
    ytrace[[step]] <- Formula(x_new)
}
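
# sanity-check sketch : the analytic minimum of 1.2 * (x - 2)^2 + 3.2 sits at
# x = 2 with value 3.2, so the final x_new should land close to 2 and
# Formula(x_new) close to 3.2
x_new
Formula(x_new)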

# combine the recorded values into a data frame of point coordinates
record <- data.frame( x = do.call( rbind, xtrace ), y = do.call( rbind, ytrace ) )

# create the segments between each pair of consecutive points (gradient steps)
segment <- data.frame( x = double(), y = double(), xend = double(), yend = double() )
for( i in 1:( nrow(record) - 1 ) )
{
    segment[ i, ] <- cbind( record[ i, ], record[ i + 1, ] )
}
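
# equivalent loop-free sketch (added) : the same segments can be built by
# pairing each row of `record` with the next one
segment2 <- setNames( data.frame( record[ -nrow(record), ], record[ -1, ] ),
                      c( "x", "y", "xend", "yend" ) )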

# visualize the path taken by gradient descent
ggplot( data.frame( x = c( 0, 4 ) ), aes( x ) ) + 
stat_function( fun = Formula ) + 
ggtitle( expression( 1.2 * (x - 2)^2 + 3.2 ) ) + 
geom_point( data = record, aes( x, y ), color = "red", size = 3, alpha = .8, shape = 2 ) + 
geom_segment( data = segment, aes( x = x, y = y, xend = xend, yend = yend ), 
              color = "blue", alpha = .8, arrow = arrow( length = unit( 0.25, "cm" ) ) )


# --------------------------------------------------------------------------------
# housing data

housing <- read.table( "housing.txt", header = TRUE, sep = "," )
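
# quick look at the data (added) ; from the code below it should contain the
# input columns area and bedrooms plus the output column price
str(housing)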

# example :
# suppose you've already calculated that the differences (prediction errors)
# of the first two rows are 100 and 200 respectively, then
# using the first two rows of the input variables
housing[ c( 1, 2 ), -3 ]

# multiply 100 with row 1
( row1 <- 100 * housing[ 1, -3 ] )

# multiply 200 with row 2
( row2 <- 200 * housing[ 2, -3 ] )

# sum the two weighted rows up, column by column
list( area = sum( row1[1] + row2[1] ), bedrooms = sum( row1[2] + row2[2] ) )
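
# equivalent matrix-form sketch (added) of the same weighted row sum :
# a row vector of the errors times the matrix of input rows
c( 100, 200 ) %*% as.matrix( housing[ c( 1, 2 ), -3 ] )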


# z-score normalize
Normalize <- function(x) ( x - mean(x) ) / sd(x)
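
# quick check sketch (added) : Normalize should agree with base R's scale,
# which is used for the same z-score normalization further below
all.equal( Normalize( housing$area ), as.vector( scale( housing$area ) ) )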


# --------------------------------------------------------------------------------------------
# gradient descent
source("linear_regession_code/gradient_descent.R")

trace_b <- GradientDescent( data = housing, target = "price", 
                            learning_rate = 0.05, iteration = 500, method = "batch" )
# final parameters
parameters_b <- trace_b$theta[ nrow(trace_b$theta), ]

# linear regression on the z-score normalized inputs
normed <- apply( housing[ , -3 ], 2, scale )
normed_data <- data.frame( cbind( normed, price = housing$price ) )
model <- lm( price ~ ., data = normed_data )
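
# comparison sketch (added) : given enough iterations (and assuming
# GradientDescent z-score normalizes the inputs internally), the batch
# parameters should approximate lm's coefficients
parameters_b
coef(model)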


# visualize cost across iterations ( scaled by 1e+8 for a readable y axis )
costs_df <- data.frame( iteration = 1:nrow(trace_b$cost), 
                        costs = trace_b$cost / 1e+8 )
ggplot( costs_df, aes( iteration, costs ) ) + geom_line()


# ---------------------------------------------------------------------------
# appendix : summary of the linear model

summary(model)

# residuals : the two summaries below are equivalent
summary( model$residuals )
summary( normed_data$price - model$fitted.values )


# t-value : each coefficient's estimate divided by its standard error
library(broom)
( coefficient <- tidy(model) )
coefficient$estimate / coefficient$std.error
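
# the same t values are reported by summary(model)
summary(model)$coefficients[ , "t value" ]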


# p-value : two-sided tail probability of the t statistic, using the
# residual degrees of freedom (observations minus estimated coefficients)
summary(model)$df
( df <- nrow(normed_data) - nrow(coefficient) )
pt( abs(coefficient$statistic), df = df, lower.tail = FALSE ) * 2
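
# the same p values are reported by summary(model)
summary(model)$coefficients[ , "Pr(>|t|)" ]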


# r squared

# @y  : original output value
# @py : predicted output value
RSquared <- function( y, py )
{
    rss <- sum( ( y - py )^2 )
    tss <- sum( ( y - mean(y) )^2 )
    return( 1 - rss / tss )
}

summary(model)$r.squared
RSquared( normed_data$price, model$fitted.values )


# adjusted r squared
k <- nrow(coefficient) - 1

# @y  : original output value
# @py : predicted output value
# @k  : number of the model's coefficients, excluding the intercept
AdjustedRSquared <- function( y, py, k )
{
    n  <- length(y)
    r2 <- RSquared( y, py )
    return( 1 - ( 1 - r2 ) * ( n - 1 ) / ( n - k - 1 ) )
}

# equivalent computation from the unadjusted r squared, since df = n - k - 1
1 - ( 1 - summary(model)$r.squared ) * ( nrow(normed_data) - 1 ) / df
summary(model)$adj.r.squared
AdjustedRSquared( normed_data$price, model$fitted.values, k )


# linear regression plot
source("/Users/ethen/machine-learning/linear_regression/linear_regession_code/LMPlot.R")
lm_plot <- LMPlot( model = model, actual = normed_data$price )
grid.draw(lm_plot$plot)
lm_plot$outlier


# variance inflation factor
library(car)
car::vif(model)

# manually compute the VIF for area : regress area on the remaining
# input variables (excluding the output, price), then take 1 / ( 1 - r2 )
area_model <- lm( area ~ . - price, data = normed_data )
area_r2 <- RSquared( y = normed_data$area, py = area_model$fitted.values )
1 / ( 1 - area_r2 )


# ----------------------------------------------------------------------------
# test code

# normal equation : solve for theta = (X^T X)^(-1) X^T y ;
# assuming `input` is the intercept-augmented normalized design matrix
# and `output` is the price column
input  <- cbind( intercept = 1, as.matrix( normed_data[ , -3 ] ) )
output <- normed_data$price
solve( t(input) %*% input ) %*% t(input) %*% output
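
# the normal equation's solution should match lm's coefficients
# fitted on the same normalized data
coef(model)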


# stochastic approach
trace_s <- GradientDescent( target = "price", data = housing, 
                            learning_rate = 0.05, iteration = 3000, method = "stochastic" )
parameters_s <- trace_s$theta[ nrow(trace_s$theta), ]

# benchmark the runtime of the two methods
library(microbenchmark)
runtime <- microbenchmark(
    batch = GradientDescent( target = "price", data = housing, 
                             learning_rate = 0.05, iteration = 500, method = "batch" ), 
    stochastic = GradientDescent( target = "price", data = housing, 
                                  learning_rate = 0.05, iteration = 521, method = "stochastic" )
)
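
# print the timing comparison of the two methods
runtime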