Shared2017-12-13-100013.ipynbOpen in CoCalc
R code final
#Load the data:
# Notice: if reproduction of this code is of interst, delete blank columns to achieve this result.
PATH <- "weekly_in_situ_co2_mlo.csv"
Data <- read.csv(PATH, header = TRUE)

#Change the time
Time <- as.Date(Data$Date)
plot(Time,Data$CO2, xlab = "Year", ylab = "CO2 Levels")

lm <- lm(CO2~Time, data = Data)
plot(Time,Data$CO2, xlab = "Year", ylab = "CO2 Levels")
abline(lm, col="red")

#turn the time parameter into days from day one:
Time <- c(Time-Time[1])
#Sanity check: we should have a length of 3040 (weeks between )
length(Time) 

3040
#turn time to numeric vector:
Time <- as.numeric(Time, units="days")
Time[3040]

21770
#Create a numeric dataframe to work with:
numeric_data <- data.frame(Data$CO2,Time)
colnames(numeric_data) <-c("CO2","Time")
#Expect to get dbl type (int) for both variables:
head(numeric_data)

CO2Time
316.19 0
317.31 7
317.6914
317.5821
316.4828
316.9535
#model test on the esixting data:
#use the first 70% to fit and 30% to predict:
set.seed(23)
train_size <- nrow(numeric_data)* .7 #<- train size
#let's split the data to train and test:
#train <- numeric_data[1:train_size, ]
train <- numeric_data[1:train_size,]
test_rows <- (nrow(numeric_data) - nrow(train))
test <- tail(numeric_data, test_rows)
#Sanity check: we should expect the test data to start at 2129 (one item after 70% of observations , and to have 3040-2128 = 912 )
head(test)

CO2Time
2129369.6115309
2130370.1115316
2131370.0915323
2132370.9815330
2133370.8615337
2134371.0415344
nrow(test)

912
#Add the seasonal parameter:
seasonal_var <- ts(cos((2*pi*train$Time)/365.25))
train <- data.frame(train$Time, seasonal_var, CO2=train$CO2)
colnames(train) <-c("Time","seasonal_var","CO2")
head(train)

Timeseasonal_varCO2
0 1.0000000316.19
7 0.9927586317.31
14 0.9711394317.69
21 0.9354554317.58
28 0.8862235316.48
35 0.8241566316.95
#Fit the model:
tslm <- lm(CO2~Time+seasonal_var, data = train)
#Later we'd want to plot the fit of the model:
seasonal_var_numeric <- as.numeric(seasonal_var)
tslm_var_for_plot <-(Time[1:nrow(train)]+seasonal_var_numeric)
summary(tslm)

Call: lm(formula = CO2 ~ Time + seasonal_var, data = train) Residuals: Min 1Q Median 3Q Max -4.2844 -1.6280 -0.2005 1.2968 6.1090 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 3.105e+02 9.116e-02 3406.23 <2e-16 *** Time 3.633e-03 1.020e-05 356.23 <2e-16 *** seasonal_var 2.544e+00 6.316e-02 40.27 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.055 on 2125 degrees of freedom Multiple R-squared: 0.9837, Adjusted R-squared: 0.9837 F-statistic: 6.422e+04 on 2 and 2125 DF, p-value: < 2.2e-16
plot(Time[1:nrow(train)],Data$CO2[1:nrow(train)], xlab = "Indexed Time", ylab = "CO2 Levels", main="CO2 (in black) predicted by the updated model(red) - Not bad at all!")
lines(tslm_var_for_plot, train$CO2, col="red")

#use the model and its predictor (time, seasonal variation) to predict CO2 for different times:
#X input for the model - copmlement the test set with the seasonal var:
seasonal_var <- ts(cos((2*pi*test$Time)/365.25))
test <- data.frame(test$Time, seasonal_var, CO2=test$CO2)
colnames(test) <-c("Time","seasonal_var","CO2")
head(test)

Timeseasonal_varCO2
15309 0.8567425369.61
15316 0.9124929370.11
15323 0.9550279370.09
15330 0.9837315370.98
15337 0.9981880370.86
15344 0.9981880371.04
tslm <- lm(CO2~Time+seasonal_var, data = train)
prediction<-predict(tslm, test)
summary(tslm)

Call: lm(formula = CO2 ~ Time + seasonal_var, data = train) Residuals: Min 1Q Median 3Q Max -4.2844 -1.6280 -0.2005 1.2968 6.1090 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 3.105e+02 9.116e-02 3406.23 <2e-16 *** Time 3.633e-03 1.020e-05 356.23 <2e-16 *** seasonal_var 2.544e+00 6.316e-02 40.27 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.055 on 2125 degrees of freedom Multiple R-squared: 0.9837, Adjusted R-squared: 0.9837 F-statistic: 6.422e+04 on 2 and 2125 DF, p-value: < 2.2e-16

pred_data <- data.frame(test$Time, prediction)
plot(test$Time,test$CO2, xlab = "Indexed Time", ylab = "CO2 Levels", main="Prediction: CO2 (in black) predicted by the updated model(red), \nnot so great...")
lines(pred_data, col="red")

#This model builds on the linear model that we have used so far as a mean to a normal distribution:
#priors (taken from the linear model):
c0 = 310.5; c1 = .003; c2 = 2.53; c3 = .01; c4=.5
counter = 0
seasonal_var_df <- as.array(seasonal_var_numeric)
#constract the Bayes-Gaussian Model:
bayesian_model <- function(num_days){
  bys <- rnorm(1,(c0+c1*num_days+c2*cos(2*pi*num_days/365.25 + c3)),c4)
  if (is.numeric(bys)){   #this subblock is just for the counter
     counter <- counter+1
  }
  return(rnorm(1,(c0+c1*num_days+c2*cos(2*pi*num_days/365.25 + c3)),c4))
}
#Let's see how this model fits the training data:
bayesian_results_func <- function(X_input) {
  #generate list for iteration:
  items_for_iteration <- length(X_input)
  l <- vector("integer", items_for_iteration)
  for (i in 1:length(X_input)){
    #see: https://stackoverflow.com/questions/26508519/how-to-add-elements-to-a-list-in-r-loop
    l[i] <- bayesian_model(X_input[i])
  }
  return (subset(l,!is.na(l)) )
  #l = l[!is.na(l)]
  #return(l)
}
bayesian_results <- bayesian_results_func(c(train$Time))

#Sanity Check: we should expect 2128 obs / 7 = 304
head(bayesian_results)
length(bayesian_results)
print(counter)
  1. 313.18011608506
  2. 312.082467171125
  3. 313.553690562177
  4. 312.590890964596
  5. 312.291578041677
  6. 313.506027261337
2128
[1] 0
is.numeric(bayesian_results)
TRUE
Time[1]
0
seasonal_var_df[1]
1
sum(is.na(rnorm(1000,(c0+c1*Time[1]+c2*seasonal_var_df[1]),c4)))
0
X_input
Error in eval(expr, envir, enclos): object 'X_input' not found Traceback:
train$Time
WARNING: Some output was deleted.