Shared2017-12-13-100013.ipynbOpen in CoCalc
R code final
#Load the data:
# Notice: if reproduction of this code is of interst, delete blank columns to achieve this result.
PATH <- "weekly_in_situ_co2_mlo.csv"


#Change the time
Time <- as.Date(Data$Date) plot(Time,Data$CO2, xlab = "Year", ylab = "CO2 Levels")


lm <- lm(CO2~Time, data = Data)
plot(Time,Data$CO2, xlab = "Year", ylab = "CO2 Levels") abline(lm, col="red")  #turn the time parameter into days from day one: Time <- c(Time-Time[1]) #Sanity check: we should have a length of 3040 (weeks between ) length(Time)  3040 #turn time to numeric vector: Time <- as.numeric(Time, units="days") Time[3040]  21770 #Create a numeric dataframe to work with: numeric_data <- data.frame(Data$CO2,Time)
colnames(numeric_data) <-c("CO2","Time")
#Expect to get dbl type (int) for both variables:


CO2Time
316.19 0
317.31 7
317.6914
317.5821
316.4828
316.9535
#model test on the esixting data:
#use the first 70% to fit and 30% to predict:
set.seed(23)
train_size <- nrow(numeric_data)* .7 #<- train size
#let's split the data to train and test:
#train <- numeric_data[1:train_size, ]
train <- numeric_data[1:train_size,]
test_rows <- (nrow(numeric_data) - nrow(train))
test <- tail(numeric_data, test_rows)
#Sanity check: we should expect the test data to start at 2129 (one item after 70% of observations , and to have 3040-2128 = 912 )


CO2Time
2129369.6115309
2130370.1115316
2131370.0915323
2132370.9815330
2133370.8615337
2134371.0415344
nrow(test)


912
#Add the seasonal parameter:
seasonal_var <- ts(cos((2*pi*train$Time)/365.25)) train <- data.frame(train$Time, seasonal_var, CO2=train$CO2) colnames(train) <-c("Time","seasonal_var","CO2") head(train)  Timeseasonal_varCO2 0 1.0000000316.19 7 0.9927586317.31 14 0.9711394317.69 21 0.9354554317.58 28 0.8862235316.48 35 0.8241566316.95 #Fit the model: tslm <- lm(CO2~Time+seasonal_var, data = train) #Later we'd want to plot the fit of the model: seasonal_var_numeric <- as.numeric(seasonal_var) tslm_var_for_plot <-(Time[1:nrow(train)]+seasonal_var_numeric) summary(tslm)  Call: lm(formula = CO2 ~ Time + seasonal_var, data = train) Residuals: Min 1Q Median 3Q Max -4.2844 -1.6280 -0.2005 1.2968 6.1090 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 3.105e+02 9.116e-02 3406.23 <2e-16 *** Time 3.633e-03 1.020e-05 356.23 <2e-16 *** seasonal_var 2.544e+00 6.316e-02 40.27 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.055 on 2125 degrees of freedom Multiple R-squared: 0.9837, Adjusted R-squared: 0.9837 F-statistic: 6.422e+04 on 2 and 2125 DF, p-value: < 2.2e-16 plot(Time[1:nrow(train)],Data$CO2[1:nrow(train)], xlab = "Indexed Time", ylab = "CO2 Levels", main="CO2 (in black) predicted by the updated model(red) - Not bad at all!")
lines(tslm_var_for_plot, train$CO2, col="red")  #use the model and its predictor (time, seasonal variation) to predict CO2 for different times: #X input for the model - copmlement the test set with the seasonal var: seasonal_var <- ts(cos((2*pi*test$Time)/365.25))
test <- data.frame(test$Time, seasonal_var, CO2=test$CO2)
colnames(test) <-c("Time","seasonal_var","CO2")


Timeseasonal_varCO2
15309 0.8567425369.61
15316 0.9124929370.11
15323 0.9550279370.09
15330 0.9837315370.98
15337 0.9981880370.86
15344 0.9981880371.04
tslm <- lm(CO2~Time+seasonal_var, data = train)
prediction<-predict(tslm, test)
summary(tslm)


Call: lm(formula = CO2 ~ Time + seasonal_var, data = train) Residuals: Min 1Q Median 3Q Max -4.2844 -1.6280 -0.2005 1.2968 6.1090 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 3.105e+02 9.116e-02 3406.23 <2e-16 *** Time 3.633e-03 1.020e-05 356.23 <2e-16 *** seasonal_var 2.544e+00 6.316e-02 40.27 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.055 on 2125 degrees of freedom Multiple R-squared: 0.9837, Adjusted R-squared: 0.9837 F-statistic: 6.422e+04 on 2 and 2125 DF, p-value: < 2.2e-16

pred_data <- data.frame(test$Time, prediction) plot(test$Time,test$CO2, xlab = "Indexed Time", ylab = "CO2 Levels", main="Prediction: CO2 (in black) predicted by the updated model(red), \nnot so great...") lines(pred_data, col="red")  #This model builds on the linear model that we have used so far as a mean to a normal distribution: #priors (taken from the linear model): c0 = 310.5; c1 = .003; c2 = 2.53; c3 = .01; c4=.5 counter = 0 seasonal_var_df <- as.array(seasonal_var_numeric) #constract the Bayes-Gaussian Model: bayesian_model <- function(num_days){ bys <- rnorm(1,(c0+c1*num_days+c2*cos(2*pi*num_days/365.25 + c3)),c4) if (is.numeric(bys)){ #this subblock is just for the counter counter <- counter+1 } return(rnorm(1,(c0+c1*num_days+c2*cos(2*pi*num_days/365.25 + c3)),c4)) } #Let's see how this model fits the training data: bayesian_results_func <- function(X_input) { #generate list for iteration: items_for_iteration <- length(X_input) l <- vector("integer", items_for_iteration) for (i in 1:length(X_input)){ #see: https://stackoverflow.com/questions/26508519/how-to-add-elements-to-a-list-in-r-loop l[i] <- bayesian_model(X_input[i]) } return (subset(l,!is.na(l)) ) #l = l[!is.na(l)] #return(l) } bayesian_results <- bayesian_results_func(c(train$Time))


#Sanity Check: we should expect 2128 obs / 7 = 304
length(bayesian_results)
print(counter)

1. 313.18011608506
2. 312.082467171125
3. 313.553690562177
4. 312.590890964596
5. 312.291578041677
6. 313.506027261337
2128
[1] 0
is.numeric(bayesian_results)

TRUE
Time[1]

0
seasonal_var_df[1]

1
sum(is.na(rnorm(1000,(c0+c1*Time[1]+c2*seasonal_var_df[1]),c4)))

0
X_input

Error in eval(expr, envir, enclos): object 'X_input' not found Traceback: 
train\$Time

WARNING: Some output was deleted.