Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News Sign UpSign In
| Download

r code final

Project: 146final
Views: 120
Kernel: SageMath (stable)
# Load the data.
# Note: to reproduce this result, delete blank columns from the CSV first.
PATH <- "weekly_in_situ_co2_mlo.csv"
Data <- read.csv(PATH, header = TRUE)
# Parse the date column and plot the raw CO2 series over calendar time.
Time <- as.Date(Data$Date)
plot(Time, Data$CO2, xlab = "Year", ylab = "CO2 Levels")
Image in a Jupyter notebook
# Fit a simple linear trend of CO2 on time and overlay it on the scatter plot.
# Fix: the original stored the fit in a variable named `lm`, masking base::lm;
# use a distinct name instead.
trend_fit <- lm(CO2 ~ Time, data = Data)
plot(Time, Data$CO2, xlab = "Year", ylab = "CO2 Levels")
abline(trend_fit, col = "red")
Image in a Jupyter notebook
# Re-express time as the offset (in days) from the first observation.
Time <- c(Time - Time[1])
# Sanity check: the series should contain 3040 weekly observations.
length(Time)
3040
# Coerce the day offsets to a plain numeric vector.
Time <- as.numeric(Time, units = "days")
# Last offset in days. Fix: use length(Time) rather than the hard-coded
# index 3040 so the check still works if the data set grows.
Time[length(Time)]
21770
# Build a purely numeric working data frame; both columns should print as
# numeric (dbl) in head().
numeric_data <- data.frame(CO2 = Data$CO2, Time = Time)
head(numeric_data)
   CO2  Time
316.19     0
317.31     7
317.69    14
317.58    21
316.48    28
316.95    35
# Model test on the existing data: fit on the first 70%, predict on the
# remaining 30%.
set.seed(23)
# Fix: floor() guards against a fractional row index (nrow * 0.7 need not be
# an integer in general; here 3040 * 0.7 = 2128 exactly, so behavior is
# unchanged). Dead commented-out duplicate line removed.
train_size <- floor(nrow(numeric_data) * 0.7)
train <- numeric_data[seq_len(train_size), ]
test_rows <- nrow(numeric_data) - nrow(train)
test <- tail(numeric_data, test_rows)
# Sanity check: the test set should start at row 2129 (one past 70% of the
# observations) and hold 3040 - 2128 = 912 rows.
head(test)
        CO2   Time
2129 369.61  15309
2130 370.11  15316
2131 370.09  15323
2132 370.98  15330
2133 370.86  15337
2134 371.04  15344
nrow(test)
912
# Add the seasonal covariate: an annual cosine cycle (period 365.25 days)
# evaluated at each training time offset.
seasonal_var <- ts(cos((2 * pi * train$Time) / 365.25))
train <- data.frame(Time = train$Time,
                    seasonal_var = seasonal_var,
                    CO2 = train$CO2)
head(train)
Time  seasonal_var     CO2
   0     1.0000000  316.19
   7     0.9927586  317.31
  14     0.9711394  317.69
  21     0.9354554  317.58
  28     0.8862235  316.48
  35     0.8241566  316.95
# Fit the trend-plus-seasonality model.
tslm <- lm(CO2 ~ Time + seasonal_var, data = train)
# Precompute an x-axis (time shifted by the seasonal term) for the fit plot.
seasonal_var_numeric <- as.numeric(seasonal_var)
tslm_var_for_plot <- Time[seq_len(nrow(train))] + seasonal_var_numeric
summary(tslm)
Call: lm(formula = CO2 ~ Time + seasonal_var, data = train) Residuals: Min 1Q Median 3Q Max -4.2844 -1.6280 -0.2005 1.2968 6.1090 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 3.105e+02 9.116e-02 3406.23 <2e-16 *** Time 3.633e-03 1.020e-05 356.23 <2e-16 *** seasonal_var 2.544e+00 6.316e-02 40.27 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.055 on 2125 degrees of freedom Multiple R-squared: 0.9837, Adjusted R-squared: 0.9837 F-statistic: 6.422e+04 on 2 and 2125 DF, p-value: < 2.2e-16
# Plot the observed training CO2 (black) and overlay the model (red).
plot(Time[seq_len(nrow(train))], Data$CO2[seq_len(nrow(train))],
     xlab = "Indexed Time", ylab = "CO2 Levels",
     main = "CO2 (in black) predicted by the updated model(red) - Not bad at all!")
# Fix: the original drew the raw data against a slightly shifted x-axis
# (Time + seasonal_var), so the "model" line was just the data again.
# Draw the model's actual fitted values instead.
lines(Time[seq_len(nrow(train))], fitted(tslm), col = "red")
Image in a Jupyter notebook
# Prepare the model inputs for the hold-out set: complement the test times
# with the same annual-cosine seasonal covariate used in training.
seasonal_var <- ts(cos((2 * pi * test$Time) / 365.25))
test <- data.frame(Time = test$Time,
                   seasonal_var = seasonal_var,
                   CO2 = test$CO2)
head(test)
 Time  seasonal_var     CO2
15309     0.8567425  369.61
15316     0.9124929  370.11
15323     0.9550279  370.09
15330     0.9837315  370.98
15337     0.9981880  370.86
15344     0.9981880  371.04
# Predict CO2 for the hold-out period.
# Fix: the original refit the identical model here; `tslm` from the training
# step is reused instead (same formula, same data, same coefficients).
prediction <- predict(tslm, test)
summary(tslm)
Call: lm(formula = CO2 ~ Time + seasonal_var, data = train) Residuals: Min 1Q Median 3Q Max -4.2844 -1.6280 -0.2005 1.2968 6.1090 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 3.105e+02 9.116e-02 3406.23 <2e-16 *** Time 3.633e-03 1.020e-05 356.23 <2e-16 *** seasonal_var 2.544e+00 6.316e-02 40.27 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.055 on 2125 degrees of freedom Multiple R-squared: 0.9837, Adjusted R-squared: 0.9837 F-statistic: 6.422e+04 on 2 and 2125 DF, p-value: < 2.2e-16
# Compare predictions (red) against the observed hold-out data (black).
pred_data <- data.frame(test$Time, prediction)
plot(test$Time, test$CO2,
     xlab = "Indexed Time", ylab = "CO2 Levels",
     main = "Prediction: CO2 (in black) predicted by the updated model(red), \nnot so great...")
lines(pred_data, col = "red")
Image in a Jupyter notebook
# Bayesian-Gaussian model: the linear model fitted so far supplies the mean
# of a normal distribution from which CO2 values are drawn.
# Priors (taken from the linear model's coefficients):
c0 <- 310.5
c1 <- 0.003
c2 <- 2.53
c3 <- 0.01
c4 <- 0.5
counter <- 0
seasonal_var_df <- as.array(seasonal_var_numeric)

# Draw one simulated CO2 value for a given day offset.
bayesian_model <- function(num_days) {
  mu <- c0 + c1 * num_days + c2 * cos(2 * pi * num_days / 365.25 + c3)
  bys <- rnorm(1, mu, c4)
  if (is.numeric(bys)) {
    # Fix: the original used `<-`, which only changed a function-local copy,
    # so the global counter stayed 0 (see the printed output). `<<-` updates
    # the global.
    counter <<- counter + 1
  }
  # Fix: return the draw we just made. The original called rnorm() a second
  # time, returning a different draw than the one it validated.
  bys
}

# Apply the model across a vector of day offsets and drop any NA draws.
bayesian_results_func <- function(X_input) {
  # Preallocate as numeric (the draws are doubles, not integers).
  l <- vector("numeric", length(X_input))
  for (i in seq_along(X_input)) {
    l[i] <- bayesian_model(X_input[i])
  }
  l[!is.na(l)]
}

bayesian_results <- bayesian_results_func(train$Time)
# Sanity check: expect one simulated draw per training row (2128), and
# inspect the counter updated inside bayesian_model.
head(bayesian_results)
length(bayesian_results)
print(counter)
  1. 313.18011608506
  2. 312.082467171125
  3. 313.553690562177
  4. 312.590890964596
  5. 312.291578041677
  6. 313.506027261337
2128
[1] 0
# Confirm the simulated draws form a plain numeric vector.
is.numeric(bayesian_results)
TRUE
# First day offset (should be 0 after re-indexing).
Time[1]
0
# Seasonal term at day 0: cos(0) = 1.
seasonal_var_df[1]
1
# Spot-check: 1000 draws from the model's distribution at day 0 should
# contain no NAs.
sum(is.na(rnorm(1000,(c0+c1*Time[1]+c2*seasonal_var_df[1]),c4)))
0
# NOTE(review): this errors (see output below) — X_input is a parameter local
# to bayesian_results_func and does not exist in the global environment.
X_input
Error in eval(expr, envir, enclos): object 'X_input' not found Traceback:
# Inspect the raw training time offsets (long output was truncated by the UI).
train$Time
WARNING: Some output was deleted.