{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "#Load the data:\n", "# Notice: if reproduction of this code is of interst, delete blank columns to achieve this result.\n", "PATH <- \"weekly_in_situ_co2_mlo.csv\"\n", "Data <- read.csv(PATH, header = TRUE)\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": "6d1e728413d3439ce3dafea9ed2c7f941afac72e" }, "execution_count": 2, "metadata": { }, "output_type": "execute_result" } ], "source": [ "#Change the time\n", "Time <- as.Date(Data$Date)\n", "plot(Time,Data$CO2, xlab = \"Year\", ylab = \"CO2 Levels\")\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": "8f108a7146da293d548d35f82c118ddef21ffcfd" }, "execution_count": 3, "metadata": { }, "output_type": "execute_result" } ], "source": [ "lm <- lm(CO2~Time, data = Data)\n", "plot(Time,Data$CO2, xlab = \"Year\", ylab = \"CO2 Levels\")\n", "abline(lm, col=\"red\")\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "3040" ] }, "execution_count": 4, "metadata": { }, "output_type": "execute_result" } ], "source": [ "#turn the time parameter into days from day one:\n", "Time <- c(Time-Time[1])\n", "#Sanity check: we should have a length of 3040 (weeks between )\n", "length(Time) \n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "21770" ] }, "execution_count": 5, "metadata": { }, "output_type": "execute_result" } ], "source": [ "#turn time to numeric vector:\n", "Time <- as.numeric(Time, units=\"days\")\n", "Time[3040]\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
CO2Time
316.19 0
317.31 7
317.6914
317.5821
316.4828
316.9535
\n" ] }, "execution_count": 6, "metadata": { }, "output_type": "execute_result" } ], "source": [ "#Create a numeric dataframe to work with:\n", "numeric_data <- data.frame(Data$CO2,Time)\n", "colnames(numeric_data) <-c(\"CO2\",\"Time\")\n", "#Expect to get dbl type (int) for both variables:\n", "head(numeric_data)\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
CO2Time
2129369.6115309
2130370.1115316
2131370.0915323
2132370.9815330
2133370.8615337
2134371.0415344
\n" ] }, "execution_count": 7, "metadata": { }, "output_type": "execute_result" } ], "source": [ "#model test on the esixting data:\n", "#use the first 70% to fit and 30% to predict:\n", "set.seed(23)\n", "train_size <- nrow(numeric_data)* .7 #<- train size\n", "#let's split the data to train and test:\n", "#train <- numeric_data[1:train_size, ]\n", "train <- numeric_data[1:train_size,]\n", "test_rows <- (nrow(numeric_data) - nrow(train))\n", "test <- tail(numeric_data, test_rows)\n", "#Sanity check: we should expect the test data to start at 2129 (one item after 70% of observations , and to have 3040-2128 = 912 )\n", "head(test)\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "912" ] }, "execution_count": 8, "metadata": { }, "output_type": "execute_result" } ], "source": [ "nrow(test)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
Timeseasonal_varCO2
0 1.0000000316.19
7 0.9927586317.31
14 0.9711394317.69
21 0.9354554317.58
28 0.8862235316.48
35 0.8241566316.95
\n" ] }, "execution_count": 9, "metadata": { }, "output_type": "execute_result" } ], "source": [ "#Add the seasonal parameter:\n", "seasonal_var <- ts(cos((2*pi*train$Time)/365.25))\n", "train <- data.frame(train$Time, seasonal_var, CO2=train$CO2)\n", "colnames(train) <-c(\"Time\",\"seasonal_var\",\"CO2\")\n", "head(train)\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "Call:\n", "lm(formula = CO2 ~ Time + seasonal_var, data = train)\n", "\n", "Residuals:\n", " Min 1Q Median 3Q Max \n", "-4.2844 -1.6280 -0.2005 1.2968 6.1090 \n", "\n", "Coefficients:\n", " Estimate Std. Error t value Pr(>|t|) \n", "(Intercept) 3.105e+02 9.116e-02 3406.23 <2e-16 ***\n", "Time 3.633e-03 1.020e-05 356.23 <2e-16 ***\n", "seasonal_var 2.544e+00 6.316e-02 40.27 <2e-16 ***\n", "---\n", "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n", "\n", "Residual standard error: 2.055 on 2125 degrees of freedom\n", "Multiple R-squared: 0.9837,\tAdjusted R-squared: 0.9837 \n", "F-statistic: 6.422e+04 on 2 and 2125 DF, p-value: < 2.2e-16\n" ] }, "execution_count": 10, "metadata": { }, "output_type": "execute_result" } ], "source": [ "#Fit the model:\n", "tslm <- lm(CO2~Time+seasonal_var, data = train)\n", "#Later we'd want to plot the fit of the model:\n", "seasonal_var_numeric <- as.numeric(seasonal_var)\n", "tslm_var_for_plot <-(Time[1:nrow(train)]+seasonal_var_numeric)\n", "summary(tslm)\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": "b2d46b012701857b60818e5edaadc19523161924" }, "execution_count": 11, "metadata": { }, "output_type": "execute_result" } ], "source": [ "plot(Time[1:nrow(train)],Data$CO2[1:nrow(train)], xlab = \"Indexed Time\", ylab = \"CO2 Levels\", main=\"CO2 (in black) predicted by the updated model(red) - Not bad at all!\")\n", "lines(tslm_var_for_plot, train$CO2, col=\"red\")\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
Timeseasonal_varCO2
15309 0.8567425369.61
15316 0.9124929370.11
15323 0.9550279370.09
15330 0.9837315370.98
15337 0.9981880370.86
15344 0.9981880371.04
\n" ] }, "execution_count": 12, "metadata": { }, "output_type": "execute_result" } ], "source": [ "#use the model and its predictor (time, seasonal variation) to predict CO2 for different times:\n", "#X input for the model - copmlement the test set with the seasonal var:\n", "seasonal_var <- ts(cos((2*pi*test$Time)/365.25))\n", "test <- data.frame(test$Time, seasonal_var, CO2=test$CO2)\n", "colnames(test) <-c(\"Time\",\"seasonal_var\",\"CO2\")\n", "head(test)\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "Call:\n", "lm(formula = CO2 ~ Time + seasonal_var, data = train)\n", "\n", "Residuals:\n", " Min 1Q Median 3Q Max \n", "-4.2844 -1.6280 -0.2005 1.2968 6.1090 \n", "\n", "Coefficients:\n", " Estimate Std. Error t value Pr(>|t|) \n", "(Intercept) 3.105e+02 9.116e-02 3406.23 <2e-16 ***\n", "Time 3.633e-03 1.020e-05 356.23 <2e-16 ***\n", "seasonal_var 2.544e+00 6.316e-02 40.27 <2e-16 ***\n", "---\n", "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n", "\n", "Residual standard error: 2.055 on 2125 degrees of freedom\n", "Multiple R-squared: 0.9837,\tAdjusted R-squared: 0.9837 \n", "F-statistic: 6.422e+04 on 2 and 2125 DF, p-value: < 2.2e-16\n" ] }, "execution_count": 13, "metadata": { }, "output_type": "execute_result" } ], "source": [ "tslm <- lm(CO2~Time+seasonal_var, data = train)\n", "prediction<-predict(tslm, test)\n", "summary(tslm)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": "6d489645a26852e3fda727993d5332534b763a07" }, "execution_count": 14, "metadata": { }, "output_type": "execute_result" } ], "source": [ "\n", "pred_data <- data.frame(test$Time, prediction)\n", "plot(test$Time,test$CO2, xlab = \"Indexed Time\", ylab = \"CO2 Levels\", main=\"Prediction: CO2 (in black) predicted by the updated model(red), \\nnot so great...\")\n", "lines(pred_data, col=\"red\")\n" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "#This model builds on the linear model that we have used so far as a mean to a normal distribution:\n", "#priors (taken from the linear model):\n", "c0 = 310.5; c1 = .003; c2 = 2.53; c3 = .01; c4=.5\n", "counter = 0\n", "seasonal_var_df <- as.array(seasonal_var_numeric)\n", "#constract the Bayes-Gaussian Model:\n", "bayesian_model <- function(num_days){\n", " bys <- rnorm(1,(c0+c1*num_days+c2*cos(2*pi*num_days/365.25 + c3)),c4)\n", " if (is.numeric(bys)){ #this subblock is just for the counter\n", " counter <- counter+1\n", " }\n", " return(rnorm(1,(c0+c1*num_days+c2*cos(2*pi*num_days/365.25 + c3)),c4))\n", "}\n", "#Let's see how this model fits the training data:\n", "bayesian_results_func <- function(X_input) {\n", " #generate list for iteration:\n", " items_for_iteration <- length(X_input)\n", " l <- vector(\"integer\", items_for_iteration)\n", " for (i in 1:length(X_input)){\n", " #see: https://stackoverflow.com/questions/26508519/how-to-add-elements-to-a-list-in-r-loop\n", " l[i] <- bayesian_model(X_input[i])\n", " }\n", " return (subset(l,!is.na(l)) )\n", " #l = l[!is.na(l)]\n", " #return(l)\n", "}\n", "bayesian_results <- bayesian_results_func(c(train$Time))\n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 313.18011608506
  2. \n", "\t
  3. 312.082467171125
  4. \n", "\t
  5. 313.553690562177
  6. \n", "\t
  7. 312.590890964596
  8. \n", "\t
  9. 312.291578041677
  10. \n", "\t
  11. 313.506027261337
  12. \n", "
\n" ] }, "execution_count": 36, "metadata": { }, "output_type": "execute_result" }, { "data": { "text/html": [ "2128" ] }, "execution_count": 36, "metadata": { }, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "[1] 0\n" ] } ], "source": [ "#Sanity Check: we should expect 2128 obs / 7 = 304\n", "head(bayesian_results)\n", "length(bayesian_results)\n", "print(counter)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "TRUE" ] }, "execution_count": 41, "metadata": { }, "output_type": "execute_result" } ], "source": [ "is.numeric(bayesian_results)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "0" ] }, "execution_count": 25, "metadata": { }, "output_type": "execute_result" } ], "source": [ "Time[1]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "1" ] }, "execution_count": 26, "metadata": { }, "output_type": "execute_result" } ], "source": [ "seasonal_var_df[1]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "0" ] }, "execution_count": 32, "metadata": { }, "output_type": "execute_result" } ], "source": [ "sum(is.na(rnorm(1000,(c0+c1*Time[1]+c2*seasonal_var_df[1]),c4)))" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "ename": "ERROR", "evalue": "Error in eval(expr, envir, enclos): object 'X_input' not found\n", "output_type": "error", "traceback": [ "Error in eval(expr, envir, enclos): object 'X_input' not found\nTraceback:\n" ] } ], "source": [ "X_input" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING: Some output was deleted.\n" ] } ], "source": [ "train$Time" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] } ], "metadata": { "kernelspec": { "display_name": "SageMath (stable)", "name": "sagemath" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 0 }