Path: blob/main/latex-templates/templates/data-science/time_series.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{Time Series Analysis and Forecasting}
\author{Computational Data Science}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document presents a comprehensive analysis of time series data, including decomposition into trend, seasonal, and residual components, autocorrelation analysis, autoregressive modeling within the ARIMA framework, and forecasting with prediction intervals. The analysis also demonstrates stationarity testing and information-criterion model selection.
\end{abstract}

\section{Introduction}
Time series analysis is fundamental to understanding temporal patterns in data. An additive time series $\{y_t\}$ can be decomposed as
\begin{equation}
y_t = T_t + S_t + R_t,
\end{equation}
where $T_t$ is the trend component, $S_t$ is the seasonal component, and $R_t$ is the residual.

The ARIMA$(p,d,q)$ model combines autoregression, differencing, and a moving average:
\begin{equation}
\phi(B)(1-B)^d y_t = \theta(B)\epsilon_t,
\end{equation}
where $B$ is the backshift operator ($By_t = y_{t-1}$), $\phi(B) = 1 - \phi_1 B - \cdots - \phi_p B^p$ is the AR polynomial, and $\theta(B) = 1 + \theta_1 B + \cdots + \theta_q B^q$ is the MA polynomial. For example, ARIMA$(1,1,1)$ expands to $(1-\phi_1 B)(1-B)y_t = (1+\theta_1 B)\epsilon_t$: one AR term applied to the first-differenced series plus one MA term.

\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.fft import fft, fftfreq
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    """Save the current figure and emit the LaTeX needed to include it."""
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}

\section{Data Generation and Overview}
\begin{pycode}
# Generate a synthetic time series with trend, seasonality, and noise
n = 200
t = np.arange(n)

# Components
trend = 0.05 * t + 0.0002 * t**2
seasonal_period = 12
seasonality = 3 * np.sin(2 * np.pi * t / seasonal_period)
noise = np.random.normal(0, 1.5, n)

# Combined series
y = trend + seasonality + noise

# Basic statistics
mean_y = np.mean(y)
std_y = np.std(y)
min_y = np.min(y)
max_y = np.max(y)

# Plot the raw series
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(t, y, 'b-', linewidth=0.8, alpha=0.8)
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.set_title('Raw Time Series Data')
ax.grid(True, alpha=0.3)
save_plot('ts_raw_series.pdf', 'Original time series showing trend and seasonal patterns.')
\end{pycode}

The generated time series has $n = \py{n}$ observations with mean $\mu = \py{f"{mean_y:.2f}"}$ and standard deviation $\sigma = \py{f"{std_y:.2f}"}$.

\section{Time Series Decomposition}
\begin{pycode}
# Classical decomposition, using a moving average for the trend.
# Note: for an even window the textbook choice is a (2 x m)-term
# centered average; a plain m-term average is used here for brevity,
# and mode='same' biases the estimate near the endpoints.
def moving_average(x, window):
    weights = np.ones(window) / window
    return np.convolve(x, weights, mode='same')

# Extract the trend
window = seasonal_period
trend_est = moving_average(y, window)

# Detrend to isolate seasonal + residual
detrended = y - trend_est

# Estimate the seasonal pattern by averaging over each phase of the cycle
seasonal_est = np.zeros(n)
for i in range(seasonal_period):
    indices = np.arange(i, n, seasonal_period)
    seasonal_est[indices] = np.mean(detrended[indices])
# Center the seasonal component so its effects sum to zero over one period
seasonal_est -= np.mean(seasonal_est[:seasonal_period])

# Residual
residual = y - trend_est - seasonal_est

# Plot the decomposition
fig, axes = plt.subplots(4, 1, figsize=(10, 10), sharex=True)

axes[0].plot(t, y, 'b-', linewidth=0.8)
axes[0].set_ylabel('Observed')
axes[0].set_title('Time Series Decomposition')
axes[0].grid(True, alpha=0.3)

axes[1].plot(t, trend_est, 'r-', linewidth=1.5)
axes[1].set_ylabel('Trend')
axes[1].grid(True, alpha=0.3)

axes[2].plot(t, seasonal_est, 'g-', linewidth=1)
axes[2].set_ylabel('Seasonal')
axes[2].grid(True, alpha=0.3)

axes[3].plot(t, residual, 'purple', linewidth=0.8)
axes[3].set_ylabel('Residual')
axes[3].set_xlabel('Time')
axes[3].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('ts_decomposition.pdf', 'Classical additive decomposition of the time series.')
\end{pycode}
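As a sanity check, the hand-rolled decomposition can be compared against a library implementation. The sketch below assumes \texttt{statsmodels} is available in the build environment (an optional dependency not required elsewhere in this template) and skips the comparison gracefully if it is not; the two trend estimators handle the even window slightly differently, so exact agreement is not expected.

\begin{pycode}
# Optional cross-check against statsmodels' seasonal_decompose.
# Assumption: statsmodels is installed; otherwise this block only
# prints a note and the rest of the document is unaffected.
try:
    from statsmodels.tsa.seasonal import seasonal_decompose
    sd = seasonal_decompose(y, model='additive', period=seasonal_period)
    # statsmodels leaves NaNs at the ends of its centered-MA trend,
    # so compare only where both estimates are defined.
    mask = ~np.isnan(sd.trend)
    trend_gap = np.max(np.abs(sd.trend[mask] - trend_est[mask]))
    print(f'Maximum trend discrepancy versus statsmodels: {trend_gap:.3f}.')
except ImportError:
    print('statsmodels is not installed; cross-check skipped.')
\end{pycode}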
\section{Autocorrelation Analysis}
The autocorrelation function (ACF) measures correlation at different lags:
\begin{equation}
\rho_k = \frac{\sum_{t=k+1}^{n}(y_t - \bar{y})(y_{t-k} - \bar{y})}{\sum_{t=1}^{n}(y_t - \bar{y})^2}.
\end{equation}

The partial autocorrelation function (PACF) measures the correlation at lag $k$ after controlling for the intermediate lags $1, \dots, k-1$.

\begin{pycode}
def acf(x, nlags):
    """Compute the sample autocorrelation function up to nlags."""
    n = len(x)
    x = x - np.mean(x)
    result = np.correlate(x, x, mode='full')
    result = result[n-1:] / result[n-1]
    return result[:nlags+1]

def pacf(x, nlags):
    """Compute partial autocorrelations via the Durbin-Levinson recursion."""
    r = acf(x, nlags)
    pacf_vals = np.zeros(nlags + 1)
    pacf_vals[0] = 1.0
    pacf_vals[1] = r[1]

    phi = np.zeros((nlags + 1, nlags + 1))
    phi[1, 1] = r[1]

    for k in range(2, nlags + 1):
        num = r[k] - sum(phi[k-1, j] * r[k-j] for j in range(1, k))
        den = 1 - sum(phi[k-1, j] * r[j] for j in range(1, k))
        phi[k, k] = num / den if den != 0 else 0
        for j in range(1, k):
            phi[k, j] = phi[k-1, j] - phi[k, k] * phi[k-1, k-j]
        pacf_vals[k] = phi[k, k]

    return pacf_vals

nlags = 40
acf_vals = acf(y, nlags)
pacf_vals = pacf(y, nlags)

# 95% confidence bounds under the white-noise null
conf_bound = 1.96 / np.sqrt(n)

fig, axes = plt.subplots(2, 1, figsize=(10, 6))

# ACF plot
axes[0].bar(range(nlags+1), acf_vals, width=0.3, color='steelblue', alpha=0.8)
axes[0].axhline(conf_bound, color='r', linestyle='--', linewidth=1)
axes[0].axhline(-conf_bound, color='r', linestyle='--', linewidth=1)
axes[0].axhline(0, color='k', linewidth=0.5)
axes[0].set_xlabel('Lag')
axes[0].set_ylabel('ACF')
axes[0].set_title('Autocorrelation Function')
axes[0].set_xlim(-0.5, nlags+0.5)

# PACF plot
axes[1].bar(range(nlags+1), pacf_vals, width=0.3, color='darkgreen', alpha=0.8)
axes[1].axhline(conf_bound, color='r', linestyle='--', linewidth=1)
axes[1].axhline(-conf_bound, color='r', linestyle='--', linewidth=1)
axes[1].axhline(0, color='k', linewidth=0.5)
axes[1].set_xlabel('Lag')
axes[1].set_ylabel('PACF')
axes[1].set_title('Partial Autocorrelation Function')
axes[1].set_xlim(-0.5, nlags+0.5)

plt.tight_layout()
save_plot('ts_acf_pacf.pdf', 'ACF and PACF plots with 95\\% confidence bounds.')
\end{pycode}
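For reference, the recursion implemented in \texttt{pacf} above is the Durbin-Levinson recursion: starting from $\phi_{1,1} = \rho_1$, each new partial autocorrelation is obtained as
\begin{equation}
\phi_{k,k} = \frac{\rho_k - \sum_{j=1}^{k-1} \phi_{k-1,j}\,\rho_{k-j}}{1 - \sum_{j=1}^{k-1} \phi_{k-1,j}\,\rho_j},
\qquad
\phi_{k,j} = \phi_{k-1,j} - \phi_{k,k}\,\phi_{k-1,k-j} \quad (1 \le j < k),
\end{equation}
which matches the \texttt{num}/\texttt{den} update in the code line for line.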
\section{Stationarity Testing}
A time series is stationary if its statistical properties remain constant over time. The Augmented Dickey-Fuller (ADF) test checks for a unit root: the null hypothesis is that one is present, so a test statistic below the critical value rejects the null in favor of stationarity.

\begin{pycode}
# Augmented Dickey-Fuller test implementation
def adf_test(y, max_lag=None):
    """ADF regression: dy_t = alpha + beta*y_{t-1} + sum_i gamma_i*dy_{t-i} + e_t.

    Returns the t-statistic on beta together with approximate
    critical values for the constant-only case.
    """
    n = len(y)
    if max_lag is None:
        # Schwert's rule of thumb for the lag order
        max_lag = int(np.floor(12 * (n / 100) ** 0.25))

    dy = np.diff(y)
    rows = len(dy) - max_lag
    Y = dy[max_lag:]
    X = [np.ones(rows), y[max_lag:-1]]     # intercept and y_{t-1}
    for i in range(1, max_lag + 1):        # lagged differences
        X.append(dy[max_lag - i:len(dy) - i])
    X = np.column_stack(X)

    coeffs = np.linalg.lstsq(X, Y, rcond=None)[0]
    residuals = Y - X @ coeffs

    # Standard error of the coefficient on y_{t-1}
    s2 = residuals @ residuals / (rows - X.shape[1])
    se_beta = np.sqrt(s2 * np.linalg.inv(X.T @ X)[1, 1])
    adf_stat = coeffs[1] / se_beta

    # Approximate critical values (constant, no trend)
    critical_values = {0.01: -3.43, 0.05: -2.86, 0.10: -2.57}

    return adf_stat, critical_values

adf_stat, critical = adf_test(y)
is_stationary = adf_stat < critical[0.05]

# Repeat the test on the differenced series
dy = np.diff(y)
adf_diff, _ = adf_test(dy)
\end{pycode}

\begin{table}[H]
\centering
\caption{Augmented Dickey-Fuller Test Results}
\begin{tabular}{lcc}
\toprule
Series & ADF Statistic & Stationary (5\%) \\
\midrule
Original & \py{f"{adf_stat:.3f}"} & \py{"Yes" if is_stationary else "No"} \\
First Difference & \py{f"{adf_diff:.3f}"} & \py{"Yes" if adf_diff < critical[0.05] else "No"} \\
\bottomrule
\end{tabular}
\end{table}

Critical values: 1\%: $-3.43$, 5\%: $-2.86$, 10\%: $-2.57$.

\section{ARIMA Model Fitting}
For simplicity, pure AR$(p)$ models, i.e.\ ARIMA$(p,0,0)$, are fitted by ordinary least squares, and the order $p$ is chosen by information criteria.

\begin{pycode}
# Fit an AR(p) model by conditional least squares
def fit_ar(y, p):
    """Fit an AR(p) model with intercept using OLS."""
    n = len(y)
    Y = y[p:]
    X = np.column_stack([y[p-i-1:n-i-1] for i in range(p)])  # lags 1..p
    X = np.column_stack([np.ones(len(Y)), X])

    coeffs = np.linalg.lstsq(X, Y, rcond=None)[0]
    fitted = X @ coeffs
    residuals = Y - fitted

    # Information criteria (Gaussian likelihood, up to an additive constant)
    k = p + 1
    n_eff = len(Y)
    sse = np.sum(residuals**2)
    aic = n_eff * np.log(sse/n_eff) + 2*k
    bic = n_eff * np.log(sse/n_eff) + k*np.log(n_eff)

    return coeffs, residuals, aic, bic, fitted

# Model selection over candidate orders
results = []
for p in range(1, 7):
    coeffs, resid, aic, bic, fitted = fit_ar(y, p)
    results.append((p, aic, bic, np.std(resid)))

best_p = min(results, key=lambda x: x[1])[0]   # minimize AIC
coeffs_best, resid_best, aic_best, bic_best, fitted_best = fit_ar(y, best_p)

# Plot the information criteria
fig, ax = plt.subplots(figsize=(8, 4))
ps = [r[0] for r in results]
aics = [r[1] for r in results]
bics = [r[2] for r in results]

ax.plot(ps, aics, 'bo-', label='AIC', linewidth=1.5, markersize=6)
ax.plot(ps, bics, 'rs-', label='BIC', linewidth=1.5, markersize=6)
ax.axvline(best_p, color='gray', linestyle='--', alpha=0.7, label=f'Best p={best_p}')
ax.set_xlabel('AR Order (p)')
ax.set_ylabel('Information Criterion')
ax.set_title('Model Order Selection')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('ts_model_selection.pdf', 'AIC and BIC criteria for AR model order selection.')
\end{pycode}

\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Model Comparison Results}')
print(r'\begin{tabular}{cccc}')
print(r'\toprule')
print(r'AR Order & AIC & BIC & Residual Std.\ Dev. \\')
print(r'\midrule')
for r in results:
    print(f'{r[0]} & {r[1]:.2f} & {r[2]:.2f} & {r[3]:.3f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

The optimal model by AIC is AR(\py{best_p}) with AIC = \py{f"{aic_best:.2f}"}.
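A quick way to validate the hand-rolled estimator is to compare it with a library fit. The sketch below again assumes \texttt{statsmodels} is installed and is skipped otherwise; its \texttt{AutoReg} class also fits a conditional least-squares AR model, so the coefficients should agree closely.

\begin{pycode}
# Optional coefficient cross-check against statsmodels' AutoReg.
try:
    from statsmodels.tsa.ar_model import AutoReg
    ar_fit = AutoReg(y, lags=best_p, trend='c').fit()
    # ar_fit.params is ordered (intercept, phi_1, ..., phi_p),
    # matching the layout of coeffs_best
    coef_gap = np.max(np.abs(ar_fit.params - coeffs_best))
    print(f'Maximum coefficient discrepancy versus AutoReg: {coef_gap:.4f}.')
except ImportError:
    print('statsmodels is not installed; cross-check skipped.')
\end{pycode}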
\section{Residual Diagnostics}
\begin{pycode}
# Residual analysis
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Time plot of residuals
axes[0, 0].plot(resid_best, 'b-', linewidth=0.8)
axes[0, 0].axhline(0, color='r', linestyle='--')
axes[0, 0].set_xlabel('Time')
axes[0, 0].set_ylabel('Residual')
axes[0, 0].set_title('Residuals Over Time')
axes[0, 0].grid(True, alpha=0.3)

# Histogram of residuals with a fitted normal density
axes[0, 1].hist(resid_best, bins=20, density=True, alpha=0.7, color='steelblue', edgecolor='black')
x_norm = np.linspace(min(resid_best), max(resid_best), 100)
axes[0, 1].plot(x_norm, stats.norm.pdf(x_norm, np.mean(resid_best), np.std(resid_best)), 'r-', linewidth=2)
axes[0, 1].set_xlabel('Residual')
axes[0, 1].set_ylabel('Density')
axes[0, 1].set_title('Residual Distribution')

# Q-Q plot
stats.probplot(resid_best, dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Normal Q-Q Plot')
axes[1, 0].grid(True, alpha=0.3)

# ACF of residuals
acf_resid = acf(resid_best, 20)
axes[1, 1].bar(range(21), acf_resid, width=0.3, color='darkgreen', alpha=0.8)
axes[1, 1].axhline(1.96/np.sqrt(len(resid_best)), color='r', linestyle='--')
axes[1, 1].axhline(-1.96/np.sqrt(len(resid_best)), color='r', linestyle='--')
axes[1, 1].set_xlabel('Lag')
axes[1, 1].set_ylabel('ACF')
axes[1, 1].set_title('ACF of Residuals')

plt.tight_layout()
save_plot('ts_diagnostics.pdf', 'Residual diagnostic plots for model validation.')

# Shapiro-Wilk normality test (scipy supports up to 5000 observations,
# well above the sample size here)
_, shapiro_p = stats.shapiro(resid_best)

# Ljung-Box test for residual autocorrelation at m lags,
# with degrees of freedom adjusted for the p fitted AR parameters
m = 10
n_r = len(resid_best)
rho = acf(resid_best, m)[1:]
lb_stat = n_r * (n_r + 2) * np.sum(rho**2 / (n_r - np.arange(1, m + 1)))
lb_p = stats.chi2.sf(lb_stat, df=m - best_p)
\end{pycode}

The Shapiro-Wilk test gives $p = \py{f"{shapiro_p:.3f}"}$ for residual normality, and the Ljung-Box statistic at 10 lags is $Q = \py{f"{lb_stat:.2f}"}$ ($p = \py{f"{lb_p:.3f}"}$) for residual autocorrelation.

\section{Forecasting}
\begin{pycode}
# Iterated h-step-ahead forecasts from the fitted AR model
def forecast_ar(y, coeffs, h):
    """Generate h-step-ahead forecasts by iterating the AR recursion."""
    p = len(coeffs) - 1
    forecasts = np.zeros(h)
    history = list(y[-p:])

    for i in range(h):
        pred = coeffs[0] + sum(coeffs[j+1] * history[-j-1] for j in range(p))
        forecasts[i] = pred
        history.append(pred)

    return forecasts

h = 24  # forecast horizon
forecasts = forecast_ar(y, coeffs_best, h)

# Approximate prediction intervals based on the residual standard deviation;
# the sqrt(horizon) growth is a random-walk simplification (see below)
sigma = np.std(resid_best)
intervals = []
for i in range(1, h+1):
    se = sigma * np.sqrt(i)
    lower = forecasts[i-1] - 1.96 * se
    upper = forecasts[i-1] + 1.96 * se
    intervals.append((lower, upper))

# Plot the forecasts
fig, ax = plt.subplots(figsize=(10, 5))

# Recent history
ax.plot(t[-50:], y[-50:], 'b-', linewidth=1, label='Observed')

# Fitted values
t_fitted = t[best_p:]
ax.plot(t_fitted[-50:], fitted_best[-50:], 'g--', linewidth=1, alpha=0.7, label='Fitted')

# Forecasts
t_forecast = np.arange(n, n + h)
ax.plot(t_forecast, forecasts, 'r-', linewidth=2, label='Forecast')

# Prediction intervals (escape the percent sign for the usetex legend)
lower = [iv[0] for iv in intervals]
upper = [iv[1] for iv in intervals]
ax.fill_between(t_forecast, lower, upper, alpha=0.3, color='red', label='95\\% PI')

ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.set_title(f'Time Series Forecast (AR({best_p}))')
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)
save_plot('ts_forecast.pdf', 'Out-of-sample forecasts with 95\\% prediction intervals.')
\end{pycode}
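The $\sigma\sqrt{h}$ interval width above is a deliberate simplification: it is exact for a random walk and only approximate for a stationary AR process. Under the fitted model, the exact $h$-step forecast-error variance follows from the MA($\infty$) representation $y_t = \mu + \sum_{j \ge 0} \psi_j \epsilon_{t-j}$:
\begin{equation}
\operatorname{Var}\!\left(y_{t+h} - \hat{y}_{t+h|t}\right) = \sigma^2 \sum_{j=0}^{h-1} \psi_j^2,
\end{equation}
which for an AR(1) with coefficient $\phi$ gives $\psi_j = \phi^j$, so the variance converges to $\sigma^2/(1-\phi^2)$ rather than growing without bound.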
\section{Spectral Analysis}
\begin{pycode}
# Power spectral density via the periodogram. The series is detrended
# first so that low-frequency power from the trend does not swamp the
# seasonal peak.
y_detrended = y - trend_est
freqs = fftfreq(n, 1)
spectrum = np.abs(fft(y_detrended))**2 / n

# Keep only the positive frequencies
pos_mask = freqs > 0
freqs_pos = freqs[pos_mask]
spectrum_pos = spectrum[pos_mask]

# Find the dominant frequency
dominant_idx = np.argmax(spectrum_pos)
dominant_freq = freqs_pos[dominant_idx]
dominant_period = 1 / dominant_freq if dominant_freq > 0 else np.inf

# Periodogram
fig, ax = plt.subplots(figsize=(10, 4))
ax.semilogy(freqs_pos, spectrum_pos, 'b-', linewidth=0.8)
ax.axvline(dominant_freq, color='r', linestyle='--', alpha=0.7,
           label=f'Dominant: T={dominant_period:.1f}')
ax.set_xlabel('Frequency')
ax.set_ylabel('Power Spectral Density')
ax.set_title('Periodogram')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('ts_spectrum.pdf', 'Power spectral density of the detrended series showing the dominant frequency.')
\end{pycode}

The dominant period detected is approximately \py{f"{dominant_period:.1f}"} time units, consistent with the seasonal period of \py{seasonal_period} embedded in the data (the frequency grid of a length-\py{n} series cannot resolve the period exactly).

\section{Summary Statistics}
\begin{table}[H]
\centering
\caption{Comprehensive Time Series Analysis Summary}
\begin{tabular}{lr}
\toprule
Statistic & Value \\
\midrule
Sample Size & \py{n} \\
Mean & \py{f"{mean_y:.3f}"} \\
Standard Deviation & \py{f"{std_y:.3f}"} \\
Minimum & \py{f"{min_y:.3f}"} \\
Maximum & \py{f"{max_y:.3f}"} \\
ADF Statistic & \py{f"{adf_stat:.3f}"} \\
Best AR Order & \py{best_p} \\
Residual Std.\ Dev. & \py{f"{np.std(resid_best):.3f}"} \\
Dominant Period & \py{f"{dominant_period:.2f}"} \\
\bottomrule
\end{tabular}
\end{table}

\section{Conclusion}
This analysis demonstrated a complete time series workflow:
\begin{itemize}
\item Additive decomposition into trend, seasonal, and residual components
\item ACF/PACF analysis for identifying temporal dependencies
\item Stationarity testing using the Augmented Dickey-Fuller test
\item AR model fitting with AIC/BIC model selection
\item Residual diagnostics for model validation
\item Forecasting with prediction intervals
\item Spectral analysis for frequency-domain insights
\end{itemize}

The AR(\py{best_p}) model provides reasonable short-horizon forecasts, with the spectral analysis confirming the seasonal pattern at a period of roughly \py{f"{dominant_period:.1f}"}.

\end{document}