\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{Time Series Analysis and Forecasting}
\author{Computational Data Science}
\date{\today}
\begin{document}
\maketitle

\begin{abstract}
This document presents a comprehensive analysis of time series data, including decomposition into trend, seasonal, and residual components, autocorrelation analysis, autoregressive (AR) modeling within the ARIMA framework, and forecasting with prediction intervals. The analysis also demonstrates stationarity testing and information-criterion model selection.
\end{abstract}
\section{Introduction}
Time series analysis is fundamental to understanding temporal patterns in data. A time series $\{y_t\}$ can be decomposed as:
\begin{equation}
y_t = T_t + S_t + R_t
\end{equation}
where $T_t$ is the trend component, $S_t$ is the seasonal component, and $R_t$ is the residual.

The ARIMA$(p,d,q)$ model combines autoregression, differencing, and moving average:
\begin{equation}
\phi(B)(1-B)^d y_t = \theta(B)\epsilon_t
\end{equation}
where $B$ is the backshift operator, $\phi(B)$ is the AR polynomial of degree $p$, $\theta(B)$ is the MA polynomial of degree $q$, and $\epsilon_t$ is white noise.
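For concreteness, an ARIMA$(1,1,1)$ model has $\phi(B) = 1 - \phi_1 B$ and $\theta(B) = 1 + \theta_1 B$, so the compact form above expands to the difference equation
\begin{equation}
y_t = (1 + \phi_1)\, y_{t-1} - \phi_1\, y_{t-2} + \epsilon_t + \theta_1 \epsilon_{t-1},
\end{equation}
that is, a recursion in past observations and past shocks. The pure AR$(p)$ models fitted later in this document correspond to the special case ARIMA$(p,0,0)$.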
\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.fft import fft, fftfreq
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    """Save the current figure and emit a LaTeX figure environment on
    stdout, which pythontex typesets in place of this code block."""
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.85\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}
\section{Data Generation and Overview}
\begin{pycode}
# Generate synthetic time series with trend, seasonality, and noise
n = 200
t = np.arange(n)

# Components
trend = 0.05 * t + 0.0002 * t**2
seasonal_period = 12
seasonality = 3 * np.sin(2 * np.pi * t / seasonal_period)
noise = np.random.normal(0, 1.5, n)

# Combined series
y = trend + seasonality + noise

# Basic statistics
mean_y = np.mean(y)
std_y = np.std(y)
min_y = np.min(y)
max_y = np.max(y)

# Plot raw series
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(t, y, 'b-', linewidth=0.8, alpha=0.8)
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.set_title('Raw Time Series Data')
ax.grid(True, alpha=0.3)
save_plot('ts_raw_series.pdf', 'Original time series showing trend and seasonal patterns.')
\end{pycode}

The generated time series has $n = \py{n}$ observations with mean $\mu = \py{f"{mean_y:.2f}"}$ and standard deviation $\sigma = \py{f"{std_y:.2f}"}$.
\section{Time Series Decomposition}
\begin{pycode}
# Classical decomposition using a moving average.
# Note: mode='same' zero-pads at the boundaries, so the trend estimate
# is biased in the first and last window/2 observations.
def moving_average(x, window):
    weights = np.ones(window) / window
    return np.convolve(x, weights, mode='same')

# Extract trend
window = seasonal_period
trend_est = moving_average(y, window)

# Detrend to get seasonal + residual
detrended = y - trend_est

# Estimate the seasonal pattern: average the detrended values at each
# seasonal position, then center so the seasonal component sums to zero
seasonal_est = np.zeros(n)
for i in range(seasonal_period):
    indices = np.arange(i, n, seasonal_period)
    seasonal_est[indices] = np.mean(detrended[indices])
seasonal_est -= np.mean(seasonal_est)

# Residual
residual = y - trend_est - seasonal_est

# Plot decomposition
fig, axes = plt.subplots(4, 1, figsize=(10, 10), sharex=True)

axes[0].plot(t, y, 'b-', linewidth=0.8)
axes[0].set_ylabel('Observed')
axes[0].set_title('Time Series Decomposition')
axes[0].grid(True, alpha=0.3)

axes[1].plot(t, trend_est, 'r-', linewidth=1.5)
axes[1].set_ylabel('Trend')
axes[1].grid(True, alpha=0.3)

axes[2].plot(t, seasonal_est, 'g-', linewidth=1)
axes[2].set_ylabel('Seasonal')
axes[2].grid(True, alpha=0.3)

axes[3].plot(t, residual, 'purple', linewidth=0.8)
axes[3].set_ylabel('Residual')
axes[3].set_xlabel('Time')
axes[3].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('ts_decomposition.pdf', 'Classical additive decomposition of the time series.')
\end{pycode}
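Because the series is synthetic, the decomposition can be checked against the known components. The short block below is an illustrative check added here (\texttt{interior}, \texttt{corr\_trend}, and \texttt{corr\_seasonal} are new names; it reuses the \texttt{trend}, \texttt{seasonality}, \texttt{trend\_est}, and \texttt{seasonal\_est} arrays defined above), restricted to the interior of the sample where the moving average is free of edge bias.

\begin{pycode}
# Illustrative sanity check: correlate estimated components with the
# true ones, away from the moving-average edge region.
interior = slice(seasonal_period, n - seasonal_period)
corr_trend = np.corrcoef(trend_est[interior], trend[interior])[0, 1]
corr_seasonal = np.corrcoef(seasonal_est[interior], seasonality[interior])[0, 1]
\end{pycode}

On the interior of the sample, the correlations between the estimated and true trend and seasonal components are $r = \py{f"{corr_trend:.3f}"}$ and $r = \py{f"{corr_seasonal:.3f}"}$ respectively.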
\section{Autocorrelation Analysis}
The autocorrelation function (ACF) measures correlation at different lags:
\begin{equation}
\rho_k = \frac{\sum_{t=k+1}^{n}(y_t - \bar{y})(y_{t-k} - \bar{y})}{\sum_{t=1}^{n}(y_t - \bar{y})^2}
\end{equation}

The partial autocorrelation function (PACF) measures the correlation at lag $k$ after controlling for the intermediate lags.

\begin{pycode}
def acf(x, nlags):
    """Compute the sample autocorrelation function."""
    n = len(x)
    x = x - np.mean(x)
    result = np.correlate(x, x, mode='full')
    result = result[n-1:] / result[n-1]
    return result[:nlags+1]

def pacf(x, nlags):
    """Compute partial autocorrelations via the Durbin-Levinson recursion."""
    r = acf(x, nlags)
    pacf_vals = np.zeros(nlags + 1)
    pacf_vals[0] = 1.0
    pacf_vals[1] = r[1]

    phi = np.zeros((nlags + 1, nlags + 1))
    phi[1, 1] = r[1]

    for k in range(2, nlags + 1):
        num = r[k] - sum(phi[k-1, j] * r[k-j] for j in range(1, k))
        den = 1 - sum(phi[k-1, j] * r[j] for j in range(1, k))
        phi[k, k] = num / den if den != 0 else 0
        for j in range(1, k):
            phi[k, j] = phi[k-1, j] - phi[k, k] * phi[k-1, k-j]
        pacf_vals[k] = phi[k, k]

    return pacf_vals

nlags = 40
acf_vals = acf(y, nlags)
pacf_vals = pacf(y, nlags)

# Confidence bounds (95%)
conf_bound = 1.96 / np.sqrt(n)

fig, axes = plt.subplots(2, 1, figsize=(10, 6))

# ACF plot
axes[0].bar(range(nlags+1), acf_vals, width=0.3, color='steelblue', alpha=0.8)
axes[0].axhline(conf_bound, color='r', linestyle='--', linewidth=1)
axes[0].axhline(-conf_bound, color='r', linestyle='--', linewidth=1)
axes[0].axhline(0, color='k', linewidth=0.5)
axes[0].set_xlabel('Lag')
axes[0].set_ylabel('ACF')
axes[0].set_title('Autocorrelation Function')
axes[0].set_xlim(-0.5, nlags+0.5)

# PACF plot
axes[1].bar(range(nlags+1), pacf_vals, width=0.3, color='darkgreen', alpha=0.8)
axes[1].axhline(conf_bound, color='r', linestyle='--', linewidth=1)
axes[1].axhline(-conf_bound, color='r', linestyle='--', linewidth=1)
axes[1].axhline(0, color='k', linewidth=0.5)
axes[1].set_xlabel('Lag')
axes[1].set_ylabel('PACF')
axes[1].set_title('Partial Autocorrelation Function')
axes[1].set_xlim(-0.5, nlags+0.5)

plt.tight_layout()
save_plot('ts_acf_pacf.pdf', 'ACF and PACF plots with 95\\% confidence bounds.')
\end{pycode}
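As a quick illustration that these estimators behave as theory predicts, the block below (an added sketch; \texttt{phi\_true}, \texttt{ar1}, \texttt{eps}, \texttt{acf\_ar1}, and \texttt{pacf\_ar1} are new names) simulates an AR(1) process, whose theoretical ACF is $\phi^k$ and whose PACF cuts off after lag 1.

\begin{pycode}
# Check the estimators on a process with known answers: for an AR(1)
# with coefficient phi, the ACF is phi**k and the PACF is zero past lag 1.
phi_true = 0.7
ar1 = np.zeros(1000)
eps = np.random.normal(0, 1, 1000)
for i in range(1, 1000):
    ar1[i] = phi_true * ar1[i-1] + eps[i]
acf_ar1 = acf(ar1, 3)
pacf_ar1 = pacf(ar1, 3)
\end{pycode}

For the simulated AR(1) with $\phi = 0.7$, the estimated ACF at lags 1--3 is
(\py{f"{acf_ar1[1]:.2f}"}, \py{f"{acf_ar1[2]:.2f}"}, \py{f"{acf_ar1[3]:.2f}"})
against theoretical values $(0.70, 0.49, 0.34)$, and the lag-2 PACF of
$\py{f"{pacf_ar1[2]:.2f}"}$ is close to zero, as the theory predicts.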
\section{Stationarity Testing}
A time series is stationary if its statistical properties remain constant over time. The Dickey--Fuller test checks for a unit root; for simplicity, the implementation below omits the augmentation lags of the full ADF test.
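The test regression is
\begin{equation}
\Delta y_t = \alpha + \beta y_{t-1} + \epsilon_t,
\end{equation}
and the test statistic is the $t$-ratio of $\hat{\beta}$. Under the unit-root null $H_0\colon \beta = 0$ this statistic follows the Dickey--Fuller distribution rather than the usual $t$ distribution, hence the special critical values used below.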
\begin{pycode}
# Dickey-Fuller test (no augmentation lags)
def adf_test(y):
    """Dickey-Fuller regression dy_t = alpha + beta*y_{t-1} + e_t;
    returns the t-ratio of beta and approximate critical values."""
    dy = np.diff(y)
    y_lag = y[:-1]

    X = np.column_stack([np.ones(len(dy)), y_lag])
    coeffs = np.linalg.lstsq(X, dy, rcond=None)[0]

    # Test statistic: t-ratio of the coefficient on y_{t-1}
    residuals = dy - X @ coeffs
    se = np.sqrt(np.sum(residuals**2) / (len(dy) - 2))
    se_beta = se / np.sqrt(np.sum((y_lag - np.mean(y_lag))**2))
    adf_stat = coeffs[1] / se_beta

    # Critical values (approximate, constant-only case)
    critical_values = {0.01: -3.43, 0.05: -2.86, 0.10: -2.57}

    return adf_stat, critical_values

adf_stat, critical = adf_test(y)
is_stationary = adf_stat < critical[0.05]

# Test on the differenced series
dy = np.diff(y)
adf_diff, _ = adf_test(dy)
\end{pycode}
\begin{table}[H]
\centering
\caption{Dickey--Fuller Test Results}
\begin{tabular}{lcc}
\toprule
Series & DF Statistic & Stationary (5\%) \\
\midrule
Original & \py{f"{adf_stat:.3f}"} & \py{"Yes" if is_stationary else "No"} \\
First Difference & \py{f"{adf_diff:.3f}"} & \py{"Yes" if adf_diff < critical[0.05] else "No"} \\
\bottomrule
\end{tabular}
\end{table}

Critical values: 1\%: $-3.43$, 5\%: $-2.86$, 10\%: $-2.57$.
\section{ARIMA Model Fitting}
We fit pure autoregressive AR$(p)$ models, i.e.\ ARIMA$(p,0,0)$, by ordinary least squares and select the order $p$ with information criteria.

\begin{pycode}
# AR(p) model fitting by ordinary least squares
def fit_ar(y, p):
    """Fit an AR(p) model with intercept using OLS."""
    n = len(y)
    Y = y[p:]
    # Lag matrix: column i holds y_{t-i-1}
    X = np.column_stack([y[p-i-1:n-i-1] for i in range(p)])
    X = np.column_stack([np.ones(len(Y)), X])

    coeffs = np.linalg.lstsq(X, Y, rcond=None)[0]
    fitted = X @ coeffs
    residuals = Y - fitted

    # Information criteria (Gaussian likelihood, up to constants)
    k = p + 1
    n_eff = len(Y)
    sse = np.sum(residuals**2)
    aic = n_eff * np.log(sse/n_eff) + 2*k
    bic = n_eff * np.log(sse/n_eff) + k*np.log(n_eff)

    return coeffs, residuals, aic, bic, fitted

# Model selection over orders p = 1..6
results = []
for p in range(1, 7):
    coeffs, resid, aic, bic, fitted = fit_ar(y, p)
    results.append((p, aic, bic, np.std(resid)))

best_p = min(results, key=lambda x: x[1])[0]
coeffs_best, resid_best, aic_best, bic_best, fitted_best = fit_ar(y, best_p)

# Plot model selection
fig, ax = plt.subplots(figsize=(8, 4))
ps = [r[0] for r in results]
aics = [r[1] for r in results]
bics = [r[2] for r in results]

ax.plot(ps, aics, 'bo-', label='AIC', linewidth=1.5, markersize=6)
ax.plot(ps, bics, 'rs-', label='BIC', linewidth=1.5, markersize=6)
ax.axvline(best_p, color='gray', linestyle='--', alpha=0.7, label=f'Best p={best_p}')
ax.set_xlabel('AR Order (p)')
ax.set_ylabel('Information Criterion')
ax.set_title('Model Order Selection')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('ts_model_selection.pdf', 'AIC and BIC criteria for AR model order selection.')
\end{pycode}
\begin{pycode}
print(r'\begin{table}[H]')
print(r'\centering')
print(r'\caption{Model Comparison Results}')
print(r'\begin{tabular}{cccc}')
print(r'\toprule')
print(r'AR Order & AIC & BIC & Residual Std \\')
print(r'\midrule')
for r in results:
    print(f'{r[0]} & {r[1]:.2f} & {r[2]:.2f} & {r[3]:.3f} \\\\')
print(r'\bottomrule')
print(r'\end{tabular}')
print(r'\end{table}')
\end{pycode}

The optimal model by AIC is AR(\py{best_p}), with AIC = \py{f"{aic_best:.2f}"}; the criteria used above are defined below.
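For a Gaussian model with $k$ estimated parameters and $n$ effective observations, dropping additive constants, the criteria computed in \texttt{fit\_ar} reduce to
\begin{align}
\mathrm{AIC} &= n \ln\!\left(\frac{\mathrm{SSE}}{n}\right) + 2k, \\
\mathrm{BIC} &= n \ln\!\left(\frac{\mathrm{SSE}}{n}\right) + k \ln n.
\end{align}
BIC penalizes additional parameters more heavily whenever $n > e^2 \approx 7.4$, which is why it tends to select more parsimonious models.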
\section{Residual Diagnostics}
\begin{pycode}
# Residual analysis
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Time plot of residuals
axes[0, 0].plot(resid_best, 'b-', linewidth=0.8)
axes[0, 0].axhline(0, color='r', linestyle='--')
axes[0, 0].set_xlabel('Time')
axes[0, 0].set_ylabel('Residual')
axes[0, 0].set_title('Residuals Over Time')
axes[0, 0].grid(True, alpha=0.3)

# Histogram of residuals with fitted normal density
axes[0, 1].hist(resid_best, bins=20, density=True, alpha=0.7, color='steelblue', edgecolor='black')
x_norm = np.linspace(min(resid_best), max(resid_best), 100)
axes[0, 1].plot(x_norm, stats.norm.pdf(x_norm, np.mean(resid_best), np.std(resid_best)), 'r-', linewidth=2)
axes[0, 1].set_xlabel('Residual')
axes[0, 1].set_ylabel('Density')
axes[0, 1].set_title('Residual Distribution')

# Q-Q plot
stats.probplot(resid_best, dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Normal Q-Q Plot')
axes[1, 0].grid(True, alpha=0.3)

# ACF of residuals
acf_resid = acf(resid_best, 20)
axes[1, 1].bar(range(21), acf_resid, width=0.3, color='darkgreen', alpha=0.8)
axes[1, 1].axhline(1.96/np.sqrt(len(resid_best)), color='r', linestyle='--')
axes[1, 1].axhline(-1.96/np.sqrt(len(resid_best)), color='r', linestyle='--')
axes[1, 1].set_xlabel('Lag')
axes[1, 1].set_ylabel('ACF')
axes[1, 1].set_title('ACF of Residuals')

plt.tight_layout()
save_plot('ts_diagnostics.pdf', 'Residual diagnostic plots for model validation.')

# Normality test (Shapiro-Wilk handles up to 5000 observations,
# so the full residual vector can be used)
_, shapiro_p = stats.shapiro(resid_best)

# Ljung-Box statistic at 10 lags
n_r = len(resid_best)
acf_r = acf(resid_best, 10)
lb_stat = n_r * (n_r + 2) * np.sum(acf_r[1:]**2 / (n_r - np.arange(1, 11)))
\end{pycode}

The Shapiro--Wilk test on the residuals yields $p = \py{f"{shapiro_p:.3f}"}$, and the Ljung--Box statistic at 10 lags is $Q = \py{f"{lb_stat:.2f}"}$.
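The Ljung--Box statistic computed above is
\begin{equation}
Q = n(n+2) \sum_{k=1}^{h} \frac{\hat{\rho}_k^2}{n-k},
\end{equation}
with $h = 10$. Under the null of white-noise residuals, $Q$ is approximately $\chi^2$-distributed with $h - p$ degrees of freedom when applied to residuals from an AR$(p)$ fit.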
\section{Forecasting}
\begin{pycode}
# Iterated h-step-ahead forecasts from the fitted AR model
def forecast_ar(y, coeffs, h):
    """Generate h-step-ahead forecasts by iterating the AR recursion."""
    p = len(coeffs) - 1
    forecasts = np.zeros(h)
    history = list(y[-p:])

    for i in range(h):
        pred = coeffs[0] + sum(coeffs[j+1] * history[-j-1] for j in range(p))
        forecasts[i] = pred
        history.append(pred)

    return forecasts

h = 24  # Forecast horizon
forecasts = forecast_ar(y, coeffs_best, h)

# Simple prediction intervals (based on residual std)
sigma = np.std(resid_best)
intervals = []
for i in range(1, h+1):
    se = sigma * np.sqrt(i)  # Simplified random-walk rule; grows with horizon
    lower = forecasts[i-1] - 1.96 * se
    upper = forecasts[i-1] + 1.96 * se
    intervals.append((lower, upper))

# Plot forecasts
fig, ax = plt.subplots(figsize=(10, 5))

# Historical data
ax.plot(t[-50:], y[-50:], 'b-', linewidth=1, label='Observed')

# Fitted values
t_fitted = t[best_p:]
ax.plot(t_fitted[-50:], fitted_best[-50:], 'g--', linewidth=1, alpha=0.7, label='Fitted')

# Forecasts
t_forecast = np.arange(n, n + h)
ax.plot(t_forecast, forecasts, 'r-', linewidth=2, label='Forecast')

# Prediction intervals (escape % for the usetex-rendered legend)
lower = [iv[0] for iv in intervals]
upper = [iv[1] for iv in intervals]
ax.fill_between(t_forecast, lower, upper, alpha=0.3, color='red', label=r'95\% PI')

ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.set_title(f'Time Series Forecast (AR({best_p}))')
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)
save_plot('ts_forecast.pdf', 'Out-of-sample forecasts with 95\\% prediction intervals.')
\end{pycode}
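The $\sqrt{i}$ scaling of the interval widths above is the random-walk rule and is only an approximation here. For a stationary AR model, the exact $h$-step forecast variance follows from the MA($\infty$) representation $y_t = \mu + \sum_{j \ge 0} \psi_j \epsilon_{t-j}$:
\begin{equation}
\operatorname{Var}(e_h) = \sigma^2 \sum_{j=0}^{h-1} \psi_j^2,
\end{equation}
which converges to the process variance as $h \to \infty$ rather than growing without bound. For an AR(1), $\psi_j = \phi_1^j$, giving $\operatorname{Var}(e_h) = \sigma^2 (1 - \phi_1^{2h})/(1 - \phi_1^2)$.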
\section{Spectral Analysis}
\begin{pycode}
# Periodogram of the detrended series. Detrending first matters: the
# trend otherwise concentrates power at the lowest frequencies and can
# swamp the seasonal peak.
yd = y - trend_est
freqs = fftfreq(n, 1)
spectrum = np.abs(fft(yd))**2 / n

# Only positive frequencies
pos_mask = freqs > 0
freqs_pos = freqs[pos_mask]
spectrum_pos = spectrum[pos_mask]

# Find dominant frequency
dominant_idx = np.argmax(spectrum_pos)
dominant_freq = freqs_pos[dominant_idx]
dominant_period = 1 / dominant_freq if dominant_freq > 0 else np.inf

# Periodogram
fig, ax = plt.subplots(figsize=(10, 4))
ax.semilogy(freqs_pos, spectrum_pos, 'b-', linewidth=0.8)
ax.axvline(dominant_freq, color='r', linestyle='--', alpha=0.7,
           label=f'Dominant: T={dominant_period:.1f}')
ax.set_xlabel('Frequency')
ax.set_ylabel('Power Spectral Density')
ax.set_title('Periodogram')
ax.legend()
ax.grid(True, alpha=0.3)
save_plot('ts_spectrum.pdf', 'Power spectral density of the detrended series, showing the dominant frequency.')
\end{pycode}

The dominant period detected is approximately \py{f"{dominant_period:.1f}"} time units, which matches the seasonal period of \py{seasonal_period} embedded in the data.
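As a consistency check on the periodogram normalization (an illustrative addition; \texttt{total\_power}, \texttt{total\_ss}, and \texttt{parseval\_ok} are new names, and \texttt{yd} is the detrended series defined above), Parseval's theorem states that the power summed over all frequencies equals the signal's total sum of squares.

\begin{pycode}
# Parseval check: sum_k |X_k|^2 / n  ==  sum_t x_t^2
total_power = np.sum(np.abs(fft(yd))**2) / n
total_ss = np.sum(yd**2)
parseval_ok = np.allclose(total_power, total_ss)
\end{pycode}

Here the identity holds to floating-point precision (\py{"verified" if parseval_ok else "not verified"}: $\sum_k |X_k|^2 / n = \py{f"{total_power:.2f}"}$ versus $\sum_t x_t^2 = \py{f"{total_ss:.2f}"}$).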
\section{Summary Statistics}
\begin{table}[H]
\centering
\caption{Comprehensive Time Series Analysis Summary}
\begin{tabular}{lr}
\toprule
Statistic & Value \\
\midrule
Sample Size & \py{n} \\
Mean & \py{f"{mean_y:.3f}"} \\
Standard Deviation & \py{f"{std_y:.3f}"} \\
Minimum & \py{f"{min_y:.3f}"} \\
Maximum & \py{f"{max_y:.3f}"} \\
Dickey--Fuller Statistic & \py{f"{adf_stat:.3f}"} \\
Best AR Order & \py{best_p} \\
Residual Std & \py{f"{np.std(resid_best):.3f}"} \\
Dominant Period & \py{f"{dominant_period:.2f}"} \\
\bottomrule
\end{tabular}
\end{table}
\section{Conclusion}
This analysis demonstrated a complete time series workflow:
\begin{itemize}
\item Additive decomposition into trend, seasonal, and residual components
\item ACF/PACF analysis for identifying temporal dependencies
\item Stationarity testing with the Dickey--Fuller test
\item AR model fitting with AIC/BIC model selection
\item Residual diagnostics for model validation
\item Forecasting with prediction intervals
\item Spectral analysis for frequency-domain insights
\end{itemize}

The AR(\py{best_p}) model provides reasonable forecasts, and the spectral analysis confirms the seasonal pattern at a period of roughly \py{f"{dominant_period:.1f}"}.

\end{document}