library(feasts)
library(fabletools)
library(tsibble)
library(dplyr)
library(plotly)
Correlation Analysis
In time series analysis, correlation between a series and its previous lags is pivotal in explaining the time series variation and revealing its underlying patterns. A common example of high correlation between observations is the hourly temperature, which is highly correlated with its previous observations, as the hourly fluctuations are gradual.
There are many tools to explore time series correlation, and in this section, we will review the main method - the Auto Correlation Function (ACF).
Auto Correlation Function
In most cases, the lags of the series contain valuable information about the current variation of the series. The ACF measures the correlation between a series and its previous lags. It enables us to understand what lags have predictive power. Both the stats and feasts libraries provide an implementation of the ACF method, with the acf and ACF functions, respectively. Let’s load the required libraries and load ourtime series and explore its correlation:
load(file = "./data/ts.RData")
Correlation Analysis for Yearly Time Series
plot_ly(data = ts1, x = ~index, y = ~y, type = "scatter", mode = "lines")
Let’s drop the first year:
<- ts1 |> dplyr::filter(index > 1986)
ts1
plot_ly(data = ts1, x = ~index, y = ~y, type = "scatter", mode = "lines")
We will use the ACF
function to calculate the series autocorrelation:
<- ts1 %>% ACF(y)
acf1
head(acf1)
# A tsibble: 6 x 2 [1Y]
lag acf
<cf_lag> <dbl>
1 1Y 0.916
2 2Y 0.836
3 3Y 0.760
4 4Y 0.687
5 5Y 0.617
6 6Y 0.546
Let’s calculate the 95% confidence interval for the autocorrelation calculation:
<- 0.05
alpha <- qnorm(1 - alpha/2)/sqrt(nrow(ts1))
pi_upper pi_upper
[1] 0.3222161
And let’s plot it:
|> plot_ly(x = ~ lag, y = ~ acf, type = "bar", showlegend = FALSE) |>
acf1 add_segments(x = ~ min(lag), xend = ~ max(lag), y = pi_upper, yend = pi_upper, line = list(color = "black", dash = "dash"), name = "Upper") |>
add_segments(x = ~ min(lag), xend = ~ max(lag), y = - pi_upper, yend = - pi_upper, line = list(color = "black", dash = "dash"), name = "Lower") |>
layout(title = "ACF Plot", xaxis = list(title = "Lags"), yaxis = list(title = "ACF"))
What can we conclude from the ACF plot? The series has a high correlation with its recent lags (i.e., 0.91 correlation with its first lag). This is a typical behavior for time series without a seasonal component and with a clear trend. Regression the series with its first lag might yield a great predictive power, which is nothing but ARIMA model with AR process of order 1.
Correlation Analysis for Monthly Time Series
Le’ts repeat the process for the monthly demand for natural gas in the US:
plot_ly(data = ts2, x = ~date, y = ~y, type = "scatter", mode = "lines")
Next, we will calculate the ACF for the series
<- ts2 |> ACF(y, lag_max = 60)
acf2
<- qnorm(1 - alpha/2)/sqrt(nrow(ts2))
pi_upper pi_upper
[1] 0.1146982
And plot the ACF for the series:
|> plot_ly(x = ~ lag, y = ~ acf, type = "bar", showlegend = FALSE) |>
acf2 add_segments(x = ~ min(lag), xend = ~ max(lag), y = pi_upper, yend = pi_upper, line = list(color = "black", dash = "dash"), name = "Upper") |>
add_segments(x = ~ min(lag), xend = ~ max(lag), y = - pi_upper, yend = - pi_upper, line = list(color = "black", dash = "dash"), name = "Lower") |>
layout(title = "ACF Plot", xaxis = list(title = "Lags"), yaxis = list(title = "ACF"))
We can learn from the above output that the series has strong seasonal patterns and the first seasonal lag is the most significant. Potential regression to consider is - regression the series with lags 1, 12, and 24, which is seasonal ARIMA model with AR order 1 and seasonal AR order of 1 or 2 (depends if using lags 12 and 24).
A more elegant way to plot the ACF output would be to highlight the seasonal lags. Let’s functionalize the process and replot it:
<- function(ts, var, lag_max, frequency){
plot_acf <- ts |> feasts::ACF(!!rlang::sym(var), lag_max = lag_max)
a <- "#0072B5"
color <- qnorm(1 - alpha/2)/sqrt(nrow(ts))
pi_upper
pi_upper<- plotly::plot_ly(type = "bar")
p
if(!is.null(frequency)){
<- seq(from = frequency, by = frequency, to = nrow(a))
s $seasonal <- NA
a$non_seasonal <- a$acf
a$non_seasonal[s] <- NA
a$seasonal[s] <- a$acf[s]
a
<- p |> plotly::add_trace(x = a$lag, y = a$non_seasonal, name = "Non-seasonal", marker = list(color = color,
p line = list(color = "rgb(8,48,107)",
width = 1.5))) |>
::add_trace(x = a$lag, y = a$seasonal, name = "Seasonal", marker = list(color = "red", line = list(color = "rgb(8,48,107)",
plotlywidth = 1.5)))
else {
}
<- p |> plotly::add_trace(x = a$lag, y = a$acf, name = "Lags", marker = list(color = color,
p line = list(color = "rgb(8,48,107)",
width = 1.5)))
}
<- p |> plotly::layout("ACF Plot", yaxis = list(title = "ACF"), xaxis = list(title = "Lags")) |>
p ::add_segments(x = ~ min(a$lag), xend = ~ max(a$lag), y = pi_upper, yend = pi_upper, line = list(color = "black", dash = "dash"), name = "95% CI", showlegend = TRUE, legendgroup = "ci") |>
plotly::add_segments(x = ~ min(a$lag), xend = ~ max(a$lag), y = - pi_upper, yend = - pi_upper, line = list(color = "black", dash = "dash"), name = "95% CI", showlegend = FALSE, legendgroup = "ci")
plotly
return(p)
}
plot_acf(ts = ts2, var = "y", lag_max = 60, frequency = 12)
Lags Plots
Another common way to visualize correlation of the series with its lags is using lags plots. The following code generates a plot for each lag:
<- function(ts, var, lag){
plot_lag
<- ts |>
d ::mutate(lag = dplyr::lag(x= !!rlang::sym(var), n = lag))
dplyr
# Create the regression formula
<- as.formula(paste(var, "~ lag" ))
formula
# Fit the linear model
<- lm(formula, data = d)
model
# Extract model coefficients
<- coef(model)[1]
intercept <- coef(model)[2]
slope
# Format regression formula text
<- paste0("y = ", round(intercept, 2),
reg_formula ifelse(slope < 0, " - ", " + "),
abs(round(slope, 2)), paste("*lag", lag, sep = ""))
# Get adjusted R-squared
<- summary(model)$adj.r.squared
adj_r2 <- paste0("Adjusted R² = ", round(adj_r2, 3))
adj_r2_label
# Add predicted values to data
$predicted <- predict(model, newdata = d)
d
# Create plot
<- plot_ly(d, x = ~ lag, y = ~get(var), type = 'scatter', mode = 'markers',
p name = 'Actual') %>%
add_lines(x = ~ lag, y = ~predicted, name = 'Regression Fitted Line',
line = list(color = "red", dash = "dash")) %>%
layout(title = paste(var, "vs Lag", lag, sep = " "),
xaxis = list(title = paste("Lag", lag, sep = " ")),
yaxis = list(title = var),
annotations = list(
list(x = 0.05, y = 0.95, xref = "paper", yref = "paper",
text = reg_formula,
showarrow = FALSE,
font = list(size = 12)),
list(x = 0.05, y = 0.88, xref = "paper", yref = "paper",
text = adj_r2_label,
showarrow = FALSE,
font = list(size = 12))
))
return(p)
}
Let’s plot the relationship between the total number of consumers of natural gas in the US with its first and 10th lags:
plot_lag(ts = ts1, var = "y", lag = 1)
plot_lag(ts = ts1, var = "y", lag = 10)
Likewise, we will plot the relationship between the demand for natural gas in the US with its first and seasonal lag (12):
plot_lag(ts = ts2, var = "y", lag = 1)
plot_lag(ts = ts2, var = "y", lag = 12)