library(feasts)
library(fabletools)
library(tsibble)
library(dplyr)
library(plotly)
source("./functions.R")Modeling Seasonality
Next, let’s load the datasets:
load(file = "./data/ts.RData")p <- plot_ly() |>
add_lines(x = ts2$index, y = ts2$y, type = "scatter", mode = "lines", name = "Actual") |>
plotly::layout(
title = "US Monthly Demand for Natural Gas",
yaxis = list(title = "MMCF"),
xaxis = list(title = "Month")
)
pLet’s decompose the series:
stl_d <- ts2 |>
model(STL(y)) |>
components()
plot_decomposition(stl_d, var = "y", outliers = TRUE)Likewise, let’s review the ACF plot:
plot_acf(ts = ts2, var = "y", lag_max = 72, frequency = 12, alpha = 0.05)Add Trend Feature
We will build the features step-by-step to review the impact of each feature on the goodness of fit. Let’s start by defining the trend feature:
ts2$trend <- 1:nrow(ts2)
ts2# A tsibble: 297 x 5 [1M]
index date y series_id trend
<mth> <date> <int> <chr> <int>
1 2001 Jan 2001-01-01 2505011 NUS 1
2 2001 Feb 2001-02-01 2156873 NUS 2
3 2001 Mar 2001-03-01 2086568 NUS 3
4 2001 Apr 2001-04-01 1663832 NUS 4
5 2001 May 2001-05-01 1385163 NUS 5
6 2001 Jun 2001-06-01 1313119 NUS 6
7 2001 Jul 2001-07-01 1459919 NUS 7
8 2001 Aug 2001-08-01 1528483 NUS 8
9 2001 Sep 2001-09-01 1360871 NUS 9
10 2001 Oct 2001-10-01 1507428 NUS 10
# ℹ 287 more rows
md1 <- lm(y ~ trend, data = ts2)
summary(md1)
Call:
lm(formula = y ~ trend, data = ts2)
Residuals:
Min 1Q Median 3Q Max
-520269 -309692 -136915 307176 1154242
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1562817.0 45429.6 34.40 <2e-16 ***
trend 3075.3 264.3 11.64 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 390500 on 295 degrees of freedom
Multiple R-squared: 0.3146, Adjusted R-squared: 0.3123
F-statistic: 135.4 on 1 and 295 DF, p-value: < 2.2e-16
We will fit the model on the series to see the trend fit:
fit1 <- predict(object = md1, newdata = ts2, interval = "confidence", level = 0.95)
ts2$fit1 <- fit1[, 1]
p1 <- p |>
add_lines(x = ts2$index, y = ts2$fit1, mode = "lines", line = list(color = "black", dash = "dash"), name = "Fitted")
p1Let’s plot the model residuals:
plot_residuals(
data = ts2,
index_col = "date",
actual_col = "y",
fitted_col = "fit1",
lag_max = 60,
alpha = 0.05,
frequency = 12
)If you check the model residuals, you will noticed that the residuals are detrained version of the original series.
Add Seasonal Feature
We will now add the seasonal feature by creating a categorical variable for each month of the year:
ts2$month <- lubridate::month(ts2$index, label = TRUE)
ts2# A tsibble: 297 x 7 [1M]
index date y series_id trend fit1 month
<mth> <date> <int> <chr> <int> <dbl> <ord>
1 2001 Jan 2001-01-01 2505011 NUS 1 1565892. Jan
2 2001 Feb 2001-02-01 2156873 NUS 2 1568968. Feb
3 2001 Mar 2001-03-01 2086568 NUS 3 1572043. Mar
4 2001 Apr 2001-04-01 1663832 NUS 4 1575118. Apr
5 2001 May 2001-05-01 1385163 NUS 5 1578193. May
6 2001 Jun 2001-06-01 1313119 NUS 6 1581269. Jun
7 2001 Jul 2001-07-01 1459919 NUS 7 1584344. Jul
8 2001 Aug 2001-08-01 1528483 NUS 8 1587419. Aug
9 2001 Sep 2001-09-01 1360871 NUS 9 1590494. Sep
10 2001 Oct 2001-10-01 1507428 NUS 10 1593570. Oct
# ℹ 287 more rows
md2 <- lm(y ~ trend + month, data = ts2)
summary(md2)
Call:
lm(formula = y ~ trend + month, data = ts2)
Residuals:
Min 1Q Median 3Q Max
-501823 -81011 -6610 84281 374810
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1552208.70 14673.47 105.783 < 2e-16 ***
trend 3150.23 85.36 36.904 < 2e-16 ***
month.L -461232.44 25449.59 -18.123 < 2e-16 ***
month.Q 1110161.74 25378.26 43.745 < 2e-16 ***
month.C 23044.40 25372.91 0.908 0.364528
month^4 204173.98 25406.72 8.036 2.50e-14 ***
month^5 279945.41 25374.21 11.033 < 2e-16 ***
month^6 -88544.66 25356.78 -3.492 0.000556 ***
month^7 -210410.14 25381.66 -8.290 4.57e-15 ***
month^8 81077.10 25361.27 3.197 0.001546 **
month^9 41945.12 25290.94 1.659 0.098320 .
month^10 69794.47 25236.81 2.766 0.006054 **
month^11 -18612.30 25218.62 -0.738 0.461101
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 126100 on 284 degrees of freedom
Multiple R-squared: 0.9312, Adjusted R-squared: 0.9283
F-statistic: 320.4 on 12 and 284 DF, p-value: < 2.2e-16
As you can noticed from the model summary, adding the seasonal feature improved the model goodness of fit. Let’s fit the model and evaluate the model’s residuals:
fit2 <- predict(object = md2, newdata = ts2, interval = "confidence", level = 0.95)
ts2$fit2 <- fit2[, 1]Let’s plot the residuals:
plot_residuals(
data = ts2,
index_col = "date",
actual_col = "y",
fitted_col = "fit2",
lag_max = 60,
alpha = 0.05,
frequency = 12
)Let’s now create a 5 year (or 60 months) forecast by setting the future data frame:
h <- 5 * 12As before, we will use the new_data function from the tsibble library to create a continues data frame with the series future index:
future_data <- new_data(ts2, n = h)
future_data |> head()# A tsibble: 6 x 1 [1M]
index
<mth>
1 2025 Oct
2 2025 Nov
3 2025 Dec
4 2026 Jan
5 2026 Feb
6 2026 Mar
Next, let’s start to add the feature we created so far - trend and seasonal features:
# Adding the trend feature:
tail(ts2)# A tsibble: 6 x 8 [1M]
index date y series_id trend fit1 month fit2
<mth> <date> <int> <chr> <int> <dbl> <ord> <dbl>
1 2025 Apr 2025-04-01 2123687 NUS 292 2460795. Apr 2273034.
2 2025 May 2025-05-01 2050025 NUS 293 2463871. May 2110662.
3 2025 Jun 2025-06-01 2158141 NUS 294 2466946. Jun 2125069.
4 2025 Jul 2025-07-01 2441338 NUS 295 2470021. Jul 2318886.
5 2025 Aug 2025-08-01 2367094 NUS 296 2473097. Aug 2327483.
6 2025 Sep 2025-09-01 2162772 NUS 297 2476172. Sep 2118672.
trend_start <- max(ts2$trend) + 1
future_data$trend <- trend_start:(trend_start + h - 1)
# Adding the seasonal feature:
future_data$month <- lubridate::month(future_data$index, label = TRUE)
future_data |> head()# A tsibble: 6 x 3 [1M]
index trend month
<mth> <int> <ord>
1 2025 Oct 298 Oct
2 2025 Nov 299 Nov
3 2025 Dec 300 Dec
4 2026 Jan 301 Jan
5 2026 Feb 302 Feb
6 2026 Mar 303 Mar
When creating categorical variables in a future data frame, you should ensure that the categorical variables contain the exact same levels as the ones used in the training dataset.
Let’s now create the forecast with the predict function:
Let’s create the forecast and plot it:
fc2 <- lm_forecast(
model = md2,
actual = ts2,
future_data = future_data,
actual_col = "y",
lag_col = NULL,
level = 0.95
)plot_lm_forecast(fc2, actual_col = "y")The is a ton of information that left on the residuals!
We will explore the following features:
- Structural breaks
- Piecewise Linear Trend
- AR
- Outliers
Structural Break Feature
Often, time series would have an abrupt shift in its structure due to some event. A good example, is the Covid-19 pandemic impact on some macro-economic indicators such as the unemployment rate and number of passengers traveling on airplanes. At the end of this change, the series would be in a new level.
We can notice that the natural gas series had a structural break on the trend component around September 2018. To model a structural break, we will create a new variable that is equal to 0 prior to September 2018 and 1 after this date:
ts2$structural_break <- ifelse(ts2$date >= as.Date("2018-09-01"), 1, 0)
ts2 |> head()# A tsibble: 6 x 9 [1M]
index date y series_id trend fit1 month fit2
<mth> <date> <int> <chr> <int> <dbl> <ord> <dbl>
1 2001 Jan 2001-01-01 2505011 NUS 1 1565892. Jan 2323736.
2 2001 Feb 2001-02-01 2156873 NUS 2 1568968. Feb 1996404.
3 2001 Mar 2001-03-01 2086568 NUS 3 1572043. Mar 1794614.
4 2001 Apr 2001-04-01 1663832 NUS 4 1575118. Apr 1365767.
5 2001 May 2001-05-01 1385163 NUS 5 1578193. May 1203395.
6 2001 Jun 2001-06-01 1313119 NUS 6 1581269. Jun 1217802.
# ℹ 1 more variable: structural_break <dbl>
ts2 |> tail()# A tsibble: 6 x 9 [1M]
index date y series_id trend fit1 month fit2
<mth> <date> <int> <chr> <int> <dbl> <ord> <dbl>
1 2025 Apr 2025-04-01 2123687 NUS 292 2460795. Apr 2273034.
2 2025 May 2025-05-01 2050025 NUS 293 2463871. May 2110662.
3 2025 Jun 2025-06-01 2158141 NUS 294 2466946. Jun 2125069.
4 2025 Jul 2025-07-01 2441338 NUS 295 2470021. Jul 2318886.
5 2025 Aug 2025-08-01 2367094 NUS 296 2473097. Aug 2327483.
6 2025 Sep 2025-09-01 2162772 NUS 297 2476172. Sep 2118672.
# ℹ 1 more variable: structural_break <dbl>
Let’s update the regression model with the structural break feature:
md3 <- lm(y ~ trend + month + structural_break, data = ts2)
summary(md3)
Call:
lm(formula = y ~ trend + month + structural_break, data = ts2)
Residuals:
Min 1Q Median 3Q Max
-512833 -69180 -12410 73577 354107
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1606436.4 15770.1 101.866 < 2e-16 ***
trend 2469.5 127.4 19.383 < 2e-16 ***
month.L -461253.9 23624.9 -19.524 < 2e-16 ***
month.Q 1106744.4 23564.0 46.968 < 2e-16 ***
month.C 24589.0 23554.8 1.044 0.297420
month^4 206945.0 23588.6 8.773 < 2e-16 ***
month^5 280572.4 23555.1 11.911 < 2e-16 ***
month^6 -90199.0 23540.0 -3.832 0.000157 ***
month^7 -212111.4 23563.2 -9.002 < 2e-16 ***
month^8 81132.6 23542.9 3.446 0.000655 ***
month^9 43577.7 23478.9 1.856 0.064488 .
month^10 71608.5 23428.9 3.056 0.002454 **
month^11 -17638.4 23411.0 -0.753 0.451821
structural_break 164902.9 24166.1 6.824 5.39e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 117000 on 283 degrees of freedom
Multiple R-squared: 0.9409, Adjusted R-squared: 0.9382
F-statistic: 346.7 on 13 and 283 DF, p-value: < 2.2e-16
As you can noticed from the model summary, adding the seasonal feature improved the model goodness of fit.
fit3 <- predict(object = md3, newdata = ts2, interval = "confidence", level = 0.95)
ts2$fit3 <- fit3[, 1]
p3 <- p |>
add_lines(x = ts2$index, y = ts2$fit3, mode = "lines", line = list(color = "black", dash = "dash"), name = "Fitted")
p3Let’s plot the model residuals:
plot_residuals(
data = ts2,
index_col = "date",
actual_col = "y",
fitted_col = "fit3",
lag_max = 60,
alpha = 0.05,
frequency = 12
)Let’s re-forecast the series:
future_data$structural_break <- 1
fc3 <- lm_forecast(
model = md3,
actual = ts2,
future_data = future_data,
actual_col = "y",
lag_col = NULL,
level = 0.95
)plot_lm_forecast(fc3, actual_col = "y")Handling Outliers
We can notices that the series has some years where the consumption is higher than normal years. We can segment two groups of outliers:
- High: Jan 2025
- Medium: Jan 2014, Jan 2018, Jan 2019, Jan 2022, Jan 2024
Let’s create the features:
ts2$outlier_high <- ifelse(ts2$date == as.Date("2025-01-01"), 1, 0)
outlier_medium_dates <- c(as.Date("2014-01-01"), as.Date("2018-01-01"), as.Date("2019-01-01"), as.Date("2022-01-01"), as.Date("2024-01-01"))
ts2$outlier_medium <- ifelse(ts2$date %in% outlier_medium_dates, 1, 0)
table(ts2$outlier_high)
0 1
296 1
table(ts2$outlier_medium)
0 1
292 5
Let’s train the model with the new features:
md4 <- lm(y ~ trend + month + structural_break + outlier_high + outlier_medium, data = ts2)
summary(md4)
Call:
lm(formula = y ~ trend + month + structural_break + outlier_high +
outlier_medium, data = ts2)
Residuals:
Min 1Q Median 3Q Max
-451827 -67577 -6464 69865 278419
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1610804.6 15071.5 106.877 < 2e-16 ***
trend 2407.5 122.1 19.712 < 2e-16 ***
month.L -430107.4 23311.4 -18.451 < 2e-16 ***
month.Q 1072900.6 23392.2 45.866 < 2e-16 ***
month.C 55283.5 23226.5 2.380 0.017971 *
month^4 182361.9 22992.4 7.931 5.16e-14 ***
month^5 298220.4 22728.9 13.121 < 2e-16 ***
month^6 -101101.7 22560.9 -4.481 1.08e-05 ***
month^7 -206090.1 22516.3 -9.153 < 2e-16 ***
month^8 78132.7 22475.1 3.476 0.000589 ***
month^9 44597.1 22407.6 1.990 0.047530 *
month^10 71152.6 22359.2 3.182 0.001625 **
month^11 -17596.5 22341.9 -0.788 0.431594
structural_break 162435.9 23070.9 7.041 1.47e-11 ***
outlier_high 431715.8 115397.0 3.741 0.000222 ***
outlier_medium 248182.3 56713.6 4.376 1.71e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 111700 on 281 degrees of freedom
Multiple R-squared: 0.9466, Adjusted R-squared: 0.9437
F-statistic: 331.9 on 15 and 281 DF, p-value: < 2.2e-16
Let’s refit the model and plot the residuals:
fit4 <- predict(object = md4, newdata = ts2, interval = "confidence", level = 0.95)
ts2$fit4 <- fit4[, 1]Let’s plot the model residuals:
plot_residuals(
data = ts2,
index_col = "date",
actual_col = "y",
fitted_col = "fit4",
lag_max = 60,
alpha = 0.05,
frequency = 12
)