Modeling Seasonality

Author

Rami Krispin

Published

December 1, 2025

library(feasts)
library(fabletools)
library(tsibble)
library(dplyr)
library(plotly)

source("./functions.R")

Next, let’s load the datasets:

load(file = "./data/ts.RData")

p <- plot_ly() |>
    add_lines(x = ts2$index, y = ts2$y, type = "scatter", mode = "lines", name = "Actual") |>
    plotly::layout(
        title = "US Monthly Demand for Natural Gas",
        yaxis = list(title = "MMCF"),
        xaxis = list(title = "Month")
    )

p

Let’s decompose the series:

stl_d <- ts2 |>
    model(STL(y)) |>
    components()

plot_decomposition(stl_d, var = "y", outliers = TRUE)

Likewise, let’s review the ACF plot:

plot_acf(ts = ts2, var = "y", lag_max = 72, frequency = 12, alpha = 0.05)

Add Trend Feature

We will build the features step-by-step to review the impact of each feature on the goodness of fit. Let’s start by defining the trend feature:

ts2$trend <- 1:nrow(ts2)

ts2

# A tsibble: 297 x 5 [1M]
      index date             y series_id trend
      <mth> <date>       <int> <chr>     <int>
 1 2001 Jan 2001-01-01 2505011 NUS           1
 2 2001 Feb 2001-02-01 2156873 NUS           2
 3 2001 Mar 2001-03-01 2086568 NUS           3
 4 2001 Apr 2001-04-01 1663832 NUS           4
 5 2001 May 2001-05-01 1385163 NUS           5
 6 2001 Jun 2001-06-01 1313119 NUS           6
 7 2001 Jul 2001-07-01 1459919 NUS           7
 8 2001 Aug 2001-08-01 1528483 NUS           8
 9 2001 Sep 2001-09-01 1360871 NUS           9
10 2001 Oct 2001-10-01 1507428 NUS          10
# ℹ 287 more rows

md1 <- lm(y ~ trend, data = ts2)

summary(md1)


Call:
lm(formula = y ~ trend, data = ts2)

Residuals:
    Min      1Q  Median      3Q     Max 
-520269 -309692 -136915  307176 1154242 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1562817.0    45429.6   34.40   <2e-16 ***
trend          3075.3      264.3   11.64   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 390500 on 295 degrees of freedom
Multiple R-squared:  0.3146,    Adjusted R-squared:  0.3123 
F-statistic: 135.4 on 1 and 295 DF,  p-value: < 2.2e-16

We will fit the model on the series to see the trend fit:

fit1 <- predict(object = md1, newdata = ts2, interval = "confidence", level = 0.95)
ts2$fit1 <- fit1[, 1]
p1 <- p |>
    add_lines(x = ts2$index, y = ts2$fit1, mode = "lines", line = list(color = "black", dash = "dash"), name = "Fitted")

p1

Let’s plot the model residuals:

plot_residuals(
    data = ts2,
    index_col = "date",
    actual_col = "y",
    fitted_col = "fit1",
    lag_max = 60,
    alpha = 0.05,
    frequency = 12
)

Detranding the Series

If you check the model residuals, you will noticed that the residuals are detrained version of the original series.

Add Seasonal Feature

We will now add the seasonal feature by creating a categorical variable for each month of the year:

ts2$month <- lubridate::month(ts2$index, label = TRUE)
ts2

# A tsibble: 297 x 7 [1M]
      index date             y series_id trend     fit1 month
      <mth> <date>       <int> <chr>     <int>    <dbl> <ord>
 1 2001 Jan 2001-01-01 2505011 NUS           1 1565892. Jan  
 2 2001 Feb 2001-02-01 2156873 NUS           2 1568968. Feb  
 3 2001 Mar 2001-03-01 2086568 NUS           3 1572043. Mar  
 4 2001 Apr 2001-04-01 1663832 NUS           4 1575118. Apr  
 5 2001 May 2001-05-01 1385163 NUS           5 1578193. May  
 6 2001 Jun 2001-06-01 1313119 NUS           6 1581269. Jun  
 7 2001 Jul 2001-07-01 1459919 NUS           7 1584344. Jul  
 8 2001 Aug 2001-08-01 1528483 NUS           8 1587419. Aug  
 9 2001 Sep 2001-09-01 1360871 NUS           9 1590494. Sep  
10 2001 Oct 2001-10-01 1507428 NUS          10 1593570. Oct  
# ℹ 287 more rows

md2 <- lm(y ~ trend + month, data = ts2)

summary(md2)


Call:
lm(formula = y ~ trend + month, data = ts2)

Residuals:
    Min      1Q  Median      3Q     Max 
-501823  -81011   -6610   84281  374810 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1552208.70   14673.47 105.783  < 2e-16 ***
trend          3150.23      85.36  36.904  < 2e-16 ***
month.L     -461232.44   25449.59 -18.123  < 2e-16 ***
month.Q     1110161.74   25378.26  43.745  < 2e-16 ***
month.C       23044.40   25372.91   0.908 0.364528    
month^4      204173.98   25406.72   8.036 2.50e-14 ***
month^5      279945.41   25374.21  11.033  < 2e-16 ***
month^6      -88544.66   25356.78  -3.492 0.000556 ***
month^7     -210410.14   25381.66  -8.290 4.57e-15 ***
month^8       81077.10   25361.27   3.197 0.001546 ** 
month^9       41945.12   25290.94   1.659 0.098320 .  
month^10      69794.47   25236.81   2.766 0.006054 ** 
month^11     -18612.30   25218.62  -0.738 0.461101    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 126100 on 284 degrees of freedom
Multiple R-squared:  0.9312,    Adjusted R-squared:  0.9283 
F-statistic: 320.4 on 12 and 284 DF,  p-value: < 2.2e-16

As you can noticed from the model summary, adding the seasonal feature improved the model goodness of fit. Let’s fit the model and evaluate the model’s residuals:

fit2 <- predict(object = md2, newdata = ts2, interval = "confidence", level = 0.95)
ts2$fit2 <- fit2[, 1]

Let’s plot the residuals:

plot_residuals(
    data = ts2,
    index_col = "date",
    actual_col = "y",
    fitted_col = "fit2",
    lag_max = 60,
    alpha = 0.05,
    frequency = 12
)

Let’s now create a 5 year (or 60 months) forecast by setting the future data frame:

h <- 5 * 12

As before, we will use the new_data function from the tsibble library to create a continues data frame with the series future index:

future_data <- new_data(ts2, n = h)
future_data |> head()

# A tsibble: 6 x 1 [1M]
     index
     <mth>
1 2025 Oct
2 2025 Nov
3 2025 Dec
4 2026 Jan
5 2026 Feb
6 2026 Mar

Next, let’s start to add the feature we created so far - trend and seasonal features:

# Adding the trend feature:
tail(ts2)

# A tsibble: 6 x 8 [1M]
     index date             y series_id trend     fit1 month     fit2
     <mth> <date>       <int> <chr>     <int>    <dbl> <ord>    <dbl>
1 2025 Apr 2025-04-01 2123687 NUS         292 2460795. Apr   2273034.
2 2025 May 2025-05-01 2050025 NUS         293 2463871. May   2110662.
3 2025 Jun 2025-06-01 2158141 NUS         294 2466946. Jun   2125069.
4 2025 Jul 2025-07-01 2441338 NUS         295 2470021. Jul   2318886.
5 2025 Aug 2025-08-01 2367094 NUS         296 2473097. Aug   2327483.
6 2025 Sep 2025-09-01 2162772 NUS         297 2476172. Sep   2118672.

trend_start <- max(ts2$trend) + 1
future_data$trend <- trend_start:(trend_start + h - 1)

# Adding the seasonal feature:
future_data$month <- lubridate::month(future_data$index, label = TRUE)
future_data |> head()

# A tsibble: 6 x 3 [1M]
     index trend month
     <mth> <int> <ord>
1 2025 Oct   298 Oct  
2 2025 Nov   299 Nov  
3 2025 Dec   300 Dec  
4 2026 Jan   301 Jan  
5 2026 Feb   302 Feb  
6 2026 Mar   303 Mar

Caution

When creating categorical variables in a future data frame, you should ensure that the categorical variables contain the exact same levels as the ones used in the training dataset.

Let’s now create the forecast with the predict function:

Let’s create the forecast and plot it:

fc2 <- lm_forecast(
    model = md2,
    actual = ts2,
    future_data = future_data,
    actual_col = "y",
    lag_col = NULL,
    level = 0.95
)

plot_lm_forecast(fc2, actual_col = "y")

The is a ton of information that left on the residuals!

We will explore the following features:

Structural breaks
Piecewise Linear Trend
AR
Outliers

Structural Break Feature

Often, time series would have an abrupt shift in its structure due to some event. A good example, is the Covid-19 pandemic impact on some macro-economic indicators such as the unemployment rate and number of passengers traveling on airplanes. At the end of this change, the series would be in a new level.

We can notice that the natural gas series had a structural break on the trend component around September 2018. To model a structural break, we will create a new variable that is equal to 0 prior to September 2018 and 1 after this date:

ts2$structural_break <- ifelse(ts2$date >= as.Date("2018-09-01"), 1, 0)

ts2 |> head()

# A tsibble: 6 x 9 [1M]
     index date             y series_id trend     fit1 month     fit2
     <mth> <date>       <int> <chr>     <int>    <dbl> <ord>    <dbl>
1 2001 Jan 2001-01-01 2505011 NUS           1 1565892. Jan   2323736.
2 2001 Feb 2001-02-01 2156873 NUS           2 1568968. Feb   1996404.
3 2001 Mar 2001-03-01 2086568 NUS           3 1572043. Mar   1794614.
4 2001 Apr 2001-04-01 1663832 NUS           4 1575118. Apr   1365767.
5 2001 May 2001-05-01 1385163 NUS           5 1578193. May   1203395.
6 2001 Jun 2001-06-01 1313119 NUS           6 1581269. Jun   1217802.
# ℹ 1 more variable: structural_break <dbl>

ts2 |> tail()

# A tsibble: 6 x 9 [1M]
     index date             y series_id trend     fit1 month     fit2
     <mth> <date>       <int> <chr>     <int>    <dbl> <ord>    <dbl>
1 2025 Apr 2025-04-01 2123687 NUS         292 2460795. Apr   2273034.
2 2025 May 2025-05-01 2050025 NUS         293 2463871. May   2110662.
3 2025 Jun 2025-06-01 2158141 NUS         294 2466946. Jun   2125069.
4 2025 Jul 2025-07-01 2441338 NUS         295 2470021. Jul   2318886.
5 2025 Aug 2025-08-01 2367094 NUS         296 2473097. Aug   2327483.
6 2025 Sep 2025-09-01 2162772 NUS         297 2476172. Sep   2118672.
# ℹ 1 more variable: structural_break <dbl>

Let’s update the regression model with the structural break feature:

md3 <- lm(y ~ trend + month + structural_break, data = ts2)

summary(md3)


Call:
lm(formula = y ~ trend + month + structural_break, data = ts2)

Residuals:
    Min      1Q  Median      3Q     Max 
-512833  -69180  -12410   73577  354107 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      1606436.4    15770.1 101.866  < 2e-16 ***
trend               2469.5      127.4  19.383  < 2e-16 ***
month.L          -461253.9    23624.9 -19.524  < 2e-16 ***
month.Q          1106744.4    23564.0  46.968  < 2e-16 ***
month.C            24589.0    23554.8   1.044 0.297420    
month^4           206945.0    23588.6   8.773  < 2e-16 ***
month^5           280572.4    23555.1  11.911  < 2e-16 ***
month^6           -90199.0    23540.0  -3.832 0.000157 ***
month^7          -212111.4    23563.2  -9.002  < 2e-16 ***
month^8            81132.6    23542.9   3.446 0.000655 ***
month^9            43577.7    23478.9   1.856 0.064488 .  
month^10           71608.5    23428.9   3.056 0.002454 ** 
month^11          -17638.4    23411.0  -0.753 0.451821    
structural_break  164902.9    24166.1   6.824 5.39e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 117000 on 283 degrees of freedom
Multiple R-squared:  0.9409,    Adjusted R-squared:  0.9382 
F-statistic: 346.7 on 13 and 283 DF,  p-value: < 2.2e-16

As you can noticed from the model summary, adding the seasonal feature improved the model goodness of fit.

fit3 <- predict(object = md3, newdata = ts2, interval = "confidence", level = 0.95)
ts2$fit3 <- fit3[, 1]
p3 <- p |>
    add_lines(x = ts2$index, y = ts2$fit3, mode = "lines", line = list(color = "black", dash = "dash"), name = "Fitted")

p3

Let’s plot the model residuals:

plot_residuals(
    data = ts2,
    index_col = "date",
    actual_col = "y",
    fitted_col = "fit3",
    lag_max = 60,
    alpha = 0.05,
    frequency = 12
)

Let’s re-forecast the series:

future_data$structural_break <- 1

fc3 <- lm_forecast(
    model = md3,
    actual = ts2,
    future_data = future_data,
    actual_col = "y",
    lag_col = NULL,
    level = 0.95
)

plot_lm_forecast(fc3, actual_col = "y")

Handling Outliers

We can notices that the series has some years where the consumption is higher than normal years. We can segment two groups of outliers:

High: Jan 2025
Medium: Jan 2014, Jan 2018, Jan 2019, Jan 2022, Jan 2024

Let’s create the features:

ts2$outlier_high <- ifelse(ts2$date == as.Date("2025-01-01"), 1, 0)
outlier_medium_dates <- c(as.Date("2014-01-01"), as.Date("2018-01-01"), as.Date("2019-01-01"), as.Date("2022-01-01"), as.Date("2024-01-01"))
ts2$outlier_medium <- ifelse(ts2$date %in% outlier_medium_dates, 1, 0)

table(ts2$outlier_high)


  0   1 
296   1

table(ts2$outlier_medium)


  0   1 
292   5

Let’s train the model with the new features:

md4 <- lm(y ~ trend + month + structural_break + outlier_high + outlier_medium, data = ts2)

summary(md4)


Call:
lm(formula = y ~ trend + month + structural_break + outlier_high + 
    outlier_medium, data = ts2)

Residuals:
    Min      1Q  Median      3Q     Max 
-451827  -67577   -6464   69865  278419 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      1610804.6    15071.5 106.877  < 2e-16 ***
trend               2407.5      122.1  19.712  < 2e-16 ***
month.L          -430107.4    23311.4 -18.451  < 2e-16 ***
month.Q          1072900.6    23392.2  45.866  < 2e-16 ***
month.C            55283.5    23226.5   2.380 0.017971 *  
month^4           182361.9    22992.4   7.931 5.16e-14 ***
month^5           298220.4    22728.9  13.121  < 2e-16 ***
month^6          -101101.7    22560.9  -4.481 1.08e-05 ***
month^7          -206090.1    22516.3  -9.153  < 2e-16 ***
month^8            78132.7    22475.1   3.476 0.000589 ***
month^9            44597.1    22407.6   1.990 0.047530 *  
month^10           71152.6    22359.2   3.182 0.001625 ** 
month^11          -17596.5    22341.9  -0.788 0.431594    
structural_break  162435.9    23070.9   7.041 1.47e-11 ***
outlier_high      431715.8   115397.0   3.741 0.000222 ***
outlier_medium    248182.3    56713.6   4.376 1.71e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 111700 on 281 degrees of freedom
Multiple R-squared:  0.9466,    Adjusted R-squared:  0.9437 
F-statistic: 331.9 on 15 and 281 DF,  p-value: < 2.2e-16

Let’s refit the model and plot the residuals:

fit4 <- predict(object = md4, newdata = ts2, interval = "confidence", level = 0.95)

ts2$fit4 <- fit4[, 1]

Let’s plot the model residuals:

plot_residuals(
    data = ts2,
    index_col = "date",
    actual_col = "y",
    fitted_col = "fit4",
    lag_max = 60,
    alpha = 0.05,
    frequency = 12
)