Forecasting with the mlforecast Library
This notebook provides an example of creating a forecast with the mlforecast library.
Loading the Required Libraries
import pandas as pd
import datetime
from mlforecast import MLForecast
from utilsforecast.plotting import plot_series
from statistics import mean
from mlforecast.target_transforms import Differences
from mlforecast.utils import PredictionIntervals
from window_ops.expanding import expanding_mean
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
Loading the Data
We will use the US hourly demand for electricity. The series contains a bit more than two years of data.
# Load the series, parse the timestamps, and reshape it into the
# unique_id/ds/y format that mlforecast expects
ts = pd.read_csv("data/data.csv")
ts["ds"] = pd.to_datetime(ts["ds"])
ts = ts.sort_values("ds")
ts = ts[["ds", "y"]]
ts["unique_id"] = "1"

print(ts.head())
print(ts.dtypes)
ds y unique_id
0 2022-11-11 23:00:00 456403 1
1 2022-11-12 00:00:00 458842 1
2 2022-11-12 01:00:00 455111 1
3 2022-11-12 02:00:00 448035 1
4 2022-11-12 03:00:00 438165 1
ds datetime64[ns]
y int64
unique_id object
dtype: object
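Since the forecasting object below is created with an hourly frequency, it is worth a quick check (my addition, not part of the original notebook) that the timestamps are regular:
# Check that the series is strictly hourly: a single 1-hour step and
# no duplicated timestamps are expected
print(ts["ds"].diff().value_counts())
print(ts["ds"].duplicated().sum())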
Let’s plot the series:
= "plotly").update_layout(height=300) plot_series(ts, engine
Next, we will split the data into training and testing partitions, leaving the last 72 hours as the testing partition.
test_length = 72
end = ts["ds"].max()
train_end = end - datetime.timedelta(hours=test_length)

train = ts[ts["ds"] <= train_end]
test = ts[ts["ds"] > train_end]
Let’s plot the partitions:
= "plotly").update_layout(height=300) plot_series(train, engine
= "plotly").update_layout(height=300) plot_series(test, engine
Create a Forecast
We set up a forecasting object using the following three regression models:
- Light Gradient Boosting Machine (LightGBM) model
- XGBoost model
- Linear Regression model
We will regress the series on its lags (lags 1 through 23) using the lags
argument, and create seasonal features with the date_features
argument:
ml_models = [LGBMRegressor(verbosity=-1), XGBRegressor(), LinearRegression()]
h = 72

mlf = MLForecast(
    models=ml_models,
    freq='h',
    lags=list(range(1, 24)),
    date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week', 'hour']
)
mlf.preprocess(train)
 | ds | y | unique_id | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 | ... | lag21 | lag22 | lag23 | year | month | day | dayofweek | quarter | week | hour
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
23 | 2022-11-12 22:00:00 | 426580.0 | 1 | 424167.0 | 425596.0 | 429371.0 | 432831.0 | 433738.0 | 431735.0 | 422692.0 | ... | 455111.0 | 458842.0 | 456403.0 | 2022 | 11 | 12 | 5 | 4 | 45 | 22 |
24 | 2022-11-12 23:00:00 | 434299.0 | 1 | 426580.0 | 424167.0 | 425596.0 | 429371.0 | 432831.0 | 433738.0 | 431735.0 | ... | 448035.0 | 455111.0 | 458842.0 | 2022 | 11 | 12 | 5 | 4 | 45 | 23 |
25 | 2022-11-13 00:00:00 | 441559.0 | 1 | 434299.0 | 426580.0 | 424167.0 | 425596.0 | 429371.0 | 432831.0 | 433738.0 | ... | 438165.0 | 448035.0 | 455111.0 | 2022 | 11 | 13 | 6 | 4 | 45 | 0 |
26 | 2022-11-13 01:00:00 | 443879.0 | 1 | 441559.0 | 434299.0 | 426580.0 | 424167.0 | 425596.0 | 429371.0 | 432831.0 | ... | 423575.0 | 438165.0 | 448035.0 | 2022 | 11 | 13 | 6 | 4 | 45 | 1 |
27 | 2022-11-13 02:00:00 | 441392.0 | 1 | 443879.0 | 441559.0 | 434299.0 | 426580.0 | 424167.0 | 425596.0 | 429371.0 | ... | 407733.0 | 423575.0 | 438165.0 | 2022 | 11 | 13 | 6 | 4 | 45 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17924 | 2024-11-27 19:00:00 | 468461.0 | 1 | 467581.0 | 467932.0 | 467407.0 | 465722.0 | 461751.0 | 450555.0 | 430520.0 | ... | 463019.0 | 458560.0 | 458819.0 | 2024 | 11 | 27 | 2 | 4 | 48 | 19 |
17925 | 2024-11-27 20:00:00 | 469307.0 | 1 | 468461.0 | 467581.0 | 467932.0 | 467407.0 | 465722.0 | 461751.0 | 450555.0 | ... | 474476.0 | 463019.0 | 458560.0 | 2024 | 11 | 27 | 2 | 4 | 48 | 20 |
17926 | 2024-11-27 21:00:00 | 470896.0 | 1 | 469307.0 | 468461.0 | 467581.0 | 467932.0 | 467407.0 | 465722.0 | 461751.0 | ... | 483113.0 | 474476.0 | 463019.0 | 2024 | 11 | 27 | 2 | 4 | 48 | 21 |
17927 | 2024-11-27 22:00:00 | 475183.0 | 1 | 470896.0 | 469307.0 | 468461.0 | 467581.0 | 467932.0 | 467407.0 | 465722.0 | ... | 484300.0 | 483113.0 | 474476.0 | 2024 | 11 | 27 | 2 | 4 | 48 | 22 |
17928 | 2024-11-27 23:00:00 | 482886.0 | 1 | 475183.0 | 470896.0 | 469307.0 | 468461.0 | 467581.0 | 467932.0 | 467407.0 | ... | 480209.0 | 484300.0 | 483113.0 | 2024 | 11 | 27 | 2 | 4 | 48 | 23 |
17906 rows × 33 columns
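Note that the Differences and expanding_mean imports loaded at the top are not used in this setup. As a hedged sketch (an optional variant, not something used in the rest of this notebook), they could be wired in through the target_transforms and lag_transforms arguments:
# Optional variant: difference the target at lag 24 and add an
# expanding-mean transform of lag 1 as an extra feature
mlf_alt = MLForecast(
    models=ml_models,
    freq='h',
    lags=list(range(1, 24)),
    lag_transforms={1: [expanding_mean]},
    date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week', 'hour'],
    target_transforms=[Differences([24])]
)
mlf_alt.preprocess(train).head()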
We will use the fit method to train the models on the training partition:
mlf.fit(
    df=train,
    fitted=True,
    prediction_intervals=PredictionIntervals(n_windows=3, h=h, method="conformal_distribution")
)
MLForecast(models=[LGBMRegressor, XGBRegressor, LinearRegression], freq=h, lag_features=['lag1', 'lag2', 'lag3', 'lag4', 'lag5', 'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12', 'lag13', 'lag14', 'lag15', 'lag16', 'lag17', 'lag18', 'lag19', 'lag20', 'lag21', 'lag22', 'lag23'], date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week', 'hour'], num_threads=1)
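Because fitted=True was passed, the in-sample predictions are stored on the object. As a hedged sketch (assuming the current mlforecast API), they can be pulled out with forecast_fitted_values:
# Inspect the in-sample (fitted) values; available because fitted=True above
fitted_values = mlf.forecast_fitted_values()
print(fitted_values.head())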
And create the forecast using the predict method:
ml_forecast = mlf.predict(h=h, level=[95])

ml_forecast
 | unique_id | ds | LGBMRegressor | XGBRegressor | LinearRegression | LGBMRegressor-lo-95 | LGBMRegressor-hi-95 | XGBRegressor-lo-95 | XGBRegressor-hi-95 | LinearRegression-lo-95 | LinearRegression-hi-95
---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 2024-11-28 00:00:00 | 487199.053594 | 488030.00000 | 488094.71875 | 481374.775774 | 493023.331413 | 482843.960938 | 493216.039062 | 485747.859375 | 490441.578125 |
1 | 1 | 2024-11-28 01:00:00 | 488083.773707 | 486569.75000 | 489366.46875 | 480367.297230 | 495800.250184 | 481044.855469 | 492094.644531 | 481237.285156 | 497495.652344 |
2 | 1 | 2024-11-28 02:00:00 | 482113.651224 | 477829.81250 | 486026.62500 | 471773.128881 | 492454.173567 | 467884.234375 | 487775.390625 | 473915.386719 | 498137.863281 |
3 | 1 | 2024-11-28 03:00:00 | 472089.878329 | 467479.31250 | 476700.00000 | 460520.207591 | 483659.549068 | 454011.707031 | 480946.917969 | 460611.222656 | 492788.777344 |
4 | 1 | 2024-11-28 04:00:00 | 458078.326833 | 452940.78125 | 462282.68750 | 445039.858843 | 471116.794823 | 437807.613281 | 468073.949219 | 441764.097656 | 482801.277344 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
67 | 1 | 2024-11-30 19:00:00 | 447631.719762 | 438918.40625 | 460450.43750 | 426512.220985 | 468751.218540 | 422838.820312 | 454997.992188 | 442614.953125 | 478285.921875 |
68 | 1 | 2024-11-30 20:00:00 | 445066.169093 | 435889.65625 | 463560.43750 | 424171.744252 | 465960.593934 | 419886.847656 | 451892.464844 | 441808.015625 | 485312.859375 |
69 | 1 | 2024-11-30 21:00:00 | 445385.142138 | 432958.81250 | 467750.46875 | 426866.760298 | 463903.523978 | 415659.046875 | 450258.578125 | 442364.996094 | 493135.941406 |
70 | 1 | 2024-11-30 22:00:00 | 449254.113714 | 433562.78125 | 472490.78125 | 433087.791798 | 465420.435631 | 415600.492188 | 451525.070312 | 446516.320312 | 498465.242188 |
71 | 1 | 2024-11-30 23:00:00 | 459357.945277 | 440341.65625 | 476577.71875 | 448663.109064 | 470052.781491 | 420396.593750 | 460286.718750 | 459785.488281 | 493369.949219 |
72 rows × 11 columns
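The prediction intervals follow the model-lo-level / model-hi-level column naming convention; for example, to keep only the LightGBM forecast and its 95% interval (a small illustrative slice, not part of the original notebook):
# Keep only the LightGBM point forecast and its 95% prediction interval
ml_forecast[["unique_id", "ds", "LGBMRegressor",
             "LGBMRegressor-lo-95", "LGBMRegressor-hi-95"]].head()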
Let’s use the plot_series function again to plot the forecast against the testing partition:
= [95], engine = "plotly").update_layout(height=300) plot_series(test, ml_forecast, level
Last but not least, let’s score the models’ performance on the testing partition:
def mape(y, yhat):
    mape = mean(abs(y - yhat) / y)
    return mape


def rmse(y, yhat):
    rmse = (mean((y - yhat) ** 2)) ** 0.5
    return rmse


def coverage(y, lower, upper):
    coverage = sum((y <= upper) & (y >= lower)) / len(y)
    return coverage


# Join the forecasts with the actual values from the testing partition
fc = ml_forecast.merge(test, how="left", on="ds")
fc_performance = None

# Score each model and collect the results into a single data frame
for i in ["LGBMRegressor", "XGBRegressor", "LinearRegression"]:
    m = mape(y=fc.y, yhat=fc[i])
    r = rmse(y=fc.y, yhat=fc[i])
    c = coverage(y=fc.y, lower=fc[i + "-lo-95"], upper=fc[i + "-hi-95"])

    perf = {"model": i,
            "mape": m,
            "rmse": r,
            "coverage": c}
    if fc_performance is None:
        fc_performance = pd.DataFrame([perf])
    else:
        fc_performance = pd.concat([fc_performance, pd.DataFrame([perf])])

fc_performance.sort_values("rmse")
 | model | mape | rmse | coverage
---|---|---|---|---
0 | LGBMRegressor | 0.047596 | 26080.550668 | 0.541667 |
0 | LinearRegression | 0.047649 | 26238.342427 | 0.583333 |
0 | XGBRegressor | 0.052684 | 29114.271130 | 0.541667 |
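The same point-forecast metrics could also be computed with utilsforecast's built-in evaluation helpers. This is a hedged sketch (the evaluate and losses imports, and the uf_mape/uf_rmse aliases, are my assumption about the current utilsforecast API, not part of the original notebook):
# Hedged alternative: compute MAPE and RMSE per model without the manual loop
from utilsforecast.evaluation import evaluate
from utilsforecast.losses import mape as uf_mape, rmse as uf_rmse

eval_df = ml_forecast.merge(test, how="left", on=["unique_id", "ds"])
evaluate(
    eval_df,
    metrics=[uf_mape, uf_rmse],
    models=["LGBMRegressor", "XGBRegressor", "LinearRegression"]
)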