Forecasting with the mlforecast Library

This notebook provides an example of creating a forecast with the mlforecast library.

Loading the Required Libraries

import datetime

import pandas as pd
from statistics import mean

from mlforecast import MLForecast
from mlforecast.utils import PredictionIntervals
from utilsforecast.plotting import plot_series

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

Loading the Data

We will use the US hourly demand for electricity. The series contains more than two years of data.

ts = pd.read_csv("data/data.csv")
ts["ds"] = pd.to_datetime(ts["ds"])

print(ts.head())

print(ts.dtypes)
                   ds         y  unique_id
0 2022-04-06 23:00:00  462894.0          1
1 2022-04-07 00:00:00  463663.0          1
2 2022-04-07 01:00:00  464916.0          1
3 2022-04-07 02:00:00  459376.0          1
4 2022-04-07 03:00:00  441989.0          1
ds           datetime64[ns]
y                   float64
unique_id             int64
dtype: object
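
Since we will later tell mlforecast that the series is hourly (freq='h'), it is worth confirming that the timestamps form a regular hourly sequence. A minimal check:

# Every gap between consecutive timestamps should be exactly one hour
print(ts["ds"].diff().value_counts())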

Let’s plot the series:

plot_series(ts, engine="plotly").update_layout(height=300)

Next, we will split the data into training and testing partitions, leaving the last 72 hours as the testing partition.

test_length = 72
end = ts["ds"].max()
train_end = end - datetime.timedelta(hours=test_length)

train = ts[ts["ds"] <= train_end]
test = ts[ts["ds"] > train_end]
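
As a quick sanity check, we can verify that the testing partition holds exactly 72 hourly observations and that the two partitions do not overlap:

# Verify the split: 72 rows in the testing partition, no overlap with training
assert len(test) == test_length
assert train["ds"].max() < test["ds"].min()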

Let’s plot the partitions:

plot_series(train, engine="plotly").update_layout(height=300)
plot_series(test, engine="plotly").update_layout(height=300)

Creating a Forecast

We set up a forecasting object using the following three regression models:

  • LightGBM model
  • XGBoost model
  • Linear Regression model

We will regress the series on its own lags (lags 1 through 23) using the lags argument, and add seasonal features with the date_features argument:

ml_models = [LGBMRegressor(), XGBRegressor(), LinearRegression()]

mlf = MLForecast(
    models=ml_models,
    freq='h',  # Hourly series
    lags=list(range(1, 24)),  # Lags 1 through 23 to use as regressors
    date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week', 'hour'],  # Date features to use as regressors
)
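
Before fitting, we can inspect the feature matrix that mlforecast builds from these settings. The preprocess method applies the lag and date features to the training data, which is a convenient way to sanity-check the setup above (an optional step):

# Optional: inspect the engineered features (23 lags plus the date features)
# that mlforecast will feed to the models
features = mlf.preprocess(train)
print(features.columns.tolist())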

We will use the fit method to train the models on the training partition. Setting fitted=True stores the in-sample predictions, and the prediction_intervals argument calibrates conformal prediction intervals using 5 cross-validation windows, each with a 72-hour horizon:

mlf.fit(
    df=train,
    fitted=True,
    prediction_intervals=PredictionIntervals(n_windows=5, h=72, method="conformal_distribution"),
)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6003
[LightGBM] [Info] Number of data points in the train set: 17546, number of used features: 30
[LightGBM] [Info] Start training from score 468545.938421
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6003
[LightGBM] [Info] Number of data points in the train set: 17906, number of used features: 30
[LightGBM] [Info] Start training from score 467417.387929
MLForecast(models=[LGBMRegressor, XGBRegressor, LinearRegression], freq=h, lag_features=['lag1', 'lag2', 'lag3', 'lag4', 'lag5', 'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12', 'lag13', 'lag14', 'lag15', 'lag16', 'lag17', 'lag18', 'lag19', 'lag20', 'lag21', 'lag22', 'lag23'], date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week', 'hour'], num_threads=1)
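
Because we trained with fitted=True, the in-sample predictions are stored on the forecasting object and can be retrieved with the forecast_fitted_values method (an optional step, useful for inspecting the in-sample fit):

# Retrieve the in-sample (fitted) values for each model
fitted_values = mlf.forecast_fitted_values()
print(fitted_values.head())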

And create the forecast for the next 72 hours using the predict method:

ml_forecast = mlf.predict(72, level=[95])

ml_forecast
    unique_id                   ds  LGBMRegressor  XGBRegressor  LinearRegression  LGBMRegressor-lo-95  LGBMRegressor-hi-95  XGBRegressor-lo-95  XGBRegressor-hi-95  LinearRegression-lo-95  LinearRegression-hi-95
 0          1  2024-04-23 00:00:00  443380.300930  442959.93750     442375.437073        438995.964332        447764.637527       440161.827500      445758.047500           440624.176974           444126.697172
 1          1  2024-04-23 01:00:00  446518.832009  446556.71875     443380.227789        440255.463132        452782.200886       442812.694906      450300.742594           438993.591895           447766.863683
 2          1  2024-04-23 02:00:00  444037.865700  444417.34375     436880.021739        434083.716659        453992.014741       439461.053438      449373.634062           428366.448891           445393.594587
 3          1  2024-04-23 03:00:00  430886.079710  432637.34375     423424.115322        419489.545789        442282.613631       428356.237437      436918.450063           412949.745843           433898.484800
 4          1  2024-04-23 04:00:00  410735.049432  414182.62500     407929.955447        399380.539620        422089.559245       409950.440406      418414.809594           392810.056056           423049.854838
..        ...                  ...            ...           ...               ...                  ...                  ...                 ...                 ...                     ...                     ...
67          1  2024-04-25 19:00:00  445742.145942  425914.43750     444574.364180        417501.659843        473982.632041       396206.284156      455622.590844           407117.004181           482031.724180
68          1  2024-04-25 20:00:00  445582.186236  428038.31250     447792.938345        415246.156914        475918.215558       395746.243344      460330.381656           409559.426203           486026.450487
69          1  2024-04-25 21:00:00  446785.144275  430216.84375     452677.618714        415889.116396        477681.172153       394151.955344      466281.732156           413474.786873           491880.450556
70          1  2024-04-25 22:00:00  448322.142950  435146.59375     458245.842975        418846.763239        477797.522662       401076.494094      469216.693406           419257.635038           497234.050913
71          1  2024-04-25 23:00:00  449504.832286  436916.65625     462689.332354        421221.029906        477788.634667       405347.459531      468485.852969           428175.014241           497203.650468

72 rows × 11 columns

Let’s use the plot_series function again to plot the forecast against the testing partition:

plot_series(test, ml_forecast, level=[95], engine="plotly").update_layout(height=300)

Last but not least, let’s score the models’ performance on the testing partition:

def mape(y, yhat):
    # Mean absolute percentage error, returned as a fraction (not a percentage)
    return mean(abs(y - yhat) / y)

def rmse(y, yhat):
    # Root mean squared error
    return mean((y - yhat) ** 2) ** 0.5

def coverage(y, lower, upper):
    # Share of actual values that fall inside the prediction interval
    return sum((y <= upper) & (y >= lower)) / len(y)
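
As a quick illustration of these helpers, here is a toy check with made-up numbers (not part of the forecast workflow):

import numpy as np

y_toy = np.array([100.0, 200.0, 300.0])
yhat_toy = np.array([110.0, 190.0, 300.0])

print(mape(y_toy, yhat_toy))  # (0.10 + 0.05 + 0.00) / 3 = 0.05
print(rmse(y_toy, yhat_toy))  # sqrt((100 + 100 + 0) / 3) ≈ 8.16
print(coverage(y_toy, lower=yhat_toy - 15, upper=yhat_toy + 15))  # 1.0
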
fc = ml_forecast.merge(test, how="left", on=["unique_id", "ds"])
fc_performance = None

for i in ["LGBMRegressor", "XGBRegressor", "LinearRegression"]:
    m = mape(y=fc.y, yhat=fc[i])
    r = rmse(y=fc.y, yhat=fc[i])
    c = coverage(y=fc.y, lower=fc[i + "-lo-95"], upper=fc[i + "-hi-95"])

    perf = {"model": i,
            "mape": m,
            "rmse": r,
            "coverage": c}
    if fc_performance is None:
        fc_performance = pd.DataFrame([perf])
    else:
        fc_performance = pd.concat([fc_performance, pd.DataFrame([perf])])

fc_performance.sort_values("rmse")
              model      mape          rmse  coverage
0     LGBMRegressor  0.018438   9481.413402  0.750000
0      XGBRegressor  0.023905  14923.364734  0.805556
0  LinearRegression  0.031024  15386.193312  0.888889