Forecasting with the mlforecast Library

This notebook provides an example of creating a forecast with the mlforecast library.

Loading the Required Libraries

import pandas as pd
import datetime
from statistics import mean

from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from mlforecast.utils import PredictionIntervals
from utilsforecast.plotting import plot_series
from window_ops.expanding import expanding_mean

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

Loading the Data

We will use the US hourly demand for electricity. The series contains a bit more than two years of hourly observations.
ts = pd.read_csv("data/data.csv")
ts["ds"] = pd.to_datetime(ts["ds"])
print(ts.head())
print(ts.dtypes)
ds y unique_id
0 2022-04-06 23:00:00 462894.0 1
1 2022-04-07 00:00:00 463663.0 1
2 2022-04-07 01:00:00 464916.0 1
3 2022-04-07 02:00:00 459376.0 1
4 2022-04-07 03:00:00 441989.0 1
ds datetime64[ns]
y float64
unique_id int64
dtype: object
Let’s plot the series:
= "plotly").update_layout(height=300) plot_series(ts, engine
Next, we will split the data into training and testing partitions, leaving the last 72 hours as the testing partition.
test_length = 72
end = ts["ds"].max()
train_end = end - datetime.timedelta(hours=test_length)

train = ts[ts["ds"] <= train_end]
test = ts[ts["ds"] > train_end]
Let’s plot the partitions:
= "plotly").update_layout(height=300) plot_series(train, engine
= "plotly").update_layout(height=300) plot_series(test, engine
Create a Forecast
We set up a forecasting object using the following three regression models:
- Light Gradient Boosting model (LGBMRegressor)
- XGBoost model (XGBRegressor)
- Linear Regression model (LinearRegression)

We will regress the series on its lags (lags 1 through 23) using the lags argument, and create seasonal features with the date_features argument:
ml_models = [LGBMRegressor(), XGBRegressor(), LinearRegression()]

mlf = MLForecast(
    models=ml_models,
    freq='h',
    lags=list(range(1, 24)),
    date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week', 'hour'],  # Date features to use as regressors
)
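The Differences and expanding_mean imports are not used by the object above; they point at mlforecast's optional target_transforms and lag_transforms hooks. As a minimal sketch (an alternative setup, not the model fitted below), a 24-hour differencing of the target and an expanding-mean feature over lag 1 could be wired in like this:

# A sketch only: the same setup with the optional transformations that the
# unused imports above hint at. Differences([24]) removes the daily
# seasonality from the target before fitting, and the expanding mean of
# lag 1 is added as an extra feature.
mlf_alt = MLForecast(
    models=ml_models,
    freq='h',
    lags=list(range(1, 24)),
    date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week', 'hour'],
    target_transforms=[Differences([24])],
    lag_transforms={1: [expanding_mean]},
)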
We will use the fit method to train the models on the training partition:
mlf.fit(df=train, fitted=True,
        prediction_intervals=PredictionIntervals(n_windows=5, h=72, method="conformal_distribution"))
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6003
[LightGBM] [Info] Number of data points in the train set: 17546, number of used features: 30
[LightGBM] [Info] Start training from score 468545.938421
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6003
[LightGBM] [Info] Number of data points in the train set: 17906, number of used features: 30
[LightGBM] [Info] Start training from score 467417.387929
MLForecast(models=[LGBMRegressor, XGBRegressor, LinearRegression], freq=h, lag_features=['lag1', 'lag2', 'lag3', 'lag4', 'lag5', 'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12', 'lag13', 'lag14', 'lag15', 'lag16', 'lag17', 'lag18', 'lag19', 'lag20', 'lag21', 'lag22', 'lag23'], date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week', 'hour'], num_threads=1)
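The fitted=True flag above stores the in-sample predictions, and the PredictionIntervals settings calibrate conformal intervals from 5 rolling validation windows of 72 hours each. As a quick, optional check, the stored fitted values can be retrieved with the forecast_fitted_values method:

# Optional sanity check: pull the in-sample predictions saved by fitted=True
# to see how closely each model tracks the training data.
fitted_values = mlf.forecast_fitted_values()
print(fitted_values.head())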
And create the forecast using the predict method:
ml_forecast = mlf.predict(72, level=[95])

ml_forecast
|   | unique_id | ds | LGBMRegressor | XGBRegressor | LinearRegression | LGBMRegressor-lo-95 | LGBMRegressor-hi-95 | XGBRegressor-lo-95 | XGBRegressor-hi-95 | LinearRegression-lo-95 | LinearRegression-hi-95 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-04-23 00:00:00 | 443380.300930 | 442959.93750 | 442375.437073 | 438995.964332 | 447764.637527 | 440161.827500 | 445758.047500 | 440624.176974 | 444126.697172 |
| 1 | 1 | 2024-04-23 01:00:00 | 446518.832009 | 446556.71875 | 443380.227789 | 440255.463132 | 452782.200886 | 442812.694906 | 450300.742594 | 438993.591895 | 447766.863683 |
| 2 | 1 | 2024-04-23 02:00:00 | 444037.865700 | 444417.34375 | 436880.021739 | 434083.716659 | 453992.014741 | 439461.053438 | 449373.634062 | 428366.448891 | 445393.594587 |
| 3 | 1 | 2024-04-23 03:00:00 | 430886.079710 | 432637.34375 | 423424.115322 | 419489.545789 | 442282.613631 | 428356.237437 | 436918.450063 | 412949.745843 | 433898.484800 |
| 4 | 1 | 2024-04-23 04:00:00 | 410735.049432 | 414182.62500 | 407929.955447 | 399380.539620 | 422089.559245 | 409950.440406 | 418414.809594 | 392810.056056 | 423049.854838 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 67 | 1 | 2024-04-25 19:00:00 | 445742.145942 | 425914.43750 | 444574.364180 | 417501.659843 | 473982.632041 | 396206.284156 | 455622.590844 | 407117.004181 | 482031.724180 |
| 68 | 1 | 2024-04-25 20:00:00 | 445582.186236 | 428038.31250 | 447792.938345 | 415246.156914 | 475918.215558 | 395746.243344 | 460330.381656 | 409559.426203 | 486026.450487 |
| 69 | 1 | 2024-04-25 21:00:00 | 446785.144275 | 430216.84375 | 452677.618714 | 415889.116396 | 477681.172153 | 394151.955344 | 466281.732156 | 413474.786873 | 491880.450556 |
| 70 | 1 | 2024-04-25 22:00:00 | 448322.142950 | 435146.59375 | 458245.842975 | 418846.763239 | 477797.522662 | 401076.494094 | 469216.693406 | 419257.635038 | 497234.050913 |
| 71 | 1 | 2024-04-25 23:00:00 | 449504.832286 | 436916.65625 | 462689.332354 | 421221.029906 | 477788.634667 | 405347.459531 | 468485.852969 | 428175.014241 | 497203.650468 |

72 rows × 11 columns
Let’s use the plot_series function again to plot the forecast against the testing partition:
plot_series(test, ml_forecast, level=[95], engine="plotly").update_layout(height=300)
Last but not least, let’s score the models’ performance on the testing partition:
def mape(y, yhat):
    # Mean absolute percentage error
    mape = mean(abs(y - yhat) / y)
    return mape

def rmse(y, yhat):
    # Root mean squared error
    rmse = (mean((y - yhat) ** 2)) ** 0.5
    return rmse

def coverage(y, lower, upper):
    # Share of actual values that fall inside the prediction interval
    coverage = sum((y <= upper) & (y >= lower)) / len(y)
    return coverage
# Single series, so merging on "ds" alone is safe; with multiple series,
# merge on ["unique_id", "ds"] instead
fc = ml_forecast.merge(test, how="left", on="ds")
fc_performance = None

for i in ["LGBMRegressor", "XGBRegressor", "LinearRegression"]:
    m = mape(y=fc.y, yhat=fc[i])
    r = rmse(y=fc.y, yhat=fc[i])
    c = coverage(y=fc.y, lower=fc[i + "-lo-95"], upper=fc[i + "-hi-95"])

    perf = {"model": i,
            "mape": m,
            "rmse": r,
            "coverage": c}

    if fc_performance is None:
        fc_performance = pd.DataFrame([perf])
    else:
        fc_performance = pd.concat([fc_performance, pd.DataFrame([perf])])

fc_performance.sort_values("rmse")
|   | model | mape | rmse | coverage |
|---|---|---|---|---|
| 0 | LGBMRegressor | 0.018438 | 9481.413402 | 0.750000 |
| 0 | XGBRegressor | 0.023905 | 14923.364734 | 0.805556 |
| 0 | LinearRegression | 0.031024 | 15386.193312 | 0.888889 |
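The LGBMRegressor achieves the lowest MAPE and RMSE, while the LinearRegression intervals come closest to the nominal 95% coverage. A single 72-hour holdout can be noisy; as a hedged sketch (assuming mlforecast's cross_validation method with the same h and n_windows arguments used above), a more robust comparison over several rolling windows could look like this:

# A sketch only: refit and score the models over 5 rolling 72-hour windows.
# The result contains out-of-sample predictions per window, keyed by a
# `cutoff` column marking the end of each training window.
cv_results = mlf.cross_validation(df=train, n_windows=5, h=72)
print(cv_results.head())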