import pandas as pd
import numpy as np
import requests
import json
import os
import mlflow
import datetime
import plotly.graph_objects as go
import mlflow
from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from mlforecast.utils import PredictionIntervals
from window_ops.expanding import expanding_mean
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from utilsforecast.plotting import plot_series
from statistics import mean
import plotly.express as px
Training Models with Backtesting
Load Libraries
Load metadata
# Read the project settings and extract the path that the backtesting
# results are written to at the end of the notebook.
# The context manager closes the settings file as soon as it is parsed.
with open("./settings/settings.json", encoding="utf-8") as raw_json:
    meta_json = json.load(raw_json)
backtesting_path = meta_json["data"]["backtesting_path"]
Load the Data
Reformat the data
::: {#cell-Load the data .cell execution_count=3}
# Load the series, parse the timestamp column, and order rows chronologically.
ts = pd.read_csv("data/data.csv")
ts["ds"] = pd.to_datetime(ts["ds"])
ts = ts.sort_values("ds")
ts.head()
ds | y | unique_id | |
---|---|---|---|
0 | 2022-04-06 23:00:00 | 462894.0 | 1 |
1 | 2022-04-07 00:00:00 | 463663.0 | 1 |
2 | 2022-04-07 01:00:00 | 464916.0 | 1 |
3 | 2022-04-07 02:00:00 | 459376.0 | 1 |
4 | 2022-04-07 03:00:00 | 441989.0 | 1 |
:::
Subset for the last 25 months:
# Keep only the trailing window of the series. The window length is expressed
# in hours and approximates each of the 25 months as 31 days.
end = ts["ds"].max()
start = end - datetime.timedelta(hours=24 * 31 * 25)
ts = ts[ts["ds"] >= start]
Set the Backtesting
Define the forecasting models:
# Candidate regressors keyed by label. The labels are reused downstream as
# the forecast column names (e.g. "lightGBM", "lightGBM-lo-95") and as the
# "model_label" values in the scoring tables.
ml_models = {
    "lightGBM": LGBMRegressor(verbosity=-1),
    "xgboost": XGBRegressor(),
    "linear_regression": LinearRegression(),
    "lasso": Lasso(),
    "ridge": Ridge(),
}
# Forecasting pipeline: hourly frequency, lags 1 through 23
# (range(1, 24) excludes 24), plus calendar features derived from "ds".
mlf = MLForecast(
    models=ml_models,
    freq='h',
    lags=list(range(1, 24)),
    date_features=["month", "day", "dayofweek", "week", "hour"],
)
# Backtesting settings.
h = 72            # forecast horizon, in rows of the hourly series
step_size = 24    # offset between consecutive backtesting windows
partitions = 10   # number of backtesting windows
# Prediction-interval settings.
n_windows = 5
method = "conformal_distribution"
pi = PredictionIntervals(h=h, n_windows=n_windows, method=method)
levels = [95]
# Run the backtest: `partitions` windows of length `h`, moved by `step_size`,
# with prediction intervals at the requested levels.
bkt_df = mlf.cross_validation(
    df=ts,
    h=h,
    step_size=step_size,
    n_windows=partitions,
    prediction_intervals=pi,
    level=levels,
)
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.005e+10, tolerance: 9.766e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.124e+10, tolerance: 9.923e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.010e+10, tolerance: 9.781e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.130e+10, tolerance: 9.943e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.014e+10, tolerance: 9.800e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.141e+10, tolerance: 9.953e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.023e+10, tolerance: 9.811e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.151e+10, tolerance: 9.961e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.030e+10, tolerance: 9.818e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.160e+10, tolerance: 9.968e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.039e+10, tolerance: 9.824e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.167e+10, tolerance: 9.974e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.048e+10, tolerance: 9.829e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.176e+10, tolerance: 9.981e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.058e+10, tolerance: 9.834e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.182e+10, tolerance: 9.992e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.064e+10, tolerance: 9.844e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.188e+10, tolerance: 1.001e+10
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.068e+10, tolerance: 9.858e+09
/opt/forecasting-poc/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:697: ConvergenceWarning:
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.207e+10, tolerance: 1.002e+10
::: {#cell-Review the bkt .cell execution_count=8}
# Preview the first rows of the cross-validation output.
bkt_df.head()
unique_id | ds | cutoff | y | lightGBM | xgboost | linear_regression | lasso | ridge | lightGBM-lo-95 | lightGBM-hi-95 | xgboost-lo-95 | xgboost-hi-95 | linear_regression-lo-95 | linear_regression-hi-95 | lasso-lo-95 | lasso-hi-95 | ridge-lo-95 | ridge-hi-95 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2024-04-14 00:00:00 | 2024-04-13 23:00:00 | 421836.39 | 422716.385199 | 421999.12500 | 422352.303789 | 422303.387840 | 422352.318516 | 421022.905138 | 424409.865260 | 413150.163187 | 430848.086813 | 420572.672519 | 424131.935059 | 420193.904762 | 424412.870918 | 420572.698521 | 424131.938511 |
1 | 1 | 2024-04-14 01:00:00 | 2024-04-13 23:00:00 | 421300.67 | 422439.422659 | 422403.71875 | 420444.205975 | 420181.246550 | 420444.246554 | 417651.607031 | 427227.238288 | 410528.525875 | 434278.911625 | 413143.863192 | 427744.548759 | 412754.618246 | 427607.874853 | 413143.912384 | 427744.580725 |
2 | 1 | 2024-04-14 02:00:00 | 2024-04-13 23:00:00 | 415940.13 | 417209.926483 | 418226.87500 | 413555.865386 | 412950.784579 | 413555.936496 | 407277.565240 | 427142.287725 | 403594.666500 | 432859.083500 | 401713.539463 | 425398.191308 | 400649.670396 | 425251.898762 | 401713.623882 | 425398.249109 |
3 | 1 | 2024-04-14 03:00:00 | 2024-04-13 23:00:00 | 403788.77 | 405820.047603 | 405515.62500 | 401358.336479 | 400883.378063 | 401358.437580 | 392670.364652 | 418969.730553 | 388216.352469 | 422814.897531 | 388385.173887 | 414331.499070 | 387325.023471 | 414441.732654 | 388385.318921 | 414331.556240 |
4 | 1 | 2024-04-14 04:00:00 | 2024-04-13 23:00:00 | 388042.74 | 386520.594124 | 388142.90625 | 386263.522024 | 385896.776166 | 386263.652143 | 372017.701036 | 401023.487212 | 370163.194469 | 406122.618031 | 370245.276224 | 402281.767824 | 369233.631968 | 402559.920363 | 370245.519124 | 402281.785161 |
:::
::: {#cell-Review a model .cell execution_count=9}
bkt_df[["ds", "lightGBM", "lightGBM-lo-95", "lightGBM-hi-95"]].head()
ds | lightGBM | lightGBM-lo-95 | lightGBM-hi-95 | |
---|---|---|---|---|
0 | 2024-04-14 00:00:00 | 422716.385199 | 421022.905138 | 424409.865260 |
1 | 2024-04-14 01:00:00 | 422439.422659 | 417651.607031 | 427227.238288 |
2 | 2024-04-14 02:00:00 | 417209.926483 | 407277.565240 | 427142.287725 |
3 | 2024-04-14 03:00:00 | 405820.047603 | 392670.364652 | 418969.730553 |
4 | 2024-04-14 04:00:00 | 386520.594124 | 372017.701036 | 401023.487212 |
:::
::: {#cell-Reformat the bkt obj .cell execution_count=10}
# Reshape the wide backtesting table (one column per model and bound) into a
# long table with one row per (series, timestamp, cutoff, model).
model_label = list(ml_models.keys())
model_name = [type(s).__name__ for s in ml_models.values()]
lower = [s + "-lo-95" for s in model_label]
upper = [s + "-hi-95" for s in model_label]
# Lookup from the user-facing label to the estimator class name.
models_mapping = pd.DataFrame({"model_label": model_label,
                               "model_name": model_name})

id_cols = ["unique_id", "ds", "cutoff"]

# Point forecasts.
d1 = pd.melt(bkt_df, id_vars=id_cols,
             value_vars=model_label, var_name="model_label", value_name="forecast")
# Lower bounds; strip the suffix so the label matches d1.
# regex=False: the suffix is a literal string, not a pattern.
d2 = pd.melt(bkt_df, id_vars=id_cols,
             value_vars=lower, var_name="model_label", value_name="lower")
d2["model_label"] = d2["model_label"].str.replace("-lo-95", "", regex=False)
# Upper bounds, handled the same way.
d3 = pd.melt(bkt_df, id_vars=id_cols,
             value_vars=upper, var_name="model_label", value_name="upper")
d3["model_label"] = d3["model_label"].str.replace("-hi-95", "", regex=False)

bkt_long = (
    d1
    .merge(right=d2, how="left", on=["unique_id", "ds", "cutoff", "model_label"])
    .merge(right=d3, how="left", on=["unique_id", "ds", "cutoff", "model_label"])
    .merge(right=models_mapping, how="left", on=["model_label"])
    # The melts drop "y", so the actuals are joined back from the series.
    .merge(right=ts, how="left", on=["unique_id", "ds"])
)
bkt_long.head()
unique_id | ds | cutoff | model_label | forecast | lower | upper | model_name | y | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2024-04-14 00:00:00 | 2024-04-13 23:00:00 | lightGBM | 422716.385199 | 421022.905138 | 424409.865260 | LGBMRegressor | 421836.39 |
1 | 1 | 2024-04-14 01:00:00 | 2024-04-13 23:00:00 | lightGBM | 422439.422659 | 417651.607031 | 427227.238288 | LGBMRegressor | 421300.67 |
2 | 1 | 2024-04-14 02:00:00 | 2024-04-13 23:00:00 | lightGBM | 417209.926483 | 407277.565240 | 427142.287725 | LGBMRegressor | 415940.13 |
3 | 1 | 2024-04-14 03:00:00 | 2024-04-13 23:00:00 | lightGBM | 405820.047603 | 392670.364652 | 418969.730553 | LGBMRegressor | 403788.77 |
4 | 1 | 2024-04-14 04:00:00 | 2024-04-13 23:00:00 | lightGBM | 386520.594124 | 372017.701036 | 401023.487212 | LGBMRegressor | 388042.74 |
:::
::: {#cell-Add partitions .cell execution_count=11}
# Number the backtesting windows: each distinct cutoff gets a 1-based
# partition id, in order of first appearance in bkt_long.
cutoff = bkt_long["cutoff"].unique()
partitions_mapping = pd.DataFrame({"cutoff": cutoff, "partition": range(1, len(cutoff) + 1)})

partitions_mapping

bkt_long = bkt_long.merge(partitions_mapping, how="left", on=["cutoff"])
bkt_long.head()
unique_id | ds | cutoff | model_label | forecast | lower | upper | model_name | y | partition | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2024-04-14 00:00:00 | 2024-04-13 23:00:00 | lightGBM | 422716.385199 | 421022.905138 | 424409.865260 | LGBMRegressor | 421836.39 | 1 |
1 | 1 | 2024-04-14 01:00:00 | 2024-04-13 23:00:00 | lightGBM | 422439.422659 | 417651.607031 | 427227.238288 | LGBMRegressor | 421300.67 | 1 |
2 | 1 | 2024-04-14 02:00:00 | 2024-04-13 23:00:00 | lightGBM | 417209.926483 | 407277.565240 | 427142.287725 | LGBMRegressor | 415940.13 | 1 |
3 | 1 | 2024-04-14 03:00:00 | 2024-04-13 23:00:00 | lightGBM | 405820.047603 | 392670.364652 | 418969.730553 | LGBMRegressor | 403788.77 | 1 |
4 | 1 | 2024-04-14 04:00:00 | 2024-04-13 23:00:00 | lightGBM | 386520.594124 | 372017.701036 | 401023.487212 | LGBMRegressor | 388042.74 | 1 |
:::
::: {#cell-Score the models .cell execution_count=12}
def mape(y, yhat):
    """Return the mean absolute percentage error of a forecast.

    Parameters
    ----------
    y : array-like supporting element-wise arithmetic (e.g. pandas Series)
        Actual values. Must not contain zeros.
    yhat : same kind as ``y``
        Forecast values aligned with ``y``.

    Returns
    -------
    float
        Mean of ``|y - yhat| / |y|`` as a fraction (not multiplied by 100).
    """
    # abs(y) in the denominator keeps every term non-negative when the
    # actuals are negative.
    return mean(abs(y - yhat) / abs(y))
def rmse(y, yhat):
    """Return the root mean squared error of a forecast.

    Parameters
    ----------
    y : array-like supporting element-wise arithmetic (e.g. pandas Series)
        Actual values.
    yhat : same kind as ``y``
        Forecast values aligned with ``y``.

    Returns
    -------
    float
        Square root of the mean of ``(y - yhat) ** 2``.
    """
    return mean((y - yhat) ** 2) ** 0.5
def coverage(y, lower, upper):
    """Return the share of actuals that fall inside the prediction interval.

    Parameters
    ----------
    y : array-like supporting element-wise comparison (e.g. pandas Series)
        Actual values.
    lower, upper : same kind as ``y``
        Interval bounds aligned with ``y``; both bounds are inclusive.

    Returns
    -------
    float
        Count of points with ``lower <= y <= upper`` divided by ``len(y)``.
    """
    inside = (y >= lower) & (y <= upper)
    return sum(inside) / len(y)
def score(df):
    """Score one group of backtesting rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with the columns "y", "forecast", "lower" and "upper".

    Returns
    -------
    pandas.Series
        The three metrics, indexed by "mape", "rmse" and "coverage".
    """
    mape_score = mape(y=df["y"], yhat=df["forecast"])
    rmse_score = rmse(y=df["y"], yhat=df["forecast"])
    coverage_score = coverage(y=df["y"], lower=df["lower"], upper=df["upper"])
    cols = ["mape", "rmse", "coverage"]
    return pd.Series([mape_score, rmse_score, coverage_score], index=cols)
# Score every (series, model, partition) group.
# Selecting the metric inputs after the groupby keeps the grouping columns
# out of the frame handed to `score`, which avoids the pandas deprecation
# warning about `apply` operating on grouping columns.
score_df = (
    bkt_long
    .groupby(["unique_id", "model_label", "model_name", "partition"])[
        ["y", "forecast", "lower", "upper"]
    ]
    .apply(score)
    .reset_index()
)
score_df.head()
/tmp/ipykernel_12518/1975747771.py:25: DeprecationWarning:
DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
unique_id | model_label | model_name | partition | mape | rmse | coverage | |
---|---|---|---|---|---|---|---|
0 | 1 | lasso | Lasso | 1 | 0.050913 | 29315.983715 | 0.486111 |
1 | 1 | lasso | Lasso | 2 | 0.037723 | 19034.393950 | 0.763889 |
2 | 1 | lasso | Lasso | 3 | 0.017668 | 9768.991810 | 0.986111 |
3 | 1 | lasso | Lasso | 4 | 0.014224 | 7839.292592 | 1.000000 |
4 | 1 | lasso | Lasso | 5 | 0.023679 | 13550.628885 | 0.847222 |
:::
::: {#cell-Calculate the leaderboard .cell execution_count=13}
# Average each metric over the partitions to get one row per model.
leaderboard = (
    score_df
    .groupby(["unique_id", "model_label", "model_name"])
    .agg({"mape": "mean", "rmse": "mean", "coverage": "mean"})
    .reset_index()
)

# Display the models ordered by ascending MAPE.
leaderboard.sort_values(by=["mape"])
unique_id | model_label | model_name | mape | rmse | coverage | |
---|---|---|---|---|---|---|
1 | 1 | lightGBM | LGBMRegressor | 0.022780 | 12462.716830 | 0.750000 |
4 | 1 | xgboost | XGBRegressor | 0.022854 | 12967.317366 | 0.781944 |
0 | 1 | lasso | Lasso | 0.031274 | 16877.080942 | 0.804167 |
2 | 1 | linear_regression | LinearRegression | 0.031534 | 17016.431327 | 0.806944 |
3 | 1 | ridge | Ridge | 0.031534 | 17016.462986 | 0.806944 |
:::
::: {#cell-Plot the error rate .cell execution_count=14}
# Box plot of the per-partition RMSE for each model, with the individual
# partition scores drawn next to each box.
fig = px.box(score_df, x="model_label", y="rmse", color="model_label")
fig.update_traces(boxpoints='all', jitter=0.3, pointpos=-1.8, showlegend=False)

fig.update_layout(
    title="Error Distribution",
    xaxis_title="Model",
    yaxis_title="RMSE",
    font=dict(family="Arial", size=14, color="black"),
)
fig.show()
:::
Logging the Results with MLflow
import mlflow

# Name of the MLflow experiment and its artifact location.
experiment_name = "ml_forecast"

mlflow_path = "file:///mlruns"

# Experiment-level tags recording the backtesting and interval settings.
tags = {
    "h": h,
    "step_size": step_size,
    "partitions": partitions,
    "intervals_type": "ConformalIntervals",
    "intervals_h": h,
    "intervals_n_windows": n_windows,
    "intervals_method": "conformal_distribution",
    "levels": levels,
}
# Create the experiment on first use; on later runs reuse the existing one.
# Only MLflow's own exception is caught, so unrelated failures (including
# KeyboardInterrupt) are no longer swallowed and misreported as "exists".
try:
    mlflow.create_experiment(name=experiment_name,
                             artifact_location=mlflow_path,
                             tags=tags)
except mlflow.exceptions.MlflowException:
    print(f"Experiment {experiment_name} exists, pulling the metadata")
else:
    print(f"Set a new experiment {experiment_name}")
    print("Pulling the metadata")
# Either way, fetch the experiment metadata; its id is needed to start runs.
meta = mlflow.get_experiment_by_name(experiment_name)
Experiment ml_forecast exists, pulling the metadata
# Log one MLflow run per row of score_df, i.e. per (model, partition) score.
# A single timestamp is shared by every run logged in this pass.
run_time = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")
for _, row in score_df.iterrows():
    run_name = row["model_label"] + "-" + run_time
    run_tags = {"type": "backtesting",
                "partition": row["partition"],
                "unique_id": row["unique_id"],
                "model_label": row["model_label"],
                "model_name": row["model_name"],
                "run_name": run_name}
    with mlflow.start_run(experiment_id=meta.experiment_id,
                          run_name=run_name,
                          tags=run_tags):
        # Start from the estimator's own hyper-parameters, then add the
        # run identifiers and the feature settings.
        model_params = ml_models[row["model_label"]].get_params()
        model_params["model_name"] = row["model_name"]
        model_params["model_label"] = row["model_label"]
        model_params["partition"] = row["partition"]
        # These two repeat the literals passed to MLForecast earlier in the
        # notebook; keep them in sync if that configuration changes.
        model_params["lags"] = list(range(1, 24))
        model_params["date_features"] = ["month", "day", "dayofweek", "week", "hour"]

        mlflow.log_params(model_params)
        mlflow.log_metric("mape", row["mape"])
        mlflow.log_metric("rmse", row["rmse"])
        mlflow.log_metric("coverage", row["coverage"])
::: {#save-the-data .cell execution_count=17}
bkt_df.to_csv(backtesting_path, index = False)
:::