# Training Models with Backtesting

## Load Libraries

```python
import pandas as pd
import numpy as np
import requests
import json
import os
import datetime
import mlflow
import plotly.graph_objects as go
import plotly.express as px

from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from mlforecast.utils import PredictionIntervals
from window_ops.expanding import expanding_mean
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from utilsforecast.plotting import plot_series
```

## Load metadata
```python
# Use a context manager so the settings file handle is closed after loading
with open("./settings/settings.json") as raw_json:
    meta_json = json.load(raw_json)

backtesting_path = meta_json["data"]["backtesting_path"]
```
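For reference, a minimal `settings.json` that satisfies this lookup could look like the sketch below; the file location matches the code above, but the `backtesting_path` value is a hypothetical placeholder — only the `data.backtesting_path` key is assumed.

```python
# Hypothetical sketch: the one key this notebook reads from settings.json.
# A real settings file may carry additional project-specific fields.
import json
import os

os.makedirs("./settings", exist_ok=True)
settings = {"data": {"backtesting_path": "data/backtesting.csv"}}  # placeholder path
with open("./settings/settings.json", "w") as f:
    json.dump(settings, f, indent=2)
```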
## Load the Data

Reformat the data:
::: {#cell-Load the data .cell execution_count=3}
```python
ts = pd.read_csv("data/data.csv")
ts["ds"] = pd.to_datetime(ts["ds"])
ts = ts.sort_values("ds")
ts.head()
```

|   | ds | y | unique_id |
|---|---|---|---|
| 0 | 2022-04-06 23:00:00 | 462894.0 | 1 |
| 1 | 2022-04-07 00:00:00 | 463663.0 | 1 |
| 2 | 2022-04-07 01:00:00 | 464916.0 | 1 |
| 3 | 2022-04-07 02:00:00 | 459376.0 | 1 |
| 4 | 2022-04-07 03:00:00 | 441989.0 | 1 |
:::
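Optionally, eyeball the raw series first; a minimal sketch using the `plot_series` helper already imported from utilsforecast:

```python
# Sketch: quick visual check of the hourly series (returns a matplotlib figure)
plot_series(ts)
```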
Subset the series to the last 25 months (approximated as 25 × 31-day months):
```python
end = ts["ds"].max()
start = end - datetime.timedelta(hours=24 * 31 * 25)
ts = ts[ts["ds"] >= start]
```

## Set the Backtesting
Define the forecasting models:
```python
ml_models = {
    "lightGBM": LGBMRegressor(verbosity=-1),
    "xgboost": XGBRegressor(),
    "linear_regression": LinearRegression(),
    "lasso": Lasso(),
    "ridge": Ridge()
}

mlf = MLForecast(
    models=ml_models,
    freq="h",
    lags=list(range(1, 24)),  # lags 1 through 23
    date_features=["month", "day", "dayofweek", "week", "hour"]
)
```

Define the backtesting settings: a 72-hour horizon, a 24-hour step between windows, 10 partitions, and conformal prediction intervals calibrated on 5 windows:

```python
h = 72
step_size = 24
partitions = 10
n_windows = 5
method = "conformal_distribution"

pi = PredictionIntervals(h=h, n_windows=n_windows, method=method)
```
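Before running the backtesting, the feature engineering can be sanity-checked: `MLForecast.preprocess` returns the training frame with the lag and date features materialized (an optional inspection sketch):

```python
# Sketch: inspect the engineered design matrix the models will see.
# Expect lag1..lag23 plus the month/day/dayofweek/week/hour columns.
train_df = mlf.preprocess(ts)
train_df.head()
```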
Run the backtesting:

```python
levels = [95]

bkt_df = mlf.cross_validation(
    df=ts,
    h=h,
    step_size=step_size,
    n_windows=partitions,
    prediction_intervals=pi,
    level=levels
)
```

The run emits a stream of scikit-learn `ConvergenceWarning`s for the Lasso model, repeated for each fit: "Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation."
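The warnings come from the Lasso solver running on unscaled features; below is a sketch of two standard scikit-learn remedies (neither is applied above), either of which could be swapped into `ml_models`:

```python
# Sketch: common remedies for the Lasso ConvergenceWarning.
# (1) Give coordinate descent a larger iteration budget:
lasso_more_iters = Lasso(max_iter=10_000)

# (2) Standardize features before fitting, since y is on a ~4e5 scale:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
lasso_scaled = make_pipeline(StandardScaler(), Lasso())

# e.g., ml_models["lasso"] = lasso_scaled
```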
Review the backtesting results:

::: {#cell-Review the bkt .cell execution_count=8}
```python
bkt_df.head()
```

|   | unique_id | ds | cutoff | y | lightGBM | xgboost | linear_regression | lasso | ridge | lightGBM-lo-95 | lightGBM-hi-95 | xgboost-lo-95 | xgboost-hi-95 | linear_regression-lo-95 | linear_regression-hi-95 | lasso-lo-95 | lasso-hi-95 | ridge-lo-95 | ridge-hi-95 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-04-14 00:00:00 | 2024-04-13 23:00:00 | 421836.39 | 422716.385199 | 421999.12500 | 422352.303789 | 422303.387840 | 422352.318516 | 421022.905138 | 424409.865260 | 413150.163187 | 430848.086813 | 420572.672519 | 424131.935059 | 420193.904762 | 424412.870918 | 420572.698521 | 424131.938511 |
| 1 | 1 | 2024-04-14 01:00:00 | 2024-04-13 23:00:00 | 421300.67 | 422439.422659 | 422403.71875 | 420444.205975 | 420181.246550 | 420444.246554 | 417651.607031 | 427227.238288 | 410528.525875 | 434278.911625 | 413143.863192 | 427744.548759 | 412754.618246 | 427607.874853 | 413143.912384 | 427744.580725 |
| 2 | 1 | 2024-04-14 02:00:00 | 2024-04-13 23:00:00 | 415940.13 | 417209.926483 | 418226.87500 | 413555.865386 | 412950.784579 | 413555.936496 | 407277.565240 | 427142.287725 | 403594.666500 | 432859.083500 | 401713.539463 | 425398.191308 | 400649.670396 | 425251.898762 | 401713.623882 | 425398.249109 |
| 3 | 1 | 2024-04-14 03:00:00 | 2024-04-13 23:00:00 | 403788.77 | 405820.047603 | 405515.62500 | 401358.336479 | 400883.378063 | 401358.437580 | 392670.364652 | 418969.730553 | 388216.352469 | 422814.897531 | 388385.173887 | 414331.499070 | 387325.023471 | 414441.732654 | 388385.318921 | 414331.556240 |
| 4 | 1 | 2024-04-14 04:00:00 | 2024-04-13 23:00:00 | 388042.74 | 386520.594124 | 388142.90625 | 386263.522024 | 385896.776166 | 386263.652143 | 372017.701036 | 401023.487212 | 370163.194469 | 406122.618031 | 370245.276224 | 402281.767824 | 369233.631968 | 402559.920363 | 370245.519124 | 402281.785161 |
:::
Review the point forecast and interval columns for a single model:

::: {#cell-Review a model .cell execution_count=9}
```python
bkt_df[["ds", "lightGBM", "lightGBM-lo-95", "lightGBM-hi-95"]].head()
```

|   | ds | lightGBM | lightGBM-lo-95 | lightGBM-hi-95 |
|---|---|---|---|---|
| 0 | 2024-04-14 00:00:00 | 422716.385199 | 421022.905138 | 424409.865260 |
| 1 | 2024-04-14 01:00:00 | 422439.422659 | 417651.607031 | 427227.238288 |
| 2 | 2024-04-14 02:00:00 | 417209.926483 | 407277.565240 | 427142.287725 |
| 3 | 2024-04-14 03:00:00 | 405820.047603 | 392670.364652 | 418969.730553 |
| 4 | 2024-04-14 04:00:00 | 386520.594124 | 372017.701036 | 401023.487212 |
:::
Reshape the backtesting results into a long format, with one row per model, timestamp, and window:

::: {#cell-Reformat the bkt obj .cell execution_count=10}
```python
model_label = list(ml_models.keys())
model_name = [type(s).__name__ for s in ml_models.values()]
lower = [s + "-lo-95" for s in model_label]
upper = [s + "-hi-95" for s in model_label]

models_mapping = pd.DataFrame({"model_label": model_label,
                               "model_name": model_name})

d1 = pd.melt(bkt_df, id_vars=["unique_id", "ds", "cutoff"],
             value_vars=model_label, var_name="model_label", value_name="forecast")
d2 = pd.melt(bkt_df, id_vars=["unique_id", "ds", "cutoff"],
             value_vars=lower, var_name="model_label", value_name="lower")
d2["model_label"] = d2["model_label"].str.replace("-lo-95", "")
d3 = pd.melt(bkt_df, id_vars=["unique_id", "ds", "cutoff"],
             value_vars=upper, var_name="model_label", value_name="upper")
d3["model_label"] = d3["model_label"].str.replace("-hi-95", "")

bkt_long = (
    d1
    .merge(right=d2, how="left", on=["unique_id", "ds", "cutoff", "model_label"])
    .merge(right=d3, how="left", on=["unique_id", "ds", "cutoff", "model_label"])
    .merge(right=models_mapping, how="left", on=["model_label"])
    .merge(right=ts, how="left", on=["unique_id", "ds"])
)
bkt_long.head()
```

|   | unique_id | ds | cutoff | model_label | forecast | lower | upper | model_name | y |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-04-14 00:00:00 | 2024-04-13 23:00:00 | lightGBM | 422716.385199 | 421022.905138 | 424409.865260 | LGBMRegressor | 421836.39 |
| 1 | 1 | 2024-04-14 01:00:00 | 2024-04-13 23:00:00 | lightGBM | 422439.422659 | 417651.607031 | 427227.238288 | LGBMRegressor | 421300.67 |
| 2 | 1 | 2024-04-14 02:00:00 | 2024-04-13 23:00:00 | lightGBM | 417209.926483 | 407277.565240 | 427142.287725 | LGBMRegressor | 415940.13 |
| 3 | 1 | 2024-04-14 03:00:00 | 2024-04-13 23:00:00 | lightGBM | 405820.047603 | 392670.364652 | 418969.730553 | LGBMRegressor | 403788.77 |
| 4 | 1 | 2024-04-14 04:00:00 | 2024-04-13 23:00:00 | lightGBM | 386520.594124 | 372017.701036 | 401023.487212 | LGBMRegressor | 388042.74 |
:::
Map each cutoff to a partition number:

::: {#cell-Add partitions .cell execution_count=11}
```python
cutoff = bkt_long["cutoff"].unique()
partitions_mapping = pd.DataFrame({"cutoff": cutoff,
                                   "partition": range(1, len(cutoff) + 1)})

bkt_long = bkt_long.merge(partitions_mapping, how="left", on=["cutoff"])
bkt_long.head()
```

|   | unique_id | ds | cutoff | model_label | forecast | lower | upper | model_name | y | partition |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-04-14 00:00:00 | 2024-04-13 23:00:00 | lightGBM | 422716.385199 | 421022.905138 | 424409.865260 | LGBMRegressor | 421836.39 | 1 |
| 1 | 1 | 2024-04-14 01:00:00 | 2024-04-13 23:00:00 | lightGBM | 422439.422659 | 417651.607031 | 427227.238288 | LGBMRegressor | 421300.67 | 1 |
| 2 | 1 | 2024-04-14 02:00:00 | 2024-04-13 23:00:00 | lightGBM | 417209.926483 | 407277.565240 | 427142.287725 | LGBMRegressor | 415940.13 | 1 |
| 3 | 1 | 2024-04-14 03:00:00 | 2024-04-13 23:00:00 | lightGBM | 405820.047603 | 392670.364652 | 418969.730553 | LGBMRegressor | 403788.77 | 1 |
| 4 | 1 | 2024-04-14 04:00:00 | 2024-04-13 23:00:00 | lightGBM | 386520.594124 | 372017.701036 | 401023.487212 | LGBMRegressor | 388042.74 | 1 |
:::
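A quick consistency check, as a sketch: with a single series, every partition/model pair should hold exactly `h` (= 72) rows.

```python
# Sketch: all (partition, model) groups should have h = 72 rows
bkt_long.groupby(["partition", "model_label"]).size().unique()
```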
Score each model within each partition using MAPE, RMSE, and the 95% interval coverage rate:

::: {#cell-Score the models .cell execution_count=12}
```python
def mape(y, yhat):
    return (abs(y - yhat) / y).mean()

def rmse(y, yhat):
    return (((y - yhat) ** 2).mean()) ** 0.5

def coverage(y, lower, upper):
    # Share of actuals falling inside the prediction interval
    return ((y >= lower) & (y <= upper)).mean()

def score(df):
    mape_score = mape(y=df["y"], yhat=df["forecast"])
    rmse_score = rmse(y=df["y"], yhat=df["forecast"])
    coverage_score = coverage(y=df["y"], lower=df["lower"], upper=df["upper"])
    cols = ["mape", "rmse", "coverage"]
    return pd.Series([mape_score, rmse_score, coverage_score], index=cols)

score_df = (bkt_long
            .groupby(["unique_id", "model_label", "model_name", "partition"])
            .apply(score, include_groups=False)  # silences the pandas deprecation warning
            .reset_index()
            )
score_df.head()
```

|   | unique_id | model_label | model_name | partition | mape | rmse | coverage |
|---|---|---|---|---|---|---|---|
| 0 | 1 | lasso | Lasso | 1 | 0.050913 | 29315.983715 | 0.486111 |
| 1 | 1 | lasso | Lasso | 2 | 0.037723 | 19034.393950 | 0.763889 |
| 2 | 1 | lasso | Lasso | 3 | 0.017668 | 9768.991810 | 0.986111 |
| 3 | 1 | lasso | Lasso | 4 | 0.014224 | 7839.292592 | 1.000000 |
| 4 | 1 | lasso | Lasso | 5 | 0.023679 | 13550.628885 | 0.847222 |
:::
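As a cross-check on the hand-rolled metrics, utilsforecast ships vectorized loss functions that operate directly on the wide frame; a sketch (note this pools all windows per series rather than averaging per-partition scores):

```python
# Sketch: per-series MAPE across all windows, one column per model
from utilsforecast.losses import mape as uf_mape

uf_mape(bkt_df, models=list(ml_models.keys()))
```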
Average the scores across partitions to build a leaderboard:

::: {#cell-Calculate the leaderboard .cell execution_count=13}
```python
leaderboard = (score_df
               .groupby(["unique_id", "model_label", "model_name"])
               .agg({"mape": "mean", "rmse": "mean", "coverage": "mean"})
               .reset_index())
leaderboard.sort_values(by=["mape"])
```

|   | unique_id | model_label | model_name | mape | rmse | coverage |
|---|---|---|---|---|---|---|
| 1 | 1 | lightGBM | LGBMRegressor | 0.022780 | 12462.716830 | 0.750000 |
| 4 | 1 | xgboost | XGBRegressor | 0.022854 | 12967.317366 | 0.781944 |
| 0 | 1 | lasso | Lasso | 0.031274 | 16877.080942 | 0.804167 |
| 2 | 1 | linear_regression | LinearRegression | 0.031534 | 17016.431327 | 0.806944 |
| 3 | 1 | ridge | Ridge | 0.031534 | 17016.462986 | 0.806944 |
:::
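Selecting the winner per series is then a one-liner; a sketch (with a single `unique_id` it simply returns the top row above):

```python
# Sketch: best model per series by mean MAPE
leaderboard.loc[leaderboard.groupby("unique_id")["mape"].idxmin()]
```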
Plot the RMSE distribution across partitions for each model:

::: {#cell-Plot the error rate .cell execution_count=14}
```python
fig = px.box(score_df, x="model_label", y="rmse", color="model_label")
fig.update_traces(boxpoints="all", jitter=0.3, pointpos=-1.8, showlegend=False)
fig.update_layout(
    title="Error Distribution",
    xaxis_title="Model",
    yaxis_title="RMSE",
    font=dict(family="Arial", size=14, color="black")
)
fig.show()
```
:::
## Logging the Results with MLflow

Set up (or reuse) the MLflow experiment:

```python
experiment_name = "ml_forecast"
mlflow_path = "file:///mlruns"

tags = {"h": h,
        "step_size": step_size,
        "partitions": partitions,
        "intervals_type": "ConformalIntervals",
        "intervals_h": h,
        "intervals_n_windows": n_windows,
        "intervals_method": "conformal_distribution",
        "levels": levels}

try:
    mlflow.create_experiment(name=experiment_name,
                             artifact_location=mlflow_path,
                             tags=tags)
    meta = mlflow.get_experiment_by_name(experiment_name)
    print(f"Set a new experiment {experiment_name}")
    print("Pulling the metadata")
except Exception:
    print(f"Experiment {experiment_name} exists, pulling the metadata")
    meta = mlflow.get_experiment_by_name(experiment_name)
```

```
Experiment ml_forecast exists, pulling the metadata
```
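A more compact alternative, sketched below: recent MLflow versions expose `mlflow.set_experiment`, which creates the experiment if missing and returns its metadata, although it does not accept a custom artifact location or tags.

```python
# Sketch: create-or-reuse in one call (no artifact_location/tags support)
meta = mlflow.set_experiment(experiment_name)
print(meta.experiment_id)
```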
Log each backtesting run:

```python
run_time = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")

for index, row in score_df.iterrows():
    run_name = row["model_label"] + "-" + run_time
    with mlflow.start_run(experiment_id=meta.experiment_id,
                          run_name=run_name,
                          tags={"type": "backtesting",
                                "partition": row["partition"],
                                "unique_id": row["unique_id"],
                                "model_label": row["model_label"],
                                "model_name": row["model_name"],
                                "run_name": run_name}) as run:
        model_params = ml_models[row["model_label"]].get_params()
        model_params["model_name"] = row["model_name"]
        model_params["model_label"] = row["model_label"]
        model_params["partition"] = row["partition"]
        model_params["lags"] = list(range(1, 24))
        model_params["date_features"] = ["month", "day", "dayofweek", "week", "hour"]
        mlflow.log_params(model_params)
        mlflow.log_metric("mape", row["mape"])
        mlflow.log_metric("rmse", row["rmse"])
        mlflow.log_metric("coverage", row["coverage"])
```

Finally, save the backtesting results:

::: {#save-the-data .cell execution_count=17}
```python
bkt_df.to_csv(backtesting_path, index=False)
```
:::