Data Backfill
The goal of the backfill process is to pull the historical data for the required series using the settings.json file. This includes the following steps:
- Setting parameters and pulling the data
- Data quality checks
- Saving the data and creating a log file
Load Libraries
import eia_api as api
import eia_data
import pandas as pd
import numpy as np
import requests
import json
import os
import datetime
import plotly.express as px
import plotly.graph_objects as go
from great_tables import GT
Load settings:
raw_json = open("../settings/settings.json")
meta_json = json.load(raw_json)
# Prototype version
# series = pd.DataFrame(meta_json["series_prototype"])
# Full version
series = pd.DataFrame(meta_json["series"])
api_path = meta_json["api_path"]
facets_template = {
    "parent": None,
    "subba": None
}
start = datetime.datetime(meta_json["backfill"]["start"]["year"],
                          meta_json["backfill"]["start"]["month"],
                          meta_json["backfill"]["start"]["day"],
                          meta_json["backfill"]["start"]["hour"])
end = datetime.datetime(meta_json["backfill"]["end"]["year"],
                        meta_json["backfill"]["end"]["month"],
                        meta_json["backfill"]["end"]["day"],
                        meta_json["backfill"]["end"]["hour"])
offset = meta_json["backfill"]["offset"]
eia_api_key = os.getenv('EIA_API_KEY')
meta_path = meta_json["meta_path"]
data_path = meta_json["data_path"]
series_mapping_path = meta_json["series_mapping_path"]
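For reference, the fields accessed above imply that settings.json has roughly the following shape. This is an illustrative sketch only: the keys mirror the lookups in this notebook, while every value shown is a placeholder rather than the project's actual settings.
# Illustrative sketch of the settings.json structure implied by the lookups above.
# The keys match what this notebook reads; all values below are placeholders.
settings_example = {
    "api_path": "electricity/rto/region-sub-ba-data/",    # EIA API v2 route (example value)
    "meta_path": "path/to/metadata_log.csv",               # placeholder path
    "data_path": "path/to/data.csv",                       # placeholder path
    "series_mapping_path": "path/to/series_mapping.json",  # placeholder path
    "series": [
        {"parent_id": "NYIS", "subba_id": "ZONA"}          # one entry per sub-region to pull
    ],
    "backfill": {
        "start": {"year": 2022, "month": 1, "day": 1, "hour": 0},
        "end": {"year": 2024, "month": 11, "day": 1, "hour": 0},
        "offset": 5000                                     # passed to api.eia_backfill (example value)
    }
}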
Pull the series metadata from the API
metadata = api.eia_metadata(api_key = eia_api_key, api_path = api_path)
print(metadata.meta.keys())
print(metadata.meta["startPeriod"])
print(metadata.meta["endPeriod"])dict_keys(['id', 'name', 'description', 'frequency', 'facets', 'data', 'startPeriod', 'endPeriod', 'defaultDateFormat', 'defaultFrequency'])
2019-01-01T00
2024-11-04T08
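The loop below delegates the actual data pull to api.eia_backfill, whose internals are not shown in this notebook; the real helper returns an object whose .data attribute holds the hourly observations. Conceptually it pages through the EIA API v2 data route between start and end. The sketch below is an assumption about what such a helper looks like (hypothetical name eia_backfill_sketch; the offset argument is interpreted here as the page size, which is also an assumption), using the publicly documented EIA v2 query parameters:
# Hypothetical sketch of a backfill helper (NOT the actual eia_api implementation).
# It pages through the EIA API v2 data route with offset/length pagination and
# returns a single DataFrame with all rows between start and end.
def eia_backfill_sketch(api_key, api_path, facets, start, end, offset):
    url = "https://api.eia.gov/v2/" + api_path
    params = {
        "api_key": api_key,
        "frequency": "hourly",
        "data[0]": "value",
        "facets[parent][]": facets["parent"],
        "facets[subba][]": facets["subba"],
        "start": start.strftime("%Y-%m-%dT%H"),
        "end": end.strftime("%Y-%m-%dT%H"),
        "length": offset          # rows per request (the EIA API caps a single response at 5000 rows)
    }
    frames = []
    rows_fetched = 0
    while True:
        params["offset"] = rows_fetched
        payload = requests.get(url, params = params).json()
        rows = payload["response"]["data"]
        if len(rows) == 0:
            break
        frames.append(pd.DataFrame(rows))
        rows_fetched += len(rows)
    out = pd.concat(frames).reset_index(drop = True)
    out["period"] = pd.to_datetime(out["period"])
    return out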
for i in series.index:
    # facets_template is reused across iterations; both keys are overwritten each time
    facets = facets_template
    facets["parent"] = series.at[i, "parent_id"]
    facets["subba"] = series.at[i, "subba_id"]
    print(facets)
    temp = api.eia_backfill(api_key = eia_api_key,
                            api_path = api_path + "data",
                            facets = facets,
                            start = start,
                            end = end,
                            offset = offset)
    # Build a complete hourly index between start and end so that any hours missing
    # from the API response become explicit NaN values
    ts_obj = pd.DataFrame(np.arange(start = start, stop = end + datetime.timedelta(hours = 1), step = datetime.timedelta(hours = 1)).astype(datetime.datetime), columns = ["index"])
    ts_obj = ts_obj.merge(temp.data, left_on = "index", right_on = "period", how = "left")
    ts_obj.drop("period", axis = 1, inplace = True)
    ts_obj = ts_obj.rename(columns = {"index": "period"})
    missing_index = ts_obj[ts_obj["value"].isnull()].index.tolist()
    if len(missing_index) > 0:
        ts_obj.loc[missing_index, "subba"] = facets["subba"]
    meta_temp = eia_data.create_metadata(data = ts_obj, start = start, end = end, type = "backfill")
    meta_temp["index"] = 1
    meta_df = pd.DataFrame([meta_temp])
    if i == series.index.start:
        data = ts_obj
        meta = meta_df
    else:
        data = data._append(ts_obj)
        meta = meta._append(meta_df)

data.reset_index(drop = True, inplace = True)
meta.reset_index(drop = True, inplace = True)
{'parent': 'NYIS', 'subba': 'ZONA'}
{'parent': 'NYIS', 'subba': 'ZONB'}
{'parent': 'NYIS', 'subba': 'ZONC'}
{'parent': 'NYIS', 'subba': 'ZOND'}
{'parent': 'NYIS', 'subba': 'ZONE'}
{'parent': 'NYIS', 'subba': 'ZONF'}
{'parent': 'NYIS', 'subba': 'ZONG'}
{'parent': 'NYIS', 'subba': 'ZONH'}
{'parent': 'NYIS', 'subba': 'ZONI'}
{'parent': 'NYIS', 'subba': 'ZONJ'}
{'parent': 'NYIS', 'subba': 'ZONK'}
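GT(meta) below renders the quality-check log produced by eia_data.create_metadata. That helper's implementation is not part of this notebook; based on the fields referenced later, each log row is expected to carry at least the following (an illustrative row, not the helper's actual output):
# Illustrative metadata log row; the field names are the ones this notebook reads or
# writes, while the values shown are examples only.
meta_row_example = {
    "subba": "ZONA",       # sub-region the row describes
    "na": 1,               # number of missing hourly values in the pull
    "start_match": True,   # True when the first pulled timestamp matches the requested start
    "end_match": True,     # True when the last pulled timestamp matches the requested end
    "imputed": 0,          # updated by impute_series with the number of filled values
    "success": False,      # set True once the quality checks pass
    "update": False,       # set True when the series should be appended to the stored data
    "comments": "",        # free-text notes; impute_series appends to this field
    "index": 1             # added in the loop above
}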
GT(meta)
# The initial pull has some missing values
print("Missing Values:" , data["value"].isna().sum())
GT(data.head(10))
Missing Values: 11
| period | subba | subba-name | parent | parent-name | value | value-units |
|---|---|---|---|---|---|---|
| 2022-01-01 00:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1707.0 | megawatthours |
| 2022-01-01 01:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1673.0 | megawatthours |
| 2022-01-01 02:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1644.0 | megawatthours |
| 2022-01-01 03:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1605.0 | megawatthours |
| 2022-01-01 04:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1550.0 | megawatthours |
| 2022-01-01 05:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1487.0 | megawatthours |
| 2022-01-01 06:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1422.0 | megawatthours |
| 2022-01-01 07:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1373.0 | megawatthours |
| 2022-01-01 08:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1336.0 | megawatthours |
| 2022-01-01 09:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1317.0 | megawatthours |
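Before imputing, it can help to confirm how the 11 missing values are distributed across the sub-regions and that every series has the expected number of hourly rows. This quick check is not part of the original pipeline:
# Ad-hoc check (not part of the pipeline): rows and missing values per sub-region
expected_hours = int((end - start).total_seconds() // 3600) + 1
summary = data.groupby("subba")["value"].agg(
    rows = "size",
    missing = lambda s: int(s.isna().sum())
)
print("Expected rows per series:", expected_hours)
print(summary)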
Impute Missing Values
def impute_missing(input, var, index):
    class imputed_values:
        def __init__(self, data, missing_index, num_imputed):
            self.data = data
            self.missing_index = missing_index
            self.num_imputed = num_imputed
    df = input.copy()
    df.loc[:, "impute"] = np.NaN
    df = df.sort_values(by = [index])
    missing_index = df[df[var].isnull()].index.tolist()
    non_missing_index = df.index.difference(missing_index).tolist()
    num_imputed = 0
    # The series is hourly, so index offsets of 24, 168, and 336 correspond to the same
    # hour one day, one week, and two weeks earlier; use whichever lags are available
    for i in missing_index:
        if i > 336:
            df.loc[i, "impute"] = (df.loc[i - 336, var] + df.loc[i - 168, var] + df.loc[i - 24, var]) / 3
            num_imputed = num_imputed + 1
        elif i > 168:
            df.loc[i, "impute"] = (df.loc[i - 168, var] + df.loc[i - 24, var]) / 2
            num_imputed = num_imputed + 1
        elif i > 24:
            df.loc[i, "impute"] = df.loc[i - 24, var]
            num_imputed = num_imputed + 1
        else:
            print("There are not enough observations to impute observation:", i)
    df["y"] = np.NaN
    df.loc[missing_index, "y"] = df.loc[missing_index, "impute"]
    df.loc[non_missing_index, "y"] = df.loc[non_missing_index, var]
    output = imputed_values(data = df, missing_index = missing_index, num_imputed = num_imputed)
    return output

def impute_series(series, meta):
    class imputed_series:
        def __init__(self, data, metadata):
            self.data = data
            self.metadata = metadata
    ts = series.copy()
    new_df = None
    for index, row in meta.iterrows():
        s = row["subba"]
        temp = None
        temp = ts[ts["subba"] == s]
        if row["na"] > 0:
            print("Series", s, "has", row["na"], "missing values")
            imputed = impute_missing(input = temp, var = "value", index = "period")
            if imputed.num_imputed > 0:
                temp = imputed.data
                meta.loc[index, "imputed"] = imputed.num_imputed
                meta.loc[index, "comments"] = meta.loc[index, "comments"] + " Missing values were imputed"
            else:
                temp["impute"] = np.NaN
                temp["y"] = temp["value"]
        else:
            temp["impute"] = np.NaN
            temp["y"] = temp["value"]
        # Mark the series as successful only if all missing values were imputed and the
        # start/end timestamps match the requested window
        if all([meta.loc[index, "na"] == meta.loc[index, "imputed"], meta.loc[index, "start_match"], meta.loc[index, "end_match"]]):
            meta.loc[index, "success"] = True
            meta.loc[index, "update"] = True
        if meta.loc[index, "success"]:
            if new_df is None:
                new_df = temp
            else:
                new_df = pd.concat([new_df, temp])
    if new_df is not None:
        output = imputed_series(data = new_df, metadata = meta)
    else:
        output = None
    return output

ts = impute_series(series = data, meta = meta)
Series ZONA has 1 missing values
Series ZONB has 1 missing values
Series ZONC has 1 missing values
Series ZOND has 1 missing values
Series ZONE has 1 missing values
Series ZONF has 1 missing values
Series ZONG has 1 missing values
Series ZONH has 1 missing values
Series ZONI has 1 missing values
Series ZONJ has 1 missing values
Series ZONK has 1 missing values
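Each gap was filled with an average of the observations at the same hour one day, one week, and two weeks earlier, falling back to fewer lags near the start of the sample. To review what was actually filled in, the imputed observations can be pulled out of the returned object; this inspection is not part of the original pipeline:
# Ad-hoc check: list the observations filled by impute_missing
imputed_rows = ts.data[ts.data["impute"].notna()]
print(imputed_rows[["period", "subba", "value", "impute", "y"]])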
Plot the Series
We will use Plotly to visualize the series:
fig = go.Figure()
for i in ts.data["subba"].unique():
    d = None
    d = ts.data[ts.data["subba"] == i]
    fig.add_trace(go.Scatter(x = d["period"],
                             y = d["value"],
                             name = i,
                             # line = dict(color = "blue"),
                             mode = 'lines'))
fig.add_trace(go.Scatter(
    x = ts.data["period"],
    y = ts.data["impute"],
    mode = 'markers',
    name = "Imputed",
    marker = dict(size = 4, symbol = 'square', color = "red")
))
fig.update_layout(title = "New York Independent System Operator - Demand for Electricity")
fig