Data Backfill

The goal of the backfill process is to pull the historical data for the required series using the settings.json file. This includes the following steps:

- Setting parameters and pulling the data
- Data quality checks
- Saving the data and creating a log file

Load Libraries

import eia_api as api
import eia_data
import pandas as pd
import numpy as np
import requests
import json
import os
import datetime
import plotly.express as px
import plotly.graph_objects as go
from great_tables import GT

Load settings:
= open("../settings/settings.json")
raw_json = json.load(raw_json)
meta_json # Prototype version
# series = pd.DataFrame(meta_json["series_prototype"])
# Full version
= pd.DataFrame(meta_json["series"])
series = meta_json["api_path"] api_path
= {
facets_template "parent" : None,
"subba" : None
}
= datetime.datetime(meta_json["backfill"]["start"]["year"],
start "backfill"]["start"]["month"],
meta_json["backfill"]["start"]["day"],
meta_json["backfill"]["start"]["hour"])
meta_json[
= datetime.datetime(meta_json["backfill"]["end"]["year"],
end "backfill"]["end"]["month"],
meta_json["backfill"]["end"]["day"],
meta_json["backfill"]["end"]["hour"])
meta_json[
= meta_json["backfill"]["offset"]
offset
= os.getenv('EIA_API_KEY')
eia_api_key
= meta_json["meta_path"]
meta_path = meta_json["data_path"]
data_path = meta_json["series_mapping_path"] series_mapping_path
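The settings.json file is assumed to carry every parameter referenced above. Based on the keys used in this section, its structure looks roughly like the sketch below; the values are illustrative placeholders and the paths are hypothetical, not the project's actual settings.

```python
# Illustrative shape of settings.json (placeholder values, hypothetical paths)
settings_example = {
    "api_path": "electricity/rto/region-sub-ba-data/",     # assumed EIA API route
    "meta_path": "metadata/backfill_log.csv",               # hypothetical path
    "data_path": "data/demand.csv",                         # hypothetical path
    "series_mapping_path": "settings/series_mapping.json",  # hypothetical path
    "series": [
        {"parent_id": "NYIS", "subba_id": "ZONA"},
        {"parent_id": "NYIS", "subba_id": "ZONB"}
        # ...one entry per sub-region series (other fields may exist)
    ],
    "backfill": {
        "start": {"year": 2022, "month": 1, "day": 1, "hour": 0},
        "end": {"year": 2024, "month": 11, "day": 1, "hour": 0},
        "offset": 2000  # assumed paging size passed to eia_backfill
    }
}
```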
Pull the series metadata from the API
metadata = api.eia_metadata(api_key = eia_api_key, api_path = api_path)
print(metadata.meta.keys())
print(metadata.meta["startPeriod"])
print(metadata.meta["endPeriod"])
dict_keys(['id', 'name', 'description', 'frequency', 'facets', 'data', 'startPeriod', 'endPeriod', 'defaultDateFormat', 'defaultFrequency'])
2019-01-01T00
2024-11-04T08
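The startPeriod and endPeriod fields report the range the API can serve, formatted as hourly timestamps (YYYY-MM-DDTHH). A quick sanity check, shown here as a sketch rather than part of the original script, is to parse them and confirm the requested backfill window is covered:

```python
# Parse the API coverage stamps and compare them with the backfill window
api_start = datetime.datetime.strptime(metadata.meta["startPeriod"], "%Y-%m-%dT%H")
api_end = datetime.datetime.strptime(metadata.meta["endPeriod"], "%Y-%m-%dT%H")

if start < api_start or end > api_end:
    print("Warning: requested window", start, "-", end,
          "extends beyond the API coverage", api_start, "-", api_end)
```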
for i in series.index:
    facets = facets_template
    facets["parent"] = series.at[i, "parent_id"]
    facets["subba"] = series.at[i, "subba_id"]
    print(facets)
    # Pull the historical data for the series
    temp = api.eia_backfill(api_key = eia_api_key,
                            api_path = api_path + "data",
                            facets = facets,
                            start = start,
                            end = end,
                            offset = offset)
    # Merge the pull against a complete hourly index to expose missing timestamps
    ts_obj = pd.DataFrame(np.arange(start = start,
                                    stop = end + datetime.timedelta(hours = 1),
                                    step = datetime.timedelta(hours = 1)).astype(datetime.datetime),
                          columns = ["index"])
    ts_obj = ts_obj.merge(temp.data, left_on = "index", right_on = "period", how = "left")
    ts_obj.drop("period", axis = 1, inplace = True)
    ts_obj = ts_obj.rename(columns = {"index": "period"})
    missing_index = ts_obj[ts_obj["value"].isnull()].index.tolist()
    if len(missing_index) > 0:
        ts_obj.loc[missing_index, "subba"] = facets["subba"]
    # Create the series metadata (data quality indicators)
    meta_temp = eia_data.create_metadata(data = ts_obj, start = start, end = end, type = "backfill")
    meta_temp["index"] = 1
    meta_df = pd.DataFrame([meta_temp])

    if i == series.index.start:
        data = ts_obj
        meta = meta_df
    else:
        data = data._append(ts_obj)
        meta = meta._append(meta_df)

data.reset_index(drop = True, inplace = True)
meta.reset_index(drop = True, inplace = True)
{'parent': 'NYIS', 'subba': 'ZONA'}
{'parent': 'NYIS', 'subba': 'ZONB'}
{'parent': 'NYIS', 'subba': 'ZONC'}
{'parent': 'NYIS', 'subba': 'ZOND'}
{'parent': 'NYIS', 'subba': 'ZONE'}
{'parent': 'NYIS', 'subba': 'ZONF'}
{'parent': 'NYIS', 'subba': 'ZONG'}
{'parent': 'NYIS', 'subba': 'ZONH'}
{'parent': 'NYIS', 'subba': 'ZONI'}
{'parent': 'NYIS', 'subba': 'ZONJ'}
{'parent': 'NYIS', 'subba': 'ZONK'}
GT(meta)

# The initial pull has some missing values
print("Missing Values:", data["value"].isna().sum())
GT(data.head(10))

Missing Values: 11
period | subba | subba-name | parent | parent-name | value | value-units |
---|---|---|---|---|---|---|
2022-01-01 00:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1707.0 | megawatthours |
2022-01-01 01:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1673.0 | megawatthours |
2022-01-01 02:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1644.0 | megawatthours |
2022-01-01 03:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1605.0 | megawatthours |
2022-01-01 04:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1550.0 | megawatthours |
2022-01-01 05:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1487.0 | megawatthours |
2022-01-01 06:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1422.0 | megawatthours |
2022-01-01 07:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1373.0 | megawatthours |
2022-01-01 08:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1336.0 | megawatthours |
2022-01-01 09:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1317.0 | megawatthours |
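Because each series was merged against a complete hourly index, the missing observations appear as NaN rows in the value column. A quick way to see which timestamps are affected, per sub-region (a small sketch, not part of the original workflow):

```python
# Summarize the missing hourly observations by sub-region
missing = data[data["value"].isna()]
print(missing.groupby("subba")["period"].agg(["count", "min", "max"]))
```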
Impute Missing Values
def impute_missing(input, var, index):
    class imputed_values:
        def __init__(self, data, missing_index, num_imputed):
            self.data = data
            self.missing_index = missing_index
            self.num_imputed = num_imputed

    df = input.copy()
    df.loc[:, "impute"] = np.NaN
    df = df.sort_values(by = [index])
    missing_index = df[df[var].isnull()].index.tolist()
    non_missing_index = df.index.difference(missing_index).tolist()
    num_imputed = 0
    # Impute each missing hour from the seasonal lags that are available:
    # two weeks back (336 hours), one week back (168), and one day back (24)
    for i in missing_index:
        if i > 336:
            df.loc[i, "impute"] = (df.loc[i - 336, var] + df.loc[i - 168, var] + df.loc[i - 24, var]) / 3
            num_imputed = num_imputed + 1
        elif i > 168:
            df.loc[i, "impute"] = (df.loc[i - 168, var] + df.loc[i - 24, var]) / 2
            num_imputed = num_imputed + 1
        elif i > 24:
            df.loc[i, "impute"] = df.loc[i - 24, var]
            num_imputed = num_imputed + 1
        else:
            print("There are not enough observations to impute observation:", i)

    # The y column holds the observed value where present and the imputed value otherwise
    df["y"] = np.NaN
    df.loc[missing_index, "y"] = df.loc[missing_index, "impute"]
    df.loc[non_missing_index, "y"] = df.loc[non_missing_index, var]
    output = imputed_values(data = df, missing_index = missing_index, num_imputed = num_imputed)
    return output
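The imputation rule averages the same hour from the previous day (lag 24), the previous week (lag 168), and two weeks back (lag 336), falling back to whichever lags are available. A minimal, self-contained illustration of the same idea on a synthetic hourly series, using pandas shift rather than the positional indexing above:

```python
import numpy as np
import pandas as pd

# Synthetic hourly demand series (three weeks of hours) with one missing value
n = 24 * 21
y = pd.Series(1500 + 200 * np.sin(np.arange(n) * 2 * np.pi / 24))
y.iloc[400] = np.nan

# Seasonal-lag imputation: average the observations 24, 168, and 336 hours earlier
candidate = (y.shift(24) + y.shift(168) + y.shift(336)) / 3
y_imputed = y.fillna(candidate)
print(round(y_imputed.iloc[400], 1))
```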
def impute_series(series, meta):
    class imputed_series:
        def __init__(self, data, metadata):
            self.data = data
            self.metadata = metadata

    ts = series.copy()
    new_df = None
    for index, row in meta.iterrows():
        s = row["subba"]
        temp = None
        temp = ts[ts["subba"] == s]
        if row["na"] > 0:
            print("Series", s, "has", row["na"], "missing values")
            imputed = impute_missing(input = temp, var = "value", index = "period")
            if imputed.num_imputed > 0:
                temp = imputed.data
                meta.loc[index, "imputed"] = imputed.num_imputed
                meta.loc[index, "comments"] = meta.loc[index, "comments"] + " Missing values were imputed"
            else:
                temp["impute"] = np.NaN
                temp["y"] = temp["value"]
        else:
            temp["impute"] = np.NaN
            temp["y"] = temp["value"]
        # Flag the series as successful if all missing values were imputed
        # and the start/end timestamps match the expected window
        if all([meta.loc[index, "na"] == meta.loc[index, "imputed"],
                meta.loc[index, "start_match"],
                meta.loc[index, "end_match"]]):
            meta.loc[index, "success"] = True
            meta.loc[index, "update"] = True
        if meta.loc[index, "success"]:
            if new_df is None:
                new_df = temp
            else:
                new_df = pd.concat([new_df, temp])
    if new_df is not None:
        output = imputed_series(data = new_df, metadata = meta)
    else:
        output = None
    return output

ts = impute_series(series = data, meta = meta)
Series ZONA has 1 missing values
Series ZONB has 1 missing values
Series ZONC has 1 missing values
Series ZOND has 1 missing values
Series ZONE has 1 missing values
Series ZONF has 1 missing values
Series ZONG has 1 missing values
Series ZONH has 1 missing values
Series ZONI has 1 missing values
Series ZONJ has 1 missing values
Series ZONK has 1 missing values
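The metadata frame returned by impute_series carries the quality-check flags used above. A short sketch, assuming the column names used in this post, for reviewing them before moving on:

```python
# Review the per-series quality flags produced by the backfill and imputation steps
print(ts.metadata[["subba", "na", "imputed", "success", "update"]])

if not ts.metadata["success"].all():
    print("Warning: some series failed the backfill quality checks")
```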
Plot the Series
We will use Plotly to visualize the series:
fig = go.Figure()

# Add a line per sub-region
for i in ts.data["subba"].unique():
    d = None
    d = ts.data[ts.data["subba"] == i]
    fig.add_trace(go.Scatter(x = d["period"],
                             y = d["value"],
                             name = i,
                             # line = dict(color = "blue"),
                             mode = 'lines'))

# Mark the imputed observations
fig.add_trace(go.Scatter(x = ts.data["period"],
                         y = ts.data["impute"],
                         mode = 'markers',
                         name = "Imputed",
                         marker = dict(size = 4, symbol = 'square', color = "red")))

fig.update_layout(title = "New York Independent System Operator - Demand for Electricity")
fig
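The remaining step listed at the top of this section, saving the data and creating a log file, would use the data_path and meta_path values loaded from settings.json. A minimal sketch of that idea, assuming both paths point at CSV files (the actual pipeline may store them differently):

```python
# Save the backfilled series and append the run's metadata as a log entry.
# Assumes data_path and meta_path point at CSV files; adjust to the actual storage format.
if ts is not None:
    ts.data.to_csv(data_path, index = False)
    ts.metadata.to_csv(meta_path,
                       mode = "a",
                       header = not os.path.exists(meta_path),
                       index = False)
```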