Data Backfill
The goal of the backfill process is to pull the historical data for the required series using the settings.json file. This includes the following steps:
- Setting parameters and pulling the data
- Data quality checks
- Saving the data and creating a log file
Load Libraries
import eia_api as api
import eia_data
import pandas as pd
import numpy as np
import requests
import json
import os
import datetime
import plotly.express as px
import plotly.graph_objects as go
from great_tables import GT
Load settings:
raw_json = open("../settings/settings.json")
meta_json = json.load(raw_json)
# Prototype version
# series = pd.DataFrame(meta_json["series_prototype"])
# Full version
series = pd.DataFrame(meta_json["series"])
api_path = meta_json["api_path"]
facets_template = {
    "parent": None,
    "subba": None
}
start = datetime.datetime(meta_json["backfill"]["start"]["year"],
                          meta_json["backfill"]["start"]["month"],
                          meta_json["backfill"]["start"]["day"],
                          meta_json["backfill"]["start"]["hour"])
end = datetime.datetime(meta_json["backfill"]["end"]["year"],
                        meta_json["backfill"]["end"]["month"],
                        meta_json["backfill"]["end"]["day"],
                        meta_json["backfill"]["end"]["hour"])
offset = meta_json["backfill"]["offset"]
eia_api_key = os.getenv('EIA_API_KEY')
meta_path = meta_json["meta_path"]
data_path = meta_json["data_path"]
series_mapping_path = meta_json["series_mapping_path"]
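For reference, the fields accessed above imply that settings.json has roughly the following shape. This is an illustrative sketch only: the keys mirror the lookups in this notebook, while every value shown is a placeholder rather than the project's actual settings.
# Illustrative sketch of the settings.json structure implied by the lookups above.
# The keys match what this notebook reads; all values below are placeholders.
settings_example = {
    "api_path": "electricity/rto/region-sub-ba-data/",    # EIA API v2 route (example value)
    "meta_path": "path/to/metadata_log.csv",               # placeholder path
    "data_path": "path/to/data.csv",                       # placeholder path
    "series_mapping_path": "path/to/series_mapping.json",  # placeholder path
    "series": [
        {"parent_id": "NYIS", "subba_id": "ZONA"}          # one entry per sub-region to pull
    ],
    "backfill": {
        "start": {"year": 2022, "month": 1, "day": 1, "hour": 0},
        "end": {"year": 2024, "month": 11, "day": 1, "hour": 0},
        "offset": 5000                                     # passed to api.eia_backfill (example value)
    }
}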
Pull the series metadata from the API
metadata = api.eia_metadata(api_key = eia_api_key, api_path = api_path)
print(metadata.meta.keys())
print(metadata.meta["startPeriod"])
print(metadata.meta["endPeriod"])dict_keys(['id', 'name', 'description', 'frequency', 'facets', 'data', 'startPeriod', 'endPeriod', 'defaultDateFormat', 'defaultFrequency'])
2019-01-01T00
2024-11-04T08
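The loop below delegates the actual data pull to api.eia_backfill, whose internals are not shown in this notebook; the real helper returns an object whose .data attribute holds the hourly observations. Conceptually it pages through the EIA API v2 data route between start and end. The sketch below is an assumption about what such a helper looks like (hypothetical name eia_backfill_sketch; the offset argument is interpreted here as the page size, which is also an assumption), using the publicly documented EIA v2 query parameters:
# Hypothetical sketch of a backfill helper (NOT the actual eia_api implementation).
# It pages through the EIA API v2 data route with offset/length pagination and
# returns a single DataFrame with all rows between start and end.
def eia_backfill_sketch(api_key, api_path, facets, start, end, offset):
    url = "https://api.eia.gov/v2/" + api_path
    params = {
        "api_key": api_key,
        "frequency": "hourly",
        "data[0]": "value",
        "facets[parent][]": facets["parent"],
        "facets[subba][]": facets["subba"],
        "start": start.strftime("%Y-%m-%dT%H"),
        "end": end.strftime("%Y-%m-%dT%H"),
        "length": offset          # rows per request (the EIA API caps a single response at 5000 rows)
    }
    frames = []
    rows_fetched = 0
    while True:
        params["offset"] = rows_fetched
        payload = requests.get(url, params = params).json()
        rows = payload["response"]["data"]
        if len(rows) == 0:
            break
        frames.append(pd.DataFrame(rows))
        rows_fetched += len(rows)
    out = pd.concat(frames).reset_index(drop = True)
    out["period"] = pd.to_datetime(out["period"])
    return out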
for i in series.index:
    # facets_template is reused across iterations; both keys are overwritten each time
    facets = facets_template
    facets["parent"] = series.at[i, "parent_id"]
    facets["subba"] = series.at[i, "subba_id"]
    print(facets)
    temp = api.eia_backfill(api_key = eia_api_key,
                            api_path = api_path + "data",
                            facets = facets,
                            start = start,
                            end = end,
                            offset = offset)
    # Build a complete hourly index between start and end so that any hours missing
    # from the API response become explicit NaN values
    ts_obj = pd.DataFrame(np.arange(start = start, stop = end + datetime.timedelta(hours = 1), step = datetime.timedelta(hours = 1)).astype(datetime.datetime), columns = ["index"])
    ts_obj = ts_obj.merge(temp.data, left_on = "index", right_on = "period", how = "left")
    ts_obj.drop("period", axis = 1, inplace = True)
    ts_obj = ts_obj.rename(columns = {"index": "period"})
    missing_index = ts_obj[ts_obj["value"].isnull()].index.tolist()
    if len(missing_index) > 0:
        ts_obj.loc[missing_index, "subba"] = facets["subba"]
    meta_temp = eia_data.create_metadata(data = ts_obj, start = start, end = end, type = "backfill")
    meta_temp["index"] = 1
    meta_df = pd.DataFrame([meta_temp])
    if i == series.index.start:
        data = ts_obj
        meta = meta_df
    else:
        data = data._append(ts_obj)
        meta = meta._append(meta_df)

data.reset_index(drop = True, inplace = True)
meta.reset_index(drop = True, inplace = True)
{'parent': 'NYIS', 'subba': 'ZONA'}
{'parent': 'NYIS', 'subba': 'ZONB'}
{'parent': 'NYIS', 'subba': 'ZONC'}
{'parent': 'NYIS', 'subba': 'ZOND'}
{'parent': 'NYIS', 'subba': 'ZONE'}
{'parent': 'NYIS', 'subba': 'ZONF'}
{'parent': 'NYIS', 'subba': 'ZONG'}
{'parent': 'NYIS', 'subba': 'ZONH'}
{'parent': 'NYIS', 'subba': 'ZONI'}
{'parent': 'NYIS', 'subba': 'ZONJ'}
{'parent': 'NYIS', 'subba': 'ZONK'}
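GT(meta) below renders the quality-check log produced by eia_data.create_metadata. That helper's implementation is not part of this notebook; based on the fields referenced later, each log row is expected to carry at least the following (an illustrative row, not the helper's actual output):
# Illustrative metadata log row; the field names are the ones this notebook reads or
# writes, while the values shown are examples only.
meta_row_example = {
    "subba": "ZONA",       # sub-region the row describes
    "na": 1,               # number of missing hourly values in the pull
    "start_match": True,   # True when the first pulled timestamp matches the requested start
    "end_match": True,     # True when the last pulled timestamp matches the requested end
    "imputed": 0,          # updated by impute_series with the number of filled values
    "success": False,      # set True once the quality checks pass
    "update": False,       # set True when the series should be appended to the stored data
    "comments": "",        # free-text notes; impute_series appends to this field
    "index": 1             # added in the loop above
}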
GT(meta)
# The initial pull has some missing values
print("Missing Values:" , data["value"].isna().sum())
GT(data.head(10))
Missing Values: 11
| period | subba | subba-name | parent | parent-name | value | value-units |
|---|---|---|---|---|---|---|
| 2022-01-01 00:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1707.0 | megawatthours |
| 2022-01-01 01:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1673.0 | megawatthours |
| 2022-01-01 02:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1644.0 | megawatthours |
| 2022-01-01 03:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1605.0 | megawatthours |
| 2022-01-01 04:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1550.0 | megawatthours |
| 2022-01-01 05:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1487.0 | megawatthours |
| 2022-01-01 06:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1422.0 | megawatthours |
| 2022-01-01 07:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1373.0 | megawatthours |
| 2022-01-01 08:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1336.0 | megawatthours |
| 2022-01-01 09:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1317.0 | megawatthours |
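Before imputing, it can help to confirm how the 11 missing values are distributed across the sub-regions and that every series has the expected number of hourly rows. This quick check is not part of the original pipeline:
# Ad-hoc check (not part of the pipeline): rows and missing values per sub-region
expected_hours = int((end - start).total_seconds() // 3600) + 1
summary = data.groupby("subba")["value"].agg(
    rows = "size",
    missing = lambda s: int(s.isna().sum())
)
print("Expected rows per series:", expected_hours)
print(summary)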
Impute Missing Values
def impute_missing(input, var, index):
    class imputed_values:
        def __init__(self, data, missing_index, num_imputed):
            self.data = data
            self.missing_index = missing_index
            self.num_imputed = num_imputed
    df = input.copy()
    df.loc[:, "impute"] = np.NaN
    df = df.sort_values(by = [index])
    missing_index = df[df[var].isnull()].index.tolist()
    non_missing_index = df.index.difference(missing_index).tolist()
    num_imputed = 0
    # The series is hourly, so index offsets of 24, 168, and 336 correspond to the same
    # hour one day, one week, and two weeks earlier; use whichever lags are available
    for i in missing_index:
        if i > 336:
            df.loc[i, "impute"] = (df.loc[i - 336, var] + df.loc[i - 168, var] + df.loc[i - 24, var]) / 3
            num_imputed = num_imputed + 1
        elif i > 168:
            df.loc[i, "impute"] = (df.loc[i - 168, var] + df.loc[i - 24, var]) / 2
            num_imputed = num_imputed + 1
        elif i > 24:
            df.loc[i, "impute"] = df.loc[i - 24, var]
            num_imputed = num_imputed + 1
        else:
            print("There are not enough observations to impute observation:", i)
    df["y"] = np.NaN
    df.loc[missing_index, "y"] = df.loc[missing_index, "impute"]
    df.loc[non_missing_index, "y"] = df.loc[non_missing_index, var]
    output = imputed_values(data = df, missing_index = missing_index, num_imputed = num_imputed)
    return output

def impute_series(series, meta):
    class imputed_series:
        def __init__(self, data, metadata):
            self.data = data
            self.metadata = metadata
    ts = series.copy()
    new_df = None
    for index, row in meta.iterrows():
        s = row["subba"]
        temp = None
        temp = ts[ts["subba"] == s]
        if row["na"] > 0:
            print("Series", s, "has", row["na"], "missing values")
            imputed = impute_missing(input = temp, var = "value", index = "period")
            if imputed.num_imputed > 0:
                temp = imputed.data
                meta.loc[index, "imputed"] = imputed.num_imputed
                meta.loc[index, "comments"] = meta.loc[index, "comments"] + " Missing values were imputed"
            else:
                temp["impute"] = np.NaN
                temp["y"] = temp["value"]
        else:
            temp["impute"] = np.NaN
            temp["y"] = temp["value"]
        # Mark the series as successful only if all missing values were imputed and the
        # start/end timestamps match the requested window
        if all([meta.loc[index, "na"] == meta.loc[index, "imputed"], meta.loc[index, "start_match"], meta.loc[index, "end_match"]]):
            meta.loc[index, "success"] = True
            meta.loc[index, "update"] = True
        if meta.loc[index, "success"]:
            if new_df is None:
                new_df = temp
            else:
                new_df = pd.concat([new_df, temp])
    if new_df is not None:
        output = imputed_series(data = new_df, metadata = meta)
    else:
        output = None
    return output

ts = impute_series(series = data, meta = meta)
Series ZONA has 1 missing values
Series ZONB has 1 missing values
Series ZONC has 1 missing values
Series ZOND has 1 missing values
Series ZONE has 1 missing values
Series ZONF has 1 missing values
Series ZONG has 1 missing values
Series ZONH has 1 missing values
Series ZONI has 1 missing values
Series ZONJ has 1 missing values
Series ZONK has 1 missing values
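Each gap was filled with an average of the observations at the same hour one day, one week, and two weeks earlier, falling back to fewer lags near the start of the sample. To review what was actually filled in, the imputed observations can be pulled out of the returned object; this inspection is not part of the original pipeline:
# Ad-hoc check: list the observations filled by impute_missing
imputed_rows = ts.data[ts.data["impute"].notna()]
print(imputed_rows[["period", "subba", "value", "impute", "y"]])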
Plot the Series
We will use Plotly to visualize the series:
fig = go.Figure()
for i in ts.data["subba"].unique():
    d = None
    d = ts.data[ts.data["subba"] == i]
    fig.add_trace(go.Scatter(x = d["period"],
                             y = d["value"],
                             name = i,
                             # line = dict(color = "blue"),
                             mode = 'lines'))
fig.add_trace(go.Scatter(
    x = ts.data["period"],
    y = ts.data["impute"],
    mode = 'markers',
    name = "Imputed",
    marker = dict(size = 4, symbol = 'square', color = "red")
))
fig.update_layout(title = "New York Independent System Operator - Demand for Electricity")
fig