Source code for nem_bidding_dashboard.preprocessing

import datetime

import numpy as np
import pandas as pd

pd.set_option("display.width", None)


[docs]def stack_unit_bids(volume_bids, price_bids): """ Combine volume and price components of energy market offers and reformat them such that each price quantity pair is on a separate row of the dataframe. Examples: >>> volume_bids = pd.DataFrame( ... columns=["INTERVAL_DATETIME", "SETTLEMENTDATE", "DUID", "BANDAVAIL1", "BANDAVAIL2", "BANDAVAIL3", ... "BANDAVAIL4", "BANDAVAIL5", "BANDAVAIL6", "BANDAVAIL7", "BANDAVAIL8", "BANDAVAIL9", "BANDAVAIL10"], ... data=[('2020/01/01 00:45:00', '2019/12/31 00:00:00', 'AGLHAL', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)]) >>> volume_bids INTERVAL_DATETIME SETTLEMENTDATE ... BANDAVAIL9 BANDAVAIL10 0 2020/01/01 00:45:00 2019/12/31 00:00:00 ... 9 10 <BLANKLINE> [1 rows x 13 columns] >>> price_bids = pd.DataFrame( ... columns=["SETTLEMENTDATE", "DUID", "PRICEBAND1", "PRICEBAND2", "PRICEBAND3", "PRICEBAND4", "PRICEBAND5", ... "PRICEBAND6", "PRICEBAND7", "PRICEBAND8", "PRICEBAND9", "PRICEBAND10"], ... data=[('2019/12/31 00:00:00', 'AGLHAL', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)]) >>> price_bids SETTLEMENTDATE DUID PRICEBAND1 ... PRICEBAND8 PRICEBAND9 PRICEBAND10 0 2019/12/31 00:00:00 AGLHAL 1 ... 8 9 10 <BLANKLINE> [1 rows x 12 columns] >>> stack_unit_bids(volume_bids, price_bids) INTERVAL_DATETIME DUID BIDBAND BIDVOLUME BIDPRICE 0 2020/01/01 00:45:00 AGLHAL 1 1 1 1 2020/01/01 00:45:00 AGLHAL 2 2 2 2 2020/01/01 00:45:00 AGLHAL 3 3 3 3 2020/01/01 00:45:00 AGLHAL 4 4 4 4 2020/01/01 00:45:00 AGLHAL 5 5 5 5 2020/01/01 00:45:00 AGLHAL 6 6 6 6 2020/01/01 00:45:00 AGLHAL 7 7 7 7 2020/01/01 00:45:00 AGLHAL 8 8 8 8 2020/01/01 00:45:00 AGLHAL 9 9 9 9 2020/01/01 00:45:00 AGLHAL 10 10 10 Arguments: volume_bids: pd dataframe containing quantity of bids on a 5 minutely basis. Should have columns INTERVAL_DATETIME, SETTLEMENTDATE, DUID, BANDAVAIL1 . . . . BANDAVAIL10 price_bids: pd dataframe containing quantity of bids on a daily basis. Should have columns SETTLEMENTDATE, DUID, PRICEBAND1 . . . . PRICEBAND10 Returns: pd dataframe containing matched quantity and price pairs on a 5 minutely basis with bid bands on row basis. Should have columns INTERVAL_DATETIME, DUID, BIDBAND, BIDVOLUME and BIDPRICE """ volume_bids = pd.melt( volume_bids, id_vars=["INTERVAL_DATETIME", "SETTLEMENTDATE", "DUID"], value_vars=[ "BANDAVAIL1", "BANDAVAIL2", "BANDAVAIL3", "BANDAVAIL4", "BANDAVAIL5", "BANDAVAIL6", "BANDAVAIL7", "BANDAVAIL8", "BANDAVAIL9", "BANDAVAIL10", ], var_name="BIDBAND", value_name="BIDVOLUME", ) price_bids = pd.melt( price_bids, id_vars=["SETTLEMENTDATE", "DUID"], value_vars=[ "PRICEBAND1", "PRICEBAND2", "PRICEBAND3", "PRICEBAND4", "PRICEBAND5", "PRICEBAND6", "PRICEBAND7", "PRICEBAND8", "PRICEBAND9", "PRICEBAND10", ], var_name="BIDBAND", value_name="BIDPRICE", ) price_bids["BIDBAND"] = pd.to_numeric(price_bids["BIDBAND"].str[9:]) volume_bids["BIDBAND"] = pd.to_numeric(volume_bids["BIDBAND"].str[9:]) bids = pd.merge(volume_bids, price_bids, on=["SETTLEMENTDATE", "BIDBAND", "DUID"]) return bids.loc[ :, ["INTERVAL_DATETIME", "DUID", "BIDBAND", "BIDVOLUME", "BIDPRICE"] ]
[docs]def adjust_bids_for_availability(stacked_bids, unit_availability): """ Adjust bid volumes where the unit availability would restrict a bid from actually being fully dispatched. Starting from the highest bid bands bid volumes are adjusted down until the total bid volume is equal to the availability, if the total bid volume is already equal to or less than availability no adjustments are made. Examples: >>> bid_data = pd.DataFrame( ... columns=["INTERVAL_DATETIME", "DUID", "BIDBAND", "BIDVOLUME", "BIDPRICE"], ... data=[('2020/01/01 00:45:00', 'AGLHAL', 1, 50, 120), ... ('2020/01/01 00:45:00', 'AGLHAL', 2, 50, 200)]) >>> bid_data INTERVAL_DATETIME DUID BIDBAND BIDVOLUME BIDPRICE 0 2020/01/01 00:45:00 AGLHAL 1 50 120 1 2020/01/01 00:45:00 AGLHAL 2 50 200 >>> unit_availability = pd.DataFrame( ... columns=["SETTLEMENTDATE", "DUID", "AVAILABILITY"], ... data=[('2020/01/01 00:45:00', 'AGLHAL', 80),]) >>> unit_availability SETTLEMENTDATE DUID AVAILABILITY 0 2020/01/01 00:45:00 AGLHAL 80 >>> adjust_bids_for_availability(bid_data, unit_availability) INTERVAL_DATETIME DUID BIDBAND BIDVOLUME BIDVOLUMEADJUSTED BIDPRICE 0 2020/01/01 00:45:00 AGLHAL 1 50 50 120 1 2020/01/01 00:45:00 AGLHAL 2 50 30 200 Arguments: stacked_bids: pd.dataframe containing matched quantity and price pairs on a 5 minutely basis with bid bands on row basis. Should have columns INTERVAL_DATETIME, DUID, BIDBAND, BIDVOLUME and BIDPRICE unit_availability: pd.dataframe containing unit availability values on 5 minutely basis. Should have columns SETTLEMENTDATE, DUID, AVAILABILITY Returns: pd.dataframe containing matched quantity and price pairs on a 5 minutely basis with bid bands on row basis. An extra column is added in which did volumes have been adjusted so total bid volume doesn't exceed unit availability. Should have columns INTERVAL_DATETIME, DUID, BIDBAND, BIDVOLUME, BIDVOLUMEADJUSTED and BIDPRICE """ bids = stacked_bids.sort_values("BIDBAND") bids["BIDVOLUMECUMULATIVE"] = bids.groupby( ["DUID", "INTERVAL_DATETIME"], as_index=False )["BIDVOLUME"].cumsum() availability = unit_availability.rename( {"SETTLEMENTDATE": "INTERVAL_DATETIME"}, axis=1 ) bids = pd.merge(bids, availability, "left", on=["INTERVAL_DATETIME", "DUID"]) bids["SPAREBIDVOLUME"] = ( bids["AVAILABILITY"] - bids["BIDVOLUMECUMULATIVE"] ) + bids["BIDVOLUME"] bids["SPAREBIDVOLUME"] = np.where( bids["SPAREBIDVOLUME"] < 0, 0, bids["SPAREBIDVOLUME"] ) bids["BIDVOLUMEADJUSTED"] = bids[["BIDVOLUME", "SPAREBIDVOLUME"]].min(axis=1) bids = bids.loc[ :, [ "INTERVAL_DATETIME", "DUID", "BIDBAND", "BIDVOLUME", "BIDVOLUMEADJUSTED", "BIDPRICE", ], ] return bids.sort_values( [ "INTERVAL_DATETIME", "DUID", "BIDBAND", ] ).reset_index(drop=True)
[docs]def remove_number_from_region_names(region_column, data): """ Removes the trailing 1 from region names in the specified column. Names in input data are expected to be of the format NSW1, QLD1 etc. and the names in the output format will be NSW, QLD etc. Examples: >>> region_data = pd.DataFrame({ ... 'region': ['QLD1', 'TAS1'], ... 'dummy_values': [55.7, 102.9]}) >>> region_data region dummy_values 0 QLD1 55.7 1 TAS1 102.9 >>> remove_number_from_region_names('region', region_data) region dummy_values 0 QLD 55.7 1 TAS 102.9 Arguments: region_column: str the name of the column containing the region names. data: pd dataframe with a column called the value of region_column. Returns: pd dataframe with names in region_column modified. """ data.loc[:, region_column] = data[region_column].str[:-1] return data
[docs]def tech_namer_by_row(fuel, tech_descriptor, dispatch_type): """ Create a name for generation and loads using custom logic that considers the fuel type, technology descriptor, and dispatch type supplied by AEMO in the NEM Registration and Exemption List.xls file. See the `source code <https://github.com/UNSW-CEEM/nem-bidding-dashboard/blob/6468e6b5506c778d12b3d3653af1cf11bd28807b/src/nem_bidding_dashboard/preprocessing.py#L123>`_ for logic that this function applies: Examples: >>> tech_namer_by_row("Natural Gas", "Steam Sub-Critical", "Generator") 'Gas Thermal' >>> tech_namer_by_row("solar", "PV - Tracking", "Generator") 'Solar' >>> tech_namer_by_row("-", "Battery", "Load") 'Battery Charge' Arguments: fuel: str should be the value from the column 'Fuel Source - Descriptor' tech_descriptor: str should be the value from the column 'Technology Type - Descriptor' dispatch_type: str should be the value from the column 'Dispatch Type' Returns: str a value categorising the technology of the generation or load unit """ name = fuel if fuel in ["Solar", "Wind", "Black Coal", "Brown Coal"]: pass elif fuel == "solar": name = "Solar" elif tech_descriptor in ["Battery", "Battery and Inverter"]: if dispatch_type == "Load": name = "Battery Charge" else: name = "Battery Discharge" elif tech_descriptor in ["Hydro - Gravity"]: name = "Hydro" elif tech_descriptor in ["Hydro - Gravity", "Run of River"]: name = "Run of River Hydro" elif tech_descriptor == "Pump Storage": if dispatch_type == "Load": name = "Pump Storage Charge" else: name = "Pump Storage Discharge" elif tech_descriptor == "-" and fuel == "-" and dispatch_type == "Load": name = "Pump Storage Charge" elif tech_descriptor == "Open Cycle Gas turbines (OCGT)": name = "OCGT" elif tech_descriptor == "Combined Cycle Gas Turbine (CCGT)": name = "CCGT" elif ( fuel in ["Natural Gas / Fuel Oil", "Natural Gas"] and tech_descriptor == "Steam Sub-Critical" ): name = "Gas Thermal" elif isinstance(tech_descriptor, str) and "Engine" in tech_descriptor: name = "Engine" return name
[docs]def tech_namer(duid_info): """ Create a name for generation and load unit using custom logic applied to the fuel type, technology descriptor, and dispatch type supplied by AEMO in the NEM Registration and Exemption List.xls file. See the `source code <https://github.com/UNSW-CEEM/nem-bidding-dashboard/blob/6468e6b5506c778d12b3d3653af1cf11bd28807b/src/nem_bidding_dashboard/preprocessing.py#L123>`_ for logic that this function applies: Arguments: duid_info: pd dataframe with columns 'FUEL SOURCE - DESCRIPTOR', 'TECHNOLOGY TYPE - DESCRIPTOR' and 'DISPATCH TYPE' Returns: pd dataframe with additional column 'UNIT_TYPE' """ duid_info["UNIT TYPE"] = duid_info.apply( lambda x: tech_namer_by_row( x["FUEL SOURCE - DESCRIPTOR"], x["TECHNOLOGY TYPE - DESCRIPTOR"], x["DISPATCH TYPE"], ), axis=1, ) return duid_info
[docs]def hard_code_fix_fuel_source_and_tech_errors(duid_data): """ Where NA values occur in columns FUEL SOURCE - DESCRIPTOR or TECHNOLOGY TYPE - DESCRIPTOR replace these with the value '-'. This function is used to clean data before passing to :py:func:`nem_bidding_dashboard.preprocessing.tech_namer`. Args: duid_data: pd.DataFrame containing at least the columns FUEL SOURCE - DESCRIPTOR and TECHNOLOGY TYPE - DESCRIPTOR Returns: pd.DataFrame with the same columns as input data. Columns FUEL SOURCE - DESCRIPTOR and TECHNOLOGY TYPE - DESCRIPTOR have NA values replaced with '-'. """ duid_data.loc[ duid_data["FUEL SOURCE - DESCRIPTOR"].isna(), "FUEL SOURCE - DESCRIPTOR" ] = "-" duid_data.loc[ duid_data["TECHNOLOGY TYPE - DESCRIPTOR"].isna(), "TECHNOLOGY TYPE - DESCRIPTOR" ] = "-" return duid_data
[docs]def calculate_unit_time_series_metrics(as_bid_metrics, after_dispatch_metrics): """ Calculate some additional values associated with unit dispatch. These are used to populate the database table unit_dispatch and should be helpful in understanding unit dispatch outcomes, e.g. show when a plant is limited by its ramp rate, or if a unit's availability is the same as the availability submitted for the PASA process indicating that output is probably limited by a technical constraint. Examples: >>> as_bid_metrics = pd.DataFrame({ ... 'INTERVAL_DATETIME': ['2022/01/01 00:00:00', '2022/01/01 00:05:00', '2022/01/01 00:10:00'], ... 'DUID': ['AGLHAL', 'AGLHAL', 'AGLHAL'], ... 'ROCUP': [10, 15, 60], ... 'ROCDOWN': [20, 25, 30], ... 'MAXAVAIL': [100, 90, 105], ... 'PASAAVAILABILITY': [120, 90, 110], ... }) >>> as_bid_metrics['INTERVAL_DATETIME'] = pd.to_datetime(as_bid_metrics['INTERVAL_DATETIME']) >>> as_bid_metrics INTERVAL_DATETIME DUID ROCUP ROCDOWN MAXAVAIL PASAAVAILABILITY 0 2022-01-01 00:00:00 AGLHAL 10 20 100 120 1 2022-01-01 00:05:00 AGLHAL 15 25 90 90 2 2022-01-01 00:10:00 AGLHAL 60 30 105 110 >>> after_dispatch_metrics = pd.DataFrame({ ... 'SETTLEMENTDATE': ['2022/01/01 00:00:00', '2022/01/01 00:05:00', '2022/01/01 00:10:00'], ... 'DUID': ['AGLHAL', 'AGLHAL', 'AGLHAL'], ... 'RAMPUPRATE': [9*60, 14*60, 50*60], ... 'RAMPDOWNRATE': [18*60, 24*60, 28*60], ... 'AVAILABILITY': [100, 90, 105], ... 'INITIALMW': [80.1, 88.9, 84.5], ... 'TOTALCLEARED': [80, 90, 85], ... }) >>> after_dispatch_metrics['SETTLEMENTDATE'] = pd.to_datetime(after_dispatch_metrics['SETTLEMENTDATE']) >>> after_dispatch_metrics SETTLEMENTDATE DUID RAMPUPRATE RAMPDOWNRATE AVAILABILITY INITIALMW TOTALCLEARED 0 2022-01-01 00:00:00 AGLHAL 540 1080 100 80.1 80 1 2022-01-01 00:05:00 AGLHAL 840 1440 90 88.9 90 2 2022-01-01 00:10:00 AGLHAL 3000 1680 105 84.5 85 >>> calculate_unit_time_series_metrics(as_bid_metrics, after_dispatch_metrics) INTERVAL_DATETIME DUID AVAILABILITY TOTALCLEARED FINALMW ASBIDRAMPUPMAXAVAIL ASBIDRAMPDOWNMINAVAIL RAMPUPMAXAVAIL RAMPDOWNMINAVAIL PASAAVAILABILITY MAXAVAIL 0 2022-01-01 00:00:00 AGLHAL 100 80 88.9 130.1 -19.9 125.1 -9.9 120 100 1 2022-01-01 00:05:00 AGLHAL 90 90 84.5 163.9 -36.1 158.9 -31.1 90 90 Args: as_bid_metrics: pd.DataFrame containing values submitted by unit's as part of the bidding process. Should contain columns INTERVAL_DATETIME, DUID, ROCUP (ramp up rate in MW per min), ROCDOWN (ramp down rate in MW per min), MAXAVAIL (limit the unit can be dispatched up to), PASAAVAILABILITY (The technical maximum availability of unit given 24h notice, not used in dispatch) after_dispatch_metrics: pd.DataFrame containing values calculated by AEMO as part of the dispatch process. Should contain columns SETTLEMENTDATE, DUID, AVAILABILITY (presumed to be the lesser of the unit bid availability (MAXAVAIL column) and unit forecast availability for variable renewables), RAMPUPRATE, RAMPDOWNRATE (lesser of bid and telemetry ramp rates, MW per hour), INITIALMW (operating level of unit at start of dispatch interval), TOTALCLEARED (dispatch target for unit to ramp to by end of dispatch interval). Returns: pd.DataFrame containing columns INTERVAL_DATETIME, DUID, ASBIDRAMPUPMAXAVAIL (upper dispatch limit based on as bid ramp rate), ASBIDRAMPDOWNMINAVAIL (lower dispatch limit based on as bid ramp rate), RAMPUPMAXAVAIL ( upper dispatch limit based lesser of as bid and telemetry ramp rates), RAMPDOWNMINAVAIL (lower dispatch limit based lesser of as bid and telemetry ramp rates), AVAILABILITY, TOTALCLEARED (as for after_dispatch_metrics), PASAAVAILABILITY, MAXAVAIL (as for as_bid_metrics), and FINALMW (the unit operating level at the end of the dispatch interval). """ after_dispatch_metrics = after_dispatch_metrics.rename( {"SETTLEMENTDATE": "INTERVAL_DATETIME"}, axis=1 ) unit_time_series_metrics = pd.merge( as_bid_metrics, after_dispatch_metrics, on=["INTERVAL_DATETIME", "DUID"] ) unit_time_series_metrics["ASBIDRAMPUPMAXAVAIL"] = ( unit_time_series_metrics["INITIALMW"] + unit_time_series_metrics["ROCUP"] * 5 ) unit_time_series_metrics["ASBIDRAMPDOWNMINAVAIL"] = ( unit_time_series_metrics["INITIALMW"] - unit_time_series_metrics["ROCDOWN"] * 5 ) unit_time_series_metrics["RAMPUPMAXAVAIL"] = ( unit_time_series_metrics["INITIALMW"] + unit_time_series_metrics["RAMPUPRATE"] / 12 ) unit_time_series_metrics["RAMPDOWNMINAVAIL"] = ( unit_time_series_metrics["INITIALMW"] - unit_time_series_metrics["RAMPDOWNRATE"] / 12 ) final_mw = unit_time_series_metrics.loc[ :, ["INTERVAL_DATETIME", "DUID", "INITIALMW"] ] final_mw["INTERVAL_DATETIME"] -= datetime.timedelta(minutes=5) final_mw = final_mw.rename({"INITIALMW": "FINALMW"}, axis=1) unit_time_series_metrics = pd.merge( unit_time_series_metrics, final_mw, on=["INTERVAL_DATETIME", "DUID"] ) return unit_time_series_metrics.loc[ :, [ "INTERVAL_DATETIME", "DUID", "AVAILABILITY", "TOTALCLEARED", "FINALMW", "ASBIDRAMPUPMAXAVAIL", "ASBIDRAMPDOWNMINAVAIL", "RAMPUPMAXAVAIL", "RAMPDOWNMINAVAIL", "PASAAVAILABILITY", "MAXAVAIL", ], ]
def add_on_hour_column(data): data["ONHOUR"] = np.where(data["INTERVAL_DATETIME"].dt.minute == 0, True, False) return data