Source code for nem_bidding_dashboard.fetch_and_preprocess

import pandas as pd

from nem_bidding_dashboard import fetch_data, preprocessing
from nem_bidding_dashboard.input_validation import (
    data_cache_exits,
    validate_start_end_and_cache_location,
)

pd.set_option("display.width", None)


[docs]def region_data(start_time, end_time, raw_data_cache):
    """
    Wrapper for fetching and preprocessing regional demand and price data. Calls
    :py:func:`nem_bidding_dashboard.fetch_data.get_region_data` to get regional data, re-formats REGIONID by removing
    the trailing '1' using :py:func:`nem_bidding_dashboard.preprocessing.remove_number_from_region_names`, and finally
    converting datetime columns to string format. Used for preparing data to load into dashboard backend PostgresSQL
    database, or can be used for compiling regional data directly if the user does not have database, but compling data
    using this function will be considerably slower than from a PostgresSQL database.

    Examples:

    >>> region_data(
    ... '2022/01/01 00:00:00',
    ... '2022/01/01 00:05:00',
    ... 'D:/nemosis_data_cache')
      REGIONID       SETTLEMENTDATE  TOTALDEMAND        RRP
    0      NSW  2022-01-01 00:05:00      7206.03  124.85631
    1      QLD  2022-01-01 00:05:00      5982.85  118.73008
    2       SA  2022-01-01 00:05:00      1728.03  133.94970
    3      TAS  2022-01-01 00:05:00      1148.93   40.34000
    4      VIC  2022-01-01 00:05:00      5005.34  114.80312

    Args:
       start_time: str formatted "DD/MM/YYYY HH:MM:SS", data with date times greater than start_time are returned
       end_time: str formatted identical to start_time, data with date times less than or equal to end_time are returned
       raw_data_cache: Filepath to directory for caching files downloaded from AEMO

    Returns:
        pandas dataframe with columns SETTLEMENTDATE, REGIONID, TOTALDEMAND (the operational demand AEMO dispatches
        generation to meet), and RRP (the regional reference price for energy).
    """
    validate_start_end_and_cache_location(start_time, end_time, raw_data_cache)
    regional_data = fetch_data.region_data(start_time, end_time, raw_data_cache)
    regional_data = preprocessing.remove_number_from_region_names(
        "REGIONID", regional_data
    )
    regional_data["SETTLEMENTDATE"] = regional_data["SETTLEMENTDATE"].dt.strftime(
        "%Y-%m-%d %X"
    )
    regional_data = regional_data.sort_values("SETTLEMENTDATE")
    return regional_data


[docs]def bid_data(start_time, end_time, raw_data_cache):
    """
    Wrapper for fetching and preprocessing bid data.

    - Calls :py:func:`nem_bidding_dashboard.fetch_data.get_volume_bids`,
      :py:func:`nem_bidding_dashboard.fetch_data.get_price_bids`, and
      :py:func:`nem_bidding_dashboard.fetch_data.get_duid_availability_data` to get raw bidding and availabilty data
    - Volume and price bids are filtered to get only bids for the energy spot market
    - Volume and price bids are combined using :py:func:`nem_bidding_dashboard.preprocessing.stack_unit_bids`
    - Bids are filtered to remove those with zero volume.
    - The function :py:func:`nem_bidding_dashboardpreprocessing.adjust_bids_for_availability` to calculate the bid
      volume adjusted so that the bid volume does not exceed the unit availablity
    - Finally datetime columns are converted to string format.

    Used for preparing data to load into dashboard backend PostgresSQL database, or can be used for
    compiling regional data directly if the user does not have a database, but compling data using this function will be
    considerably slower than from a PostgresSQL database.

    Examples:

    >>> bid_data(
    ... '2022/01/01 00:00:00',
    ... '2022/01/01 00:05:00',
    ... 'D:/nemosis_data_cache')
           INTERVAL_DATETIME     DUID  BIDBAND  BIDVOLUME  BIDVOLUMEADJUSTED  BIDPRICE  ONHOUR
    0    2022-01-01 00:05:00  ADPBA1G        8          6              0.000    998.00   False
    462  2022-01-01 00:05:00   REECE1        2         45             45.000    -55.03   False
    463  2022-01-01 00:05:00   REECE1        4         74             74.000     -0.85   False
    464  2022-01-01 00:05:00   REECE2        2         35             35.000    -54.77   False
    465  2022-01-01 00:05:00   REECE2        4         84             84.000     -0.86   False
    ..                   ...      ...      ...        ...                ...       ...     ...
    235  2022-01-01 00:05:00  GUTHEGA       10         80             68.000  13599.06   False
    236  2022-01-01 00:05:00  HALLWF1        1         95             10.840   -963.00   False
    237  2022-01-01 00:05:00  HALLWF2        1         71              4.408   -960.60   False
    229  2022-01-01 00:05:00  GSTONE6        8          5              5.000   4806.84   False
    700  2022-01-01 00:05:00    YWPS4       10          9              0.000  14489.96   False
    <BLANKLINE>
    [701 rows x 7 columns]

    Args:
       start_time: str formatted "DD/MM/YYYY HH:MM:SS", data with date times greater than start_time are returned
       end_time: str formatted identical to start_time, data with date times less than or equal to end_time are returned
       raw_data_cache: Filepath to directory for caching files downloaded from AEMO

    Returns:
        pandas dataframe with columns INTERVAL_DATETIME, DUID, BIDPRICE ($/MWh), BIDVOLUME (MW), BIDVOLUMEADJUSTED (MW)
    """
    validate_start_end_and_cache_location(start_time, end_time, raw_data_cache)
    volume_bids = fetch_data.volume_bids(start_time, end_time, raw_data_cache)
    volume_bids = volume_bids[volume_bids["BIDTYPE"] == "ENERGY"].drop(
        columns=["BIDTYPE"]
    )
    price_bids = fetch_data.price_bids(start_time, end_time, raw_data_cache)
    price_bids = price_bids[price_bids["BIDTYPE"] == "ENERGY"].drop(columns=["BIDTYPE"])
    availability = fetch_data.duid_availability_data(
        start_time, end_time, raw_data_cache
    )
    combined_bids = preprocessing.stack_unit_bids(volume_bids, price_bids)
    combined_bids = combined_bids[combined_bids["BIDVOLUME"] > 0.0].copy()
    combined_bids = preprocessing.adjust_bids_for_availability(
        combined_bids, availability
    )
    combined_bids = preprocessing.add_on_hour_column(combined_bids)
    combined_bids["INTERVAL_DATETIME"] = combined_bids["INTERVAL_DATETIME"].dt.strftime(
        "%Y-%m-%d %X"
    )
    combined_bids = combined_bids.sort_values("INTERVAL_DATETIME")
    return combined_bids


[docs]def duid_info(raw_data_cache):
    """
    Wrapper for fetching and preprocessing duid summary data.

    - Calls :py:func:`nem_bidding_dashboard.fetch_data.get_duid_data` to get raw duid info
    - Calls :py:func:`nem_bidding_dashboard.preprocessing.hard_code_fix_fuel_source_and_tech_errors` to replace NA
      values with '-'
    - Calls :py:func:`nem_bidding_dashboard.preprocessing.remove_number_from_region_names` to remove trailing '1' from
      REGIONIDs
    - Calls :py:func:`nem_bidding_dashboard.preprocessing.tech_namer` to determine a more concise and consistent
      technology type for units. This value is stored in the column "UNIT TYPE".

    Used for preparing data to load into dashboard backend PostgresSQL database, or can be used for
    compiling regional data directly if the user does not have a database.

    Examples:

    >>> duid_info('D:/nemosis_data_cache')
                           STATION NAME REGION DISPATCH TYPE FUEL SOURCE - DESCRIPTOR       TECHNOLOGY TYPE - DESCRIPTOR      DUID           UNIT TYPE
    0       Adelaide Desalination Plant     SA     Generator                     Grid               Battery and Inverter   ADPBA1G   Battery Discharge
    1       Adelaide Desalination Plant     SA          Load                     Grid               Battery and Inverter   ADPBA1L      Battery Charge
    4       Adelaide Desalination Plant     SA     Generator                    Water                       Run of River    ADPMH1  Run of River Hydro
    5       Adelaide Desalination Plant     SA     Generator                    Solar            Photovoltaic Flat panel    ADPPV3               Solar
    6       Adelaide Desalination Plant     SA     Generator                    Solar            Photovoltaic Flat panel    ADPPV2               Solar
    ..                              ...    ...           ...                      ...                                ...       ...                 ...
    576  Yarrawonga Hydro Power Station    VIC     Generator                    Water                    Hydro - Gravity  YWNGAHYD               Hydro
    577            Yarwun Power Station    QLD     Generator              Natural Gas  Combined Cycle Gas Turbine (CCGT)  YARWUN_1                CCGT
    578              Yatpool Solar Farm    VIC     Generator                    Solar   Photovoltaic Tracking Flat panel    YATSF1               Solar
    579                Yawong Wind Farm    VIC     Generator                     Wind                     Wind - Onshore    YAWWF1                Wind
    580                Yendon Wind Farm    VIC     Generator                     Wind                     Wind - Onshore   YENDWF1                Wind
    <BLANKLINE>
    [488 rows x 7 columns]


    Args:
       raw_data_cache: Filepath to directory for caching files downloaded from AEMO

    Returns:
        pandas dataframe with columns DUID, REGIONID, "FUEL SOURCE - DESCRIPTOR", "DISPATCH TYPE",
        "TECHNOLOGY TYPE - DESCRIPTOR", "UNIT TYPE", "STATION NAME"
    """
    data_cache_exits(raw_data_cache)
    duid_info = fetch_data.duid_data(raw_data_cache)
    duid_info = preprocessing.hard_code_fix_fuel_source_and_tech_errors(duid_info)
    duid_info = preprocessing.remove_number_from_region_names("REGION", duid_info)
    duid_info = preprocessing.tech_namer(duid_info)
    return duid_info


[docs]def unit_dispatch(start_time, end_time, raw_data_cache):
    """
    Wrapper for fetching and preprocessing unit dispatch data.

    - Calls :py:func:`nem_bidding_dashboard.fetch_data.get_volume_bids` and
      :py:func:`nem_bidding_dashboard.fetch_data.get_duid_availability_data` to get raw
      bidding and availabilty data
    - Volume are filtered to get only bids for the energy spot market
    - Calls :py:func:`nem_bidding_dashboard.preprocessing.calculate_unit_time_series_metrics`
    - Finally datetime columns are converted to string format.

    Used for preparing data to load into dashboard backend PostgresSQL database, or can be used for
    compiling regional data directly if the user does not have database, but compling data using this function will be
    considerably slower than from a PostgresSQL database.

    Examples:

    >>> unit_dispatch(
    ... '2022/01/01 00:55:00',
    ... '2022/01/01 01:05:00',
    ... 'D:/nemosis_data_cache')
           INTERVAL_DATETIME      DUID  AVAILABILITY  TOTALCLEARED    FINALMW  ASBIDRAMPUPMAXAVAIL  ASBIDRAMPDOWNMINAVAIL  RAMPUPMAXAVAIL  RAMPDOWNMINAVAIL  PASAAVAILABILITY  MAXAVAIL  ONHOUR
    0    2022-01-01 01:00:00   ADPBA1G         0.000         0.000    0.00000             10.00000              -10.00000         7.75000          -7.75000               6.0         0    True
    250  2022-01-01 01:00:00     RRSF1         0.000         0.000    0.00000           3480.00000            -3480.00000      3480.00000       -3480.00000             116.0       116    True
    249  2022-01-01 01:00:00    ROMA_8        34.000         0.000    0.00000             40.00000              -40.00000        40.00000         -40.00000              34.0        34    True
    248  2022-01-01 01:00:00    ROMA_7        34.000         0.000    0.00000             40.00000              -40.00000        40.00000         -40.00000              34.0        34    True
    247  2022-01-01 01:00:00    REECE2       119.000       118.000  117.53001            266.89001              -33.10999       266.89001         -33.10999             119.0       119    True
    ..                   ...       ...           ...           ...        ...                  ...                    ...             ...               ...               ...       ...     ...
    118  2022-01-01 01:00:00   GSTONE3       260.000       170.000  170.45000            193.87500              143.87500       193.35000         144.40000             260.0       260    True
    117  2022-01-01 01:00:00   GSTONE2         0.000         0.000    0.00000             25.00000              -25.00000        25.00000         -25.00000             270.0         0    True
    116  2022-01-01 01:00:00   GSTONE1         0.000         0.000    0.00000             15.00000              -15.00000        15.00000         -15.00000               0.0         0    True
    125  2022-01-01 01:00:00  GUNNING1        10.042        10.042    7.97946             25.02899               -4.97101        25.02899          -4.97101              47.0        47    True
    368  2022-01-01 01:00:00     YWPS4       370.000       370.000  370.11520            385.39746              355.39746       385.39746         355.39746             396.0       370    True
    <BLANKLINE>
    [369 rows x 12 columns]

    Args:
       start_time: str formatted "DD/MM/YYYY HH:MM:SS", data with date times greater than start_time are returned
       end_time: str formatted identical to start_time, data with date times less than or equal to end_time are returned
       raw_data_cache: Filepath to directory for caching files downloaded from AEMO

    Returns:
        pandas dataframe with columns INTERVAL_DATETIME, DUID, AVAILABILITY, TOTALCLEARED, FINALMW,
        ASBIDRAMPUPMAXAVAIL, ASBIDRAMPDOWNMINAVAIL, RAMPUPMAXAVAIL, RAMPDOWNMINAVAIL, PASAAVAILABILITY,
        MAXAVAIL (see unit_dispatch in Database Guide for column definitions)
    """
    validate_start_end_and_cache_location(start_time, end_time, raw_data_cache)
    as_bid_metrics = fetch_data.volume_bids(start_time, end_time, raw_data_cache)
    as_bid_metrics = as_bid_metrics[as_bid_metrics["BIDTYPE"] == "ENERGY"].drop(
        columns=["BIDTYPE"]
    )
    as_bid_metrics = as_bid_metrics.loc[
        :,
        [
            "INTERVAL_DATETIME",
            "DUID",
            "MAXAVAIL",
            "ROCUP",
            "ROCDOWN",
            "PASAAVAILABILITY",
        ],
    ]
    after_dispatch_metrics = fetch_data.duid_availability_data(
        start_time, end_time, raw_data_cache
    )
    unit_time_series_metrics = preprocessing.calculate_unit_time_series_metrics(
        as_bid_metrics, after_dispatch_metrics
    )
    unit_time_series_metrics = preprocessing.add_on_hour_column(
        unit_time_series_metrics
    )
    unit_time_series_metrics["INTERVAL_DATETIME"] = unit_time_series_metrics[
        "INTERVAL_DATETIME"
    ].dt.strftime("%Y-%m-%d %X")
    unit_time_series_metrics = unit_time_series_metrics.sort_values("INTERVAL_DATETIME")
    return unit_time_series_metrics


[docs]def define_and_return_price_bins():
    """
    Defines the bins for aggregating bidding data.

    Examples:

    >>> define_and_return_price_bins()
              BIN_NAME  LOWER_EDGE  UPPER_EDGE
    0    [-1000, -100)       -2000        -100
    1        [-100, 0)        -100           0
    2          [0, 50)           0          50
    3        [50, 100)          50         100
    4       [100, 200)         100         200
    5       [200, 300)         200         300
    6       [300, 500)         300         500
    7      [500, 1000)         500        1000
    8     [1000, 5000)        1000        5000
    9    [5000, 10000)        5000       10000
    10  [10000, 15500)       10000       16000

    Returns:
        pandas dataframe with column bin_name, lower_edge and upper_edge
    """
    price_bins = pd.DataFrame(
        {
            "BIN_NAME": [
                "[-2000, -100)",
                "[-100, 0)",
                "[0, 50)",
                "[50, 100)",
                "[100, 200)",
                "[200, 300)",
                "[300, 500)",
                "[500, 1000)",
                "[1000, 5000)",
                "[5000, 10000)",
                "[10000, 16500)",
            ],
            "LOWER_EDGE": [-2000, -100, 0, 50, 100, 200, 300, 500, 1000, 5000, 10000],
            "UPPER_EDGE": [-100, 0, 50, 100, 200, 300, 500, 1000, 5000, 10000, 16500],
        }
    )
    return price_bins