ERCOT Daily Load

`ERCOTDailyLoadISOSource`

Bases: BaseISOSource

The ERCOT Daily Load ISO Source is used to read daily load data from ERCOT using web scraping. It supports actual and forecast data. To read more about the reports, visit the following URLs (the URLs are only accessible if the requester/client is in the US):

For load type actual: Actual System Load by Weather Zone
For load type forecast: Seven-Day Load Forecast by Weather Zone

Parameters:

Name	Type	Description	Default
`spark`	`SparkSession`	Spark Session instance	required
`options`	`dict`	A dictionary of ISO Source specific configurations (See Attributes table below)	required

Attributes:

Name	Type	Description
`load_type`	`str`	Must be one of `actual` or `forecast`.
`date`	`str`	Must be in `YYYY-MM-DD` format.
`certificate_pfx_key`	`str`	The certificate key data or password received from ERCOT.
`certificate_pfx_key_contents`	`str`	The certificate data received from ERCOT, it could be base64 encoded.

Please check the BaseISOSource for available methods.

BaseISOSource

`BaseISOSource`

Bases: SourceInterface

Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

Parameters:

Name	Type	Description	Default
`spark`	`SparkSession`	Spark Session instance	required
`options`	`dict`	A dictionary of ISO Source specific configurations	required

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py

class BaseISOSource(SourceInterface):
    """
    Base class for all the ISO Sources. It provides common functionality and helps in reducing the code redundancy.

    The read flow is: `pre_read_validation` -> `_pull_data` -> `_prepare_data` ->
    `_sanitize_data` -> column reordering -> conversion to a Spark DataFrame.
    Child classes customise the flow by overriding the underscore-prefixed hooks.

    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations
    """

    spark: SparkSession
    options: dict
    # Prefix that `_fetch_from_url` prepends to every requested suffix.
    iso_url: str = "https://"
    # strptime format used by `_get_localized_datetime`.
    query_datetime_format: str = "%Y%m%d"
    # Option keys that must be present; checked in `pre_read_validation`.
    required_options: list = []
    # Schema of the resulting Spark DataFrame; also fixes the column order.
    spark_schema = StructType([StructField("id", IntegerType(), True)])
    # Timezone used when the `query_timezone` option is not supplied.
    default_query_timezone: str = "UTC"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        self.spark = spark
        self.options = options
        self.query_timezone = pytz.timezone(
            self.options.get("query_timezone", self.default_query_timezone)
        )
        self.current_date = datetime.now(timezone.utc).astimezone(self.query_timezone)

    def _fetch_from_url(self, url_suffix: str) -> bytes:
        """
        Gets data from external ISO API.

        Args:
            url_suffix: String to be used as suffix to iso url.

        Returns:
            Raw content of the data received.

        Raises:
            HTTPError: If the response status code is anything other than 200.

        """
        url = f"{self.iso_url}{url_suffix}"
        logging.info(f"Requesting URL - {url}")

        # NOTE(review): no timeout is passed here, so a stalled server can block
        # the call indefinitely — consider making it configurable.
        response = requests.get(url)
        code = response.status_code

        if code != 200:
            raise HTTPError(
                f"Unable to access URL `{url}`."
                f" Received status code {code} with message {response.content}"
            )

        return response.content

    def _get_localized_datetime(self, datetime_str: str) -> datetime:
        """
        Converts string datetime into Python datetime object with configured format and timezone.

        Args:
            datetime_str: String to be converted into datetime.

        Returns: Timezone aware datetime object.

        """
        parsed_dt = datetime.strptime(datetime_str, self.query_datetime_format)
        # pytz zones must be attached with localize(); replace(tzinfo=...) would
        # pick the zone's first recorded offset (usually LMT) and yield a wrong
        # UTC offset for most non-UTC timezones.
        return self.query_timezone.localize(parsed_dt)

    def _pull_data(self) -> pd.DataFrame:
        """
        Hits the fetch_from_url method with certain parameters to get raw data from API.

        All the children ISO classes must override this method and call the fetch_url method
        in it.

        Returns:
             Raw DataFrame from API.
        """

        return pd.read_csv(BytesIO(self._fetch_from_url("")))

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Performs all the basic transformations to prepare data for further processing.
        All the children ISO classes must override this method.

        Args:
            df: Raw DataFrame, received from the API.

        Returns:
             Modified DataFrame, ready for basic use.

        """
        return df

    def _sanitize_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Another data transformation helper method to be called after prepare data.
        Used for advance data processing such as cleaning, filtering, restructuring.
        All the children ISO classes must override this method if there is any post-processing required.

        Args:
            df: Initial modified version of DataFrame, received after preparing the data.

        Returns:
             Final version of data after all the fixes and modifications.

        """
        return df

    def _get_data(self) -> pd.DataFrame:
        """
        Entrypoint method to return the final version of DataFrame.

        Returns:
            Modified form of data for specific use case.

        """
        df = self._pull_data()
        df = self._prepare_data(df)
        df = self._sanitize_data(df)

        # Reorder columns to keep the data consistent
        df = df[self.spark_schema.names]

        return df

    @staticmethod
    def system_type():
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def _validate_options(self) -> bool:
        """
        Performs all the options checks. Raises exception in case of any invalid value.
        Returns:
             True if all checks are passed.

        """
        return True

    def pre_read_validation(self) -> bool:
        """
        Ensures all the required options are provided and performs other validations.
        Returns:
             True if all checks are passed.

        Raises:
            ValueError: If any key listed in `required_options` is missing from `options`.

        """
        for key in self.required_options:
            if key not in self.options:
                raise ValueError(f"Required option `{key}` is missing.")

        return self._validate_options()

    def post_read_validation(self) -> bool:
        return True

    def read_batch(self) -> DataFrame:
        """
        Spark entrypoint, It executes the entire process of pulling, transforming & fixing data.
        Returns:
             Final Spark DataFrame converted from Pandas DataFrame post-execution.

        """

        try:
            self.pre_read_validation()
            pdf = self._get_data()
            pdf = _prepare_pandas_to_convert_to_spark(pdf)

            # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
            pd.DataFrame.iteritems = pd.DataFrame.items
            df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
            return df

        except Exception as e:
            # Log with traceback, then re-raise the active exception unchanged.
            logging.exception(str(e))
            raise

    def read_stream(self) -> DataFrame:
        """
        By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

        Returns:
             Final Spark DataFrame after all the processing.

        Raises:
            NotImplementedError: Always, unless overridden by a child class.

        """

        raise NotImplementedError(
            f"{self.__class__.__name__} connector doesn't support stream operation."
        )

`pre_read_validation()`

Ensures all the required options are provided and performs other validations. Returns: True if all checks are passed.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py

def pre_read_validation(self) -> bool:
    """
    Checks that every required option was supplied, then runs the option-specific checks.

    Returns:
         The result of `_validate_options()` once all required keys are present.

    Raises:
        ValueError: For the first key in `required_options` that is absent from `options`.

    """
    supplied = self.options
    for required_key in self.required_options:
        if required_key in supplied:
            continue
        raise ValueError(f"Required option `{required_key}` is missing.")

    outcome = self._validate_options()
    return outcome

`read_batch()`

Spark entrypoint. It executes the entire process of pulling, transforming & fixing data. Returns: Final Spark DataFrame converted from Pandas DataFrame post-execution.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py

def read_batch(self) -> DataFrame:
    """
    Spark entrypoint, It executes the entire process of pulling, transforming & fixing data.

    Returns:
         Final Spark DataFrame converted from Pandas DataFrame post-execution.

    Raises:
        Exception: Any error raised during validation, data retrieval or conversion is
            logged and then re-raised unchanged.

    """

    try:
        self.pre_read_validation()
        pdf = self._get_data()
        pdf = _prepare_pandas_to_convert_to_spark(pdf)

        # The below is to fix the compatibility issues between Pandas 2.0 and PySpark.
        pd.DataFrame.iteritems = pd.DataFrame.items
        df = self.spark.createDataFrame(data=pdf, schema=self.spark_schema)
        return df

    except Exception as e:
        # Log with traceback, then re-raise the active exception unchanged.
        logging.exception(str(e))
        raise

`read_stream()`

By default, the streaming operation is not supported but child classes can override if ISO supports streaming.

Returns:

Type	Description
`DataFrame`	Final Spark DataFrame after all the processing.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/base_iso.py

def read_stream(self) -> DataFrame:
    """
    Streaming entrypoint. Not supported by default; a child class may override it
    when its ISO offers streaming.

    Returns:
         Final Spark DataFrame after all the processing.

    Raises:
        NotImplementedError: Always, naming the concrete class that was called.

    """
    connector_name = type(self).__name__
    message = f"{connector_name} connector doesn't support stream operation."
    raise NotImplementedError(message)

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/iso/ercot_daily_load_iso.py

class ERCOTDailyLoadISOSource(BaseISOSource):
    """
    The ERCOT Daily Load ISO Source is used to read daily load data from ERCOT using web scraping.
    It supports actual and forecast data. To read more about the reports, visit the following URLs
    (The urls are only accessible if the requester/client is in US)-

    For load type `actual`: [Actual System Load by Weather Zone](https://www.ercot.com/mp/data-products/
    data-product-details?id=NP6-345-CD)
    <br>
    For load type `forecast`: [Seven-Day Load Forecast by Weather Zone](https://www.ercot.com/mp/data-products/
    data-product-details?id=NP3-561-CD)


    Parameters:
        spark (SparkSession): Spark Session instance
        options (dict): A dictionary of ISO Source specific configurations (See Attributes table below)

    Attributes:
        load_type (str): Must be one of `actual` or `forecast`.
        date (str): Must be in `YYYY-MM-DD` format.
        certificate_pfx_key (str): The certificate key data or password received from ERCOT.
        certificate_pfx_key_contents (str): The certificate data received from ERCOT, it could be base64 encoded.

    Please check the BaseISOSource for available methods.

    BaseISOSource:
        ::: src.sdk.python.rtdip_sdk.pipelines.sources.spark.iso.base_iso
    """

    spark: SparkSession
    options: dict
    url_forecast: str = "https://mis.ercot.com/misapp/GetReports.do?reportTypeId=12312"
    url_actual: str = "https://mis.ercot.com/misapp/GetReports.do?reportTypeId=13101"
    url_prefix: str = "https://mis.ercot.com"
    query_datetime_format: str = "%Y-%m-%d"
    required_options = [
        "load_type",
        "date",
        "certificate_pfx_key",
        "certificate_pfx_key_contents",
    ]
    spark_schema = ERCOT_SCHEMA
    default_query_timezone = "UTC"

    def __init__(self, spark: SparkSession, options: dict) -> None:
        # The base initialiser already stores `spark` and `options`.
        super().__init__(spark, options)
        self.load_type = self.options.get("load_type", "actual")
        self.date = self.options.get("date", "").strip()
        self.certificate_pfx_key = self.options.get("certificate_pfx_key", "").strip()
        self.certificate_pfx_key_contents = self.options.get(
            "certificate_pfx_key_contents", ""
        ).strip()

    def generate_temp_client_cert_files_from_pfx(self):
        """
        Extracts the private key and certificate from the configured PFX data.

        The PFX contents are treated as base64 when they survive a decode/encode
        round trip; otherwise they are passed to the loader unchanged.

        Returns:
            A `TempCertFiles` object built from the PEM-encoded certificate and key.
        """
        password = self.certificate_pfx_key.encode()
        raw_contents = self.certificate_pfx_key_contents

        try:
            pfx = base64.b64decode(raw_contents)
        except ValueError:
            # binascii.Error (a ValueError) is raised for e.g. incorrect padding;
            # that simply means the contents are not base64 encoded.
            pfx = None

        if pfx is None or base64.b64encode(pfx) != raw_contents.encode():
            pfx = raw_contents

        key, cert, _ = pkcs12.load_key_and_certificates(data=pfx, password=password)
        key_bytes = key.private_bytes(
            encoding=serialization.Encoding.PEM,
            format=serialization.PrivateFormat.TraditionalOpenSSL,
            encryption_algorithm=serialization.NoEncryption(),
        )

        cert_bytes = cert.public_bytes(encoding=serialization.Encoding.PEM)
        return TempCertFiles(cert_bytes, key_bytes)

    def _pull_data(self) -> pd.DataFrame:
        """
        Pulls data from the ERCOT API and parses the zip files for CSV data.

        Returns:
            Raw form of data.
        """

        logging.info(f"Getting {self.load_type} data for date {self.date}")
        url = self.url_forecast
        req_date = datetime.strptime(self.date, self.query_datetime_format)

        if self.load_type == "actual":
            # The file for an actual-load day is looked up under the following day's date.
            req_date = req_date + timedelta(days=1)
            url = self.url_actual

        url_lists, files = self.generate_urls_for_zip(url, req_date)
        logging.info(f"Generated {len(url_lists)} URLs - {url_lists}")
        logging.info(f"Requesting files - {files}")

        dfs = [self.download_zip(zip_url) for zip_url in url_lists]
        return pd.concat(dfs)

    def download_zip(self, url) -> pd.DataFrame:
        """
        Downloads a zip archive and reads the first CSV member it contains.

        Args:
            url: Full URL of the zip archive.

        Returns:
            DataFrame parsed from the first archive member whose name contains `.csv`.

        Raises:
            HTTPError: If the response body is empty.
            ValueError: If the archive contains no CSV member.
        """
        logging.info(f"Downloading zip using {url}")
        with self.generate_temp_client_cert_files_from_pfx() as cert:
            response = requests.get(url, cert=cert)

        if not response.content:
            raise HTTPError("Empty Response was returned")

        logging.info("Unzipping the file")
        # Context managers make sure the archive and member handles are released.
        with ZipFile(BytesIO(response.content)) as zf:
            csvs = [s for s in zf.namelist() if ".csv" in s]

            if not csvs:
                raise ValueError("No data was found in the specified interval")

            with zf.open(csvs[0]) as csv_file:
                return pd.read_csv(csv_file)

    def generate_urls_for_zip(self, url: str, date: datetime) -> (List[str], List[str]):
        """
        Scrapes the report listing page and picks the zip file for the given date.

        Args:
            url: URL of the report listing page.
            date: Date whose `YYYYMMDD` form must appear in the file name.

        Returns:
            A pair of single-element lists: the download URL and the file name.

        Raises:
            ValueError: If no `csv.zip` file matching the date is listed.
        """
        logging.info(f"Finding urls list for date {date}")
        with self.generate_temp_client_cert_files_from_pfx() as cert:
            page_response = requests.get(url, timeout=5, cert=cert)

        page_content = BeautifulSoup(page_response.content, "html.parser")
        # Parse the page once; the i-th name cell is paired with the i-th anchor.
        name_cells = page_content.find_all("td", {"class": "labelOptional_ind"})
        links = page_content.find_all("a")
        zip_info = [
            (cell.text, links[i].get("href")) for i, cell in enumerate(name_cells)
        ]

        date_str = date.strftime("%Y%m%d")
        zip_info = [
            f_info
            for f_info in zip_info
            if f_info[0].endswith("csv.zip") and date_str in f_info[0]
        ]

        if not zip_info:
            raise ValueError(f"No file was found for date - {date_str}")

        # As Forecast is generated every hour, pick the latest one.
        file_name, file_url = max(zip_info, key=lambda item: item[0])

        return [self.url_prefix + file_url], [file_name]

    def _prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Adds a parsed `Date` column and renames columns according to the load type.

        Args:
            df: Raw DataFrame read from the downloaded CSV.

        Returns:
            DataFrame with the `Date` column added and columns renamed.
        """
        if self.load_type == "actual":
            df["Date"] = pd.to_datetime(df["OperDay"], format="%m/%d/%Y")

            df = df.rename(
                columns={
                    "COAST": "Coast",
                    "EAST": "East",
                    "FAR_WEST": "FarWest",
                    "NORTH": "North",
                    "NORTH_C": "NorthCentral",
                    "SOUTH_C": "SouthCentral",
                    "SOUTHERN": "Southern",
                    "WEST": "West",
                    "TOTAL": "SystemTotal",
                    "DSTFlag": "DstFlag",
                }
            )

        else:
            df = df.rename(columns={"DSTFlag": "DstFlag"})

            df["Date"] = pd.to_datetime(df["DeliveryDate"], format="%m/%d/%Y")

        return df

    def _validate_options(self) -> bool:
        """
        Checks that the `date` option matches `query_datetime_format`.

        Returns:
            True if the date can be parsed.

        Raises:
            ValueError: If the date does not match the expected format.
        """
        try:
            datetime.strptime(self.date, self.query_datetime_format)
        except ValueError as err:
            raise ValueError(
                f"Unable to parse date. Please specify in {self.query_datetime_format} format."
            ) from err
        return True