Skip to content

Read from Autoloader

DataBricksAutoLoaderSource

Bases: SourceInterface

The Spark Auto Loader is used to read new data files as they arrive in cloud storage. Further information on Auto Loader is available here

Parameters:

Name Type Description Default
spark SparkSession

Spark Session required to read data from cloud storage

required
options dict

Options that can be specified for configuring the Auto Loader. Further information on the available options is here

required
path str

The cloud storage path

required
format str

Specifies the file format to be read. Supported formats are available here

required
Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
class DataBricksAutoLoaderSource(SourceInterface):
    '''
    The Spark Auto Loader is used to read new data files as they arrive in cloud storage. Further information on Auto Loader is available [here](https://docs.databricks.com/ingestion/auto-loader/index.html)

    Args:
        spark (SparkSession): Spark Session required to read data from cloud storage
        options (dict): Options that can be specified for configuring the Auto Loader. Further information on the available options is [here](https://docs.databricks.com/ingestion/auto-loader/options.html)
        path (str): The cloud storage path
        format (str): Specifies the file format to be read. Supported formats are available [here](https://docs.databricks.com/ingestion/auto-loader/options.html#file-format-options)
    '''
    spark: SparkSession
    options: dict
    path: str

    def __init__(self, spark: SparkSession, options: dict, path: str, format: str) -> None:
        self.spark = spark
        # Copy the options so that injecting the "cloudFiles.format" key below
        # does not mutate the dict object the caller passed in.
        self.options = dict(options)
        self.path = path
        self.options["cloudFiles.format"] = format

    @staticmethod
    def system_type():
        '''
        Attributes:
            SystemType (Environment): Requires PYSPARK on Databricks
        '''
        return SystemType.PYSPARK_DATABRICKS

    @staticmethod
    def libraries():
        '''Returns the Maven libraries this component requires (Delta core).'''
        libraries = Libraries()
        libraries.add_maven_library(DEFAULT_PACKAGES["spark_delta_core"])
        return libraries

    @staticmethod
    def settings() -> dict:
        '''Returns additional Spark settings required by this component (none).'''
        return {}

    def pre_read_validation(self):
        '''No pre-read validation is performed; always returns True.'''
        return True

    def post_read_validation(self, df: DataFrame):
        '''No post-read validation is performed; always returns True.'''
        return True

    def read_batch(self):
        '''
        Raises:
            NotImplementedError: Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow=True` to perform batch-like reads of cloud storage files.
        '''
        raise NotImplementedError("Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method and specify Trigger on the write_stream as `availableNow=True`")

    def read_stream(self) -> DataFrame:
        '''
        Performs streaming reads of files in cloud storage.

        Returns:
            DataFrame: A streaming DataFrame of the files at the configured path.
        '''
        try:
            return (self.spark
                .readStream
                .format("cloudFiles")
                .options(**self.options)
                .load(self.path)
            )

        except Exception as e:
            # Log the full traceback before re-raising so the failure surfaces
            # in pipeline logs as well as to the caller.
            logging.exception(str(e))
            raise e

read_batch()

Raises:

Type Description
NotImplementedError

Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be availableNow=True to perform batch-like reads of cloud storage files.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
66
67
68
69
70
71
def read_batch(self):
    '''
    Batch reads are not supported by Auto Loader.

    Raises:
        NotImplementedError: Auto Loader only supports streaming reads. To perform a batch read, use the read_stream method of this component and specify the Trigger on the write_stream to be `availableNow=True` to perform batch-like reads of cloud storage files.
    '''
    message = (
        "Auto Loader only supports streaming reads. "
        "To perform a batch read, use the read_stream method and specify "
        "Trigger on the write_stream as `availableNow=True`"
    )
    raise NotImplementedError(message)

read_stream()

Performs streaming reads of files in cloud storage.

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def read_stream(self) -> DataFrame:
    '''
    Performs streaming reads of files in cloud storage.
    '''
    try:
        # Configure the cloudFiles reader step by step, then load the path.
        reader = self.spark.readStream.format("cloudFiles")
        configured = reader.options(**self.options)
        return configured.load(self.path)

    except Exception as e:
        logging.exception(str(e))
        raise e

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK on Databricks

Source code in src/sdk/python/rtdip_sdk/pipelines/sources/spark/autoloader.py
42
43
44
45
46
47
48
@staticmethod
def system_type():
    '''
    Reports the execution environment this component requires.

    Attributes:
        SystemType (Environment): Requires PYSPARK on Databricks
    '''
    return SystemType.PYSPARK_DATABRICKS