Skip to content

Convert SEM Json to Process Control Data Model

SEMJsonToPCDMTransformer

Bases: TransformerInterface

Converts a Spark Dataframe column containing a json string created by SEM to the Process Control Data Model.

Example

from rtdip_sdk.pipelines.transformers import SEMJsonToPCDMTransformer

sem_json_to_pcdm_transformer = SEMJsonToPCDMTransformer(
    data=df,
    source_column_name="body",
    version=10,
    status_null_value="Good",
    change_type_value="insert"
)

result = sem_json_to_pcdm_transformer.transform()

Parameters:

Name Type Description Default
data DataFrame

Dataframe containing the column with SEM data

required
source_column_name str

Spark Dataframe column containing the OPC Publisher Json OPC UA data

required
version int

The version for the OBC field mappings. The latest version is 10.

required
status_null_value optional str

If populated, will replace 'Good' in the Status column with the specified value.

'Good'
change_type_value optional str

If populated, will replace 'insert' in the ChangeType column with the specified value.

'insert'
Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/sem_json_to_pcdm.py
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
class SEMJsonToPCDMTransformer(TransformerInterface):
    """
    Converts a Spark Dataframe column containing a json string created by SEM to the Process Control Data Model.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.transformers import SEMJsonToPCDMTransformer

    sem_json_to_pcdm_transformer = SEMJsonToPCDMTransformer(
        data=df,
        source_column_name="body",
        version=10,
        status_null_value="Good",
        change_type_value="insert"
    )

    result = sem_json_to_pcdm_transformer.transform()
    ```

    Parameters:
        data (DataFrame): Dataframe containing the column with SEM data
        source_column_name (str): Spark Dataframe column containing the OPC Publisher Json OPC UA data
        version (int): The version for the OBC field mappings. The latest version is 10.
        status_null_value (optional str): If populated, will replace 'Good' in the Status column with the specified value.
        change_type_value (optional str): If populated, will replace 'insert' in the ChangeType column with the specified value.
    """

    data: DataFrame
    source_column_name: str
    version: int
    status_null_value: str
    change_type_value: str

    def __init__(
        self,
        data: DataFrame,
        source_column_name: str,
        version: int,
        status_null_value: str = "Good",
        change_type_value: str = "insert",
    ) -> None:
        # DataFrame.melt (used in transform) was introduced in pyspark 3.4.0,
        # hence the minimum-version guard here.
        _package_version_meets_minimum("pyspark", "3.4.0")
        self.data = data
        self.source_column_name = source_column_name
        self.version = version
        self.status_null_value = status_null_value
        self.change_type_value = change_type_value

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        # No additional Maven/PyPI libraries are required beyond pyspark itself.
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def pre_transform_validation(self):
        return True

    def post_transform_validation(self):
        return True

    def transform(self) -> DataFrame:
        """
        Converts the configured source column from SEM json to PCDM rows.

        Returns:
            DataFrame: A dataframe with the specified column converted to PCDM

        Raises:
            ValueError: If an unsupported version was specified.
        """
        if self.version == 10:
            mapping = obc_field_mappings.OBC_FIELD_MAPPINGS_V10
            df = (
                self.data.withColumn(
                    self.source_column_name,
                    from_json(self.source_column_name, SEM_SCHEMA),
                )
                .select(self.source_column_name + ".readings")
                # Unpivot the readings array so each (resourceName, value)
                # pair becomes its own row.
                .melt(
                    ids=["readings.resourceName"],
                    values=["readings.value"],
                    variableColumnName="var",
                    valueColumnName="value",
                )
                .drop("var")
                .select(map_from_arrays("resourceName", "value").alias("resourceName"))
                .select("resourceName.dID", "resourceName.d", "resourceName.t")
                .select(
                    # "t" carries an epoch timestamp with sub-second digits
                    # appended; insert a decimal point after the first 10
                    # digits (whole seconds) so it parses as epoch seconds.
                    # Raw string avoids invalid \d escape warnings.
                    regexp_replace(col("t").cast("string"), r"(\d{10})(\d+)", "$1.$2")
                    .cast("double")
                    .alias("timestamp"),
                    "dID",
                    # "d" is a bracketed, comma-separated list; strip the
                    # surrounding brackets and explode it with its position.
                    posexplode(split(expr("substring(d, 2, length(d)-2)"), ",")),
                )
                .select(
                    to_timestamp("timestamp").alias("EventTime"),
                    col("dID"),
                    col("pos").cast("string"),
                    col("col").alias("Value"),
                )
                .withColumn(
                    "TagName",
                    # The exploded position indexes into the OBC field
                    # mapping to recover the tag name for each value.
                    concat(
                        col("dID"),
                        lit(":"),
                        udf(lambda row: mapping[row]["TagName"])(col("pos")),
                    ),
                )
                .withColumn(
                    "ValueType", udf(lambda row: mapping[row]["ValueType"])(col("pos"))
                )
                .withColumn("Status", lit(self.status_null_value))
                .withColumn("ChangeType", lit(self.change_type_value))
            )
            return df.select(
                "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
            )
        else:
            # Fail loudly instead of `return logging.exception(...)`, which is
            # only valid inside an except handler and would silently return
            # None to a caller expecting a DataFrame.
            raise ValueError(
                "The wrong version was specified. Please use the latest version"
            )

system_type() staticmethod

Attributes:

Name Type Description
SystemType Environment

Requires PYSPARK

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/sem_json_to_pcdm.py
87
88
89
90
91
92
93
@staticmethod
def system_type():
    """
    Declares the compute environment this component requires.

    Attributes:
        SystemType (Environment): Requires PYSPARK
    """
    return SystemType.PYSPARK

transform()

Returns:

Name Type Description
DataFrame DataFrame

A dataframe with the specified column converted to PCDM

Source code in src/sdk/python/rtdip_sdk/pipelines/transformers/spark/sem_json_to_pcdm.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def transform(self) -> DataFrame:
    """
    Converts the configured source column from SEM json to PCDM rows.

    Returns:
        DataFrame: A dataframe with the specified column converted to PCDM

    Raises:
        ValueError: If an unsupported version was specified.
    """
    if self.version == 10:
        mapping = obc_field_mappings.OBC_FIELD_MAPPINGS_V10
        df = (
            self.data.withColumn(
                self.source_column_name,
                from_json(self.source_column_name, SEM_SCHEMA),
            )
            .select(self.source_column_name + ".readings")
            # Unpivot the readings array so each (resourceName, value)
            # pair becomes its own row.
            .melt(
                ids=["readings.resourceName"],
                values=["readings.value"],
                variableColumnName="var",
                valueColumnName="value",
            )
            .drop("var")
            .select(map_from_arrays("resourceName", "value").alias("resourceName"))
            .select("resourceName.dID", "resourceName.d", "resourceName.t")
            .select(
                # Insert a decimal point after the first 10 digits (whole
                # epoch seconds) so the value parses as fractional seconds.
                # Raw string avoids invalid \d escape warnings.
                regexp_replace(col("t").cast("string"), r"(\d{10})(\d+)", "$1.$2")
                .cast("double")
                .alias("timestamp"),
                "dID",
                # "d" is a bracketed, comma-separated list; strip the
                # surrounding brackets and explode it with its position.
                posexplode(split(expr("substring(d, 2, length(d)-2)"), ",")),
            )
            .select(
                to_timestamp("timestamp").alias("EventTime"),
                col("dID"),
                col("pos").cast("string"),
                col("col").alias("Value"),
            )
            .withColumn(
                "TagName",
                # Position indexes into the OBC field mapping to recover
                # the tag name for each exploded value.
                concat(
                    col("dID"),
                    lit(":"),
                    udf(lambda row: mapping[row]["TagName"])(col("pos")),
                ),
            )
            .withColumn(
                "ValueType", udf(lambda row: mapping[row]["ValueType"])(col("pos"))
            )
            .withColumn("Status", lit(self.status_null_value))
            .withColumn("ChangeType", lit(self.change_type_value))
        )
        return df.select(
            "EventTime", "TagName", "Status", "Value", "ValueType", "ChangeType"
        )
    else:
        # Fail loudly instead of `return logging.exception(...)`, which is
        # only valid inside an except handler and would silently return
        # None to a caller expecting a DataFrame.
        raise ValueError(
            "The wrong version was specified. Please use the latest version"
        )