Skip to content

Normalization ZScore

NormalizationZScore

Bases: NormalizationBaseClass

Implements Z-Score normalization for specified columns in a PySpark DataFrame.

Example

from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_zscore import NormalizationZScore
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame

normalization = NormalizationZScore(df, column_names=["value_column_1", "value_column_2"], in_place=False)
normalized_df = normalization.filter_data()

Parameters:

Name Type Description Default
df DataFrame

PySpark DataFrame to be normalized.

required
column_names List[str]

List of columns in the DataFrame to be normalized.

required
in_place bool

If True, the result of the normalization is stored in the same column, overwriting the original values.

False
Source code in src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class NormalizationZScore(NormalizationBaseClass):
    """
    Implements Z-Score normalization for specified columns in a PySpark DataFrame.

    Example
    --------
    ```python
    from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.normalization.normalization_zscore import NormalizationZScore
    from pyspark.sql import SparkSession
    from pyspark.sql.dataframe import DataFrame

    normalization = NormalizationZScore(df, column_names=["value_column_1", "value_column_2"], in_place=False)
    normalized_df = normalization.filter_data()
    ```

    Parameters:
        df (DataFrame): PySpark DataFrame to be normalized.
        column_names (List[str]): List of columns in the DataFrame to be normalized.
        in_place (bool): If true, then result of normalization is stored in the same column.
    """

    # Suffix used by _get_norm_column_name when in_place is False.
    NORMALIZED_COLUMN_NAME = "zscore"

    def _normalize_column(self, df: PySparkDataFrame, column: str) -> PySparkDataFrame:
        """
        Private method to apply Z-Score normalization to the specified column.
        Z-Score normalization: (value - mean) / std_dev

        Raises:
            ZeroDivisionError: If the column's standard deviation is zero,
                non-finite, or undefined (e.g. fewer than two non-null rows,
                where Spark's stddev returns null).
        """
        # Compute both aggregates in a single Spark action instead of two
        # separate collect() round-trips over the same DataFrame.
        mean_val, std_dev_val = df.select(
            F.mean(F.col(column)), F.stddev(F.col(column))
        ).first()

        # stddev is None when there are fewer than two non-null rows; check
        # for None first, since math.isclose(None, ...) would raise TypeError.
        if (
            std_dev_val is None
            or math.isclose(std_dev_val, 0.0, abs_tol=10e-8)
            or not math.isfinite(std_dev_val)
        ):
            raise ZeroDivisionError("Division by Zero in ZScore")

        store_column = self._get_norm_column_name(column)
        # Remember the transform parameters so _denormalize_column can invert it.
        self.reversal_value = [mean_val, std_dev_val]

        return df.withColumn(
            store_column, (F.col(column) - F.lit(mean_val)) / F.lit(std_dev_val)
        )

    def _denormalize_column(
        self, df: PySparkDataFrame, column: str
    ) -> PySparkDataFrame:
        """
        Private method to revert Z-Score normalization to the specified column.
        Z-Score denormalization: normalized_value * std_dev + mean = value

        Uses the mean and standard deviation captured by the most recent
        _normalize_column call (stored in self.reversal_value).
        """
        mean_val, std_dev_val = self.reversal_value

        store_column = self._get_norm_column_name(column)

        return df.withColumn(
            store_column, F.col(column) * F.lit(std_dev_val) + F.lit(mean_val)
        )