Source code for pytorch_utils.dataset_configurations

from __future__ import annotations

import copy
import itertools
from collections import UserList
from dataclasses import asdict, dataclass, field
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type

import numpy as np
import pandas as pd

# TYPE_CHECKING is False at runtime but treated as True by type checkers,
# so the interpreter never imports pyspark, while type checkers can still
# resolve the pyspark.sql names referenced below.
if TYPE_CHECKING:
    import pyspark.sql

    # We only import pyspark when type checking (i.e. when building this project),
    # never at runtime. Keeping heavy optional dependencies out of the runtime
    # requirements means they need not be installed alongside this ML project
    # (e.g. for the inference part -> API).

from pytorch_utils.exceptions import InconsistentDatasetConfigurations
from pytorch_utils.logging.loggers import (
    Logger,
    VoidLogger,
    SingleLoggerDataclassLoggable,
    use_loggers,
)
from pytorch_utils.pandas.data_formatting import (
    PandasFormatter,
    PandasIdentityFormatter,
)
# from pytorch_utils.io.interface import MetaDataFrame


@dataclass(frozen=True)
class DataSplitConfig(SingleLoggerDataclassLoggable):
    """
    Configuration used to specify train, validation and test splits.

    The proportions should all lie in [0, 1], with their sum smaller than or
    equal to 1. When the sum is strictly smaller than 1, only a random subset
    of the data is used.

    Attributes
    ----------
    training_proportion : float, default=1.0
        Proportion of the dataframe to be used as training samples.
    validation_proportion : float, default=0.0
        Proportion of the dataframe to be used as validation samples.
    test_proportion : float, default=0.0
        Proportion of the dataframe to be used as test samples.
    random_seed : Optional[int], default=None
        Random seed used for random splitting.
    stratify : Optional[List[str]], default=None
        List of column names used to stratify the data
        (see also sklearn.model_selection.train_test_split).
    """

    training_proportion: float = 1.0
    validation_proportion: float = 0.0
    test_proportion: float = 0.0
    random_seed: Optional[int] = None
    stratify: Optional[List[str]] = None
    logger: Logger = field(default=VoidLogger(), repr=False, compare=False)

    def __post_init__(self):
        # Check attribute values
        if not 0.0 <= self.training_proportion <= 1.0:
            raise ValueError("training_proportion should belong to unit interval [0,1]")
        if not 0.0 <= self.validation_proportion <= 1.0:
            raise ValueError("validation_proportion should belong to unit interval [0,1]")
        if not 0.0 <= self.test_proportion <= 1.0:
            raise ValueError("test_proportion should belong to unit interval [0,1]")
        if (
            self.training_proportion + self.validation_proportion + self.test_proportion
            > 1.0 + 1e-6
        ):
            raise ValueError("The sum of all proportions should be at most 1")
        # Sort the columns used to stratify splits. This ensures that two
        # dataclasses that only differ by the ordering of `stratify` are
        # considered equal.
        object.__setattr__(
            self,
            "stratify",
            sorted(self.stratify) if self.stratify is not None else self.stratify,
        )
        # Make this class comply with the Loggable protocol
        object.__setattr__(self, "loggers", {"logger": self.logger})
    def train_valid_test_split(self, df: pd.DataFrame) -> Tuple[pd.Index, pd.Index, pd.Index]:
        def grouped_df(df: pd.DataFrame) -> pd.DataFrame:
            return (
                df.assign(
                    **{
                        # In case some values in self.stratify are null, we would
                        # need to pass dropna=False to the pandas groupby method,
                        # but this makes some aggregation functions (like "sample")
                        # raise a KeyError. The alternative used here is to first
                        # replace null values by values with the same dtype that
                        # are not already present.
                        # N.B: this works for both strings and numeric types.
                        f"{col}_bis_temp": df[col].fillna(df[col].max() * 2)
                        for col in self.stratify
                    }
                ).groupby(by=[f"{col}_bis_temp" for col in self.stratify])
                if self.stratify
                else df
            )

        training_indices = (
            grouped_df(df)
            .sample(frac=self.training_proportion, random_state=self.random_seed)
            .index
        )
        val_test_data = df[~df.index.isin(training_indices)]
        validation_indices = (
            grouped_df(val_test_data)
            .sample(
                frac=min(
                    self.validation_proportion / (1 - self.training_proportion)
                    if self.validation_proportion > 0
                    else 0.0,
                    1.0,
                ),
                random_state=self.random_seed,
            )
            .index
        )
        test_data = df[~df.index.isin(training_indices.union(validation_indices))]
        test_indices = (
            grouped_df(test_data)
            .sample(
                frac=min(
                    self.test_proportion
                    / (1 - self.training_proportion - self.validation_proportion)
                    if self.test_proportion > 0
                    else 0.0,
                    1.0,
                ),
                random_state=self.random_seed,
            )
            .index
        )
        return training_indices, validation_indices, test_indices
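A minimal usage sketch of this class (not part of the library source; the dataframe and column names are illustrative): splitting 100 rows 70/15/15 with stratification produces three disjoint index sets.

# Hypothetical usage sketch for DataSplitConfig.
import pandas as pd
from pytorch_utils.dataset_configurations import DataSplitConfig

df = pd.DataFrame({"x": range(100), "group": [0, 1] * 50})
split_config = DataSplitConfig(
    training_proportion=0.7,
    validation_proportion=0.15,
    test_proportion=0.15,
    random_seed=42,
    stratify=["group"],  # keep the 0/1 ratio balanced across the three splits
)
train_idx, val_idx, test_idx = split_config.train_valid_test_split(df)
assert len(train_idx) == 70  # 70% of the 100 rows
assert train_idx.intersection(val_idx).empty  # the splits are disjoint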
@dataclass(frozen=True)
class DataAugmentationConfig(SingleLoggerDataclassLoggable):
    """
    Configuration used to specify data augmentation on a specific column
    (`augmented_col`).

    The idea of this data augmentation is to duplicate the data several times,
    with only `augmented_col` changed, by multiplying the original values with
    a scaling factor. This can be useful when there is some monotone
    relationship between a covariate (`augmented_col`) and the success of an
    event (0=failure, 1=success).

    Attributes
    ----------
    augmented_col : str
        Name of the column to augment.
    scaling_factors : np.ndarray
        Numpy array of floats corresponding to the scaling factors used for
        data augmentation.
    min_value : float, default=-inf
        Scaled values below this bound are dropped.
    max_value : float, default=inf
        Scaled values above this bound are dropped.
    """

    augmented_col: str
    # Use a default_factory: a bare np.array default is mutable/unhashable
    # and rejected by dataclasses on recent Python versions.
    scaling_factors: np.ndarray = field(default_factory=lambda: np.array([1.0]))
    min_value: float = -float("inf")
    max_value: float = float("inf")
    logger: Logger = field(default=VoidLogger(), repr=False, compare=False)

    def __post_init__(self):
        # Sort the scaling factors. This ensures that two dataclasses that only
        # differ by the ordering of `scaling_factors` are considered equal.
        object.__setattr__(self, "scaling_factors", np.sort(self.scaling_factors))
        # Make this class comply with the Loggable protocol
        object.__setattr__(self, "loggers", {"logger": self.logger})
    def augment_data(self, data: pd.DataFrame) -> pd.DataFrame:
        return pd.concat(
            [
                self.scale_col(
                    data,
                    scaling_factor,
                    self.augmented_col,
                    min_value=self.min_value,
                    max_value=self.max_value,
                )
                for scaling_factor in self.scaling_factors
            ],
            axis=0,
            ignore_index=True,
            sort=True,
        )
    @staticmethod
    def scaling_filter(
        data: pd.DataFrame,
        scaling_factor: float,
        col: str,
        min_value: float = -float("inf"),
        max_value: float = float("inf"),
    ) -> pd.Series:
        """
        Boolean mask used to drop scaled values that fall outside the range
        [min_value, max_value]. See method `scale_col`.
        """
        scaled = data[col] * scaling_factor
        return (scaled >= min_value) & (scaled <= max_value)
    @staticmethod
    def scale_col(
        data: pd.DataFrame,
        scaling_factor: float,
        col: str,
        min_value: float = -float("inf"),
        max_value: float = float("inf"),
    ) -> pd.DataFrame:
        """
        Return a pandas dataframe identical to `data` with column `col` scaled
        by `scaling_factor`. Scaled values that fall outside the range
        [min_value, max_value] are dropped.
        """
        scale_filter = DataAugmentationConfig.scaling_filter(
            data, scaling_factor, col, min_value, max_value
        )
        output_df = data[scale_filter].copy()
        output_df[col] *= scaling_factor
        return output_df
    @staticmethod
    def scaling_length(
        data: pd.DataFrame,
        scaling_factor: float,
        col: str,
        min_value: float = -float("inf"),
        max_value: float = float("inf"),
    ) -> int:
        """
        Length of the pandas dataframe obtained when calling method `scale_col`
        with the exact same input. The implementation does not require
        explicitly building the dataframe.
        """
        scale_filter = DataAugmentationConfig.scaling_filter(
            data, scaling_factor, col, min_value, max_value
        )
        return int(scale_filter.sum())
    def augmentation_length(self, data: pd.DataFrame) -> int:
        """
        Length of the pandas dataframe obtained when calling method
        `augment_data` with the exact same input. The implementation does not
        require explicitly building the dataframe.
        """
        return sum(
            self.scaling_length(
                data,
                scaling_factor,
                self.augmented_col,
                min_value=self.min_value,
                max_value=self.max_value,
            )
            for scaling_factor in self.scaling_factors
        )
[docs] @use_loggers("logger") def log(self, logger: Logger) -> None: params = asdict(self) del params["logger"] logger.log_params(params)
@dataclass(frozen=True)
class AugmentedBernoulliDatasetConfig(SingleLoggerDataclassLoggable):
    """
    Dataset configuration for augmented Bernoulli samples (binary outcomes:
    successful or not).

    There are two ways to construct an instance of
    `AugmentedBernoulliDatasetConfig`:

    1) by calling the constructor and passing a pandas dataframe
       (with optional metadata) as input, or
    2) by calling the class method `from_meta_dataframe` and passing a delta
       table as input.

    Method 1 is preferred for testing/debugging/prototyping, while method 2 is
    preferred for production and traceable experiments (clean metadata, etc.).

    Attributes
    ----------
    data : pandas.DataFrame
        The pandas dataframe containing the data.
    is_success : bool
        Whether the samples correspond to successful events or not.
    data_augmentation_config : DataAugmentationConfig
        The configuration for data augmentation.
    split_config : DataSplitConfig
        The configuration for splitting between train, validation and test.
    metadata : Dict[str, Any], default={}
        Any information regarding the source data that we wish to track/save.
    """

    data: pd.DataFrame = field(repr=False)
    is_success: bool
    data_augmentation_config: DataAugmentationConfig
    split_config: DataSplitConfig = DataSplitConfig()
    metadata: Dict[str, Any] = field(default_factory=dict)
    logger: Logger = field(default=VoidLogger(), repr=False, compare=False)

    def __post_init__(self):
        # N.B: sort the pandas data according to all columns (in alphabetical
        # order) for reproducibility. Sorting happens *before* the split so
        # that the positional indices keep pointing at the intended rows.
        object.__setattr__(
            self, "data", self.data.sort_values(by=self.columns).reset_index(drop=True)
        )
        # Compute training, validation and test indices
        (
            training_indices,
            validation_indices,
            test_indices,
        ) = self.split_config.train_valid_test_split(self.data)
        object.__setattr__(self, "training_indices", training_indices)
        object.__setattr__(self, "validation_indices", validation_indices)
        object.__setattr__(self, "test_indices", test_indices)
        # Make this class comply with the Loggable protocol
        object.__setattr__(self, "loggers", {"logger": self.logger})
    @classmethod
    def from_meta_dataframe(
        cls,
        meta_df: Any,  # a meta dataframe wrapping a pyspark.sql.DataFrame
        is_success: bool,
        data_augmentation_config: DataAugmentationConfig,
        split_config: DataSplitConfig,
        logger: Logger = VoidLogger(),
        spark_filter: Optional[str] = None,
        pandas_formatter: PandasFormatter = PandasIdentityFormatter(),
    ) -> AugmentedBernoulliDatasetConfig:
        """
        Use this method to construct an instance of
        `AugmentedBernoulliDatasetConfig` directly from a delta table with
        proper metadata.

        Parameters
        ----------
        meta_df : MetaDataFrame
            The meta dataframe wrapping the delta table that contains the data.
            To use a previous version of the data, call
            `restoreToVersion(version: int)` on the underlying delta table
            before passing it to `AugmentedBernoulliDatasetConfig`.
        is_success : bool
            Whether the samples correspond to successful events or not.
        data_augmentation_config : DataAugmentationConfig
            The configuration for data augmentation.
        split_config : DataSplitConfig
            The configuration for splitting between train, validation and test.
        logger : Logger
            Logger used to comply with the Loggable protocol.
        spark_filter : Optional[str]
            Optional Spark SQL filter applied before converting to pandas.
        pandas_formatter : PandasFormatter
            Any formatting applied to the pandas data (cast dtypes, etc.).
        """
        metadata = copy.deepcopy(meta_df.metadata)
        metadata["common_spark_filter"] = spark_filter
        metadata["pandas_formatter"] = str(pandas_formatter)
        spark_filter = spark_filter if spark_filter is not None else "True"
        dataset_config = cls(
            data=pandas_formatter.format(meta_df.read().filter(spark_filter).toPandas()),
            is_success=is_success,
            data_augmentation_config=data_augmentation_config,
            split_config=split_config,
            metadata=metadata,
            logger=logger,
        )
        # Set the _pandas_formatter attribute to be able to log info later
        object.__setattr__(dataset_config, "_pandas_formatter", pandas_formatter)
        return dataset_config
    def sample(self, n: int, replace: bool = False):
        return self.data.sample(n, replace=replace)
    @property
    def training_data(self):
        return self.data.iloc[self.training_indices]

    @property
    def training_data_length(self):
        return len(self.training_indices)

    @property
    def validation_data(self):
        return self.data.iloc[self.validation_indices]

    @property
    def validation_data_length(self):
        return len(self.validation_indices)

    @property
    def test_data(self):
        return self.data.iloc[self.test_indices]

    @property
    def test_data_length(self):
        return len(self.test_indices)

    @property
    def augmented_data(self):
        return self.data_augmentation_config.augment_data(self.data)

    @property
    def augmented_data_length(self):
        return self.data_augmentation_config.augmentation_length(self.data)

    @property
    def augmented_training_data(self):
        return self.data_augmentation_config.augment_data(self.training_data)

    @property
    def augmented_training_data_length(self):
        return self.data_augmentation_config.augmentation_length(self.training_data)

    @property
    def augmented_validation_data(self):
        return self.data_augmentation_config.augment_data(self.validation_data)

    @property
    def augmented_validation_data_length(self):
        return self.data_augmentation_config.augmentation_length(self.validation_data)

    @property
    def augmented_test_data(self):
        return self.data_augmentation_config.augment_data(self.test_data)

    @property
    def augmented_test_data_length(self):
        return self.data_augmentation_config.augmentation_length(self.test_data)

    @property
    def augmented_col(self):
        return self.data_augmentation_config.augmented_col

    @property
    def data_augmentation_scaling_factors(self):
        return self.data_augmentation_config.scaling_factors

    @property
    def columns(self):
        return sorted(self.data.columns)

    @property
    def dtypes(self):
        return self.data.dtypes

    def __len__(self) -> int:
        return len(self.data)
    def clear_data(self) -> AugmentedBernoulliDatasetConfig:
        # Drop all rows while keeping columns and dtypes (frees memory)
        object.__setattr__(self, "data", self.data[:0])
        return self
[docs] @use_loggers("logger") def log(self, logger: Logger) -> None: self.split_config.log() self.data_augmentation_config.log() logger.log_param("metadata", self.metadata) logger.log_param("is_success", self.is_success) logger.log_param("columns", self.columns) logger.log_metrics( { "total_length": len(self), "total_augmented_length": self.augmented_data_length, "training_data_length": self.training_data_length, "augmented_training_data_length": self.augmented_training_data_length, "validation_data_length": self.validation_data_length, "augmented_validation_data_length": self.augmented_validation_data_length, "test_data_length": self.test_data_length, "augmented_test_data_length": self.augmented_test_data_length, } ) logger.log_pandas_artifact(self.dtypes.rename("data_dtypes").to_frame(), "data_dtypes") logger.log_pandas_artifact(self.data.describe(), "data_statistics") if hasattr(self, "_pandas_formatter"): _pandas_formatter: PandasFormatter = getattr(self, "_pandas_formatter") _pandas_formatter.log()
class AugmentedBernoulliDatasetConfigs(UserList):
    def __init__(
        self,
        augmented_bernoulli_dataset_configs: List[AugmentedBernoulliDatasetConfig],
        label_col: str = "success_labels",
        labels_dtype: Type[np.intc] = np.int32,
        sample_weight_col: Optional[str] = None,
        logger: Logger = VoidLogger(),
    ):
        self.data: List[AugmentedBernoulliDatasetConfig] = augmented_bernoulli_dataset_configs
        self.label_col = label_col
        self.labels_dtype = labels_dtype
        self.sample_weight_col = sample_weight_col
        self.check_compatibility()
        # Make this class comply with the Loggable protocol
        self.loggers = {"logger": logger}
    def check_compatibility(self) -> None:
        if len(self.data) == 0:
            raise InconsistentDatasetConfigurations(
                "`augmented_bernoulli_dataset_configs` should have at least one element"
            )
        elif len(self.data) > 1:
            if self.sample_weight_col and not all(
                self.sample_weight_col in conf.columns for conf in self.data
            ):
                raise InconsistentDatasetConfigurations(
                    "All instances of `AugmentedBernoulliDatasetConfig` should have "
                    "column `sample_weight_col`."
                )
            if not all(
                self.data[0].augmented_col == conf.augmented_col for conf in self.data[1:]
            ):
                raise InconsistentDatasetConfigurations(
                    "All instances of `AugmentedBernoulliDatasetConfig` should have "
                    "the same attribute `augmented_col`."
                )
            if not all(set(self.data[0].columns) == set(conf.columns) for conf in self.data[1:]):
                raise InconsistentDatasetConfigurations(
                    "All instances of `AugmentedBernoulliDatasetConfig` should have the same `columns`."
                )
            if not all((self.data[0].dtypes == conf.dtypes).all() for conf in self.data[1:]):
                raise InconsistentDatasetConfigurations(
                    "All instances of `AugmentedBernoulliDatasetConfig` should have the same `dtypes`."
                )
    @property
    def augmented_col(self):
        return self.data[0].augmented_col

    @property
    def columns(self):
        return self.data[0].columns

    @property
    def dtypes(self):
        return self.data[0].dtypes

    def _concat_data(self, list_df: List[pd.DataFrame], list_labels: List[bool]) -> pd.DataFrame:
        output_df = pd.concat(
            [df for df in list_df if len(df) > 0],
            axis=0,
            sort=True,
        )
        # Label each block of rows with the `is_success` flag of its source
        # config (empty dataframes contribute zero labels).
        output_df[self.label_col] = list(
            itertools.chain(*[[label] * len(df) for df, label in zip(list_df, list_labels)])
        )
        return output_df.astype({self.label_col: self.labels_dtype})
    def sample(self, n: int, replace: bool = False):
        # Sample n rows from each configuration, then sample n rows from the
        # labeled concatenation so the output mixes successes and failures.
        return self._concat_data(
            [conf_df.sample(n, replace=replace) for conf_df in self.data],
            [conf_df.is_success for conf_df in self.data],
        ).sample(n, replace=replace)
    @property
    def all_data(self):
        return self._concat_data(
            [conf_df.data for conf_df in self.data],
            [conf_df.is_success for conf_df in self.data],
        )

    @property
    def all_data_length(self):
        return sum(len(conf_df) for conf_df in self.data)

    @property
    def all_training_data(self):
        return self._concat_data(
            [conf_df.training_data for conf_df in self.data],
            [conf_df.is_success for conf_df in self.data],
        )

    @property
    def all_training_data_length(self):
        return sum(conf_df.training_data_length for conf_df in self.data)

    @property
    def all_validation_data(self):
        return self._concat_data(
            [conf_df.validation_data for conf_df in self.data],
            [conf_df.is_success for conf_df in self.data],
        )

    @property
    def all_validation_data_length(self):
        return sum(conf_df.validation_data_length for conf_df in self.data)

    @property
    def all_test_data(self):
        return self._concat_data(
            [conf_df.test_data for conf_df in self.data],
            [conf_df.is_success for conf_df in self.data],
        )

    @property
    def all_test_data_length(self):
        return sum(conf_df.test_data_length for conf_df in self.data)

    @property
    def all_augmented_data(self):
        return self._concat_data(
            [conf_df.augmented_data for conf_df in self.data],
            [conf_df.is_success for conf_df in self.data],
        )

    @property
    def all_augmented_data_length(self):
        return sum(conf_df.augmented_data_length for conf_df in self.data)

    @property
    def all_augmented_training_data(self):
        return self._concat_data(
            [conf_df.augmented_training_data for conf_df in self.data],
            [conf_df.is_success for conf_df in self.data],
        )

    @property
    def all_augmented_training_data_length(self):
        return sum(conf_df.augmented_training_data_length for conf_df in self.data)

    @property
    def all_augmented_validation_data(self):
        return self._concat_data(
            [conf_df.augmented_validation_data for conf_df in self.data],
            [conf_df.is_success for conf_df in self.data],
        )

    @property
    def all_augmented_validation_data_length(self):
        return sum(conf_df.augmented_validation_data_length for conf_df in self.data)

    @property
    def all_augmented_test_data(self):
        return self._concat_data(
            [conf_df.augmented_test_data for conf_df in self.data],
            [conf_df.is_success for conf_df in self.data],
        )

    @property
    def all_augmented_test_data_length(self):
        return sum(conf_df.augmented_test_data_length for conf_df in self.data)
    def clear_data(self) -> AugmentedBernoulliDatasetConfigs:
        self.data = [d.clear_data() for d in self.data]
        self.check_compatibility()
        return self
[docs] @use_loggers("logger") def log(self, logger: Logger) -> None: for conf in self.data: conf.log() logger.log_param("label_col", self.label_col) logger.log_param("sample_weight_col", self.sample_weight_col) logger.log_param("augmented_col", self.augmented_col) logger.log_param("columns", self.columns) logger.log_metrics( { "all_data_length": self.all_data_length, "all_augmented_data_length": self.all_augmented_data_length, "all_training_data_length": self.all_training_data_length, "all_augmented_training_data_length": self.all_augmented_training_data_length, "all_validation_data_length": self.all_validation_data_length, "all_augmented_validation_data_length": self.all_augmented_validation_data_length, "all_test_data_length": self.all_test_data_length, "all_augmented_test_data_length": self.all_augmented_test_data_length, } ) logger.log_pandas_artifact(self.dtypes.rename("data_dtypes").to_frame(), "data_dtypes")