From c059d93780b9771e652ab0167f5031e4b64ff1ed Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 22 Sep 2025 15:40:38 -0400 Subject: [PATCH 01/40] Merge "Refactor Transformations handling: replace get_T_dict with a default class method and update function signatures to use Transformations directly." --- src/midst_toolkit/models/clavaddpm/dataset.py | 31 +++++++------------ src/midst_toolkit/models/clavaddpm/train.py | 26 +++++++--------- 2 files changed, 24 insertions(+), 33 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index 32f193f0..15569cdf 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -73,25 +73,18 @@ class Transformations: cat_encoding: CatEncoding | None = None y_policy: YPolicy | None = "default" - -# TODO move this into the Transformations' class init -def get_T_dict() -> dict[str, Any]: - """ - Return a dictionary used to initialize the transformation object. - - Returns: - The transformation object default parameters. - """ - # ruff: noqa: N802 - return { - "seed": 0, - "normalization": "quantile", - "num_nan_policy": None, - "cat_nan_policy": None, - "cat_min_frequency": None, - "cat_encoding": None, - "y_policy": "default", - } + @classmethod + def default(cls) -> Self: + """Return the default transformations.""" + return cls( + seed=0, + normalization="quantile", + num_nan_policy=None, + cat_nan_policy=None, + cat_min_frequency=None, + cat_encoding=None, + y_policy="default", + ) @dataclass(frozen=False) diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index b808da3b..27413c99 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -2,6 +2,7 @@ import pickle from collections.abc import Generator +from dataclasses import asdict from logging import INFO, WARNING from pathlib import Path from typing import Any, Literal @@ -16,7 +17,6 @@ from midst_toolkit.models.clavaddpm.dataset import ( Dataset, Transformations, - get_T_dict, make_dataset_from_df, ) from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion @@ -188,14 +188,14 @@ def child_training( "dropout": diffusion_config["dropout"], } ) - child_T_dict = get_T_dict() + child_transformations = Transformations.default() # ruff: noqa: N806 child_result = train_model( child_df_with_cluster, child_info, child_model_params, - child_T_dict, + child_transformations, diffusion_config["iterations"], diffusion_config["batch_size"], diffusion_config["model_type"], @@ -217,7 +217,7 @@ def child_training( child_df_with_cluster, child_info, child_model_params, - child_T_dict, + child_transformations, classifier_config["iterations"], classifier_config["batch_size"], diffusion_config["gaussian_loss_type"], @@ -236,7 +236,7 @@ def child_training( child_result["df_info"] = child_info child_result["model_params"] = child_model_params - child_result["T_dict"] = child_T_dict + child_result["T_dict"] = asdict(child_transformations) return child_result @@ -244,7 +244,7 @@ def train_model( data_frame: pd.DataFrame, data_frame_info: dict[str, Any], model_params: dict[str, Any], - transformations_dict: dict[str, Any], + transformations: Transformations, steps: int, batch_size: int, model_type: Literal["mlp", "resnet"], @@ -263,7 +263,7 @@ def train_model( data_frame: DataFrame to train the model on. data_frame_info: Dictionary of the table information. 
model_params: Dictionary of the model parameters. - transformations_dict: Dictionary of the transformations. + transformations: The transformations to apply to the dataset. steps: Number of steps to train the model. batch_size: Batch size to use for training. model_type: Type of the model to use. @@ -283,7 +283,6 @@ def train_model( - dataset: The dataset. - column_orders: The column orders. """ - transformations = Transformations(**transformations_dict) # ruff: noqa: N806 dataset, label_encoders, column_orders = make_dataset_from_df( data_frame, @@ -296,7 +295,7 @@ def train_model( category_sizes = np.array(dataset.get_category_sizes("train")) # ruff: noqa: N806 - if len(category_sizes) == 0 or transformations_dict["cat_encoding"] == "one-hot": + if len(category_sizes) == 0 or transformations.cat_encoding == "one-hot": category_sizes = np.array([0]) # ruff: noqa: N806 @@ -356,7 +355,7 @@ def train_classifier( data_frame: pd.DataFrame, data_frame_info: dict[str, Any], model_params: dict[str, Any], - transformations_dict: dict[str, Any], + transformations: Transformations, classifier_steps: int, batch_size: int, gaussian_loss_type: str, @@ -378,7 +377,7 @@ def train_classifier( data_frame: DataFrame to train the model on. data_frame_info: Dictionary of the table information. model_params: Dictionary of the model parameters. - transformations_dict: Dictionary of the transformations. + transformations: The transformations to apply to the dataset. classifier_steps: Number of steps to train the classifier. batch_size: Batch size to use for training. gaussian_loss_type: Type of the gaussian loss to use. @@ -399,7 +398,6 @@ def train_classifier( Returns: The trained classifier model. """ - transformations = Transformations(**transformations_dict) # ruff: noqa: N806 dataset, label_encoders, column_orders = make_dataset_from_df( data_frame, @@ -416,7 +414,7 @@ def train_classifier( category_sizes = np.array(dataset.get_category_sizes("train")) # ruff: noqa: N806 - if len(category_sizes) == 0 or transformations_dict["cat_encoding"] == "one-hot": + if len(category_sizes) == 0 or transformations.cat_encoding == "one-hot": category_sizes = np.array([0]) # ruff: noqa: N806 print(category_sizes) @@ -505,7 +503,7 @@ def train_classifier( correct += (pred.argmax(dim=1) == test_y).sum().item() acc = correct / (3000 * batch_size) - print(acc) + log(INFO, f"Classifier accuracy: {acc}") return classifier From 9f0290a71b3ee340810e38bb6ee26695fcaafeb9 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 22 Sep 2025 15:51:33 -0400 Subject: [PATCH 02/40] Refactor clustering method handling: Introduce ClusteringMethod enum for better type safety and clarity in clustering function signatures. 
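Reviewer note: a minimal usage sketch (not part of the diff) of how a configuration
string is expected to map onto the new enum; the literal config dict below is
illustrative only.

    from midst_toolkit.models.clavaddpm.typing import ClusteringMethod

    configs = {"clustering_method": "kmeans"}  # value as it would appear in the experiment config
    method = ClusteringMethod(configs["clustering_method"])  # raises ValueError for unknown names
    assert method is ClusteringMethod.KMEANS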
--- .../models/clavaddpm/clustering.py | 27 +++++++++++-------- src/midst_toolkit/models/clavaddpm/typing.py | 10 +++++++ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 4ad5b494..d7896f5c 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -5,7 +5,7 @@ from collections import defaultdict from logging import INFO, WARNING from pathlib import Path -from typing import Any, Literal +from typing import Any import numpy as np import pandas as pd @@ -14,7 +14,13 @@ from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, QuantileTransformer from midst_toolkit.common.logger import log -from midst_toolkit.models.clavaddpm.typing import Configs, GroupLengthsProbDicts, RelationOrder, Tables +from midst_toolkit.models.clavaddpm.typing import ( + ClusteringMethod, + Configs, + GroupLengthsProbDicts, + RelationOrder, + Tables, +) def clava_clustering( @@ -142,7 +148,7 @@ def _run_clustering( num_clusters, configs["parent_scale"], 1, # not used for now - clustering_method=configs["clustering_method"], + clustering_method=ClusteringMethod(configs["clustering_method"]), ) tables[parent]["df"] = parent_df_with_cluster tables[child]["df"] = child_df_with_cluster @@ -159,7 +165,7 @@ def _pair_clustering_keep_id( num_clusters: int, parent_scale: float, key_scale: float, - clustering_method: Literal["kmeans", "gmm", "kmeans_and_gmm", "variational"] = "kmeans", + clustering_method: ClusteringMethod = ClusteringMethod.KMEANS, ) -> tuple[pd.DataFrame, pd.DataFrame, dict[int, dict[int, float]]]: """ Pairs clustering information to the parent and child dataframes. @@ -176,8 +182,7 @@ def _pair_clustering_keep_id( key_scale: Scaling factor applied to the foreign key values that link the child table to the parent table. This will weight how much influence the parent-child relationship has in the clustering algorithm. - clustering_method: Method of clustering. Has to be one of ["kmeans", "gmm", "kmeans_and_gmm", "variational"]. - Default is "kmeans". + clustering_method: Method of clustering. Default is ClusteringMethod.KMEANS. 
Returns: Tuple with 3 elements: @@ -287,11 +292,11 @@ def _pair_clustering_keep_id( child_group_lengths = np.array([len(group) for group in child_group_data], dtype=int) num_clusters = min(num_clusters, len(cluster_data)) - if clustering_method == "kmeans": + if clustering_method == ClusteringMethod.KMEANS: kmeans = KMeans(n_clusters=num_clusters, n_init="auto", init="k-means++") kmeans.fit(cluster_data) cluster_labels = kmeans.labels_ - elif clustering_method == "kmeans_and_gmm": + elif clustering_method == ClusteringMethod.KMEANS_AND_GMM: gmm = GaussianMixture( n_components=num_clusters, verbose=1, @@ -301,7 +306,7 @@ def _pair_clustering_keep_id( ) gmm.fit(cluster_data) cluster_labels = gmm.predict(cluster_data) - elif clustering_method == "variational": + elif clustering_method == ClusteringMethod.VARIATIONAL: bgmm = BayesianGaussianMixture( n_components=num_clusters, verbose=1, @@ -311,7 +316,7 @@ def _pair_clustering_keep_id( ) bgmm.fit(cluster_data) cluster_labels = bgmm.predict_proba(cluster_data) - elif clustering_method == "gmm": + elif clustering_method == ClusteringMethod.GMM: gmm = GaussianMixture( n_components=num_clusters, verbose=1, @@ -320,7 +325,7 @@ def _pair_clustering_keep_id( gmm.fit(cluster_data) cluster_labels = gmm.predict(cluster_data) - if clustering_method == "variational": + if clustering_method == ClusteringMethod.VARIATIONAL: group_cluster_labels, agree_rates = _aggregate_and_sample(cluster_labels, child_group_lengths) else: group_cluster_labels, agree_rates = _get_group_cluster_labels_through_voting( diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py index 02a1eee3..1edbe683 100644 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ b/src/midst_toolkit/models/clavaddpm/typing.py @@ -1,4 +1,5 @@ from collections.abc import Callable +from enum import Enum from typing import Any import numpy as np @@ -12,3 +13,12 @@ GroupLengthsProbDicts = dict[tuple[str, str], dict[int, dict[int, float]]] ArrayDict = dict[str, np.ndarray] ModuleType = str | Callable[..., nn.Module] + + +class ClusteringMethod(Enum): + """Possioble clustering methods for multi-table training.""" + + KMEANS = "kmeans" + GMM = "gmm" + KMEANS_AND_GMM = "kmeans_and_gmm" + VARIATIONAL = "variational" From 80073f210345a64c9fc84470c61eccc2851a2d6f Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 22 Sep 2025 16:06:09 -0400 Subject: [PATCH 03/40] Refactor model handling: Introduce ModelType enum for improved type safety and streamline model retrieval in fine-tuning and training functions. 
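Reviewer note: a short usage sketch of the enum-based model retrieval, mirroring the
call sites changed in train.py. Here diffusion_config, model_params and device stand
in for objects built by the training code and are not defined in this snippet.

    from midst_toolkit.models.clavaddpm.model import ModelType

    model_type = ModelType(diffusion_config["model_type"])  # "mlp" or "resnet"
    model = model_type.get_model(model_params)
    model.to(device)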
--- src/midst_toolkit/models/clavaddpm/model.py | 51 +++++++++++---------- src/midst_toolkit/models/clavaddpm/train.py | 8 ++-- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index 4f9168a7..2d84dc51 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -1,6 +1,8 @@ from __future__ import annotations import math +from enum import Enum +from logging import INFO from typing import Any, Literal, Self import pandas as pd @@ -10,6 +12,7 @@ # ruff: noqa: N812 from torch import Tensor, nn +from midst_toolkit.common.logger import log from midst_toolkit.models.clavaddpm.typing import ModuleType @@ -122,29 +125,6 @@ def get_table_info(df: pd.DataFrame, domain_dict: dict[str, Any], y_col: str) -> return df_info -def get_model( - model_name: Literal["mlp", "resnet"], - model_params: dict[str, Any], -) -> nn.Module: - """ - Get the model. - - Args: - model_name: The name of the model. Can be "mlp" or "resnet". - model_params: The dictionary of parameters of the model. - - Returns: - The model. - """ - print(model_name) - if model_name == "mlp": - return MLPDiffusion(**model_params) - if model_name == "resnet": - return ResNetDiffusion(**model_params) - - raise ValueError("Unknown model!") - - def timestep_embedding(timesteps: Tensor, dim: int, max_period: int = 10000) -> Tensor: """ Create sinusoidal timestep embeddings. @@ -806,3 +786,28 @@ def _make_nn_module(module_type: ModuleType, *args: Any) -> nn.Module: if isinstance(module_type, str) else module_type(*args) ) + + +class ModelType(Enum): + """Possible model types for the ClavaDDPM model.""" + + MLP = "mlp" + RESNET = "resnet" + + def get_model(self, model_params: dict[str, Any]) -> nn.Module: + """ + Get the model. + + Args: + model_params: The dictionary of parameters of the model. + + Returns: + The model. 
+ """ + log(INFO, f"Getting model: {self.value}") + if self == ModelType.MLP: + return MLPDiffusion(**model_params) + if self == ModelType.RESNET: + return ResNetDiffusion(**model_params) + + raise ValueError(f"Unsupported model type: {self.value}") diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 27413c99..89379f70 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -20,7 +20,7 @@ make_dataset_from_df, ) from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion -from midst_toolkit.models.clavaddpm.model import Classifier, get_model, get_table_info +from midst_toolkit.models.clavaddpm.model import Classifier, ModelType, get_table_info from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, create_named_schedule_sampler from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer from midst_toolkit.models.clavaddpm.typing import Configs, RelationOrder, Tables @@ -198,7 +198,7 @@ def child_training( child_transformations, diffusion_config["iterations"], diffusion_config["batch_size"], - diffusion_config["model_type"], + ModelType(diffusion_config["model_type"]), diffusion_config["gaussian_loss_type"], diffusion_config["num_timesteps"], diffusion_config["scheduler"], @@ -247,7 +247,7 @@ def train_model( transformations: Transformations, steps: int, batch_size: int, - model_type: Literal["mlp", "resnet"], + model_type: ModelType, gaussian_loss_type: str, num_timesteps: int, scheduler: str, @@ -306,7 +306,7 @@ def train_model( model_params["d_in"] = d_in print("Model params: {}".format(model_params)) - model = get_model(model_type, model_params) + model = model_type.get_model(model_params) model.to(device) train_loader = prepare_fast_dataloader(dataset, split="train", batch_size=batch_size) From 1d437837a665bbef5a4b7416886755edb0530d96 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 22 Sep 2025 16:27:23 -0400 Subject: [PATCH 04/40] Merge "Refactor model parameters: Introduce ModelParameters and RTDLParameters dataclasses for improved structure and type safety in model configuration across fine-tuning and training functions." --- src/midst_toolkit/models/clavaddpm/model.py | 87 ++++++++++++++++----- src/midst_toolkit/models/clavaddpm/train.py | 67 ++++------------ 2 files changed, 86 insertions(+), 68 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index 2d84dc51..adbc96a1 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -1,6 +1,7 @@ from __future__ import annotations import math +from dataclasses import dataclass from enum import Enum from logging import INFO from typing import Any, Literal, Self @@ -310,6 +311,32 @@ def forward(self, x: Tensor) -> Tensor: return self.head(x) +@dataclass +class RTDLParameters: + """Parameters for the RTDL model.""" + + d_layers: list[int] + dropout: float + d_in: int = 0 + d_out: int = 0 + emb_d: int = 0 + n_blocks: int = 0 + d_main: int = 0 + d_hidden: int = 0 + dropout_first: float = 0 + dropout_second: float = 0 + + +@dataclass +class ModelParameters: + """Parameters for the ClavaDDPM model.""" + + rtdl_parameters: RTDLParameters + d_in: int = 0 + num_classes: int = 0 + is_y_cond: Literal["concat", "embedding", "none"] = "none" + + class ResNet(nn.Module): """ The ResNet model used in [gorishniy2021revisiting]. 
@@ -567,7 +594,7 @@ def __init__( d_in: int, num_classes: int, is_y_cond: Literal["concat", "embedding", "none"], - rtdl_params: dict[str, Any], + rtdl_parameters: RTDLParameters, dim_t: int = 128, ): """ @@ -577,20 +604,24 @@ def __init__( d_in: The input dimension size. num_classes: The number of classes. is_y_cond: The condition on the y column. Can be "concat", "embedding", or "none". - rtdl_params: The dictionary of parameters for the MLP. - dim_t: The dimension size of the timestep. + rtdl_parameters: The parameters for the MLP. + dim_t: The dimension size of the timestamp. """ super().__init__() self.dim_t = dim_t self.num_classes = num_classes self.is_y_cond = is_y_cond - # d0 = rtdl_params['d_layers'][0] + self.rtdl_parameters = rtdl_parameters + self.rtdl_parameters.d_in = dim_t + self.rtdl_parameters.d_out = d_in - rtdl_params["d_in"] = dim_t - rtdl_params["d_out"] = d_in - - self.mlp = MLP.make_baseline(**rtdl_params) + self.mlp = MLP.make_baseline( + d_in=self.rtdl_parameters.d_in, + d_layers=self.rtdl_parameters.d_layers, + dropout=self.rtdl_parameters.dropout, + d_out=self.rtdl_parameters.d_out, + ) self.label_emb: nn.Embedding | nn.Linear if self.num_classes > 0 and is_y_cond == "embedding": @@ -626,7 +657,7 @@ def __init__( self, d_in: int, num_classes: int, - rtdl_params: dict[str, Any], + rtdl_parameters: RTDLParameters, dim_t: int = 256, is_y_cond: Literal["concat", "embedding", "none"] | None = None, ): @@ -636,7 +667,7 @@ def __init__( Args: d_in: The input dimension size. num_classes: The number of classes. - rtdl_params: The dictionary of parameters for the ResNet. + rtdl_parameters: The parameters for the ResNet. dim_t: The dimension size of the timestep. is_y_cond: The condition on the y column. Can be "concat", "embedding", or "none". Optional, default is None. @@ -644,11 +675,22 @@ def __init__( super().__init__() self.dim_t = dim_t self.num_classes = num_classes + self.is_y_cond = is_y_cond - rtdl_params["d_in"] = d_in - rtdl_params["d_out"] = d_in - rtdl_params["emb_d"] = dim_t - self.resnet = ResNet.make_baseline(**rtdl_params) + self.rtdl_parameters = rtdl_parameters + self.rtdl_parameters.d_in = d_in + self.rtdl_parameters.d_out = d_in + self.rtdl_parameters.emb_d = dim_t + + self.resnet = ResNet.make_baseline( + d_in=rtdl_parameters.d_in, + n_blocks=rtdl_parameters.n_blocks, + d_main=rtdl_parameters.d_main, + d_hidden=rtdl_parameters.d_hidden, + dropout_first=rtdl_parameters.dropout_first, + dropout_second=rtdl_parameters.dropout_second, + d_out=rtdl_parameters.d_out, + ) self.label_emb: nn.Embedding | nn.Linear if self.num_classes > 0 and is_y_cond == "embedding": @@ -794,20 +836,29 @@ class ModelType(Enum): MLP = "mlp" RESNET = "resnet" - def get_model(self, model_params: dict[str, Any]) -> nn.Module: + def get_model(self, model_parameters: ModelParameters) -> nn.Module: """ Get the model. Args: - model_params: The dictionary of parameters of the model. + model_parameters: The parameters of the model. Returns: The model. 
""" log(INFO, f"Getting model: {self.value}") if self == ModelType.MLP: - return MLPDiffusion(**model_params) + return MLPDiffusion( + d_in=model_parameters.d_in, + num_classes=model_parameters.num_classes, + is_y_cond=model_parameters.is_y_cond, + rtdl_parameters=model_parameters.rtdl_parameters, + ) if self == ModelType.RESNET: - return ResNetDiffusion(**model_params) + return ResNetDiffusion( + d_in=model_parameters.d_in, + num_classes=model_parameters.num_classes, + rtdl_parameters=model_parameters.rtdl_parameters, + ) raise ValueError(f"Unsupported model type: {self.value}") diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 89379f70..9a6f8693 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -20,7 +20,7 @@ make_dataset_from_df, ) from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion -from midst_toolkit.models.clavaddpm.model import Classifier, ModelType, get_table_info +from midst_toolkit.models.clavaddpm.model import Classifier, ModelParameters, ModelType, RTDLParameters, get_table_info from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, create_named_schedule_sampler from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer from midst_toolkit.models.clavaddpm.typing import Configs, RelationOrder, Tables @@ -182,11 +182,11 @@ def child_training( else: y_col = f"{parent_name}_{child_name}_cluster" child_info = get_table_info(child_df_with_cluster, child_domain_dict, y_col) - child_model_params = _get_model_params( - { - "d_layers": diffusion_config["d_layers"], - "dropout": diffusion_config["dropout"], - } + child_model_params = ModelParameters( + rtdl_parameters=RTDLParameters( + d_layers=diffusion_config["d_layers"], + dropout=diffusion_config["dropout"], + ), ) child_transformations = Transformations.default() # ruff: noqa: N806 @@ -235,7 +235,7 @@ def child_training( log(WARNING, "Skipping classifier training since classifier_config['iterations'] <= 0") child_result["df_info"] = child_info - child_result["model_params"] = child_model_params + child_result["model_params"] = asdict(child_model_params) child_result["T_dict"] = asdict(child_transformations) return child_result @@ -243,7 +243,7 @@ def child_training( def train_model( data_frame: pd.DataFrame, data_frame_info: dict[str, Any], - model_params: dict[str, Any], + model_params: ModelParameters, transformations: Transformations, steps: int, batch_size: int, @@ -262,7 +262,7 @@ def train_model( Args: data_frame: DataFrame to train the model on. data_frame_info: Dictionary of the table information. - model_params: Dictionary of the model parameters. + model_params: The model parameters. transformations: The transformations to apply to the dataset. steps: Number of steps to train the model. batch_size: Batch size to use for training. 
@@ -287,7 +287,7 @@ def train_model( dataset, label_encoders, column_orders = make_dataset_from_df( data_frame, transformations, - is_y_cond=model_params["is_y_cond"], + is_y_cond=model_params.is_y_cond, ratios=data_split_ratios, df_info=data_frame_info, std=0, @@ -303,7 +303,7 @@ def train_model( num_numerical_features = dataset.x_num["train"].shape[1] if dataset.x_num is not None else 0 d_in = np.sum(category_sizes) + num_numerical_features - model_params["d_in"] = d_in + model_params.d_in = d_in print("Model params: {}".format(model_params)) model = model_type.get_model(model_params) @@ -333,7 +333,7 @@ def train_model( ) trainer.train() - if model_params["is_y_cond"] == "concat": + if model_params.is_y_cond == "concat": column_orders = column_orders[1:] + [column_orders[0]] else: column_orders = column_orders + [data_frame_info["y_col"]] @@ -354,7 +354,7 @@ def train_model( def train_classifier( data_frame: pd.DataFrame, data_frame_info: dict[str, Any], - model_params: dict[str, Any], + model_params: ModelParameters, transformations: Transformations, classifier_steps: int, batch_size: int, @@ -376,7 +376,7 @@ def train_classifier( Args: data_frame: DataFrame to train the model on. data_frame_info: Dictionary of the table information. - model_params: Dictionary of the model parameters. + model_params: The model parameters. transformations: The transformations to apply to the dataset. classifier_steps: Number of steps to train the classifier. batch_size: Batch size to use for training. @@ -402,7 +402,7 @@ def train_classifier( dataset, label_encoders, column_orders = make_dataset_from_df( data_frame, transformations, - is_y_cond=model_params["is_y_cond"], + is_y_cond=model_params.is_y_cond, ratios=data_split_ratios, df_info=data_frame_info, std=0, @@ -426,7 +426,7 @@ def train_classifier( else: num_numerical_features = dataset.x_num["train"].shape[1] - if model_params["is_y_cond"] == "concat": + if model_params.is_y_cond == "concat": num_numerical_features -= 1 classifier = Classifier( @@ -497,7 +497,7 @@ def train_classifier( for _ in range(3000): test_x, test_y = next(test_loader) test_y = test_y.long().to(device) - test_x = test_x[:, 1:].to(device) if model_params["is_y_cond"] == "concat" else test_x.to(device) + test_x = test_x[:, 1:].to(device) if model_params.is_y_cond == "concat" else test_x.to(device) with torch.no_grad(): pred = classifier(test_x, timesteps=torch.zeros(test_x.shape[0]).to(device)) correct += (pred.argmax(dim=1) == test_y).sum().item() @@ -705,36 +705,3 @@ def _split_microbatches( else: for i in range(0, bs, microbatch): yield batch[i : i + microbatch], labels[i : i + microbatch], t[i : i + microbatch] - - -# TODO make this into a class with default parameters -def _get_model_params(rtdl_params: dict[str, Any] | None = None) -> dict[str, Any]: - """ - Return the model parameters. - - Args: - rtdl_params: The parameters for the RTDL model. If None, the default parameters below are used: - { - "d_layers": [512, 1024, 1024, 1024, 1024, 512], - "dropout": 0.0, - } - - Returns: - The model parameters as a dictionary containing the following keys: - - num_classes: The number of classes. Defaults to 0. - - is_y_cond: Affects how y is generated. For more information, see the documentation - of the `make_dataset_from_df` function. Can be any of ["none", "concat", "embedding"]. - Defaults to "none". - - rtdl_params: The parameters for the RTDL model. 
- """ - if rtdl_params is None: - rtdl_params = { - "d_layers": [512, 1024, 1024, 1024, 1024, 512], - "dropout": 0.0, - } - - return { - "num_classes": 0, - "is_y_cond": "none", - "rtdl_params": rtdl_params, - } From f7d69ec3250f69734072329630904ba742b42b1f Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 22 Sep 2025 16:37:21 -0400 Subject: [PATCH 05/40] Merge "Refactor y condition handling: Replace string literals with IsYCond enum in dataset, model, and training modules for improved type safety and clarity in handling y column conditions." --- src/midst_toolkit/models/clavaddpm/dataset.py | 14 +++-- src/midst_toolkit/models/clavaddpm/model.py | 50 ++++------------- src/midst_toolkit/models/clavaddpm/train.py | 17 ++++-- src/midst_toolkit/models/clavaddpm/typing.py | 55 +++++++++++++++++++ 4 files changed, 86 insertions(+), 50 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index 15569cdf..c889a878 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -28,7 +28,7 @@ StandardScaler, ) -from midst_toolkit.models.clavaddpm.typing import ArrayDict +from midst_toolkit.models.clavaddpm.typing import ArrayDict, IsYCond # TODO: Dunders are special case in python, rename these values to something else. @@ -426,7 +426,7 @@ def make_dataset_from_df( # ruff: noqa: PLR0915, PLR0912 df: pd.DataFrame, transformations: Transformations, - is_y_cond: Literal["concat", "embedding", "none"], + is_y_cond: IsYCond, df_info: dict[str, Any], ratios: list[float] | None = None, std: float = 0, @@ -486,14 +486,16 @@ def make_dataset_from_df( column_to_index = {col: i for i, col in enumerate(index_to_column)} if df_info["n_classes"] > 0: - x_cat: dict[str, np.ndarray] | None = {} if df_info["cat_cols"] is not None or is_y_cond == "concat" else None + x_cat: dict[str, np.ndarray] | None = ( + {} if df_info["cat_cols"] is not None or is_y_cond == IsYCond.CONCAT else None + ) x_num: dict[str, np.ndarray] | None = {} if df_info["num_cols"] is not None else None y = {} cat_cols_with_y: list[str] = [] if df_info["cat_cols"] is not None: cat_cols_with_y += df_info["cat_cols"] - if is_y_cond == "concat": + if is_y_cond == IsYCond.CONCAT: cat_cols_with_y = [df_info["y_col"]] + cat_cols_with_y if len(cat_cols_with_y) > 0: @@ -515,13 +517,13 @@ def make_dataset_from_df( else: x_cat = {} if df_info["cat_cols"] is not None else None - x_num = {} if df_info["num_cols"] is not None or is_y_cond == "concat" else None + x_num = {} if df_info["num_cols"] is not None or is_y_cond == IsYCond.CONCAT else None y = {} num_cols_with_y: list[str] = [] if df_info["num_cols"] is not None: num_cols_with_y += df_info["num_cols"] - if is_y_cond == "concat": + if is_y_cond == IsYCond.CONCAT: num_cols_with_y = [df_info["y_col"]] + num_cols_with_y if len(num_cols_with_y) > 0: diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index adbc96a1..118173bd 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -1,10 +1,9 @@ from __future__ import annotations import math -from dataclasses import dataclass from enum import Enum from logging import INFO -from typing import Any, Literal, Self +from typing import Any, Self import pandas as pd import torch @@ -14,7 +13,7 @@ from torch import Tensor, nn from midst_toolkit.common.logger import log -from midst_toolkit.models.clavaddpm.typing import ModuleType +from 
midst_toolkit.models.clavaddpm.typing import IsYCond, ModelParameters, ModuleType, RTDLParameters class Classifier(nn.Module): @@ -311,32 +310,6 @@ def forward(self, x: Tensor) -> Tensor: return self.head(x) -@dataclass -class RTDLParameters: - """Parameters for the RTDL model.""" - - d_layers: list[int] - dropout: float - d_in: int = 0 - d_out: int = 0 - emb_d: int = 0 - n_blocks: int = 0 - d_main: int = 0 - d_hidden: int = 0 - dropout_first: float = 0 - dropout_second: float = 0 - - -@dataclass -class ModelParameters: - """Parameters for the ClavaDDPM model.""" - - rtdl_parameters: RTDLParameters - d_in: int = 0 - num_classes: int = 0 - is_y_cond: Literal["concat", "embedding", "none"] = "none" - - class ResNet(nn.Module): """ The ResNet model used in [gorishniy2021revisiting]. @@ -593,7 +566,7 @@ def __init__( self, d_in: int, num_classes: int, - is_y_cond: Literal["concat", "embedding", "none"], + is_y_cond: IsYCond, rtdl_parameters: RTDLParameters, dim_t: int = 128, ): @@ -603,7 +576,7 @@ def __init__( Args: d_in: The input dimension size. num_classes: The number of classes. - is_y_cond: The condition on the y column. Can be "concat", "embedding", or "none". + is_y_cond: The condition on the y column. rtdl_parameters: The parameters for the MLP. dim_t: The dimension size of the timestamp. """ @@ -624,9 +597,9 @@ def __init__( ) self.label_emb: nn.Embedding | nn.Linear - if self.num_classes > 0 and is_y_cond == "embedding": + if self.num_classes > 0 and is_y_cond == IsYCond.EMBEDDING: self.label_emb = nn.Embedding(self.num_classes, dim_t) - elif self.num_classes == 0 and is_y_cond == "embedding": + elif self.num_classes == 0 and is_y_cond == IsYCond.EMBEDDING: self.label_emb = nn.Linear(1, dim_t) self.proj = nn.Linear(d_in, dim_t) @@ -645,7 +618,7 @@ def forward(self, x: Tensor, timesteps: Tensor, y: Tensor | None = None) -> Tens The output tensor. """ emb = self.time_embed(timestep_embedding(timesteps, self.dim_t)) - if self.is_y_cond == "embedding" and y is not None: + if self.is_y_cond == IsYCond.EMBEDDING and y is not None: y = y.squeeze() if self.num_classes > 0 else y.resize_(y.size(0), 1).float() emb += F.silu(self.label_emb(y)) x = self.proj(x) + emb @@ -659,7 +632,7 @@ def __init__( num_classes: int, rtdl_parameters: RTDLParameters, dim_t: int = 256, - is_y_cond: Literal["concat", "embedding", "none"] | None = None, + is_y_cond: IsYCond | None = None, ): """ Initialize the ResNet diffusion model. @@ -669,8 +642,7 @@ def __init__( num_classes: The number of classes. rtdl_parameters: The parameters for the ResNet. dim_t: The dimension size of the timestep. - is_y_cond: The condition on the y column. Can be "concat", "embedding", or "none". - Optional, default is None. + is_y_cond: The condition on the y column. Optional, default is None. 
""" super().__init__() self.dim_t = dim_t @@ -693,9 +665,9 @@ def __init__( ) self.label_emb: nn.Embedding | nn.Linear - if self.num_classes > 0 and is_y_cond == "embedding": + if self.num_classes > 0 and is_y_cond == IsYCond.EMBEDDING: self.label_emb = nn.Embedding(self.num_classes, dim_t) - elif self.num_classes == 0 and is_y_cond == "embedding": + elif self.num_classes == 0 and is_y_cond == IsYCond.EMBEDDING: self.label_emb = nn.Linear(1, dim_t) self.time_embed = nn.Sequential(nn.Linear(dim_t, dim_t), nn.SiLU(), nn.Linear(dim_t, dim_t)) diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 9a6f8693..7f55ff3f 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -20,10 +20,17 @@ make_dataset_from_df, ) from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion -from midst_toolkit.models.clavaddpm.model import Classifier, ModelParameters, ModelType, RTDLParameters, get_table_info +from midst_toolkit.models.clavaddpm.model import Classifier, ModelType, get_table_info from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, create_named_schedule_sampler from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer -from midst_toolkit.models.clavaddpm.typing import Configs, RelationOrder, Tables +from midst_toolkit.models.clavaddpm.typing import ( + Configs, + IsYCond, + ModelParameters, + RelationOrder, + RTDLParameters, + Tables, +) def clava_training( @@ -333,7 +340,7 @@ def train_model( ) trainer.train() - if model_params.is_y_cond == "concat": + if model_params.is_y_cond == IsYCond.CONCAT: column_orders = column_orders[1:] + [column_orders[0]] else: column_orders = column_orders + [data_frame_info["y_col"]] @@ -426,7 +433,7 @@ def train_classifier( else: num_numerical_features = dataset.x_num["train"].shape[1] - if model_params.is_y_cond == "concat": + if model_params.is_y_cond == IsYCond.CONCAT: num_numerical_features -= 1 classifier = Classifier( @@ -497,7 +504,7 @@ def train_classifier( for _ in range(3000): test_x, test_y = next(test_loader) test_y = test_y.long().to(device) - test_x = test_x[:, 1:].to(device) if model_params.is_y_cond == "concat" else test_x.to(device) + test_x = test_x[:, 1:].to(device) if model_params.is_y_cond == IsYCond.CONCAT else test_x.to(device) with torch.no_grad(): pred = classifier(test_x, timesteps=torch.zeros(test_x.shape[0]).to(device)) correct += (pred.argmax(dim=1) == test_y).sum().item() diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py index 1edbe683..b07d44b2 100644 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ b/src/midst_toolkit/models/clavaddpm/typing.py @@ -1,4 +1,5 @@ from collections.abc import Callable +from dataclasses import dataclass from enum import Enum from typing import Any @@ -22,3 +23,57 @@ class ClusteringMethod(Enum): GMM = "gmm" KMEANS_AND_GMM = "kmeans_and_gmm" VARIATIONAL = "variational" + + +class IsYCond(Enum): + """ + The condition on the y column. + + IsYCond.CONCAT: y is concatenated to X, the model learn a joint distribution of (y, X) + IsYCond.EMBEDDING: y is not concatenated to X. During computations, y is embedded + and added to the latent vector of X + IsYCond.NONE: y column is completely ignored + + How does is_y_cond affect the generation of y? 
+ is_y_cond: + IsYCond.CONCAT: the model synthesizes (y, X) directly, so y is just the first column + IsYCond.EMBEDDING: y is first sampled using empirical distribution of y. The model only + synthesizes X. When returning the generated data, we return the generated X + and the sampled y. (y is sampled from empirical distribution, instead of being + generated by the model) + Note that in this way, y is still not independent of X, because the model has been + adding the embedding of y to the latent vector of X during computations. + IsYCond.NONE: + y is synthesized using y's empirical distribution. X is generated by the model. + In this case, y is completely independent of X. + """ + + CONCAT = "concat" + EMBEDDING = "embedding" + NONE = "none" + + +@dataclass +class RTDLParameters: + """Parameters for the RTDL model.""" + + d_layers: list[int] + dropout: float + d_in: int = 0 + d_out: int = 0 + emb_d: int = 0 + n_blocks: int = 0 + d_main: int = 0 + d_hidden: int = 0 + dropout_first: float = 0 + dropout_second: float = 0 + + +@dataclass +class ModelParameters: + """Parameters for the ClavaDDPM model.""" + + rtdl_parameters: RTDLParameters + d_in: int = 0 + num_classes: int = 0 + is_y_cond: IsYCond = IsYCond.NONE From 01ed9c1425e021b5c61b8173dd87777025f050ff Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 22 Sep 2025 16:43:23 -0400 Subject: [PATCH 06/40] Refactor Gaussian loss handling: Introduce GaussianLossType enum to replace string literals for loss type specification in fine-tuning and training modules, enhancing type safety and clarity. --- .../clavaddpm/gaussian_multinomial_diffusion.py | 7 ++++--- src/midst_toolkit/models/clavaddpm/train.py | 13 +++++++------ src/midst_toolkit/models/clavaddpm/typing.py | 7 +++++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py index 2240e2a4..e4f9cf32 100644 --- a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py +++ b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py @@ -30,6 +30,7 @@ sliced_logsumexp, sum_except_batch, ) +from midst_toolkit.models.clavaddpm.typing import GaussianLossType # Based in part on: @@ -87,7 +88,7 @@ def __init__( num_numerical_features: int, denoise_fn: torch.nn.Module, num_timesteps: int = 1000, - gaussian_loss_type: str = "mse", + gaussian_loss_type: GaussianLossType = GaussianLossType.MSE, gaussian_parametrization: str = "eps", multinomial_loss_type: str = "vb_stochastic", parametrization: str = "x0", @@ -355,9 +356,9 @@ def _gaussian_loss( model_kwargs = {} terms = {} - if self.gaussian_loss_type == "mse": + if self.gaussian_loss_type == GaussianLossType.MSE: terms["loss"] = mean_flat((noise - model_out) ** 2) - elif self.gaussian_loss_type == "kl": + elif self.gaussian_loss_type == GaussianLossType.KL: terms["loss"] = self._vb_terms_bpd( model_output=model_out, x_start=x_start, diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 7f55ff3f..52eef587 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -25,6 +25,7 @@ from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer from midst_toolkit.models.clavaddpm.typing import ( Configs, + GaussianLossType, IsYCond, ModelParameters, RelationOrder, @@ -66,7 +67,7 @@ def clava_training( iterations = int, batch_size = int, model_type = str["mlp" | "resnet"], 
- gaussian_loss_type = str["mse" | "cross_entropy"], + gaussian_loss_type = str["mse" | "kl"], num_timesteps = int, scheduler = str["cosine" | "linear"], lr = float, @@ -157,7 +158,7 @@ def child_training( iterations = int, batch_size = int, model_type = str["mlp" | "resnet"], - gaussian_loss_type = str["mse" | "cross_entropy"], + gaussian_loss_type = str["mse" | "kl"], num_timesteps = int, scheduler = str["cosine" | "linear"], lr = float, @@ -206,7 +207,7 @@ def child_training( diffusion_config["iterations"], diffusion_config["batch_size"], ModelType(diffusion_config["model_type"]), - diffusion_config["gaussian_loss_type"], + GaussianLossType(diffusion_config["gaussian_loss_type"]), diffusion_config["num_timesteps"], diffusion_config["scheduler"], diffusion_config["lr"], @@ -227,7 +228,7 @@ def child_training( child_transformations, classifier_config["iterations"], classifier_config["batch_size"], - diffusion_config["gaussian_loss_type"], + GaussianLossType(diffusion_config["gaussian_loss_type"]), diffusion_config["num_timesteps"], diffusion_config["scheduler"], cluster_col=y_col, @@ -255,7 +256,7 @@ def train_model( steps: int, batch_size: int, model_type: ModelType, - gaussian_loss_type: str, + gaussian_loss_type: GaussianLossType, num_timesteps: int, scheduler: str, learning_rate: float, @@ -365,7 +366,7 @@ def train_classifier( transformations: Transformations, classifier_steps: int, batch_size: int, - gaussian_loss_type: str, + gaussian_loss_type: GaussianLossType, num_timesteps: int, scheduler: str, d_layers: list[int], diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py index b07d44b2..ec3796d3 100644 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ b/src/midst_toolkit/models/clavaddpm/typing.py @@ -77,3 +77,10 @@ class ModelParameters: d_in: int = 0 num_classes: int = 0 is_y_cond: IsYCond = IsYCond.NONE + + +class GaussianLossType(Enum): + """Possible types of Gaussian loss.""" + + MSE = "mse" + KL = "kl" From 0dbc6b4817b08ba160a5593162e000655e78704f Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 22 Sep 2025 16:49:33 -0400 Subject: [PATCH 07/40] Refactor scheduler handling: Introduce Scheduler enum to replace string literals for scheduler specification in fine-tuning and training modules, enhancing type safety and clarity. --- .../gaussian_multinomial_diffusion.py | 19 +++++++++++++------ src/midst_toolkit/models/clavaddpm/train.py | 9 +++++---- src/midst_toolkit/models/clavaddpm/typing.py | 7 +++++++ 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py index e4f9cf32..54b2a0a7 100644 --- a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py +++ b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py @@ -30,7 +30,7 @@ sliced_logsumexp, sum_except_batch, ) -from midst_toolkit.models.clavaddpm.typing import GaussianLossType +from midst_toolkit.models.clavaddpm.typing import GaussianLossType, Scheduler # Based in part on: @@ -38,27 +38,34 @@ eps = 1e-8 -def get_named_beta_schedule(schedule_name: str, num_diffusion_timesteps: int) -> np.ndarray: +def get_named_beta_schedule(scheduler: Scheduler, num_diffusion_timesteps: int) -> np.ndarray: """ Get a pre-defined beta schedule for the given name. The beta schedule library consists of beta schedules which remain similar in the limit of num_diffusion_timesteps. 
Beta schedules may be added, but should not be removed or changed once they are committed to maintain backwards compatibility. + + Args: + scheduler: The scheduler to use. + num_diffusion_timesteps: The number of diffusion timesteps. + + Returns: + The beta schedule. """ - if schedule_name == "linear": + if scheduler == Scheduler.LINEAR: # Linear schedule from Ho et al, extended to work for any number of # diffusion steps. scale = 1000 / num_diffusion_timesteps beta_start = scale * 0.0001 beta_end = scale * 0.02 return np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64) - if schedule_name == "cosine": + if scheduler == Scheduler.COSINE: return betas_for_alpha_bar( num_diffusion_timesteps, lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, ) - raise NotImplementedError(f"unknown beta schedule: {schedule_name}") + raise NotImplementedError(f"Unsupported scheduler: {scheduler.value}") def betas_for_alpha_bar(num_diffusion_timesteps: int, alpha_bar: Callable, max_beta: float = 0.999) -> np.ndarray: @@ -92,7 +99,7 @@ def __init__( gaussian_parametrization: str = "eps", multinomial_loss_type: str = "vb_stochastic", parametrization: str = "x0", - scheduler: str = "cosine", + scheduler: Scheduler = Scheduler.COSINE, device: torch.device | None = None, ): # ruff: noqa: D107 diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 52eef587..b4b5e364 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -30,6 +30,7 @@ ModelParameters, RelationOrder, RTDLParameters, + Scheduler, Tables, ) @@ -209,7 +210,7 @@ def child_training( ModelType(diffusion_config["model_type"]), GaussianLossType(diffusion_config["gaussian_loss_type"]), diffusion_config["num_timesteps"], - diffusion_config["scheduler"], + Scheduler(diffusion_config["scheduler"]), diffusion_config["lr"], diffusion_config["weight_decay"], diffusion_config["data_split_ratios"], @@ -230,7 +231,7 @@ def child_training( classifier_config["batch_size"], GaussianLossType(diffusion_config["gaussian_loss_type"]), diffusion_config["num_timesteps"], - diffusion_config["scheduler"], + Scheduler(diffusion_config["scheduler"]), cluster_col=y_col, d_layers=classifier_config["d_layers"], dim_t=classifier_config["dim_t"], @@ -258,7 +259,7 @@ def train_model( model_type: ModelType, gaussian_loss_type: GaussianLossType, num_timesteps: int, - scheduler: str, + scheduler: Scheduler, learning_rate: float, weight_decay: float, data_split_ratios: list[float], @@ -368,7 +369,7 @@ def train_classifier( batch_size: int, gaussian_loss_type: GaussianLossType, num_timesteps: int, - scheduler: str, + scheduler: Scheduler, d_layers: list[int], data_split_ratios: list[float], device: str = "cuda", diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py index ec3796d3..c4cf8a68 100644 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ b/src/midst_toolkit/models/clavaddpm/typing.py @@ -84,3 +84,10 @@ class GaussianLossType(Enum): MSE = "mse" KL = "kl" + + +class Scheduler(Enum): + """Possible types of scheduler.""" + + COSINE = "cosine" + LINEAR = "linear" From 128da65b893f48e51bcd2edfed592b42bf084e57 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 22 Sep 2025 16:58:03 -0400 Subject: [PATCH 08/40] Merge "Refactor sampler initialization: Update UniformSampler and LossSecondMomentResampler to accept num_timesteps directly, replacing the diffusion object dependency, and enhance the 
ScheduleSampler enum with a method for creating samplers." --- src/midst_toolkit/models/clavaddpm/sampler.py | 62 +++++++++---------- src/midst_toolkit/models/clavaddpm/train.py | 16 ++--- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/sampler.py b/src/midst_toolkit/models/clavaddpm/sampler.py index d9e54404..51c69588 100644 --- a/src/midst_toolkit/models/clavaddpm/sampler.py +++ b/src/midst_toolkit/models/clavaddpm/sampler.py @@ -1,14 +1,12 @@ """Samplers for the ClavaDDPM model.""" from abc import ABC, abstractmethod -from typing import Literal +from enum import Enum import numpy as np import torch from torch import Tensor -from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion - class ScheduleSampler(ABC): """ @@ -54,15 +52,15 @@ def sample(self, batch_size: int, device: str) -> tuple[Tensor, Tensor]: class UniformSampler(ScheduleSampler): - def __init__(self, diffusion: GaussianMultinomialDiffusion): + def __init__(self, num_timesteps: int): """ Initialize the UniformSampler. Args: - diffusion: The diffusion object. + num_timesteps: The number of diffusion timesteps. """ - self.diffusion = diffusion - self._weights = torch.from_numpy(np.ones([diffusion.num_timesteps])) + self.num_timesteps = num_timesteps + self._weights = torch.from_numpy(np.ones([num_timesteps])) def weights(self) -> Tensor: """Return the weights.""" @@ -125,7 +123,7 @@ def update_with_all_losses(self, ts: list[int], losses: list[float]) -> None: class LossSecondMomentResampler(LossAwareSampler): def __init__( self, - diffusion: GaussianMultinomialDiffusion, + num_timesteps: int, history_per_term: int = 10, uniform_prob: float = 0.001, ): @@ -133,15 +131,15 @@ def __init__( Initialize the LossSecondMomentResampler. Args: - diffusion: The diffusion object. + num_timesteps: The number of diffusion timesteps. history_per_term: The number of losses to keep for each timestep. uniform_prob: The probability of sampling a uniform timestep. """ - self.diffusion = diffusion + self.num_timesteps = num_timesteps self.history_per_term = history_per_term self.uniform_prob = uniform_prob - self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64) - self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.uint) + self._loss_history = np.zeros([num_timesteps, history_per_term], dtype=np.float64) + self._loss_counts = np.zeros([num_timesteps], dtype=np.uint) def weights(self) -> Tensor: """ @@ -150,7 +148,7 @@ def weights(self) -> Tensor: Warms up the sampler if it's not warmed up. """ if not self._warmed_up(): - return torch.from_numpy(np.ones([self.diffusion.num_timesteps], dtype=np.float64)) + return torch.from_numpy(np.ones([self.num_timesteps], dtype=np.float64)) weights = np.sqrt(np.mean(self._loss_history**2, axis=-1)) weights /= np.sum(weights) weights *= 1 - self.uniform_prob @@ -185,23 +183,25 @@ def _warmed_up(self) -> bool: return (self._loss_counts == self.history_per_term).all() -def create_named_schedule_sampler( - name: Literal["uniform", "loss-second-moment"], - diffusion: GaussianMultinomialDiffusion, -) -> ScheduleSampler: - """ - Create a ScheduleSampler from a library of pre-defined samplers. +class ScheduleSamplerType(Enum): + """Possible types of schedule sampler.""" - Args: - name: The name of the sampler. Should be one of ["uniform", "loss-second-moment"]. - diffusion: The diffusion object to sample for. 
+ UNIFORM = "uniform" + LOSS_SECOND_MOMENT = "loss-second-moment" - Returns: - The UniformSampler if ``name`` is "uniform", LossSecondMomentResampler if ``name`` - is "loss-second-moment". - """ - if name == "uniform": - return UniformSampler(diffusion) - if name == "loss-second-moment": - return LossSecondMomentResampler(diffusion) - raise NotImplementedError(f"unknown schedule sampler: {name}") + def create_named_schedule_sampler(self, num_timesteps: int) -> ScheduleSampler: + """ + Create a ScheduleSampler from a library of pre-defined samplers. + + Args: + num_timesteps: The number of diffusion timesteps. + + Returns: + The UniformSampler if ScheduleSamplerType.UNIFORM, LossSecondMomentResampler + if ScheduleSamplerType.LOSS_SECOND_MOMENT. + """ + if self == ScheduleSamplerType.UNIFORM: + return UniformSampler(num_timesteps) + if self == ScheduleSamplerType.LOSS_SECOND_MOMENT: + return LossSecondMomentResampler(num_timesteps) + raise NotImplementedError(f"Unsupported schedule sampler: {self.value}") diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index b4b5e364..bf1ec45b 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -21,7 +21,7 @@ ) from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion from midst_toolkit.models.clavaddpm.model import Classifier, ModelType, get_table_info -from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, create_named_schedule_sampler +from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, ScheduleSamplerType from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer from midst_toolkit.models.clavaddpm.typing import ( Configs, @@ -447,7 +447,10 @@ def train_classifier( classifier_optimizer = optim.AdamW(classifier.parameters(), lr=learning_rate) - empty_diffusion = GaussianMultinomialDiffusion( + schedule_sampler = ScheduleSamplerType.UNIFORM.create_named_schedule_sampler(num_timesteps) + key_value_logger = KeyValueLogger() + + diffusion_model = GaussianMultinomialDiffusion( num_classes=category_sizes, num_numerical_features=num_numerical_features, denoise_fn=None, # type: ignore[arg-type] @@ -456,10 +459,7 @@ def train_classifier( scheduler=scheduler, device=torch.device(device), ) - empty_diffusion.to(device) - - schedule_sampler = create_named_schedule_sampler("uniform", empty_diffusion) - key_value_logger = KeyValueLogger() + diffusion_model.to(device) classifier.train() for step in range(classifier_steps): @@ -471,7 +471,7 @@ def train_classifier( train_loader, dataset, schedule_sampler, - empty_diffusion, + diffusion_model, prefix="train", device=device, key_value_logger=key_value_logger, @@ -487,7 +487,7 @@ def train_classifier( val_loader, dataset, schedule_sampler, - empty_diffusion, + diffusion_model, prefix="val", device=device, key_value_logger=key_value_logger, From f158b4439024f96b8b20b65095ec7d3f14256323 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 23 Sep 2025 11:34:02 -0400 Subject: [PATCH 09/40] Enhance metric and loss handling: Refactor loss computation in _numerical_forward_backward_log and _compute_top_k functions to utilize the new ReductionMethod enum for improved type safety and consistency. 
--- .../evaluation/quality/synthcity/metric.py | 8 +++++++- src/midst_toolkit/models/clavaddpm/train.py | 17 +++++++++-------- src/midst_toolkit/models/clavaddpm/typing.py | 8 ++++++++ 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/midst_toolkit/evaluation/quality/synthcity/metric.py b/src/midst_toolkit/evaluation/quality/synthcity/metric.py index d671f2e8..ca2f7e73 100644 --- a/src/midst_toolkit/evaluation/quality/synthcity/metric.py +++ b/src/midst_toolkit/evaluation/quality/synthcity/metric.py @@ -84,7 +84,13 @@ def name() -> str: @classmethod def fqdn(cls) -> str: - """No idea.""" + """ + Return the fully qualified domain name of the metric. + + Returns: + The fully qualified domain name of the metric, composed of the + type and name of the metric, separated by a dot. + """ return f"{cls.type()}.{cls.name()}" def reduction(self) -> Callable: diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index bf1ec45b..f877c261 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -5,7 +5,7 @@ from dataclasses import asdict from logging import INFO, WARNING from pathlib import Path -from typing import Any, Literal +from typing import Any import numpy as np import pandas as pd @@ -28,6 +28,7 @@ GaussianLossType, IsYCond, ModelParameters, + ReductionMethod, RelationOrder, RTDLParameters, Scheduler, @@ -624,9 +625,9 @@ def _numerical_forward_backward_log( losses = {} losses[f"{prefix}_loss"] = loss.detach() - losses[f"{prefix}_acc@1"] = _compute_top_k(logits, sub_labels, k=1, reduction="none") + losses[f"{prefix}_acc@1"] = _compute_top_k(logits, sub_labels, k=1, reduction=ReductionMethod.NONE) if logits.shape[1] >= 5: - losses[f"{prefix}_acc@5"] = _compute_top_k(logits, sub_labels, k=5, reduction="none") + losses[f"{prefix}_acc@5"] = _compute_top_k(logits, sub_labels, k=5, reduction=ReductionMethod.NONE) _log_loss_dict(diffusion, sub_t, losses, key_value_logger) del losses loss = loss.mean() @@ -641,7 +642,7 @@ def _compute_top_k( logits: Tensor, labels: Tensor, k: int, - reduction: Literal["mean", "none"] = "mean", + reduction: ReductionMethod = ReductionMethod.MEAN, ) -> Tensor: """ Compute the top-k accuracy. @@ -650,18 +651,18 @@ def _compute_top_k( logits: The logits of the classifier. labels: The labels of the data. k: The number of top-k. - reduction: The reduction method. Should be one of ["mean", "none"]. Defaults to "mean". + reduction: The reduction method. Defaults to ReductionMethod.MEAN. Returns: The top-k accuracy. 
""" _, top_ks = torch.topk(logits, k, dim=-1) - if reduction == "mean": + if reduction == ReductionMethod.MEAN: return (top_ks == labels[:, None]).float().sum(dim=-1).mean() - if reduction == "none": + if reduction == ReductionMethod.NONE: return (top_ks == labels[:, None]).float().sum(dim=-1) - raise ValueError(f"reduction should be one of ['mean', 'none']: {reduction}") + raise ValueError(f"Unsupported reduction method: {reduction.value}.") def _log_loss_dict( diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py index c4cf8a68..c37ec5eb 100644 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ b/src/midst_toolkit/models/clavaddpm/typing.py @@ -91,3 +91,11 @@ class Scheduler(Enum): COSINE = "cosine" LINEAR = "linear" + + +class ReductionMethod(Enum): + """Possible methods of reduction.""" + + MEAN = "mean" + SUM = "sum" + NONE = "none" From 6503788ef575844f735d38df8ed73d5450eced02 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 29 Sep 2025 12:06:31 -0400 Subject: [PATCH 10/40] Transforming a lot of literals into enums --- src/midst_toolkit/models/clavaddpm/dataset.py | 137 ++++++------------ src/midst_toolkit/models/clavaddpm/train.py | 12 +- src/midst_toolkit/models/clavaddpm/typing.py | 73 +++++++++- 3 files changed, 122 insertions(+), 100 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index c889a878..806a077d 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -6,7 +6,6 @@ from collections import Counter from copy import deepcopy from dataclasses import astuple, dataclass, replace -from enum import Enum from pathlib import Path from typing import Any, Literal, Self, cast @@ -28,7 +27,18 @@ StandardScaler, ) -from midst_toolkit.models.clavaddpm.typing import ArrayDict, IsYCond +from midst_toolkit.models.clavaddpm.typing import ( + ArrayDict, + CatEncoding, + CatNanPolicy, + IsYCond, + Normalization, + NumNanPolicy, + PredictionType, + TaskType, + Transformations, + YPolicy, +) # TODO: Dunders are special case in python, rename these values to something else. @@ -36,57 +46,6 @@ CAT_RARE_VALUE = "__rare__" -Normalization = Literal["standard", "quantile", "minmax"] -NumNanPolicy = Literal["drop-rows", "mean"] -CatNanPolicy = Literal["most_frequent"] -CatEncoding = Literal["one-hot", "counter"] -YPolicy = Literal["default"] - - -class TaskType(Enum): - BINCLASS = "binclass" - MULTICLASS = "multiclass" - REGRESSION = "regression" - - def __str__(self) -> str: - """ - Return the string representation of the task type, which is the value of the enum. - - Returns: - The string representation of the task type. 
- """ - return self.value - - -class PredictionType(Enum): - LOGITS = "logits" - PROBS = "probs" - - -@dataclass(frozen=True) -class Transformations: - seed: int = 0 - normalization: Normalization | None = None - num_nan_policy: NumNanPolicy | None = None - cat_nan_policy: CatNanPolicy | None = None - cat_min_frequency: float | None = None - cat_encoding: CatEncoding | None = None - y_policy: YPolicy | None = "default" - - @classmethod - def default(cls) -> Self: - """Return the default transformations.""" - return cls( - seed=0, - normalization="quantile", - num_nan_policy=None, - cat_nan_policy=None, - cat_min_frequency=None, - cat_encoding=None, - y_policy="default", - ) - - @dataclass(frozen=False) class Dataset: x_num: ArrayDict | None @@ -248,11 +207,10 @@ def get_category_sizes(self, split: Literal["train", "val", "test"]) -> list[int """ return [] if self.x_cat is None else get_category_sizes(self.x_cat[split]) - # TODO: prediction_type should be of type PredictionType def calculate_metrics( self, predictions: dict[str, np.ndarray], - prediction_type: str | PredictionType | None, + prediction_type: PredictionType | None, ) -> dict[str, Any]: """ Calculate the metrics of the predictions. @@ -298,14 +256,14 @@ def get_category_sizes(x: torch.Tensor | np.ndarray) -> list[int]: def calculate_metrics( y_true: np.ndarray, y_pred: np.ndarray, - task_type: str | TaskType, - prediction_type: str | PredictionType | None, + task_type: TaskType, + prediction_type: PredictionType | None, y_info: dict[str, Any], ) -> dict[str, Any]: """ Calculate the metrics of the predictions. - Usage: calculate_metrics(y_true, y_pred, 'binclass', 'logits', {}) + Usage: calculate_metrics(y_true, y_pred, TaskType.BINCLASS, PredictionType.LOGITS, {}) Args: y_true: The true labels as a numpy array. @@ -350,10 +308,6 @@ def calculate_metrics( "roc_auc": The ROC AUC score. } """ - task_type = TaskType(task_type) - if prediction_type is not None: - prediction_type = PredictionType(prediction_type) - if task_type == TaskType.REGRESSION: assert prediction_type is None assert "std" in y_info @@ -398,14 +352,16 @@ def _get_predicted_labels_and_probs( Args: y_pred: The predicted labels as a numpy array. - task_type: The type of the task. - prediction_type: The type of the predictions. + task_type: The type of the task. Can be TaskType.BINCLASS or TaskType.MULTICLASS. + Other task types are not supported. + prediction_type: The type of the predictions. If None, will return the predictions as labels + and probabilities as None. Returns: A tuple with the labels and probabilities. The probabilities are None if the prediction_type is None. """ - assert task_type in (TaskType.BINCLASS, TaskType.MULTICLASS) + assert task_type in (TaskType.BINCLASS, TaskType.MULTICLASS), f"Unsupported task type: {task_type.value}" if prediction_type is None: return y_pred, None @@ -415,7 +371,7 @@ def _get_predicted_labels_and_probs( elif prediction_type == PredictionType.PROBS: probs = y_pred else: - raise ValueError(f"Unknown prediction_type: {prediction_type}") + raise ValueError(f"Unsupported prediction_type: {prediction_type.value}") assert probs is not None labels = np.round(probs) if task_type == TaskType.BINCLASS else probs.argmax(axis=1) @@ -713,7 +669,7 @@ def normalize( Args: x: The data to normalize. - normalization: The normalization to use. Can be "standard", "minmax", or "quantile". + normalization: The normalization to use. seed: The seed to use for the random state. Optional, default is None. 
return_normalizer: Whether to return the normalizer. Optional, default is False. @@ -722,11 +678,11 @@ def normalize( normalized data and the normalizer. """ x_train = x["train"] - if normalization == "standard": + if normalization == Normalization.STANDARD: normalizer = StandardScaler() - elif normalization == "minmax": + elif normalization == Normalization.MINMAX: normalizer = MinMaxScaler() - elif normalization == "quantile": + elif normalization == Normalization.QUANTILE: normalizer = QuantileTransformer( output_distribution="normal", n_quantiles=max(min(x["train"].shape[0] // 30, 1000), 10), @@ -734,7 +690,7 @@ def normalize( random_state=seed, ) else: - raise ValueError(f"Unknown normalization: {normalization}") + raise ValueError(f"Unsupported normalization: {normalization.value}") normalizer.fit(x_train) if return_normalizer: return {k: normalizer.transform(v) for k, v in x.items()}, normalizer @@ -749,8 +705,7 @@ def num_process_nans(dataset: Dataset, policy: NumNanPolicy | None) -> Dataset: Args: dataset: The dataset to process. - policy: The policy to use to process the NaN values. Can be "drop-rows" or "mean". - Optional, default is None. + policy: The policy to use to process the NaN values. Returns: The processed dataset. @@ -762,7 +717,7 @@ def num_process_nans(dataset: Dataset, policy: NumNanPolicy | None) -> Dataset: return dataset assert policy is not None - if policy == "drop-rows": + if policy == NumNanPolicy.DROP_ROWS: valid_masks = {k: ~v.any(1) for k, v in nan_masks.items()} assert valid_masks["test"].all(), "Cannot drop test rows, since this will affect the final metrics." new_data = {} @@ -771,7 +726,7 @@ def num_process_nans(dataset: Dataset, policy: NumNanPolicy | None) -> Dataset: if data_dict is not None: new_data[data_name] = {k: v[valid_masks[k]] for k, v in data_dict.items()} dataset = replace(dataset, **new_data) # type: ignore[arg-type] - elif policy == "mean": + elif policy == NumNanPolicy.MEAN: new_values = np.nanmean(dataset.x_num["train"], axis=0) # type: ignore[index] x_num = deepcopy(dataset.x_num) for k, v in x_num.items(): # type: ignore[union-attr] @@ -779,7 +734,7 @@ def num_process_nans(dataset: Dataset, policy: NumNanPolicy | None) -> Dataset: v[num_nan_indices] = np.take(new_values, num_nan_indices[1]) dataset = replace(dataset, x_num=x_num) else: - raise ValueError(f"Unknown policy: {policy}") + raise ValueError(f"Unsupported policy: {policy.value}") return dataset @@ -789,8 +744,7 @@ def cat_process_nans(x: ArrayDict, policy: CatNanPolicy | None) -> ArrayDict: Args: x: The data to process. - policy: The policy to use to process the NaN values. Can be "most_frequent". - Optional, default is None. + policy: The policy to use to process the NaN values. If none, will no-op. Returns: The processed data. 
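
For reference, the string-to-enum change above turns normalize() into a plain enum dispatch over scikit-learn scalers. Below is a minimal, self-contained sketch of that dispatch; the Normalization enum copies the members this patch adds to typing.py, while build_normalizer is a hypothetical helper written for illustration, not the toolkit's API.

from enum import Enum

import numpy as np
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, StandardScaler


class Normalization(Enum):
    STANDARD = "standard"
    QUANTILE = "quantile"
    MINMAX = "minmax"


def build_normalizer(normalization: Normalization, n_train_rows: int, seed: int | None = None):
    # Mirrors the branch structure in normalize(); unsupported members raise,
    # matching the "Unsupported normalization" error introduced in this patch.
    if normalization == Normalization.STANDARD:
        return StandardScaler()
    if normalization == Normalization.MINMAX:
        return MinMaxScaler()
    if normalization == Normalization.QUANTILE:
        return QuantileTransformer(
            output_distribution="normal",
            n_quantiles=max(min(n_train_rows // 30, 1000), 10),
            subsample=int(1e9),
            random_state=seed,
        )
    raise ValueError(f"Unsupported normalization: {normalization.value}")


x_train = np.random.default_rng(0).normal(size=(100, 3))
normalizer = build_normalizer(Normalization.QUANTILE, n_train_rows=len(x_train), seed=0)
x_scaled = normalizer.fit_transform(x_train)

The same pattern applies to the NumNanPolicy and CatNanPolicy branches in num_process_nans and cat_process_nans.
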
@@ -800,12 +754,12 @@ def cat_process_nans(x: ArrayDict, policy: CatNanPolicy | None) -> ArrayDict: if any(mask.any() for mask in nan_masks.values()): if policy is None: x_new = x - elif policy == "most_frequent": + elif policy == CatNanPolicy.MOST_FREQUENT: imputer = SimpleImputer(missing_values=CAT_MISSING_VALUE, strategy=policy) imputer.fit(x["train"]) x_new = {k: cast(np.ndarray, imputer.transform(v)) for k, v in x.items()} else: - raise ValueError(f"Unknown cat_nan_policy: {policy}") + raise ValueError(f"Unsupported cat_nan_policy: {policy.value}") else: assert policy is None x_new = x @@ -838,7 +792,7 @@ def cat_drop_rare(x: ArrayDict, min_frequency: float) -> ArrayDict: def cat_encode( x: ArrayDict, - encoding: CatEncoding | None, # TODO: add "ordinal" as one of the options, maybe? + encoding: CatEncoding | None, y_train: np.ndarray | None, seed: int | None, return_encoder: bool = False, @@ -848,8 +802,7 @@ def cat_encode( Args: x: The data to encode. - encoding: The encoding to use. Can be "one-hot" or "counter". Default is None. - If None, will use the "ordinal" encoding. + encoding: The encoding to use. If None, will use CatEncoding.ORDINAL. y_train: The target values. Optional, default is None. Will only be used for the "counter" encoding. seed: The seed to use for the random state. Optional, default is None. return_encoder: Whether to return the encoder. Optional, default is False. @@ -860,12 +813,12 @@ def cat_encode( - A boolean value indicating if the data was converted to numerical. - The encoder, if return_encoder is True. None otherwise. """ - if encoding != "counter": + if encoding != CatEncoding.COUNTER: y_train = None # Step 1. Map strings to 0-based ranges - if encoding is None: + if encoding is None or encoding == CatEncoding.ORDINAL: unknown_value = np.iinfo("int64").max - 3 oe = OrdinalEncoder( handle_unknown="use_encoded_value", @@ -887,7 +840,7 @@ def cat_encode( # Step 2. Encode. - if encoding == "one-hot": + if encoding == CatEncoding.ONE_HOT: ohe = OneHotEncoder( handle_unknown="ignore", sparse=False, @@ -898,7 +851,7 @@ def cat_encode( # encoder.steps.append(('ohe', ohe)) encoder.fit(x["train"]) x = {k: encoder.transform(v) for k, v in x.items()} - elif encoding == "counter": + elif encoding == CatEncoding.COUNTER: assert y_train is not None assert seed is not None loe = LeaveOneOutEncoder(sigma=0.1, random_state=seed, return_df=False) @@ -908,7 +861,7 @@ def cat_encode( if not isinstance(x["train"], pd.DataFrame): x = {k: v.values for k, v in x.items()} # type: ignore[attr-defined] else: - raise ValueError(f"Unknown encoding: {encoding}") + raise ValueError(f"Unsupported encoding: {encoding.value}") if return_encoder: return x, True, encoder @@ -921,8 +874,7 @@ def build_target(y: ArrayDict, policy: YPolicy | None, task_type: TaskType) -> t Args: y: The target values. - policy: The policy to use to build the target. Can be "default". Optional, default is None. - If none, it will no-op. + policy: The policy to use to build the target. Can be YPolicy.DEFAULT. If none, it will no-op. task_type: The type of the task. 
Returns: @@ -931,12 +883,13 @@ def build_target(y: ArrayDict, policy: YPolicy | None, task_type: TaskType) -> t info: dict[str, Any] = {"policy": policy} if policy is None: pass - elif policy == "default": + elif policy == YPolicy.DEFAULT: if task_type == TaskType.REGRESSION: mean, std = float(y["train"].mean()), float(y["train"].std()) y = {k: (v - mean) / std for k, v in y.items()} info["mean"] = mean info["std"] = std else: - raise ValueError(f"Unknown policy: {policy}") + raise ValueError(f"Unsupported policy: {policy.value}") + return y, info diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index f877c261..a9d88112 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -14,16 +14,13 @@ from midst_toolkit.common.logger import KeyValueLogger, log from midst_toolkit.models.clavaddpm.data_loaders import prepare_fast_dataloader -from midst_toolkit.models.clavaddpm.dataset import ( - Dataset, - Transformations, - make_dataset_from_df, -) +from midst_toolkit.models.clavaddpm.dataset import Dataset, make_dataset_from_df from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion from midst_toolkit.models.clavaddpm.model import Classifier, ModelType, get_table_info from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, ScheduleSamplerType from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer from midst_toolkit.models.clavaddpm.typing import ( + CatEncoding, Configs, GaussianLossType, IsYCond, @@ -33,6 +30,7 @@ RTDLParameters, Scheduler, Tables, + Transformations, ) @@ -305,7 +303,7 @@ def train_model( category_sizes = np.array(dataset.get_category_sizes("train")) # ruff: noqa: N806 - if len(category_sizes) == 0 or transformations.cat_encoding == "one-hot": + if len(category_sizes) == 0 or transformations.cat_encoding == CatEncoding.ONE_HOT: category_sizes = np.array([0]) # ruff: noqa: N806 @@ -424,7 +422,7 @@ def train_classifier( category_sizes = np.array(dataset.get_category_sizes("train")) # ruff: noqa: N806 - if len(category_sizes) == 0 or transformations.cat_encoding == "one-hot": + if len(category_sizes) == 0 or transformations.cat_encoding == CatEncoding.ONE_HOT: category_sizes = np.array([0]) # ruff: noqa: N806 print(category_sizes) diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py index c37ec5eb..32a760bc 100644 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ b/src/midst_toolkit/models/clavaddpm/typing.py @@ -1,7 +1,7 @@ from collections.abc import Callable from dataclasses import dataclass from enum import Enum -from typing import Any +from typing import Any, Self import numpy as np from torch import nn @@ -99,3 +99,74 @@ class ReductionMethod(Enum): MEAN = "mean" SUM = "sum" NONE = "none" + + +class Normalization(Enum): + """Possible types of normalization.""" + + STANDARD = "standard" + QUANTILE = "quantile" + MINMAX = "minmax" + + +class NumNanPolicy(Enum): + """Possible types of num nan policy.""" + + DROP_ROWS = "drop-rows" + MEAN = "mean" + + +class CatNanPolicy(Enum): + """Possible types of cat nan policy.""" + + MOST_FREQUENT = "most_frequent" + + +class CatEncoding(Enum): + """Possible types of cat encoding.""" + + ONE_HOT = "one-hot" + COUNTER = "counter" + ORDINAL = "ordinal" + + +class YPolicy(Enum): + """Possible types of y policy.""" + + DEFAULT = "default" + + +class TaskType(Enum): + BINCLASS = "binclass" + MULTICLASS = 
"multiclass" + REGRESSION = "regression" + + def __str__(self) -> str: + """ + Return the string representation of the task type, which is the value of the enum. + + Returns: + The string representation of the task type. + """ + return self.value + + +class PredictionType(Enum): + LOGITS = "logits" + PROBS = "probs" + + +@dataclass(frozen=True) +class Transformations: + seed: int = 0 + normalization: Normalization | None = None + num_nan_policy: NumNanPolicy | None = None + cat_nan_policy: CatNanPolicy | None = None + cat_min_frequency: float | None = None + cat_encoding: CatEncoding | None = CatEncoding.ORDINAL + y_policy: YPolicy | None = YPolicy.DEFAULT + + @classmethod + def default(cls) -> Self: + """Return the default transformations.""" + return cls(seed=0, normalization=Normalization.QUANTILE, y_policy=YPolicy.DEFAULT) From d599334a5ff6b0e74284c7936fe56fde9e7d8e28 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 29 Sep 2025 17:44:46 -0400 Subject: [PATCH 11/40] WIP renaming RTDL, cat and num and data splits --- .../models/clavaddpm/data_loaders.py | 31 +++++++---- src/midst_toolkit/models/clavaddpm/dataset.py | 55 ++++++++++--------- src/midst_toolkit/models/clavaddpm/model.py | 50 ++++++++--------- src/midst_toolkit/models/clavaddpm/train.py | 27 +++++---- src/midst_toolkit/models/clavaddpm/typing.py | 34 +++++++----- 5 files changed, 108 insertions(+), 89 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index b8fcbf27..4e28febf 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -3,7 +3,7 @@ from collections.abc import Generator from logging import INFO from pathlib import Path -from typing import Any, Literal, Self +from typing import Any, Self import numpy as np import pandas as pd @@ -11,6 +11,7 @@ from midst_toolkit.common.logger import log from midst_toolkit.models.clavaddpm.dataset import Dataset +from midst_toolkit.models.clavaddpm.typing import DataSplit def load_multi_table( @@ -132,8 +133,8 @@ def pipeline_process_data( A tuple with 2 values: - The data dictionary containing the following keys: - "df": The dataframe containing the data. - - "train": The dataframe containing the training set. - - "test": The dataframe containing the test set. It will be absent if ratio == 1. + - DataSplit.TRAIN: The dataframe containing the training set. + - DataSplit.TEST: The dataframe containing the test set. It will be absent if ratio == 1. - "numpy": A dictionary with the numeric data, containing the keys: - "x_num_train": The numeric data for the training set. - "x_cat_train": The categorical data for the training set. 
@@ -326,7 +327,9 @@ def pipeline_process_data( log(INFO, str_shape) data: dict[str, dict[str, Any]] = { - "df": {"train": train_df}, + "df": { + DataSplit.TRAIN.value: train_df, + }, "numpy": { "x_num_train": x_num_train, "x_cat_train": x_cat_train, @@ -336,7 +339,7 @@ def pipeline_process_data( if ratio < 1: assert test_df is not None and x_num_test is not None and x_cat_test is not None and y_test is not None - data["df"]["test"] = test_df + data["df"][DataSplit.TEST.value] = test_df data["numpy"]["x_num_test"] = x_num_test data["numpy"]["x_cat_test"] = x_cat_test data["numpy"]["y_test"] = y_test @@ -529,7 +532,7 @@ def __len__(self) -> int: def prepare_fast_dataloader( dataset: Dataset, - split: Literal["train", "val", "test"], + split: DataSplit, batch_size: int, y_type: str = "float", ) -> Generator[tuple[torch.Tensor, ...]]: @@ -547,13 +550,19 @@ def prepare_fast_dataloader( """ if dataset.x_cat is not None: if dataset.x_num is not None: - x = torch.from_numpy(np.concatenate([dataset.x_num[split], dataset.x_cat[split]], axis=1)).float() + x = torch.from_numpy( + np.concatenate([dataset.x_num[split.value], dataset.x_cat[split.value]], axis=1) + ).float() else: - x = torch.from_numpy(dataset.x_cat[split]).float() + x = torch.from_numpy(dataset.x_cat[split.value]).float() else: assert dataset.x_num is not None - x = torch.from_numpy(dataset.x_num[split]).float() - y = torch.from_numpy(dataset.y[split]).float() if y_type == "float" else torch.from_numpy(dataset.y[split]).long() - dataloader = FastTensorDataLoader((x, y), batch_size=batch_size, shuffle=(split == "train")) + x = torch.from_numpy(dataset.x_num[split.value]).float() + y = ( + torch.from_numpy(dataset.y[split.value]).float() + if y_type == "float" + else torch.from_numpy(dataset.y[split.value]).long() + ) + dataloader = FastTensorDataLoader((x, y), batch_size=batch_size, shuffle=(split == DataSplit.TRAIN)) while True: yield from dataloader diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index 806a077d..59ea9314 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -29,11 +29,12 @@ from midst_toolkit.models.clavaddpm.typing import ( ArrayDict, - CatEncoding, - CatNanPolicy, + CategoricalEncoding, + CategoricalNANPolicy, + DataSplit, IsYCond, Normalization, - NumNanPolicy, + NumericalNANPolicy, PredictionType, TaskType, Transformations, @@ -54,8 +55,8 @@ class Dataset: y_info: dict[str, Any] task_type: TaskType n_classes: int | None - cat_transform: OneHotEncoder | None = None - num_transform: StandardScaler | None = None + categorical_transform: OneHotEncoder | None = None + numerical_transform: StandardScaler | None = None @classmethod def from_dir(cls, directory: Path) -> Self: @@ -95,7 +96,7 @@ def _load_datasets(cls, directory: Path, dataset_name: str) -> ArrayDict: Returns: The loaded datasets with all the splits. 
""" - splits = [k for k in ["train", "val", "test"] if directory.joinpath(f"y_{k}.npy").exists()] + splits = [k.value for k in list(DataSplit) if directory.joinpath(f"y_{k.value}.npy").exists()] # TODO: figure out if there is a way of getting rid of the cast return {x: cast(np.ndarray, np.load(directory / f"{dataset_name}_{x}.npy", allow_pickle=True)) for x in splits} @@ -195,7 +196,7 @@ def output_dimension(self) -> int: return self.n_classes return 1 - def get_category_sizes(self, split: Literal["train", "val", "test"]) -> list[int]: + def get_category_sizes(self, split: DataSplit) -> list[int]: """ Get the size of the categories in the specified split of the dataset. @@ -205,7 +206,7 @@ def get_category_sizes(self, split: Literal["train", "val", "test"]) -> list[int Returns: The size of the categories in the specified split of the dataset. """ - return [] if self.x_cat is None else get_category_sizes(self.x_cat[split]) + return [] if self.x_cat is None else get_category_sizes(self.x_cat[split.value]) def calculate_metrics( self, @@ -584,7 +585,7 @@ def transform_dataset( raise RuntimeError(f"Hash collision for {cache_path}") if dataset.x_num is not None: - dataset = num_process_nans(dataset, transformations.num_nan_policy) + dataset = num_process_nans(dataset, transformations.numerical_nan_policy) num_transform = None cat_transform = None @@ -599,17 +600,17 @@ def transform_dataset( ) if dataset.x_cat is None: - assert transformations.cat_nan_policy is None - assert transformations.cat_min_frequency is None + assert transformations.categorical_nan_policy is None + assert transformations.category_minimum_frequency is None # assert transformations.cat_encoding is None x_cat = None else: - x_cat = cat_process_nans(dataset.x_cat, transformations.cat_nan_policy) - if transformations.cat_min_frequency is not None: - x_cat = cat_drop_rare(x_cat, transformations.cat_min_frequency) + x_cat = cat_process_nans(dataset.x_cat, transformations.categorical_nan_policy) + if transformations.category_minimum_frequency is not None: + x_cat = cat_drop_rare(x_cat, transformations.category_minimum_frequency) x_cat, is_num, cat_transform = cat_encode( x_cat, - transformations.cat_encoding, + transformations.categorical_encoding, dataset.y["train"], transformations.seed, return_encoder=True, @@ -621,8 +622,8 @@ def transform_dataset( y, y_info = build_target(dataset.y, transformations.y_policy, dataset.task_type) dataset = replace(dataset, x_num=x_num, x_cat=x_cat, y=y, y_info=y_info) - dataset.num_transform = num_transform - dataset.cat_transform = cat_transform + dataset.numerical_transform = num_transform + dataset.categorical_transform = cat_transform if cache_path is not None: dump_pickle((transformations, dataset), cache_path) @@ -699,7 +700,7 @@ def normalize( # TODO: is there any relationship between this function and the cat_process_nans function? # Can they be made a little more similar to each other (in terms of signature)? -def num_process_nans(dataset: Dataset, policy: NumNanPolicy | None) -> Dataset: +def num_process_nans(dataset: Dataset, policy: NumericalNANPolicy | None) -> Dataset: """ Process the NaN values in the dataset. 
@@ -717,7 +718,7 @@ def num_process_nans(dataset: Dataset, policy: NumNanPolicy | None) -> Dataset: return dataset assert policy is not None - if policy == NumNanPolicy.DROP_ROWS: + if policy == NumericalNANPolicy.DROP_ROWS: valid_masks = {k: ~v.any(1) for k, v in nan_masks.items()} assert valid_masks["test"].all(), "Cannot drop test rows, since this will affect the final metrics." new_data = {} @@ -726,7 +727,7 @@ def num_process_nans(dataset: Dataset, policy: NumNanPolicy | None) -> Dataset: if data_dict is not None: new_data[data_name] = {k: v[valid_masks[k]] for k, v in data_dict.items()} dataset = replace(dataset, **new_data) # type: ignore[arg-type] - elif policy == NumNanPolicy.MEAN: + elif policy == NumericalNANPolicy.MEAN: new_values = np.nanmean(dataset.x_num["train"], axis=0) # type: ignore[index] x_num = deepcopy(dataset.x_num) for k, v in x_num.items(): # type: ignore[union-attr] @@ -738,7 +739,7 @@ def num_process_nans(dataset: Dataset, policy: NumNanPolicy | None) -> Dataset: return dataset -def cat_process_nans(x: ArrayDict, policy: CatNanPolicy | None) -> ArrayDict: +def cat_process_nans(x: ArrayDict, policy: CategoricalNANPolicy | None) -> ArrayDict: """ Process the NaN values in the categorical data. @@ -754,7 +755,7 @@ def cat_process_nans(x: ArrayDict, policy: CatNanPolicy | None) -> ArrayDict: if any(mask.any() for mask in nan_masks.values()): if policy is None: x_new = x - elif policy == CatNanPolicy.MOST_FREQUENT: + elif policy == CategoricalNANPolicy.MOST_FREQUENT: imputer = SimpleImputer(missing_values=CAT_MISSING_VALUE, strategy=policy) imputer.fit(x["train"]) x_new = {k: cast(np.ndarray, imputer.transform(v)) for k, v in x.items()} @@ -792,7 +793,7 @@ def cat_drop_rare(x: ArrayDict, min_frequency: float) -> ArrayDict: def cat_encode( x: ArrayDict, - encoding: CatEncoding | None, + encoding: CategoricalEncoding | None, y_train: np.ndarray | None, seed: int | None, return_encoder: bool = False, @@ -813,12 +814,12 @@ def cat_encode( - A boolean value indicating if the data was converted to numerical. - The encoder, if return_encoder is True. None otherwise. """ - if encoding != CatEncoding.COUNTER: + if encoding != CategoricalEncoding.COUNTER: y_train = None # Step 1. Map strings to 0-based ranges - if encoding is None or encoding == CatEncoding.ORDINAL: + if encoding is None or encoding == CategoricalEncoding.ORDINAL: unknown_value = np.iinfo("int64").max - 3 oe = OrdinalEncoder( handle_unknown="use_encoded_value", @@ -840,7 +841,7 @@ def cat_encode( # Step 2. Encode. 
- if encoding == CatEncoding.ONE_HOT: + if encoding == CategoricalEncoding.ONE_HOT: ohe = OneHotEncoder( handle_unknown="ignore", sparse=False, @@ -851,7 +852,7 @@ def cat_encode( # encoder.steps.append(('ohe', ohe)) encoder.fit(x["train"]) x = {k: encoder.transform(v) for k, v in x.items()} - elif encoding == CatEncoding.COUNTER: + elif encoding == CategoricalEncoding.COUNTER: assert y_train is not None assert seed is not None loe = LeaveOneOutEncoder(sigma=0.1, random_state=seed, return_df=False) diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index 118173bd..0e25ddfd 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -13,7 +13,7 @@ from torch import Tensor, nn from midst_toolkit.common.logger import log -from midst_toolkit.models.clavaddpm.typing import IsYCond, ModelParameters, ModuleType, RTDLParameters +from midst_toolkit.models.clavaddpm.typing import DiffusionParameters, IsYCond, ModelParameters, ModuleType class Classifier(nn.Module): @@ -567,7 +567,7 @@ def __init__( d_in: int, num_classes: int, is_y_cond: IsYCond, - rtdl_parameters: RTDLParameters, + diffusion_parameters: DiffusionParameters, dim_t: int = 128, ): """ @@ -577,7 +577,7 @@ def __init__( d_in: The input dimension size. num_classes: The number of classes. is_y_cond: The condition on the y column. - rtdl_parameters: The parameters for the MLP. + diffusion_parameters: The parameters for the MLP. dim_t: The dimension size of the timestamp. """ super().__init__() @@ -585,15 +585,15 @@ def __init__( self.num_classes = num_classes self.is_y_cond = is_y_cond - self.rtdl_parameters = rtdl_parameters - self.rtdl_parameters.d_in = dim_t - self.rtdl_parameters.d_out = d_in + self.diffusion_parameters = diffusion_parameters + self.diffusion_parameters.d_in = dim_t + self.diffusion_parameters.d_out = d_in self.mlp = MLP.make_baseline( - d_in=self.rtdl_parameters.d_in, - d_layers=self.rtdl_parameters.d_layers, - dropout=self.rtdl_parameters.dropout, - d_out=self.rtdl_parameters.d_out, + d_in=self.diffusion_parameters.d_in, + d_layers=self.diffusion_parameters.d_layers, + dropout=self.diffusion_parameters.dropout, + d_out=self.diffusion_parameters.d_out, ) self.label_emb: nn.Embedding | nn.Linear @@ -630,7 +630,7 @@ def __init__( self, d_in: int, num_classes: int, - rtdl_parameters: RTDLParameters, + diffusion_parameters: DiffusionParameters, dim_t: int = 256, is_y_cond: IsYCond | None = None, ): @@ -640,7 +640,7 @@ def __init__( Args: d_in: The input dimension size. num_classes: The number of classes. - rtdl_parameters: The parameters for the ResNet. + diffusion_parameters: The parameters for the ResNet. dim_t: The dimension size of the timestep. is_y_cond: The condition on the y column. Optional, default is None. 
""" @@ -649,19 +649,19 @@ def __init__( self.num_classes = num_classes self.is_y_cond = is_y_cond - self.rtdl_parameters = rtdl_parameters - self.rtdl_parameters.d_in = d_in - self.rtdl_parameters.d_out = d_in - self.rtdl_parameters.emb_d = dim_t + self.diffusion_parameters = diffusion_parameters + self.diffusion_parameters.d_in = d_in + self.diffusion_parameters.d_out = d_in + self.diffusion_parameters.emb_d = dim_t self.resnet = ResNet.make_baseline( - d_in=rtdl_parameters.d_in, - n_blocks=rtdl_parameters.n_blocks, - d_main=rtdl_parameters.d_main, - d_hidden=rtdl_parameters.d_hidden, - dropout_first=rtdl_parameters.dropout_first, - dropout_second=rtdl_parameters.dropout_second, - d_out=rtdl_parameters.d_out, + d_in=self.diffusion_parameters.d_in, + n_blocks=self.diffusion_parameters.n_blocks, + d_main=self.diffusion_parameters.d_main, + d_hidden=self.diffusion_parameters.d_hidden, + dropout_first=self.diffusion_parameters.dropout_first, + dropout_second=self.diffusion_parameters.dropout_second, + d_out=self.diffusion_parameters.d_out, ) self.label_emb: nn.Embedding | nn.Linear @@ -824,13 +824,13 @@ def get_model(self, model_parameters: ModelParameters) -> nn.Module: d_in=model_parameters.d_in, num_classes=model_parameters.num_classes, is_y_cond=model_parameters.is_y_cond, - rtdl_parameters=model_parameters.rtdl_parameters, + diffusion_parameters=model_parameters.diffusion_parameters, ) if self == ModelType.RESNET: return ResNetDiffusion( d_in=model_parameters.d_in, num_classes=model_parameters.num_classes, - rtdl_parameters=model_parameters.rtdl_parameters, + diffusion_parameters=model_parameters.diffusion_parameters, ) raise ValueError(f"Unsupported model type: {self.value}") diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index a9d88112..92975220 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -20,14 +20,15 @@ from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, ScheduleSamplerType from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer from midst_toolkit.models.clavaddpm.typing import ( - CatEncoding, + CategoricalEncoding, Configs, + DataSplit, + DiffusionParameters, GaussianLossType, IsYCond, ModelParameters, ReductionMethod, RelationOrder, - RTDLParameters, Scheduler, Tables, Transformations, @@ -191,7 +192,7 @@ def child_training( y_col = f"{parent_name}_{child_name}_cluster" child_info = get_table_info(child_df_with_cluster, child_domain_dict, y_col) child_model_params = ModelParameters( - rtdl_parameters=RTDLParameters( + diffusion_parameters=DiffusionParameters( d_layers=diffusion_config["d_layers"], dropout=diffusion_config["dropout"], ), @@ -301,9 +302,9 @@ def train_model( std=0, ) - category_sizes = np.array(dataset.get_category_sizes("train")) + category_sizes = np.array(dataset.get_category_sizes(DataSplit.TRAIN)) # ruff: noqa: N806 - if len(category_sizes) == 0 or transformations.cat_encoding == CatEncoding.ONE_HOT: + if len(category_sizes) == 0 or transformations.categorical_encoding == CategoricalEncoding.ONE_HOT: category_sizes = np.array([0]) # ruff: noqa: N806 @@ -317,7 +318,7 @@ def train_model( model = model_type.get_model(model_params) model.to(device) - train_loader = prepare_fast_dataloader(dataset, split="train", batch_size=batch_size) + train_loader = prepare_fast_dataloader(dataset, split=DataSplit.TRAIN, batch_size=batch_size) diffusion = GaussianMultinomialDiffusion( num_classes=category_sizes, @@ -355,7 +356,9 @@ 
def train_model( "K": category_sizes, "empirical_class_dist": empirical_class_dist, "is_regression": dataset.is_regression, - "inverse_transform": dataset.num_transform.inverse_transform if dataset.num_transform is not None else None, + "inverse_transform": dataset.numerical_transform.inverse_transform + if dataset.numerical_transform is not None + else None, } @@ -416,13 +419,13 @@ def train_classifier( std=0, ) print(dataset.n_features) - train_loader = prepare_fast_dataloader(dataset, split="train", batch_size=batch_size, y_type="long") - val_loader = prepare_fast_dataloader(dataset, split="val", batch_size=batch_size, y_type="long") - test_loader = prepare_fast_dataloader(dataset, split="test", batch_size=batch_size, y_type="long") + train_loader = prepare_fast_dataloader(dataset, split=DataSplit.TRAIN, batch_size=batch_size, y_type="long") + val_loader = prepare_fast_dataloader(dataset, split=DataSplit.VALIDATION, batch_size=batch_size, y_type="long") + test_loader = prepare_fast_dataloader(dataset, split=DataSplit.TEST, batch_size=batch_size, y_type="long") - category_sizes = np.array(dataset.get_category_sizes("train")) + category_sizes = np.array(dataset.get_category_sizes(DataSplit.TRAIN)) # ruff: noqa: N806 - if len(category_sizes) == 0 or transformations.cat_encoding == CatEncoding.ONE_HOT: + if len(category_sizes) == 0 or transformations.categorical_encoding == CategoricalEncoding.ONE_HOT: category_sizes = np.array([0]) # ruff: noqa: N806 print(category_sizes) diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py index 32a760bc..e401bd17 100644 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ b/src/midst_toolkit/models/clavaddpm/typing.py @@ -54,8 +54,8 @@ class IsYCond(Enum): @dataclass -class RTDLParameters: - """Parameters for the RTDL model.""" +class DiffusionParameters: + """Parameters for the diffusion model.""" d_layers: list[int] dropout: float @@ -73,7 +73,7 @@ class RTDLParameters: class ModelParameters: """Parameters for the ClavaDDPM model.""" - rtdl_parameters: RTDLParameters + diffusion_parameters: DiffusionParameters d_in: int = 0 num_classes: int = 0 is_y_cond: IsYCond = IsYCond.NONE @@ -109,21 +109,21 @@ class Normalization(Enum): MINMAX = "minmax" -class NumNanPolicy(Enum): - """Possible types of num nan policy.""" +class NumericalNANPolicy(Enum): + """Possible policies for dealng with NANs in numerical data.""" DROP_ROWS = "drop-rows" MEAN = "mean" -class CatNanPolicy(Enum): - """Possible types of cat nan policy.""" +class CategoricalNANPolicy(Enum): + """Possible policies for dealng with NANs in categorical data.""" MOST_FREQUENT = "most_frequent" -class CatEncoding(Enum): - """Possible types of cat encoding.""" +class CategoricalEncoding(Enum): + """Possible types of encoding for categorical data.""" ONE_HOT = "one-hot" COUNTER = "counter" @@ -131,7 +131,7 @@ class CatEncoding(Enum): class YPolicy(Enum): - """Possible types of y policy.""" + """Possible types of policy for the y column.""" DEFAULT = "default" @@ -156,14 +156,20 @@ class PredictionType(Enum): PROBS = "probs" +class DataSplit(Enum): + TRAIN = "train" + VALIDATION = "val" + TEST = "test" + + @dataclass(frozen=True) class Transformations: seed: int = 0 normalization: Normalization | None = None - num_nan_policy: NumNanPolicy | None = None - cat_nan_policy: CatNanPolicy | None = None - cat_min_frequency: float | None = None - cat_encoding: CatEncoding | None = CatEncoding.ORDINAL + numerical_nan_policy: NumericalNANPolicy | None = None + 
categorical_nan_policy: CategoricalNANPolicy | None = None + category_minimum_frequency: float | None = None + categorical_encoding: CategoricalEncoding | None = CategoricalEncoding.ORDINAL y_policy: YPolicy | None = YPolicy.DEFAULT @classmethod From 1950f5cea29f0a57772ec3c7faf26add115104a6 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 30 Sep 2025 12:10:58 -0400 Subject: [PATCH 12/40] Using more data splits and adding types for gaussian parametrization --- src/midst_toolkit/models/clavaddpm/dataset.py | 111 ++++++++------ .../gaussian_multinomial_diffusion.py | 144 ++++-------------- src/midst_toolkit/models/clavaddpm/train.py | 14 +- 3 files changed, 98 insertions(+), 171 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index 59ea9314..34a458a9 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -7,7 +7,7 @@ from copy import deepcopy from dataclasses import astuple, dataclass, replace from pathlib import Path -from typing import Any, Literal, Self, cast +from typing import Any, Self, cast import numpy as np import pandas as pd @@ -140,7 +140,7 @@ def n_num_features(self) -> int: Returns: The number of numerical features in the dataset. """ - return 0 if self.x_num is None else self.x_num["train"].shape[1] + return 0 if self.x_num is None else self.x_num[DataSplit.TRAIN.value].shape[1] @property def n_cat_features(self) -> int: @@ -152,7 +152,7 @@ def n_cat_features(self) -> int: Returns: The number of categorical features in the dataset. """ - return 0 if self.x_cat is None else self.x_cat["train"].shape[1] + return 0 if self.x_cat is None else self.x_cat[DataSplit.TRAIN.value].shape[1] @property def n_features(self) -> int: @@ -164,8 +164,7 @@ def n_features(self) -> int: """ return self.n_num_features + self.n_cat_features - # TODO: make partition into an Enum - def size(self, split: Literal["train", "val", "test"] | None) -> int: + def size(self, split: DataSplit | None) -> int: """ Get the size of a dataset split. If no split is provided, the size of the entire dataset is returned. @@ -177,7 +176,7 @@ def size(self, split: Literal["train", "val", "test"] | None) -> int: Returns: The size of the dataset. 
""" - return sum(map(len, self.y.values())) if split is None else len(self.y[split]) + return sum(map(len, self.y.values())) if split is None else len(self.y[split.value]) @property def output_dimension(self) -> int: @@ -456,18 +455,18 @@ def make_dataset_from_df( cat_cols_with_y = [df_info["y_col"]] + cat_cols_with_y if len(cat_cols_with_y) > 0: - x_cat["train"] = train_df[cat_cols_with_y].to_numpy(dtype=np.str_) # type: ignore[index] - x_cat["val"] = val_df[cat_cols_with_y].to_numpy(dtype=np.str_) # type: ignore[index] - x_cat["test"] = test_df[cat_cols_with_y].to_numpy(dtype=np.str_) # type: ignore[index] + x_cat[DataSplit.TRAIN.value] = train_df[cat_cols_with_y].to_numpy(dtype=np.str_) # type: ignore[index] + x_cat[DataSplit.VALIDATION.value] = val_df[cat_cols_with_y].to_numpy(dtype=np.str_) # type: ignore[index] + x_cat[DataSplit.TEST.value] = test_df[cat_cols_with_y].to_numpy(dtype=np.str_) # type: ignore[index] - y["train"] = train_df[df_info["y_col"]].values.astype(np.float32) - y["val"] = val_df[df_info["y_col"]].values.astype(np.float32) - y["test"] = test_df[df_info["y_col"]].values.astype(np.float32) + y[DataSplit.TRAIN.value] = train_df[df_info["y_col"]].values.astype(np.float32) + y[DataSplit.VALIDATION.value] = val_df[df_info["y_col"]].values.astype(np.float32) + y[DataSplit.TEST.value] = test_df[df_info["y_col"]].values.astype(np.float32) if df_info["num_cols"] is not None: - x_num["train"] = train_df[df_info["num_cols"]].values.astype(np.float32) # type: ignore[index] - x_num["val"] = val_df[df_info["num_cols"]].values.astype(np.float32) # type: ignore[index] - x_num["test"] = test_df[df_info["num_cols"]].values.astype(np.float32) # type: ignore[index] + x_num[DataSplit.TRAIN.value] = train_df[df_info["num_cols"]].values.astype(np.float32) # type: ignore[index] + x_num[DataSplit.VALIDATION.value] = val_df[df_info["num_cols"]].values.astype(np.float32) # type: ignore[index] + x_num[DataSplit.TEST.value] = test_df[df_info["num_cols"]].values.astype(np.float32) # type: ignore[index] cat_column_orders = [column_to_index[col] for col in cat_cols_with_y] num_column_orders = [column_to_index[col] for col in df_info["num_cols"]] @@ -485,19 +484,19 @@ def make_dataset_from_df( if len(num_cols_with_y) > 0: assert x_num is not None - x_num["train"] = train_df[num_cols_with_y].values.astype(np.float32) - x_num["val"] = val_df[num_cols_with_y].values.astype(np.float32) - x_num["test"] = test_df[num_cols_with_y].values.astype(np.float32) + x_num[DataSplit.TRAIN.value] = train_df[num_cols_with_y].values.astype(np.float32) + x_num[DataSplit.VALIDATION.value] = val_df[num_cols_with_y].values.astype(np.float32) + x_num[DataSplit.TEST.value] = test_df[num_cols_with_y].values.astype(np.float32) - y["train"] = train_df[df_info["y_col"]].values.astype(np.float32) - y["val"] = val_df[df_info["y_col"]].values.astype(np.float32) - y["test"] = test_df[df_info["y_col"]].values.astype(np.float32) + y[DataSplit.TRAIN.value] = train_df[df_info["y_col"]].values.astype(np.float32) + y[DataSplit.VALIDATION.value] = val_df[df_info["y_col"]].values.astype(np.float32) + y[DataSplit.TEST.value] = test_df[df_info["y_col"]].values.astype(np.float32) if df_info["cat_cols"] is not None: assert x_cat is not None - x_cat["train"] = train_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) - x_cat["val"] = val_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) - x_cat["test"] = test_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) + x_cat[DataSplit.TRAIN.value] = train_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) + 
x_cat[DataSplit.VALIDATION.value] = val_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) + x_cat[DataSplit.TEST.value] = test_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) cat_column_orders = [column_to_index[col] for col in df_info["cat_cols"]] num_column_orders = [column_to_index[col] for col in num_cols_with_y] @@ -507,7 +506,9 @@ def make_dataset_from_df( label_encoders = {} if x_cat is not None and len(df_info["cat_cols"]) > 0: - x_cat_all = np.vstack((x_cat["train"], x_cat["val"], x_cat["test"])) + x_cat_all = np.vstack( + (x_cat[DataSplit.TRAIN.value], x_cat[DataSplit.VALIDATION.value], x_cat[DataSplit.TEST.value]) + ) x_cat_converted = [] for col_index in range(x_cat_all.shape[1]): label_encoder = LabelEncoder() @@ -519,18 +520,24 @@ def make_dataset_from_df( x_cat_converted = np.vstack(x_cat_converted).T # type: ignore[assignment] - train_num = x_cat["train"].shape[0] - val_num = x_cat["val"].shape[0] + train_num = x_cat[DataSplit.TRAIN.value].shape[0] + val_num = x_cat[DataSplit.VALIDATION.value].shape[0] - x_cat["train"] = x_cat_converted[:train_num, :] # type: ignore[call-overload] - x_cat["val"] = x_cat_converted[train_num : train_num + val_num, :] # type: ignore[call-overload] - x_cat["test"] = x_cat_converted[train_num + val_num :, :] # type: ignore[call-overload] + x_cat[DataSplit.TRAIN.value] = x_cat_converted[:train_num, :] # type: ignore[call-overload] + x_cat[DataSplit.VALIDATION.value] = x_cat_converted[train_num : train_num + val_num, :] # type: ignore[call-overload] + x_cat[DataSplit.TEST.value] = x_cat_converted[train_num + val_num :, :] # type: ignore[call-overload] if x_num and len(x_num) > 0: assert x_num is not None - x_num["train"] = np.concatenate((x_num["train"], x_cat["train"]), axis=1) - x_num["val"] = np.concatenate((x_num["val"], x_cat["val"]), axis=1) - x_num["test"] = np.concatenate((x_num["test"], x_cat["test"]), axis=1) + x_num[DataSplit.TRAIN.value] = np.concatenate( + (x_num[DataSplit.TRAIN.value], x_cat[DataSplit.TRAIN.value]), axis=1 + ) + x_num[DataSplit.VALIDATION.value] = np.concatenate( + (x_num[DataSplit.VALIDATION.value], x_cat[DataSplit.VALIDATION.value]), axis=1 + ) + x_num[DataSplit.TEST.value] = np.concatenate( + (x_num[DataSplit.TEST.value], x_cat[DataSplit.TEST.value]), axis=1 + ) else: x_num = x_cat x_cat = None @@ -611,7 +618,7 @@ def transform_dataset( x_cat, is_num, cat_transform = cat_encode( x_cat, transformations.categorical_encoding, - dataset.y["train"], + dataset.y[DataSplit.TRAIN.value], transformations.seed, return_encoder=True, ) @@ -678,7 +685,7 @@ def normalize( The normalized data. If return_normalizer is True, will return a tuple with the normalized data and the normalizer. """ - x_train = x["train"] + x_train = x[DataSplit.TRAIN.value] if normalization == Normalization.STANDARD: normalizer = StandardScaler() elif normalization == Normalization.MINMAX: @@ -686,7 +693,7 @@ def normalize( elif normalization == Normalization.QUANTILE: normalizer = QuantileTransformer( output_distribution="normal", - n_quantiles=max(min(x["train"].shape[0] // 30, 1000), 10), + n_quantiles=max(min(x[DataSplit.TRAIN.value].shape[0] // 30, 1000), 10), subsample=int(1e9), random_state=seed, ) @@ -720,15 +727,18 @@ def num_process_nans(dataset: Dataset, policy: NumericalNANPolicy | None) -> Dat assert policy is not None if policy == NumericalNANPolicy.DROP_ROWS: valid_masks = {k: ~v.any(1) for k, v in nan_masks.items()} - assert valid_masks["test"].all(), "Cannot drop test rows, since this will affect the final metrics." 
+ assert valid_masks[DataSplit.TEST.value].all(), ( + "Cannot drop test rows, since this will affect the final metrics." + ) new_data = {} for data_name in ["x_num", "x_cat", "y"]: + # TODO: find a way to do this without getattr data_dict = getattr(dataset, data_name) if data_dict is not None: new_data[data_name] = {k: v[valid_masks[k]] for k, v in data_dict.items()} dataset = replace(dataset, **new_data) # type: ignore[arg-type] elif policy == NumericalNANPolicy.MEAN: - new_values = np.nanmean(dataset.x_num["train"], axis=0) # type: ignore[index] + new_values = np.nanmean(dataset.x_num[DataSplit.TRAIN.value], axis=0) # type: ignore[index] x_num = deepcopy(dataset.x_num) for k, v in x_num.items(): # type: ignore[union-attr] num_nan_indices = np.where(nan_masks[k]) @@ -757,7 +767,7 @@ def cat_process_nans(x: ArrayDict, policy: CategoricalNANPolicy | None) -> Array x_new = x elif policy == CategoricalNANPolicy.MOST_FREQUENT: imputer = SimpleImputer(missing_values=CAT_MISSING_VALUE, strategy=policy) - imputer.fit(x["train"]) + imputer.fit(x[DataSplit.TRAIN.value]) x_new = {k: cast(np.ndarray, imputer.transform(v)) for k, v in x.items()} else: raise ValueError(f"Unsupported cat_nan_policy: {policy.value}") @@ -779,10 +789,10 @@ def cat_drop_rare(x: ArrayDict, min_frequency: float) -> ArrayDict: The processed data. """ assert 0.0 < min_frequency < 1.0, "min_frequency has to be between 0 and 1" - min_count = round(len(x["train"]) * min_frequency) + min_count = round(len(x[DataSplit.TRAIN.value]) * min_frequency) x_new: dict[str, list[Any]] = {key: [] for key in x} - for column_idx in range(x["train"].shape[1]): - counter = Counter(x["train"][:, column_idx].tolist()) + for column_idx in range(x[DataSplit.TRAIN.value].shape[1]): + counter = Counter(x[DataSplit.TRAIN.value][:, column_idx].tolist()) popular_categories = {k for k, v in counter.items() if v >= min_count} for part, _ in x_new.items(): x_new[part].append( @@ -825,13 +835,13 @@ def cat_encode( handle_unknown="use_encoded_value", unknown_value=unknown_value, dtype="int64", - ).fit(x["train"]) + ).fit(x[DataSplit.TRAIN.value]) encoder = make_pipeline(oe) - encoder.fit(x["train"]) + encoder.fit(x[DataSplit.TRAIN.value]) x = {k: encoder.transform(v) for k, v in x.items()} - max_values = x["train"].max(axis=0) + max_values = x[DataSplit.TRAIN.value].max(axis=0) for part in x: - if part == "train": + if part == DataSplit.TRAIN.value: continue for column_idx in range(x[part].shape[1]): x[part][x[part][:, column_idx] == unknown_value, column_idx] = max_values[column_idx] + 1 @@ -850,16 +860,16 @@ def cat_encode( encoder = make_pipeline(ohe) # encoder.steps.append(('ohe', ohe)) - encoder.fit(x["train"]) + encoder.fit(x[DataSplit.TRAIN.value]) x = {k: encoder.transform(v) for k, v in x.items()} elif encoding == CategoricalEncoding.COUNTER: assert y_train is not None assert seed is not None loe = LeaveOneOutEncoder(sigma=0.1, random_state=seed, return_df=False) encoder.steps.append(("loe", loe)) - encoder.fit(x["train"], y_train) + encoder.fit(x[DataSplit.TRAIN.value], y_train) x = {k: encoder.transform(v).astype("float32") for k, v in x.items()} - if not isinstance(x["train"], pd.DataFrame): + if not isinstance(x[DataSplit.TRAIN.value], pd.DataFrame): x = {k: v.values for k, v in x.items()} # type: ignore[attr-defined] else: raise ValueError(f"Unsupported encoding: {encoding.value}") @@ -886,7 +896,8 @@ def build_target(y: ArrayDict, policy: YPolicy | None, task_type: TaskType) -> t pass elif policy == YPolicy.DEFAULT: if task_type == 
TaskType.REGRESSION: - mean, std = float(y["train"].mean()), float(y["train"].std()) + mean = float(y[DataSplit.TRAIN.value].mean()) + std = float(y[DataSplit.TRAIN.value].std()) y = {k: (v - mean) / std for k, v in y.items()} info["mean"] = mean info["std"] = std diff --git a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py index 54b2a0a7..47ea5777 100644 --- a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py +++ b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py @@ -7,6 +7,7 @@ import math from collections.abc import Callable +from enum import Enum from typing import Any, cast import numpy as np @@ -87,6 +88,20 @@ def betas_for_alpha_bar(num_diffusion_timesteps: int, alpha_bar: Callable, max_b return np.array(betas) +class GaussianParametrization(Enum): + """Possible types of Gaussian parametrization.""" + + EPS = "eps" + X0 = "x0" + + +class Parametrization(Enum): + """Possible types of parametrization.""" + + X0 = "x0" + DIRECT = "direct" + + class GaussianMultinomialDiffusion(torch.nn.Module): def __init__( # ruff: noqa: PLR0915 @@ -96,9 +111,8 @@ def __init__( denoise_fn: torch.nn.Module, num_timesteps: int = 1000, gaussian_loss_type: GaussianLossType = GaussianLossType.MSE, - gaussian_parametrization: str = "eps", - multinomial_loss_type: str = "vb_stochastic", - parametrization: str = "x0", + gaussian_parametrization: GaussianParametrization = GaussianParametrization.EPS, + parametrization: Parametrization = Parametrization.X0, scheduler: Scheduler = Scheduler.COSINE, device: torch.device | None = None, ): @@ -107,14 +121,6 @@ def __init__( device = torch.device("cpu") super(GaussianMultinomialDiffusion, self).__init__() - assert multinomial_loss_type in ("vb_stochastic", "vb_all") - assert parametrization in ("x0", "direct") - - if multinomial_loss_type == "vb_all": - print( - "Computing the loss using the bound on _all_ timesteps." - " This is expensive both in terms of memory and computation." 
- ) self.num_numerical_features = num_numerical_features self.num_classes = num_classes # it as a vector [K1, K2, ..., Km] @@ -131,7 +137,6 @@ def __init__( self._denoise_fn = denoise_fn self.gaussian_loss_type = gaussian_loss_type self.gaussian_parametrization = gaussian_parametrization - self.multinomial_loss_type = multinomial_loss_type self.num_timesteps = num_timesteps self.parametrization = parametrization self.scheduler = scheduler @@ -269,18 +274,17 @@ def gaussian_p_mean_variance( ], dim=0, ) - # model_variance = self.posterior_variance.to(x.device) model_log_variance = torch.log(model_variance) model_variance = extract(model_variance, t, x.shape) model_log_variance = extract(model_log_variance, t, x.shape) - if self.gaussian_parametrization == "eps": + if self.gaussian_parametrization == GaussianParametrization.EPS: pred_xstart = self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output) - elif self.gaussian_parametrization == "x0": + elif self.gaussian_parametrization == GaussianParametrization.X0: pred_xstart = model_output else: - raise NotImplementedError + raise ValueError(f"Unsupported Gaussian parametrization: {self.gaussian_parametrization}") model_mean, _, _ = self.gaussian_q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t) @@ -496,8 +500,6 @@ def q_pred(self, log_x_start: Tensor, t: Tensor) -> Tensor: ) def predict_start(self, model_out: Tensor, log_x_t: Tensor, t: Tensor, out_dict: dict[str, Tensor]) -> Tensor: - # model_out = self._denoise_fn(x_t, t.to(x_t.device), **out_dict) - assert model_out.size(0) == log_x_t.size(0) assert self.num_classes is not None assert model_out.size(1) == self.num_classes.sum(), f"{model_out.size()}" @@ -508,15 +510,6 @@ def predict_start(self, model_out: Tensor, log_x_t: Tensor, t: Tensor, out_dict: return log_pred def q_posterior(self, log_x_start: Tensor, log_x_t: Tensor, t: Tensor) -> Tensor: - # q(xt-1 | xt, x0) = q(xt | xt-1, x0) * q(xt-1 | x0) / q(xt | x0) - # where q(xt | xt-1, x0) = q(xt | xt-1). - - # EV_log_qxt_x0 = self.q_pred(log_x_start, t) - - # print('sum exp', EV_log_qxt_x0.exp().sum(1).mean()) - # assert False - - # log_qxt_x0 = (log_x_t.exp() * EV_log_qxt_x0).sum(dim=1) t_minus_1 = t - 1 # Remove negative values, will not be used anyway for final decoder t_minus_1 = torch.where(t_minus_1 < 0, torch.zeros_like(t_minus_1), t_minus_1) @@ -535,13 +528,13 @@ def q_posterior(self, log_x_start: Tensor, log_x_t: Tensor, t: Tensor) -> Tensor return unnormed_logprobs - sliced_logsumexp(unnormed_logprobs, self.offsets) def p_pred(self, model_out: Tensor, log_x: Tensor, t: Tensor, out_dict: dict[str, Tensor]) -> Tensor: - if self.parametrization == "x0": + if self.parametrization == Parametrization.X0: log_x_recon = self.predict_start(model_out, log_x, t=t, out_dict=out_dict) log_model_pred = self.q_posterior(log_x_start=log_x_recon, log_x_t=log_x, t=t) - elif self.parametrization == "direct": + elif self.parametrization == Parametrization.DIRECT: log_model_pred = self.predict_start(model_out, log_x, t=t, out_dict=out_dict) else: - raise ValueError + raise ValueError(f"Unsupported parametrization: {self.parametrization}") return log_model_pred @torch.no_grad() @@ -549,38 +542,6 @@ def p_sample(self, model_out: Tensor, log_x: Tensor, t: Tensor, out_dict: dict[s model_log_prob = self.p_pred(model_out, log_x=log_x, t=t, out_dict=out_dict) return self.log_sample_categorical(model_log_prob) - # Dead code - # @torch.no_grad() - # def p_sample_loop(self, shape, out_dict): - # b = shape[0] - # # start with random normal image. 
- # img = torch.randn(shape, device=device) - - # for i in reversed(range(1, self.num_timesteps)): - # img = self.p_sample(img, torch.full((b,), i, device=self.device, dtype=torch.long), out_dict) - # return img - - # @torch.no_grad() - # def _sample(self, image_size, out_dict, batch_size=16): - # return self.p_sample_loop((batch_size, 3, image_size, image_size), out_dict) - - # Dead code - # @torch.no_grad() - # def interpolate(self, x1: Tensor, x2: Tensor, t: Tensor | None = None, lam: float = 0.5) -> Tensor: - # b, *_, device = *x1.shape, x1.device - # t = default(t, self.num_timesteps - 1) - - # assert x1.shape == x2.shape - - # t_batched = torch.stack([torch.tensor(t, device=device)] * b) - # xt1, xt2 = map(lambda x: self.q_sample(x, t=t_batched), (x1, x2)) - - # img = (1 - lam) * xt1 + lam * xt2 - # for i in reversed(range(0, t)): - # img = self.p_sample(img, torch.full((b,), i, device=device, dtype=torch.long)) - - # return img - def log_sample_categorical(self, logits: Tensor) -> Tensor: full_sample = [] for i in range(len(self.num_classes)): @@ -597,27 +558,6 @@ def q_sample(self, log_x_start: Tensor, t: Tensor) -> Tensor: # ruff: noqa: N806 return self.log_sample_categorical(log_EV_qxt_x0) - # Dead code - # def nll(self, log_x_start, out_dict): - # b = log_x_start.size(0) - # device = log_x_start.device - # loss = 0 - # for t in range(0, self.num_timesteps): - # t_array = (torch.ones(b, device=device) * t).long() - - # kl = self.compute_Lt( - # log_x_start=log_x_start, - # log_x_t=self.q_sample(log_x_start=log_x_start, t=t_array), - # t=t_array, - # out_dict=out_dict, - # ) - - # loss += kl - - # loss += self.kl_prior(log_x_start) - - # return loss - def kl_prior(self, log_x_start: Tensor) -> Tensor: b = log_x_start.size(0) device = log_x_start.device @@ -687,37 +627,13 @@ def _multinomial_loss( pt: Tensor, out_dict: dict[str, Tensor], ) -> Tensor: - if self.multinomial_loss_type == "vb_stochastic": - kl = self.compute_Lt(model_out, log_x_start, log_x_t, t, out_dict) - kl_prior = self.kl_prior(log_x_start) - # Upweigh loss term of the kl - return kl / pt + kl_prior - - if self.multinomial_loss_type == "vb_all": - # Expensive, dont do it ;). - # DEPRECATED - # return -self.nll(log_x_start) - raise ValueError("multinomial_loss_type == 'vb_all' is deprecated.") - raise ValueError - - # Dead code - # def log_prob(self, x, out_dict): - # b, device = x.size(0), x.device - # if self.training: - # return self._multinomial_loss(x, out_dict) - - # log_x_start = index_to_log_onehot(x, self.num_classes) - - # t, pt = self.sample_time(b, device, "importance") - - # kl = self.compute_Lt(log_x_start, self.q_sample(log_x_start=log_x_start, t=t), t, out_dict) - - # kl_prior = self.kl_prior(log_x_start) - - # # Upweigh loss term of the kl - # loss = kl / pt + kl_prior - - # return -loss + # Here we are calculating the VB_STOCHASTIC loss. In the original implementation, there + # was a choice between VB_STOCHASTIC and VB_ALL. VB_ALL is deprecated for being too + # expensive to calculate. 
+ kl = self.compute_Lt(model_out, log_x_start, log_x_t, t, out_dict) + kl_prior = self.kl_prior(log_x_start) + # Upweigh loss term of the kl + return kl / pt + kl_prior def mixed_loss(self, x: Tensor, out_dict: dict[str, Tensor]) -> tuple[Tensor, Tensor]: b = x.shape[0] diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 92975220..842d222e 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -308,9 +308,9 @@ def train_model( category_sizes = np.array([0]) # ruff: noqa: N806 - _, empirical_class_dist = torch.unique(torch.from_numpy(dataset.y["train"]), return_counts=True) + _, empirical_class_dist = torch.unique(torch.from_numpy(dataset.y[DataSplit.TRAIN.value]), return_counts=True) - num_numerical_features = dataset.x_num["train"].shape[1] if dataset.x_num is not None else 0 + num_numerical_features = dataset.x_num[DataSplit.TRAIN.value].shape[1] if dataset.x_num is not None else 0 d_in = np.sum(category_sizes) + num_numerical_features model_params.d_in = d_in @@ -435,7 +435,7 @@ def train_classifier( log(WARNING, "dataset.x_num is None. num_numerical_features will be set to 0") num_numerical_features = 0 else: - num_numerical_features = dataset.x_num["train"].shape[1] + num_numerical_features = dataset.x_num[DataSplit.TRAIN.value].shape[1] if model_params.is_y_cond == IsYCond.CONCAT: num_numerical_features -= 1 @@ -474,7 +474,7 @@ def train_classifier( dataset, schedule_sampler, diffusion_model, - prefix="train", + prefix=DataSplit.TRAIN.value, device=device, key_value_logger=key_value_logger, ) @@ -490,7 +490,7 @@ def train_classifier( dataset, schedule_sampler, diffusion_model, - prefix="val", + prefix=DataSplit.VALIDATION.value, device=device, key_value_logger=key_value_logger, ) @@ -588,7 +588,7 @@ def _numerical_forward_backward_log( dataset: Dataset, schedule_sampler: ScheduleSampler, diffusion: GaussianMultinomialDiffusion, - prefix: str = "train", + prefix: str = DataSplit.TRAIN.value, remove_first_col: bool = False, device: str = "cuda", key_value_logger: KeyValueLogger | None = None, @@ -603,7 +603,7 @@ def _numerical_forward_backward_log( dataset: The dataset. schedule_sampler: The schedule sampler. diffusion: The diffusion object. - prefix: The prefix for the loss. Defaults to "train". + prefix: The prefix for the loss. Defaults to DataSplit.TRAIN.value. remove_first_col: Whether to remove the first column of the batch. Defaults to False. device: The device to use. Defaults to "cuda". key_value_logger: The key-value logger to log the losses. If None, the losses are not logged. 
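
For context on the GaussianParametrization enum added in the patch above: gaussian_p_mean_variance now branches on enum members rather than the old "eps"/"x0" strings. The sketch below mirrors that branch in isolation; the helper name predicted_x_start and its toy eps arithmetic are illustrative stand-ins, not the toolkit's implementation.

from enum import Enum

import torch


class GaussianParametrization(Enum):
    EPS = "eps"
    X0 = "x0"


def predicted_x_start(
    model_output: torch.Tensor,
    x_t: torch.Tensor,
    parametrization: GaussianParametrization,
) -> torch.Tensor:
    if parametrization == GaussianParametrization.EPS:
        # In the real model this is self._predict_xstart_from_eps(x_t, t, eps=model_output);
        # a toy x0 = x_t - eps stands in for it here.
        return x_t - model_output
    if parametrization == GaussianParametrization.X0:
        # The network predicts x0 directly, so its output is used as-is.
        return model_output
    raise ValueError(f"Unsupported Gaussian parametrization: {parametrization}")


x_t = torch.zeros(4, 3)
model_output = torch.randn(4, 3)
print(predicted_x_start(model_output, x_t, GaussianParametrization.EPS).shape)  # torch.Size([4, 3])
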
From 3e8237c45fabf9cab6f143941f193ff351f740bd Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 30 Sep 2025 12:19:52 -0400 Subject: [PATCH 13/40] Adding enum for YType --- .../models/clavaddpm/data_loaders.py | 24 ++++++++++--------- src/midst_toolkit/models/clavaddpm/train.py | 7 +++--- src/midst_toolkit/models/clavaddpm/typing.py | 7 ++++++ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index 4e28febf..f518b06a 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -11,7 +11,7 @@ from midst_toolkit.common.logger import log from midst_toolkit.models.clavaddpm.dataset import Dataset -from midst_toolkit.models.clavaddpm.typing import DataSplit +from midst_toolkit.models.clavaddpm.typing import DataSplit, YType def load_multi_table( @@ -534,7 +534,7 @@ def prepare_fast_dataloader( dataset: Dataset, split: DataSplit, batch_size: int, - y_type: str = "float", + y_type: YType = YType.FLOAT, ) -> Generator[tuple[torch.Tensor, ...]]: """ Prepare a fast dataloader for the dataset. @@ -543,26 +543,28 @@ def prepare_fast_dataloader( dataset: The dataset to prepare the dataloader for. split: The split to prepare the dataloader for. batch_size: The batch size to use for the dataloader. - y_type: The type of the target values. Can be "float" or "long". Default is "float". + y_type: The type of the target values. Default is YType.FLOAT. Returns: A generator of batches of data from the dataset. """ if dataset.x_cat is not None: if dataset.x_num is not None: - x = torch.from_numpy( - np.concatenate([dataset.x_num[split.value], dataset.x_cat[split.value]], axis=1) - ).float() + concatenated_features = np.concatenate([dataset.x_num[split.value], dataset.x_cat[split.value]], axis=1) + x = torch.from_numpy(concatenated_features).float() else: x = torch.from_numpy(dataset.x_cat[split.value]).float() else: assert dataset.x_num is not None x = torch.from_numpy(dataset.x_num[split.value]).float() - y = ( - torch.from_numpy(dataset.y[split.value]).float() - if y_type == "float" - else torch.from_numpy(dataset.y[split.value]).long() - ) + + if y_type == YType.FLOAT: + y = torch.from_numpy(dataset.y[split.value]).float() + elif y_type == YType.LONG: + y = torch.from_numpy(dataset.y[split.value]).long() + else: + raise ValueError(f"Unsupported y type: {y_type}") + dataloader = FastTensorDataLoader((x, y), batch_size=batch_size, shuffle=(split == DataSplit.TRAIN)) while True: yield from dataloader diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 842d222e..fa7700b4 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -32,6 +32,7 @@ Scheduler, Tables, Transformations, + YType, ) @@ -419,9 +420,9 @@ def train_classifier( std=0, ) print(dataset.n_features) - train_loader = prepare_fast_dataloader(dataset, split=DataSplit.TRAIN, batch_size=batch_size, y_type="long") - val_loader = prepare_fast_dataloader(dataset, split=DataSplit.VALIDATION, batch_size=batch_size, y_type="long") - test_loader = prepare_fast_dataloader(dataset, split=DataSplit.TEST, batch_size=batch_size, y_type="long") + train_loader = prepare_fast_dataloader(dataset, split=DataSplit.TRAIN, batch_size=batch_size, y_type=YType.LONG) + val_loader = prepare_fast_dataloader(dataset, split=DataSplit.VALIDATION, batch_size=batch_size, y_type=YType.LONG) + 
test_loader = prepare_fast_dataloader(dataset, split=DataSplit.TEST, batch_size=batch_size, y_type=YType.LONG) category_sizes = np.array(dataset.get_category_sizes(DataSplit.TRAIN)) # ruff: noqa: N806 diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py index e401bd17..b4f08a00 100644 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ b/src/midst_toolkit/models/clavaddpm/typing.py @@ -136,6 +136,13 @@ class YPolicy(Enum): DEFAULT = "default" +class YType(Enum): + """Possible types of y.""" + + FLOAT = "float" + LONG = "long" + + class TaskType(Enum): BINCLASS = "binclass" MULTICLASS = "multiclass" From 4d0707bc2d212285d23c80fa0777c460d4ac38a4 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 30 Sep 2025 12:37:33 -0400 Subject: [PATCH 14/40] Renaming Scheduler to SchedulerType and moving it and GaussianLossType to the gaussian diffusion file --- .../gaussian_multinomial_diffusion.py | 59 +++++++++++-------- src/midst_toolkit/models/clavaddpm/train.py | 24 ++++---- src/midst_toolkit/models/clavaddpm/typing.py | 14 ----- 3 files changed, 49 insertions(+), 48 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py index 47ea5777..d4c98abd 100644 --- a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py +++ b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py @@ -31,7 +31,6 @@ sliced_logsumexp, sum_except_batch, ) -from midst_toolkit.models.clavaddpm.typing import GaussianLossType, Scheduler # Based in part on: @@ -39,7 +38,35 @@ eps = 1e-8 -def get_named_beta_schedule(scheduler: Scheduler, num_diffusion_timesteps: int) -> np.ndarray: +class GaussianLossType(Enum): + """Possible types of Gaussian loss.""" + + MSE = "mse" + KL = "kl" + + +class SchedulerType(Enum): + """Possible types of scheduler.""" + + COSINE = "cosine" + LINEAR = "linear" + + +class GaussianParametrization(Enum): + """Possible types of Gaussian parametrization.""" + + EPS = "eps" + X0 = "x0" + + +class Parametrization(Enum): + """Possible types of parametrization.""" + + X0 = "x0" + DIRECT = "direct" + + +def get_named_beta_schedule(scheduler_type: SchedulerType, num_diffusion_timesteps: int) -> np.ndarray: """ Get a pre-defined beta schedule for the given name. The beta schedule library consists of beta schedules which remain similar @@ -48,25 +75,25 @@ def get_named_beta_schedule(scheduler: Scheduler, num_diffusion_timesteps: int) they are committed to maintain backwards compatibility. Args: - scheduler: The scheduler to use. + scheduler_type: The scheduler type to use. num_diffusion_timesteps: The number of diffusion timesteps. Returns: The beta schedule. """ - if scheduler == Scheduler.LINEAR: + if scheduler_type == SchedulerType.LINEAR: # Linear schedule from Ho et al, extended to work for any number of # diffusion steps. 
scale = 1000 / num_diffusion_timesteps beta_start = scale * 0.0001 beta_end = scale * 0.02 return np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64) - if scheduler == Scheduler.COSINE: + if scheduler_type == SchedulerType.COSINE: return betas_for_alpha_bar( num_diffusion_timesteps, lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, ) - raise NotImplementedError(f"Unsupported scheduler: {scheduler.value}") + raise ValueError(f"Unsupported scheduler: {scheduler_type.value}") def betas_for_alpha_bar(num_diffusion_timesteps: int, alpha_bar: Callable, max_beta: float = 0.999) -> np.ndarray: @@ -88,20 +115,6 @@ def betas_for_alpha_bar(num_diffusion_timesteps: int, alpha_bar: Callable, max_b return np.array(betas) -class GaussianParametrization(Enum): - """Possible types of Gaussian parametrization.""" - - EPS = "eps" - X0 = "x0" - - -class Parametrization(Enum): - """Possible types of parametrization.""" - - X0 = "x0" - DIRECT = "direct" - - class GaussianMultinomialDiffusion(torch.nn.Module): def __init__( # ruff: noqa: PLR0915 @@ -113,7 +126,7 @@ def __init__( gaussian_loss_type: GaussianLossType = GaussianLossType.MSE, gaussian_parametrization: GaussianParametrization = GaussianParametrization.EPS, parametrization: Parametrization = Parametrization.X0, - scheduler: Scheduler = Scheduler.COSINE, + scheduler_type: SchedulerType = SchedulerType.COSINE, device: torch.device | None = None, ): # ruff: noqa: D107 @@ -139,7 +152,7 @@ def __init__( self.gaussian_parametrization = gaussian_parametrization self.num_timesteps = num_timesteps self.parametrization = parametrization - self.scheduler = scheduler + self.scheduler_type = scheduler_type self.device = device self.alphas: Tensor self.alphas_cumprod: Tensor @@ -156,7 +169,7 @@ def __init__( self.Lt_history: Tensor self.Lt_count: Tensor - a = 1.0 - get_named_beta_schedule(scheduler, num_timesteps) + a = 1.0 - get_named_beta_schedule(scheduler_type, num_timesteps) alphas = torch.tensor(a.astype("float64")) betas = 1.0 - alphas diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index fa7700b4..6afb7650 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -15,7 +15,11 @@ from midst_toolkit.common.logger import KeyValueLogger, log from midst_toolkit.models.clavaddpm.data_loaders import prepare_fast_dataloader from midst_toolkit.models.clavaddpm.dataset import Dataset, make_dataset_from_df -from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import GaussianMultinomialDiffusion +from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import ( + GaussianLossType, + GaussianMultinomialDiffusion, + SchedulerType, +) from midst_toolkit.models.clavaddpm.model import Classifier, ModelType, get_table_info from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, ScheduleSamplerType from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer @@ -24,12 +28,10 @@ Configs, DataSplit, DiffusionParameters, - GaussianLossType, IsYCond, ModelParameters, ReductionMethod, RelationOrder, - Scheduler, Tables, Transformations, YType, @@ -211,7 +213,7 @@ def child_training( ModelType(diffusion_config["model_type"]), GaussianLossType(diffusion_config["gaussian_loss_type"]), diffusion_config["num_timesteps"], - Scheduler(diffusion_config["scheduler"]), + SchedulerType(diffusion_config["scheduler"]), diffusion_config["lr"], diffusion_config["weight_decay"], 
diffusion_config["data_split_ratios"], @@ -232,7 +234,7 @@ def child_training( classifier_config["batch_size"], GaussianLossType(diffusion_config["gaussian_loss_type"]), diffusion_config["num_timesteps"], - Scheduler(diffusion_config["scheduler"]), + SchedulerType(diffusion_config["scheduler"]), cluster_col=y_col, d_layers=classifier_config["d_layers"], dim_t=classifier_config["dim_t"], @@ -260,7 +262,7 @@ def train_model( model_type: ModelType, gaussian_loss_type: GaussianLossType, num_timesteps: int, - scheduler: Scheduler, + scheduler_type: SchedulerType, learning_rate: float, weight_decay: float, data_split_ratios: list[float], @@ -279,7 +281,7 @@ def train_model( model_type: Type of the model to use. gaussian_loss_type: Type of the gaussian loss to use. num_timesteps: Number of timesteps to use for the diffusion model. - scheduler: Scheduler to use for the diffusion model. + scheduler_type: Type of scheduler to use for the diffusion model. learning_rate: Learning rate to use for the optimizer in the diffusion model. weight_decay: Weight decay to use for the optimizer in the diffusion model. data_split_ratios: The ratios of the dataset to split into train, validation, and test. @@ -327,7 +329,7 @@ def train_model( denoise_fn=model, gaussian_loss_type=gaussian_loss_type, num_timesteps=num_timesteps, - scheduler=scheduler, + scheduler_type=scheduler_type, device=torch.device(device), ) diffusion.to(device) @@ -372,7 +374,7 @@ def train_classifier( batch_size: int, gaussian_loss_type: GaussianLossType, num_timesteps: int, - scheduler: Scheduler, + scheduler_type: SchedulerType, d_layers: list[int], data_split_ratios: list[float], device: str = "cuda", @@ -394,7 +396,7 @@ def train_classifier( batch_size: Batch size to use for training. gaussian_loss_type: Type of the gaussian loss to use. num_timesteps: Number of timesteps to use for the diffusion model. - scheduler: Scheduler to use for the diffusion model. + scheduler_type: Type of scheduler to use for the diffusion model. d_layers: List of the hidden sizes of the classifier. data_split_ratios: The ratios of the dataset to split into train, validation, and test. It must have exactly 3 values and their sum must amount to 1 (with a tolerance of 0.01). 
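A usage sketch of the renamed scheduler_type parameter (the frame, its info dictionary and the
ModelParameters instance are placeholders, and every concrete number is illustrative rather than
taken from the patch):

result = train_model(
    data_frame,
    data_frame_info,
    model_params,
    Transformations.default(),
    steps=10_000,
    batch_size=4096,
    model_type=ModelType("mlp"),
    gaussian_loss_type=GaussianLossType("mse"),
    num_timesteps=1000,
    scheduler_type=SchedulerType("cosine"),
    learning_rate=6e-4,
    weight_decay=1e-5,
    data_split_ratios=[0.7, 0.15, 0.15],  # train / validation / test, must sum to 1
)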
@@ -459,7 +461,7 @@ def train_classifier( denoise_fn=None, # type: ignore[arg-type] gaussian_loss_type=gaussian_loss_type, num_timesteps=num_timesteps, - scheduler=scheduler, + scheduler_type=scheduler_type, device=torch.device(device), ) diffusion_model.to(device) diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py index b4f08a00..efeab522 100644 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ b/src/midst_toolkit/models/clavaddpm/typing.py @@ -79,20 +79,6 @@ class ModelParameters: is_y_cond: IsYCond = IsYCond.NONE -class GaussianLossType(Enum): - """Possible types of Gaussian loss.""" - - MSE = "mse" - KL = "kl" - - -class Scheduler(Enum): - """Possible types of scheduler.""" - - COSINE = "cosine" - LINEAR = "linear" - - class ReductionMethod(Enum): """Possible methods of reduction.""" From 30d0a0d9976b92250ed9de4af4e89017b3f5eaa1 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 30 Sep 2025 17:35:01 -0400 Subject: [PATCH 15/40] WIP CR by David --- src/midst_toolkit/common/enumerations.py | 13 +- .../models/clavaddpm/clustering.py | 2 +- .../models/clavaddpm/data_loaders.py | 13 +- src/midst_toolkit/models/clavaddpm/dataset.py | 75 ++++---- .../models/clavaddpm/diffusion_utils.py | 4 +- .../models/clavaddpm/enumerations.py | 102 +++++++++++ .../gaussian_multinomial_diffusion.py | 4 +- src/midst_toolkit/models/clavaddpm/model.py | 55 ++++-- src/midst_toolkit/models/clavaddpm/train.py | 57 +++--- src/midst_toolkit/models/clavaddpm/typing.py | 171 ------------------ 10 files changed, 246 insertions(+), 250 deletions(-) create mode 100644 src/midst_toolkit/models/clavaddpm/enumerations.py delete mode 100644 src/midst_toolkit/models/clavaddpm/typing.py diff --git a/src/midst_toolkit/common/enumerations.py b/src/midst_toolkit/common/enumerations.py index df3b3bcd..c9079f6f 100644 --- a/src/midst_toolkit/common/enumerations.py +++ b/src/midst_toolkit/common/enumerations.py @@ -7,10 +7,21 @@ class TaskType(Enum): REGRESSION = "regression" def __str__(self) -> str: - """Return the string value of the enum.""" + """ + Return the string representation of the task type, which is the value of the enum. + + Returns: + The string representation of the task type. 
+ """ return self.value class PredictionType(Enum): LOGITS = "logits" PROBS = "probs" + + +class DataSplit(Enum): + TRAIN = "train" + VALIDATION = "val" + TEST = "test" diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index d7896f5c..be225efe 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -14,7 +14,7 @@ from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, QuantileTransformer from midst_toolkit.common.logger import log -from midst_toolkit.models.clavaddpm.typing import ( +from midst_toolkit.models.clavaddpm.enumerations import ( ClusteringMethod, Configs, GroupLengthsProbDicts, diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index f518b06a..a1b35692 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -9,9 +9,10 @@ import pandas as pd import torch +from midst_toolkit.common.enumerations import DataSplit from midst_toolkit.common.logger import log from midst_toolkit.models.clavaddpm.dataset import Dataset -from midst_toolkit.models.clavaddpm.typing import DataSplit, YType +from midst_toolkit.models.clavaddpm.enumerations import TargetType def load_multi_table( @@ -534,7 +535,7 @@ def prepare_fast_dataloader( dataset: Dataset, split: DataSplit, batch_size: int, - y_type: YType = YType.FLOAT, + target_type: TargetType = TargetType.FLOAT, ) -> Generator[tuple[torch.Tensor, ...]]: """ Prepare a fast dataloader for the dataset. @@ -543,7 +544,7 @@ def prepare_fast_dataloader( dataset: The dataset to prepare the dataloader for. split: The split to prepare the dataloader for. batch_size: The batch size to use for the dataloader. - y_type: The type of the target values. Default is YType.FLOAT. + target_type: The type of the target values. Default is TargetType.FLOAT. Returns: A generator of batches of data from the dataset. 
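A minimal usage sketch of the updated prepare_fast_dataloader signature (the batch size is
illustrative; the unpacking mirrors how train_classifier consumes these loaders):

train_iter = prepare_fast_dataloader(
    dataset, split=DataSplit.TRAIN, batch_size=4096, target_type=TargetType.LONG
)
x, y = next(train_iter)  # y comes back as a LongTensor, as required for classifier targets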
@@ -558,12 +559,12 @@ def prepare_fast_dataloader( assert dataset.x_num is not None x = torch.from_numpy(dataset.x_num[split.value]).float() - if y_type == YType.FLOAT: + if target_type == TargetType.FLOAT: y = torch.from_numpy(dataset.y[split.value]).float() - elif y_type == YType.LONG: + elif target_type == TargetType.LONG: y = torch.from_numpy(dataset.y[split.value]).long() else: - raise ValueError(f"Unsupported y type: {y_type}") + raise ValueError(f"Unsupported target type: {target_type}") dataloader = FastTensorDataLoader((x, y), batch_size=batch_size, shuffle=(split == DataSplit.TRAIN)) while True: diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index 34a458a9..cabae2c3 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -27,18 +27,15 @@ StandardScaler, ) -from midst_toolkit.models.clavaddpm.typing import ( +from midst_toolkit.common.enumerations import DataSplit, PredictionType, TaskType +from midst_toolkit.models.clavaddpm.enumerations import ( ArrayDict, CategoricalEncoding, - CategoricalNANPolicy, - DataSplit, - IsYCond, + CategoricalNaNPolicy, + IsTargetCondioned, Normalization, - NumericalNANPolicy, - PredictionType, - TaskType, - Transformations, - YPolicy, + NumericalNaNPolicy, + TargetPolicy, ) @@ -47,6 +44,22 @@ CAT_RARE_VALUE = "__rare__" +@dataclass(frozen=True) +class Transformations: + seed: int = 0 + normalization: Normalization | None = None + numerical_nan_policy: NumericalNaNPolicy | None = None + categorical_nan_policy: CategoricalNaNPolicy | None = None + category_minimum_frequency: float | None = None + categorical_encoding: CategoricalEncoding | None = CategoricalEncoding.ORDINAL + target_policy: TargetPolicy | None = TargetPolicy.DEFAULT + + @classmethod + def default(cls) -> Self: + """Return the default transformations.""" + return cls(seed=0, normalization=Normalization.QUANTILE, target_policy=TargetPolicy.DEFAULT) + + @dataclass(frozen=False) class Dataset: x_num: ArrayDict | None @@ -382,7 +395,7 @@ def make_dataset_from_df( # ruff: noqa: PLR0915, PLR0912 df: pd.DataFrame, transformations: Transformations, - is_y_cond: IsYCond, + is_target_conditioned: IsTargetCondioned, df_info: dict[str, Any], ratios: list[float] | None = None, std: float = 0, @@ -400,22 +413,22 @@ def make_dataset_from_df( Args: df: The pandas DataFrame to generate the dataset from. transformations: The transformations to apply to the dataset. - is_y_cond: The condition on the y column. - concat: y is concatenated to X, the model learns a joint distribution of (y, X) - embedding: y is not concatenated to X. During computations, y is embedded + is_target_conditioned: The condition on the y column. + IsTargetCondioned.CONCAT: y is concatenated to X, the model learns a joint distribution of (y, X) + IsTargetCondioned.EMBEDDING: y is not concatenated to X. During computations, y is embedded and added to the latent vector of X - none: y column is completely ignored + IsTargetCondioned.NONE: y column is completely ignored - How does is_y_cond affect the generation of y? - is_y_cond: - concat: the model synthesizes (y, X) directly, so y is just the first column - embedding: y is first sampled using empirical distribution of y. The model only + How does is_target_conditioned affect the generation of y? 
+ is_target_conditioned: + IsTargetCondioned.CONCAT: the model synthesizes (y, X) directly, so y is just the first column + IsTargetCondioned.EMBEDDING: y is first sampled using empirical distribution of y. The model only synthesizes X. When returning the generated data, we return the generated X and the sampled y. (y is sampled from empirical distribution, instead of being generated by the model) Note that in this way, y is still not independent of X, because the model has been adding the embedding of y to the latent vector of X during computations. - none: + IsTargetCondioned.NONE: y is synthesized using y's empirical distribution. X is generated by the model. In this case, y is completely independent of X. @@ -443,7 +456,7 @@ def make_dataset_from_df( if df_info["n_classes"] > 0: x_cat: dict[str, np.ndarray] | None = ( - {} if df_info["cat_cols"] is not None or is_y_cond == IsYCond.CONCAT else None + {} if df_info["cat_cols"] is not None or is_target_conditioned == IsTargetCondioned.CONCAT else None ) x_num: dict[str, np.ndarray] | None = {} if df_info["num_cols"] is not None else None y = {} @@ -451,7 +464,7 @@ def make_dataset_from_df( cat_cols_with_y: list[str] = [] if df_info["cat_cols"] is not None: cat_cols_with_y += df_info["cat_cols"] - if is_y_cond == IsYCond.CONCAT: + if is_target_conditioned == IsTargetCondioned.CONCAT: cat_cols_with_y = [df_info["y_col"]] + cat_cols_with_y if len(cat_cols_with_y) > 0: @@ -473,13 +486,13 @@ def make_dataset_from_df( else: x_cat = {} if df_info["cat_cols"] is not None else None - x_num = {} if df_info["num_cols"] is not None or is_y_cond == IsYCond.CONCAT else None + x_num = {} if df_info["num_cols"] is not None or is_target_conditioned == IsTargetCondioned.CONCAT else None y = {} num_cols_with_y: list[str] = [] if df_info["num_cols"] is not None: num_cols_with_y += df_info["num_cols"] - if is_y_cond == IsYCond.CONCAT: + if is_target_conditioned == IsTargetCondioned.CONCAT: num_cols_with_y = [df_info["y_col"]] + num_cols_with_y if len(num_cols_with_y) > 0: @@ -626,7 +639,7 @@ def transform_dataset( x_num = x_cat if x_num is None else {x: np.hstack([x_num[x], x_cat[x]]) for x in x_num} x_cat = None - y, y_info = build_target(dataset.y, transformations.y_policy, dataset.task_type) + y, y_info = build_target(dataset.y, transformations.target_policy, dataset.task_type) dataset = replace(dataset, x_num=x_num, x_cat=x_cat, y=y, y_info=y_info) dataset.numerical_transform = num_transform @@ -707,7 +720,7 @@ def normalize( # TODO: is there any relationship between this function and the cat_process_nans function? # Can they be made a little more similar to each other (in terms of signature)? -def num_process_nans(dataset: Dataset, policy: NumericalNANPolicy | None) -> Dataset: +def num_process_nans(dataset: Dataset, policy: NumericalNaNPolicy | None) -> Dataset: """ Process the NaN values in the dataset. @@ -725,7 +738,7 @@ def num_process_nans(dataset: Dataset, policy: NumericalNANPolicy | None) -> Dat return dataset assert policy is not None - if policy == NumericalNANPolicy.DROP_ROWS: + if policy == NumericalNaNPolicy.DROP_ROWS: valid_masks = {k: ~v.any(1) for k, v in nan_masks.items()} assert valid_masks[DataSplit.TEST.value].all(), ( "Cannot drop test rows, since this will affect the final metrics." 
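A short sketch of how these policies are selected through the Transformations dataclass
introduced above (this particular combination of fields is illustrative, not mandated by the
patch):

transformations = Transformations(
    normalization=Normalization.QUANTILE,
    numerical_nan_policy=NumericalNaNPolicy.MEAN,  # impute NaNs with train-split column means
    categorical_nan_policy=CategoricalNaNPolicy.MOST_FREQUENT,
)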
@@ -737,7 +750,7 @@ def num_process_nans(dataset: Dataset, policy: NumericalNANPolicy | None) -> Dat if data_dict is not None: new_data[data_name] = {k: v[valid_masks[k]] for k, v in data_dict.items()} dataset = replace(dataset, **new_data) # type: ignore[arg-type] - elif policy == NumericalNANPolicy.MEAN: + elif policy == NumericalNaNPolicy.MEAN: new_values = np.nanmean(dataset.x_num[DataSplit.TRAIN.value], axis=0) # type: ignore[index] x_num = deepcopy(dataset.x_num) for k, v in x_num.items(): # type: ignore[union-attr] @@ -749,7 +762,7 @@ def num_process_nans(dataset: Dataset, policy: NumericalNANPolicy | None) -> Dat return dataset -def cat_process_nans(x: ArrayDict, policy: CategoricalNANPolicy | None) -> ArrayDict: +def cat_process_nans(x: ArrayDict, policy: CategoricalNaNPolicy | None) -> ArrayDict: """ Process the NaN values in the categorical data. @@ -765,7 +778,7 @@ def cat_process_nans(x: ArrayDict, policy: CategoricalNANPolicy | None) -> Array if any(mask.any() for mask in nan_masks.values()): if policy is None: x_new = x - elif policy == CategoricalNANPolicy.MOST_FREQUENT: + elif policy == CategoricalNaNPolicy.MOST_FREQUENT: imputer = SimpleImputer(missing_values=CAT_MISSING_VALUE, strategy=policy) imputer.fit(x[DataSplit.TRAIN.value]) x_new = {k: cast(np.ndarray, imputer.transform(v)) for k, v in x.items()} @@ -879,7 +892,7 @@ def cat_encode( return x, True, None -def build_target(y: ArrayDict, policy: YPolicy | None, task_type: TaskType) -> tuple[ArrayDict, dict[str, Any]]: +def build_target(y: ArrayDict, policy: TargetPolicy | None, task_type: TaskType) -> tuple[ArrayDict, dict[str, Any]]: """ Build the target and return the target values metadata. @@ -894,7 +907,7 @@ def build_target(y: ArrayDict, policy: YPolicy | None, task_type: TaskType) -> t info: dict[str, Any] = {"policy": policy} if policy is None: pass - elif policy == YPolicy.DEFAULT: + elif policy == TargetPolicy.DEFAULT: if task_type == TaskType.REGRESSION: mean = float(y[DataSplit.TRAIN.value].mean()) std = float(y[DataSplit.TRAIN.value].std()) diff --git a/src/midst_toolkit/models/clavaddpm/diffusion_utils.py b/src/midst_toolkit/models/clavaddpm/diffusion_utils.py index 76c72df2..b6720bc9 100644 --- a/src/midst_toolkit/models/clavaddpm/diffusion_utils.py +++ b/src/midst_toolkit/models/clavaddpm/diffusion_utils.py @@ -194,9 +194,9 @@ def log_onehot_to_index(log_x: Tensor) -> Tensor: return log_x.argmax(1) -class FoundNANsError(BaseException): +class FoundNaNsError(BaseException): """Found NANs during sampling.""" def __init__(self, message: str = "Found NANs during sampling.") -> None: # ruff: noqa: D107 - super(FoundNANsError, self).__init__(message) + super(FoundNaNsError, self).__init__(message) diff --git a/src/midst_toolkit/models/clavaddpm/enumerations.py b/src/midst_toolkit/models/clavaddpm/enumerations.py new file mode 100644 index 00000000..51694966 --- /dev/null +++ b/src/midst_toolkit/models/clavaddpm/enumerations.py @@ -0,0 +1,102 @@ +from collections.abc import Callable +from enum import Enum +from typing import Any + +import numpy as np +from torch import nn + + +# TODO: Temporary, will switch to classes later +Configs = dict[str, Any] +Tables = dict[str, dict[str, Any]] +RelationOrder = list[tuple[str, str]] +GroupLengthsProbDicts = dict[tuple[str, str], dict[int, dict[int, float]]] +ArrayDict = dict[str, np.ndarray] +ModuleType = str | Callable[..., nn.Module] + + +class ClusteringMethod(Enum): + """Possioble clustering methods for multi-table training.""" + + KMEANS = "kmeans" + GMM = "gmm" + 
KMEANS_AND_GMM = "kmeans_and_gmm" + VARIATIONAL = "variational" + + +class IsTargetCondioned(Enum): + """ + The condition on the y column. + + IsTargetCondioned.CONCAT: y is concatenated to X, the model learn a joint distribution of (y, X) + IsTargetCondioned.EMBEDDING: y is not concatenated to X. During computations, y is embedded + and added to the latent vector of X + IsTargetCondioned.NONE: y column is completely ignored + + How does IsTargetCondioned affect the generation of y? + IsTargetCondioned: + IsTargetCondioned.CONCAT: the model synthesizes (y, X) directly, so y is just the first column + IsTargetCondioned.EMBEDDING: y is first sampled using empirical distribution of y. The model only + synthesizes X. When returning the generated data, we return the generated X + and the sampled y. (y is sampled from empirical distribution, instead of being + generated by the model) + Note that in this way, y is still not independent of X, because the model has been + adding the embedding of y to the latent vector of X during computations. + IsTargetCondioned.NONE: + y is synthesized using y's empirical distribution. X is generated by the model. + In this case, y is completely independent of X. + """ + + CONCAT = "concat" + EMBEDDING = "embedding" + NONE = "none" + + +class ReductionMethod(Enum): + """Possible methods of reduction.""" + + MEAN = "mean" + SUM = "sum" + NONE = "none" + + +class Normalization(Enum): + """Possible types of normalization.""" + + STANDARD = "standard" + QUANTILE = "quantile" + MINMAX = "minmax" + + +class NumericalNaNPolicy(Enum): + """Possible policies for dealng with NaNs in numerical data.""" + + DROP_ROWS = "drop-rows" + MEAN = "mean" + + +class CategoricalNaNPolicy(Enum): + """Possible policies for dealng with NaNs in categorical data.""" + + MOST_FREQUENT = "most_frequent" + + +class CategoricalEncoding(Enum): + """Possible types of encoding for categorical data.""" + + ONE_HOT = "one-hot" + COUNTER = "counter" + ORDINAL = "ordinal" + + +class TargetPolicy(Enum): + """Possible types of policy for the model target.""" + + DEFAULT = "default" + + +class TargetType(Enum): + """Possible types of model target.""" + + FLOAT = "float" + LONG = "long" diff --git a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py index d4c98abd..29cd4b67 100644 --- a/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py +++ b/src/midst_toolkit/models/clavaddpm/gaussian_multinomial_diffusion.py @@ -18,7 +18,7 @@ from torch import Tensor from midst_toolkit.models.clavaddpm.diffusion_utils import ( - FoundNANsError, + FoundNaNsError, discretized_gaussian_log_likelihood, extract, index_to_log_onehot, @@ -1085,7 +1085,7 @@ def sample_all( all_samples.append(sample) all_y.append(out_dict["y"].cpu()) if sample.shape[0] != b: - raise FoundNANsError + raise FoundNaNsError num_generated += sample.shape[0] x_gen = torch.cat(all_samples, dim=0)[:num_samples] diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index 0e25ddfd..2daa2ed2 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -1,6 +1,7 @@ from __future__ import annotations import math +from dataclasses import dataclass from enum import Enum from logging import INFO from typing import Any, Self @@ -13,7 +14,33 @@ from torch import Tensor, nn from midst_toolkit.common.logger import log -from midst_toolkit.models.clavaddpm.typing 
import DiffusionParameters, IsYCond, ModelParameters, ModuleType +from midst_toolkit.models.clavaddpm.enumerations import IsTargetCondioned, ModuleType + + +@dataclass +class DiffusionParameters: + """Parameters for the diffusion model.""" + + d_layers: list[int] + dropout: float + d_in: int = 0 + d_out: int = 0 + emb_d: int = 0 + n_blocks: int = 0 + d_main: int = 0 + d_hidden: int = 0 + dropout_first: float = 0 + dropout_second: float = 0 + + +@dataclass +class ModelParameters: + """Parameters for the ClavaDDPM model.""" + + diffusion_parameters: DiffusionParameters + d_in: int = 0 + num_classes: int = 0 + is_target_conditioned: IsTargetCondioned = IsTargetCondioned.NONE class Classifier(nn.Module): @@ -566,7 +593,7 @@ def __init__( self, d_in: int, num_classes: int, - is_y_cond: IsYCond, + is_target_conditioned: IsTargetCondioned, diffusion_parameters: DiffusionParameters, dim_t: int = 128, ): @@ -576,14 +603,14 @@ def __init__( Args: d_in: The input dimension size. num_classes: The number of classes. - is_y_cond: The condition on the y column. + is_target_conditioned: The condition on the model target. diffusion_parameters: The parameters for the MLP. - dim_t: The dimension size of the timestamp. + dim_t: The dimension size of the timestep. """ super().__init__() self.dim_t = dim_t self.num_classes = num_classes - self.is_y_cond = is_y_cond + self.is_target_conditioned = is_target_conditioned self.diffusion_parameters = diffusion_parameters self.diffusion_parameters.d_in = dim_t @@ -597,9 +624,9 @@ def __init__( ) self.label_emb: nn.Embedding | nn.Linear - if self.num_classes > 0 and is_y_cond == IsYCond.EMBEDDING: + if self.num_classes > 0 and is_target_conditioned == IsTargetCondioned.EMBEDDING: self.label_emb = nn.Embedding(self.num_classes, dim_t) - elif self.num_classes == 0 and is_y_cond == IsYCond.EMBEDDING: + elif self.num_classes == 0 and is_target_conditioned == IsTargetCondioned.EMBEDDING: self.label_emb = nn.Linear(1, dim_t) self.proj = nn.Linear(d_in, dim_t) @@ -618,7 +645,7 @@ def forward(self, x: Tensor, timesteps: Tensor, y: Tensor | None = None) -> Tens The output tensor. """ emb = self.time_embed(timestep_embedding(timesteps, self.dim_t)) - if self.is_y_cond == IsYCond.EMBEDDING and y is not None: + if self.is_target_conditioned == IsTargetCondioned.EMBEDDING and y is not None: y = y.squeeze() if self.num_classes > 0 else y.resize_(y.size(0), 1).float() emb += F.silu(self.label_emb(y)) x = self.proj(x) + emb @@ -632,7 +659,7 @@ def __init__( num_classes: int, diffusion_parameters: DiffusionParameters, dim_t: int = 256, - is_y_cond: IsYCond | None = None, + is_target_conditioned: IsTargetCondioned | None = None, ): """ Initialize the ResNet diffusion model. @@ -642,12 +669,12 @@ def __init__( num_classes: The number of classes. diffusion_parameters: The parameters for the ResNet. dim_t: The dimension size of the timestep. - is_y_cond: The condition on the y column. Optional, default is None. + is_target_conditioned: The condition on the model target. Optional, default is None. 
""" super().__init__() self.dim_t = dim_t self.num_classes = num_classes - self.is_y_cond = is_y_cond + self.is_target_conditioned = is_target_conditioned self.diffusion_parameters = diffusion_parameters self.diffusion_parameters.d_in = d_in @@ -665,9 +692,9 @@ def __init__( ) self.label_emb: nn.Embedding | nn.Linear - if self.num_classes > 0 and is_y_cond == IsYCond.EMBEDDING: + if self.num_classes > 0 and is_target_conditioned == IsTargetCondioned.EMBEDDING: self.label_emb = nn.Embedding(self.num_classes, dim_t) - elif self.num_classes == 0 and is_y_cond == IsYCond.EMBEDDING: + elif self.num_classes == 0 and is_target_conditioned == IsTargetCondioned.EMBEDDING: self.label_emb = nn.Linear(1, dim_t) self.time_embed = nn.Sequential(nn.Linear(dim_t, dim_t), nn.SiLU(), nn.Linear(dim_t, dim_t)) @@ -823,7 +850,7 @@ def get_model(self, model_parameters: ModelParameters) -> nn.Module: return MLPDiffusion( d_in=model_parameters.d_in, num_classes=model_parameters.num_classes, - is_y_cond=model_parameters.is_y_cond, + is_target_conditioned=model_parameters.is_target_conditioned, diffusion_parameters=model_parameters.diffusion_parameters, ) if self == ModelType.RESNET: diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 6afb7650..a0a0facb 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -12,30 +12,33 @@ import torch from torch import Tensor, optim +from midst_toolkit.common.enumerations import DataSplit from midst_toolkit.common.logger import KeyValueLogger, log from midst_toolkit.models.clavaddpm.data_loaders import prepare_fast_dataloader -from midst_toolkit.models.clavaddpm.dataset import Dataset, make_dataset_from_df +from midst_toolkit.models.clavaddpm.dataset import Dataset, Transformations, make_dataset_from_df +from midst_toolkit.models.clavaddpm.enumerations import ( + CategoricalEncoding, + Configs, + IsTargetCondioned, + ReductionMethod, + RelationOrder, + Tables, + TargetType, +) from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import ( GaussianLossType, GaussianMultinomialDiffusion, SchedulerType, ) -from midst_toolkit.models.clavaddpm.model import Classifier, ModelType, get_table_info -from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, ScheduleSamplerType -from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer -from midst_toolkit.models.clavaddpm.typing import ( - CategoricalEncoding, - Configs, - DataSplit, +from midst_toolkit.models.clavaddpm.model import ( + Classifier, DiffusionParameters, - IsYCond, ModelParameters, - ReductionMethod, - RelationOrder, - Tables, - Transformations, - YType, + ModelType, + get_table_info, ) +from midst_toolkit.models.clavaddpm.sampler import ScheduleSampler, ScheduleSamplerType +from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer def clava_training( @@ -299,7 +302,7 @@ def train_model( dataset, label_encoders, column_orders = make_dataset_from_df( data_frame, transformations, - is_y_cond=model_params.is_y_cond, + is_target_conditioned=model_params.is_target_conditioned, ratios=data_split_ratios, df_info=data_frame_info, std=0, @@ -345,7 +348,7 @@ def train_model( ) trainer.train() - if model_params.is_y_cond == IsYCond.CONCAT: + if model_params.is_target_conditioned == IsTargetCondioned.CONCAT: column_orders = column_orders[1:] + [column_orders[0]] else: column_orders = column_orders + [data_frame_info["y_col"]] @@ -416,15 +419,21 @@ def train_classifier( dataset, 
label_encoders, column_orders = make_dataset_from_df( data_frame, transformations, - is_y_cond=model_params.is_y_cond, + is_target_conditioned=model_params.is_target_conditioned, ratios=data_split_ratios, df_info=data_frame_info, std=0, ) print(dataset.n_features) - train_loader = prepare_fast_dataloader(dataset, split=DataSplit.TRAIN, batch_size=batch_size, y_type=YType.LONG) - val_loader = prepare_fast_dataloader(dataset, split=DataSplit.VALIDATION, batch_size=batch_size, y_type=YType.LONG) - test_loader = prepare_fast_dataloader(dataset, split=DataSplit.TEST, batch_size=batch_size, y_type=YType.LONG) + train_loader = prepare_fast_dataloader( + dataset, split=DataSplit.TRAIN, batch_size=batch_size, target_type=TargetType.LONG + ) + val_loader = prepare_fast_dataloader( + dataset, split=DataSplit.VALIDATION, batch_size=batch_size, target_type=TargetType.LONG + ) + test_loader = prepare_fast_dataloader( + dataset, split=DataSplit.TEST, batch_size=batch_size, target_type=TargetType.LONG + ) category_sizes = np.array(dataset.get_category_sizes(DataSplit.TRAIN)) # ruff: noqa: N806 @@ -440,7 +449,7 @@ def train_classifier( else: num_numerical_features = dataset.x_num[DataSplit.TRAIN.value].shape[1] - if model_params.is_y_cond == IsYCond.CONCAT: + if model_params.is_target_conditioned == IsTargetCondioned.CONCAT: num_numerical_features -= 1 classifier = Classifier( @@ -511,7 +520,11 @@ def train_classifier( for _ in range(3000): test_x, test_y = next(test_loader) test_y = test_y.long().to(device) - test_x = test_x[:, 1:].to(device) if model_params.is_y_cond == IsYCond.CONCAT else test_x.to(device) + test_x = ( + test_x[:, 1:].to(device) + if model_params.is_target_conditioned == IsTargetCondioned.CONCAT + else test_x.to(device) + ) with torch.no_grad(): pred = classifier(test_x, timesteps=torch.zeros(test_x.shape[0]).to(device)) correct += (pred.argmax(dim=1) == test_y).sum().item() diff --git a/src/midst_toolkit/models/clavaddpm/typing.py b/src/midst_toolkit/models/clavaddpm/typing.py deleted file mode 100644 index efeab522..00000000 --- a/src/midst_toolkit/models/clavaddpm/typing.py +++ /dev/null @@ -1,171 +0,0 @@ -from collections.abc import Callable -from dataclasses import dataclass -from enum import Enum -from typing import Any, Self - -import numpy as np -from torch import nn - - -# TODO: Temporary, will switch to classes later -Configs = dict[str, Any] -Tables = dict[str, dict[str, Any]] -RelationOrder = list[tuple[str, str]] -GroupLengthsProbDicts = dict[tuple[str, str], dict[int, dict[int, float]]] -ArrayDict = dict[str, np.ndarray] -ModuleType = str | Callable[..., nn.Module] - - -class ClusteringMethod(Enum): - """Possioble clustering methods for multi-table training.""" - - KMEANS = "kmeans" - GMM = "gmm" - KMEANS_AND_GMM = "kmeans_and_gmm" - VARIATIONAL = "variational" - - -class IsYCond(Enum): - """ - The condition on the y column. - - IsYCond.CONCAT: y is concatenated to X, the model learn a joint distribution of (y, X) - IsYCond.EMBEDDING: y is not concatenated to X. During computations, y is embedded - and added to the latent vector of X - IsYCond.NONE: y column is completely ignored - - How does is_y_cond affect the generation of y? - is_y_cond: - IsYCond.CONCAT: the model synthesizes (y, X) directly, so y is just the first column - IsYCond.EMBEDDING: y is first sampled using empirical distribution of y. The model only - synthesizes X. When returning the generated data, we return the generated X - and the sampled y. 
(y is sampled from empirical distribution, instead of being - generated by the model) - Note that in this way, y is still not independent of X, because the model has been - adding the embedding of y to the latent vector of X during computations. - IsYCond.NONE: - y is synthesized using y's empirical distribution. X is generated by the model. - In this case, y is completely independent of X. - """ - - CONCAT = "concat" - EMBEDDING = "embedding" - NONE = "none" - - -@dataclass -class DiffusionParameters: - """Parameters for the diffusion model.""" - - d_layers: list[int] - dropout: float - d_in: int = 0 - d_out: int = 0 - emb_d: int = 0 - n_blocks: int = 0 - d_main: int = 0 - d_hidden: int = 0 - dropout_first: float = 0 - dropout_second: float = 0 - - -@dataclass -class ModelParameters: - """Parameters for the ClavaDDPM model.""" - - diffusion_parameters: DiffusionParameters - d_in: int = 0 - num_classes: int = 0 - is_y_cond: IsYCond = IsYCond.NONE - - -class ReductionMethod(Enum): - """Possible methods of reduction.""" - - MEAN = "mean" - SUM = "sum" - NONE = "none" - - -class Normalization(Enum): - """Possible types of normalization.""" - - STANDARD = "standard" - QUANTILE = "quantile" - MINMAX = "minmax" - - -class NumericalNANPolicy(Enum): - """Possible policies for dealng with NANs in numerical data.""" - - DROP_ROWS = "drop-rows" - MEAN = "mean" - - -class CategoricalNANPolicy(Enum): - """Possible policies for dealng with NANs in categorical data.""" - - MOST_FREQUENT = "most_frequent" - - -class CategoricalEncoding(Enum): - """Possible types of encoding for categorical data.""" - - ONE_HOT = "one-hot" - COUNTER = "counter" - ORDINAL = "ordinal" - - -class YPolicy(Enum): - """Possible types of policy for the y column.""" - - DEFAULT = "default" - - -class YType(Enum): - """Possible types of y.""" - - FLOAT = "float" - LONG = "long" - - -class TaskType(Enum): - BINCLASS = "binclass" - MULTICLASS = "multiclass" - REGRESSION = "regression" - - def __str__(self) -> str: - """ - Return the string representation of the task type, which is the value of the enum. - - Returns: - The string representation of the task type. 
- """ - return self.value - - -class PredictionType(Enum): - LOGITS = "logits" - PROBS = "probs" - - -class DataSplit(Enum): - TRAIN = "train" - VALIDATION = "val" - TEST = "test" - - -@dataclass(frozen=True) -class Transformations: - seed: int = 0 - normalization: Normalization | None = None - numerical_nan_policy: NumericalNANPolicy | None = None - categorical_nan_policy: CategoricalNANPolicy | None = None - category_minimum_frequency: float | None = None - categorical_encoding: CategoricalEncoding | None = CategoricalEncoding.ORDINAL - y_policy: YPolicy | None = YPolicy.DEFAULT - - @classmethod - def default(cls) -> Self: - """Return the default transformations.""" - return cls(seed=0, normalization=Normalization.QUANTILE, y_policy=YPolicy.DEFAULT) From 0e3a42a8710b1ced5b42f95835147e3a7d8d5d8c Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 1 Oct 2025 12:39:49 -0400 Subject: [PATCH 16/40] Cont'd CR comments by David --- src/midst_toolkit/models/clavaddpm/dataset.py | 32 +++++++++---------- src/midst_toolkit/models/clavaddpm/train.py | 5 --- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index cabae2c3..04738ce1 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -605,14 +605,14 @@ def transform_dataset( raise RuntimeError(f"Hash collision for {cache_path}") if dataset.x_num is not None: - dataset = num_process_nans(dataset, transformations.numerical_nan_policy) + dataset = process_nans_in_numerical_features(dataset, transformations.numerical_nan_policy) - num_transform = None - cat_transform = None + numerical_transform = None + categorical_transform = None x_num = dataset.x_num if x_num is not None and transformations.normalization is not None: - x_num, num_transform = normalize( # type: ignore[assignment] + x_num, numerical_transform = normalize( # type: ignore[assignment] x_num, transformations.normalization, transformations.seed, @@ -625,10 +625,10 @@ def transform_dataset( # assert transformations.cat_encoding is None x_cat = None else: - x_cat = cat_process_nans(dataset.x_cat, transformations.categorical_nan_policy) + x_cat = process_nans_in_categorical_features(dataset.x_cat, transformations.categorical_nan_policy) if transformations.category_minimum_frequency is not None: - x_cat = cat_drop_rare(x_cat, transformations.category_minimum_frequency) - x_cat, is_num, cat_transform = cat_encode( + x_cat = drop_rare_categories(x_cat, transformations.category_minimum_frequency) + x_cat, is_num, categorical_transform = encode_categorical_features( x_cat, transformations.categorical_encoding, dataset.y[DataSplit.TRAIN.value], @@ -642,8 +642,8 @@ def transform_dataset( y, y_info = build_target(dataset.y, transformations.target_policy, dataset.task_type) dataset = replace(dataset, x_num=x_num, x_cat=x_cat, y=y, y_info=y_info) - dataset.numerical_transform = num_transform - dataset.categorical_transform = cat_transform + dataset.numerical_transform = numerical_transform + dataset.categorical_transform = categorical_transform if cache_path is not None: dump_pickle((transformations, dataset), cache_path) @@ -720,9 +720,9 @@ def normalize( # TODO: is there any relationship between this function and the cat_process_nans function? # Can they be made a little more similar to each other (in terms of signature)? 
-def num_process_nans(dataset: Dataset, policy: NumericalNaNPolicy | None) -> Dataset: +def process_nans_in_numerical_features(dataset: Dataset, policy: NumericalNaNPolicy | None) -> Dataset: """ - Process the NaN values in the dataset. + Process the NaN values in the numerical features of thedataset. Args: dataset: The dataset to process. @@ -762,9 +762,9 @@ def num_process_nans(dataset: Dataset, policy: NumericalNaNPolicy | None) -> Dat return dataset -def cat_process_nans(x: ArrayDict, policy: CategoricalNaNPolicy | None) -> ArrayDict: +def process_nans_in_categorical_features(x: ArrayDict, policy: CategoricalNaNPolicy | None) -> ArrayDict: """ - Process the NaN values in the categorical data. + Process the NaN values in the categorical features of the dataset. Args: x: The data to process. @@ -790,7 +790,7 @@ def cat_process_nans(x: ArrayDict, policy: CategoricalNaNPolicy | None) -> Array return x_new -def cat_drop_rare(x: ArrayDict, min_frequency: float) -> ArrayDict: +def drop_rare_categories(x: ArrayDict, min_frequency: float) -> ArrayDict: """ Drop the rare categories in the categorical data. @@ -814,7 +814,7 @@ def cat_drop_rare(x: ArrayDict, min_frequency: float) -> ArrayDict: return {k: np.array(v).T for k, v in x_new.items()} -def cat_encode( +def encode_categorical_features( x: ArrayDict, encoding: CategoricalEncoding | None, y_train: np.ndarray | None, @@ -822,7 +822,7 @@ def cat_encode( return_encoder: bool = False, ) -> tuple[ArrayDict, bool, Any | None]: """ - Encode the categorical data. + Encode the categorical features of the dataset. Args: x: The data to encode. diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index a0a0facb..d8d286bf 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -298,7 +298,6 @@ def train_model( - dataset: The dataset. - column_orders: The column orders. """ - # ruff: noqa: N806 dataset, label_encoders, column_orders = make_dataset_from_df( data_frame, transformations, @@ -309,10 +308,8 @@ def train_model( ) category_sizes = np.array(dataset.get_category_sizes(DataSplit.TRAIN)) - # ruff: noqa: N806 if len(category_sizes) == 0 or transformations.categorical_encoding == CategoricalEncoding.ONE_HOT: category_sizes = np.array([0]) - # ruff: noqa: N806 _, empirical_class_dist = torch.unique(torch.from_numpy(dataset.y[DataSplit.TRAIN.value]), return_counts=True) @@ -436,10 +433,8 @@ def train_classifier( ) category_sizes = np.array(dataset.get_category_sizes(DataSplit.TRAIN)) - # ruff: noqa: N806 if len(category_sizes) == 0 or transformations.categorical_encoding == CategoricalEncoding.ONE_HOT: category_sizes = np.array([0]) - # ruff: noqa: N806 print(category_sizes) # TODO: understand what's going on here From bc672660722c516409677b6e203295526aca69ca Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 1 Oct 2025 13:19:59 -0400 Subject: [PATCH 17/40] Adding TODO --- src/midst_toolkit/models/clavaddpm/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index d8d286bf..2d88d287 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -41,6 +41,8 @@ from midst_toolkit.models.clavaddpm.trainer import ClavaDDPMTrainer +# TODO: Make diffusion_config and classifier_config into config classes and use the +# enums instead of string values. 
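+# One possible shape for such a config class (illustrative only): a dataclass whose fields
+# mirror the keys read from diffusion_config below, i.e. iterations, batch_size, model_type,
+# gaussian_loss_type, num_timesteps, scheduler, lr, weight_decay, data_split_ratios, d_layers
+# and dropout, with the string-valued entries typed as their corresponding enums.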
def clava_training( tables: Tables, relation_order: RelationOrder, From 774e99ba1db8d8845c0dde4d1f72bed2c4be72c1 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 1 Oct 2025 15:45:23 -0400 Subject: [PATCH 18/40] WIp starting the breakdown --- .../models/clavaddpm/clustering.py | 220 ++++++++++++------ 1 file changed, 144 insertions(+), 76 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index be225efe..a70c316b 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -200,93 +200,28 @@ def _pair_clustering_keep_id( all_child_cols = list(child_df.columns) all_parent_cols = list(parent_df.columns) - # Splitting the data columns into categorical and numerical based on the domain dictionary. - # Columns that are not in the domain dictionary are ignored (except for the primary and foreign keys). - child_num_cols, child_cat_cols = _get_categorical_and_numerical_columns(all_child_cols, child_domain_dict) - parent_num_cols, parent_cat_cols = _get_categorical_and_numerical_columns(all_parent_cols, parent_domain_dict) - parent_primary_key_index = all_parent_cols.index(parent_primary_key) foreign_key_index = all_child_cols.index(parent_primary_key) # sort child data by foreign key child_data = child_df.to_numpy() sorted_child_data = child_data[np.argsort(child_data[:, foreign_key_index])] - child_group_data_dict = _get_group_data_dict(sorted_child_data, [foreign_key_index]) # sort parent data by primary key parent_data = parent_df.to_numpy() sorted_parent_data = parent_data[np.argsort(parent_data[:, parent_primary_key_index])] - group_lengths = [] - unique_group_ids = sorted_parent_data[:, parent_primary_key_index] - for group_id in unique_group_ids: - group_id = (group_id,) - if group_id not in child_group_data_dict: - group_lengths.append(0) - else: - group_lengths.append(len(child_group_data_dict[group_id])) - - group_lengths_np = np.array(group_lengths, dtype=int) - - sorted_parent_data_repeated = np.repeat(sorted_parent_data, group_lengths_np, axis=0) - assert (sorted_parent_data_repeated[:, parent_primary_key_index] == sorted_child_data[:, foreign_key_index]).all() - - sorted_child_num_data = sorted_child_data[:, child_num_cols] - sorted_child_cat_data = sorted_child_data[:, child_cat_cols] - sorted_parent_num_data = sorted_parent_data_repeated[:, parent_num_cols] - sorted_parent_cat_data = sorted_parent_data_repeated[:, parent_cat_cols] - - joint_num_matrix = np.concatenate([sorted_child_num_data, sorted_parent_num_data], axis=1) - joint_cat_matrix = np.concatenate([sorted_child_cat_data, sorted_parent_cat_data], axis=1) - - if joint_cat_matrix.shape[1] > 0: - joint_cat_matrix_p_index = sorted_child_cat_data.shape[1] - joint_num_matrix_p_index = sorted_child_num_data.shape[1] - - cat_converted = [] - label_encoders = [] - for i in range(joint_cat_matrix.shape[1]): - # A threshold of 1000 unique values is used to prevent the one-hot encoding of large categorical columns - if len(np.unique(joint_cat_matrix[:, i])) > 1000: - log(WARNING, f"Categorical column {i} has more than 1000 unique values, skipping...") - continue - label_encoder = LabelEncoder() - cat_converted.append(label_encoder.fit_transform(joint_cat_matrix[:, i]).astype(float)) - label_encoders.append(label_encoder) - - cat_converted_transposed = np.vstack(cat_converted).T - - # Initialize an empty array to store the encoded values - cat_one_hot = 
np.empty((cat_converted_transposed.shape[0], 0)) - - # Loop through each column in the data and encode it - for col in range(cat_converted_transposed.shape[1]): - encoder = OneHotEncoder(sparse_output=False) - column = cat_converted_transposed[:, col].reshape(-1, 1) - encoded_column = encoder.fit_transform(column) - cat_one_hot = np.concatenate((cat_one_hot, encoded_column), axis=1) - - cat_one_hot[:, joint_cat_matrix_p_index:] = parent_scale * cat_one_hot[:, joint_cat_matrix_p_index:] - - # Perform quantile normalization using QuantileTransformer - num_quantile = _quantile_normalize_sklearn(joint_num_matrix) - num_min_max = _min_max_normalize_sklearn(joint_num_matrix) - - # TODO: change the commented lines below into options/if-conditions. - # key_quantile = - # quantile_normalize_sklearn(sorted_parent_data_repeated[:, parent_primary_key_index].reshape(-1, 1)) - key_min_max = _min_max_normalize_sklearn(sorted_parent_data_repeated[:, parent_primary_key_index].reshape(-1, 1)) - - # key_scaled = key_scaler * key_quantile - key_scaled = key_scale * key_min_max - - num_quantile[:, joint_num_matrix_p_index:] = parent_scale * num_quantile[:, joint_num_matrix_p_index:] - num_min_max[:, joint_num_matrix_p_index:] = parent_scale * num_min_max[:, joint_num_matrix_p_index:] - - if joint_cat_matrix.shape[1] > 0: - cluster_data = np.concatenate((num_min_max, cat_one_hot, key_scaled), axis=1) - else: - cluster_data = np.concatenate((num_min_max, key_scaled), axis=1) + cluster_data = _prepare_cluster_data( + sorted_child_data, + sorted_parent_data, + child_domain_dict, + parent_domain_dict, + all_child_cols, + all_parent_cols, + parent_primary_key, + parent_scale, + key_scale, + ) child_group_data = _get_group_data(sorted_child_data, [foreign_key_index]) child_group_lengths = np.array([len(group) for group in child_group_data], dtype=int) @@ -404,6 +339,139 @@ def _pair_clustering_keep_id( return parent_df_with_cluster, child_df_with_cluster, group_lengths_prob_dicts +def _repeat_parent_data( + sorted_child_data: np.ndarray, + sorted_parent_data: np.ndarray, + parent_primary_key_index: int, + foreign_key_index: int, +) -> np.ndarray: + child_group_data_dict = _get_group_data_dict(sorted_child_data, [foreign_key_index]) + + group_lengths = [] + unique_group_ids = sorted_parent_data[:, parent_primary_key_index] + for group_id in unique_group_ids: + group_id = (group_id,) + if group_id not in child_group_data_dict: + group_lengths.append(0) + else: + group_lengths.append(len(child_group_data_dict[group_id])) + group_lengths_np = np.array(group_lengths, dtype=int) + sorted_parent_data_repeated = np.repeat(sorted_parent_data, group_lengths_np, axis=0) + assert (sorted_parent_data_repeated[:, parent_primary_key_index] == sorted_child_data[:, foreign_key_index]).all() + + return sorted_parent_data_repeated + + +def _get_min_max_for_numerical_columns( + child_numerical_data: np.ndarray, + parent_numerical_data: np.ndarray, + parent_scale: float, +) -> np.ndarray: + joint_numerical_matrix = np.concatenate([child_numerical_data, parent_numerical_data], axis=1) + joint_num_matrix_p_index = child_numerical_data.shape[1] + + # Perform quantile normalization using QuantileTransformer + num_quantile = _quantile_normalize_sklearn(joint_numerical_matrix) + num_min_max = _min_max_normalize_sklearn(joint_numerical_matrix) + + num_quantile[:, joint_num_matrix_p_index:] = parent_scale * num_quantile[:, joint_num_matrix_p_index:] + num_min_max[:, joint_num_matrix_p_index:] = parent_scale * num_min_max[:, 
joint_num_matrix_p_index:] + + return num_min_max + + +def _one_hot_encode_categorical_columns( + child_categorical_data: np.ndarray, + parent_categorical_data: np.ndarray, + parent_scale: float, +) -> np.ndarray | None: + joint_categorical_matrix = np.concatenate([child_categorical_data, parent_categorical_data], axis=1) + if joint_categorical_matrix.shape[1] == 0: + return None + + joint_cat_matrix_p_index = child_categorical_data.shape[1] + + cat_converted = [] + label_encoders = [] + for i in range(joint_categorical_matrix.shape[1]): + # A threshold of 1000 unique values is used to prevent the one-hot encoding of large categorical columns + if len(np.unique(joint_categorical_matrix[:, i])) > 1000: + log(WARNING, f"Categorical column '{i}' has more than 1000 unique values, skipping...") + continue + + label_encoder = LabelEncoder() + cat_converted.append(label_encoder.fit_transform(joint_categorical_matrix[:, i]).astype(float)) + label_encoders.append(label_encoder) + + cat_converted_transposed = np.vstack(cat_converted).T + + # Initialize an empty array to store the encoded values + cat_one_hot = np.empty((cat_converted_transposed.shape[0], 0)) + + # Loop through each column in the data and encode it + for col in range(cat_converted_transposed.shape[1]): + encoder = OneHotEncoder(sparse_output=False) + column = cat_converted_transposed[:, col].reshape(-1, 1) + encoded_column = encoder.fit_transform(column) + cat_one_hot = np.concatenate((cat_one_hot, encoded_column), axis=1) + + cat_one_hot[:, joint_cat_matrix_p_index:] = parent_scale * cat_one_hot[:, joint_cat_matrix_p_index:] + + return cat_one_hot + + +def _prepare_cluster_data( + child_data: np.ndarray, + parent_data: np.ndarray, + child_domain_dict: dict[str, Any], + parent_domain_dict: dict[str, Any], + all_child_cols: list[str], + all_parent_cols: list[str], + parent_primary_key: str, + parent_scale: float, + key_scale: float, +) -> np.ndarray: + parent_primary_key_index = all_parent_cols.index(parent_primary_key) + foreign_key_index = all_child_cols.index(parent_primary_key) + + parent_data_repeated = _repeat_parent_data( + child_data, + parent_data, + parent_primary_key_index, + foreign_key_index, + ) + + # Splitting the data columns into categorical and numerical based on the domain dictionary. + # Columns that are not in the domain dictionary are ignored (except for the primary and foreign keys). 
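+    # (Each domain dictionary maps a column name to metadata of the form {"type": ..., ...},
+    # e.g. {"type": "discrete", ...}; the helper below uses that entry to decide whether a
+    # column is treated as numerical or categorical.)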
+ child_num_cols, child_cat_cols = _get_categorical_and_numerical_columns(all_child_cols, child_domain_dict) + parent_num_cols, parent_cat_cols = _get_categorical_and_numerical_columns(all_parent_cols, parent_domain_dict) + + child_numerical_data = child_data[:, child_num_cols] + child_categorical_data = child_data[:, child_cat_cols] + parent_numerical_data = parent_data_repeated[:, parent_num_cols] + parent_categorical_data = parent_data_repeated[:, parent_cat_cols] + + numerical_min_max = _get_min_max_for_numerical_columns( + child_numerical_data, + parent_numerical_data, + parent_scale, + ) + + categorical_one_hot = _one_hot_encode_categorical_columns( + child_categorical_data, + parent_categorical_data, + parent_scale, + ) + + key_min_max = _min_max_normalize_sklearn(parent_data_repeated[:, parent_primary_key_index].reshape(-1, 1)) + key_scaled = key_scale * key_min_max + + if categorical_one_hot is None: + return np.concatenate((numerical_min_max, key_scaled), axis=1) + + return np.concatenate((numerical_min_max, categorical_one_hot, key_scaled), axis=1) + + def _get_categorical_and_numerical_columns( all_columns: list[str], domain_dictionary: dict[str, Any], From 1168e938db4517e1a49886d82135f1ab86433e40 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 1 Oct 2025 15:58:40 -0400 Subject: [PATCH 19/40] Renames --- .../models/clavaddpm/clustering.py | 91 ++++++++++--------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index a70c316b..c174d764 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -197,11 +197,11 @@ def _pair_clustering_keep_id( parent_domain_dict = tables[parent_name]["domain"] child_primary_key = f"{child_name}_id" parent_primary_key = f"{parent_name}_id" - all_child_cols = list(child_df.columns) - all_parent_cols = list(parent_df.columns) + all_child_columns = list(child_df.columns) + all_parent_columns = list(parent_df.columns) - parent_primary_key_index = all_parent_cols.index(parent_primary_key) - foreign_key_index = all_child_cols.index(parent_primary_key) + parent_primary_key_index = all_parent_columns.index(parent_primary_key) + foreign_key_index = all_child_columns.index(parent_primary_key) # sort child data by foreign key child_data = child_df.to_numpy() @@ -216,8 +216,8 @@ def _pair_clustering_keep_id( sorted_parent_data, child_domain_dict, parent_domain_dict, - all_child_cols, - all_parent_cols, + all_child_columns, + all_parent_columns, parent_primary_key, parent_scale, key_scale, @@ -294,7 +294,7 @@ def _pair_clustering_keep_id( relation_cluster_name = f"{parent_name}_{child_name}_cluster" child_df_with_cluster = pd.DataFrame( sorted_child_data_with_cluster, - columns=all_child_cols + [relation_cluster_name], + columns=all_child_columns + [relation_cluster_name], ) # recover child df order @@ -324,7 +324,9 @@ def _pair_clustering_keep_id( parent_data_clusters_np = np.array(parent_data_clusters).reshape(-1, 1) parent_data_with_cluster = np.concatenate([parent_data, parent_data_clusters_np], axis=1) - parent_df_with_cluster = pd.DataFrame(parent_data_with_cluster, columns=all_parent_cols + [relation_cluster_name]) + parent_df_with_cluster = pd.DataFrame( + parent_data_with_cluster, columns=all_parent_columns + [relation_cluster_name] + ) new_col_entry = { "type": "discrete", @@ -367,17 +369,17 @@ def _get_min_max_for_numerical_columns( parent_numerical_data: np.ndarray, 
parent_scale: float, ) -> np.ndarray: - joint_numerical_matrix = np.concatenate([child_numerical_data, parent_numerical_data], axis=1) - joint_num_matrix_p_index = child_numerical_data.shape[1] + joint_matrix = np.concatenate([child_numerical_data, parent_numerical_data], axis=1) + matrix_p_index = child_numerical_data.shape[1] # Perform quantile normalization using QuantileTransformer - num_quantile = _quantile_normalize_sklearn(joint_numerical_matrix) - num_min_max = _min_max_normalize_sklearn(joint_numerical_matrix) + numerical_quantile = _quantile_normalize_sklearn(joint_matrix) + numerical_min_max = _min_max_normalize_sklearn(joint_matrix) - num_quantile[:, joint_num_matrix_p_index:] = parent_scale * num_quantile[:, joint_num_matrix_p_index:] - num_min_max[:, joint_num_matrix_p_index:] = parent_scale * num_min_max[:, joint_num_matrix_p_index:] + numerical_quantile[:, matrix_p_index:] = parent_scale * numerical_quantile[:, matrix_p_index:] + numerical_min_max[:, matrix_p_index:] = parent_scale * numerical_min_max[:, matrix_p_index:] - return num_min_max + return numerical_min_max def _one_hot_encode_categorical_columns( @@ -385,39 +387,36 @@ def _one_hot_encode_categorical_columns( parent_categorical_data: np.ndarray, parent_scale: float, ) -> np.ndarray | None: - joint_categorical_matrix = np.concatenate([child_categorical_data, parent_categorical_data], axis=1) - if joint_categorical_matrix.shape[1] == 0: + joint_matrix = np.concatenate([child_categorical_data, parent_categorical_data], axis=1) + if joint_matrix.shape[1] == 0: return None - joint_cat_matrix_p_index = child_categorical_data.shape[1] + matrix_p_index = child_categorical_data.shape[1] - cat_converted = [] - label_encoders = [] - for i in range(joint_categorical_matrix.shape[1]): + categories_converted = [] + for i in range(joint_matrix.shape[1]): # A threshold of 1000 unique values is used to prevent the one-hot encoding of large categorical columns - if len(np.unique(joint_categorical_matrix[:, i])) > 1000: + if len(np.unique(joint_matrix[:, i])) > 1000: log(WARNING, f"Categorical column '{i}' has more than 1000 unique values, skipping...") continue - label_encoder = LabelEncoder() - cat_converted.append(label_encoder.fit_transform(joint_categorical_matrix[:, i]).astype(float)) - label_encoders.append(label_encoder) + categories_converted.append(LabelEncoder().fit_transform(joint_matrix[:, i]).astype(float)) - cat_converted_transposed = np.vstack(cat_converted).T + transposed_categories = np.vstack(categories_converted).T # Initialize an empty array to store the encoded values - cat_one_hot = np.empty((cat_converted_transposed.shape[0], 0)) + categorical_one_hot = np.empty((transposed_categories.shape[0], 0)) # Loop through each column in the data and encode it - for col in range(cat_converted_transposed.shape[1]): + for column in range(transposed_categories.shape[1]): encoder = OneHotEncoder(sparse_output=False) - column = cat_converted_transposed[:, col].reshape(-1, 1) - encoded_column = encoder.fit_transform(column) - cat_one_hot = np.concatenate((cat_one_hot, encoded_column), axis=1) + reshaped_column = transposed_categories[:, column].reshape(-1, 1) + encoded_column = encoder.fit_transform(reshaped_column) + categorical_one_hot = np.concatenate((categorical_one_hot, encoded_column), axis=1) - cat_one_hot[:, joint_cat_matrix_p_index:] = parent_scale * cat_one_hot[:, joint_cat_matrix_p_index:] + categorical_one_hot[:, matrix_p_index:] = parent_scale * categorical_one_hot[:, matrix_p_index:] - return cat_one_hot + 
return categorical_one_hot def _prepare_cluster_data( @@ -425,14 +424,14 @@ def _prepare_cluster_data( parent_data: np.ndarray, child_domain_dict: dict[str, Any], parent_domain_dict: dict[str, Any], - all_child_cols: list[str], - all_parent_cols: list[str], + all_child_columns: list[str], + all_parent_columns: list[str], parent_primary_key: str, parent_scale: float, key_scale: float, ) -> np.ndarray: - parent_primary_key_index = all_parent_cols.index(parent_primary_key) - foreign_key_index = all_child_cols.index(parent_primary_key) + parent_primary_key_index = all_parent_columns.index(parent_primary_key) + foreign_key_index = all_child_columns.index(parent_primary_key) parent_data_repeated = _repeat_parent_data( child_data, @@ -443,13 +442,19 @@ def _prepare_cluster_data( # Splitting the data columns into categorical and numerical based on the domain dictionary. # Columns that are not in the domain dictionary are ignored (except for the primary and foreign keys). - child_num_cols, child_cat_cols = _get_categorical_and_numerical_columns(all_child_cols, child_domain_dict) - parent_num_cols, parent_cat_cols = _get_categorical_and_numerical_columns(all_parent_cols, parent_domain_dict) + child_numerical_columns, child_categorical_columns = _get_categorical_and_numerical_columns( + all_child_columns, + child_domain_dict, + ) + parent_numerical_columns, parent_categorical_columns = _get_categorical_and_numerical_columns( + all_parent_columns, + parent_domain_dict, + ) - child_numerical_data = child_data[:, child_num_cols] - child_categorical_data = child_data[:, child_cat_cols] - parent_numerical_data = parent_data_repeated[:, parent_num_cols] - parent_categorical_data = parent_data_repeated[:, parent_cat_cols] + child_numerical_data = child_data[:, child_numerical_columns] + child_categorical_data = child_data[:, child_categorical_columns] + parent_numerical_data = parent_data_repeated[:, parent_numerical_columns] + parent_categorical_data = parent_data_repeated[:, parent_categorical_columns] numerical_min_max = _get_min_max_for_numerical_columns( child_numerical_data, From bf05c3c1978e3db48c3cc2d12cf0c624ccbb0225 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 1 Oct 2025 18:03:25 -0400 Subject: [PATCH 20/40] Last breakdown --- .../models/clavaddpm/clustering.py | 179 +++++++++++------- 1 file changed, 111 insertions(+), 68 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index c174d764..51d72526 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -189,6 +189,13 @@ def _pair_clustering_keep_id( - parent_df_with_cluster: DataFrame of the parent table with the cluster column. - child_df_with_cluster: DataFrame of the child table with the cluster column. - group_lengths_prob_dicts: Dictionary of group lengths and probabilities. 
+ + NOTE: It will also mutate the "domain" dictionaries under the child and parent tables + to add the following entry: + "{parent_name}_{child_name}_cluster": { + "type": "discrete", + "size": num_clusters, + } """ child_df = tables[child_name]["df"] parent_df = tables[parent_name]["df"] @@ -223,42 +230,10 @@ def _pair_clustering_keep_id( key_scale, ) + cluster_labels = _get_cluster_labels(cluster_data, clustering_method, num_clusters) + child_group_data = _get_group_data(sorted_child_data, [foreign_key_index]) child_group_lengths = np.array([len(group) for group in child_group_data], dtype=int) - num_clusters = min(num_clusters, len(cluster_data)) - - if clustering_method == ClusteringMethod.KMEANS: - kmeans = KMeans(n_clusters=num_clusters, n_init="auto", init="k-means++") - kmeans.fit(cluster_data) - cluster_labels = kmeans.labels_ - elif clustering_method == ClusteringMethod.KMEANS_AND_GMM: - gmm = GaussianMixture( - n_components=num_clusters, - verbose=1, - covariance_type="diag", - init_params="k-means++", - tol=0.0001, - ) - gmm.fit(cluster_data) - cluster_labels = gmm.predict(cluster_data) - elif clustering_method == ClusteringMethod.VARIATIONAL: - bgmm = BayesianGaussianMixture( - n_components=num_clusters, - verbose=1, - covariance_type="diag", - init_params="k-means++", - tol=0.0001, - ) - bgmm.fit(cluster_data) - cluster_labels = bgmm.predict_proba(cluster_data) - elif clustering_method == ClusteringMethod.GMM: - gmm = GaussianMixture( - n_components=num_clusters, - verbose=1, - covariance_type="diag", - ) - gmm.fit(cluster_data) - cluster_labels = gmm.predict(cluster_data) if clustering_method == ClusteringMethod.VARIATIONAL: group_cluster_labels, agree_rates = _aggregate_and_sample(cluster_labels, child_group_lengths) @@ -271,25 +246,10 @@ def _pair_clustering_keep_id( average_agree_rate = np.mean(agree_rates) log(INFO, f"Average agree rate: {average_agree_rate}") - group_assignment = np.repeat(group_cluster_labels, child_group_lengths, axis=0).reshape((-1, 1)) - # obtain the child data with clustering + group_assignment = np.repeat(group_cluster_labels, child_group_lengths, axis=0).reshape((-1, 1)) sorted_child_data_with_cluster = np.concatenate([sorted_child_data, group_assignment], axis=1) - group_labels_list = group_cluster_labels - group_lengths_list = child_group_lengths.tolist() - - group_lengths_dict: dict[int, dict[int, int]] = {} - for i in range(len(group_labels_list)): - group_label = group_labels_list[i] - if group_label not in group_lengths_dict: - group_lengths_dict[group_label] = defaultdict(int) - group_lengths_dict[group_label][group_lengths_list[i]] += 1 - - group_lengths_prob_dicts: dict[int, dict[int, float]] = {} - for group_label, freq_dict in group_lengths_dict.items(): - group_lengths_prob_dicts[group_label] = _freq_to_prob(freq_dict) - # recover the preprocessed data back to dataframe relation_cluster_name = f"{parent_name}_{child_name}_cluster" child_df_with_cluster = pd.DataFrame( @@ -305,29 +265,21 @@ def _pair_clustering_keep_id( how="left", ) - parent_id_to_cluster: dict[Any, Any] = {} - for i in range(len(sorted_child_data)): - parent_id = sorted_child_data[i, foreign_key_index] - if parent_id in parent_id_to_cluster: - assert parent_id_to_cluster[parent_id] == sorted_child_data_with_cluster[i, -1] - else: - parent_id_to_cluster[parent_id] = sorted_child_data_with_cluster[i, -1] - - max_cluster_label = max(parent_id_to_cluster.values()) - - parent_data_clusters = [] - for i in range(len(parent_data)): - if parent_data[i, 
parent_primary_key_index] in parent_id_to_cluster: - parent_data_clusters.append(parent_id_to_cluster[parent_data[i, parent_primary_key_index]]) - else: - parent_data_clusters.append(max_cluster_label + 1) - + parent_data_clusters = _get_parent_data_clusters( + sorted_child_data, + sorted_child_data_with_cluster, + parent_data, + parent_primary_key_index, + foreign_key_index, + ) parent_data_clusters_np = np.array(parent_data_clusters).reshape(-1, 1) parent_data_with_cluster = np.concatenate([parent_data, parent_data_clusters_np], axis=1) parent_df_with_cluster = pd.DataFrame( parent_data_with_cluster, columns=all_parent_columns + [relation_cluster_name] ) + group_lengths_probabilities = _get_group_lengths_probabilities(group_cluster_labels, child_group_lengths) + new_col_entry = { "type": "discrete", "size": len(set(parent_data_clusters_np.flatten())), @@ -338,7 +290,7 @@ def _pair_clustering_keep_id( parent_domain_dict[relation_cluster_name] = new_col_entry.copy() child_domain_dict[relation_cluster_name] = new_col_entry.copy() - return parent_df_with_cluster, child_df_with_cluster, group_lengths_prob_dicts + return parent_df_with_cluster, child_df_with_cluster, group_lengths_probabilities def _repeat_parent_data( @@ -477,6 +429,97 @@ def _prepare_cluster_data( return np.concatenate((numerical_min_max, categorical_one_hot, key_scaled), axis=1) +def _get_cluster_labels( + cluster_data: np.ndarray, + clustering_method: ClusteringMethod, + num_clusters: int, +) -> np.ndarray: + num_clusters = min(num_clusters, len(cluster_data)) + + if clustering_method == ClusteringMethod.KMEANS: + kmeans = KMeans(n_clusters=num_clusters, n_init="auto", init="k-means++") + kmeans.fit(cluster_data) + cluster_labels = kmeans.labels_ + elif clustering_method == ClusteringMethod.KMEANS_AND_GMM: + gmm = GaussianMixture( + n_components=num_clusters, + verbose=1, + covariance_type="diag", + init_params="k-means++", + tol=0.0001, + ) + gmm.fit(cluster_data) + cluster_labels = gmm.predict(cluster_data) + elif clustering_method == ClusteringMethod.VARIATIONAL: + bgmm = BayesianGaussianMixture( + n_components=num_clusters, + verbose=1, + covariance_type="diag", + init_params="k-means++", + tol=0.0001, + ) + bgmm.fit(cluster_data) + cluster_labels = bgmm.predict_proba(cluster_data) + elif clustering_method == ClusteringMethod.GMM: + gmm = GaussianMixture( + n_components=num_clusters, + verbose=1, + covariance_type="diag", + ) + gmm.fit(cluster_data) + cluster_labels = gmm.predict(cluster_data) + + return cluster_labels + + +def _get_group_lengths_probabilities( + group_cluster_labels: list[int], + child_group_lengths: np.ndarray, +) -> dict[int, dict[int, float]]: + group_labels_list = group_cluster_labels + group_lengths_list = child_group_lengths.tolist() + + group_lengths_dict: dict[int, dict[int, int]] = {} + for i in range(len(group_labels_list)): + group_label = group_labels_list[i] + if group_label not in group_lengths_dict: + group_lengths_dict[group_label] = defaultdict(int) + group_lengths_dict[group_label][group_lengths_list[i]] += 1 + + group_lengths_probabilities: dict[int, dict[int, float]] = {} + for group_label, frequencies_dict in group_lengths_dict.items(): + group_lengths_probabilities[group_label] = _freq_to_prob(frequencies_dict) + + return group_lengths_probabilities + + +def _get_parent_data_clusters( + sorted_child_data: np.ndarray, + sorted_child_data_with_cluster: np.ndarray, + parent_data: np.ndarray, + parent_primary_key_index: int, + foreign_key_index: int, +) -> list[Any]: + 
parent_id_to_cluster: dict[Any, Any] = {} + for i in range(len(sorted_child_data)): + parent_id = sorted_child_data[i, foreign_key_index] + if parent_id in parent_id_to_cluster: + assert parent_id_to_cluster[parent_id] == sorted_child_data_with_cluster[i, -1] + else: + parent_id_to_cluster[parent_id] = sorted_child_data_with_cluster[i, -1] + + max_cluster_label = max(parent_id_to_cluster.values()) + + parent_data_clusters = [] + for i in range(len(parent_data)): + if parent_data[i, parent_primary_key_index] in parent_id_to_cluster: + parent_data_clusters.append(parent_id_to_cluster[parent_data[i, parent_primary_key_index]]) + else: + parent_data_clusters.append(max_cluster_label + 1) + + return parent_data_clusters + + def _get_categorical_and_numerical_columns( all_columns: list[str], domain_dictionary: dict[str, Any], From d90ed2c932e19e473dd386d1f4569d7ae14c055b Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 1 Oct 2025 18:04:22 -0400 Subject: [PATCH 21/40] Removing ignore --- src/midst_toolkit/models/clavaddpm/clustering.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 51d72526..2b73ced0 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -158,7 +158,6 @@ def _run_clustering( def _pair_clustering_keep_id( - # ruff: noqa: PLR0912, PLR0915 tables: Tables, child_name: str, parent_name: str, From 96414beab1beb2dd539c965035f545c2222318c5 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 1 Oct 2025 18:42:10 -0400 Subject: [PATCH 22/40] Finished refactoring --- .../models/clavaddpm/clustering.py | 161 ++++++++++++++---- 1 file changed, 127 insertions(+), 34 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 2b73ced0..7492dc6e 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -265,23 +265,24 @@ def _pair_clustering_keep_id( ) parent_data_clusters = _get_parent_data_clusters( - sorted_child_data, sorted_child_data_with_cluster, parent_data, parent_primary_key_index, foreign_key_index, ) - parent_data_clusters_np = np.array(parent_data_clusters).reshape(-1, 1) - parent_data_with_cluster = np.concatenate([parent_data, parent_data_clusters_np], axis=1) + parent_data_with_cluster = np.concatenate([parent_data, parent_data_clusters], axis=1) parent_df_with_cluster = pd.DataFrame( parent_data_with_cluster, columns=all_parent_columns + [relation_cluster_name] ) - group_lengths_probabilities = _get_group_lengths_probabilities(group_cluster_labels, child_group_lengths) + group_lengths_probabilities = _get_group_lengths_probabilities( + group_cluster_labels, + child_group_lengths.tolist(), + ) new_col_entry = { "type": "discrete", - "size": len(set(parent_data_clusters_np.flatten())), + "size": len(set(parent_data_clusters.flatten())), } log(INFO, f"Number of cluster centers: {new_col_entry['size']}") @@ -292,27 +293,40 @@ def _pair_clustering_keep_id( return parent_df_with_cluster, child_df_with_cluster, group_lengths_probabilities -def _repeat_parent_data( - sorted_child_data: np.ndarray, - sorted_parent_data: np.ndarray, +def _denormalize_parent_data( + child_data: np.ndarray, + parent_data: np.ndarray, parent_primary_key_index: int, foreign_key_index: int, ) -> np.ndarray: - child_group_data_dict = _get_group_data_dict(sorted_child_data, [foreign_key_index]) + """ 
+ Denormalize the parent data in relation to the child group data, + i.e. duplicate the parent data for each element of the child group data. + + Args: + child_data: Numpy array of the child data. + parent_data: Numpy array of the parent data. + parent_primary_key_index: Index of the parent primary key. + foreign_key_index: Index of the foreign key to the child data. + + Returns: + Numpy array of the parent data denormalized for each group of the child group data. + """ + child_group_data_dict = _get_group_data_dict(child_data, [foreign_key_index]) group_lengths = [] - unique_group_ids = sorted_parent_data[:, parent_primary_key_index] + unique_group_ids = parent_data[:, parent_primary_key_index] for group_id in unique_group_ids: - group_id = (group_id,) - if group_id not in child_group_data_dict: + group_id_tuple = (group_id,) + if group_id_tuple not in child_group_data_dict: group_lengths.append(0) else: - group_lengths.append(len(child_group_data_dict[group_id])) + group_lengths.append(len(child_group_data_dict[group_id_tuple])) group_lengths_np = np.array(group_lengths, dtype=int) - sorted_parent_data_repeated = np.repeat(sorted_parent_data, group_lengths_np, axis=0) - assert (sorted_parent_data_repeated[:, parent_primary_key_index] == sorted_child_data[:, foreign_key_index]).all() + denormalized_parent_data = np.repeat(parent_data, group_lengths_np, axis=0) + assert (denormalized_parent_data[:, parent_primary_key_index] == child_data[:, foreign_key_index]).all() - return sorted_parent_data_repeated + return denormalized_parent_data def _get_min_max_for_numerical_columns( @@ -320,6 +334,17 @@ def _get_min_max_for_numerical_columns( parent_numerical_data: np.ndarray, parent_scale: float, ) -> np.ndarray: + """ + Get the min-max values for the numerical columns in both the child and parent data. + + Args: + child_numerical_data: Numpy array of the child numerical data. + parent_numerical_data: Numpy array of the parent numerical data. + parent_scale: Scaling factor applied to the parent data. + + Returns: + Numpy array of the min-max values for the numerical columns. + """ joint_matrix = np.concatenate([child_numerical_data, parent_numerical_data], axis=1) matrix_p_index = child_numerical_data.shape[1] @@ -338,6 +363,17 @@ def _one_hot_encode_categorical_columns( parent_categorical_data: np.ndarray, parent_scale: float, ) -> np.ndarray | None: + """ + One-hot encode the categorical columns in both the child and parent data. + + Args: + child_categorical_data: Numpy array of the child categorical data. + parent_categorical_data: Numpy array of the parent categorical data. + parent_scale: Scaling factor applied to the parent data. + + Returns: + Numpy array of the one-hot encoded categorical columns. + """ joint_matrix = np.concatenate([child_categorical_data, parent_categorical_data], axis=1) if joint_matrix.shape[1] == 0: return None @@ -381,10 +417,33 @@ def _prepare_cluster_data( parent_scale: float, key_scale: float, ) -> np.ndarray: + """ + Prepare the data for the clustering algorithm, which comprises of denormalizing the parent data, + splitting the data into categorical and numerical columns, and normalizing the data. + + Args: + child_data: Numpy array of the child data. + parent_data: Numpy array of the parent data. + child_domain_dict: Dictionary of the domain of the child table. The domain dictionary + holds metadata about the columns of each one of the tables. + parent_domain_dict: Dictionary of the domain of the parent table. 
The domain dictionary + holds metadata about the columns of each one of the tables. + all_child_columns: List of all child columns. + all_parent_columns: List of all parent columns. + parent_primary_key: Name of the parent primary key. + parent_scale: Scaling factor applied to the parent table, provided by the config. + It will be applied to the features to weight their importance during clustering. + key_scale: Scaling factor applied to the foreign key values that link + the child table to the parent table. This will weight how much influence + the parent-child relationship has in the clustering algorithm. + + Returns: + Numpy array of the data prepared for the clustering algorithm. + """ parent_primary_key_index = all_parent_columns.index(parent_primary_key) foreign_key_index = all_child_columns.index(parent_primary_key) - parent_data_repeated = _repeat_parent_data( + denormalized_parent_data = _denormalize_parent_data( child_data, parent_data, parent_primary_key_index, @@ -404,8 +463,8 @@ def _prepare_cluster_data( child_numerical_data = child_data[:, child_numerical_columns] child_categorical_data = child_data[:, child_categorical_columns] - parent_numerical_data = parent_data_repeated[:, parent_numerical_columns] - parent_categorical_data = parent_data_repeated[:, parent_categorical_columns] + parent_numerical_data = denormalized_parent_data[:, parent_numerical_columns] + parent_categorical_data = denormalized_parent_data[:, parent_categorical_columns] numerical_min_max = _get_min_max_for_numerical_columns( child_numerical_data, @@ -419,7 +478,7 @@ def _prepare_cluster_data( parent_scale, ) - key_min_max = _min_max_normalize_sklearn(parent_data_repeated[:, parent_primary_key_index].reshape(-1, 1)) + key_min_max = _min_max_normalize_sklearn(denormalized_parent_data[:, parent_primary_key_index].reshape(-1, 1)) key_scaled = key_scale * key_min_max if categorical_one_hot is None: @@ -433,6 +492,20 @@ def _get_cluster_labels( clustering_method: ClusteringMethod, num_clusters: int, ) -> np.ndarray: + """ + Get the cluster labels from the clustering algorithm chosen by the given clustering method. + The cluster labels are obtained by fitting the clustering algorithm to the data prepared + for the clustering algorithm. + + Args: + cluster_data: Numpy array of the data prepared for the clustering algorithm. + clustering_method: The clustering method to use. + num_clusters: Number of clusters. If the number of clusters is greater than the + number of data points, the number of clusters will be set to the number of data points. + + Returns: + Numpy array of the cluster labels for the data. + """ num_clusters = min(num_clusters, len(cluster_data)) if clustering_method == ClusteringMethod.KMEANS: @@ -473,17 +546,26 @@ def _get_cluster_labels( def _get_group_lengths_probabilities( group_cluster_labels: list[int], - child_group_lengths: np.ndarray, + child_group_lengths: list[int], ) -> dict[int, dict[int, float]]: - group_labels_list = group_cluster_labels - group_lengths_list = child_group_lengths.tolist() + """ + Calculate the group lengths probabilities from the frequency in which the child group lengths + appear for each of the group cluster labels. + + Args: + group_cluster_labels: List of the group cluster labels. + child_group_lengths: List of the child group lengths. + Returns: + Dictionary of the group lengths probabilities. + The keys are the group cluster labels and the values are the probabilities of the group lengths. 
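        As an illustrative, made-up example:

            >>> _get_group_lengths_probabilities([0, 0, 1], [2, 4, 3])
            {0: {2: 0.5, 4: 0.5}, 1: {3: 1.0}}

        i.e. the groups labelled 0 were seen once with length 2 and once with length 4,
        while the single group labelled 1 had length 3.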
+ """ group_lengths_dict: dict[int, dict[int, int]] = {} - for i in range(len(group_labels_list)): - group_label = group_labels_list[i] + for i in range(len(group_cluster_labels)): + group_label = group_cluster_labels[i] if group_label not in group_lengths_dict: group_lengths_dict[group_label] = defaultdict(int) - group_lengths_dict[group_label][group_lengths_list[i]] += 1 + group_lengths_dict[group_label][child_group_lengths[i]] += 1 group_lengths_probabilities: dict[int, dict[int, float]] = {} for group_label, frequencies_dict in group_lengths_dict.items(): @@ -493,19 +575,30 @@ def _get_group_lengths_probabilities( def _get_parent_data_clusters( - sorted_child_data: np.ndarray, - sorted_child_data_with_cluster: np.ndarray, + child_data_with_cluster: np.ndarray, parent_data: np.ndarray, parent_primary_key_index: int, foreign_key_index: int, -) -> list[Any]: +) -> np.ndarray: + """ + Get the parent data clusters from the child data with cluster and the parent data. + + Args: + child_data_with_cluster: Numpy array of the child data with cluster information. + parent_data: Numpy array of the parent data. + parent_primary_key_index: Index of the parent primary key. + foreign_key_index: Index of the foreign key to the child data. + + Returns: + Numpy array of the parent data clusters. + """ parent_id_to_cluster: dict[Any, Any] = {} - for i in range(len(sorted_child_data)): - parent_id = sorted_child_data[i, foreign_key_index] + for i in range(len(child_data_with_cluster)): + parent_id = child_data_with_cluster[i, foreign_key_index] if parent_id in parent_id_to_cluster: - assert parent_id_to_cluster[parent_id] == sorted_child_data_with_cluster[i, -1] + assert parent_id_to_cluster[parent_id] == child_data_with_cluster[i, -1] else: - parent_id_to_cluster[parent_id] = sorted_child_data_with_cluster[i, -1] + parent_id_to_cluster[parent_id] = child_data_with_cluster[i, -1] max_cluster_label = max(parent_id_to_cluster.values()) @@ -516,7 +609,7 @@ def _get_parent_data_clusters( else: parent_data_clusters.append(max_cluster_label + 1) - return parent_data_clusters + return np.array(parent_data_clusters).reshape(-1, 1) def _get_categorical_and_numerical_columns( From 594d9cd5f47fb04cf89f30dd51f58abbb9280717 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 11:32:07 -0400 Subject: [PATCH 23/40] Renamings, mostly --- .../models/clavaddpm/clustering.py | 50 +-- .../models/clavaddpm/data_loaders.py | 345 +++++++++--------- src/midst_toolkit/models/clavaddpm/model.py | 8 +- src/midst_toolkit/models/clavaddpm/train.py | 6 +- 4 files changed, 211 insertions(+), 198 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 7492dc6e..9fd33122 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -141,7 +141,7 @@ def _run_clustering( parent_df_with_cluster, child_df_with_cluster, group_lengths_prob_dicts, - ) = _pair_clustering_keep_id( + ) = _pair_clustering( tables, child, parent, @@ -157,7 +157,7 @@ def _run_clustering( return tables, all_group_lengths_prob_dicts -def _pair_clustering_keep_id( +def _pair_clustering( tables: Tables, child_name: str, parent_name: str, @@ -199,8 +199,8 @@ def _pair_clustering_keep_id( child_df = tables[child_name]["df"] parent_df = tables[parent_name]["df"] # The domain dictionary holds metadata about the columns of each one of the tables. 
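# For orientation, a minimal sketch of the `tables` layout consumed here; the table
# and column names are hypothetical, and only the "df"/"domain" keys plus the
# "type"/"size" fields mirror how this module reads them.
import pandas as pd

example_tables = {
    "account": {
        "df": pd.DataFrame({"account_id": [10, 11], "frequency": ["monthly", "weekly"]}),
        "domain": {"frequency": {"type": "discrete", "size": 2}},
    },
    "trans": {
        # The child table carries its own primary key plus the parent key as a foreign key.
        "df": pd.DataFrame({"trans_id": [0, 1, 2], "account_id": [10, 10, 11], "amount": [5.0, 7.5, 1.2]}),
        "domain": {"amount": {"type": "continuous"}},
    },
}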
- child_domain_dict = tables[child_name]["domain"] - parent_domain_dict = tables[parent_name]["domain"] + child_domain = tables[child_name]["domain"] + parent_domain = tables[parent_name]["domain"] child_primary_key = f"{child_name}_id" parent_primary_key = f"{parent_name}_id" all_child_columns = list(child_df.columns) @@ -220,8 +220,8 @@ def _pair_clustering_keep_id( cluster_data = _prepare_cluster_data( sorted_child_data, sorted_parent_data, - child_domain_dict, - parent_domain_dict, + child_domain, + parent_domain, all_child_columns, all_parent_columns, parent_primary_key, @@ -287,8 +287,8 @@ def _pair_clustering_keep_id( log(INFO, f"Number of cluster centers: {new_col_entry['size']}") - parent_domain_dict[relation_cluster_name] = new_col_entry.copy() - child_domain_dict[relation_cluster_name] = new_col_entry.copy() + parent_domain[relation_cluster_name] = new_col_entry.copy() + child_domain[relation_cluster_name] = new_col_entry.copy() return parent_df_with_cluster, child_df_with_cluster, group_lengths_probabilities @@ -409,8 +409,8 @@ def _one_hot_encode_categorical_columns( def _prepare_cluster_data( child_data: np.ndarray, parent_data: np.ndarray, - child_domain_dict: dict[str, Any], - parent_domain_dict: dict[str, Any], + child_domain: dict[str, Any], + parent_domain: dict[str, Any], all_child_columns: list[str], all_parent_columns: list[str], parent_primary_key: str, @@ -424,9 +424,9 @@ def _prepare_cluster_data( Args: child_data: Numpy array of the child data. parent_data: Numpy array of the parent data. - child_domain_dict: Dictionary of the domain of the child table. The domain dictionary + child_domain: Dictionary of the domain of the child table. The domain dictionary holds metadata about the columns of each one of the tables. - parent_domain_dict: Dictionary of the domain of the parent table. The domain dictionary + parent_domain: Dictionary of the domain of the parent table. The domain dictionary holds metadata about the columns of each one of the tables. all_child_columns: List of all child columns. all_parent_columns: List of all parent columns. @@ -454,11 +454,11 @@ def _prepare_cluster_data( # Columns that are not in the domain dictionary are ignored (except for the primary and foreign keys). child_numerical_columns, child_categorical_columns = _get_categorical_and_numerical_columns( all_child_columns, - child_domain_dict, + child_domain, ) parent_numerical_columns, parent_categorical_columns = _get_categorical_and_numerical_columns( all_parent_columns, - parent_domain_dict, + parent_domain, ) child_numerical_data = child_data[:, child_numerical_columns] @@ -569,7 +569,7 @@ def _get_group_lengths_probabilities( group_lengths_probabilities: dict[int, dict[int, float]] = {} for group_label, frequencies_dict in group_lengths_dict.items(): - group_lengths_probabilities[group_label] = _freq_to_prob(frequencies_dict) + group_lengths_probabilities[group_label] = _frequency_to_probability(frequencies_dict) return group_lengths_probabilities @@ -614,14 +614,14 @@ def _get_parent_data_clusters( def _get_categorical_and_numerical_columns( all_columns: list[str], - domain_dictionary: dict[str, Any], + tables_domain: dict[str, Any], ) -> tuple[list[int], list[int]]: """ Return the list of numerical and categorical column indices from the domain dictionary. Args: all_columns: List of all columns. - domain_dictionary: Dictionary of the domain. + tables_domain: Dictionary of the tables' domain. 
Returns: Tuple with two lists of indices, one for the numerical columns and one for the categorical columns. @@ -630,8 +630,8 @@ def _get_categorical_and_numerical_columns( categorical_columns = [] for col_index, col in enumerate(all_columns): - if col in domain_dictionary: - if domain_dictionary[col]["type"] == "discrete": + if col in tables_domain: + if tables_domain[col]["type"] == "discrete": categorical_columns.append(col_index) else: numerical_columns.append(col_index) @@ -833,17 +833,17 @@ def _get_group_cluster_labels_through_voting( return group_cluster_labels, agree_rates -def _freq_to_prob(freq_dict: dict[int, int]) -> dict[int, float]: +def _frequency_to_probability(frequencies: dict[int, int]) -> dict[int, float]: """ Convert a frequency dictionary to a probability dictionary. Args: - freq_dict: Dictionary of frequencies. + frequencies: Dictionary of frequencies. Returns: Dictionary of probabilities. """ - prob_dict: dict[Any, float] = {} - for key, freq in freq_dict.items(): - prob_dict[key] = freq / sum(list(freq_dict.values())) - return prob_dict + probabilities: dict[Any, float] = {} + for key, freq in frequencies.items(): + probabilities[key] = freq / sum(list(frequencies.values())) + return probabilities diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index a1b35692..b2eaf3df 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -16,7 +16,9 @@ def load_multi_table( - data_dir: Path, verbose: bool = True + data_dir: Path, + verbose: bool = True, + training_data_ratio: float = 1, ) -> tuple[dict[str, Any], list[tuple[str, str]], dict[str, Any]]: """ Load the multi-table dataset from the data directory. @@ -24,6 +26,8 @@ def load_multi_table( Args: data_dir: The directory to load the dataset from. verbose: Whether to print verbose output. Optional, default is True. + training_data_ratio: The ratio of the data to be used for training. Should be between 0 and 1. + If it's == 1, it will only return the training set. Optional, default is 1. Returns: A tuple with 3 values: @@ -58,11 +62,11 @@ def load_multi_table( id_cols = [col for col in tables[table]["df"].columns if "_id" in col] df_no_id = tables[table]["df"].drop(columns=id_cols) info = get_info_from_domain(df_no_id, tables[table]["domain"]) - _, info = pipeline_process_data( - name=table, - data_df=df_no_id, + _, info = process_pipeline_data( + table_name=table, + data=df_no_id, info=info, - ratio=1, + training_data_ratio=training_data_ratio, save=False, verbose=verbose, ) @@ -71,13 +75,13 @@ def load_multi_table( return tables, relation_order, dataset_meta -def get_info_from_domain(data_df: pd.DataFrame, domain_dict: dict[str, Any]) -> dict[str, Any]: +def get_info_from_domain(data: pd.DataFrame, tables_domain: dict[str, Any]) -> dict[str, Any]: """ - Get the information dictionary from the domain dictionary. + Get the information dictionary from the table domain dictionary. Args: - data_df: The dataframe containing the data. - domain_dict: The domain dictionary containing metadata about the data columns. + data: The dataframe containing the data. + tables_domain: The tables' domain dictionary containing metadata about the data columns. 
Returns: The information dictionary containing the following keys: @@ -90,9 +94,9 @@ def get_info_from_domain(data_df: pd.DataFrame, domain_dict: dict[str, Any]) -> info: dict[str, Any] = {} info["num_col_idx"] = [] info["cat_col_idx"] = [] - columns = data_df.columns.tolist() + columns = data.columns.tolist() for i in range(len(columns)): - if domain_dict[columns[i]]["type"] == "discrete": + if tables_domain[columns[i]]["type"] == "discrete": info["cat_col_idx"].append(i) else: info["num_col_idx"].append(i) @@ -104,12 +108,12 @@ def get_info_from_domain(data_df: pd.DataFrame, domain_dict: dict[str, Any]) -> return info -def pipeline_process_data( +def process_pipeline_data( # ruff: noqa: PLR0915, PLR0912 - name: str, - data_df: pd.DataFrame, + table_name: str, + data: pd.DataFrame, info: dict[str, Any], - ratio: float = 0.9, + training_data_ratio: float = 0.9, save: bool = False, verbose: bool = True, ) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]: @@ -122,10 +126,10 @@ def pipeline_process_data( metadata. Args: - name: The name of the table. Used to name the files when saving the data. - data_df: The dataframe containing the data. + table_name: The name of the table. Used to name the files when saving the data. + data: The dataframe containing the data. info: The information dictionary, retrieved from the get_info_from_domain function. - ratio: The ratio of the data to be used for training. Should be between 0 and 1. + training_data_ratio: The ratio of the data to be used for training. Should be between 0 and 1. If it's == 1, it will only return the training set. Optional, default is 0.9. save: Whether to save the data. Optional, default is False. verbose: Whether to print verbose output. Optional, default is True. @@ -145,168 +149,173 @@ def pipeline_process_data( - "y_test": The target data for the test set. It will be absent if ratio == 1. - The information dictionary with updated values. """ - assert 0 < ratio <= 1, "Ratio must be between 0 and 1." - if ratio == 1: - log(INFO, "Ratio is 1, so the data will not be split into training and test sets.") + assert 0 < training_data_ratio <= 1, "Training data ratio must be between 0 and 1." 
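# A small numeric illustration of the split computed below, using arbitrary values;
# the variable names exist only for this sketch.
example_rows = 1000
example_train_rows = int(example_rows * 0.9)            # 900, as num_train_samples below
example_test_rows = example_rows - example_train_rows   # 100, as num_test_samples below
# With training_data_ratio == 1, the test split is skipped entirely.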
+ if training_data_ratio == 1: + log(INFO, "Training data ratio is 1, so the data will not be split into training and test sets.") - num_data = data_df.shape[0] + num_samples = data.shape[0] - column_names = info["column_names"] if info["column_names"] else data_df.columns.tolist() + column_names = info["column_names"] if info["column_names"] else data.columns.tolist() - num_col_idx = info["num_col_idx"] - cat_col_idx = info["cat_col_idx"] - target_col_idx = info["target_col_idx"] + numerical_column_indices = info["num_col_idx"] + categorical_column_indices = info["cat_col_idx"] + target_columns_indices = info["target_col_idx"] - idx_mapping, inverse_idx_mapping, idx_name_mapping = get_column_name_mapping( - data_df, num_col_idx, cat_col_idx, column_names + index_mapping, inverse_index_mapping, index_to_name_mapping = get_column_name_mapping( + data, + numerical_column_indices, + categorical_column_indices, + column_names, ) - num_columns = [column_names[i] for i in num_col_idx] - cat_columns = [column_names[i] for i in cat_col_idx] - target_columns = [column_names[i] for i in target_col_idx] + numerical_columns = [column_names[i] for i in numerical_column_indices] + categorical_columns = [column_names[i] for i in categorical_column_indices] + target_columns = [column_names[i] for i in target_columns_indices] - # Train/ Test Split, 90% Training, 10% Testing (Validation set will be selected from Training set) - num_train = int(num_data * ratio) - num_test = num_data - num_train + # Train/ Test Split: + # num_train_samples% for Training, (1 - num_test_samples)% for Testing + # Validation set will be selected from Training set + num_train_samples = int(num_samples * training_data_ratio) + num_test_samples = num_samples - num_train_samples - test_df: pd.DataFrame | None = None + test_data: pd.DataFrame | None = None - if ratio < 1: - train_df, test_df, seed = train_test_split(data_df, cat_columns, num_train, num_test) + if training_data_ratio < 1: + train_data, test_data, _ = train_test_split(data, categorical_columns, num_train_samples, num_test_samples) else: - train_df = data_df.copy() + train_data = data.copy() - train_df.columns = list(range(len(train_df.columns))) + train_data.columns = list(range(len(train_data.columns))) - if ratio < 1: - assert test_df is not None - test_df.columns = list(range(len(test_df.columns))) + if training_data_ratio < 1: + assert test_data is not None + test_data.columns = list(range(len(test_data.columns))) - col_info: dict[Any, Any] = {} + columns_info: dict[Any, Any] = {} - for col_idx in num_col_idx: - col_info[col_idx] = {} - col_info["type"] = "numerical" - col_info["max"] = float(train_df[col_idx].max()) - col_info["min"] = float(train_df[col_idx].min()) + for column in numerical_column_indices: + columns_info[column] = {} + columns_info["type"] = "numerical" + columns_info["max"] = float(train_data[column].max()) + columns_info["min"] = float(train_data[column].min()) - for col_idx in cat_col_idx: - col_info[col_idx] = {} - col_info["type"] = "categorical" - col_info["categorizes"] = list(set(train_df[col_idx])) + for column in categorical_column_indices: + columns_info[column] = {} + columns_info["type"] = "categorical" + columns_info["categorizes"] = list(set(train_data[column])) - for col_idx in target_col_idx: + for column in target_columns_indices: if info["task_type"] == "regression": - col_info[col_idx] = {} - col_info["type"] = "numerical" - col_info["max"] = float(train_df[col_idx].max()) - col_info["min"] = float(train_df[col_idx].min()) + 
columns_info[column] = {} + columns_info["type"] = "numerical" + columns_info["max"] = float(train_data[column].max()) + columns_info["min"] = float(train_data[column].min()) else: - col_info[col_idx] = {} - col_info["type"] = "categorical" - col_info["categorizes"] = list(set(train_df[col_idx])) + columns_info[column] = {} + columns_info["type"] = "categorical" + columns_info["categorizes"] = list(set(train_data[column])) - info["column_info"] = col_info + info["column_info"] = columns_info - train_df.rename(columns=idx_name_mapping, inplace=True) - if ratio < 1: - assert test_df is not None - test_df.rename(columns=idx_name_mapping, inplace=True) + train_data.rename(columns=index_to_name_mapping, inplace=True) + if training_data_ratio < 1: + assert test_data is not None + test_data.rename(columns=index_to_name_mapping, inplace=True) - for col in num_columns: - train_df.loc[train_df[col] == "?", col] = np.nan - for col in cat_columns: - train_df.loc[train_df[col] == "?", col] = "nan" + for col in numerical_columns: + train_data.loc[train_data[col] == "?", col] = np.nan + for col in categorical_columns: + train_data.loc[train_data[col] == "?", col] = "nan" - if ratio < 1: - assert test_df is not None - for col in num_columns: - test_df.loc[test_df[col] == "?", col] = np.nan - for col in cat_columns: - test_df.loc[test_df[col] == "?", col] = "nan" + if training_data_ratio < 1: + assert test_data is not None + for col in numerical_columns: + test_data.loc[test_data[col] == "?", col] = np.nan + for col in categorical_columns: + test_data.loc[test_data[col] == "?", col] = "nan" - x_num_train = train_df[num_columns].to_numpy().astype(np.float32) - x_cat_train = train_df[cat_columns].to_numpy() - y_train = train_df[target_columns].to_numpy() + x_num_train = train_data[numerical_columns].to_numpy().astype(np.float32) + x_cat_train = train_data[categorical_columns].to_numpy() + y_train = train_data[target_columns].to_numpy() x_num_test: np.ndarray | None = None x_cat_test: np.ndarray | None = None y_test: np.ndarray | None = None - if ratio < 1: - assert test_df is not None - x_num_test = test_df[num_columns].to_numpy().astype(np.float32) - x_cat_test = test_df[cat_columns].to_numpy() - y_test = test_df[target_columns].to_numpy() + if training_data_ratio < 1: + assert test_data is not None + x_num_test = test_data[numerical_columns].to_numpy().astype(np.float32) + x_cat_test = test_data[categorical_columns].to_numpy() + y_test = test_data[target_columns].to_numpy() if save: - save_dir = f"data/{name}" + save_dir = f"data/{table_name}" np.save(f"{save_dir}/x_num_train.npy", x_num_train) np.save(f"{save_dir}/x_cat_train.npy", x_cat_train) np.save(f"{save_dir}/y_train.npy", y_train) - if ratio < 1: + if training_data_ratio < 1: assert x_num_test is not None and x_cat_test is not None and y_test is not None np.save(f"{save_dir}/x_num_test.npy", x_num_test) np.save(f"{save_dir}/x_cat_test.npy", x_cat_test) np.save(f"{save_dir}/y_test.npy", y_test) - train_df[num_columns] = train_df[num_columns].astype(np.float32) + train_data[numerical_columns] = train_data[numerical_columns].astype(np.float32) - if ratio < 1: - assert test_df is not None - test_df[num_columns] = test_df[num_columns].astype(np.float32) + if training_data_ratio < 1: + assert test_data is not None + test_data[numerical_columns] = test_data[numerical_columns].astype(np.float32) if save: - train_df.to_csv(f"{save_dir}/train.csv", index=False) + train_data.to_csv(f"{save_dir}/train.csv", index=False) - if ratio < 1: - assert test_df is not 
None - test_df.to_csv(f"{save_dir}/test.csv", index=False) + if training_data_ratio < 1: + assert test_data is not None + test_data.to_csv(f"{save_dir}/test.csv", index=False) - if not os.path.exists(f"synthetic/{name}"): - os.makedirs(f"synthetic/{name}") + if not os.path.exists(f"synthetic/{table_name}"): + os.makedirs(f"synthetic/{table_name}") - train_df.to_csv(f"synthetic/{name}/real.csv", index=False) + train_data.to_csv(f"synthetic/{table_name}/real.csv", index=False) - if ratio < 1: - assert test_df is not None - test_df.to_csv(f"synthetic/{name}/test.csv", index=False) + if training_data_ratio < 1: + assert test_data is not None + test_data.to_csv(f"synthetic/{table_name}/test.csv", index=False) info["column_names"] = column_names - info["train_num"] = train_df.shape[0] + info["train_num"] = train_data.shape[0] - if ratio < 1: - assert test_df is not None - info["test_num"] = test_df.shape[0] + if training_data_ratio < 1: + assert test_data is not None + info["test_num"] = test_data.shape[0] - info["idx_mapping"] = idx_mapping - info["inverse_idx_mapping"] = inverse_idx_mapping - info["idx_name_mapping"] = idx_name_mapping + info["idx_mapping"] = index_mapping + info["inverse_idx_mapping"] = inverse_index_mapping + info["idx_name_mapping"] = index_to_name_mapping metadata: dict[str, Any] = {"columns": {}} task_type = info["task_type"] - num_col_idx = info["num_col_idx"] - cat_col_idx = info["cat_col_idx"] - target_col_idx = info["target_col_idx"] + numerical_column_indices = info["num_col_idx"] + categorical_column_indices = info["cat_col_idx"] + target_columns_indices = info["target_col_idx"] - for i in num_col_idx: + for i in numerical_column_indices: metadata["columns"][i] = {} metadata["columns"][i]["sdtype"] = "numerical" metadata["columns"][i]["computer_representation"] = "Float" - for i in cat_col_idx: + for i in categorical_column_indices: metadata["columns"][i] = {} metadata["columns"][i]["sdtype"] = "categorical" if task_type == "regression": - for i in target_col_idx: + for i in target_columns_indices: metadata["columns"][i] = {} metadata["columns"][i]["sdtype"] = "numerical" metadata["columns"][i]["computer_representation"] = "Float" else: - for i in target_col_idx: + for i in target_columns_indices: metadata["columns"][i] = {} metadata["columns"][i]["sdtype"] = "categorical" @@ -317,19 +326,19 @@ def pipeline_process_data( json.dump(info, file, indent=4) if verbose: - if ratio < 1: - assert test_df is not None - str_shape = f"Train dataframe shape: {train_df.shape}, Test dataframe shape: {test_df.shape}, Total dataframe shape: {data_df.shape}" + if training_data_ratio < 1: + assert test_data is not None + str_shape = f"Train dataframe shape: {train_data.shape}, Test dataframe shape: {test_data.shape}, Total dataframe shape: {data.shape}" else: - str_shape = f"Table name: {name}, Total dataframe shape: {data_df.shape}" + str_shape = f"Table name: {table_name}, Total dataframe shape: {data.shape}" str_shape += f", Numerical data shape: {x_num_train.shape}" str_shape += f", Categorical data shape: {x_cat_train.shape}" log(INFO, str_shape) - data: dict[str, dict[str, Any]] = { + output_data: dict[str, dict[str, Any]] = { "df": { - DataSplit.TRAIN.value: train_df, + DataSplit.TRAIN.value: train_data, }, "numpy": { "x_num_train": x_num_train, @@ -338,20 +347,20 @@ def pipeline_process_data( }, } - if ratio < 1: - assert test_df is not None and x_num_test is not None and x_cat_test is not None and y_test is not None - data["df"][DataSplit.TEST.value] = test_df - 
data["numpy"]["x_num_test"] = x_num_test - data["numpy"]["x_cat_test"] = x_cat_test - data["numpy"]["y_test"] = y_test + if training_data_ratio < 1: + assert test_data is not None and x_num_test is not None and x_cat_test is not None and y_test is not None + output_data["df"][DataSplit.TEST.value] = test_data + output_data["numpy"]["x_num_test"] = x_num_test + output_data["numpy"]["x_cat_test"] = x_cat_test + output_data["numpy"]["y_test"] = y_test - return data, info + return output_data, info def get_column_name_mapping( - data_df: pd.DataFrame, - num_col_idx: list[int], - cat_col_idx: list[int], + data: pd.DataFrame, + numerical_columns_indices: list[int], + categorical_column_indices: list[int], column_names: list[str] | None = None, ) -> tuple[dict[int, int], dict[int, int], dict[int, str]]: """ @@ -359,64 +368,68 @@ def get_column_name_mapping( Will produce 3 mappings: - The mapping of the categorical and numerical columns from their original indices - in the dataframe to their indices in the num_col_idx and cat_col_idx lists. + in the dataframe to their indices in the numerical_columns_indices and + categorical_column_indices lists. - The inverse mapping of the above, i.e. the mapping from their indices in the - num_col_idx and cat_col_idx lists to their original indices in the dataframe. + numerical_columns_indices and categorical_column_indices lists to their original + indices in the dataframe. - The mapping of the indices in the original dataframe to the column names for all columns. Args: - data_df: The dataframe containing the data. - num_col_idx: The indices of the numerical columns. - cat_col_idx: The indices of the categorical columns. + data: The dataframe containing the data. + numerical_columns_indices: The indices of the numerical columns. + categorical_column_indices: The indices of the categorical columns. column_names: The names of the columns. Optional, default is None. If None, it will use the columns of the dataframe. Returns: A tuple with 3 values: - The mapping of the categorical and numerical columns from their original indices - in the dataframe to their indices in the num_col_idx and cat_col_idx lists. + in the dataframe to their indices in the numerical_columns_indices and + categorical_column_indices lists. - The inverse mapping of the above, i.e. the mapping from their indices in the - num_col_idx and cat_col_idx lists to their original indices in the dataframe. + numerical_columns_indices and categorical_column_indices lists to their original + indices in the dataframe. - The mapping of the indices in the original dataframe to the column names for all columns. 
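        As an illustrative example with made-up columns ["age", "gender", "income",
        "label"], numerical indices [0, 2], categorical indices [1] and index 3 left
        as target: the first mapping is {0: 0, 1: 2, 2: 1, 3: 3} (numerical columns
        first, then categorical, then target), the second is its inverse
        {0: 0, 2: 1, 1: 2, 3: 3}, and the third is
        {0: "age", 1: "gender", 2: "income", 3: "label"}.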
""" if not column_names: - column_names = data_df.columns.tolist() + column_names = data.columns.tolist() - idx_mapping = {} + index_mapping = {} curr_num_idx = 0 - curr_cat_idx = len(num_col_idx) - curr_target_idx = curr_cat_idx + len(cat_col_idx) + curr_cat_idx = len(numerical_columns_indices) + curr_target_idx = curr_cat_idx + len(categorical_column_indices) for idx in range(len(column_names)): - if idx in num_col_idx: - idx_mapping[idx] = curr_num_idx + if idx in numerical_columns_indices: + index_mapping[idx] = curr_num_idx curr_num_idx += 1 - elif idx in cat_col_idx: - idx_mapping[idx] = curr_cat_idx + elif idx in categorical_column_indices: + index_mapping[idx] = curr_cat_idx curr_cat_idx += 1 else: - idx_mapping[idx] = curr_target_idx + index_mapping[idx] = curr_target_idx curr_target_idx += 1 - inverse_idx_mapping = {} - for k, v in idx_mapping.items(): - inverse_idx_mapping[v] = k + inverse_index_mapping = {} + for k, v in index_mapping.items(): + inverse_index_mapping[v] = k - idx_name_mapping = {} + index_to_name_mapping = {} for i in range(len(column_names)): - idx_name_mapping[i] = column_names[i] + index_to_name_mapping[i] = column_names[i] - return idx_mapping, inverse_idx_mapping, idx_name_mapping + return index_mapping, inverse_index_mapping, index_to_name_mapping # TODO: refactor this function so it doesn't run the risk of running indefinitely. def train_test_split( - data_df: pd.DataFrame, - cat_columns: list[str], - num_train: int = 0, - num_test: int = 0, + data: pd.DataFrame, + categorical_columns: list[str], + num_train_samples: int = 0, + num_test_samples: int = 0, ) -> tuple[pd.DataFrame, pd.DataFrame, int]: """ Split the data into training and test sets. @@ -425,10 +438,10 @@ def train_test_split( columns represented. Args: - data_df: The dataframe containing the data. - cat_columns: The names of the categorical columns. - num_train: The number of rows in the training set. Optional, default is 0. - num_test: The number of rows in the test set. Optional, default is 0. + data: The dataframe containing the data. + categorical_columns: The names of the categorical columns. + num_train_samples: The number of rows in the training set. Optional, default is 0. + num_test_samples: The number of rows in the test set. Optional, default is 0. Returns: A tuple with 3 values: @@ -436,7 +449,7 @@ def train_test_split( - The test dataframe. - The seed used by the random number generator to generate the split. 
""" - total_num = data_df.shape[0] + total_num = data.shape[0] idx = np.arange(total_num) seed = 1234 @@ -445,15 +458,15 @@ def train_test_split( np.random.seed(seed) np.random.shuffle(idx) - train_idx = idx[:num_train] - test_idx = idx[-num_test:] + train_idx = idx[:num_train_samples] + test_idx = idx[-num_test_samples:] - train_df = data_df.loc[train_idx] - test_df = data_df.loc[test_idx] + train_df = data.loc[train_idx] + test_df = data.loc[test_idx] flag = 0 - for i in cat_columns: - if len(set(train_df[i])) != len(set(data_df[i])): + for i in categorical_columns: + if len(set(train_df[i])) != len(set(data[i])): flag = 1 break diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index 2daa2ed2..256e766a 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -114,13 +114,13 @@ def forward(self, x: Tensor, timesteps: Tensor) -> Tensor: return self.model(x) -def get_table_info(df: pd.DataFrame, domain_dict: dict[str, Any], y_col: str) -> dict[str, Any]: +def get_table_info(df: pd.DataFrame, tables_domain: dict[str, Any], y_col: str) -> dict[str, Any]: """ Get the dictionary of table information. Args: df: The dataframe containing the data. - domain_dict: The domain dictionary of metadata about the data columns. + tables_domain: The tables' domain dictionary of metadata about the data columns. y_col: The name of the target column. Returns: @@ -136,8 +136,8 @@ def get_table_info(df: pd.DataFrame, domain_dict: dict[str, Any], y_col: str) -> cat_cols = [] num_cols = [] for col in df.columns: - if col in domain_dict and col != y_col: - if domain_dict[col]["type"] == "discrete": + if col in tables_domain and col != y_col: + if tables_domain[col]["type"] == "discrete": cat_cols.append(col) else: num_cols.append(col) diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 2d88d287..0f7b3fea 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -140,7 +140,7 @@ def clava_training( def child_training( child_df_with_cluster: pd.DataFrame, - child_domain_dict: dict[str, Any], + child_domain: dict[str, Any], parent_name: str | None, child_name: str, diffusion_config: Configs, @@ -152,7 +152,7 @@ def child_training( Args: child_df_with_cluster: DataFrame with the cluster column. - child_domain_dict: Dictionary of the child table domain. It should contain size and type for each + child_domain: Dictionary of the child table domain. It should contain size and type for each column of the table. 
For example: { "frequency": {"size": 3, "type": "discrete"}, @@ -198,7 +198,7 @@ def child_training( child_df_with_cluster["placeholder"] = list(range(len(child_df_with_cluster))) else: y_col = f"{parent_name}_{child_name}_cluster" - child_info = get_table_info(child_df_with_cluster, child_domain_dict, y_col) + child_info = get_table_info(child_df_with_cluster, child_domain, y_col) child_model_params = ModelParameters( diffusion_parameters=DiffusionParameters( d_layers=diffusion_config["d_layers"], From 3a2b20398ab822f8cd9e40df19b6ccb3a6621c6e Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 11:40:24 -0400 Subject: [PATCH 24/40] More enums --- src/midst_toolkit/common/enumerations.py | 14 +++++++ .../models/clavaddpm/clustering.py | 5 ++- .../models/clavaddpm/data_loaders.py | 38 +++++++++---------- src/midst_toolkit/models/clavaddpm/model.py | 5 ++- .../models/clavaddpm/test_model.py | 6 +-- 5 files changed, 42 insertions(+), 26 deletions(-) diff --git a/src/midst_toolkit/common/enumerations.py b/src/midst_toolkit/common/enumerations.py index c9079f6f..c5d017bd 100644 --- a/src/midst_toolkit/common/enumerations.py +++ b/src/midst_toolkit/common/enumerations.py @@ -25,3 +25,17 @@ class DataSplit(Enum): TRAIN = "train" VALIDATION = "val" TEST = "test" + + +class DomainDataType(Enum): + """Possible types of domain data.""" + + CONTINUOUS = "continuous" + DISCRETE = "discrete" + + +class InfoDataType(Enum): + """Possible types of column information data.""" + + NUMERICAL = "numerical" + CATEGORICAL = "categorical" diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 9fd33122..430a1311 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -13,6 +13,7 @@ from sklearn.mixture import BayesianGaussianMixture, GaussianMixture from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, QuantileTransformer +from midst_toolkit.common.enumerations import DomainDataType from midst_toolkit.common.logger import log from midst_toolkit.models.clavaddpm.enumerations import ( ClusteringMethod, @@ -281,7 +282,7 @@ def _pair_clustering( ) new_col_entry = { - "type": "discrete", + "type": DomainDataType.DISCRETE.value, "size": len(set(parent_data_clusters.flatten())), } @@ -631,7 +632,7 @@ def _get_categorical_and_numerical_columns( for col_index, col in enumerate(all_columns): if col in tables_domain: - if tables_domain[col]["type"] == "discrete": + if tables_domain[col]["type"] == DomainDataType.DISCRETE.value: categorical_columns.append(col_index) else: numerical_columns.append(col_index) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index b2eaf3df..b81d1718 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -9,7 +9,7 @@ import pandas as pd import torch -from midst_toolkit.common.enumerations import DataSplit +from midst_toolkit.common.enumerations import DataSplit, DomainDataType, InfoDataType, TaskType from midst_toolkit.common.logger import log from midst_toolkit.models.clavaddpm.dataset import Dataset from midst_toolkit.models.clavaddpm.enumerations import TargetType @@ -96,13 +96,13 @@ def get_info_from_domain(data: pd.DataFrame, tables_domain: dict[str, Any]) -> d info["cat_col_idx"] = [] columns = data.columns.tolist() for i in range(len(columns)): - if tables_domain[columns[i]]["type"] == 
"discrete": + if tables_domain[columns[i]]["type"] == DomainDataType.DISCRETE.value: info["cat_col_idx"].append(i) else: info["num_col_idx"].append(i) info["target_col_idx"] = [] - info["task_type"] = "None" + info["task_type"] = None info["column_names"] = columns return info @@ -195,24 +195,24 @@ def process_pipeline_data( for column in numerical_column_indices: columns_info[column] = {} - columns_info["type"] = "numerical" + columns_info["type"] = InfoDataType.NUMERICAL.value columns_info["max"] = float(train_data[column].max()) columns_info["min"] = float(train_data[column].min()) for column in categorical_column_indices: columns_info[column] = {} - columns_info["type"] = "categorical" + columns_info["type"] = InfoDataType.CATEGORICAL.value columns_info["categorizes"] = list(set(train_data[column])) for column in target_columns_indices: - if info["task_type"] == "regression": + if info["task_type"] == TaskType.REGRESSION.value: columns_info[column] = {} - columns_info["type"] = "numerical" + columns_info["type"] = InfoDataType.NUMERICAL.value columns_info["max"] = float(train_data[column].max()) columns_info["min"] = float(train_data[column].min()) else: columns_info[column] = {} - columns_info["type"] = "categorical" + columns_info["type"] = InfoDataType.CATEGORICAL.value columns_info["categorizes"] = list(set(train_data[column])) info["column_info"] = columns_info @@ -300,24 +300,24 @@ def process_pipeline_data( target_columns_indices = info["target_col_idx"] for i in numerical_column_indices: - metadata["columns"][i] = {} - metadata["columns"][i]["sdtype"] = "numerical" - metadata["columns"][i]["computer_representation"] = "Float" + metadata["columns"][i] = { + "sdtype": InfoDataType.NUMERICAL.value, + "computer_representation": "Float", + } for i in categorical_column_indices: - metadata["columns"][i] = {} - metadata["columns"][i]["sdtype"] = "categorical" + metadata["columns"][i] = {"sdtype": InfoDataType.CATEGORICAL.value} - if task_type == "regression": + if task_type == TaskType.REGRESSION.value: for i in target_columns_indices: - metadata["columns"][i] = {} - metadata["columns"][i]["sdtype"] = "numerical" - metadata["columns"][i]["computer_representation"] = "Float" + metadata["columns"][i] = { + "sdtype": InfoDataType.NUMERICAL.value, + "computer_representation": "Float", + } else: for i in target_columns_indices: - metadata["columns"][i] = {} - metadata["columns"][i]["sdtype"] = "categorical" + metadata["columns"][i] = {"sdtype": InfoDataType.CATEGORICAL.value} info["metadata"] = metadata diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index 256e766a..d6ae31c6 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -13,6 +13,7 @@ # ruff: noqa: N812 from torch import Tensor, nn +from midst_toolkit.common.enumerations import DomainDataType, TaskType from midst_toolkit.common.logger import log from midst_toolkit.models.clavaddpm.enumerations import IsTargetCondioned, ModuleType @@ -137,7 +138,7 @@ def get_table_info(df: pd.DataFrame, tables_domain: dict[str, Any], y_col: str) num_cols = [] for col in df.columns: if col in tables_domain and col != y_col: - if tables_domain[col]["type"] == "discrete": + if tables_domain[col]["type"] == DomainDataType.DISCRETE.value: cat_cols.append(col) else: num_cols.append(col) @@ -147,7 +148,7 @@ def get_table_info(df: pd.DataFrame, tables_domain: dict[str, Any], y_col: str) df_info["num_cols"] = num_cols df_info["y_col"] = y_col 
df_info["n_classes"] = 0 - df_info["task_type"] = "multiclass" + df_info["task_type"] = TaskType.MULTICLASS.value return df_info diff --git a/tests/integration/models/clavaddpm/test_model.py b/tests/integration/models/clavaddpm/test_model.py index dfe40210..80cb68f5 100644 --- a/tests/integration/models/clavaddpm/test_model.py +++ b/tests/integration/models/clavaddpm/test_model.py @@ -73,7 +73,7 @@ def test_load_single_table(): "num_col_idx": [0, 3, 4, 7], "cat_col_idx": [1, 2, 5, 6], "target_col_idx": [], - "task_type": "None", + "task_type": None, "column_names": ["trans_date", "trans_type", "operation", "amount", "balance", "k_symbol", "bank", "account"], "column_info": { 0: {}, @@ -139,7 +139,7 @@ def test_load_multi_table(): "num_col_idx": [1], "cat_col_idx": [0], "target_col_idx": [], - "task_type": "None", + "task_type": None, "column_names": ["frequency", "account_date"], "column_info": { 0: {}, @@ -196,7 +196,7 @@ def test_load_multi_table(): "num_col_idx": [0, 3, 4, 7], "cat_col_idx": [1, 2, 5, 6], "target_col_idx": [], - "task_type": "None", + "task_type": None, "column_names": ["trans_date", "trans_type", "operation", "amount", "balance", "k_symbol", "bank", "account"], "column_info": { 0: {}, From 972947d2935deff7087a573ecc756ee8ad74b033 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 13:08:24 -0400 Subject: [PATCH 25/40] Adding datasplits class --- .../models/clavaddpm/data_loaders.py | 316 ++++++++++-------- 1 file changed, 175 insertions(+), 141 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index b81d1718..ccd725dc 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -1,6 +1,7 @@ import json import os from collections.abc import Generator +from dataclasses import dataclass from logging import INFO from pathlib import Path from typing import Any, Self @@ -27,7 +28,7 @@ def load_multi_table( data_dir: The directory to load the dataset from. verbose: Whether to print verbose output. Optional, default is True. training_data_ratio: The ratio of the data to be used for training. Should be between 0 and 1. - If it's == 1, it will only return the training set. Optional, default is 1. + If it's equal to 1, it will only return the training set. Optional, default is 1. Returns: A tuple with 3 values: @@ -108,8 +109,23 @@ def get_info_from_domain(data: pd.DataFrame, tables_domain: dict[str, Any]) -> d return info +@dataclass +class DataFeatures: + data: pd.DataFrame + numerical_features: np.ndarray | None = None + categorical_features: np.ndarray | None = None + target_features: np.ndarray | None = None + + +@dataclass +class DataSplits: + train_data: DataFeatures + test_data: DataFeatures | None = None + seed: int | None = None + + def process_pipeline_data( - # ruff: noqa: PLR0915, PLR0912 + # ruff: noqa: PLR0912, PLR0915 table_name: str, data: pd.DataFrame, info: dict[str, Any], @@ -130,7 +146,7 @@ def process_pipeline_data( data: The dataframe containing the data. info: The information dictionary, retrieved from the get_info_from_domain function. training_data_ratio: The ratio of the data to be used for training. Should be between 0 and 1. - If it's == 1, it will only return the training set. Optional, default is 0.9. + If it's equal to 1, it will only return the training set. Optional, default is 0.9. save: Whether to save the data. Optional, default is False. verbose: Whether to print verbose output. 
Optional, default is True. @@ -153,8 +169,6 @@ def process_pipeline_data( if training_data_ratio == 1: log(INFO, "Training data ratio is 1, so the data will not be split into training and test sets.") - num_samples = data.shape[0] - column_names = info["column_names"] if info["column_names"] else data.columns.tolist() numerical_column_indices = info["num_col_idx"] @@ -172,122 +186,61 @@ def process_pipeline_data( categorical_columns = [column_names[i] for i in categorical_column_indices] target_columns = [column_names[i] for i in target_columns_indices] - # Train/ Test Split: - # num_train_samples% for Training, (1 - num_test_samples)% for Testing - # Validation set will be selected from Training set - num_train_samples = int(num_samples * training_data_ratio) - num_test_samples = num_samples - num_train_samples - - test_data: pd.DataFrame | None = None - - if training_data_ratio < 1: - train_data, test_data, _ = train_test_split(data, categorical_columns, num_train_samples, num_test_samples) - else: - train_data = data.copy() + data_splits = train_test_split(data, categorical_columns, training_data_ratio) - train_data.columns = list(range(len(train_data.columns))) - - if training_data_ratio < 1: - assert test_data is not None - test_data.columns = list(range(len(test_data.columns))) - - columns_info: dict[Any, Any] = {} - - for column in numerical_column_indices: - columns_info[column] = {} - columns_info["type"] = InfoDataType.NUMERICAL.value - columns_info["max"] = float(train_data[column].max()) - columns_info["min"] = float(train_data[column].min()) - - for column in categorical_column_indices: - columns_info[column] = {} - columns_info["type"] = InfoDataType.CATEGORICAL.value - columns_info["categorizes"] = list(set(train_data[column])) + data_splits.train_data.data.columns = list(range(len(data_splits.train_data.data.columns))) - for column in target_columns_indices: - if info["task_type"] == TaskType.REGRESSION.value: - columns_info[column] = {} - columns_info["type"] = InfoDataType.NUMERICAL.value - columns_info["max"] = float(train_data[column].max()) - columns_info["min"] = float(train_data[column].min()) - else: - columns_info[column] = {} - columns_info["type"] = InfoDataType.CATEGORICAL.value - columns_info["categorizes"] = list(set(train_data[column])) + if data_splits.test_data is not None: + data_splits.test_data.data.columns = list(range(len(data_splits.test_data.data.columns))) - info["column_info"] = columns_info + info["column_info"] = _get_columns_info( + data_splits.train_data.data, + numerical_column_indices, + categorical_column_indices, + target_columns_indices, + TaskType(info["task_type"]) if info["task_type"] else None, + ) - train_data.rename(columns=index_to_name_mapping, inplace=True) - if training_data_ratio < 1: - assert test_data is not None - test_data.rename(columns=index_to_name_mapping, inplace=True) + data_splits.train_data.data.rename(columns=index_to_name_mapping, inplace=True) + if data_splits.test_data is not None: + data_splits.test_data.data.rename(columns=index_to_name_mapping, inplace=True) for col in numerical_columns: - train_data.loc[train_data[col] == "?", col] = np.nan + data_splits.train_data.data.loc[data_splits.train_data.data[col] == "?", col] = np.nan for col in categorical_columns: - train_data.loc[train_data[col] == "?", col] = "nan" + data_splits.train_data.data.loc[data_splits.train_data.data[col] == "?", col] = "nan" - if training_data_ratio < 1: - assert test_data is not None + if data_splits.test_data is not None: for col in 
numerical_columns: - test_data.loc[test_data[col] == "?", col] = np.nan + data_splits.test_data.data.loc[data_splits.test_data.data[col] == "?", col] = np.nan for col in categorical_columns: - test_data.loc[test_data[col] == "?", col] = "nan" - - x_num_train = train_data[numerical_columns].to_numpy().astype(np.float32) - x_cat_train = train_data[categorical_columns].to_numpy() - y_train = train_data[target_columns].to_numpy() - - x_num_test: np.ndarray | None = None - x_cat_test: np.ndarray | None = None - y_test: np.ndarray | None = None - - if training_data_ratio < 1: - assert test_data is not None - x_num_test = test_data[numerical_columns].to_numpy().astype(np.float32) - x_cat_test = test_data[categorical_columns].to_numpy() - y_test = test_data[target_columns].to_numpy() - - if save: - save_dir = f"data/{table_name}" - np.save(f"{save_dir}/x_num_train.npy", x_num_train) - np.save(f"{save_dir}/x_cat_train.npy", x_cat_train) - np.save(f"{save_dir}/y_train.npy", y_train) + data_splits.test_data.data.loc[data_splits.test_data.data[col] == "?", col] = "nan" - if training_data_ratio < 1: - assert x_num_test is not None and x_cat_test is not None and y_test is not None - np.save(f"{save_dir}/x_num_test.npy", x_num_test) - np.save(f"{save_dir}/x_cat_test.npy", x_cat_test) - np.save(f"{save_dir}/y_test.npy", y_test) - - train_data[numerical_columns] = train_data[numerical_columns].astype(np.float32) - - if training_data_ratio < 1: - assert test_data is not None - test_data[numerical_columns] = test_data[numerical_columns].astype(np.float32) - - if save: - train_data.to_csv(f"{save_dir}/train.csv", index=False) - - if training_data_ratio < 1: - assert test_data is not None - test_data.to_csv(f"{save_dir}/test.csv", index=False) + data_splits.train_data.numerical_features = ( + data_splits.train_data.data[numerical_columns].to_numpy().astype(np.float32) + ) + data_splits.train_data.categorical_features = data_splits.train_data.data[categorical_columns].to_numpy() + data_splits.train_data.target_features = data_splits.train_data.data[target_columns].to_numpy() - if not os.path.exists(f"synthetic/{table_name}"): - os.makedirs(f"synthetic/{table_name}") + if data_splits.test_data is not None: + data_splits.test_data.numerical_features = ( + data_splits.test_data.data[numerical_columns].to_numpy().astype(np.float32) + ) + data_splits.test_data.categorical_features = data_splits.test_data.data[categorical_columns].to_numpy() + data_splits.test_data.target_features = data_splits.test_data.data[target_columns].to_numpy() - train_data.to_csv(f"synthetic/{table_name}/real.csv", index=False) + data_splits.train_data.data[numerical_columns] = data_splits.train_data.data[numerical_columns].astype(np.float32) - if training_data_ratio < 1: - assert test_data is not None - test_data.to_csv(f"synthetic/{table_name}/test.csv", index=False) + if data_splits.test_data is not None: + data_splits.test_data.data[numerical_columns] = data_splits.test_data.data[numerical_columns].astype( + np.float32 + ) info["column_names"] = column_names - info["train_num"] = train_data.shape[0] + info["train_num"] = data_splits.train_data.data.shape[0] - if training_data_ratio < 1: - assert test_data is not None - info["test_num"] = test_data.shape[0] + if data_splits.test_data is not None: + info["test_num"] = data_splits.test_data.data.shape[0] info["idx_mapping"] = index_mapping info["inverse_idx_mapping"] = inverse_index_mapping @@ -322,37 +275,34 @@ def process_pipeline_data( info["metadata"] = metadata if save: - with 
open(f"{save_dir}/info.json", "w") as file: - json.dump(info, file, indent=4) + _save_data_and_info(table_name, data_splits, info) if verbose: - if training_data_ratio < 1: - assert test_data is not None - str_shape = f"Train dataframe shape: {train_data.shape}, Test dataframe shape: {test_data.shape}, Total dataframe shape: {data.shape}" + if data_splits.test_data is not None: + str_shape = f"Train dataframe shape: {data_splits.train_data.data.shape}, Test dataframe shape: {data_splits.test_data.data.shape}, Total dataframe shape: {data.shape}" else: str_shape = f"Table name: {table_name}, Total dataframe shape: {data.shape}" - str_shape += f", Numerical data shape: {x_num_train.shape}" - str_shape += f", Categorical data shape: {x_cat_train.shape}" + str_shape += f", Numerical data shape: {data_splits.train_data.numerical_features.shape}" + str_shape += f", Categorical data shape: {data_splits.train_data.categorical_features.shape}" log(INFO, str_shape) output_data: dict[str, dict[str, Any]] = { "df": { - DataSplit.TRAIN.value: train_data, + DataSplit.TRAIN.value: data_splits.train_data.data, }, "numpy": { - "x_num_train": x_num_train, - "x_cat_train": x_cat_train, - "y_train": y_train, + "x_num_train": data_splits.train_data.numerical_features, + "x_cat_train": data_splits.train_data.categorical_features, + "y_train": data_splits.train_data.target_features, }, } - if training_data_ratio < 1: - assert test_data is not None and x_num_test is not None and x_cat_test is not None and y_test is not None - output_data["df"][DataSplit.TEST.value] = test_data - output_data["numpy"]["x_num_test"] = x_num_test - output_data["numpy"]["x_cat_test"] = x_cat_test - output_data["numpy"]["y_test"] = y_test + if data_splits.test_data is not None: + output_data["df"][DataSplit.TEST.value] = data_splits.test_data.data + output_data["numpy"]["x_num_test"] = data_splits.test_data.numerical_features + output_data["numpy"]["x_cat_test"] = data_splits.test_data.categorical_features + output_data["numpy"]["y_test"] = data_splits.test_data.target_features return output_data, info @@ -424,13 +374,85 @@ def get_column_name_mapping( return index_mapping, inverse_index_mapping, index_to_name_mapping +def _get_columns_info( + train_data: pd.DataFrame, + numerical_column_indices: list[int], + categorical_column_indices: list[int], + target_columns_indices: list[int], + task_type: TaskType | None, +) -> dict[str, Any]: + columns_info: dict[Any, Any] = {} + + for column in numerical_column_indices: + columns_info[column] = {} + columns_info["type"] = InfoDataType.NUMERICAL.value + columns_info["max"] = float(train_data[column].max()) + columns_info["min"] = float(train_data[column].min()) + + for column in categorical_column_indices: + columns_info[column] = {} + columns_info["type"] = InfoDataType.CATEGORICAL.value + columns_info["categorizes"] = list(set(train_data[column])) + + for column in target_columns_indices: + if task_type == TaskType.REGRESSION: + columns_info[column] = {} + columns_info["type"] = InfoDataType.NUMERICAL.value + columns_info["max"] = float(train_data[column].max()) + columns_info["min"] = float(train_data[column].min()) + else: + columns_info[column] = {} + columns_info["type"] = InfoDataType.CATEGORICAL.value + columns_info["categorizes"] = list(set(train_data[column])) + + return columns_info + + +def _save_data_and_info( + table_name: str, + data_splits: DataSplits, + info: dict[str, Any], +) -> None: + save_dir = f"data/{table_name}" + + 
data_splits.train_data.data.to_csv(f"{save_dir}/train.csv", index=False) + + if data_splits.test_data is not None: + data_splits.test_data.data.to_csv(f"{save_dir}/test.csv", index=False) + + if not os.path.exists(f"synthetic/{table_name}"): + os.makedirs(f"synthetic/{table_name}") + + data_splits.train_data.data.to_csv(f"synthetic/{table_name}/real.csv", index=False) + + if data_splits.test_data is not None: + data_splits.test_data.data.to_csv(f"synthetic/{table_name}/test.csv", index=False) + + assert data_splits.train_data.numerical_features is not None + assert data_splits.train_data.categorical_features is not None + assert data_splits.train_data.target_features is not None + np.save(f"{save_dir}/x_num_train.npy", data_splits.train_data.numerical_features) + np.save(f"{save_dir}/x_cat_train.npy", data_splits.train_data.categorical_features) + np.save(f"{save_dir}/y_train.npy", data_splits.train_data.target_features) + + if data_splits.test_data is not None: + assert data_splits.test_data.numerical_features is not None + assert data_splits.test_data.categorical_features is not None + assert data_splits.test_data.target_features is not None + np.save(f"{save_dir}/x_num_test.npy", data_splits.test_data.numerical_features) + np.save(f"{save_dir}/x_cat_test.npy", data_splits.test_data.categorical_features) + np.save(f"{save_dir}/y_test.npy", data_splits.test_data.target_features) + + with open(f"{save_dir}/info.json", "w") as file: + json.dump(info, file, indent=4) + + # TODO: refactor this function so it doesn't run the risk of running indefinitely. def train_test_split( data: pd.DataFrame, categorical_columns: list[str], - num_train_samples: int = 0, - num_test_samples: int = 0, -) -> tuple[pd.DataFrame, pd.DataFrame, int]: + training_data_ratio: float = 0.9, +) -> DataSplits: """ Split the data into training and test sets. @@ -440,8 +462,8 @@ def train_test_split( Args: data: The dataframe containing the data. categorical_columns: The names of the categorical columns. - num_train_samples: The number of rows in the training set. Optional, default is 0. - num_test_samples: The number of rows in the test set. Optional, default is 0. + training_data_ratio: The ratio of the data to be used for training. Should be between 0 and 1. + If it's equal to 1, it will only return the training set. Optional, default is 0.9. Returns: A tuple with 3 values: @@ -449,32 +471,44 @@ def train_test_split( - The test dataframe. - The seed used by the random number generator to generate the split. 
""" - total_num = data.shape[0] - idx = np.arange(total_num) + if training_data_ratio == 1: + return DataSplits(train_data=DataFeatures(data=data.copy()), test_data=None, seed=None) - seed = 1234 + # Train/ Test Split:# Train/ Test Split: + # num_train_samples% for Training, (1 - num_test_samples)% for Testing + # Validation set will be selected from Training set + num_samples = data.shape[0] + num_train_samples = int(num_samples * training_data_ratio) + num_test_samples = num_samples - num_train_samples + indices = np.arange(num_samples) + current_seed = 1234 while True: - np.random.seed(seed) - np.random.shuffle(idx) + np.random.seed(current_seed) + np.random.shuffle(indices) - train_idx = idx[:num_train_samples] - test_idx = idx[-num_test_samples:] + train_indices = indices[:num_train_samples] + test_indices = indices[-num_test_samples:] - train_df = data.loc[train_idx] - test_df = data.loc[test_idx] + train_data = data.loc[train_indices] + test_data = data.loc[test_indices] - flag = 0 + stop = True for i in categorical_columns: - if len(set(train_df[i])) != len(set(data[i])): - flag = 1 + if len(set(train_data[i])) != len(set(data[i])): + stop = False break - if flag == 0: + if stop: break - seed += 1 - return train_df, test_df, seed + current_seed += 1 + + return DataSplits( + train_data=DataFeatures(data=train_data), + test_data=DataFeatures(data=test_data), + seed=current_seed, + ) class FastTensorDataLoader: From 77a2249ba4dab6c31317fad15942692e003f919b Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 13:19:46 -0400 Subject: [PATCH 26/40] Splitting into another function --- .../models/clavaddpm/data_loaders.py | 176 +++++++++--------- 1 file changed, 93 insertions(+), 83 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index ccd725dc..0a15513f 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -125,7 +125,6 @@ class DataSplits: def process_pipeline_data( - # ruff: noqa: PLR0912, PLR0915 table_name: str, data: pd.DataFrame, info: dict[str, Any], @@ -169,82 +168,7 @@ def process_pipeline_data( if training_data_ratio == 1: log(INFO, "Training data ratio is 1, so the data will not be split into training and test sets.") - column_names = info["column_names"] if info["column_names"] else data.columns.tolist() - - numerical_column_indices = info["num_col_idx"] - categorical_column_indices = info["cat_col_idx"] - target_columns_indices = info["target_col_idx"] - - index_mapping, inverse_index_mapping, index_to_name_mapping = get_column_name_mapping( - data, - numerical_column_indices, - categorical_column_indices, - column_names, - ) - - numerical_columns = [column_names[i] for i in numerical_column_indices] - categorical_columns = [column_names[i] for i in categorical_column_indices] - target_columns = [column_names[i] for i in target_columns_indices] - - data_splits = train_test_split(data, categorical_columns, training_data_ratio) - - data_splits.train_data.data.columns = list(range(len(data_splits.train_data.data.columns))) - - if data_splits.test_data is not None: - data_splits.test_data.data.columns = list(range(len(data_splits.test_data.data.columns))) - - info["column_info"] = _get_columns_info( - data_splits.train_data.data, - numerical_column_indices, - categorical_column_indices, - target_columns_indices, - TaskType(info["task_type"]) if info["task_type"] else None, - ) - - 
data_splits.train_data.data.rename(columns=index_to_name_mapping, inplace=True) - if data_splits.test_data is not None: - data_splits.test_data.data.rename(columns=index_to_name_mapping, inplace=True) - - for col in numerical_columns: - data_splits.train_data.data.loc[data_splits.train_data.data[col] == "?", col] = np.nan - for col in categorical_columns: - data_splits.train_data.data.loc[data_splits.train_data.data[col] == "?", col] = "nan" - - if data_splits.test_data is not None: - for col in numerical_columns: - data_splits.test_data.data.loc[data_splits.test_data.data[col] == "?", col] = np.nan - for col in categorical_columns: - data_splits.test_data.data.loc[data_splits.test_data.data[col] == "?", col] = "nan" - - data_splits.train_data.numerical_features = ( - data_splits.train_data.data[numerical_columns].to_numpy().astype(np.float32) - ) - data_splits.train_data.categorical_features = data_splits.train_data.data[categorical_columns].to_numpy() - data_splits.train_data.target_features = data_splits.train_data.data[target_columns].to_numpy() - - if data_splits.test_data is not None: - data_splits.test_data.numerical_features = ( - data_splits.test_data.data[numerical_columns].to_numpy().astype(np.float32) - ) - data_splits.test_data.categorical_features = data_splits.test_data.data[categorical_columns].to_numpy() - data_splits.test_data.target_features = data_splits.test_data.data[target_columns].to_numpy() - - data_splits.train_data.data[numerical_columns] = data_splits.train_data.data[numerical_columns].astype(np.float32) - - if data_splits.test_data is not None: - data_splits.test_data.data[numerical_columns] = data_splits.test_data.data[numerical_columns].astype( - np.float32 - ) - - info["column_names"] = column_names - info["train_num"] = data_splits.train_data.data.shape[0] - - if data_splits.test_data is not None: - info["test_num"] = data_splits.test_data.data.shape[0] - - info["idx_mapping"] = index_mapping - info["inverse_idx_mapping"] = inverse_index_mapping - info["idx_name_mapping"] = index_to_name_mapping + data_splits, info = _split_data_and_populate_info(data, info, training_data_ratio) metadata: dict[str, Any] = {"columns": {}} task_type = info["task_type"] @@ -278,14 +202,15 @@ def process_pipeline_data( _save_data_and_info(table_name, data_splits, info) if verbose: + log(INFO, f"Train dataframe shape: {data_splits.train_data.data.shape}") if data_splits.test_data is not None: - str_shape = f"Train dataframe shape: {data_splits.train_data.data.shape}, Test dataframe shape: {data_splits.test_data.data.shape}, Total dataframe shape: {data.shape}" - else: - str_shape = f"Table name: {table_name}, Total dataframe shape: {data.shape}" + log(INFO, f"Test dataframe shape: {data_splits.test_data.data.shape}") + log(INFO, f"Total dataframe shape: {data.shape}") - str_shape += f", Numerical data shape: {data_splits.train_data.numerical_features.shape}" - str_shape += f", Categorical data shape: {data_splits.train_data.categorical_features.shape}" - log(INFO, str_shape) + assert data_splits.train_data.numerical_features is not None + assert data_splits.train_data.categorical_features is not None + log(INFO, f"Numerical data shape: {data_splits.train_data.numerical_features.shape}") + log(INFO, f"Categorical data shape: {data_splits.train_data.categorical_features.shape}") output_data: dict[str, dict[str, Any]] = { "df": { @@ -408,6 +333,91 @@ def _get_columns_info( return columns_info +def _split_data_and_populate_info( + data: pd.DataFrame, + info: dict[str, Any], + 
training_data_ratio: float, +) -> tuple[DataSplits, dict[str, Any]]: + column_names = info["column_names"] if info["column_names"] else data.columns.tolist() + + numerical_column_indices = info["num_col_idx"] + categorical_column_indices = info["cat_col_idx"] + target_columns_indices = info["target_col_idx"] + + index_mapping, inverse_index_mapping, index_to_name_mapping = get_column_name_mapping( + data, + numerical_column_indices, + categorical_column_indices, + column_names, + ) + + numerical_columns = [column_names[i] for i in numerical_column_indices] + categorical_columns = [column_names[i] for i in categorical_column_indices] + target_columns = [column_names[i] for i in target_columns_indices] + + data_splits = train_test_split(data, categorical_columns, training_data_ratio) + + data_splits.train_data.data.columns = list(range(len(data_splits.train_data.data.columns))) + + if data_splits.test_data is not None: + data_splits.test_data.data.columns = list(range(len(data_splits.test_data.data.columns))) + + info["column_info"] = _get_columns_info( + data_splits.train_data.data, + numerical_column_indices, + categorical_column_indices, + target_columns_indices, + TaskType(info["task_type"]) if info["task_type"] else None, + ) + + data_splits.train_data.data.rename(columns=index_to_name_mapping, inplace=True) + if data_splits.test_data is not None: + data_splits.test_data.data.rename(columns=index_to_name_mapping, inplace=True) + + for col in numerical_columns: + data_splits.train_data.data.loc[data_splits.train_data.data[col] == "?", col] = np.nan + for col in categorical_columns: + data_splits.train_data.data.loc[data_splits.train_data.data[col] == "?", col] = "nan" + + if data_splits.test_data is not None: + for col in numerical_columns: + data_splits.test_data.data.loc[data_splits.test_data.data[col] == "?", col] = np.nan + for col in categorical_columns: + data_splits.test_data.data.loc[data_splits.test_data.data[col] == "?", col] = "nan" + + data_splits.train_data.numerical_features = ( + data_splits.train_data.data[numerical_columns].to_numpy().astype(np.float32) + ) + data_splits.train_data.categorical_features = data_splits.train_data.data[categorical_columns].to_numpy() + data_splits.train_data.target_features = data_splits.train_data.data[target_columns].to_numpy() + + if data_splits.test_data is not None: + data_splits.test_data.numerical_features = ( + data_splits.test_data.data[numerical_columns].to_numpy().astype(np.float32) + ) + data_splits.test_data.categorical_features = data_splits.test_data.data[categorical_columns].to_numpy() + data_splits.test_data.target_features = data_splits.test_data.data[target_columns].to_numpy() + + data_splits.train_data.data[numerical_columns] = data_splits.train_data.data[numerical_columns].astype(np.float32) + + if data_splits.test_data is not None: + data_splits.test_data.data[numerical_columns] = data_splits.test_data.data[numerical_columns].astype( + np.float32 + ) + + info["column_names"] = column_names + info["train_num"] = data_splits.train_data.data.shape[0] + + if data_splits.test_data is not None: + info["test_num"] = data_splits.test_data.data.shape[0] + + info["idx_mapping"] = index_mapping + info["inverse_idx_mapping"] = inverse_index_mapping + info["idx_name_mapping"] = index_to_name_mapping + + return data_splits, info + + def _save_data_and_info( table_name: str, data_splits: DataSplits, From 67368ab190385c372c7e8548d70e51b3e4d19068 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 13:56:10 -0400 Subject: 
[PATCH 27/40] Adding docstrings, removing save --- .../models/clavaddpm/data_loaders.py | 195 +++++++++--------- 1 file changed, 103 insertions(+), 92 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index 0a15513f..f6f32a6d 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -1,5 +1,4 @@ import json -import os from collections.abc import Generator from dataclasses import dataclass from logging import INFO @@ -68,7 +67,6 @@ def load_multi_table( data=df_no_id, info=info, training_data_ratio=training_data_ratio, - save=False, verbose=verbose, ) tables[table]["info"] = info @@ -129,7 +127,6 @@ def process_pipeline_data( data: pd.DataFrame, info: dict[str, Any], training_data_ratio: float = 0.9, - save: bool = False, verbose: bool = True, ) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]: """ @@ -164,10 +161,6 @@ def process_pipeline_data( - "y_test": The target data for the test set. It will be absent if ratio == 1. - The information dictionary with updated values. """ - assert 0 < training_data_ratio <= 1, "Training data ratio must be between 0 and 1." - if training_data_ratio == 1: - log(INFO, "Training data ratio is 1, so the data will not be split into training and test sets.") - data_splits, info = _split_data_and_populate_info(data, info, training_data_ratio) metadata: dict[str, Any] = {"columns": {}} @@ -198,9 +191,6 @@ def process_pipeline_data( info["metadata"] = metadata - if save: - _save_data_and_info(table_name, data_splits, info) - if verbose: log(INFO, f"Train dataframe shape: {data_splits.train_data.data.shape}") if data_splits.test_data is not None: @@ -232,6 +222,7 @@ def process_pipeline_data( return output_data, info +# TODO: this might not be needed at all now. def get_column_name_mapping( data: pd.DataFrame, numerical_columns_indices: list[int], @@ -306,29 +297,54 @@ def _get_columns_info( target_columns_indices: list[int], task_type: TaskType | None, ) -> dict[str, Any]: + """ + Get the columns info dictionary to be populated into the info dictionary. + + Args: + train_data: The training data. + numerical_column_indices: The indices of the numerical columns. + categorical_column_indices: The indices of the categorical columns. + target_columns_indices: The indices of the target columns. + task_type: The type of the task. If None, it will assume the target + columns are categorical. + + Returns: + The columns info dictionary to be populated into the info dictionary. + It will contain the following keys for numerical columns: + - type: equals to InfoDataType.NUMERICAL.value. + - max: The maximum value of the column. + - min: The minimum value of the column. + It will contain the following keys for categorical columns: + - type: equals to InfoDataType.CATEGORICAL.value. + - categorizes: The list of possible categories of the column. 
+ """ columns_info: dict[Any, Any] = {} for column in numerical_column_indices: + column_name = train_data.columns[column] columns_info[column] = {} columns_info["type"] = InfoDataType.NUMERICAL.value - columns_info["max"] = float(train_data[column].max()) - columns_info["min"] = float(train_data[column].min()) + columns_info["max"] = float(train_data[column_name].max()) + columns_info["min"] = float(train_data[column_name].min()) for column in categorical_column_indices: + column_name = train_data.columns[column] columns_info[column] = {} columns_info["type"] = InfoDataType.CATEGORICAL.value - columns_info["categorizes"] = list(set(train_data[column])) + columns_info["categorizes"] = list(set(train_data[column_name])) for column in target_columns_indices: if task_type == TaskType.REGRESSION: + column_name = train_data.columns[column] columns_info[column] = {} columns_info["type"] = InfoDataType.NUMERICAL.value - columns_info["max"] = float(train_data[column].max()) - columns_info["min"] = float(train_data[column].min()) + columns_info["max"] = float(train_data[column_name].max()) + columns_info["min"] = float(train_data[column_name].min()) else: + column_name = train_data.columns[column] columns_info[column] = {} columns_info["type"] = InfoDataType.CATEGORICAL.value - columns_info["categorizes"] = list(set(train_data[column])) + columns_info["categorizes"] = list(set(train_data[column_name])) return columns_info @@ -338,11 +354,44 @@ def _split_data_and_populate_info( info: dict[str, Any], training_data_ratio: float, ) -> tuple[DataSplits, dict[str, Any]]: + """ + Split the data into training and test sets and populate the info dictionary + with additional metadata. + + Args: + data: The dataframe containing the data. + info: The info dictionary, retrieved from the get_info_from_domain function. + training_data_ratio: The ratio of the data to be used for training. + Should be between 0 and 1. If it's equal to 1, it will only return the training set. + + Returns: + A tuple with 2 values: + - The data splits as an instance of the DataSplits class. Test data will be None if the + training_data_ratio is 1. + - The info dictionary with updated maetadata, namely: + - column_info: The columns info dictionary, as returned by the _get_columns_info function. + - idx_mapping: The mapping of the indices in the original dataframe to the column names + for all columns, as returned by the get_column_name_mapping function. + - inverse_idx_mapping: The inverse mapping of the indices in the original dataframe to + the column names for all columns, as returned by the get_column_name_mapping function. + - idx_name_mapping: The mapping of the indices in the original dataframe to the column names + for all columns, as returned by the get_column_name_mapping function. + - train_num: The number of samples in the training set. + - test_num: The number of samples in the test set. It will be absent if the training_data_ratio is 1. + - column_names: The names of the columns. + """ + assert 0 < training_data_ratio <= 1, "Training data ratio must be between 0 and 1." 
+ if training_data_ratio == 1: + log(INFO, "Training data ratio is 1, so the data will not be split into training and test sets.") + column_names = info["column_names"] if info["column_names"] else data.columns.tolist() numerical_column_indices = info["num_col_idx"] categorical_column_indices = info["cat_col_idx"] target_columns_indices = info["target_col_idx"] + numerical_column_names = [column_names[i] for i in numerical_column_indices] + categorical_column_names = [column_names[i] for i in categorical_column_indices] + target_column_names = [column_names[i] for i in target_columns_indices] index_mapping, inverse_index_mapping, index_to_name_mapping = get_column_name_mapping( data, @@ -351,17 +400,10 @@ def _split_data_and_populate_info( column_names, ) - numerical_columns = [column_names[i] for i in numerical_column_indices] - categorical_columns = [column_names[i] for i in categorical_column_indices] - target_columns = [column_names[i] for i in target_columns_indices] - - data_splits = train_test_split(data, categorical_columns, training_data_ratio) - - data_splits.train_data.data.columns = list(range(len(data_splits.train_data.data.columns))) - - if data_splits.test_data is not None: - data_splits.test_data.data.columns = list(range(len(data_splits.test_data.data.columns))) + # Splitting the data into training and test sets + data_splits = train_test_split(data, categorical_column_names, training_data_ratio) + # Populating the column info into the info dictionary info["column_info"] = _get_columns_info( data_splits.train_data.data, numerical_column_indices, @@ -370,41 +412,49 @@ def _split_data_and_populate_info( TaskType(info["task_type"]) if info["task_type"] else None, ) - data_splits.train_data.data.rename(columns=index_to_name_mapping, inplace=True) - if data_splits.test_data is not None: - data_splits.test_data.data.rename(columns=index_to_name_mapping, inplace=True) - - for col in numerical_columns: - data_splits.train_data.data.loc[data_splits.train_data.data[col] == "?", col] = np.nan - for col in categorical_columns: - data_splits.train_data.data.loc[data_splits.train_data.data[col] == "?", col] = "nan" + # Replace the invalid and missing values with np.nan for the numerical columns + # and "nan" for the categorical columns + for column_name in numerical_column_names: + column_data = data_splits.train_data.data[column_name] + data_splits.train_data.data.loc[column_data == "?", column_name] = np.nan + for column_name in categorical_column_names: + column_data = data_splits.train_data.data[column_name] + data_splits.train_data.data.loc[column_data == "?", column_name] = "nan" if data_splits.test_data is not None: - for col in numerical_columns: - data_splits.test_data.data.loc[data_splits.test_data.data[col] == "?", col] = np.nan - for col in categorical_columns: - data_splits.test_data.data.loc[data_splits.test_data.data[col] == "?", col] = "nan" - - data_splits.train_data.numerical_features = ( - data_splits.train_data.data[numerical_columns].to_numpy().astype(np.float32) - ) - data_splits.train_data.categorical_features = data_splits.train_data.data[categorical_columns].to_numpy() - data_splits.train_data.target_features = data_splits.train_data.data[target_columns].to_numpy() + for column_name in numerical_column_names: + column_data = data_splits.test_data.data[column_name] + data_splits.test_data.data.loc[column_data == "?", column_name] = np.nan + for column_name in categorical_column_names: + column_data = data_splits.test_data.data[column_name] + 
data_splits.test_data.data.loc[column_data == "?", column_name] = "nan" + + # Extract the numerical, categorical and target features + # and convert them to numpy arrays + numerical_features = data_splits.train_data.data[numerical_column_names].to_numpy().astype(np.float32) + data_splits.train_data.numerical_features = numerical_features + categorical_features = data_splits.train_data.data[categorical_column_names].to_numpy() + data_splits.train_data.categorical_features = categorical_features + target_features = data_splits.train_data.data[target_column_names].to_numpy() + data_splits.train_data.target_features = target_features if data_splits.test_data is not None: - data_splits.test_data.numerical_features = ( - data_splits.test_data.data[numerical_columns].to_numpy().astype(np.float32) - ) - data_splits.test_data.categorical_features = data_splits.test_data.data[categorical_columns].to_numpy() - data_splits.test_data.target_features = data_splits.test_data.data[target_columns].to_numpy() + numerical_features = data_splits.test_data.data[numerical_column_names].to_numpy().astype(np.float32) + data_splits.test_data.numerical_features = numerical_features + categorical_features = data_splits.test_data.data[categorical_column_names].to_numpy() + data_splits.test_data.categorical_features = categorical_features + target_features = data_splits.test_data.data[target_column_names].to_numpy() + data_splits.test_data.target_features = target_features - data_splits.train_data.data[numerical_columns] = data_splits.train_data.data[numerical_columns].astype(np.float32) + # Making sure the numerical data is float + numerical_data_as_float = data_splits.train_data.data[numerical_column_names].astype(np.float32) + data_splits.train_data.data[numerical_column_names] = numerical_data_as_float if data_splits.test_data is not None: - data_splits.test_data.data[numerical_columns] = data_splits.test_data.data[numerical_columns].astype( - np.float32 - ) + numerical_data_as_float = data_splits.test_data.data[numerical_column_names].astype(np.float32) + data_splits.test_data.data[numerical_column_names] = numerical_data_as_float + # Populating the rest of the info dictionary info["column_names"] = column_names info["train_num"] = data_splits.train_data.data.shape[0] @@ -418,45 +468,6 @@ def _split_data_and_populate_info( return data_splits, info -def _save_data_and_info( - table_name: str, - data_splits: DataSplits, - info: dict[str, Any], -) -> None: - save_dir = f"data/{table_name}" - - data_splits.train_data.data.to_csv(f"{save_dir}/train.csv", index=False) - - if data_splits.test_data is not None: - data_splits.test_data.data.to_csv(f"{save_dir}/test.csv", index=False) - - if not os.path.exists(f"synthetic/{table_name}"): - os.makedirs(f"synthetic/{table_name}") - - data_splits.train_data.data.to_csv(f"synthetic/{table_name}/real.csv", index=False) - - if data_splits.test_data is not None: - data_splits.test_data.data.to_csv(f"synthetic/{table_name}/test.csv", index=False) - - assert data_splits.train_data.numerical_features is not None - assert data_splits.train_data.categorical_features is not None - assert data_splits.train_data.target_features is not None - np.save(f"{save_dir}/x_num_train.npy", data_splits.train_data.numerical_features) - np.save(f"{save_dir}/x_cat_train.npy", data_splits.train_data.categorical_features) - np.save(f"{save_dir}/y_train.npy", data_splits.train_data.target_features) - - if data_splits.test_data is not None: - assert data_splits.test_data.numerical_features is not None - 
assert data_splits.test_data.categorical_features is not None - assert data_splits.test_data.target_features is not None - np.save(f"{save_dir}/x_num_test.npy", data_splits.test_data.numerical_features) - np.save(f"{save_dir}/x_cat_test.npy", data_splits.test_data.categorical_features) - np.save(f"{save_dir}/y_test.npy", data_splits.test_data.target_features) - - with open(f"{save_dir}/info.json", "w") as file: - json.dump(info, file, indent=4) - - # TODO: refactor this function so it doesn't run the risk of running indefinitely. def train_test_split( data: pd.DataFrame, From 293f4d96c712b0aaf5e6fb7585414543573bb41c Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 14:04:17 -0400 Subject: [PATCH 28/40] Renaming function --- src/midst_toolkit/models/clavaddpm/clustering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 7492dc6e..e8b6f085 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -141,7 +141,7 @@ def _run_clustering( parent_df_with_cluster, child_df_with_cluster, group_lengths_prob_dicts, - ) = _pair_clustering_keep_id( + ) = _pair_clustering( tables, child, parent, @@ -157,7 +157,7 @@ def _run_clustering( return tables, all_group_lengths_prob_dicts -def _pair_clustering_keep_id( +def _pair_clustering( tables: Tables, child_name: str, parent_name: str, From f8c9adf6385413741a9b742ca02b086ba659ad2d Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 14:29:20 -0400 Subject: [PATCH 29/40] One more refactor --- .../models/clavaddpm/data_loaders.py | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index f6f32a6d..04093a4f 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -61,14 +61,9 @@ def load_multi_table( tables[table]["original_df"] = tables[table]["df"].copy() id_cols = [col for col in tables[table]["df"].columns if "_id" in col] df_no_id = tables[table]["df"].drop(columns=id_cols) - info = get_info_from_domain(df_no_id, tables[table]["domain"]) - _, info = process_pipeline_data( - table_name=table, - data=df_no_id, - info=info, - training_data_ratio=training_data_ratio, - verbose=verbose, - ) + table_domain = tables[table]["domain"] + + _, info = process_pipeline_data(df_no_id, table_domain, training_data_ratio, verbose) tables[table]["info"] = info return tables, relation_order, dataset_meta @@ -123,9 +118,8 @@ class DataSplits: def process_pipeline_data( - table_name: str, data: pd.DataFrame, - info: dict[str, Any], + table_domain: dict[str, Any], training_data_ratio: float = 0.9, verbose: bool = True, ) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]: @@ -138,12 +132,10 @@ def process_pipeline_data( metadata. Args: - table_name: The name of the table. Used to name the files when saving the data. data: The dataframe containing the data. - info: The information dictionary, retrieved from the get_info_from_domain function. + table_domain: The table domain dictionary containing metadata about the data columns. training_data_ratio: The ratio of the data to be used for training. Should be between 0 and 1. If it's equal to 1, it will only return the training set. Optional, default is 0.9. - save: Whether to save the data. 
Optional, default is False. verbose: Whether to print verbose output. Optional, default is True. Returns: @@ -159,9 +151,10 @@ def process_pipeline_data( - "x_num_test": The numeric data for the test set. It will be absent if ratio == 1. - "x_cat_test": The categorical data for the test set. It will be absent if ratio == 1. - "y_test": The target data for the test set. It will be absent if ratio == 1. - - The information dictionary with updated values. + - The information dictionary as returned by the _split_data_and_populate_info function + with additional metadata. """ - data_splits, info = _split_data_and_populate_info(data, info, training_data_ratio) + data_splits, info = _split_data_and_generate_info(data, table_domain, training_data_ratio) metadata: dict[str, Any] = {"columns": {}} task_type = info["task_type"] @@ -349,9 +342,9 @@ def _get_columns_info( return columns_info -def _split_data_and_populate_info( +def _split_data_and_generate_info( data: pd.DataFrame, - info: dict[str, Any], + table_domain: dict[str, Any], training_data_ratio: float, ) -> tuple[DataSplits, dict[str, Any]]: """ @@ -360,7 +353,7 @@ def _split_data_and_populate_info( Args: data: The dataframe containing the data. - info: The info dictionary, retrieved from the get_info_from_domain function. + table_domain: The table domain dictionary containing metadata about the data columns. training_data_ratio: The ratio of the data to be used for training. Should be between 0 and 1. If it's equal to 1, it will only return the training set. @@ -368,7 +361,7 @@ def _split_data_and_populate_info( A tuple with 2 values: - The data splits as an instance of the DataSplits class. Test data will be None if the training_data_ratio is 1. - - The info dictionary with updated maetadata, namely: + - The info dictionary as retrieved from the get_info_from_domain function with updated metadata, namely: - column_info: The columns info dictionary, as returned by the _get_columns_info function. - idx_mapping: The mapping of the indices in the original dataframe to the column names for all columns, as returned by the get_column_name_mapping function. 
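Pulling this patch together, a hedged sketch of calling the trimmed-down process_pipeline_data, which now takes the raw frame and the table's domain dictionary directly; the frame, column names, and domain values are invented for illustration and the call is not verified against the real datasets.

import pandas as pd

from midst_toolkit.models.clavaddpm.data_loaders import process_pipeline_data

frame = pd.DataFrame({"trans_type": ["credit", "debit", "credit"], "amount": [10.0, 20.0, 30.0]})
domain = {
    "trans_type": {"size": 2, "type": "discrete"},
    "amount": {"size": 3, "type": "continuous"},
}
# With training_data_ratio == 1 no test split is produced.
output_data, info = process_pipeline_data(frame, domain, training_data_ratio=1.0, verbose=False)
x_num_train = output_data["numpy"]["x_num_train"]
x_cat_train = output_data["numpy"]["x_cat_train"]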
@@ -384,8 +377,9 @@ def _split_data_and_populate_info( if training_data_ratio == 1: log(INFO, "Training data ratio is 1, so the data will not be split into training and test sets.") - column_names = info["column_names"] if info["column_names"] else data.columns.tolist() + info = get_info_from_domain(data, table_domain) + column_names = info["column_names"] if info["column_names"] else data.columns.tolist() numerical_column_indices = info["num_col_idx"] categorical_column_indices = info["cat_col_idx"] target_columns_indices = info["target_col_idx"] From 1c055e8283eb85bef9f152ca18683a8dfdaf28c1 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 14:31:15 -0400 Subject: [PATCH 30/40] rolling back table_domain renamings --- src/midst_toolkit/models/clavaddpm/clustering.py | 8 ++++---- src/midst_toolkit/models/clavaddpm/data_loaders.py | 6 +++--- src/midst_toolkit/models/clavaddpm/model.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 430a1311..f697ed2d 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -615,14 +615,14 @@ def _get_parent_data_clusters( def _get_categorical_and_numerical_columns( all_columns: list[str], - tables_domain: dict[str, Any], + table_domain: dict[str, Any], ) -> tuple[list[int], list[int]]: """ Return the list of numerical and categorical column indices from the domain dictionary. Args: all_columns: List of all columns. - tables_domain: Dictionary of the tables' domain. + table_domain: Dictionary of the table's domain containing metadata about the data columns. Returns: Tuple with two lists of indices, one for the numerical columns and one for the categorical columns. @@ -631,8 +631,8 @@ def _get_categorical_and_numerical_columns( categorical_columns = [] for col_index, col in enumerate(all_columns): - if col in tables_domain: - if tables_domain[col]["type"] == DomainDataType.DISCRETE.value: + if col in table_domain: + if table_domain[col]["type"] == DomainDataType.DISCRETE.value: categorical_columns.append(col_index) else: numerical_columns.append(col_index) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index b81d1718..a1ae781e 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -75,13 +75,13 @@ def load_multi_table( return tables, relation_order, dataset_meta -def get_info_from_domain(data: pd.DataFrame, tables_domain: dict[str, Any]) -> dict[str, Any]: +def get_info_from_domain(data: pd.DataFrame, table_domain: dict[str, Any]) -> dict[str, Any]: """ Get the information dictionary from the table domain dictionary. Args: data: The dataframe containing the data. - tables_domain: The tables' domain dictionary containing metadata about the data columns. + table_domain: The table's domain dictionary containing metadata about the data columns. 
Returns: The information dictionary containing the following keys: @@ -96,7 +96,7 @@ def get_info_from_domain(data: pd.DataFrame, tables_domain: dict[str, Any]) -> d info["cat_col_idx"] = [] columns = data.columns.tolist() for i in range(len(columns)): - if tables_domain[columns[i]]["type"] == DomainDataType.DISCRETE.value: + if table_domain[columns[i]]["type"] == DomainDataType.DISCRETE.value: info["cat_col_idx"].append(i) else: info["num_col_idx"].append(i) diff --git a/src/midst_toolkit/models/clavaddpm/model.py b/src/midst_toolkit/models/clavaddpm/model.py index d6ae31c6..6c9cfafe 100644 --- a/src/midst_toolkit/models/clavaddpm/model.py +++ b/src/midst_toolkit/models/clavaddpm/model.py @@ -115,13 +115,13 @@ def forward(self, x: Tensor, timesteps: Tensor) -> Tensor: return self.model(x) -def get_table_info(df: pd.DataFrame, tables_domain: dict[str, Any], y_col: str) -> dict[str, Any]: +def get_table_info(df: pd.DataFrame, table_domain: dict[str, Any], y_col: str) -> dict[str, Any]: """ Get the dictionary of table information. Args: df: The dataframe containing the data. - tables_domain: The tables' domain dictionary of metadata about the data columns. + table_domain: The table's domain dictionary containing metadata about the data columns. y_col: The name of the target column. Returns: @@ -137,8 +137,8 @@ def get_table_info(df: pd.DataFrame, tables_domain: dict[str, Any], y_col: str) cat_cols = [] num_cols = [] for col in df.columns: - if col in tables_domain and col != y_col: - if tables_domain[col]["type"] == DomainDataType.DISCRETE.value: + if col in table_domain and col != y_col: + if table_domain[col]["type"] == DomainDataType.DISCRETE.value: cat_cols.append(col) else: num_cols.append(col) From 84fe9720fc81f17c1988fcca0ad4b8f9a4f78588 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 17:12:52 -0400 Subject: [PATCH 31/40] Splitting the make_dataset_from_df function --- .../models/clavaddpm/data_loaders.py | 12 +- src/midst_toolkit/models/clavaddpm/dataset.py | 276 ++++++++++-------- src/midst_toolkit/models/clavaddpm/train.py | 8 +- 3 files changed, 171 insertions(+), 125 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index 65a9d628..f2b6cfd5 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -527,7 +527,7 @@ def train_test_split( class FastTensorDataLoader: - def __init__(self, tensors: tuple[torch.Tensor, ...], batch_size: int = 32, shuffle: bool = False): + def __init__(self, tensors: list[torch.Tensor], batch_size: int = 32, shuffle: bool = False): """ Initialize a FastTensorDataLoader. @@ -537,7 +537,7 @@ def __init__(self, tensors: tuple[torch.Tensor, ...], batch_size: int = 32, shuf Source: https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6 Args: - tensors: a tuple of tensors to store. The first dimension for each tensor is the + tensors: a list of tensors to store. The first dimension for each tensor is the number of samples, and all tensors must have the same number of samples. batch_size: batch size to load. Optional, default is 32. 
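A brief sketch of the list-based FastTensorDataLoader interface after this change; the tensors are random toy data.

import torch

from midst_toolkit.models.clavaddpm.data_loaders import FastTensorDataLoader

features = torch.randn(10, 3)
labels = torch.randint(0, 2, (10,))
loader = FastTensorDataLoader([features, labels], batch_size=4, shuffle=True)
for batch_features, batch_labels in loader:
    # Each batch is a list of tensors sliced along the first dimension.
    assert batch_features.shape[0] == batch_labels.shape[0]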
shuffle: if True, shuffle the data *in-place* whenever an @@ -567,11 +567,11 @@ def __iter__(self) -> Self: """ if self.shuffle: r = torch.randperm(self.dataset_len) - self.tensors = [t[r] for t in self.tensors] # type: ignore[assignment] + self.tensors = [t[r] for t in self.tensors] self.i = 0 return self - def __next__(self) -> tuple[torch.Tensor, ...]: + def __next__(self) -> list[torch.Tensor]: """Get the next batch of data from the dataset. Returns: @@ -579,7 +579,7 @@ def __next__(self) -> tuple[torch.Tensor, ...]: """ if self.i >= self.dataset_len: raise StopIteration - batch = tuple(t[self.i : self.i + self.batch_size] for t in self.tensors) + batch = [t[self.i : self.i + self.batch_size] for t in self.tensors] self.i += self.batch_size return batch @@ -628,6 +628,6 @@ def prepare_fast_dataloader( else: raise ValueError(f"Unsupported target type: {target_type}") - dataloader = FastTensorDataLoader((x, y), batch_size=batch_size, shuffle=(split == DataSplit.TRAIN)) + dataloader = FastTensorDataLoader([x, y], batch_size=batch_size, shuffle=(split == DataSplit.TRAIN)) while True: yield from dataloader diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index 917f669a..106915ed 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -251,18 +251,18 @@ def calculate_metrics( # TODO consider moving all the functions below into the Dataset class -def get_category_sizes(x: torch.Tensor | np.ndarray) -> list[int]: +def get_category_sizes(features: torch.Tensor | np.ndarray) -> list[int]: """ Get the size of the categories in the data by counting the number of unique values in each column. Args: - x: The data to get the size of the categories of. + features: The data to get the size of the categories of. Returns: A list with the category sizes in the data. """ - x_t = x.T.cpu().tolist() if isinstance(x, torch.Tensor) else x.T.tolist() + x_t = features.T.cpu().tolist() if isinstance(features, torch.Tensor) else features.T.tolist() return [len(set(xt)) for xt in x_t] @@ -392,12 +392,11 @@ def _get_predicted_labels_and_probs( def make_dataset_from_df( - # ruff: noqa: PLR0915, PLR0912 - df: pd.DataFrame, + data: pd.DataFrame, transformations: Transformations, is_target_conditioned: IsTargetCondioned, - df_info: dict[str, Any], - ratios: list[float] | None = None, + info: dict[str, Any], + data_split_ratios: list[float] | None = None, std: float = 0, ) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]: """ @@ -411,7 +410,7 @@ def make_dataset_from_df( However, if we have n_classes > 0, then y is not the first column of the matrix. Args: - df: The pandas DataFrame to generate the dataset from. + data: The pandas DataFrame to generate the dataset from. transformations: The transformations to apply to the dataset. is_target_conditioned: The condition on the y column. IsTargetCondioned.CONCAT: y is concatenated to X, the model learns a joint distribution of (y, X) @@ -432,142 +431,189 @@ def make_dataset_from_df( y is synthesized using y's empirical distribution. X is generated by the model. In this case, y is completely independent of X. - df_info: A dictionary with metadata about the DataFrame. - ratios: The ratios of the dataset to split into train, val, and test. The sum of + info: A dictionary with metadata about the DataFrame. + data_split_ratios: The ratios of the dataset to split into train, val, and test. The sum of the ratios must amount to 1 (with a tolerance of 0.01). 
Optional, default is [0.7, 0.2, 0.1]. std: The standard deviation of the labels. Optional, default is 0. Returns: A tuple with the dataset, the label encoders, and the column orders. """ - if ratios is None: - ratios = [0.7, 0.2, 0.1] + if data_split_ratios is None: + data_split_ratios = [0.7, 0.2, 0.1] - assert len(ratios) == 3, "The ratios must be a list of 3 values (train, validation, test)." - assert np.isclose(sum(ratios), 1, atol=0.01), "The sum of the ratios must amount to 1 (with a tolerance of 0.01)." + assert len(data_split_ratios) == 3, "The ratios must be a list of 3 values (train, validation, test)." + assert np.isclose(sum(data_split_ratios), 1, atol=0.01), ( + "The sum of the ratios must amount to 1 (with a tolerance of 0.01)." + ) + + train_val_data, test_data = train_test_split(data, test_size=data_split_ratios[2], random_state=42) + train_data, val_data = train_test_split( + train_val_data, + test_size=data_split_ratios[1] / (data_split_ratios[0] + data_split_ratios[1]), + random_state=42, + ) + + categorical_column_names, numerical_column_names = _get_categorical_and_numerical_column_names( + info, + is_target_conditioned, + ) + + if categorical_column_names is not None and len(categorical_column_names) > 0: + categorical_features = { + DataSplit.TRAIN.value: train_data[categorical_column_names].to_numpy(dtype=np.str_), + DataSplit.VALIDATION.value: val_data[categorical_column_names].to_numpy(dtype=np.str_), + DataSplit.TEST.value: test_data[categorical_column_names].to_numpy(dtype=np.str_), + } + else: + categorical_features = None + + if numerical_column_names is not None and len(numerical_column_names) > 0: + numerical_features = { + DataSplit.TRAIN.value: train_data[numerical_column_names].values.astype(np.float32), + DataSplit.VALIDATION.value: val_data[numerical_column_names].values.astype(np.float32), + DataSplit.TEST.value: test_data[numerical_column_names].values.astype(np.float32), + } + else: + numerical_features = None - train_val_df, test_df = train_test_split(df, test_size=ratios[2], random_state=42) - train_df, val_df = train_test_split(train_val_df, test_size=ratios[1] / (ratios[0] + ratios[1]), random_state=42) + target = { + DataSplit.TRAIN.value: train_data[info["y_col"]].values.astype(np.float32), + DataSplit.VALIDATION.value: val_data[info["y_col"]].values.astype(np.float32), + DataSplit.TEST.value: test_data[info["y_col"]].values.astype(np.float32), + } - cat_column_orders = [] - num_column_orders = [] - index_to_column = list(df.columns) + index_to_column = list(data.columns) column_to_index = {col: i for i, col in enumerate(index_to_column)} + categorical_column_orders = [column_to_index[col] for col in categorical_column_names] + numerical_column_orders = [column_to_index[col] for col in numerical_column_names] - if df_info["n_classes"] > 0: - x_cat: dict[str, np.ndarray] | None = ( - {} if df_info["cat_cols"] is not None or is_target_conditioned == IsTargetCondioned.CONCAT else None - ) - x_num: dict[str, np.ndarray] | None = {} if df_info["num_cols"] is not None else None - y = {} + column_orders_indices = numerical_column_orders + categorical_column_orders + column_orders = [index_to_column[index] for index in column_orders_indices] - cat_cols_with_y: list[str] = [] - if df_info["cat_cols"] is not None: - cat_cols_with_y += df_info["cat_cols"] - if is_target_conditioned == IsTargetCondioned.CONCAT: - cat_cols_with_y = [df_info["y_col"]] + cat_cols_with_y + numerical_features, label_encoders = _merge_features(categorical_features, 
numerical_features, std) - if len(cat_cols_with_y) > 0: - x_cat[DataSplit.TRAIN.value] = train_df[cat_cols_with_y].to_numpy(dtype=np.str_) # type: ignore[index] - x_cat[DataSplit.VALIDATION.value] = val_df[cat_cols_with_y].to_numpy(dtype=np.str_) # type: ignore[index] - x_cat[DataSplit.TEST.value] = test_df[cat_cols_with_y].to_numpy(dtype=np.str_) # type: ignore[index] + assert isinstance(info["n_classes"], int) - y[DataSplit.TRAIN.value] = train_df[df_info["y_col"]].values.astype(np.float32) - y[DataSplit.VALIDATION.value] = val_df[df_info["y_col"]].values.astype(np.float32) - y[DataSplit.TEST.value] = test_df[df_info["y_col"]].values.astype(np.float32) + dataset = Dataset( + numerical_features, + None, + target, + y_info={}, + task_type=TaskType(info["task_type"]), + n_classes=info["n_classes"], + ) - if df_info["num_cols"] is not None: - x_num[DataSplit.TRAIN.value] = train_df[df_info["num_cols"]].values.astype(np.float32) # type: ignore[index] - x_num[DataSplit.VALIDATION.value] = val_df[df_info["num_cols"]].values.astype(np.float32) # type: ignore[index] - x_num[DataSplit.TEST.value] = test_df[df_info["num_cols"]].values.astype(np.float32) # type: ignore[index] + return transform_dataset(dataset, transformations, None), label_encoders, column_orders - cat_column_orders = [column_to_index[col] for col in cat_cols_with_y] - num_column_orders = [column_to_index[col] for col in df_info["num_cols"]] - else: - x_cat = {} if df_info["cat_cols"] is not None else None - x_num = {} if df_info["num_cols"] is not None or is_target_conditioned == IsTargetCondioned.CONCAT else None - y = {} +def _get_categorical_and_numerical_column_names( + info: dict[str, Any], + is_target_conditioned: IsTargetCondioned, +) -> tuple[list[str], list[str]]: + """ + Get the categorical and numerical column names from the info dictionary. + + Args: + info: The info dictionary. + is_target_conditioned: The condition on the y column. 
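Stepping back to the split logic introduced in `make_dataset_from_df` above: sklearn's `train_test_split` only produces two pieces, so the validation set is carved out of the remaining train-plus-validation portion with a rescaled `test_size`. The toy frame and sizes below are made up, but the arithmetic mirrors the code.

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.DataFrame({"a": np.arange(100), "b": np.arange(100) % 7})
data_split_ratios = [0.7, 0.2, 0.1]

# First split off the test set, then split the remainder into train/validation.
train_val_data, test_data = train_test_split(data, test_size=data_split_ratios[2], random_state=42)
train_data, val_data = train_test_split(
    train_val_data,
    # 0.2 / (0.7 + 0.2) of the remaining 90% is 20% of the full frame.
    test_size=data_split_ratios[1] / (data_split_ratios[0] + data_split_ratios[1]),
    random_state=42,
)

print(len(train_data), len(val_data), len(test_data))  # 70 20 10
```

Using the same `random_state` in both calls keeps the split reproducible; a later commit in this series surfaces that value as the `data_split_random_state` parameter.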
+ """ + numerical_columns: list[str] = [] + categorical_columns: list[str] = [] - num_cols_with_y: list[str] = [] - if df_info["num_cols"] is not None: - num_cols_with_y += df_info["num_cols"] + if info["n_classes"] > 0: + if info["cat_cols"] is not None: + categorical_columns += info["cat_cols"] if is_target_conditioned == IsTargetCondioned.CONCAT: - num_cols_with_y = [df_info["y_col"]] + num_cols_with_y + categorical_columns = [info["y_col"]] + categorical_columns - if len(num_cols_with_y) > 0: - assert x_num is not None - x_num[DataSplit.TRAIN.value] = train_df[num_cols_with_y].values.astype(np.float32) - x_num[DataSplit.VALIDATION.value] = val_df[num_cols_with_y].values.astype(np.float32) - x_num[DataSplit.TEST.value] = test_df[num_cols_with_y].values.astype(np.float32) + numerical_columns = info["num_cols"] - y[DataSplit.TRAIN.value] = train_df[df_info["y_col"]].values.astype(np.float32) - y[DataSplit.VALIDATION.value] = val_df[df_info["y_col"]].values.astype(np.float32) - y[DataSplit.TEST.value] = test_df[df_info["y_col"]].values.astype(np.float32) + else: + if info["num_cols"] is not None: + numerical_columns += info["num_cols"] + if is_target_conditioned == IsTargetCondioned.CONCAT: + numerical_columns = [info["y_col"]] + numerical_columns - if df_info["cat_cols"] is not None: - assert x_cat is not None - x_cat[DataSplit.TRAIN.value] = train_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) - x_cat[DataSplit.VALIDATION.value] = val_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) - x_cat[DataSplit.TEST.value] = test_df[df_info["cat_cols"]].to_numpy(dtype=np.str_) + categorical_columns = info["cat_cols"] - cat_column_orders = [column_to_index[col] for col in df_info["cat_cols"]] - num_column_orders = [column_to_index[col] for col in num_cols_with_y] + return categorical_columns, numerical_columns - column_orders_indices = num_column_orders + cat_column_orders - column_orders = [index_to_column[index] for index in column_orders_indices] - label_encoders = {} - if x_cat is not None and len(df_info["cat_cols"]) > 0: - x_cat_all = np.vstack( - (x_cat[DataSplit.TRAIN.value], x_cat[DataSplit.VALIDATION.value], x_cat[DataSplit.TEST.value]) - ) - x_cat_converted = [] - for col_index in range(x_cat_all.shape[1]): - label_encoder = LabelEncoder() - x_cat_converted.append(label_encoder.fit_transform(x_cat_all[:, col_index]).astype(float)) - if std > 0: - # add noise - x_cat_converted[-1] += np.random.normal(0, std, x_cat_converted[-1].shape) - label_encoders[col_index] = label_encoder - - x_cat_converted = np.vstack(x_cat_converted).T # type: ignore[assignment] - - train_num = x_cat[DataSplit.TRAIN.value].shape[0] - val_num = x_cat[DataSplit.VALIDATION.value].shape[0] - - x_cat[DataSplit.TRAIN.value] = x_cat_converted[:train_num, :] # type: ignore[call-overload] - x_cat[DataSplit.VALIDATION.value] = x_cat_converted[train_num : train_num + val_num, :] # type: ignore[call-overload] - x_cat[DataSplit.TEST.value] = x_cat_converted[train_num + val_num :, :] # type: ignore[call-overload] - - if x_num and len(x_num) > 0: - assert x_num is not None - x_num[DataSplit.TRAIN.value] = np.concatenate( - (x_num[DataSplit.TRAIN.value], x_cat[DataSplit.TRAIN.value]), axis=1 - ) - x_num[DataSplit.VALIDATION.value] = np.concatenate( - (x_num[DataSplit.VALIDATION.value], x_cat[DataSplit.VALIDATION.value]), axis=1 - ) - x_num[DataSplit.TEST.value] = np.concatenate( - (x_num[DataSplit.TEST.value], x_cat[DataSplit.TEST.value]), axis=1 - ) - else: - x_num = x_cat - x_cat = None +def _merge_features( + 
categorical_features: ArrayDict | None, + numerical_features: ArrayDict | None, + std: float, +) -> tuple[ArrayDict, dict[int, LabelEncoder]]: + """ + Merge the categorical with the numerical features for train, validation, and test datasets. - n_classes = df_info["n_classes"] - assert isinstance(n_classes, int) + Args: + categorical_features: The categorical features. + numerical_features: The numerical features. + std: The standard deviation of the labels. - dataset = Dataset( - x_num, - None, - y, - y_info={}, - task_type=TaskType(df_info["task_type"]), - n_classes=n_classes, + Returns: + The merged features for train, validation, and test datasets and the label encoders + used to do so. + """ + if categorical_features is None: + # if no categorical features, just return the numerical features + assert numerical_features is not None + return numerical_features, {} + + # Otherwise, encode the categorical features + all_categorical_data = np.vstack( + ( + categorical_features[DataSplit.TRAIN.value], + categorical_features[DataSplit.VALIDATION.value], + categorical_features[DataSplit.TEST.value], + ) ) - return transform_dataset(dataset, transformations, None), label_encoders, column_orders + categorical_data_converted = [] + label_encoders = {} + for column in range(all_categorical_data.shape[1]): + label_encoder = LabelEncoder() + encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float) + categorical_data_converted.append(encoded_labels) + if std > 0: + # add noise + categorical_data_converted[-1] += np.random.normal(0, std, categorical_data_converted[-1].shape) + label_encoders[column] = label_encoder + + categorical_data_transposed = np.vstack(categorical_data_converted).T + + num_train_samples = categorical_features[DataSplit.TRAIN.value].shape[0] + num_validation_samples = categorical_features[DataSplit.VALIDATION.value].shape[0] + + categorical_features[DataSplit.TRAIN.value] = categorical_data_transposed[:num_train_samples, :] + categorical_features[DataSplit.VALIDATION.value] = categorical_data_transposed[ + num_train_samples : num_train_samples + num_validation_samples, : + ] + categorical_features[DataSplit.TEST.value] = categorical_data_transposed[ + num_train_samples + num_validation_samples :, : + ] + + if numerical_features is None: + # if no numerical features then no need to merge, just return the categorical features + return categorical_features, label_encoders + + # Otherwise, merge the categorical and numerical features + merged_features = { + DataSplit.TRAIN.value: np.concatenate( + (numerical_features[DataSplit.TRAIN.value], categorical_features[DataSplit.TRAIN.value]), axis=1 + ), + DataSplit.VALIDATION.value: np.concatenate( + (numerical_features[DataSplit.VALIDATION.value], categorical_features[DataSplit.VALIDATION.value]), + axis=1, + ), + DataSplit.TEST.value: np.concatenate( + (numerical_features[DataSplit.TEST.value], categorical_features[DataSplit.TEST.value]), axis=1 + ), + } + + return merged_features, label_encoders def transform_dataset( diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 0f7b3fea..e9475365 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -304,8 +304,8 @@ def train_model( data_frame, transformations, is_target_conditioned=model_params.is_target_conditioned, - ratios=data_split_ratios, - df_info=data_frame_info, + data_split_ratios=data_split_ratios, + info=data_frame_info, std=0, ) @@ 
-419,8 +419,8 @@ def train_classifier( data_frame, transformations, is_target_conditioned=model_params.is_target_conditioned, - ratios=data_split_ratios, - df_info=data_frame_info, + data_split_ratios=data_split_ratios, + info=data_frame_info, std=0, ) print(dataset.n_features) From 56a09ecf03d6f23d9aa8ad8bf51180daa0e971e6 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 17:17:50 -0400 Subject: [PATCH 32/40] Fixing broken code from revert --- src/midst_toolkit/models/clavaddpm/data_loaders.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index f2b6cfd5..65a9d628 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -527,7 +527,7 @@ def train_test_split( class FastTensorDataLoader: - def __init__(self, tensors: list[torch.Tensor], batch_size: int = 32, shuffle: bool = False): + def __init__(self, tensors: tuple[torch.Tensor, ...], batch_size: int = 32, shuffle: bool = False): """ Initialize a FastTensorDataLoader. @@ -537,7 +537,7 @@ def __init__(self, tensors: list[torch.Tensor], batch_size: int = 32, shuffle: b Source: https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6 Args: - tensors: a list of tensors to store. The first dimension for each tensor is the + tensors: a tuple of tensors to store. The first dimension for each tensor is the number of samples, and all tensors must have the same number of samples. batch_size: batch size to load. Optional, default is 32. shuffle: if True, shuffle the data *in-place* whenever an @@ -567,11 +567,11 @@ def __iter__(self) -> Self: """ if self.shuffle: r = torch.randperm(self.dataset_len) - self.tensors = [t[r] for t in self.tensors] + self.tensors = [t[r] for t in self.tensors] # type: ignore[assignment] self.i = 0 return self - def __next__(self) -> list[torch.Tensor]: + def __next__(self) -> tuple[torch.Tensor, ...]: """Get the next batch of data from the dataset. 
Returns: @@ -579,7 +579,7 @@ def __next__(self) -> list[torch.Tensor]: """ if self.i >= self.dataset_len: raise StopIteration - batch = [t[self.i : self.i + self.batch_size] for t in self.tensors] + batch = tuple(t[self.i : self.i + self.batch_size] for t in self.tensors) self.i += self.batch_size return batch @@ -628,6 +628,6 @@ def prepare_fast_dataloader( else: raise ValueError(f"Unsupported target type: {target_type}") - dataloader = FastTensorDataLoader([x, y], batch_size=batch_size, shuffle=(split == DataSplit.TRAIN)) + dataloader = FastTensorDataLoader((x, y), batch_size=batch_size, shuffle=(split == DataSplit.TRAIN)) while True: yield from dataloader From f35b596012cfee4138b4a8641f9f3dd66d1bfd15 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 2 Oct 2025 17:22:32 -0400 Subject: [PATCH 33/40] CR by David --- src/midst_toolkit/models/clavaddpm/clustering.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index f697ed2d..04ad56c7 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -630,9 +630,9 @@ def _get_categorical_and_numerical_columns( numerical_columns = [] categorical_columns = [] - for col_index, col in enumerate(all_columns): - if col in table_domain: - if table_domain[col]["type"] == DomainDataType.DISCRETE.value: + for col_index, column in enumerate(all_columns): + if column in table_domain: + if table_domain[column]["type"] == DomainDataType.DISCRETE.value: categorical_columns.append(col_index) else: numerical_columns.append(col_index) From 0a9994ae40b7f063b01e74a1e337ff2823e8024a Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Fri, 3 Oct 2025 12:13:22 -0400 Subject: [PATCH 34/40] CR by David --- .../models/clavaddpm/clustering.py | 65 ++++++++++++------- .../models/clavaddpm/enumerations.py | 7 ++ 2 files changed, 48 insertions(+), 24 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index e8b6f085..916ff698 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -17,6 +17,7 @@ from midst_toolkit.models.clavaddpm.enumerations import ( ClusteringMethod, Configs, + ForeignKeyScalingType, GroupLengthsProbDicts, RelationOrder, Tables, @@ -304,8 +305,8 @@ def _denormalize_parent_data( i.e. duplicate the parent data for each element of the child group data. Args: - child_data: Numpy array of the child data. - parent_data: Numpy array of the parent data. + child_data: Numpy array of the child data. Should be sorted by the foreign key. + parent_data: Numpy array of the parent data. Should be sorted by the parent primary key. parent_primary_key_index: Index of the parent primary key. foreign_key_index: Index of the foreign key to the child data. @@ -329,13 +330,14 @@ def _denormalize_parent_data( return denormalized_parent_data -def _get_min_max_for_numerical_columns( +def _get_min_max_and_quantile_for_numerical_columns( child_numerical_data: np.ndarray, parent_numerical_data: np.ndarray, parent_scale: float, -) -> np.ndarray: +) -> tuple[np.ndarray, np.ndarray]: """ - Get the min-max values for the numerical columns in both the child and parent data. + Get the min-max and quantile values for the numerical columns in both the + child and parent data. Args: child_numerical_data: Numpy array of the child numerical data. 
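As context for the `FastTensorDataLoader` round-trip in the two changes above, the class exists to slice pre-loaded tensors directly instead of paying the per-item overhead of a standard `Dataset`/`DataLoader` pair. The loop below is a stripped-down sketch of that pattern with invented tensor shapes; it does not use the toolkit class itself.

```python
import torch

x = torch.randn(10, 4)
y = torch.randint(0, 2, (10,))
batch_size = 4

# Shuffle once per epoch with a random permutation, then yield plain slices.
permutation = torch.randperm(x.shape[0])
x, y = x[permutation], y[permutation]

for start in range(0, x.shape[0], batch_size):
    batch = (x[start : start + batch_size], y[start : start + batch_size])
    print(batch[0].shape, batch[1].shape)  # the final batch is smaller when 10 % 4 != 0
```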
@@ -343,7 +345,8 @@ def _get_min_max_for_numerical_columns( parent_scale: Scaling factor applied to the parent data. Returns: - Numpy array of the min-max values for the numerical columns. + A tuple with two numpy arrays, one with the min-max values and one with the quantile + values for the numerical columns. """ joint_matrix = np.concatenate([child_numerical_data, parent_numerical_data], axis=1) matrix_p_index = child_numerical_data.shape[1] @@ -355,7 +358,7 @@ def _get_min_max_for_numerical_columns( numerical_quantile[:, matrix_p_index:] = parent_scale * numerical_quantile[:, matrix_p_index:] numerical_min_max[:, matrix_p_index:] = parent_scale * numerical_min_max[:, matrix_p_index:] - return numerical_min_max + return numerical_min_max, numerical_quantile def _one_hot_encode_categorical_columns( @@ -409,13 +412,14 @@ def _one_hot_encode_categorical_columns( def _prepare_cluster_data( child_data: np.ndarray, parent_data: np.ndarray, - child_domain_dict: dict[str, Any], - parent_domain_dict: dict[str, Any], + child_domain: dict[str, Any], + parent_domain: dict[str, Any], all_child_columns: list[str], all_parent_columns: list[str], parent_primary_key: str, parent_scale: float, key_scale: float, + key_scaling_type: ForeignKeyScalingType = ForeignKeyScalingType.MINMAX, ) -> np.ndarray: """ Prepare the data for the clustering algorithm, which comprises of denormalizing the parent data, @@ -424,9 +428,9 @@ def _prepare_cluster_data( Args: child_data: Numpy array of the child data. parent_data: Numpy array of the parent data. - child_domain_dict: Dictionary of the domain of the child table. The domain dictionary + child_domain: Dictionary of the domain of the child table. The domain dictionary holds metadata about the columns of each one of the tables. - parent_domain_dict: Dictionary of the domain of the parent table. The domain dictionary + parent_domain: Dictionary of the domain of the parent table. The domain dictionary holds metadata about the columns of each one of the tables. all_child_columns: List of all child columns. all_parent_columns: List of all parent columns. @@ -436,10 +440,12 @@ def _prepare_cluster_data( key_scale: Scaling factor applied to the foreign key values that link the child table to the parent table. This will weight how much influence the parent-child relationship has in the clustering algorithm. + key_scaling_type: Type of scaling for the foreign key. Default is ForeignKeyScalingType.MINMAX. Returns: Numpy array of the data prepared for the clustering algorithm. """ + # Recalculating the keys' indices here to save us from passing one extra parameter. parent_primary_key_index = all_parent_columns.index(parent_primary_key) foreign_key_index = all_child_columns.index(parent_primary_key) @@ -454,11 +460,11 @@ def _prepare_cluster_data( # Columns that are not in the domain dictionary are ignored (except for the primary and foreign keys). 
child_numerical_columns, child_categorical_columns = _get_categorical_and_numerical_columns( all_child_columns, - child_domain_dict, + child_domain, ) parent_numerical_columns, parent_categorical_columns = _get_categorical_and_numerical_columns( all_parent_columns, - parent_domain_dict, + parent_domain, ) child_numerical_data = child_data[:, child_numerical_columns] @@ -466,25 +472,34 @@ def _prepare_cluster_data( parent_numerical_data = denormalized_parent_data[:, parent_numerical_columns] parent_categorical_data = denormalized_parent_data[:, parent_categorical_columns] - numerical_min_max = _get_min_max_for_numerical_columns( + numerical_min_max, numerical_quantile = _get_min_max_and_quantile_for_numerical_columns( child_numerical_data, parent_numerical_data, parent_scale, ) + reshaped_parent_data = denormalized_parent_data[:, parent_primary_key_index].reshape(-1, 1) + if key_scaling_type == ForeignKeyScalingType.MINMAX: + key_normalized = _min_max_normalize_sklearn(reshaped_parent_data) + numerical_normalized = numerical_min_max + elif key_scaling_type == ForeignKeyScalingType.QUANTILE: + key_normalized = _quantile_normalize_sklearn(reshaped_parent_data) + numerical_normalized = numerical_quantile + else: + raise ValueError(f"Unsupported foreign key scaling type: {key_scaling_type}") + + key_scaled = key_scale * key_normalized + categorical_one_hot = _one_hot_encode_categorical_columns( child_categorical_data, parent_categorical_data, parent_scale, ) - key_min_max = _min_max_normalize_sklearn(denormalized_parent_data[:, parent_primary_key_index].reshape(-1, 1)) - key_scaled = key_scale * key_min_max - if categorical_one_hot is None: - return np.concatenate((numerical_min_max, key_scaled), axis=1) + return np.concatenate((numerical_normalized, key_scaled), axis=1) - return np.concatenate((numerical_min_max, categorical_one_hot, key_scaled), axis=1) + return np.concatenate((numerical_normalized, categorical_one_hot, key_scaled), axis=1) def _get_cluster_labels( @@ -582,9 +597,11 @@ def _get_parent_data_clusters( ) -> np.ndarray: """ Get the parent data clusters from the child data with cluster and the parent data. + The child data needs to be sorted by the foreign key. Args: child_data_with_cluster: Numpy array of the child data with cluster information. + Should be sorted by the foreign key. parent_data: Numpy array of the parent data. parent_primary_key_index: Index of the parent primary key. foreign_key_index: Index of the foreign key to the child data. @@ -614,14 +631,14 @@ def _get_parent_data_clusters( def _get_categorical_and_numerical_columns( all_columns: list[str], - domain_dictionary: dict[str, Any], + table_domain: dict[str, Any], ) -> tuple[list[int], list[int]]: """ - Return the list of numerical and categorical column indices from the domain dictionary. + Return the list of numerical and categorical column indices from the table domain dictionary. Args: all_columns: List of all columns. - domain_dictionary: Dictionary of the domain. + table_domain: Dictionary of the domain. Returns: Tuple with two lists of indices, one for the numerical columns and one for the categorical columns. 
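The new `key_scaling_type` switch above amounts to choosing between a min-max and a quantile transform for the repeated parent key before it is weighted by `key_scale`. The sketch below assumes, as an illustration only, that the toolkit's `_min_max_normalize_sklearn` and `_quantile_normalize_sklearn` helpers wrap the corresponding sklearn transformers; the key values are made up.

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

# The repeated parent primary key, one row per child record, as a column vector.
reshaped_parent_key = np.array([1, 1, 2, 2, 2, 3], dtype=float).reshape(-1, 1)
key_scale = 0.5

key_minmax = MinMaxScaler().fit_transform(reshaped_parent_key)  # linear rescaling to [0, 1]
key_quantile = QuantileTransformer(n_quantiles=6).fit_transform(reshaped_parent_key)  # rank based

# Either variant is multiplied by key_scale before being appended to the cluster features.
print((key_scale * key_minmax).ravel())
print((key_scale * key_quantile).ravel())
```

A larger `key_scale` makes records that share a parent look more alike to the clustering step, which is the lever the docstring describes.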
@@ -630,8 +647,8 @@ def _get_categorical_and_numerical_columns( categorical_columns = [] for col_index, col in enumerate(all_columns): - if col in domain_dictionary: - if domain_dictionary[col]["type"] == "discrete": + if col in table_domain: + if table_domain[col]["type"] == "discrete": categorical_columns.append(col_index) else: numerical_columns.append(col_index) diff --git a/src/midst_toolkit/models/clavaddpm/enumerations.py b/src/midst_toolkit/models/clavaddpm/enumerations.py index 51694966..b9a9f771 100644 --- a/src/midst_toolkit/models/clavaddpm/enumerations.py +++ b/src/midst_toolkit/models/clavaddpm/enumerations.py @@ -100,3 +100,10 @@ class TargetType(Enum): FLOAT = "float" LONG = "long" + + +class ForeignKeyScalingType(Enum): + """Possible types of scaling for the foreign key.""" + + MINMAX = "minmax" + QUANTILE = "quantile" From 1f4fed2b8ba9f6fa15f5e5d6f8171799df00aed6 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Fri, 3 Oct 2025 12:55:05 -0400 Subject: [PATCH 35/40] CR by David --- .../models/clavaddpm/data_loaders.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index 65a9d628..4d9c8525 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -143,14 +143,18 @@ def process_pipeline_data( - The data dictionary containing the following keys: - "df": The dataframe containing the data. - DataSplit.TRAIN: The dataframe containing the training set. - - DataSplit.TEST: The dataframe containing the test set. It will be absent if ratio == 1. + - DataSplit.TEST: The dataframe containing the test set. It will be absent if + training_data_ratio == 1. - "numpy": A dictionary with the numeric data, containing the keys: - "x_num_train": The numeric data for the training set. - "x_cat_train": The categorical data for the training set. - "y_train": The target data for the training set. - - "x_num_test": The numeric data for the test set. It will be absent if ratio == 1. - - "x_cat_test": The categorical data for the test set. It will be absent if ratio == 1. - - "y_test": The target data for the test set. It will be absent if ratio == 1. + - "x_num_test": The numeric data for the test set. It will be absent if + training_data_ratio == 1. + - "x_cat_test": The categorical data for the test set. It will be absent if + training_data_ratio == 1. + - "y_test": The target data for the test set. It will be absent if + training_data_ratio == 1. - The information dictionary as returned by the _split_data_and_populate_info function with additional metadata. """ @@ -373,10 +377,6 @@ def _split_data_and_generate_info( - test_num: The number of samples in the test set. It will be absent if the training_data_ratio is 1. - column_names: The names of the columns. """ - assert 0 < training_data_ratio <= 1, "Training data ratio must be between 0 and 1." - if training_data_ratio == 1: - log(INFO, "Training data ratio is 1, so the data will not be split into training and test sets.") - info = get_info_from_domain(data, table_domain) column_names = info["column_names"] if info["column_names"] else data.columns.tolist() @@ -486,8 +486,12 @@ def train_test_split( - The test dataframe. - The seed used by the random number generator to generate the split. """ + assert 0 < training_data_ratio <= 1, "Training data ratio must be between 0 and 1." 
+ if training_data_ratio == 1: + log(INFO, "Training data ratio is 1, so the data will not be split into training and test sets.") + if training_data_ratio == 1: - return DataSplits(train_data=DataFeatures(data=data.copy()), test_data=None, seed=None) + return DataSplits(train_data=DataFeatures(data=data.copy())) # Train/ Test Split:# Train/ Test Split: # num_train_samples% for Training, (1 - num_test_samples)% for Testing From e9ffe391bdc165764ff66f1c5edb822712834cf9 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 6 Oct 2025 15:24:40 -0400 Subject: [PATCH 36/40] CR by David --- .../attacks/ensemble/process_split_data.py | 2 +- src/midst_toolkit/models/clavaddpm/dataset.py | 21 ++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/midst_toolkit/attacks/ensemble/process_split_data.py b/src/midst_toolkit/attacks/ensemble/process_split_data.py index 1e9b5e98..0f0040e8 100644 --- a/src/midst_toolkit/attacks/ensemble/process_split_data.py +++ b/src/midst_toolkit/attacks/ensemble/process_split_data.py @@ -150,7 +150,7 @@ def process_split_data( processed_attack_data_path: Path, column_to_stratify: str, num_total_samples: int = 40000, - random_seed: int = 42, + random_seed: int = 42, # TODO: do we really need to hardcode the random state? ) -> None: """ Splits the data into train, validation, and test sets according to the attack design. diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index 106915ed..596dbdea 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -257,7 +257,7 @@ def get_category_sizes(features: torch.Tensor | np.ndarray) -> list[int]: unique values in each column. Args: - features: The data to get the size of the categories of. + features: The data from which to extract category sizes. Returns: A list with the category sizes in the data. @@ -398,6 +398,7 @@ def make_dataset_from_df( info: dict[str, Any], data_split_ratios: list[float] | None = None, std: float = 0, + data_split_random_state: int = 42, ) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]: """ Generate a dataset from a pandas DataFrame. @@ -435,6 +436,8 @@ def make_dataset_from_df( data_split_ratios: The ratios of the dataset to split into train, val, and test. The sum of the ratios must amount to 1 (with a tolerance of 0.01). Optional, default is [0.7, 0.2, 0.1]. std: The standard deviation of the labels. Optional, default is 0. + data_split_random_state: The random state to use for the data split. Will be passed down to the + train_test_split function from sklearn. Optional, default is 42. Returns: A tuple with the dataset, the label encoders, and the column orders. @@ -447,11 +450,15 @@ def make_dataset_from_df( "The sum of the ratios must amount to 1 (with a tolerance of 0.01)." 
) - train_val_data, test_data = train_test_split(data, test_size=data_split_ratios[2], random_state=42) + train_val_data, test_data = train_test_split( + data, + test_size=data_split_ratios[2], + random_state=data_split_random_state, + ) train_data, val_data = train_test_split( train_val_data, test_size=data_split_ratios[1] / (data_split_ratios[0] + data_split_ratios[1]), - random_state=42, + random_state=data_split_random_state, ) categorical_column_names, numerical_column_names = _get_categorical_and_numerical_column_names( @@ -459,7 +466,7 @@ def make_dataset_from_df( is_target_conditioned, ) - if categorical_column_names is not None and len(categorical_column_names) > 0: + if len(categorical_column_names) > 0: categorical_features = { DataSplit.TRAIN.value: train_data[categorical_column_names].to_numpy(dtype=np.str_), DataSplit.VALIDATION.value: val_data[categorical_column_names].to_numpy(dtype=np.str_), @@ -468,7 +475,7 @@ def make_dataset_from_df( else: categorical_features = None - if numerical_column_names is not None and len(numerical_column_names) > 0: + if len(numerical_column_names) > 0: numerical_features = { DataSplit.TRAIN.value: train_data[numerical_column_names].values.astype(np.float32), DataSplit.VALIDATION.value: val_data[numerical_column_names].values.astype(np.float32), @@ -525,7 +532,7 @@ def _get_categorical_and_numerical_column_names( if info["cat_cols"] is not None: categorical_columns += info["cat_cols"] if is_target_conditioned == IsTargetCondioned.CONCAT: - categorical_columns = [info["y_col"]] + categorical_columns + categorical_columns += [info["y_col"]] numerical_columns = info["num_cols"] @@ -533,7 +540,7 @@ def _get_categorical_and_numerical_column_names( if info["num_cols"] is not None: numerical_columns += info["num_cols"] if is_target_conditioned == IsTargetCondioned.CONCAT: - numerical_columns = [info["y_col"]] + numerical_columns + numerical_columns += [info["y_col"]] categorical_columns = info["cat_cols"] From 224b2652d8c1bf3c1aa4fcc12547a16620a46bfe Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 6 Oct 2025 15:37:59 -0400 Subject: [PATCH 37/40] CR by David and Fatemeh --- .../models/clavaddpm/clustering.py | 44 ++++++++++--------- .../models/clavaddpm/enumerations.py | 2 +- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index a1bc5d13..b43e7396 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -18,8 +18,8 @@ from midst_toolkit.models.clavaddpm.enumerations import ( ClusteringMethod, Configs, - ForeignKeyScalingType, GroupLengthsProbDicts, + KeyScalingType, RelationOrder, Tables, ) @@ -295,15 +295,19 @@ def _pair_clustering( return parent_df_with_cluster, child_df_with_cluster, group_lengths_probabilities -def _denormalize_parent_data( +def _merge_parent_data_with_child_data( child_data: np.ndarray, parent_data: np.ndarray, parent_primary_key_index: int, foreign_key_index: int, ) -> np.ndarray: """ - Denormalize the parent data in relation to the child group data, - i.e. duplicate the parent data for each element of the child group data. + Merge the parent data in relation to the child group data. + + This is done by duplicating the parent data for each element of the child group data + in a process akin to database table denormalization. + + https://en.wikipedia.org/wiki/Denormalization Args: child_data: Numpy array of the child data. 
Should be sorted by the foreign key. @@ -312,7 +316,7 @@ def _denormalize_parent_data( foreign_key_index: Index of the foreign key to the child data. Returns: - Numpy array of the parent data denormalized for each group of the child group data. + Numpy array of the parent data merged for each group of the child group data. """ child_group_data_dict = _get_group_data_dict(child_data, [foreign_key_index]) @@ -325,10 +329,10 @@ def _denormalize_parent_data( else: group_lengths.append(len(child_group_data_dict[group_id_tuple])) group_lengths_np = np.array(group_lengths, dtype=int) - denormalized_parent_data = np.repeat(parent_data, group_lengths_np, axis=0) - assert (denormalized_parent_data[:, parent_primary_key_index] == child_data[:, foreign_key_index]).all() + merged_parent_data = np.repeat(parent_data, group_lengths_np, axis=0) + assert (merged_parent_data[:, parent_primary_key_index] == child_data[:, foreign_key_index]).all() - return denormalized_parent_data + return merged_parent_data def _get_min_max_and_quantile_for_numerical_columns( @@ -420,11 +424,12 @@ def _prepare_cluster_data( parent_primary_key: str, parent_scale: float, key_scale: float, - key_scaling_type: ForeignKeyScalingType = ForeignKeyScalingType.MINMAX, + key_scaling_type: KeyScalingType = KeyScalingType.MINMAX, ) -> np.ndarray: """ - Prepare the data for the clustering algorithm, which comprises of denormalizing the parent data, - splitting the data into categorical and numerical columns, and normalizing the data. + Prepare the data for the clustering algorithm, which comprises of merging the parent + and child data, splitting the data into categorical and numerical columns, and + normalizing the data. Args: child_data: Numpy array of the child data. @@ -438,10 +443,9 @@ def _prepare_cluster_data( parent_primary_key: Name of the parent primary key. parent_scale: Scaling factor applied to the parent table, provided by the config. It will be applied to the features to weight their importance during clustering. - key_scale: Scaling factor applied to the foreign key values that link - the child table to the parent table. This will weight how much influence + key_scale: Scaling factor applied to the tables' keys. This will weight how much influence the parent-child relationship has in the clustering algorithm. - key_scaling_type: Type of scaling for the foreign key. Default is ForeignKeyScalingType.MINMAX. + key_scaling_type: Type of scaling for the tables' keys. Default is KeyScalingType.MINMAX. Returns: Numpy array of the data prepared for the clustering algorithm. 
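To make the merge-by-repetition described above concrete, here is a toy version with invented keys and values; a parent with no children simply receives a repeat count of zero.

```python
import numpy as np

parent_data = np.array([[1, 0.5],
                        [2, 0.9],
                        [3, 0.1]])   # column 0: parent primary key
child_data = np.array([[10, 1],
                       [11, 1],
                       [12, 2]])     # column 1: foreign key, sorted

# Children per parent key 1, 2 and 3; parent 3 has none.
group_lengths = np.array([2, 1, 0])
merged_parent_data = np.repeat(parent_data, group_lengths, axis=0)

# The merged parent rows now line up one-to-one with the child rows.
assert (merged_parent_data[:, 0] == child_data[:, 1]).all()
print(merged_parent_data)  # [[1.  0.5] [1.  0.5] [2.  0.9]]
```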
@@ -450,7 +454,7 @@ def _prepare_cluster_data( parent_primary_key_index = all_parent_columns.index(parent_primary_key) foreign_key_index = all_child_columns.index(parent_primary_key) - denormalized_parent_data = _denormalize_parent_data( + merged_data = _merge_parent_data_with_child_data( child_data, parent_data, parent_primary_key_index, @@ -470,8 +474,8 @@ def _prepare_cluster_data( child_numerical_data = child_data[:, child_numerical_columns] child_categorical_data = child_data[:, child_categorical_columns] - parent_numerical_data = denormalized_parent_data[:, parent_numerical_columns] - parent_categorical_data = denormalized_parent_data[:, parent_categorical_columns] + parent_numerical_data = merged_data[:, parent_numerical_columns] + parent_categorical_data = merged_data[:, parent_categorical_columns] numerical_min_max, numerical_quantile = _get_min_max_and_quantile_for_numerical_columns( child_numerical_data, @@ -479,11 +483,11 @@ def _prepare_cluster_data( parent_scale, ) - reshaped_parent_data = denormalized_parent_data[:, parent_primary_key_index].reshape(-1, 1) - if key_scaling_type == ForeignKeyScalingType.MINMAX: + reshaped_parent_data = merged_data[:, parent_primary_key_index].reshape(-1, 1) + if key_scaling_type == KeyScalingType.MINMAX: key_normalized = _min_max_normalize_sklearn(reshaped_parent_data) numerical_normalized = numerical_min_max - elif key_scaling_type == ForeignKeyScalingType.QUANTILE: + elif key_scaling_type == KeyScalingType.QUANTILE: key_normalized = _quantile_normalize_sklearn(reshaped_parent_data) numerical_normalized = numerical_quantile else: diff --git a/src/midst_toolkit/models/clavaddpm/enumerations.py b/src/midst_toolkit/models/clavaddpm/enumerations.py index b9a9f771..be2d3ad7 100644 --- a/src/midst_toolkit/models/clavaddpm/enumerations.py +++ b/src/midst_toolkit/models/clavaddpm/enumerations.py @@ -102,7 +102,7 @@ class TargetType(Enum): LONG = "long" -class ForeignKeyScalingType(Enum): +class KeyScalingType(Enum): """Possible types of scaling for the foreign key.""" MINMAX = "minmax" From 94f014b5714f77d9fd646e5f04d1586bbe39de5d Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 6 Oct 2025 15:37:59 -0400 Subject: [PATCH 38/40] CR by David and Fatemeh --- .../models/clavaddpm/clustering.py | 44 ++++++++++--------- .../models/clavaddpm/enumerations.py | 2 +- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 916ff698..5198670d 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -17,8 +17,8 @@ from midst_toolkit.models.clavaddpm.enumerations import ( ClusteringMethod, Configs, - ForeignKeyScalingType, GroupLengthsProbDicts, + KeyScalingType, RelationOrder, Tables, ) @@ -294,15 +294,19 @@ def _pair_clustering( return parent_df_with_cluster, child_df_with_cluster, group_lengths_probabilities -def _denormalize_parent_data( +def _merge_parent_data_with_child_data( child_data: np.ndarray, parent_data: np.ndarray, parent_primary_key_index: int, foreign_key_index: int, ) -> np.ndarray: """ - Denormalize the parent data in relation to the child group data, - i.e. duplicate the parent data for each element of the child group data. + Merge the parent data in relation to the child group data. + + This is done by duplicating the parent data for each element of the child group data + in a process akin to database table denormalization. 
+ + https://en.wikipedia.org/wiki/Denormalization Args: child_data: Numpy array of the child data. Should be sorted by the foreign key. @@ -311,7 +315,7 @@ def _denormalize_parent_data( foreign_key_index: Index of the foreign key to the child data. Returns: - Numpy array of the parent data denormalized for each group of the child group data. + Numpy array of the parent data merged for each group of the child group data. """ child_group_data_dict = _get_group_data_dict(child_data, [foreign_key_index]) @@ -324,10 +328,10 @@ def _denormalize_parent_data( else: group_lengths.append(len(child_group_data_dict[group_id_tuple])) group_lengths_np = np.array(group_lengths, dtype=int) - denormalized_parent_data = np.repeat(parent_data, group_lengths_np, axis=0) - assert (denormalized_parent_data[:, parent_primary_key_index] == child_data[:, foreign_key_index]).all() + merged_parent_data = np.repeat(parent_data, group_lengths_np, axis=0) + assert (merged_parent_data[:, parent_primary_key_index] == child_data[:, foreign_key_index]).all() - return denormalized_parent_data + return merged_parent_data def _get_min_max_and_quantile_for_numerical_columns( @@ -419,11 +423,12 @@ def _prepare_cluster_data( parent_primary_key: str, parent_scale: float, key_scale: float, - key_scaling_type: ForeignKeyScalingType = ForeignKeyScalingType.MINMAX, + key_scaling_type: KeyScalingType = KeyScalingType.MINMAX, ) -> np.ndarray: """ - Prepare the data for the clustering algorithm, which comprises of denormalizing the parent data, - splitting the data into categorical and numerical columns, and normalizing the data. + Prepare the data for the clustering algorithm, which comprises of merging the parent + and child data, splitting the data into categorical and numerical columns, and + normalizing the data. Args: child_data: Numpy array of the child data. @@ -437,10 +442,9 @@ def _prepare_cluster_data( parent_primary_key: Name of the parent primary key. parent_scale: Scaling factor applied to the parent table, provided by the config. It will be applied to the features to weight their importance during clustering. - key_scale: Scaling factor applied to the foreign key values that link - the child table to the parent table. This will weight how much influence + key_scale: Scaling factor applied to the tables' keys. This will weight how much influence the parent-child relationship has in the clustering algorithm. - key_scaling_type: Type of scaling for the foreign key. Default is ForeignKeyScalingType.MINMAX. + key_scaling_type: Type of scaling for the tables' keys. Default is KeyScalingType.MINMAX. Returns: Numpy array of the data prepared for the clustering algorithm. 
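The end product of `_prepare_cluster_data` is a horizontal stack of three blocks: the normalized numerical columns, the one-hot encoded categorical columns, and the scaled key. A shape-only illustration with placeholder arrays:

```python
import numpy as np

# Placeholder blocks standing in for the outputs of the normalization,
# one-hot encoding and key-scaling steps described above.
numerical_normalized = np.zeros((5, 3))
categorical_one_hot = np.zeros((5, 4))
key_scaled = np.zeros((5, 1))

cluster_data = np.concatenate((numerical_normalized, categorical_one_hot, key_scaled), axis=1)
print(cluster_data.shape)  # (5, 8)
```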
@@ -449,7 +453,7 @@ def _prepare_cluster_data( parent_primary_key_index = all_parent_columns.index(parent_primary_key) foreign_key_index = all_child_columns.index(parent_primary_key) - denormalized_parent_data = _denormalize_parent_data( + merged_data = _merge_parent_data_with_child_data( child_data, parent_data, parent_primary_key_index, @@ -469,8 +473,8 @@ def _prepare_cluster_data( child_numerical_data = child_data[:, child_numerical_columns] child_categorical_data = child_data[:, child_categorical_columns] - parent_numerical_data = denormalized_parent_data[:, parent_numerical_columns] - parent_categorical_data = denormalized_parent_data[:, parent_categorical_columns] + parent_numerical_data = merged_data[:, parent_numerical_columns] + parent_categorical_data = merged_data[:, parent_categorical_columns] numerical_min_max, numerical_quantile = _get_min_max_and_quantile_for_numerical_columns( child_numerical_data, @@ -478,11 +482,11 @@ def _prepare_cluster_data( parent_scale, ) - reshaped_parent_data = denormalized_parent_data[:, parent_primary_key_index].reshape(-1, 1) - if key_scaling_type == ForeignKeyScalingType.MINMAX: + reshaped_parent_data = merged_data[:, parent_primary_key_index].reshape(-1, 1) + if key_scaling_type == KeyScalingType.MINMAX: key_normalized = _min_max_normalize_sklearn(reshaped_parent_data) numerical_normalized = numerical_min_max - elif key_scaling_type == ForeignKeyScalingType.QUANTILE: + elif key_scaling_type == KeyScalingType.QUANTILE: key_normalized = _quantile_normalize_sklearn(reshaped_parent_data) numerical_normalized = numerical_quantile else: diff --git a/src/midst_toolkit/models/clavaddpm/enumerations.py b/src/midst_toolkit/models/clavaddpm/enumerations.py index b9a9f771..be2d3ad7 100644 --- a/src/midst_toolkit/models/clavaddpm/enumerations.py +++ b/src/midst_toolkit/models/clavaddpm/enumerations.py @@ -102,7 +102,7 @@ class TargetType(Enum): LONG = "long" -class ForeignKeyScalingType(Enum): +class KeyScalingType(Enum): """Possible types of scaling for the foreign key.""" MINMAX = "minmax" From 8eb21ae02010148a99ee718f51af8ea9056fa951 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 6 Oct 2025 16:08:08 -0400 Subject: [PATCH 39/40] Last CR comment by David --- src/midst_toolkit/models/clavaddpm/dataset.py | 14 +++++++------- src/midst_toolkit/models/clavaddpm/train.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index 596dbdea..dfca9e26 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -397,7 +397,7 @@ def make_dataset_from_df( is_target_conditioned: IsTargetCondioned, info: dict[str, Any], data_split_ratios: list[float] | None = None, - std: float = 0, + noise_scale: float = 0, data_split_random_state: int = 42, ) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]: """ @@ -435,7 +435,7 @@ def make_dataset_from_df( info: A dictionary with metadata about the DataFrame. data_split_ratios: The ratios of the dataset to split into train, val, and test. The sum of the ratios must amount to 1 (with a tolerance of 0.01). Optional, default is [0.7, 0.2, 0.1]. - std: The standard deviation of the labels. Optional, default is 0. + noise_scale: The scale of the noise to add to the categorical features. Optional, default is 0. data_split_random_state: The random state to use for the data split. Will be passed down to the train_test_split function from sklearn. 
Optional, default is 42. @@ -498,7 +498,7 @@ def make_dataset_from_df( column_orders_indices = numerical_column_orders + categorical_column_orders column_orders = [index_to_column[index] for index in column_orders_indices] - numerical_features, label_encoders = _merge_features(categorical_features, numerical_features, std) + numerical_features, label_encoders = _merge_features(categorical_features, numerical_features, noise_scale) assert isinstance(info["n_classes"], int) @@ -550,7 +550,7 @@ def _get_categorical_and_numerical_column_names( def _merge_features( categorical_features: ArrayDict | None, numerical_features: ArrayDict | None, - std: float, + noise_scale: float, ) -> tuple[ArrayDict, dict[int, LabelEncoder]]: """ Merge the categorical with the numerical features for train, validation, and test datasets. @@ -558,7 +558,7 @@ def _merge_features( Args: categorical_features: The categorical features. numerical_features: The numerical features. - std: The standard deviation of the labels. + noise_scale: The scale of the noise to add to the categorical features. Returns: The merged features for train, validation, and test datasets and the label encoders @@ -584,9 +584,9 @@ def _merge_features( label_encoder = LabelEncoder() encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float) categorical_data_converted.append(encoded_labels) - if std > 0: + if noise_scale > 0: # add noise - categorical_data_converted[-1] += np.random.normal(0, std, categorical_data_converted[-1].shape) + categorical_data_converted[-1] += np.random.normal(0, noise_scale, categorical_data_converted[-1].shape) label_encoders[column] = label_encoder categorical_data_transposed = np.vstack(categorical_data_converted).T diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index e9475365..8bc96ab3 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -306,7 +306,7 @@ def train_model( is_target_conditioned=model_params.is_target_conditioned, data_split_ratios=data_split_ratios, info=data_frame_info, - std=0, + noise_scale=0, ) category_sizes = np.array(dataset.get_category_sizes(DataSplit.TRAIN)) @@ -421,7 +421,7 @@ def train_classifier( is_target_conditioned=model_params.is_target_conditioned, data_split_ratios=data_split_ratios, info=data_frame_info, - std=0, + noise_scale=0, ) print(dataset.n_features) train_loader = prepare_fast_dataloader( From edc60bbfd1d5ed98c94cedb07ca0082a41ff901e Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Fri, 10 Oct 2025 15:05:20 -0400 Subject: [PATCH 40/40] CR by Fatemeh --- src/midst_toolkit/models/clavaddpm/dataset.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index dfca9e26..ac6896f3 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -440,7 +440,7 @@ def make_dataset_from_df( train_test_split function from sklearn. Optional, default is 42. Returns: - A tuple with the dataset, the label encoders, and the column orders. + A tuple with the dataset, the label encoders, and the column names in the order they appear in the dataset. 
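The `std` to `noise_scale` rename above is easier to follow next to the encoding step it configures: each categorical column is label-encoded to floats and, when `noise_scale > 0`, perturbed with Gaussian noise of that standard deviation. A minimal sketch with made-up values:

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

column = np.array(["red", "blue", "red", "green"], dtype=np.str_)
noise_scale = 0.01

# Classes are sorted, so blue -> 0, green -> 1, red -> 2.
label_encoder = LabelEncoder()
encoded = label_encoder.fit_transform(column).astype(float)  # [2., 0., 2., 1.]

if noise_scale > 0:
    # Optionally perturb the encoded values with Gaussian noise.
    encoded += np.random.normal(0, noise_scale, encoded.shape)

print(encoded)
print(label_encoder.classes_)  # ['blue' 'green' 'red']
```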
""" if data_split_ratios is None: data_split_ratios = [0.7, 0.2, 0.1] @@ -490,6 +490,9 @@ def make_dataset_from_df( DataSplit.TEST.value: test_data[info["y_col"]].values.astype(np.float32), } + # build the column_orders list + # It's a string list with the names numerical columns followed by the names of + # the categorical columns in order they appear in the dataset that will be returned index_to_column = list(data.columns) column_to_index = {col: i for i, col in enumerate(index_to_column)} categorical_column_orders = [column_to_index[col] for col in categorical_column_names] @@ -498,7 +501,12 @@ def make_dataset_from_df( column_orders_indices = numerical_column_orders + categorical_column_orders column_orders = [index_to_column[index] for index in column_orders_indices] - numerical_features, label_encoders = _merge_features(categorical_features, numerical_features, noise_scale) + # Encode the categorical features and merge them with the numerical features + numerical_features, label_encoders = _encode_and_merge_features( + categorical_features, + numerical_features, + noise_scale, + ) assert isinstance(info["n_classes"], int) @@ -547,7 +555,7 @@ def _get_categorical_and_numerical_column_names( return categorical_columns, numerical_columns -def _merge_features( +def _encode_and_merge_features( categorical_features: ArrayDict | None, numerical_features: ArrayDict | None, noise_scale: float, @@ -555,9 +563,17 @@ def _merge_features( """ Merge the categorical with the numerical features for train, validation, and test datasets. + The categorical features are encoded and then merged with the numerical features. The + label encoders used to do that are also returned. + + If ``noise_scale`` is greater than 0, noise from a normal distribution with a standard + deviation of ``noise_scale`` is added to the categorical features. + Args: - categorical_features: The categorical features. - numerical_features: The numerical features. + categorical_features: A dictionary with the categorical features data for train, + validation, and test datasets. + numerical_features: A dictionary with the numerical features data for train, + validation, and test datasets. noise_scale: The scale of the noise to add to the categorical features. Returns: