Merged
Changes from 62 commits
Commits
65 commits
c059d93
Merge "Refactor Transformations handling: replace get_T_dict with a d…
lotif Sep 22, 2025
9f0290a
Refactor clustering method handling: Introduce ClusteringMethod enum …
lotif Sep 22, 2025
80073f2
Refactor model handling: Introduce ModelType enum for improved type s…
lotif Sep 22, 2025
1d43783
Merge "Refactor model parameters: Introduce ModelParameters and RTDLP…
lotif Sep 22, 2025
f7d69ec
Merge "Refactor y condition handling: Replace string literals with Is…
lotif Sep 22, 2025
01ed9c1
Refactor Gaussian loss handling: Introduce GaussianLossType enum to r…
lotif Sep 22, 2025
0dbc6b4
Refactor scheduler handling: Introduce Scheduler enum to replace stri…
lotif Sep 22, 2025
128da65
Merge "Refactor sampler initialization: Update UniformSampler and Los…
lotif Sep 22, 2025
f158b44
Enhance metric and loss handling: Refactor loss computation in _numer…
lotif Sep 23, 2025
6503788
Transforming a lot of literals into enums
lotif Sep 29, 2025
d599334
WIP renaming RTDL, cat and num and data splits
lotif Sep 29, 2025
1950f5c
Using more data splits and adding types for gaussian parametrization
lotif Sep 30, 2025
3e8237c
Adding enum for YType
lotif Sep 30, 2025
c957dec
Merge branch 'main' into marcelo/classes-and-enums-2
lotif Sep 30, 2025
4d0707b
Renaming Scheduler to SchedulerType and moving it and GaussianLossTyp…
lotif Sep 30, 2025
b7db96e
Merge remote-tracking branch 'origin/marcelo/classes-and-enums-2' int…
lotif Sep 30, 2025
30d0a0d
WIP CR by David
lotif Sep 30, 2025
779b108
Merge branch 'main' into marcelo/classes-and-enums-2
lotif Oct 1, 2025
0e3a42a
Cont'd CR comments by David
lotif Oct 1, 2025
8600ccb
Merge remote-tracking branch 'origin/marcelo/classes-and-enums-2' int…
lotif Oct 1, 2025
bc67266
Adding TODO
lotif Oct 1, 2025
774e99b
WIp starting the breakdown
lotif Oct 1, 2025
1168e93
Renames
lotif Oct 1, 2025
bf05c3c
Last breakdown
lotif Oct 1, 2025
d90ed2c
Removing ignore
lotif Oct 1, 2025
96414be
Finished refactoring
lotif Oct 1, 2025
aafd66c
Merge branch 'main' into marcelo/remove-ignores
lotif Oct 1, 2025
594d9cd
Renamings, mostly
lotif Oct 2, 2025
3a2b203
More enums
lotif Oct 2, 2025
972947d
Adding datasplits class
lotif Oct 2, 2025
77a2249
Splitting into another function
lotif Oct 2, 2025
67368ab
Adding docstrings, removing save
lotif Oct 2, 2025
293f4d9
Renaming function
lotif Oct 2, 2025
dbe71a4
Merge branch 'marcelo/refactoring-pair-clustering' into marcelo/renam…
lotif Oct 2, 2025
3a29c46
Merge branch 'marcelo/refactoring-pair-clustering' into marcelo/remov…
lotif Oct 2, 2025
f8c9adf
One more refactor
lotif Oct 2, 2025
1c055e8
rolling back table_domain renamings
lotif Oct 2, 2025
390ad5b
Merge branch 'marcelo/renamings' into marcelo/refactor-process-pipeli…
lotif Oct 2, 2025
84fe972
Splitting the make_dataset_from_df function
lotif Oct 2, 2025
56a09ec
Fixing broken code from revert
lotif Oct 2, 2025
f35b596
CR by David
lotif Oct 2, 2025
0a9994a
CR by David
lotif Oct 3, 2025
c7ed903
Merge branch 'marcelo/refactoring-pair-clustering' into marcelo/renam…
lotif Oct 3, 2025
ba7ab5a
Merge branch 'marcelo/renamings' into marcelo/refactor-process-pipeli…
lotif Oct 3, 2025
771bde8
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 3, 2025
1f4fed2
CR by David
lotif Oct 3, 2025
57bd33c
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 3, 2025
e9ffe39
CR by David
lotif Oct 6, 2025
224b265
CR by David and Fatemeh
lotif Oct 6, 2025
10e9989
Merge branch 'main' into marcelo/refactoring-pair-clustering
lotif Oct 6, 2025
94f014b
CR by David and Fatemeh
lotif Oct 6, 2025
f479990
Merge remote-tracking branch 'origin/marcelo/refactoring-pair-cluster…
lotif Oct 6, 2025
7d120ac
Merge branch 'marcelo/refactoring-pair-clustering' into marcelo/renam…
lotif Oct 6, 2025
626a39a
Merge branch 'marcelo/renamings' into marcelo/refactor-process-pipeli…
lotif Oct 6, 2025
93dfd31
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 6, 2025
8eb21ae
Last CR comment by David
lotif Oct 6, 2025
53c1320
Fixing merge conflicts
lotif Oct 6, 2025
e75c5ca
Merge branch 'marcelo/renamings' into marcelo/refactor-process-pipeli…
lotif Oct 6, 2025
3b472e3
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 6, 2025
ea8e2db
Merge branch 'main' into marcelo/refactor-process-pipeline-data
lotif Oct 6, 2025
def3b21
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 6, 2025
747d9c4
Merge branch 'main' into marcelo/refactor-make-dataset-from-df
lotif Oct 6, 2025
edc60bb
CR by Fatemeh
lotif Oct 10, 2025
f80068f
Merge branch 'main' into marcelo/refactor-make-dataset-from-df
emersodb Oct 14, 2025
d75f587
Merge branch 'main' into marcelo/refactor-make-dataset-from-df
lotif Oct 16, 2025
2 changes: 1 addition & 1 deletion src/midst_toolkit/attacks/ensemble/process_split_data.py
@@ -150,7 +150,7 @@ def process_split_data(
     processed_attack_data_path: Path,
     column_to_stratify: str,
     num_total_samples: int = 40000,
-    random_seed: int = 42,
+    random_seed: int = 42,  # TODO: do we really need to hardcode the random state?
 ) -> None:
     """
     Splits the data into train, validation, and test sets according to the attack design.
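Note for reviewers, to the TODO's point: the seed could simply be threaded through from the caller rather than hardcoded. A minimal sketch of a stratified three-way split in the same spirit as process_split_data (the helper name and the 70/10/20 ratios are illustrative assumptions, not the toolkit's API):

```python
import pandas as pd
from sklearn.model_selection import train_test_split


def three_way_split(data: pd.DataFrame, column_to_stratify: str, random_seed: int = 42):
    # First hold out 20% as the test set, stratified on the given column.
    train_val, test = train_test_split(
        data, test_size=0.2, stratify=data[column_to_stratify], random_state=random_seed
    )
    # Then take 25% of the remaining 80% as validation (i.e., 20% of the original data).
    train, val = train_test_split(
        train_val, test_size=0.25, stratify=train_val[column_to_stratify], random_state=random_seed
    )
    return train, val, test
```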
287 changes: 170 additions & 117 deletions src/midst_toolkit/models/clavaddpm/dataset.py
@@ -251,18 +251,18 @@ def calculate_metrics(
 
 
 # TODO consider moving all the functions below into the Dataset class
-def get_category_sizes(x: torch.Tensor | np.ndarray) -> list[int]:
+def get_category_sizes(features: torch.Tensor | np.ndarray) -> list[int]:
     """
     Get the size of the categories in the data by counting the number of
     unique values in each column.
 
     Args:
-        x: The data to get the size of the categories of.
+        features: The data from which to extract category sizes.
 
     Returns:
         A list with the category sizes in the data.
     """
-    x_t = x.T.cpu().tolist() if isinstance(x, torch.Tensor) else x.T.tolist()
+    x_t = features.T.cpu().tolist() if isinstance(features, torch.Tensor) else features.T.tolist()
     return [len(set(xt)) for xt in x_t]


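For intuition, a quick self-contained sketch of what get_category_sizes computes (the toy values are illustrative):

```python
import numpy as np

# Two categorical columns: the first takes 3 distinct values, the second takes 2.
features = np.array(
    [
        ["a", "x"],
        ["b", "x"],
        ["c", "y"],
        ["a", "y"],
    ]
)
# Transpose so each row is a column, then count unique values per column,
# mirroring the body of get_category_sizes.
sizes = [len(set(column)) for column in features.T.tolist()]
assert sizes == [3, 2]
```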
@@ -392,13 +392,13 @@ def _get_predicted_labels_and_probs(


 def make_dataset_from_df(
-    # ruff: noqa: PLR0915, PLR0912
-    df: pd.DataFrame,
+    data: pd.DataFrame,
     transformations: Transformations,
     is_target_conditioned: IsTargetCondioned,
-    df_info: dict[str, Any],
-    ratios: list[float] | None = None,
-    std: float = 0,
+    info: dict[str, Any],
+    data_split_ratios: list[float] | None = None,
+    noise_scale: float = 0,
+    data_split_random_state: int = 42,
 ) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]:
     """
     Generate a dataset from a pandas DataFrame.
"""
Generate a dataset from a pandas DataFrame.
@@ -411,7 +411,7 @@ def make_dataset_from_df(
     However, if we have n_classes > 0, then y is not the first column of the matrix.
 
     Args:
-        df: The pandas DataFrame to generate the dataset from.
+        data: The pandas DataFrame to generate the dataset from.
         transformations: The transformations to apply to the dataset.
         is_target_conditioned: The condition on the y column.
             IsTargetCondioned.CONCAT: y is concatenated to X, the model learns a joint distribution of (y, X)
@@ -432,142 +432,195 @@ def make_dataset_from_df(
                y is synthesized using y's empirical distribution. X is generated by the model.
                In this case, y is completely independent of X.
 
-        df_info: A dictionary with metadata about the DataFrame.
-        ratios: The ratios of the dataset to split into train, val, and test. The sum of
+        info: A dictionary with metadata about the DataFrame.
+        data_split_ratios: The ratios of the dataset to split into train, val, and test. The sum of
             the ratios must amount to 1 (with a tolerance of 0.01). Optional, default is [0.7, 0.2, 0.1].
-        std: The standard deviation of the labels. Optional, default is 0.
+        noise_scale: The scale of the noise to add to the categorical features. Optional, default is 0.
+        data_split_random_state: The random state to use for the data split. Will be passed down to the
+            train_test_split function from sklearn. Optional, default is 42.
 
     Returns:
         A tuple with the dataset, the label encoders, and the column orders.
     """
-    if ratios is None:
-        ratios = [0.7, 0.2, 0.1]
+    if data_split_ratios is None:
+        data_split_ratios = [0.7, 0.2, 0.1]
 
-    assert len(ratios) == 3, "The ratios must be a list of 3 values (train, validation, test)."
-    assert np.isclose(sum(ratios), 1, atol=0.01), "The sum of the ratios must amount to 1 (with a tolerance of 0.01)."
+    assert len(data_split_ratios) == 3, "The ratios must be a list of 3 values (train, validation, test)."
+    assert np.isclose(sum(data_split_ratios), 1, atol=0.01), (
+        "The sum of the ratios must amount to 1 (with a tolerance of 0.01)."
+    )
 
+    train_val_data, test_data = train_test_split(
+        data,
+        test_size=data_split_ratios[2],
+        random_state=data_split_random_state,
+    )
+    train_data, val_data = train_test_split(
+        train_val_data,
+        test_size=data_split_ratios[1] / (data_split_ratios[0] + data_split_ratios[1]),
+        random_state=data_split_random_state,
+    )

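Reviewer note on the arithmetic above: the second split's test_size is renormalized because the validation fraction applies only to what remains after the test split. A quick sanity check, assuming the default ratios:

```python
# With data_split_ratios = [0.7, 0.2, 0.1]: the first split holds out 10% for
# test, leaving 90%; the validation share of that remainder is 0.2 / 0.9,
# so 0.9 * (0.2 / 0.9) = 0.2 of the original data lands in validation.
ratios = [0.7, 0.2, 0.1]
val_fraction_of_remainder = ratios[1] / (ratios[0] + ratios[1])
assert abs((1 - ratios[2]) * val_fraction_of_remainder - ratios[1]) < 1e-9
```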
+    categorical_column_names, numerical_column_names = _get_categorical_and_numerical_column_names(
+        info,
+        is_target_conditioned,
+    )
 
+    if len(categorical_column_names) > 0:
+        categorical_features = {
+            DataSplit.TRAIN.value: train_data[categorical_column_names].to_numpy(dtype=np.str_),
+            DataSplit.VALIDATION.value: val_data[categorical_column_names].to_numpy(dtype=np.str_),
+            DataSplit.TEST.value: test_data[categorical_column_names].to_numpy(dtype=np.str_),
+        }
+    else:
+        categorical_features = None
 
+    if len(numerical_column_names) > 0:
+        numerical_features = {
+            DataSplit.TRAIN.value: train_data[numerical_column_names].values.astype(np.float32),
+            DataSplit.VALIDATION.value: val_data[numerical_column_names].values.astype(np.float32),
+            DataSplit.TEST.value: test_data[numerical_column_names].values.astype(np.float32),
+        }
+    else:
+        numerical_features = None
 
-    train_val_df, test_df = train_test_split(df, test_size=ratios[2], random_state=42)
-    train_df, val_df = train_test_split(train_val_df, test_size=ratios[1] / (ratios[0] + ratios[1]), random_state=42)
+    target = {
+        DataSplit.TRAIN.value: train_data[info["y_col"]].values.astype(np.float32),
+        DataSplit.VALIDATION.value: val_data[info["y_col"]].values.astype(np.float32),
+        DataSplit.TEST.value: test_data[info["y_col"]].values.astype(np.float32),
+    }
 
-    cat_column_orders = []
-    num_column_orders = []
-    index_to_column = list(df.columns)
+    index_to_column = list(data.columns)
     column_to_index = {col: i for i, col in enumerate(index_to_column)}
+    categorical_column_orders = [column_to_index[col] for col in categorical_column_names]
+    numerical_column_orders = [column_to_index[col] for col in numerical_column_names]
 
-    if df_info["n_classes"] > 0:
-        x_cat: dict[str, np.ndarray] | None = (
-            {} if df_info["cat_cols"] is not None or is_target_conditioned == IsTargetCondioned.CONCAT else None
-        )
-        x_num: dict[str, np.ndarray] | None = {} if df_info["num_cols"] is not None else None
-        y = {}
+    column_orders_indices = numerical_column_orders + categorical_column_orders
+    column_orders = [index_to_column[index] for index in column_orders_indices]
 
-        cat_cols_with_y: list[str] = []
-        if df_info["cat_cols"] is not None:
-            cat_cols_with_y += df_info["cat_cols"]
-        if is_target_conditioned == IsTargetCondioned.CONCAT:
-            cat_cols_with_y = [df_info["y_col"]] + cat_cols_with_y
+    numerical_features, label_encoders = _merge_features(categorical_features, numerical_features, noise_scale)
 
-        if len(cat_cols_with_y) > 0:
-            x_cat[DataSplit.TRAIN.value] = train_df[cat_cols_with_y].to_numpy(dtype=np.str_)  # type: ignore[index]
-            x_cat[DataSplit.VALIDATION.value] = val_df[cat_cols_with_y].to_numpy(dtype=np.str_)  # type: ignore[index]
-            x_cat[DataSplit.TEST.value] = test_df[cat_cols_with_y].to_numpy(dtype=np.str_)  # type: ignore[index]
+    assert isinstance(info["n_classes"], int)
 
-        y[DataSplit.TRAIN.value] = train_df[df_info["y_col"]].values.astype(np.float32)
-        y[DataSplit.VALIDATION.value] = val_df[df_info["y_col"]].values.astype(np.float32)
-        y[DataSplit.TEST.value] = test_df[df_info["y_col"]].values.astype(np.float32)
+    dataset = Dataset(
+        numerical_features,
+        None,
+        target,
+        y_info={},
+        task_type=TaskType(info["task_type"]),
+        n_classes=info["n_classes"],
+    )
 
-        if df_info["num_cols"] is not None:
-            x_num[DataSplit.TRAIN.value] = train_df[df_info["num_cols"]].values.astype(np.float32)  # type: ignore[index]
-            x_num[DataSplit.VALIDATION.value] = val_df[df_info["num_cols"]].values.astype(np.float32)  # type: ignore[index]
-            x_num[DataSplit.TEST.value] = test_df[df_info["num_cols"]].values.astype(np.float32)  # type: ignore[index]
+    return transform_dataset(dataset, transformations, None), label_encoders, column_orders
 
-        cat_column_orders = [column_to_index[col] for col in cat_cols_with_y]
-        num_column_orders = [column_to_index[col] for col in df_info["num_cols"]]
 
-    else:
-        x_cat = {} if df_info["cat_cols"] is not None else None
-        x_num = {} if df_info["num_cols"] is not None or is_target_conditioned == IsTargetCondioned.CONCAT else None
-        y = {}
+def _get_categorical_and_numerical_column_names(
+    info: dict[str, Any],
+    is_target_conditioned: IsTargetCondioned,
+) -> tuple[list[str], list[str]]:
+    """
+    Get the categorical and numerical column names from the info dictionary.
 
-        num_cols_with_y: list[str] = []
-        if df_info["num_cols"] is not None:
-            num_cols_with_y += df_info["num_cols"]
+    Args:
+        info: The info dictionary.
+        is_target_conditioned: The condition on the y column.
+    """
+    numerical_columns: list[str] = []
+    categorical_columns: list[str] = []
 
+    if info["n_classes"] > 0:
+        if info["cat_cols"] is not None:
+            categorical_columns += info["cat_cols"]
         if is_target_conditioned == IsTargetCondioned.CONCAT:
-            num_cols_with_y = [df_info["y_col"]] + num_cols_with_y
+            categorical_columns += [info["y_col"]]
 
-        if len(num_cols_with_y) > 0:
-            assert x_num is not None
-            x_num[DataSplit.TRAIN.value] = train_df[num_cols_with_y].values.astype(np.float32)
-            x_num[DataSplit.VALIDATION.value] = val_df[num_cols_with_y].values.astype(np.float32)
-            x_num[DataSplit.TEST.value] = test_df[num_cols_with_y].values.astype(np.float32)
+        numerical_columns = info["num_cols"]
 
-        y[DataSplit.TRAIN.value] = train_df[df_info["y_col"]].values.astype(np.float32)
-        y[DataSplit.VALIDATION.value] = val_df[df_info["y_col"]].values.astype(np.float32)
-        y[DataSplit.TEST.value] = test_df[df_info["y_col"]].values.astype(np.float32)
+    else:
+        if info["num_cols"] is not None:
+            numerical_columns += info["num_cols"]
+        if is_target_conditioned == IsTargetCondioned.CONCAT:
+            numerical_columns += [info["y_col"]]
 
-        if df_info["cat_cols"] is not None:
-            assert x_cat is not None
-            x_cat[DataSplit.TRAIN.value] = train_df[df_info["cat_cols"]].to_numpy(dtype=np.str_)
-            x_cat[DataSplit.VALIDATION.value] = val_df[df_info["cat_cols"]].to_numpy(dtype=np.str_)
-            x_cat[DataSplit.TEST.value] = test_df[df_info["cat_cols"]].to_numpy(dtype=np.str_)
+        categorical_columns = info["cat_cols"]
 
-        cat_column_orders = [column_to_index[col] for col in df_info["cat_cols"]]
-        num_column_orders = [column_to_index[col] for col in num_cols_with_y]
+    return categorical_columns, numerical_columns

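Reviewer note: to make the helper's branches concrete, here is what it would return for an illustrative info dict (not one from the toolkit), per the logic above:

```python
info = {
    "n_classes": 2,  # classification, so n_classes > 0
    "cat_cols": ["color"],
    "num_cols": ["age", "income"],
    "y_col": "label",
}
# With IsTargetCondioned.CONCAT, the label is treated as one more categorical column:
#   categorical -> ["color", "label"], numerical -> ["age", "income"]
# For a regression-style table (n_classes == 0) with CONCAT, the label instead
# joins the numerical columns:
#   categorical -> ["color"], numerical -> ["age", "income", "label"]
```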
-    column_orders_indices = num_column_orders + cat_column_orders
-    column_orders = [index_to_column[index] for index in column_orders_indices]
 
-    label_encoders = {}
-    if x_cat is not None and len(df_info["cat_cols"]) > 0:
-        x_cat_all = np.vstack(
-            (x_cat[DataSplit.TRAIN.value], x_cat[DataSplit.VALIDATION.value], x_cat[DataSplit.TEST.value])
-        )
-        x_cat_converted = []
-        for col_index in range(x_cat_all.shape[1]):
-            label_encoder = LabelEncoder()
-            x_cat_converted.append(label_encoder.fit_transform(x_cat_all[:, col_index]).astype(float))
-            if std > 0:
-                # add noise
-                x_cat_converted[-1] += np.random.normal(0, std, x_cat_converted[-1].shape)
-            label_encoders[col_index] = label_encoder
 
-        x_cat_converted = np.vstack(x_cat_converted).T  # type: ignore[assignment]
 
-        train_num = x_cat[DataSplit.TRAIN.value].shape[0]
-        val_num = x_cat[DataSplit.VALIDATION.value].shape[0]
 
-        x_cat[DataSplit.TRAIN.value] = x_cat_converted[:train_num, :]  # type: ignore[call-overload]
-        x_cat[DataSplit.VALIDATION.value] = x_cat_converted[train_num : train_num + val_num, :]  # type: ignore[call-overload]
-        x_cat[DataSplit.TEST.value] = x_cat_converted[train_num + val_num :, :]  # type: ignore[call-overload]
 
-        if x_num and len(x_num) > 0:
-            assert x_num is not None
-            x_num[DataSplit.TRAIN.value] = np.concatenate(
-                (x_num[DataSplit.TRAIN.value], x_cat[DataSplit.TRAIN.value]), axis=1
-            )
-            x_num[DataSplit.VALIDATION.value] = np.concatenate(
-                (x_num[DataSplit.VALIDATION.value], x_cat[DataSplit.VALIDATION.value]), axis=1
-            )
-            x_num[DataSplit.TEST.value] = np.concatenate(
-                (x_num[DataSplit.TEST.value], x_cat[DataSplit.TEST.value]), axis=1
-            )
-        else:
-            x_num = x_cat
-            x_cat = None
+def _merge_features(
+    categorical_features: ArrayDict | None,
+    numerical_features: ArrayDict | None,
+    noise_scale: float,
+) -> tuple[ArrayDict, dict[int, LabelEncoder]]:
+    """
+    Merge the categorical with the numerical features for train, validation, and test datasets.
 
-    n_classes = df_info["n_classes"]
-    assert isinstance(n_classes, int)
+    Args:
+        categorical_features: The categorical features.
+        numerical_features: The numerical features.
+        noise_scale: The scale of the noise to add to the categorical features.
 
-    dataset = Dataset(
-        x_num,
-        None,
-        y,
-        y_info={},
-        task_type=TaskType(df_info["task_type"]),
-        n_classes=n_classes,
+    Returns:
+        The merged features for train, validation, and test datasets and the label encoders
+        used to do so.
+    """
+    if categorical_features is None:
+        # if no categorical features, just return the numerical features
+        assert numerical_features is not None
+        return numerical_features, {}
 
+    # Otherwise, encode the categorical features
+    all_categorical_data = np.vstack(
+        (
+            categorical_features[DataSplit.TRAIN.value],
+            categorical_features[DataSplit.VALIDATION.value],
+            categorical_features[DataSplit.TEST.value],
+        )
     )
 
-    return transform_dataset(dataset, transformations, None), label_encoders, column_orders
+    categorical_data_converted = []
+    label_encoders = {}
+    for column in range(all_categorical_data.shape[1]):
+        label_encoder = LabelEncoder()
+        encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
+        categorical_data_converted.append(encoded_labels)
+        if noise_scale > 0:
+            # add noise
+            categorical_data_converted[-1] += np.random.normal(0, noise_scale, categorical_data_converted[-1].shape)
+        label_encoders[column] = label_encoder
 
+    categorical_data_transposed = np.vstack(categorical_data_converted).T
 
+    num_train_samples = categorical_features[DataSplit.TRAIN.value].shape[0]
+    num_validation_samples = categorical_features[DataSplit.VALIDATION.value].shape[0]
 
+    categorical_features[DataSplit.TRAIN.value] = categorical_data_transposed[:num_train_samples, :]
+    categorical_features[DataSplit.VALIDATION.value] = categorical_data_transposed[
+        num_train_samples : num_train_samples + num_validation_samples, :
+    ]
+    categorical_features[DataSplit.TEST.value] = categorical_data_transposed[
+        num_train_samples + num_validation_samples :, :
+    ]
 
+    if numerical_features is None:
+        # if no numerical features then no need to merge, just return the categorical features
+        return categorical_features, label_encoders
 
+    # Otherwise, merge the categorical and numerical features
+    merged_features = {
+        DataSplit.TRAIN.value: np.concatenate(
+            (numerical_features[DataSplit.TRAIN.value], categorical_features[DataSplit.TRAIN.value]), axis=1
+        ),
+        DataSplit.VALIDATION.value: np.concatenate(
+            (numerical_features[DataSplit.VALIDATION.value], categorical_features[DataSplit.VALIDATION.value]),
+            axis=1,
+        ),
+        DataSplit.TEST.value: np.concatenate(
+            (numerical_features[DataSplit.TEST.value], categorical_features[DataSplit.TEST.value]), axis=1
+        ),
+    }
 
+    return merged_features, label_encoders

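Reviewer note: a self-contained sketch of the encode-then-merge behavior _merge_features implements (toy arrays; only the LabelEncoder round-trip and the split-back-by-row-counts step matter here):

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

# One toy categorical column, stacked across train/val/test as _merge_features does.
all_categorical = np.array([["red"], ["blue"], ["red"], ["green"]], dtype=np.str_)

encoder = LabelEncoder()
encoded = encoder.fit_transform(all_categorical[:, 0]).astype(float)
# LabelEncoder assigns integer codes in sorted label order: blue=0, green=1, red=2.
assert encoded.tolist() == [2.0, 0.0, 2.0, 1.0]

# With noise_scale > 0, Gaussian noise is added to the codes:
noise_scale = 0.01
encoded_noisy = encoded + np.random.normal(0, noise_scale, encoded.shape)

# The codes are then sliced back by the original train/val/test row counts and
# concatenated column-wise with the numerical features.
```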

 def transform_dataset(
12 changes: 6 additions & 6 deletions src/midst_toolkit/models/clavaddpm/train.py
@@ -304,9 +304,9 @@ def train_model(
         data_frame,
         transformations,
         is_target_conditioned=model_params.is_target_conditioned,
-        ratios=data_split_ratios,
-        df_info=data_frame_info,
-        std=0,
+        data_split_ratios=data_split_ratios,
+        info=data_frame_info,
+        noise_scale=0,
     )
 
     category_sizes = np.array(dataset.get_category_sizes(DataSplit.TRAIN))
@@ -419,9 +419,9 @@ def train_classifier(
         data_frame,
         transformations,
         is_target_conditioned=model_params.is_target_conditioned,
-        ratios=data_split_ratios,
-        df_info=data_frame_info,
-        std=0,
+        data_split_ratios=data_split_ratios,
+        info=data_frame_info,
+        noise_scale=0,
     )
     print(dataset.n_features)
     train_loader = prepare_fast_dataloader(
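For downstream callers, the renamed keyword arguments end up looking like this (a hedged sketch mirroring the updated train.py call sites above; data_frame, transformations, model_params, and data_frame_info stand in for whatever the caller already has):

```python
# Hypothetical call site using the new parameter names.
dataset, label_encoders, column_orders = make_dataset_from_df(
    data_frame,
    transformations,
    is_target_conditioned=model_params.is_target_conditioned,
    data_split_ratios=[0.7, 0.2, 0.1],
    info=data_frame_info,
    noise_scale=0,
    data_split_random_state=42,  # now configurable instead of hardcoded in the split
)
```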