Merged
Changes from 62 commits
Commits
65 commits
c059d93
Merge "Refactor Transformations handling: replace get_T_dict with a d…
lotif Sep 22, 2025
9f0290a
Refactor clustering method handling: Introduce ClusteringMethod enum …
lotif Sep 22, 2025
80073f2
Refactor model handling: Introduce ModelType enum for improved type s…
lotif Sep 22, 2025
1d43783
Merge "Refactor model parameters: Introduce ModelParameters and RTDLP…
lotif Sep 22, 2025
f7d69ec
Merge "Refactor y condition handling: Replace string literals with Is…
lotif Sep 22, 2025
01ed9c1
Refactor Gaussian loss handling: Introduce GaussianLossType enum to r…
lotif Sep 22, 2025
0dbc6b4
Refactor scheduler handling: Introduce Scheduler enum to replace stri…
lotif Sep 22, 2025
128da65
Merge "Refactor sampler initialization: Update UniformSampler and Los…
lotif Sep 22, 2025
f158b44
Enhance metric and loss handling: Refactor loss computation in _numer…
lotif Sep 23, 2025
6503788
Transforming a lot of literals into enums
lotif Sep 29, 2025
d599334
WIP renaming RTDL, cat and num and data splits
lotif Sep 29, 2025
1950f5c
Using more data splits and adding types for gaussian parametrization
lotif Sep 30, 2025
3e8237c
Adding enum for YType
lotif Sep 30, 2025
c957dec
Merge branch 'main' into marcelo/classes-and-enums-2
lotif Sep 30, 2025
4d0707b
Renaming Scheduler to SchedulerType and moving it and GaussianLossTyp…
lotif Sep 30, 2025
b7db96e
Merge remote-tracking branch 'origin/marcelo/classes-and-enums-2' int…
lotif Sep 30, 2025
30d0a0d
WIP CR by David
lotif Sep 30, 2025
779b108
Merge branch 'main' into marcelo/classes-and-enums-2
lotif Oct 1, 2025
0e3a42a
Cont'd CR comments by David
lotif Oct 1, 2025
8600ccb
Merge remote-tracking branch 'origin/marcelo/classes-and-enums-2' int…
lotif Oct 1, 2025
bc67266
Adding TODO
lotif Oct 1, 2025
774e99b
WIp starting the breakdown
lotif Oct 1, 2025
1168e93
Renames
lotif Oct 1, 2025
bf05c3c
Last breakdown
lotif Oct 1, 2025
d90ed2c
Removing ignore
lotif Oct 1, 2025
96414be
Finished refactoring
lotif Oct 1, 2025
aafd66c
Merge branch 'main' into marcelo/remove-ignores
lotif Oct 1, 2025
594d9cd
Renamings, mostly
lotif Oct 2, 2025
3a2b203
More enums
lotif Oct 2, 2025
972947d
Adding datasplits class
lotif Oct 2, 2025
77a2249
Splitting into another function
lotif Oct 2, 2025
67368ab
Adding docstrings, removing save
lotif Oct 2, 2025
293f4d9
Renaming function
lotif Oct 2, 2025
dbe71a4
Merge branch 'marcelo/refactoring-pair-clustering' into marcelo/renam…
lotif Oct 2, 2025
3a29c46
Merge branch 'marcelo/refactoring-pair-clustering' into marcelo/remov…
lotif Oct 2, 2025
f8c9adf
One more refactor
lotif Oct 2, 2025
1c055e8
rolling back table_domain renamings
lotif Oct 2, 2025
390ad5b
Merge branch 'marcelo/renamings' into marcelo/refactor-process-pipeli…
lotif Oct 2, 2025
84fe972
Splitting the make_dataset_from_df function
lotif Oct 2, 2025
56a09ec
Fixing broken code from revert
lotif Oct 2, 2025
f35b596
CR by David
lotif Oct 2, 2025
0a9994a
CR by David
lotif Oct 3, 2025
c7ed903
Merge branch 'marcelo/refactoring-pair-clustering' into marcelo/renam…
lotif Oct 3, 2025
ba7ab5a
Merge branch 'marcelo/renamings' into marcelo/refactor-process-pipeli…
lotif Oct 3, 2025
771bde8
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 3, 2025
1f4fed2
CR by David
lotif Oct 3, 2025
57bd33c
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 3, 2025
e9ffe39
CR by David
lotif Oct 6, 2025
224b265
CR by David and Fatemeh
lotif Oct 6, 2025
10e9989
Merge branch 'main' into marcelo/refactoring-pair-clustering
lotif Oct 6, 2025
94f014b
CR by David and Fatemeh
lotif Oct 6, 2025
f479990
Merge remote-tracking branch 'origin/marcelo/refactoring-pair-cluster…
lotif Oct 6, 2025
7d120ac
Merge branch 'marcelo/refactoring-pair-clustering' into marcelo/renam…
lotif Oct 6, 2025
626a39a
Merge branch 'marcelo/renamings' into marcelo/refactor-process-pipeli…
lotif Oct 6, 2025
93dfd31
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 6, 2025
8eb21ae
Last CR comment by David
lotif Oct 6, 2025
53c1320
Fixing merge conflicts
lotif Oct 6, 2025
e75c5ca
Merge branch 'marcelo/renamings' into marcelo/refactor-process-pipeli…
lotif Oct 6, 2025
3b472e3
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 6, 2025
ea8e2db
Merge branch 'main' into marcelo/refactor-process-pipeline-data
lotif Oct 6, 2025
def3b21
Merge branch 'marcelo/refactor-process-pipeline-data' into marcelo/re…
lotif Oct 6, 2025
747d9c4
Merge branch 'main' into marcelo/refactor-make-dataset-from-df
lotif Oct 6, 2025
edc60bb
CR by Fatemeh
lotif Oct 10, 2025
f80068f
Merge branch 'main' into marcelo/refactor-make-dataset-from-df
emersodb Oct 14, 2025
d75f587
Merge branch 'main' into marcelo/refactor-make-dataset-from-df
lotif Oct 16, 2025
2 changes: 1 addition & 1 deletion src/midst_toolkit/attacks/ensemble/process_split_data.py
@@ -150,7 +150,7 @@ def process_split_data(
     processed_attack_data_path: Path,
     column_to_stratify: str,
     num_total_samples: int = 40000,
-    random_seed: int = 42,
+    random_seed: int = 42,  # TODO: do we really need to hardcode the random state?
 ) -> None:
     """
     Splits the data into train, validation, and test sets according to the attack design.
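Note for reviewers, to the TODO's point: the seed could simply be threaded through from the caller rather than hardcoded. A minimal sketch of a stratified three-way split in the same spirit as process_split_data (the helper name and the 70/10/20 ratios are illustrative assumptions, not the toolkit's API):

```python
import pandas as pd
from sklearn.model_selection import train_test_split


def three_way_split(data: pd.DataFrame, column_to_stratify: str, random_seed: int = 42):
    # First hold out 20% as the test set, stratified on the given column.
    train_val, test = train_test_split(
        data, test_size=0.2, stratify=data[column_to_stratify], random_state=random_seed
    )
    # Then take 25% of the remaining 80% as validation (i.e., 20% of the original data).
    train, val = train_test_split(
        train_val, test_size=0.25, stratify=train_val[column_to_stratify], random_state=random_seed
    )
    return train, val, test
```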
287 changes: 170 additions & 117 deletions src/midst_toolkit/models/clavaddpm/dataset.py
@@ -251,18 +251,18 @@ def calculate_metrics(
 
 
 # TODO consider moving all the functions below into the Dataset class
-def get_category_sizes(x: torch.Tensor | np.ndarray) -> list[int]:
+def get_category_sizes(features: torch.Tensor | np.ndarray) -> list[int]:
     """
     Get the size of the categories in the data by counting the number of
     unique values in each column.
 
     Args:
-        x: The data to get the size of the categories of.
+        features: The data from which to extract category sizes.
 
     Returns:
         A list with the category sizes in the data.
     """
-    x_t = x.T.cpu().tolist() if isinstance(x, torch.Tensor) else x.T.tolist()
+    x_t = features.T.cpu().tolist() if isinstance(features, torch.Tensor) else features.T.tolist()
     return [len(set(xt)) for xt in x_t]


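For intuition, a quick self-contained sketch of what get_category_sizes computes (the toy values are illustrative):

```python
import numpy as np

# Two categorical columns: the first takes 3 distinct values, the second takes 2.
features = np.array(
    [
        ["a", "x"],
        ["b", "x"],
        ["c", "y"],
        ["a", "y"],
    ]
)
# Transpose so each row is a column, then count unique values per column,
# mirroring the body of get_category_sizes.
sizes = [len(set(column)) for column in features.T.tolist()]
assert sizes == [3, 2]
```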
@@ -392,13 +392,13 @@ def _get_predicted_labels_and_probs(


 def make_dataset_from_df(
-    # ruff: noqa: PLR0915, PLR0912
-    df: pd.DataFrame,
+    data: pd.DataFrame,
     transformations: Transformations,
     is_target_conditioned: IsTargetCondioned,
-    df_info: dict[str, Any],
-    ratios: list[float] | None = None,
-    std: float = 0,
+    info: dict[str, Any],
+    data_split_ratios: list[float] | None = None,
+    noise_scale: float = 0,
+    data_split_random_state: int = 42,
 ) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]:
     """
     Generate a dataset from a pandas DataFrame.
"""
Generate a dataset from a pandas DataFrame.
@@ -411,7 +411,7 @@ def make_dataset_from_df(
     However, if we have n_classes > 0, then y is not the first column of the matrix.
 
     Args:
-        df: The pandas DataFrame to generate the dataset from.
+        data: The pandas DataFrame to generate the dataset from.
         transformations: The transformations to apply to the dataset.
         is_target_conditioned: The condition on the y column.
             IsTargetCondioned.CONCAT: y is concatenated to X, the model learns a joint distribution of (y, X)
@@ -432,142 +432,195 @@ def make_dataset_from_df(
                y is synthesized using y's empirical distribution. X is generated by the model.
                In this case, y is completely independent of X.
 
-        df_info: A dictionary with metadata about the DataFrame.
-        ratios: The ratios of the dataset to split into train, val, and test. The sum of
+        info: A dictionary with metadata about the DataFrame.
+        data_split_ratios: The ratios of the dataset to split into train, val, and test. The sum of
             the ratios must amount to 1 (with a tolerance of 0.01). Optional, default is [0.7, 0.2, 0.1].
-        std: The standard deviation of the labels. Optional, default is 0.
+        noise_scale: The scale of the noise to add to the categorical features. Optional, default is 0.
+        data_split_random_state: The random state to use for the data split. Will be passed down to the
+            train_test_split function from sklearn. Optional, default is 42.
 
     Returns:
         A tuple with the dataset, the label encoders, and the column orders.
     """
-    if ratios is None:
-        ratios = [0.7, 0.2, 0.1]
+    if data_split_ratios is None:
+        data_split_ratios = [0.7, 0.2, 0.1]
 
-    assert len(ratios) == 3, "The ratios must be a list of 3 values (train, validation, test)."
-    assert np.isclose(sum(ratios), 1, atol=0.01), "The sum of the ratios must amount to 1 (with a tolerance of 0.01)."
+    assert len(data_split_ratios) == 3, "The ratios must be a list of 3 values (train, validation, test)."
+    assert np.isclose(sum(data_split_ratios), 1, atol=0.01), (
+        "The sum of the ratios must amount to 1 (with a tolerance of 0.01)."
+    )
 
+    train_val_data, test_data = train_test_split(
+        data,
+        test_size=data_split_ratios[2],
+        random_state=data_split_random_state,
+    )
+    train_data, val_data = train_test_split(
+        train_val_data,
+        test_size=data_split_ratios[1] / (data_split_ratios[0] + data_split_ratios[1]),
+        random_state=data_split_random_state,
+    )

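Reviewer note on the arithmetic above: the second split's test_size is renormalized because the validation fraction applies only to what remains after the test split. A quick sanity check, assuming the default ratios:

```python
# With data_split_ratios = [0.7, 0.2, 0.1]: the first split holds out 10% for
# test, leaving 90%; the validation share of that remainder is 0.2 / 0.9,
# so 0.9 * (0.2 / 0.9) = 0.2 of the original data lands in validation.
ratios = [0.7, 0.2, 0.1]
val_fraction_of_remainder = ratios[1] / (ratios[0] + ratios[1])
assert abs((1 - ratios[2]) * val_fraction_of_remainder - ratios[1]) < 1e-9
```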
+    categorical_column_names, numerical_column_names = _get_categorical_and_numerical_column_names(
+        info,
+        is_target_conditioned,
+    )
 
+    if len(categorical_column_names) > 0:
+        categorical_features = {
+            DataSplit.TRAIN.value: train_data[categorical_column_names].to_numpy(dtype=np.str_),
+            DataSplit.VALIDATION.value: val_data[categorical_column_names].to_numpy(dtype=np.str_),
+            DataSplit.TEST.value: test_data[categorical_column_names].to_numpy(dtype=np.str_),
+        }
+    else:
+        categorical_features = None
 
+    if len(numerical_column_names) > 0:
+        numerical_features = {
+            DataSplit.TRAIN.value: train_data[numerical_column_names].values.astype(np.float32),
+            DataSplit.VALIDATION.value: val_data[numerical_column_names].values.astype(np.float32),
+            DataSplit.TEST.value: test_data[numerical_column_names].values.astype(np.float32),
+        }
+    else:
+        numerical_features = None
 
-    train_val_df, test_df = train_test_split(df, test_size=ratios[2], random_state=42)
-    train_df, val_df = train_test_split(train_val_df, test_size=ratios[1] / (ratios[0] + ratios[1]), random_state=42)
+    target = {
+        DataSplit.TRAIN.value: train_data[info["y_col"]].values.astype(np.float32),
+        DataSplit.VALIDATION.value: val_data[info["y_col"]].values.astype(np.float32),
+        DataSplit.TEST.value: test_data[info["y_col"]].values.astype(np.float32),
+    }
 
-    cat_column_orders = []
-    num_column_orders = []
-    index_to_column = list(df.columns)
+    index_to_column = list(data.columns)
     column_to_index = {col: i for i, col in enumerate(index_to_column)}
+    categorical_column_orders = [column_to_index[col] for col in categorical_column_names]
+    numerical_column_orders = [column_to_index[col] for col in numerical_column_names]
 
-    if df_info["n_classes"] > 0:
-        x_cat: dict[str, np.ndarray] | None = (
-            {} if df_info["cat_cols"] is not None or is_target_conditioned == IsTargetCondioned.CONCAT else None
-        )
-        x_num: dict[str, np.ndarray] | None = {} if df_info["num_cols"] is not None else None
-        y = {}
+    column_orders_indices = numerical_column_orders + categorical_column_orders
+    column_orders = [index_to_column[index] for index in column_orders_indices]
 
-        cat_cols_with_y: list[str] = []
-        if df_info["cat_cols"] is not None:
-            cat_cols_with_y += df_info["cat_cols"]
-        if is_target_conditioned == IsTargetCondioned.CONCAT:
-            cat_cols_with_y = [df_info["y_col"]] + cat_cols_with_y
+    numerical_features, label_encoders = _merge_features(categorical_features, numerical_features, noise_scale)
 
-        if len(cat_cols_with_y) > 0:
-            x_cat[DataSplit.TRAIN.value] = train_df[cat_cols_with_y].to_numpy(dtype=np.str_)  # type: ignore[index]
-            x_cat[DataSplit.VALIDATION.value] = val_df[cat_cols_with_y].to_numpy(dtype=np.str_)  # type: ignore[index]
-            x_cat[DataSplit.TEST.value] = test_df[cat_cols_with_y].to_numpy(dtype=np.str_)  # type: ignore[index]
+    assert isinstance(info["n_classes"], int)
 
-        y[DataSplit.TRAIN.value] = train_df[df_info["y_col"]].values.astype(np.float32)
-        y[DataSplit.VALIDATION.value] = val_df[df_info["y_col"]].values.astype(np.float32)
-        y[DataSplit.TEST.value] = test_df[df_info["y_col"]].values.astype(np.float32)
+    dataset = Dataset(
+        numerical_features,
+        None,
+        target,
+        y_info={},
+        task_type=TaskType(info["task_type"]),
+        n_classes=info["n_classes"],
+    )
 
-        if df_info["num_cols"] is not None:
-            x_num[DataSplit.TRAIN.value] = train_df[df_info["num_cols"]].values.astype(np.float32)  # type: ignore[index]
-            x_num[DataSplit.VALIDATION.value] = val_df[df_info["num_cols"]].values.astype(np.float32)  # type: ignore[index]
-            x_num[DataSplit.TEST.value] = test_df[df_info["num_cols"]].values.astype(np.float32)  # type: ignore[index]
+    return transform_dataset(dataset, transformations, None), label_encoders, column_orders
 
-        cat_column_orders = [column_to_index[col] for col in cat_cols_with_y]
-        num_column_orders = [column_to_index[col] for col in df_info["num_cols"]]
 
-    else:
-        x_cat = {} if df_info["cat_cols"] is not None else None
-        x_num = {} if df_info["num_cols"] is not None or is_target_conditioned == IsTargetCondioned.CONCAT else None
-        y = {}
+def _get_categorical_and_numerical_column_names(
+    info: dict[str, Any],
+    is_target_conditioned: IsTargetCondioned,
+) -> tuple[list[str], list[str]]:
+    """
+    Get the categorical and numerical column names from the info dictionary.
 
-        num_cols_with_y: list[str] = []
-        if df_info["num_cols"] is not None:
-            num_cols_with_y += df_info["num_cols"]
+    Args:
+        info: The info dictionary.
+        is_target_conditioned: The condition on the y column.
+    """
+    numerical_columns: list[str] = []
+    categorical_columns: list[str] = []
 
+    if info["n_classes"] > 0:
+        if info["cat_cols"] is not None:
+            categorical_columns += info["cat_cols"]
         if is_target_conditioned == IsTargetCondioned.CONCAT:
-            num_cols_with_y = [df_info["y_col"]] + num_cols_with_y
+            categorical_columns += [info["y_col"]]
 
-        if len(num_cols_with_y) > 0:
-            assert x_num is not None
-            x_num[DataSplit.TRAIN.value] = train_df[num_cols_with_y].values.astype(np.float32)
-            x_num[DataSplit.VALIDATION.value] = val_df[num_cols_with_y].values.astype(np.float32)
-            x_num[DataSplit.TEST.value] = test_df[num_cols_with_y].values.astype(np.float32)
+        numerical_columns = info["num_cols"]
 
-        y[DataSplit.TRAIN.value] = train_df[df_info["y_col"]].values.astype(np.float32)
-        y[DataSplit.VALIDATION.value] = val_df[df_info["y_col"]].values.astype(np.float32)
-        y[DataSplit.TEST.value] = test_df[df_info["y_col"]].values.astype(np.float32)
+    else:
+        if info["num_cols"] is not None:
+            numerical_columns += info["num_cols"]
+        if is_target_conditioned == IsTargetCondioned.CONCAT:
+            numerical_columns += [info["y_col"]]
 
-        if df_info["cat_cols"] is not None:
-            assert x_cat is not None
-            x_cat[DataSplit.TRAIN.value] = train_df[df_info["cat_cols"]].to_numpy(dtype=np.str_)
-            x_cat[DataSplit.VALIDATION.value] = val_df[df_info["cat_cols"]].to_numpy(dtype=np.str_)
-            x_cat[DataSplit.TEST.value] = test_df[df_info["cat_cols"]].to_numpy(dtype=np.str_)
+        categorical_columns = info["cat_cols"]
 
-        cat_column_orders = [column_to_index[col] for col in df_info["cat_cols"]]
-        num_column_orders = [column_to_index[col] for col in num_cols_with_y]
+    return categorical_columns, numerical_columns

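Reviewer note: to make the helper's branches concrete, here is what it would return for an illustrative info dict (not one from the toolkit), per the logic above:

```python
info = {
    "n_classes": 2,  # classification, so n_classes > 0
    "cat_cols": ["color"],
    "num_cols": ["age", "income"],
    "y_col": "label",
}
# With IsTargetCondioned.CONCAT, the label is treated as one more categorical column:
#   categorical -> ["color", "label"], numerical -> ["age", "income"]
# For a regression-style table (n_classes == 0) with CONCAT, the label instead
# joins the numerical columns:
#   categorical -> ["color"], numerical -> ["age", "income", "label"]
```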
-    column_orders_indices = num_column_orders + cat_column_orders
-    column_orders = [index_to_column[index] for index in column_orders_indices]
 
-    label_encoders = {}
-    if x_cat is not None and len(df_info["cat_cols"]) > 0:
-        x_cat_all = np.vstack(
-            (x_cat[DataSplit.TRAIN.value], x_cat[DataSplit.VALIDATION.value], x_cat[DataSplit.TEST.value])
-        )
-        x_cat_converted = []
-        for col_index in range(x_cat_all.shape[1]):
-            label_encoder = LabelEncoder()
-            x_cat_converted.append(label_encoder.fit_transform(x_cat_all[:, col_index]).astype(float))
-            if std > 0:
-                # add noise
-                x_cat_converted[-1] += np.random.normal(0, std, x_cat_converted[-1].shape)
-            label_encoders[col_index] = label_encoder
 
-        x_cat_converted = np.vstack(x_cat_converted).T  # type: ignore[assignment]
 
-        train_num = x_cat[DataSplit.TRAIN.value].shape[0]
-        val_num = x_cat[DataSplit.VALIDATION.value].shape[0]
 
-        x_cat[DataSplit.TRAIN.value] = x_cat_converted[:train_num, :]  # type: ignore[call-overload]
-        x_cat[DataSplit.VALIDATION.value] = x_cat_converted[train_num : train_num + val_num, :]  # type: ignore[call-overload]
-        x_cat[DataSplit.TEST.value] = x_cat_converted[train_num + val_num :, :]  # type: ignore[call-overload]
 
-        if x_num and len(x_num) > 0:
-            assert x_num is not None
-            x_num[DataSplit.TRAIN.value] = np.concatenate(
-                (x_num[DataSplit.TRAIN.value], x_cat[DataSplit.TRAIN.value]), axis=1
-            )
-            x_num[DataSplit.VALIDATION.value] = np.concatenate(
-                (x_num[DataSplit.VALIDATION.value], x_cat[DataSplit.VALIDATION.value]), axis=1
-            )
-            x_num[DataSplit.TEST.value] = np.concatenate(
-                (x_num[DataSplit.TEST.value], x_cat[DataSplit.TEST.value]), axis=1
-            )
-        else:
-            x_num = x_cat
-            x_cat = None
+def _merge_features(
+    categorical_features: ArrayDict | None,
+    numerical_features: ArrayDict | None,
+    noise_scale: float,
+) -> tuple[ArrayDict, dict[int, LabelEncoder]]:
+    """
+    Merge the categorical with the numerical features for train, validation, and test datasets.
 
-    n_classes = df_info["n_classes"]
-    assert isinstance(n_classes, int)
+    Args:
+        categorical_features: The categorical features.
+        numerical_features: The numerical features.
+        noise_scale: The scale of the noise to add to the categorical features.
 
-    dataset = Dataset(
-        x_num,
-        None,
-        y,
-        y_info={},
-        task_type=TaskType(df_info["task_type"]),
-        n_classes=n_classes,
+    Returns:
+        The merged features for train, validation, and test datasets and the label encoders
+        used to do so.
+    """
+    if categorical_features is None:
+        # if no categorical features, just return the numerical features
+        assert numerical_features is not None
+        return numerical_features, {}
 
+    # Otherwise, encode the categorical features
+    all_categorical_data = np.vstack(
+        (
+            categorical_features[DataSplit.TRAIN.value],
+            categorical_features[DataSplit.VALIDATION.value],
+            categorical_features[DataSplit.TEST.value],
+        )
     )
 
-    return transform_dataset(dataset, transformations, None), label_encoders, column_orders
+    categorical_data_converted = []
+    label_encoders = {}
+    for column in range(all_categorical_data.shape[1]):
+        label_encoder = LabelEncoder()
+        encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
+        categorical_data_converted.append(encoded_labels)
+        if noise_scale > 0:
+            # add noise
+            categorical_data_converted[-1] += np.random.normal(0, noise_scale, categorical_data_converted[-1].shape)
+        label_encoders[column] = label_encoder
 
+    categorical_data_transposed = np.vstack(categorical_data_converted).T
 
+    num_train_samples = categorical_features[DataSplit.TRAIN.value].shape[0]
+    num_validation_samples = categorical_features[DataSplit.VALIDATION.value].shape[0]
 
+    categorical_features[DataSplit.TRAIN.value] = categorical_data_transposed[:num_train_samples, :]
+    categorical_features[DataSplit.VALIDATION.value] = categorical_data_transposed[
+        num_train_samples : num_train_samples + num_validation_samples, :
+    ]
+    categorical_features[DataSplit.TEST.value] = categorical_data_transposed[
+        num_train_samples + num_validation_samples :, :
+    ]
 
+    if numerical_features is None:
+        # if no numerical features then no need to merge, just return the categorical features
+        return categorical_features, label_encoders
 
+    # Otherwise, merge the categorical and numerical features
+    merged_features = {
+        DataSplit.TRAIN.value: np.concatenate(
+            (numerical_features[DataSplit.TRAIN.value], categorical_features[DataSplit.TRAIN.value]), axis=1
+        ),
+        DataSplit.VALIDATION.value: np.concatenate(
+            (numerical_features[DataSplit.VALIDATION.value], categorical_features[DataSplit.VALIDATION.value]),
+            axis=1,
+        ),
+        DataSplit.TEST.value: np.concatenate(
+            (numerical_features[DataSplit.TEST.value], categorical_features[DataSplit.TEST.value]), axis=1
+        ),
+    }
 
+    return merged_features, label_encoders

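Reviewer note: a self-contained sketch of the encode-then-merge behavior _merge_features implements (toy arrays; only the LabelEncoder round-trip and the split-back-by-row-counts step matter here):

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

# One toy categorical column, stacked across train/val/test as _merge_features does.
all_categorical = np.array([["red"], ["blue"], ["red"], ["green"]], dtype=np.str_)

encoder = LabelEncoder()
encoded = encoder.fit_transform(all_categorical[:, 0]).astype(float)
# LabelEncoder assigns integer codes in sorted label order: blue=0, green=1, red=2.
assert encoded.tolist() == [2.0, 0.0, 2.0, 1.0]

# With noise_scale > 0, Gaussian noise is added to the codes:
noise_scale = 0.01
encoded_noisy = encoded + np.random.normal(0, noise_scale, encoded.shape)

# The codes are then sliced back by the original train/val/test row counts and
# concatenated column-wise with the numerical features.
```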

 def transform_dataset(
12 changes: 6 additions & 6 deletions src/midst_toolkit/models/clavaddpm/train.py
@@ -304,9 +304,9 @@ def train_model(
         data_frame,
         transformations,
         is_target_conditioned=model_params.is_target_conditioned,
-        ratios=data_split_ratios,
-        df_info=data_frame_info,
-        std=0,
+        data_split_ratios=data_split_ratios,
+        info=data_frame_info,
+        noise_scale=0,
     )
 
     category_sizes = np.array(dataset.get_category_sizes(DataSplit.TRAIN))
@@ -419,9 +419,9 @@ def train_classifier(
         data_frame,
         transformations,
         is_target_conditioned=model_params.is_target_conditioned,
-        ratios=data_split_ratios,
-        df_info=data_frame_info,
-        std=0,
+        data_split_ratios=data_split_ratios,
+        info=data_frame_info,
+        noise_scale=0,
     )
     print(dataset.n_features)
     train_loader = prepare_fast_dataloader(
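For downstream callers, the renamed keyword arguments end up looking like this (a hedged sketch mirroring the updated train.py call sites above; data_frame, transformations, model_params, and data_frame_info stand in for whatever the caller already has):

```python
# Hypothetical call site using the new parameter names.
dataset, label_encoders, column_orders = make_dataset_from_df(
    data_frame,
    transformations,
    is_target_conditioned=model_params.is_target_conditioned,
    data_split_ratios=[0.7, 0.2, 0.1],
    info=data_frame_info,
    noise_scale=0,
    data_split_random_state=42,  # now configurable instead of hardcoded in the split
)
```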