From 59eb5dd9afc5810397476fc570503e8a7f6c97f1 Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Mon, 22 Nov 2021 15:22:04 -0500
Subject: [PATCH 1/7] Introduce different feature scalers for different types
 of models

---
 vbfml/config/convolutional_model.yml |  8 +++++--
 vbfml/config/dense_model.yml         |  4 ++++
 vbfml/input/sequences.py             | 31 ++++++++++++++++++++++------
 vbfml/scripts/train.py               | 11 ++++++----
 vbfml/training/input.py              |  5 ++++-
 5 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/vbfml/config/convolutional_model.yml b/vbfml/config/convolutional_model.yml
index b26c461..b652cc7 100644
--- a/vbfml/config/convolutional_model.yml
+++ b/vbfml/config/convolutional_model.yml
@@ -1,11 +1,15 @@
 architecture: conv
 training_parameters:
   batch_size: 10
-  batch_buffer_size: 100
+  batch_buffer_size: 1000
+  scale_features: norm
+  shuffle: true
   train_size: 0.67
 validation_parameters:
-  batch_size: 100
+  batch_size: 1000
   batch_buffer_size: 10
+  scale_features: norm
+  shuffle: true
 weight_expression: Normalization
 features:
   - JetImage_pixels
diff --git a/vbfml/config/dense_model.yml b/vbfml/config/dense_model.yml
index e0f2158..eaae3f0 100644
--- a/vbfml/config/dense_model.yml
+++ b/vbfml/config/dense_model.yml
@@ -3,9 +3,13 @@ training_parameters:
   batch_size: 20
   batch_buffer_size: 1000000
   train_size: 0.5
+  scale_features: standard
+  shuffle: true
 validation_parameters:
   batch_size: 1000000
   batch_buffer_size: 10
+  scale_features: standard
+  shuffle: true
 weight_expression: weight_total*xs/sumw
 features:
   - mjj
diff --git a/vbfml/input/sequences.py b/vbfml/input/sequences.py
index 25700e5..ead4730 100644
--- a/vbfml/input/sequences.py
+++ b/vbfml/input/sequences.py
@@ -27,6 +27,24 @@ def row_vector(branch):
     return np.array(branch).reshape((len(branch), 1))
 
 
+class Normalizer:
+    def __init__(self) -> None:
+        pass
+
+    def fit(self, features: np.ndarray):
+        return self
+
+    def transform(self, features: np.ndarray, ceiling: float = 255.0):
+        """Simple normalizer for image data. Divides all values by
+        `ceiling` (255 by default) to bring them into the [0,1] range.
+
+        Args:
+            features (np.ndarray): Image pixels.
+            ceiling (float, optional): The highest pixel value. Defaults to 255.
+        """
+        return features / ceiling
+
+
 class MultiDatasetSequence(Sequence):
     def __init__(
         self,
@@ -36,7 +54,7 @@ def __init__(
         batch_buffer_size=1,
         read_range=(0.0, 1.0),
         weight_expression=None,
-        scale_features=False,
+        scale_features="none",
     ) -> None:
         self.datasets = {}
         self.readers = {}
@@ -76,11 +94,11 @@ def batch_size(self, batch_size: int) -> None:
         self._batch_size = batch_size
 
     @property
-    def scale_features(self) -> bool:
+    def scale_features(self) -> str:
         return self._scale_features
 
     @scale_features.setter
-    def scale_features(self, scale_features: bool) -> None:
+    def scale_features(self, scale_features: str) -> None:
         if scale_features != self.scale_features:
             self.buffer.clear()
             self._scale_features = scale_features
@@ -118,7 +136,8 @@ def _init_feature_scaler_from_features(self, features: np.ndarray) -> None:
         """
         Initialize the feature scaler object based on a feature tensor.
         """
-        self._feature_scaler = StandardScaler().fit(features)
+        scalers = {"standard": StandardScaler, "norm": Normalizer}
+        self._feature_scaler = scalers[self._scale_features]().fit(features)
 
     def _init_feature_scaler_from_multibatch(self, df: "pd.DataFrame") -> None:
         """
@@ -237,7 +256,7 @@ def _fill_batch_buffer(self, batch_start: int, batch_stop: int) -> None:
         Read batches from file and save them into the buffer for future use.
         """
         multibatch_df = self._read_multibatch(batch_start, batch_stop)
-        if self.scale_features and not self._feature_scaler:
+        if self.scale_features != "none" and not self._feature_scaler:
             self._init_feature_scaler_from_multibatch(multibatch_df)
         if self.shuffle:
             multibatch_df = multibatch_df.sample(frac=1, ignore_index=True)
@@ -248,7 +267,7 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]":
         features = df.drop(columns=self._non_feature_columns()).to_numpy()
         features = features.astype(self._float_dtype)
 
-        if self.scale_features:
+        if self.scale_features != "none":
             features = self.apply_feature_scaling(features)
 
         labels = to_categorical(
diff --git a/vbfml/scripts/train.py b/vbfml/scripts/train.py
index 5234941..90428a2 100755
--- a/vbfml/scripts/train.py
+++ b/vbfml/scripts/train.py
@@ -83,33 +83,36 @@ def setup(ctx, learning_rate: float, dropout: float, input_dir: str, model_confi
 
     features = mconfig.get("features")
 
+    training_params = mconfig.get("training_parameters")
+    validation_params = mconfig.get("validation_parameters")
+
     training_sequence = build_sequence(
         datasets=copy.deepcopy(datasets),
         features=features,
         weight_expression=mconfig.get("weight_expression"),
+        shuffle=training_params["shuffle"],
+        scale_features=training_params["scale_features"],
     )
     validation_sequence = build_sequence(
         datasets=copy.deepcopy(datasets),
         features=features,
         weight_expression=mconfig.get("weight_expression"),
+        shuffle=validation_params["shuffle"],
+        scale_features=validation_params["scale_features"],
     )
 
     normalize_classes(training_sequence)
     normalize_classes(validation_sequence)
 
     # Training sequence
-    training_params = mconfig.get("training_parameters")
     train_size = training_params["train_size"]
     training_sequence.read_range = (0.0, train_size)
-    training_sequence.scale_features = True
    training_sequence.batch_size = training_params["batch_size"]
     training_sequence.batch_buffer_size = training_params["batch_buffer_size"]
     training_sequence[0]
 
     # Validation sequence
-    validation_params = mconfig.get("validation_parameters")
     validation_sequence.read_range = (train_size, 1.0)
-    validation_sequence.scale_features = True
     validation_sequence._feature_scaler = copy.deepcopy(
         training_sequence._feature_scaler
     )
diff --git a/vbfml/training/input.py b/vbfml/training/input.py
index b9dc486..51db663 100644
--- a/vbfml/training/input.py
+++ b/vbfml/training/input.py
@@ -11,6 +11,8 @@ def build_sequence(
     features: "list[str]",
     weight_expression: str = "weight_total*xs/sumw",
     absolute_weight: bool = False,
+    shuffle: bool = True,
+    scale_features: str = "standard",
 ) -> MultiDatasetSequence:
     """Shortcut to set up a MultiDatasetSequence"""
 
@@ -20,9 +22,10 @@ def build_sequence(
     sequence = MultiDatasetSequence(
         batch_size=50,
         branches=features,
-        shuffle=True,
+        shuffle=shuffle,
         batch_buffer_size=int(1e5),
         weight_expression=weight_expression,
+        scale_features=scale_features,
     )
 
     for dataset in datasets:
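The key change in this patch is the string-keyed scaler dispatch in
_init_feature_scaler_from_features. A minimal standalone sketch of how the two
scalers behave (the Normalizer body is copied from the patch above;
StandardScaler is scikit-learn's, and the toy array is assumed):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    class Normalizer:
        def fit(self, features: np.ndarray):
            return self  # nothing to learn, unlike StandardScaler

        def transform(self, features: np.ndarray, ceiling: float = 255.0):
            return features / ceiling  # map [0,255] pixels into [0,1]

    # Same dispatch as _init_feature_scaler_from_features:
    scalers = {"standard": StandardScaler, "norm": Normalizer}

    features = np.array([[0.0, 128.0], [255.0, 64.0]])
    print(scalers["norm"]().fit(features).transform(features).max())      # 1.0
    print(scalers["standard"]().fit(features).transform(features).mean()) # ~0.0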
""" multibatch_df = self._read_multibatch(batch_start, batch_stop) - if self.scale_features and not self._feature_scaler: + if self.scale_features != "none" and not self._feature_scaler: self._init_feature_scaler_from_multibatch(multibatch_df) if self.shuffle: multibatch_df = multibatch_df.sample(frac=1, ignore_index=True) @@ -248,7 +267,7 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]": features = df.drop(columns=self._non_feature_columns()).to_numpy() features = features.astype(self._float_dtype) - if self.scale_features: + if self.scale_features != "none": features = self.apply_feature_scaling(features) labels = to_categorical( diff --git a/vbfml/scripts/train.py b/vbfml/scripts/train.py index 5234941..90428a2 100755 --- a/vbfml/scripts/train.py +++ b/vbfml/scripts/train.py @@ -83,33 +83,36 @@ def setup(ctx, learning_rate: float, dropout: float, input_dir: str, model_confi features = mconfig.get("features") + training_params = mconfig.get("training_parameters") + validation_params = mconfig.get("validation_parameters") + training_sequence = build_sequence( datasets=copy.deepcopy(datasets), features=features, weight_expression=mconfig.get("weight_expression"), + shuffle=training_params["shuffle"], + scale_features=training_params["scale_features"], ) validation_sequence = build_sequence( datasets=copy.deepcopy(datasets), features=features, weight_expression=mconfig.get("weight_expression"), + shuffle=validation_params["shuffle"], + scale_features=validation_params["scale_features"], ) normalize_classes(training_sequence) normalize_classes(validation_sequence) # Training sequence - training_params = mconfig.get("training_parameters") train_size = training_params["train_size"] training_sequence.read_range = (0.0, train_size) - training_sequence.scale_features = True training_sequence.batch_size = training_params["batch_size"] training_sequence.batch_buffer_size = training_params["batch_buffer_size"] training_sequence[0] # Validation sequence - validation_params = mconfig.get("validation_parameters") validation_sequence.read_range = (train_size, 1.0) - validation_sequence.scale_features = True validation_sequence._feature_scaler = copy.deepcopy( training_sequence._feature_scaler ) diff --git a/vbfml/training/input.py b/vbfml/training/input.py index b9dc486..51db663 100644 --- a/vbfml/training/input.py +++ b/vbfml/training/input.py @@ -11,6 +11,8 @@ def build_sequence( features: "list[str]", weight_expression: str = "weight_total*xs/sumw", absolute_weight: bool = False, + shuffle: bool = True, + scale_features: str = "standard", ) -> MultiDatasetSequence: """Shortcut to set up a MultiDatasetSequence""" @@ -20,9 +22,10 @@ def build_sequence( sequence = MultiDatasetSequence( batch_size=50, branches=features, - shuffle=True, + shuffle=shuffle, batch_buffer_size=int(1e5), weight_expression=weight_expression, + scale_features=scale_features, ) for dataset in datasets: From 164d3779cddf8ef707b9d5f7135dd6c8ce9fce95 Mon Sep 17 00:00:00 2001 From: Alp Akpinar Date: Mon, 22 Nov 2021 15:23:12 -0500 Subject: [PATCH 2/7] Update tests --- vbfml/tests/test_postprocessing.py | 2 +- vbfml/tests/test_sequences.py | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/vbfml/tests/test_postprocessing.py b/vbfml/tests/test_postprocessing.py index 6a8d0fe..b3c273f 100644 --- a/vbfml/tests/test_postprocessing.py +++ b/vbfml/tests/test_postprocessing.py @@ -78,7 +78,7 @@ def setUp(self): datasets = load_datasets_bucoffea(self.wdir) sequence = 
From 85c5638c21497f133e4b5ff5611be85f487b6184 Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Mon, 22 Nov 2021 15:24:15 -0500
Subject: [PATCH 3/7] Fixup in TrainingAnalyzer: Fix scale_features

---
 vbfml/training/analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vbfml/training/analysis.py b/vbfml/training/analysis.py
index 34bf064..261ade3 100644
--- a/vbfml/training/analysis.py
+++ b/vbfml/training/analysis.py
@@ -104,7 +104,7 @@ def analyze(self):
         histograms = {}
         for sequence_type in ["training", "validation"]:
             sequence = self.loader.get_sequence(sequence_type)
-            sequence.scale_features = False
+            sequence.scale_features = "none"
             sequence.batch_size = int(1e6)
             sequence.batch_buffer_size = 10
             (

From 23d47ce51d46fef87745f657fdcdac4ffd49e92e Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Tue, 23 Nov 2021 09:33:16 -0500
Subject: [PATCH 4/7] Add test for feature normalization

---
 vbfml/tests/test_sequences.py | 51 ++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/vbfml/tests/test_sequences.py b/vbfml/tests/test_sequences.py
index e370d7e..13e198f 100644
--- a/vbfml/tests/test_sequences.py
+++ b/vbfml/tests/test_sequences.py
@@ -446,15 +446,54 @@ def deviation_from_target(features):
         self.assertAlmostEqual(dev_std, 0, places=2)
         self.assertTrue(np.all(features != weights))
 
+class TestMultiDatasetSequenceNormFeatureScaling(TestCase):
+    def setUp(self):
+        self.treename = "tree"
+        self.feature_branches = ["a"]
+        self.nevents_per_file = int(1e4)
+        # Generate data that resembles the real image data,
+        # i.e. pixel values in the [0,255) range
+        self.values = np.random.randint(
+            low=0,
+            high=255,
+            size=self.nevents_per_file
+        )
+
+        self.mds = MultiDatasetSequence(
+            batch_size=int(1e3),
+            branches=self.feature_branches,
+            shuffle=False
+        )
+
+        self.wdir = make_tmp_dir()
+        self.addCleanup(os.rmdir, self.wdir)
+        fname = os.path.abspath(os.path.join(self.wdir, "test.root"))
+
+        create_test_tree(
+            filename=fname,
+            treename=self.treename,
+            branches=self.feature_branches,
+            n_events=self.nevents_per_file,
+            value=self.values,
+        )
+        self.addCleanup(os.remove, fname)
+
+        dataset = DatasetInfo(
+            name="dataset",
+            files=[fname],
+            n_events=self.nevents_per_file,
+            treename=self.treename,
+        )
+        self.mds.add_dataset(dataset)
+
     def test_feature_scaling_with_norm(self):
         """
-        After "norm" normalization, all values should be <=1.
-        Compute the (min,max) range in features and see
-        if this is indeed the case.
+        Test "norm" feature scaling. After the feature scaling, all values
+        should be in the [0,1] range.
         """
-        # Read with norm feature scaling
         self.mds.scale_features = "norm"
-        features, _, _ = self.mds[0]
+        features, _ = self.mds[0]
         f_min, f_max = np.min(features), np.max(features)
         self.assertTrue(f_min < f_max)
-        self.assertTrue(np.abs(f_max) <= 1.0)
+        self.assertTrue(f_min >= 0)
+        self.assertTrue(f_max <= 1)
From 3aea9d246befb12540e64a7f9653eec6fc8cedaa Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Tue, 23 Nov 2021 09:36:17 -0500
Subject: [PATCH 5/7] Double check the feature range at runtime for "norm"
 feature scaling

---
 vbfml/input/sequences.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vbfml/input/sequences.py b/vbfml/input/sequences.py
index ead4730..8e91ac6 100644
--- a/vbfml/input/sequences.py
+++ b/vbfml/input/sequences.py
@@ -270,6 +270,11 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]":
         if self.scale_features != "none":
             features = self.apply_feature_scaling(features)
 
+        # Double checking the feature range here
+        if self.scale_features == "norm":
+            valid = np.all(features >= 0) & np.all(features <= 1)
+            assert valid, "Features are not scaled correctly to [0,1] range, please check!"
+
         labels = to_categorical(

From 532c075bdec6d0dbc0535cfddf2abb2548ca7790 Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Tue, 23 Nov 2021 09:37:25 -0500
Subject: [PATCH 6/7] Minor update

---
 vbfml/input/sequences.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vbfml/input/sequences.py b/vbfml/input/sequences.py
index 8e91ac6..ac06e7b 100644
--- a/vbfml/input/sequences.py
+++ b/vbfml/input/sequences.py
@@ -272,7 +272,7 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]":
         # Double checking the feature range here
         if self.scale_features == "norm":
-            valid = np.all(features >= 0) & np.all(features <= 1)
+            valid = np.all((features >= 0) & (features <= 1))
             assert valid, "Features are not scaled correctly to [0,1] range, please check!"
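The two formulations of the range check, before and after this commit, agree
for every input: both require each element to satisfy both bounds. The rewrite
merely combines the masks before a single reduction, as this toy check shows
(array assumed):

    import numpy as np

    features = np.array([[0.0, 0.5], [0.9, 1.0]])

    v1 = np.all(features >= 0) & np.all(features <= 1)  # PATCH 5 form
    v2 = np.all((features >= 0) & (features <= 1))      # PATCH 6 form
    assert bool(v1) == bool(v2)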
 
         labels = to_categorical(

From f4a5bcc1d64f970b0bf8f764d7a7d3e3bf1d358c Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Tue, 23 Nov 2021 09:43:24 -0500
Subject: [PATCH 7/7] black formatting

---
 vbfml/input/sequences.py      |  4 +++-
 vbfml/tests/test_sequences.py | 13 ++++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/vbfml/input/sequences.py b/vbfml/input/sequences.py
index ac06e7b..d49bacb 100644
--- a/vbfml/input/sequences.py
+++ b/vbfml/input/sequences.py
@@ -273,7 +273,9 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]":
         # Double checking the feature range here
         if self.scale_features == "norm":
             valid = np.all((features >= 0) & (features <= 1))
-            assert valid, "Features are not scaled correctly to [0,1] range, please check!"
+            assert (
+                valid
+            ), "Features are not scaled correctly to [0,1] range, please check!"
 
         labels = to_categorical(
             row_vector(df["label"]),
diff --git a/vbfml/tests/test_sequences.py b/vbfml/tests/test_sequences.py
index 13e198f..5c81707 100644
--- a/vbfml/tests/test_sequences.py
+++ b/vbfml/tests/test_sequences.py
@@ -446,6 +446,7 @@ def deviation_from_target(features):
         self.assertAlmostEqual(dev_std, 0, places=2)
         self.assertTrue(np.all(features != weights))
 
+
 class TestMultiDatasetSequenceNormFeatureScaling(TestCase):
     def setUp(self):
         self.treename = "tree"
@@ -453,18 +454,12 @@ def setUp(self):
         self.nevents_per_file = int(1e4)
         # Generate data that resembles the real image data,
         # i.e. pixel values in the [0,255) range
-        self.values = np.random.randint(
-            low=0,
-            high=255,
-            size=self.nevents_per_file
-        )
+        self.values = np.random.randint(low=0, high=255, size=self.nevents_per_file)
 
         self.mds = MultiDatasetSequence(
-            batch_size=int(1e3),
-            branches=self.feature_branches,
-            shuffle=False
+            batch_size=int(1e3), branches=self.feature_branches, shuffle=False
         )
-
+
         self.wdir = make_tmp_dir()
         self.addCleanup(os.rmdir, self.wdir)
         fname = os.path.abspath(os.path.join(self.wdir, "test.root"))
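With the series applied, the scaling behavior is driven entirely by the model
YAML. A minimal sketch of the flow from config to sequence, assuming it is run
from the repository root (dataset loading omitted):

    import yaml

    with open("vbfml/config/dense_model.yml") as f:
        mconfig = yaml.safe_load(f)

    training_params = mconfig["training_parameters"]
    print(training_params["scale_features"])  # "standard" for the dense model
    print(training_params["shuffle"])         # True

    # train.py then forwards these straight into build_sequence:
    #   build_sequence(..., shuffle=training_params["shuffle"],
    #                  scale_features=training_params["scale_features"])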