From 59eb5dd9afc5810397476fc570503e8a7f6c97f1 Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Mon, 22 Nov 2021 15:22:04 -0500
Subject: [PATCH 1/7] Introduce different feature scalers for different types
 of models

---
 vbfml/config/convolutional_model.yml |  8 +++++--
 vbfml/config/dense_model.yml         |  4 ++++
 vbfml/input/sequences.py             | 31 ++++++++++++++++++++++------
 vbfml/scripts/train.py               | 11 ++++++----
 vbfml/training/input.py              |  5 ++++-
 5 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/vbfml/config/convolutional_model.yml b/vbfml/config/convolutional_model.yml
index b26c461..b652cc7 100644
--- a/vbfml/config/convolutional_model.yml
+++ b/vbfml/config/convolutional_model.yml
@@ -1,11 +1,15 @@
 architecture: conv
 training_parameters:
   batch_size: 10
-  batch_buffer_size: 100
+  batch_buffer_size: 1000
+  scale_features: norm
+  shuffle: true
   train_size: 0.67
 validation_parameters:
-  batch_size: 100
+  batch_size: 1000
   batch_buffer_size: 10
+  scale_features: norm
+  shuffle: true
 weight_expression: Normalization
 features:
   - JetImage_pixels
diff --git a/vbfml/config/dense_model.yml b/vbfml/config/dense_model.yml
index e0f2158..eaae3f0 100644
--- a/vbfml/config/dense_model.yml
+++ b/vbfml/config/dense_model.yml
@@ -3,9 +3,13 @@ training_parameters:
   batch_size: 20
   batch_buffer_size: 1000000
   train_size: 0.5
+  scale_features: standard
+  shuffle: true
 validation_parameters:
   batch_size: 1000000
   batch_buffer_size: 10
+  scale_features: standard
+  shuffle: true
 weight_expression: weight_total*xs/sumw
 features:
   - mjj
diff --git a/vbfml/input/sequences.py b/vbfml/input/sequences.py
index 25700e5..ead4730 100644
--- a/vbfml/input/sequences.py
+++ b/vbfml/input/sequences.py
@@ -27,6 +27,24 @@ def row_vector(branch):
     return np.array(branch).reshape((len(branch), 1))
 
 
+class Normalizer:
+    def __init__(self) -> None:
+        pass
+
+    def fit(self, features: np.ndarray):
+        return self
+
+    def transform(self, features: np.ndarray, ceiling: float = 255.0):
+        """Simple normalizer for image data. Divides all values by
+        `ceiling` (255 by default) to bring them into the [0,1] range.
+
+        Args:
+            features (np.ndarray): Image pixels.
+            ceiling (float, optional): The highest pixel value. Defaults to 255.
+        """
+        return features / ceiling
+
+
 class MultiDatasetSequence(Sequence):
     def __init__(
         self,
@@ -36,7 +54,7 @@ def __init__(
         batch_buffer_size=1,
         read_range=(0.0, 1.0),
         weight_expression=None,
-        scale_features=False,
+        scale_features="none",
     ) -> None:
         self.datasets = {}
         self.readers = {}
@@ -76,11 +94,11 @@ def batch_size(self, batch_size: int) -> None:
         self._batch_size = batch_size
 
     @property
-    def scale_features(self) -> bool:
+    def scale_features(self) -> str:
         return self._scale_features
 
     @scale_features.setter
-    def scale_features(self, scale_features: bool) -> None:
+    def scale_features(self, scale_features: str) -> None:
         if scale_features != self.scale_features:
             self.buffer.clear()
             self._scale_features = scale_features
@@ -118,7 +136,8 @@ def _init_feature_scaler_from_features(self, features: np.ndarray) -> None:
         """
         Initialize the feature scaler object based on a feature tensor.
         """
-        self._feature_scaler = StandardScaler().fit(features)
+        scalers = {"standard": StandardScaler, "norm": Normalizer}
+        self._feature_scaler = scalers[self._scale_features]().fit(features)
 
     def _init_feature_scaler_from_multibatch(self, df: "pd.DataFrame") -> None:
         """
@@ -237,7 +256,7 @@ def _fill_batch_buffer(self, batch_start: int, batch_stop: int) -> None:
         Read batches from file and save them into the buffer for future use.
         """
         multibatch_df = self._read_multibatch(batch_start, batch_stop)
-        if self.scale_features and not self._feature_scaler:
+        if self.scale_features != "none" and not self._feature_scaler:
             self._init_feature_scaler_from_multibatch(multibatch_df)
         if self.shuffle:
             multibatch_df = multibatch_df.sample(frac=1, ignore_index=True)
@@ -248,7 +267,7 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]":
         features = df.drop(columns=self._non_feature_columns()).to_numpy()
         features = features.astype(self._float_dtype)
 
-        if self.scale_features:
+        if self.scale_features != "none":
             features = self.apply_feature_scaling(features)
 
         labels = to_categorical(
diff --git a/vbfml/scripts/train.py b/vbfml/scripts/train.py
index 5234941..90428a2 100755
--- a/vbfml/scripts/train.py
+++ b/vbfml/scripts/train.py
@@ -83,33 +83,36 @@ def setup(ctx, learning_rate: float, dropout: float, input_dir: str, model_confi
 
     features = mconfig.get("features")
 
+    training_params = mconfig.get("training_parameters")
+    validation_params = mconfig.get("validation_parameters")
+
     training_sequence = build_sequence(
         datasets=copy.deepcopy(datasets),
         features=features,
         weight_expression=mconfig.get("weight_expression"),
+        shuffle=training_params["shuffle"],
+        scale_features=training_params["scale_features"],
     )
     validation_sequence = build_sequence(
         datasets=copy.deepcopy(datasets),
         features=features,
         weight_expression=mconfig.get("weight_expression"),
+        shuffle=validation_params["shuffle"],
+        scale_features=validation_params["scale_features"],
     )
 
     normalize_classes(training_sequence)
     normalize_classes(validation_sequence)
 
     # Training sequence
-    training_params = mconfig.get("training_parameters")
     train_size = training_params["train_size"]
     training_sequence.read_range = (0.0, train_size)
-    training_sequence.scale_features = True
    training_sequence.batch_size = training_params["batch_size"]
     training_sequence.batch_buffer_size = training_params["batch_buffer_size"]
     training_sequence[0]
 
     # Validation sequence
-    validation_params = mconfig.get("validation_parameters")
     validation_sequence.read_range = (train_size, 1.0)
-    validation_sequence.scale_features = True
     validation_sequence._feature_scaler = copy.deepcopy(
         training_sequence._feature_scaler
     )
diff --git a/vbfml/training/input.py b/vbfml/training/input.py
index b9dc486..51db663 100644
--- a/vbfml/training/input.py
+++ b/vbfml/training/input.py
@@ -11,6 +11,8 @@ def build_sequence(
     features: "list[str]",
     weight_expression: str = "weight_total*xs/sumw",
     absolute_weight: bool = False,
+    shuffle: bool = True,
+    scale_features: str = "standard",
 ) -> MultiDatasetSequence:
     """Shortcut to set up a MultiDatasetSequence"""
 
@@ -20,9 +22,10 @@ def build_sequence(
     sequence = MultiDatasetSequence(
         batch_size=50,
         branches=features,
-        shuffle=True,
+        shuffle=shuffle,
         batch_buffer_size=int(1e5),
         weight_expression=weight_expression,
+        scale_features=scale_features,
     )
 
     for dataset in datasets:
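The key change in this patch is the string-keyed scaler dispatch in
_init_feature_scaler_from_features. A minimal standalone sketch of how the two
scalers behave (the Normalizer body is copied from the patch above;
StandardScaler is scikit-learn's, and the toy array is assumed):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    class Normalizer:
        def fit(self, features: np.ndarray):
            return self  # nothing to learn, unlike StandardScaler

        def transform(self, features: np.ndarray, ceiling: float = 255.0):
            return features / ceiling  # map [0,255] pixels into [0,1]

    # Same dispatch as _init_feature_scaler_from_features:
    scalers = {"standard": StandardScaler, "norm": Normalizer}

    features = np.array([[0.0, 128.0], [255.0, 64.0]])
    print(scalers["norm"]().fit(features).transform(features).max())      # 1.0
    print(scalers["standard"]().fit(features).transform(features).mean()) # ~0.0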
""" multibatch_df = self._read_multibatch(batch_start, batch_stop) - if self.scale_features and not self._feature_scaler: + if self.scale_features != "none" and not self._feature_scaler: self._init_feature_scaler_from_multibatch(multibatch_df) if self.shuffle: multibatch_df = multibatch_df.sample(frac=1, ignore_index=True) @@ -248,7 +267,7 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]": features = df.drop(columns=self._non_feature_columns()).to_numpy() features = features.astype(self._float_dtype) - if self.scale_features: + if self.scale_features != "none": features = self.apply_feature_scaling(features) labels = to_categorical( diff --git a/vbfml/scripts/train.py b/vbfml/scripts/train.py index 5234941..90428a2 100755 --- a/vbfml/scripts/train.py +++ b/vbfml/scripts/train.py @@ -83,33 +83,36 @@ def setup(ctx, learning_rate: float, dropout: float, input_dir: str, model_confi features = mconfig.get("features") + training_params = mconfig.get("training_parameters") + validation_params = mconfig.get("validation_parameters") + training_sequence = build_sequence( datasets=copy.deepcopy(datasets), features=features, weight_expression=mconfig.get("weight_expression"), + shuffle=training_params["shuffle"], + scale_features=training_params["scale_features"], ) validation_sequence = build_sequence( datasets=copy.deepcopy(datasets), features=features, weight_expression=mconfig.get("weight_expression"), + shuffle=validation_params["shuffle"], + scale_features=validation_params["scale_features"], ) normalize_classes(training_sequence) normalize_classes(validation_sequence) # Training sequence - training_params = mconfig.get("training_parameters") train_size = training_params["train_size"] training_sequence.read_range = (0.0, train_size) - training_sequence.scale_features = True training_sequence.batch_size = training_params["batch_size"] training_sequence.batch_buffer_size = training_params["batch_buffer_size"] training_sequence[0] # Validation sequence - validation_params = mconfig.get("validation_parameters") validation_sequence.read_range = (train_size, 1.0) - validation_sequence.scale_features = True validation_sequence._feature_scaler = copy.deepcopy( training_sequence._feature_scaler ) diff --git a/vbfml/training/input.py b/vbfml/training/input.py index b9dc486..51db663 100644 --- a/vbfml/training/input.py +++ b/vbfml/training/input.py @@ -11,6 +11,8 @@ def build_sequence( features: "list[str]", weight_expression: str = "weight_total*xs/sumw", absolute_weight: bool = False, + shuffle: bool = True, + scale_features: str = "standard", ) -> MultiDatasetSequence: """Shortcut to set up a MultiDatasetSequence""" @@ -20,9 +22,10 @@ def build_sequence( sequence = MultiDatasetSequence( batch_size=50, branches=features, - shuffle=True, + shuffle=shuffle, batch_buffer_size=int(1e5), weight_expression=weight_expression, + scale_features=scale_features, ) for dataset in datasets: From 164d3779cddf8ef707b9d5f7135dd6c8ce9fce95 Mon Sep 17 00:00:00 2001 From: Alp Akpinar Date: Mon, 22 Nov 2021 15:23:12 -0500 Subject: [PATCH 2/7] Update tests --- vbfml/tests/test_postprocessing.py | 2 +- vbfml/tests/test_sequences.py | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/vbfml/tests/test_postprocessing.py b/vbfml/tests/test_postprocessing.py index 6a8d0fe..b3c273f 100644 --- a/vbfml/tests/test_postprocessing.py +++ b/vbfml/tests/test_postprocessing.py @@ -78,7 +78,7 @@ def setUp(self): datasets = load_datasets_bucoffea(self.wdir) sequence = 
From 85c5638c21497f133e4b5ff5611be85f487b6184 Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Mon, 22 Nov 2021 15:24:15 -0500
Subject: [PATCH 3/7] Fixup in TrainingAnalyzer: Fix scale_features

---
 vbfml/training/analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vbfml/training/analysis.py b/vbfml/training/analysis.py
index 34bf064..261ade3 100644
--- a/vbfml/training/analysis.py
+++ b/vbfml/training/analysis.py
@@ -104,7 +104,7 @@ def analyze(self):
         histograms = {}
         for sequence_type in ["training", "validation"]:
             sequence = self.loader.get_sequence(sequence_type)
-            sequence.scale_features = False
+            sequence.scale_features = "none"
             sequence.batch_size = int(1e6)
             sequence.batch_buffer_size = 10
             (

From 23d47ce51d46fef87745f657fdcdac4ffd49e92e Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Tue, 23 Nov 2021 09:33:16 -0500
Subject: [PATCH 4/7] Add test for feature normalization

---
 vbfml/tests/test_sequences.py | 51 ++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/vbfml/tests/test_sequences.py b/vbfml/tests/test_sequences.py
index e370d7e..13e198f 100644
--- a/vbfml/tests/test_sequences.py
+++ b/vbfml/tests/test_sequences.py
@@ -446,15 +446,54 @@ def deviation_from_target(features):
         self.assertAlmostEqual(dev_std, 0, places=2)
         self.assertTrue(np.all(features != weights))
 
+class TestMultiDatasetSequenceNormFeatureScaling(TestCase):
+    def setUp(self):
+        self.treename = "tree"
+        self.feature_branches = ["a"]
+        self.nevents_per_file = int(1e4)
+        # Generate data that resembles the real image data,
+        # i.e. pixel values in the [0,255) range
+        self.values = np.random.randint(
+            low=0,
+            high=255,
+            size=self.nevents_per_file
+        )
+
+        self.mds = MultiDatasetSequence(
+            batch_size=int(1e3),
+            branches=self.feature_branches,
+            shuffle=False
+        )
+
+        self.wdir = make_tmp_dir()
+        self.addCleanup(os.rmdir, self.wdir)
+        fname = os.path.abspath(os.path.join(self.wdir, "test.root"))
+
+        create_test_tree(
+            filename=fname,
+            treename=self.treename,
+            branches=self.feature_branches,
+            n_events=self.nevents_per_file,
+            value=self.values,
+        )
+        self.addCleanup(os.remove, fname)
+
+        dataset = DatasetInfo(
+            name="dataset",
+            files=[fname],
+            n_events=self.nevents_per_file,
+            treename=self.treename,
+        )
+        self.mds.add_dataset(dataset)
+
     def test_feature_scaling_with_norm(self):
         """
-        After "norm" normalization, all values should be <=1.
-        Compute the (min,max) range in features and see
-        if this is indeed the case.
+        Test "norm" feature scaling. After the feature scaling, all values
+        should be in the [0,1] range.
         """
-        # Read with norm feature scaling
         self.mds.scale_features = "norm"
-        features, _, _ = self.mds[0]
+        features, _ = self.mds[0]
         f_min, f_max = np.min(features), np.max(features)
         self.assertTrue(f_min < f_max)
-        self.assertTrue(np.abs(f_max) <= 1.0)
+        self.assertTrue(f_min >= 0)
+        self.assertTrue(f_max <= 1)
From 3aea9d246befb12540e64a7f9653eec6fc8cedaa Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Tue, 23 Nov 2021 09:36:17 -0500
Subject: [PATCH 5/7] Double check the feature range at runtime for "norm"
 feature scaling

---
 vbfml/input/sequences.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vbfml/input/sequences.py b/vbfml/input/sequences.py
index ead4730..8e91ac6 100644
--- a/vbfml/input/sequences.py
+++ b/vbfml/input/sequences.py
@@ -270,6 +270,11 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]":
         if self.scale_features != "none":
             features = self.apply_feature_scaling(features)
 
+        # Double checking the feature range here
+        if self.scale_features == "norm":
+            valid = np.all(features >= 0) & np.all(features <= 1)
+            assert valid, "Features are not scaled correctly to [0,1] range, please check!"
+
         labels = to_categorical(

From 532c075bdec6d0dbc0535cfddf2abb2548ca7790 Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Tue, 23 Nov 2021 09:37:25 -0500
Subject: [PATCH 6/7] Minor update

---
 vbfml/input/sequences.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vbfml/input/sequences.py b/vbfml/input/sequences.py
index 8e91ac6..ac06e7b 100644
--- a/vbfml/input/sequences.py
+++ b/vbfml/input/sequences.py
@@ -272,7 +272,7 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]":
         # Double checking the feature range here
         if self.scale_features == "norm":
-            valid = np.all(features >= 0) & np.all(features <= 1)
+            valid = np.all((features >= 0) & (features <= 1))
             assert valid, "Features are not scaled correctly to [0,1] range, please check!"
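The two formulations of the range check, before and after this commit, agree
for every input: both require each element to satisfy both bounds. The rewrite
merely combines the masks before a single reduction, as this toy check shows
(array assumed):

    import numpy as np

    features = np.array([[0.0, 0.5], [0.9, 1.0]])

    v1 = np.all(features >= 0) & np.all(features <= 1)  # PATCH 5 form
    v2 = np.all((features >= 0) & (features <= 1))      # PATCH 6 form
    assert bool(v1) == bool(v2)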
 
         labels = to_categorical(

From f4a5bcc1d64f970b0bf8f764d7a7d3e3bf1d358c Mon Sep 17 00:00:00 2001
From: Alp Akpinar
Date: Tue, 23 Nov 2021 09:43:24 -0500
Subject: [PATCH 7/7] black formatting

---
 vbfml/input/sequences.py      |  4 +++-
 vbfml/tests/test_sequences.py | 13 ++++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/vbfml/input/sequences.py b/vbfml/input/sequences.py
index ac06e7b..d49bacb 100644
--- a/vbfml/input/sequences.py
+++ b/vbfml/input/sequences.py
@@ -273,7 +273,9 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]":
         # Double checking the feature range here
         if self.scale_features == "norm":
             valid = np.all((features >= 0) & (features <= 1))
-            assert valid, "Features are not scaled correctly to [0,1] range, please check!"
+            assert (
+                valid
+            ), "Features are not scaled correctly to [0,1] range, please check!"
 
         labels = to_categorical(
             row_vector(df["label"]),
diff --git a/vbfml/tests/test_sequences.py b/vbfml/tests/test_sequences.py
index 13e198f..5c81707 100644
--- a/vbfml/tests/test_sequences.py
+++ b/vbfml/tests/test_sequences.py
@@ -446,6 +446,7 @@ def deviation_from_target(features):
         self.assertAlmostEqual(dev_std, 0, places=2)
         self.assertTrue(np.all(features != weights))
 
+
 class TestMultiDatasetSequenceNormFeatureScaling(TestCase):
     def setUp(self):
         self.treename = "tree"
@@ -453,18 +454,12 @@ def setUp(self):
         self.nevents_per_file = int(1e4)
         # Generate data that resembles the real image data,
         # i.e. pixel values in the [0,255) range
-        self.values = np.random.randint(
-            low=0,
-            high=255,
-            size=self.nevents_per_file
-        )
+        self.values = np.random.randint(low=0, high=255, size=self.nevents_per_file)
 
         self.mds = MultiDatasetSequence(
-            batch_size=int(1e3),
-            branches=self.feature_branches,
-            shuffle=False
+            batch_size=int(1e3), branches=self.feature_branches, shuffle=False
         )
-
+
         self.wdir = make_tmp_dir()
         self.addCleanup(os.rmdir, self.wdir)
         fname = os.path.abspath(os.path.join(self.wdir, "test.root"))
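With the series applied, the scaling behavior is driven entirely by the model
YAML. A minimal sketch of the flow from config to sequence, assuming it is run
from the repository root (dataset loading omitted):

    import yaml

    with open("vbfml/config/dense_model.yml") as f:
        mconfig = yaml.safe_load(f)

    training_params = mconfig["training_parameters"]
    print(training_params["scale_features"])  # "standard" for the dense model
    print(training_params["shuffle"])         # True

    # train.py then forwards these straight into build_sequence:
    #   build_sequence(..., shuffle=training_params["shuffle"],
    #                  scale_features=training_params["scale_features"])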