Different feature scaling for different models #16

Open · wants to merge 7 commits into base: main
8 changes: 6 additions & 2 deletions vbfml/config/convolutional_model.yml
@@ -1,11 +1,15 @@
 architecture: conv
 training_parameters:
   batch_size: 10
-  batch_buffer_size: 100
+  batch_buffer_size: 1000
+  scale_features: norm
+  shuffle: true
   train_size: 0.67
 validation_parameters:
-  batch_size: 100
+  batch_size: 1000
   batch_buffer_size: 10
+  scale_features: norm
+  shuffle: true
 weight_expression: Normalization
 features:
   - JetImage_pixels
4 changes: 4 additions & 0 deletions vbfml/config/dense_model.yml
@@ -3,9 +3,13 @@ training_parameters:
   batch_size: 20
   batch_buffer_size: 1000000
   train_size: 0.5
+  scale_features: standard
+  shuffle: true
 validation_parameters:
   batch_size: 1000000
   batch_buffer_size: 10
+  scale_features: standard
+  shuffle: true
 weight_expression: weight_total*xs/sumw
 features:
   - mjj
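Both configs now carry their own `scale_features` and `shuffle` keys under `training_parameters` and `validation_parameters`. A minimal sketch of how these keys could be consumed (PyYAML and the file path are assumptions for illustration; in the PR itself the values flow through `mconfig.get(...)` in `train.py` below):

```python
import yaml  # PyYAML, assumed available

# Illustrative: load a model config and pull the per-split scaling choice.
with open("vbfml/config/convolutional_model.yml") as f:
    mconfig = yaml.safe_load(f)

training_params = mconfig["training_parameters"]
validation_params = mconfig["validation_parameters"]

# "norm" for the image-based convolutional model,
# "standard" for the dense model, "none" to disable scaling.
print(training_params["scale_features"], validation_params["scale_features"])
```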
38 changes: 32 additions & 6 deletions vbfml/input/sequences.py
@@ -27,6 +27,24 @@ def row_vector(branch):
     return np.array(branch).reshape((len(branch), 1))
 
 
+class Normalizer:
+    def __init__(self) -> None:
+        pass
+
+    def fit(self, features: np.ndarray):
+        return self
+
+    def transform(self, features: np.ndarray, ceiling: float = 255.0):
+        """Simple normalizer for image data. Divides all values by
+        ``ceiling`` (255 by default) so that they fall in the [0, 1] range.
+
+        Args:
+            features (np.ndarray): Image pixels.
+            ceiling (float, optional): The highest pixel value. Defaults to 255.
+        """
+        return features / ceiling
+
+
 class MultiDatasetSequence(Sequence):
     def __init__(
         self,
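The `Normalizer` above intentionally mirrors the scikit-learn scaler surface (`fit` returning `self`, plus `transform`), so the sequence code can use it interchangeably with `StandardScaler`. A quick sketch of the contract, assuming `Normalizer` is importable from `vbfml.input.sequences`:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler
from vbfml.input.sequences import Normalizer  # the class added above

pixels = np.array([[0.0, 127.5], [255.0, 63.75]])

# fit() is a no-op that returns self, matching the sklearn idiom
norm = Normalizer().fit(pixels)
print(norm.transform(pixels))  # [[0.   0.5 ], [1.   0.25]]

# StandardScaler actually learns per-column mean/std in fit()
std = StandardScaler().fit(pixels)
print(std.transform(pixels))   # zero-mean, unit-variance columns
```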
@@ -36,7 +54,7 @@ def __init__(
         batch_buffer_size=1,
         read_range=(0.0, 1.0),
         weight_expression=None,
-        scale_features=False,
+        scale_features="none",
     ) -> None:
         self.datasets = {}
         self.readers = {}
@@ -76,11 +94,11 @@ def batch_size(self, batch_size: int) -> None:
         self._batch_size = batch_size
 
     @property
-    def scale_features(self) -> bool:
+    def scale_features(self) -> str:
         return self._scale_features
 
     @scale_features.setter
-    def scale_features(self, scale_features: bool) -> None:
+    def scale_features(self, scale_features: str) -> None:
         if scale_features != self.scale_features:
             self.buffer.clear()
             self._scale_features = scale_features
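Note the setter's side effect: changing `scale_features` clears the batch buffer, since any cached batches were produced under the previous scaling and would otherwise be served stale. In isolation (constructor arguments are illustrative):

```python
from vbfml.input.sequences import MultiDatasetSequence

seq = MultiDatasetSequence(batch_size=50, branches=["mjj"], shuffle=False)
seq.scale_features = "standard"  # differs from the current value,
                                 # so the setter clears the batch buffer
```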
@@ -118,7 +136,8 @@ def _init_feature_scaler_from_features(self, features: np.ndarray) -> None:
         """
         Initialize the feature scaler object based on a feature tensor.
         """
-        self._feature_scaler = StandardScaler().fit(features)
+        scalers = {"standard": StandardScaler, "norm": Normalizer}
+        self._feature_scaler = scalers[self._scale_features]().fit(features)
 
     def _init_feature_scaler_from_multibatch(self, df: "pd.DataFrame") -> None:
         """
@@ -237,7 +256,7 @@ def _fill_batch_buffer(self, batch_start: int, batch_stop: int) -> None:
         Read batches from file and save them into the buffer for future use.
         """
         multibatch_df = self._read_multibatch(batch_start, batch_stop)
-        if self.scale_features and not self._feature_scaler:
+        if self.scale_features != "none" and not self._feature_scaler:
             self._init_feature_scaler_from_multibatch(multibatch_df)
         if self.shuffle:
             multibatch_df = multibatch_df.sample(frac=1, ignore_index=True)
@@ -248,9 +267,16 @@ def _batch_df_formatting(self, df: pd.DataFrame) -> "tuple[np.ndarray]":
 
         features = df.drop(columns=self._non_feature_columns()).to_numpy()
         features = features.astype(self._float_dtype)
-        if self.scale_features:
+        if self.scale_features != "none":
             features = self.apply_feature_scaling(features)
 
+        # Double-check the feature range for "norm" scaling
+        if self.scale_features == "norm":
+            valid = np.all((features >= 0) & (features <= 1))
+            assert (
+                valid
+            ), "Features are not scaled correctly to the [0,1] range, please check!"
+
         labels = to_categorical(
             row_vector(df["label"]),
             num_classes=len(self.dataset_labels()),
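The `[0,1]` assertion above fails without reporting the offending values. A standalone helper with diagnostics, the name `check_norm_range` being hypothetical, could be:

```python
import numpy as np


def check_norm_range(features: np.ndarray) -> None:
    """Raise if any value falls outside [0, 1] after "norm" scaling."""
    f_min, f_max = float(features.min()), float(features.max())
    if f_min < 0.0 or f_max > 1.0:
        raise AssertionError(
            f"Features not in [0, 1] after scaling: min={f_min}, max={f_max}"
        )
```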
11 changes: 7 additions & 4 deletions vbfml/scripts/train.py
@@ -83,33 +83,36 @@ def setup(ctx, learning_rate: float, dropout: float, input_dir: str, model_config
 
     features = mconfig.get("features")
 
+    training_params = mconfig.get("training_parameters")
+    validation_params = mconfig.get("validation_parameters")
+
     training_sequence = build_sequence(
         datasets=copy.deepcopy(datasets),
         features=features,
         weight_expression=mconfig.get("weight_expression"),
+        shuffle=training_params["shuffle"],
+        scale_features=training_params["scale_features"],
     )
     validation_sequence = build_sequence(
         datasets=copy.deepcopy(datasets),
         features=features,
         weight_expression=mconfig.get("weight_expression"),
+        shuffle=validation_params["shuffle"],
+        scale_features=validation_params["scale_features"],
     )
     normalize_classes(training_sequence)
     normalize_classes(validation_sequence)
 
     # Training sequence
-    training_params = mconfig.get("training_parameters")
     train_size = training_params["train_size"]
 
     training_sequence.read_range = (0.0, train_size)
-    training_sequence.scale_features = True
     training_sequence.batch_size = training_params["batch_size"]
     training_sequence.batch_buffer_size = training_params["batch_buffer_size"]
     training_sequence[0]
 
     # Validation sequence
-    validation_params = mconfig.get("validation_parameters")
     validation_sequence.read_range = (train_size, 1.0)
-    validation_sequence.scale_features = True
     validation_sequence._feature_scaler = copy.deepcopy(
         training_sequence._feature_scaler
     )
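The final hunk is the key design point: the validation sequence never fits its own scaler. It reuses a deep copy of the scaler fitted on the training range, so both splits see identical transformations and no validation statistics leak into the fit. The pattern in isolation (data here is synthetic):

```python
import copy

import numpy as np
from sklearn.preprocessing import StandardScaler

train_features = np.random.normal(loc=5.0, scale=2.0, size=(1000, 3))
val_features = np.random.normal(loc=5.0, scale=2.0, size=(200, 3))

# Fit on the training split only...
scaler = StandardScaler().fit(train_features)
# ...then apply the *same* statistics to the validation split.
val_scaled = copy.deepcopy(scaler).transform(val_features)
```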
2 changes: 1 addition & 1 deletion vbfml/tests/test_postprocessing.py
@@ -78,7 +78,7 @@ def setUp(self):
 
         datasets = load_datasets_bucoffea(self.wdir)
         sequence = build_sequence(datasets, features=self.features)
-        sequence.scale_features = True
+        sequence.scale_features = "standard"
         self.training_sequence = sequence
         self.validation_sequence = copy.deepcopy(sequence)
 
54 changes: 51 additions & 3 deletions vbfml/tests/test_sequences.py
@@ -424,23 +424,71 @@ def deviation_from_target(features):
             """
             Mean is supposed to be ~=0, std dev ~=1
             -> Calculate and return the absolute differences
+            IMPORTANT: Note that this is relevant only for "standard" scaling!
             """
             deviation_mean = np.max(np.abs(np.mean(features, axis=0)))
             deviation_std = np.max(np.abs(np.std(features, axis=0) - 1))
             return deviation_mean, deviation_std
 
         # Read without feature scaling
-        self.mds.scale_features = False
+        self.mds.scale_features = "none"
         features, _, weights = self.mds[0]
         dev_mean, dev_std = deviation_from_target(features)
         self.assertNotAlmostEqual(dev_mean, self.feature_mean)
         self.assertNotAlmostEqual(dev_std, self.feature_std - 1)
         self.assertTrue(np.allclose(np.abs(features), weights, rtol=0.01))
 
-        # Read with feature scaling
-        self.mds.scale_features = True
+        # Read with standard feature scaling
+        self.mds.scale_features = "standard"
         features, _, weights = self.mds[0]
         dev_mean, dev_std = deviation_from_target(features)
         self.assertAlmostEqual(dev_mean, 0, places=2)
         self.assertAlmostEqual(dev_std, 0, places=2)
         self.assertTrue(np.all(features != weights))
 
 
+class TestMultiDatasetSequenceNormFeatureScaling(TestCase):
+    def setUp(self):
+        self.treename = "tree"
+        self.feature_branches = ["a"]
+        self.nevents_per_file = int(1e4)
+        # Generate data that resembles the real image data,
+        # i.e. has a range of [0, 255] (randint's high bound is exclusive)
+        self.values = np.random.randint(low=0, high=256, size=self.nevents_per_file)
+
+        self.mds = MultiDatasetSequence(
+            batch_size=int(1e3), branches=self.feature_branches, shuffle=False
+        )
+
+        self.wdir = make_tmp_dir()
+        self.addCleanup(os.rmdir, self.wdir)
+        fname = os.path.abspath(os.path.join(self.wdir, "test.root"))
+
+        create_test_tree(
+            filename=fname,
+            treename=self.treename,
+            branches=self.feature_branches,
+            n_events=self.nevents_per_file,
+            value=self.values,
+        )
+        self.addCleanup(os.remove, fname)
+
+        dataset = DatasetInfo(
+            name="dataset",
+            files=[fname],
+            n_events=self.nevents_per_file,
+            treename=self.treename,
+        )
+        self.mds.add_dataset(dataset)
+
+    def test_feature_scaling_with_norm(self):
+        """
+        Test "norm" feature scaling. After the feature scaling, all values
+        should be in a range of [0, 1].
+        """
+        self.mds.scale_features = "norm"
+        features, _ = self.mds[0]
+        f_min, f_max = np.min(features), np.max(features)
+        self.assertTrue(f_min < f_max)
+        self.assertTrue(f_min >= 0)
+        self.assertTrue(f_max <= 1)
2 changes: 1 addition & 1 deletion vbfml/training/analysis.py
@@ -104,7 +104,7 @@ def analyze(self):
         histograms = {}
         for sequence_type in ["training", "validation"]:
             sequence = self.loader.get_sequence(sequence_type)
-            sequence.scale_features = False
+            sequence.scale_features = "none"
             sequence.batch_size = int(1e6)
             sequence.batch_buffer_size = 10
             (
5 changes: 4 additions & 1 deletion vbfml/training/input.py
@@ -11,6 +11,8 @@ def build_sequence(
     features: "list[str]",
     weight_expression: str = "weight_total*xs/sumw",
     absolute_weight: bool = False,
+    shuffle: bool = True,
+    scale_features: str = "standard",
 ) -> MultiDatasetSequence:
     """Shortcut to set up a MultiDatasetSequence"""
 
@@ -20,9 +22,10 @@
     sequence = MultiDatasetSequence(
         batch_size=50,
         branches=features,
-        shuffle=True,
+        shuffle=shuffle,
         batch_buffer_size=int(1e5),
         weight_expression=weight_expression,
+        scale_features=scale_features,
     )
 
     for dataset in datasets:
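With the two new keyword arguments, callers choose shuffling and the scaling scheme per sequence. A usage sketch; `datasets` is assumed to be a list of `DatasetInfo` objects (e.g. from `load_datasets_bucoffea`), and the feature name is illustrative:

```python
import copy

from vbfml.training.input import build_sequence

training_sequence = build_sequence(
    datasets=copy.deepcopy(datasets),
    features=["mjj"],
    shuffle=True,
    scale_features="standard",  # or "norm" for the image model, "none" to skip
)
validation_sequence = build_sequence(
    datasets=copy.deepcopy(datasets),
    features=["mjj"],
    shuffle=True,
    scale_features="standard",
)
```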