8 changes: 8 additions & 0 deletions .gitignore
@@ -3,6 +3,14 @@
!workspaces/CMS_project_v1/CMS_project_v1/config/CMS_project_v1_config.py
!workspaces/CFD_workspace/CFD_project_animation/config/CFD_project_animation_config.py

# This directory is used to store libraries that are built on the target machine and are not
# intended to be shared across different machines (e.g. SZ3).
/external/*

# Exclude results tracking files
green_code_tracking.txt
Suggested change
-green_code_tracking.txt
+*.txt
+*.npz
+*.dat
+*.png
+*.root
+*.jpg
+*.jpeg
+*.log

Are there any .txt files we would want to track? Maybe a catch-all for results/data/log files would be better.
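A directory-based catch-all is another option, sketched here under the assumption that benchmark results land in a dedicated results/ directory (a hypothetical layout, not one this PR creates):

# Hypothetical: ignore everything under results/ but keep the directory tracked
results/*
!results/.gitkeep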

compression_comparison_results.txt

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
223 changes: 209 additions & 14 deletions baler/baler.py
@@ -1,4 +1,4 @@
-# Copyright 2022 Baler Contributors
+# Copyright 2022-2025 Baler Contributors

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -15,13 +15,13 @@
import os
import time
from math import ceil

import numpy as np

from .modules import helper
+from .modules import compare
import gzip
-from .modules.profiling import pytorch_profile

+# from .modules.profiling import pytorch_profile
+import blosc2

__all__ = (
"perform_compression",
@@ -30,6 +30,7 @@
"perform_plotting",
"perform_training",
"print_info",
"perform_comparison",
)


@@ -38,10 +39,13 @@ def main():

- if --mode=newProject: call `helper.create_new_project` and create a new project subdirectory with a config file
- if --mode=train: call `perform_training` and train the network on the given data according to the config file, checking whether profilers are enabled
- if --mode=diagnose: call `perform_diagnostics` and diagnose the training process by plotting the activations of the layers
- if --mode=compress: call `perform_compression` and compress the given data using the model trained in `--mode=train`
- if --mode=decompress: call `perform_decompression` and decompress the compressed file output by `--mode=compress`
- if --mode=plot: call `perform_plotting` and plot the comparison between the original data and the decompressed data from `--mode=decompress`, along with the loss curve from the trained network
- if --mode=info: call `print_info` and print information about the compression ratio and file sizes
- if --mode=convert_with_hls4ml: call `helper.perform_hls4ml_conversion` and create an hls4ml project containing the converted model
- if --mode=compare: call `perform_comparison` and benchmark the Baler model against non-AE lossy compression algorithms


Raises:
@@ -60,7 +64,7 @@ def main():
if mode == "newProject":
helper.create_new_project(workspace_name, project_name, verbose)
elif mode == "train":
-perform_training(output_path=output_path, config=config, verbose=verbose)
+perform_training(output_path, config, project_name, verbose)
Are we sure project_name is always provided? Do we have a default? Just thinking this would break compatibility with an old script if there isn't a default and this isn't defined.
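A minimal sketch of how a default could preserve backward compatibility, assuming a None default is acceptable; the fallback label below is hypothetical, not part of this PR:

def perform_training(output_path, config, project_name=None, verbose: bool = False):
    # Older call sites that omit project_name keep working; fall back to a
    # generic label for the tracking title.
    label = project_name if project_name is not None else "unnamed_project"
    print(f"{label} - Model Training")  # stand-in for helper.green_code_tracking(title=...)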

elif mode == "diagnose":
perform_diagnostics(output_path, verbose)
elif mode == "compress":
@@ -73,6 +77,8 @@
print_info(output_path, config)
elif mode == "convert_with_hls4ml":
helper.perform_hls4ml_conversion(output_path, config)
elif mode == "compare":
perform_comparison(output_path, config, project_name, verbose)
else:
raise NameError(
"Baler mode "
@@ -81,7 +87,7 @@
)


-def perform_training(output_path, config, verbose: bool):
+def perform_training(output_path, config, project_name, verbose: bool):
"""Main function calling the training functions, ran when --mode=train is selected.
The three functions called are: `helper.process`, `helper.mode_init` and `helper.training`.

@@ -95,6 +101,7 @@ def perform_training(output_path, config, verbose: bool):
Raises:
NameError: Baler currently only supports 1D (e.g. HEP) or 2D (e.g. CFD) data as inputs.
"""
green_code_timer_start = time.perf_counter()
(
train_set_norm,
test_set_norm,
@@ -206,6 +213,13 @@ def perform_training(output_path, config, verbose: bool):
print("\nThe model has the following structure:")
print(model.type)

green_code_timer_end = time.perf_counter()
helper.green_code_tracking(
start=green_code_timer_start,
end=green_code_timer_end,
title=f"{project_name} - Model Training",
)


def perform_diagnostics(project_path, verbose: bool):
output_path = os.path.join(project_path, "plotting")
@@ -255,7 +269,7 @@ def perform_compression(output_path, config, verbose: bool):
- Normalization features if `config.apply_normalization=True`
"""
print("Compressing...")
-start = time.time()
+start = time.perf_counter()
normalization_features = []

if config.apply_normalization:
@@ -283,9 +297,9 @@
config=config,
)

-end = time.time()
+end = time.perf_counter()

-print("Compression took:", f"{(end - start) / 60:.3} minutes")
+print("Compression took:", f"{(end - start):.4f} seconds")

names = np.load(config.input_path)["names"]

@@ -351,7 +365,7 @@ def perform_decompression(output_path, config, verbose: bool):
"""
print("Decompressing...")

-start = time.time()
+start = time.perf_counter()
model_name = config.model_name
data_before = np.load(config.input_path)["data"]
if config.separate_model_saving:
@@ -434,8 +448,8 @@ def perform_decompression(output_path, config, verbose: bool):
except AttributeError:
pass

-end = time.time()
-print("Decompression took:", f"{(end - start) / 60:.3} minutes")
+end = time.perf_counter()
+print("Decompression took:", f"{(end - start):.4f} seconds")

if config.extra_compression:
if verbose:
@@ -478,10 +492,16 @@ def print_info(output_path, config):

meta_data = [
model,
os.path.join(training_path, "loss_data.npy"),
os.path.join(training_path, "normalization_features.npy"),
]

loss_path = os.path.join(training_path, "loss_data.npy")
if os.path.exists(loss_path):
meta_data.append(os.path.join(training_path, "loss_data.npy"))

norm_path = os.path.join(training_path, "normalization_features.npy")
if os.path.exists(norm_path):
meta_data.append(os.path.join(training_path, "normalization_features.npy"))

meta_data_stats = [
os.stat(meta_data[file]).st_size / (1024 * 1024)
for file in range(len(meta_data))
@@ -508,3 +528,178 @@ def print_info(output_path, config):
print("\n ==================================")

## TODO: Add a way to print how much the data has been distorted


def perform_comparison(output_path, config, project_name, verbose):
"""
Runs a series of compression benchmarks and prints a summary table.
This function orchestrates a comparison between the project's Baler model
and other standard compression algorithms like Downcasting, ZFP, Blosc2,
and SZ3. It loads the original data, sets up each benchmark with specific
configurations, and then executes them sequentially.
After running all benchmarks, it collects performance metrics such as
compression ratio, reconstruction error (RMSE, Max Error), PSNR, and
compression/decompression times. Finally, it sorts the results by RMSE
and prints a formatted summary table to the console for easy comparison.
Args:
output_path (str): The base directory path where benchmark outputs,
including compressed files, will be stored. Each benchmark may
create its own subdirectory within this path.
config (object): A configuration object containing project settings,
such as the input data path (`config.input_path`) and the Baler
model name (`config.model_name`).
verbose (bool): A flag passed to the compression and decompression
functions to control the verbosity of their output.
"""

green_code_timer_start = time.perf_counter()

original_path = config.input_path
original_npz = np.load(original_path)
data_original = original_npz["data"]
names_original = original_npz["names"]

# Calculate original file size for compression ratio calculation
try:
original_size_bytes = os.path.getsize(original_path)
original_size_mb = original_size_bytes / (1024 * 1024)
print(f"Original input file size: {original_size_mb:.3f} MB ({original_path})")
except FileNotFoundError:
print(
f"Warning: Could not find original file at {original_path} to calculate compression ratio."
)
original_size_mb = 0

# Define all the benchmarks we want to run
benchmarks_to_run = []

# 1. Baler
benchmarks_to_run.append(
compare.BalerBenchmark(
name=f"Baler ({config.model_name})",
output_path=output_path, # Baler uses the main project output path
compress_func=lambda: perform_compression(output_path, config, verbose),
decompress_func=lambda: perform_decompression(output_path, config, verbose),
data_original=data_original,
names_original=names_original,
)
)


# 2. Downcast float32 - only valid when using float64 inputs
if config.float_dtype != "float32":
    benchmarks_to_run.append(
        compare.DowncastBenchmark(
            output_dir=os.path.join(output_path, "downcast_float32"),
            data_original=data_original,
            names_original=names_original,
            target_dtype=np.float32,
        )
    )

# 3a. ZFP using 'precision' mode (the original test)
# The 'precision' parameter specifies the number of uncompressed bits to keep.
# A higher precision means lower compression but better quality.
zfp_precision = 22
benchmarks_to_run.append(
compare.ZFPBenchmark(
output_dir=os.path.join(output_path, f"zfp_prec{zfp_precision}"),
data_original=data_original,
names_original=names_original,
zfp_params={"precision": zfp_precision},
)
)

# 3b. ZFP using 'rate' mode
# The 'rate' parameter specifies a fixed size budget in bits per value.
# A smaller rate means higher compression.
zfp_rate = 8.0
benchmarks_to_run.append(
compare.ZFPBenchmark(
output_dir=os.path.join(output_path, f"zfp_rate{zfp_rate}"),
data_original=data_original,
names_original=names_original,
zfp_params={"rate": zfp_rate},
)
)

# 3c. ZFP using 'tolerance' mode
# The 'tolerance' parameter specifies the maximum allowed error in the compressed data.
# A larger tolerance allows more error and therefore higher compression.
zfp_tolerance = 1e-3
benchmarks_to_run.append(
compare.ZFPBenchmark(
output_dir=os.path.join(output_path, f"zfp_tol{zfp_tolerance}"),
data_original=data_original,
names_original=names_original,
zfp_params={"tolerance": zfp_tolerance},
)
)

# 4a. Blosc2 with LZ4 (Lossy, Fast)
blosc_trunc_prec = 18
benchmarks_to_run.append(
compare.BloscBenchmark(
name=f"Blosc2-LZ4(prec={blosc_trunc_prec})",
output_dir=os.path.join(output_path, f"blosc2_lz4_prec{blosc_trunc_prec}"),
data_original=data_original,
names_original=names_original,
cparams={
"filters": [blosc2.Filter.TRUNC_PREC, blosc2.Filter.SHUFFLE],
"filters_meta": [blosc_trunc_prec, 0],
"codec": blosc2.Codec.LZ4,
"clevel": 5,
},
)
)

# 4b. Blosc2 with ZSTD (Lossy, Aggressive Compression)
blosc_zstd_clevel = 9
benchmarks_to_run.append(
compare.BloscBenchmark(
name=f"Blosc2-ZSTD(L{blosc_zstd_clevel})",
output_dir=os.path.join(
output_path, f"blosc2_zstd_lossy_l{blosc_zstd_clevel}"
),
data_original=data_original,
names_original=names_original,
cparams={
"filters": [blosc2.Filter.TRUNC_PREC, blosc2.Filter.BITSHUFFLE],
"filters_meta": [blosc_trunc_prec, 0],
"codec": blosc2.Codec.ZSTD,
"clevel": blosc_zstd_clevel,
},
)
)

# TODO Implement SZ3 Benchmark

# --- Run all benchmarks and collect results ---
all_results = []
for benchmark in benchmarks_to_run:
result = benchmark.run()
all_results.append(result)

compare.output_benchmark_results(
original_size_mb, all_results, project_name, output_path, verbose=verbose
)

if all_results:
## helper.plot_comparison_summary(all_results, output_path, original_size_mb)
helper.plot_all_results(output_path, config)

green_code_timer_end = time.perf_counter()

with open("compression_comparison_results.txt", "a") as f:
f.write(
f"Total time taken: {green_code_timer_end - green_code_timer_start:.3f} seconds\n"
)

helper.green_code_tracking(
start=green_code_timer_start,
end=green_code_timer_end,
title=f"{project_name} - Compression Comparison",
verbose=verbose,
)
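For reference, a hypothetical invocation of the new mode, assuming Baler's existing --project <workspace> <project> and --mode CLI flags (the exact entry point may differ):

poetry run baler --project CFD_workspace CFD_project_animation --mode compare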