legend-exp · gipert · Jan 2, 2026 · Jan 2, 2026 · Jan 2, 2026 · Jan 2, 2026
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@
 workflow/src/legendsimflow/_version.py
 .snakemake
 /inputs
+/simflow-config.yaml
 generated
 tests/dummyprod/inputs/hardware
 tests/dummyprod/inputs/datasets

diff --git a/docs/source/manual/prod.md b/docs/source/manual/prod.md
@@ -74,18 +74,21 @@ in the configuration file) for each simulation. Once the production is over, the
 results can be summarized via the `print_benchmark_stats` rule:
 
 ```console
-> snakemake -q all print_benchmark_stats
-simid                                       CPU time [ms/ev]  evts / 1h  jobs (1h) / 10^8 evts
------                                       ----------------  ---------  ---------------------
-stp.birds-nest-K40                                (13s) 2.79    1288475                     77
-stp.birds-nest-Ra224-to-Pb208                   (191s) 38.33      93916                   1064
-stp.fiber-support-copper-Co60                   (223s) 44.69      80558                   1241
-...                                                       ...        ...                    ...
+> snakemake print_benchmark_stats
+simid                                runtime [sec]  speed (hot loop) [ev/sec]  evts / 1h  jobs (1h) / 10^8 evts
+-----                                -------------  -------------------------  ---------  ---------------------
+stp.sis1_z8430_slot2_Bi212_to_Pb208          139.0                     717.70    2583720                     38
+stp.sis1_z8580_slot2_Pb214_to_Po214          167.0                     596.99    2149164                     46
+stp.sis1_z8630_slot2_Bi212_to_Pb208          135.0                     740.46    2665656                     37
+...                                            ...                        ...        ...                    ...
 ```
 
+This rule computes the statistics by inspecting the `stp`-tier (_remage_) logs.
+
 :::{note}
 
-The CPU time is a good measure of the actual simulation time, since other tasks
-(e.g. application loading) are typically not CPU intensive.
+The benchmarking statistics refer exclusively to the hot Geant4 simulation loop.
+Overheads such as application initialization or remage built-in post-processing
+are not taken into account.
 
 :::
diff --git a/docs/source/manual/setup.md b/docs/source/manual/setup.md
@@ -1,9 +1,23 @@
 # Installation and configuration
 
+Clone [legend-simflow](https://github.com/legend-exp/legend-simflow) and give it
+a custom name:
+
+```console
+> git clone git@github.com:legend-exp/legend-simflow <path/to/prod/cycle>
+```
+
+We recommend tagging the production cycle with a version number to be used as
+folder name (e.g. `path/to/productions/v1.0.0`).
+
+Before a simulation production can be run, the user must configure the run with
+a dedicated file and install the required software dependencies.
+
 ## The configuration file
 
-The `simflow-config.yaml` file in the production directory allows to customize
-the workflow in great detail. Here's a basic description of its fields:
+The `simflow-config.yaml` file resides in the production directory (the root of
+the GitHub repository) and allows you to customize the workflow in great detail.
+Here's a basic description of its fields:
 
 - `experiment`: labels the experiment to be simulated. The same name is used in
   the metadata to label the corresponding configuration files.
@@ -34,6 +48,15 @@ Snakemake's `--config` option.
 
 :::
 
+For a quick start, just copy over the default configuration file from the
+templates:
+
+```console
+> cp templates/default.yaml simflow-config.yaml
+```
+
+and customize it.
+
 ## Software dependencies
 
 The first step is obtaining the software, which is fully specified by the

diff --git a/pyproject.toml b/pyproject.toml
@@ -171,29 +171,29 @@ legend-simflow = { path = ".", editable = true }
 [tool.pixi.dependencies]
 # legend-simflow core dependencies
 awkward = "*"
-dbetto = ">=1.3.2"
+dbetto = ">=1.3.2,<1.4"
 # legend-dataflow-scripts = "..."  # not on conda-forge
 legend-pydataobj = "*"
-legend-pygeom-l200 = ">=0.8"
-legend-pygeom-tools = ">=0.1"
+legend-pygeom-l200 = ">=0.8,<0.9"
+legend-pygeom-tools = ">=0.1,<0.2"
 numpy = "*"
-pylegendmeta = ">=1.3.3"
+pylegendmeta = ">=1.3.3,<1.4"
 
 # execution
-snakemake = ">=8.16"
+snakemake = ">=8.16,<9"
 snakemake-storage-plugin-fs = "*"
 
 # tier hit
-legend-pygeom-hpges = ">=0.9"
+legend-pygeom-hpges = ">=0.9,<0.10"
 pyg4ometry = "*"
-pygama = ">=2.2.3"
-reboost = ">=0.8.3"
+pygama = ">=2.2.3,<2.3"
+reboost = ">=0.8.3,<0.9"
 
 # tier stp
 remage = ">=0.16,<0.17"
 
 # drift-time maps and other SSD.jl jobs
-julia = ">=1.12"
+julia = ">=1.12,<1.13"
 h5py = "*"
 hdf5 = "*"
-revertex = ">=0.1.2"
+revertex = ">=0.1.2,<0.2"
diff --git a/simflow-config.yaml → templates/default.yaml b/simflow-config.yaml → templates/default.yaml
diff --git a/workflow/src/legendsimflow/metadata.py b/workflow/src/legendsimflow/metadata.py
@@ -15,7 +15,6 @@
 
 from __future__ import annotations
 
-import hashlib
 import json
 import logging
 import re
@@ -80,8 +79,10 @@ def hash_dict(d: dict | AttrsDict) -> str:
     if isinstance(d, AttrsDict):
         d = d.to_dict()
 
-    s = json.dumps(d, sort_keys=True)
-    return hashlib.sha256(s.encode()).hexdigest()
+    return json.dumps(d, sort_keys=True)
+
+    # NOTE: alternatively, return sha256 (shorter string but bad for diffs)
+    # return hashlib.sha256(s.encode()).hexdigest()
 
 
 def smk_hash_simconfig(

diff --git a/workflow/src/legendsimflow/scripts/print_benchmark_stats.py b/workflow/src/legendsimflow/scripts/print_benchmark_stats.py
@@ -1,3 +1,5 @@
+# ruff: noqa: I002, T201
+
 # Copyright (C) 2023 Luigi Pertoldi <[email protected]>
 #
 # This program is free software: you can redistribute it and/or modify it under
@@ -13,40 +15,99 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-# ruff: noqa: F821, T201
-from __future__ import annotations
-
-import csv
-from pathlib import Path
+import re
+from datetime import timedelta
+from statistics import mean
 
-import legenddataflowscripts as ldfs
+from legendsimflow import nersc
 
 
 def printline(*line):
-    print("{:<52}{:>16}{:>11}{:>23}".format(*line))
+    print("{:<52}{:>16}{:>27}{:>11}{:>23}".format(*line))
+
+
+args = nersc.dvs_ro_snakemake(snakemake)  # noqa: F821
+
+speed_pattern = re.compile(
+    r"^.*Stats: average event processing time was\s+"
+    r"([0-9]+(?:\.[0-9]+)?)\s+seconds/event\s+=\s+"
+    r"([0-9]+(?:\.[0-9]+)?)\s+events/second\s*$",
+    re.MULTILINE,
+)
+
+nev_pattern = re.compile(
+    r"^.*Run nr\. \d+ completed\. (\d+) events simulated\.", re.MULTILINE
+)
 
+time_pattern = re.compile(
+    r"^.*Stats: run time was (\d+) days, (\d+) hours, (\d+) minutes and (\d+) seconds$",
+    re.MULTILINE,
+)
 
-printline("simid", "CPU time [ms/ev]", "evts / 1h", "jobs (1h) / 10^8 evts")
-printline("-----", "----------------", "---------", "---------------------")
+# have a look at the latest run
+logdir = (nersc.dvs_ro(args.config, args.config.paths.log) / "benchmark").resolve()
 
-bdir = Path(ldfs.as_ro(snakemake.config, snakemake.config.paths.benchmarks))
+if not logdir.is_dir():
+    msg = "no benchmark run available!"
+    raise RuntimeError(msg)
 
-for simd in sorted(bdir.glob("*/*")):
-    if simd.parent.name not in ("ver", "stp"):
+printline(
+    "simid",
+    "runtime [sec]",
+    "speed (hot loop) [ev/sec]",
+    "evts / 1h",
+    "jobs (1h) / 10^8 evts",
+)
+printline(
+    "-----",
+    "-------------",
+    "-------------------------",
+    "---------",
+    "---------------------",
+)
+
+for simd in sorted(logdir.glob("*/*")):
+    # this code works only for remage output
+    if simd.parent.name != "stp":
         continue
 
-    data = {"cpu_time": 0}
-    for jobd in simd.glob("*.tsv"):
-        with jobd.open(newline="") as f:
-            this_data = next(iter(csv.DictReader(f, delimiter="\t")))
-            data["cpu_time"] += float(this_data["cpu_time"])
+    speed = 0
+    runtime = 0
+    for jobd in simd.glob("*.log"):
+        with jobd.open("r", encoding="utf-8") as f:
+            # read the full file into memory (assuming it can't be huge)
+            data = f.read()
+
+            # extract events/sec for each thread
+            time = [
+                float(m.group(2)) for m in speed_pattern.finditer(data) if m is not None
+            ]
+
+            # simulations might have crashed or might still be running
+            if time == []:
+                runtime = "..."
+                speed = "..."
+
+            # get the number of simulated events for each thread (it's always the same)
+            nev = int(nev_pattern.search(data).group(1))
+
+            # get the runtime of each thread
+            runtimes = [
+                timedelta(
+                    days=int(d), hours=int(h), minutes=int(mi), seconds=int(s)
+                ).total_seconds()
+                for d, h, mi, s in time_pattern.findall(data)
+            ]
+
+            runtime = mean(runtimes)
+            speed += mean(time)
 
-    speed = data["cpu_time"] / snakemake.config.benchmark.n_primaries[simd.parent.name]
-    evts_1h = int(60 * 60 / speed) if speed > 0 else "..."
+    evts_1h = int(speed * 60 * 60) if speed > 0 else "..."
     njobs = int(1e8 / evts_1h) if not isinstance(evts_1h, str) else 0
     printline(
         simd.parent.name + "." + simd.name,
-        "({:}s) {:.2f}".format(int(data["cpu_time"]), 1000 * speed),
+        ("!!! " if runtime < 10 else "") + f"{runtime:.1f}",
+        f"{speed:.2f}",
         evts_1h,
         njobs,
     )
diff --git a/workflow/src/legendsimflow/utils.py b/workflow/src/legendsimflow/utils.py
@@ -91,11 +91,17 @@ def _make_path(d):
     # I have verified only that this variable is visible in scripts (not shell directives)
     os.environ["MPLCONFIGDIR"] = f"{workflow.basedir}/src/legendsimflow"
 
+    proctime = (
+        "benchmark"
+        if config.benchmark.enabled
+        else datetime.now().strftime("%Y%m%dT%H%M%SZ")
+    )
+
     return AttrsDict(
         {
             "config": config,
             "basedir": workflow.basedir,
-            "proctime": datetime.now().strftime("%Y%m%dT%H%M%SZ"),
+            "proctime": proctime,
         }
     )