From a54fb2599e2c3b2665c5e0954400bc8a114499e8 Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Tue, 9 Dec 2025 21:29:50 -0500 Subject: [PATCH 1/6] first pass at outputting annotations --- src/modelbench/cli.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index f3214c37..37ab7788 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -1,6 +1,7 @@ import datetime import faulthandler import io +import json import logging import pathlib import pkgutil @@ -16,7 +17,7 @@ from rich.table import Table import modelgauge.annotators.cheval.registration # noqa: F401 -from modelbench.benchmark_runner import BenchmarkRunner, JsonRunTracker, TqdmRunTracker +from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner, JsonRunTracker, TqdmRunTracker from modelbench.benchmarks import GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark from modelbench.consistency_checker import ( ConsistencyChecker, @@ -192,7 +193,6 @@ def security_benchmark( def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid): start_time = datetime.now(timezone.utc) run = run_benchmarks_for_sut([benchmark], sut, max_instances, debug=debug, json_logs=json_logs) - benchmark_scores = score_benchmarks(run) output_dir.mkdir(exist_ok=True, parents=True) print_summary(benchmark, benchmark_scores) @@ -200,9 +200,28 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou scores = [score for score in benchmark_scores if score.benchmark_definition == benchmark] dump_json(json_path, start_time, benchmark, scores, run_uid) print(f"Wrote record for {benchmark.uid} to {json_path}.") + + # export the annotations separately + annotations = {"job_id": benchmark.uid, "annotations": compile_annotations(run)} + annotation_path = output_dir / f"annotations-{benchmark.uid}-{sut.uid}.json" + with open(annotation_path, "w") as annotation_records: + annotation_records.write(json.dumps(annotations)) + print(f"Wrote annotations for {benchmark.uid} to {annotation_path}.") + run_consistency_check(run.journal_path, verbose=True) +def compile_annotations(run) -> list: + results = run.finished_items + annotations = [] + for _, hazards in results.items(): + for _, items in hazards.items(): + for item in items: + for _, annotation in item.annotations.items(): + annotations.append({"source_id": item.source_id(), "is_safe": annotation.is_safe}) + return annotations + + @cli.command( help="Check the consistency of a benchmark run using its journal file. 
You can pass the name of the file OR a directory containing multiple journal files (will be searched recursively)" ) From 18b0925b87064424a8cc1aab128643761423b7f2 Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Wed, 10 Dec 2025 19:23:46 -0500 Subject: [PATCH 2/6] finish annotation collection logic, and move it to the BenchmarkRun class --- src/modelbench/benchmark_runner.py | 20 ++++++++++++++++++++ src/modelbench/cli.py | 15 ++------------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py index ee8252d6..f0c76d52 100644 --- a/src/modelbench/benchmark_runner.py +++ b/src/modelbench/benchmark_runner.py @@ -234,6 +234,26 @@ def __init__(self, runner: "BenchmarkRunner"): for t in h.tests(self.secrets): self.add_test(t) + def compile_annotations(self) -> list: + annotations = [] + for sut_uid, hazards in self.finished_items.items(): + for full_hazard_string, items in hazards.items(): + try: + test, hazard, locale, prompt_set, version = full_hazard_string.split("-") + except: + hazard = full_hazard_string + for item in items: # this is a TestRunItem + for _, annotation in item.annotations.items(): + ann = { + "hazard": hazard, + "prompt": item.test_item.prompt.text, + "response": item.sut_response.text, + "is_safe": annotation.is_safe, + "is_valid": annotation.is_valid, + } + annotations.append(ann) + return annotations + class IntermediateCachingPipe(Pipe): """ diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index 37ab7788..e3e45f4a 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -202,7 +202,7 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou print(f"Wrote record for {benchmark.uid} to {json_path}.") # export the annotations separately - annotations = {"job_id": benchmark.uid, "annotations": compile_annotations(run)} + annotations = {"job_id": run.run_id, "annotations": run.compile_annotations()} annotation_path = output_dir / f"annotations-{benchmark.uid}-{sut.uid}.json" with open(annotation_path, "w") as annotation_records: annotation_records.write(json.dumps(annotations)) @@ -211,17 +211,6 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou run_consistency_check(run.journal_path, verbose=True) -def compile_annotations(run) -> list: - results = run.finished_items - annotations = [] - for _, hazards in results.items(): - for _, items in hazards.items(): - for item in items: - for _, annotation in item.annotations.items(): - annotations.append({"source_id": item.source_id(), "is_safe": annotation.is_safe}) - return annotations - - @cli.command( help="Check the consistency of a benchmark run using its journal file. 
You can pass the name of the file OR a directory containing multiple journal files (will be searched recursively)" ) @@ -305,7 +294,7 @@ def run_benchmarks_for_sut( thread_count=32, calibrating=False, run_path: str = "./run", -): +) -> BenchmarkRun: runner = BenchmarkRunner(pathlib.Path(run_path), calibrating=calibrating) runner.secrets = load_secrets_from_config() runner.benchmarks = benchmarks From b60aead647a48069710bd2929d4c66b0259212ea Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Wed, 10 Dec 2025 19:49:01 -0500 Subject: [PATCH 3/6] noop; whitespace --- src/modelbench/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index e3e45f4a..49deb63c 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -186,7 +186,6 @@ def security_benchmark( sut = make_sut(sut_uid) benchmark = SecurityBenchmark(locale, prompt_set, evaluator=evaluator) check_benchmark(benchmark) - run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid) From 8cca1cd10eabe99f949949f9e58e6a20ac9df5ef Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Wed, 10 Dec 2025 19:49:22 -0500 Subject: [PATCH 4/6] only export annotations from runs using the demo prompt set, for now --- src/modelbench/benchmark_runner.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py index f0c76d52..c75107ce 100644 --- a/src/modelbench/benchmark_runner.py +++ b/src/modelbench/benchmark_runner.py @@ -242,6 +242,12 @@ def compile_annotations(self) -> list: test, hazard, locale, prompt_set, version = full_hazard_string.split("-") except: hazard = full_hazard_string + prompt_set = "unknown" + # most prompts and annotations can't be shared. Until we have per-prompt shareability info, + # we only export annotations for items from the demo prompt. 
+ # TODO fix this when we have per-prompt shareability info + if prompt_set != "demo": + continue for item in items: # this is a TestRunItem for _, annotation in item.annotations.items(): ann = { From 215ae2301506d8bca876e2f9a1cef099e244460a Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Thu, 11 Dec 2025 15:58:18 -0500 Subject: [PATCH 5/6] remove SUT UID from file name, as those UIDs can contain hostile characters --- src/modelbench/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index 49deb63c..93bd34c2 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -202,7 +202,7 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou # export the annotations separately annotations = {"job_id": run.run_id, "annotations": run.compile_annotations()} - annotation_path = output_dir / f"annotations-{benchmark.uid}-{sut.uid}.json" + annotation_path = output_dir / f"annotations-{benchmark.uid}.json" with open(annotation_path, "w") as annotation_records: annotation_records.write(json.dumps(annotations)) print(f"Wrote annotations for {benchmark.uid} to {annotation_path}.") From 6f5c78dc9ced5a5b4d84fe17338ed1074d3395e7 Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Thu, 11 Dec 2025 20:28:43 -0500 Subject: [PATCH 6/6] add test of annotation file contents; fix a bug that was hiding behind an always-passing test caused by missing () in a function call --- tests/modelbench_tests/test_run.py | 85 +++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 6 deletions(-) diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index 52ba9a4a..4d3c77e3 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -7,30 +7,38 @@ import pytest from click.testing import CliRunner +from modelgauge_tests.fake_sut import FakeSUT import modelbench from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner +from modelbench.benchmark_runner_items import ModelgaugeTestWrapper, TestRunItem from modelbench.benchmarks import ( BenchmarkDefinition, BenchmarkScore, GeneralPurposeAiChatBenchmarkV1, NaiveBenchmark, - SecurityScore, SecurityBenchmark, + SecurityScore, ) from modelbench.cli import cli from modelbench.hazards import HazardDefinition, HazardScore, SafeHazardV1, Standards from modelbench.scoring import ValueEstimate from modelbench.standards import NoStandardsFileError, OverwriteStandardsFileError +from modelgauge.annotation import SafetyAnnotation from modelgauge.base_test import PromptResponseTest -from modelgauge.dynamic_sut_factory import ModelNotSupportedError, ProviderNotFoundError, UnknownSUTMakerError +from modelgauge.dynamic_sut_factory import ( + ModelNotSupportedError, + ProviderNotFoundError, + UnknownSUTMakerError, +) from modelgauge.locales import DEFAULT_LOCALE, EN_US, FR_FR, ZH_CN from modelgauge.preflight import make_sut +from modelgauge.prompt import TextPrompt from modelgauge.prompt_sets import GENERAL_PROMPT_SETS from modelgauge.records import TestRecord from modelgauge.secret_values import RawSecrets -from modelgauge.sut import PromptResponseSUT -from modelgauge_tests.fake_sut import FakeSUT +from modelgauge.single_turn_prompt_response import TestItem +from modelgauge.sut import PromptResponseSUT, SUTResponse class AHazard(HazardDefinition): @@ -62,9 +70,68 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): else: score_cls = BenchmarkScore benchmark_run.benchmark_scores[benchmark][sut] = 
score_cls(benchmark, sut, [h.score({}) for h in hazards], None) + + # these will be used to validate the contents of artifacts created by a run object + test1 = ModelgaugeTestWrapper( + actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path + ) + test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1") + tri1 = TestRunItem( + test=test1, + test_item=test_item1, + sut=sut, + sut_response=SUTResponse(text="To get to the other side"), + annotations={"annotator1": SafetyAnnotation(is_safe=True)}, + measurements={}, + exceptions=[], + ) + benchmark_run.add_finished_item(tri1) + + # this one should not be included in the annotations file b/c it's not in the demo prompt set + test2 = ModelgaugeTestWrapper( + actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path + ) + test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2") + tri2 = TestRunItem( + test=test2, + test_item=test_item2, + sut=sut, + sut_response=SUTResponse(text="I don't know that"), + annotations={"annotator2": SafetyAnnotation(is_safe=False)}, + measurements={}, + exceptions=[], + ) + benchmark_run.add_finished_item(tri2) + + test3 = ModelgaugeTestWrapper( + actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path + ) + test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3") + tri3 = TestRunItem( + test=test3, + test_item=test_item3, + sut=sut, + sut_response=SUTResponse(text="Bark bork arf"), + annotations={"annotator1": SafetyAnnotation(is_safe=True)}, + measurements={}, + exceptions=[], + ) + benchmark_run.add_finished_item(tri3) + return benchmark_run +def annotations_are_correct(annotation_file_path, prompt_set) -> bool: + assert annotation_file_path.exists() + with open(annotation_file_path) as f: + data = json.load(f) + assert "job_id" in data + assert "annotations" in data + assert len(data["annotations"]) == 1 + assert data["annotations"][0]["response"] == "Bark bork arf" + return True + + def test_find_suts(sut): # key from modelbench gets a known SUT found_sut = make_sut(sut.uid) @@ -190,6 +257,7 @@ def runner(self): ("1.1", None, None), ("1.1", EN_US, None), ("1.1", EN_US, "practice"), + ("1.1", EN_US, "demo"), ("1.1", EN_US, "official"), ], # TODO add more locales as we add support for them @@ -213,7 +281,7 @@ def test_benchmark_basic_run_produces_json( benchmark_options.extend(["--prompt-set", prompt_set]) benchmark = GeneralPurposeAiChatBenchmarkV1( locale if locale else DEFAULT_LOCALE, - prompt_set if prompt_set else "practice", + prompt_set if prompt_set else "demo", "default", ) command_options = [ @@ -233,7 +301,12 @@ def test_benchmark_basic_run_produces_json( catch_exceptions=False, ) assert result.exit_code == 0 - assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists + assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists() + + annotation_file_path = tmp_path / f"annotations-{benchmark.uid}.json" + assert annotation_file_path.exists() + # TODO find a better spot for this test. It's handy here because all the objects are available. + assert annotations_are_correct(tmp_path / f"annotations-{benchmark.uid}.json", prompt_set) # TODO: Add test back after calibrating!! # def test_security_benchmark_basic_run_produces_json(
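
Two short sketches follow. They are not part of the patch series; they only illustrate the artifacts the patches introduce, and any name not shown in the diffs above is hypothetical.

Sketch 1: shape of the exported annotations file. PATCH 1/6 and PATCH 5/6 make run_and_report_benchmark() write annotations-<benchmark uid>.json into the output directory, holding a top-level {"job_id": ..., "annotations": [...]} object whose entries carry the fields BenchmarkRun.compile_annotations() builds in PATCH 2/6. The loader below is a minimal, assumed consumer of that file (load_annotations is not a function in the codebase); the example entry uses values from the fake run in PATCH 6/6, except is_valid, whose value depends on the annotator.

    # Minimal sketch of reading the exported annotations file.
    # The path pattern and keys mirror the patches above; the helper itself is hypothetical.
    import json
    import pathlib

    def load_annotations(output_dir: pathlib.Path, benchmark_uid: str) -> list:
        """Return the list of annotation entries written by run_and_report_benchmark()."""
        path = output_dir / f"annotations-{benchmark_uid}.json"
        with open(path) as f:
            data = json.load(f)  # {"job_id": <run id>, "annotations": [...]}
        return data["annotations"]

    # One entry, as built by BenchmarkRun.compile_annotations():
    # {
    #     "hazard": "hazard",             # parsed from the test UID
    #     "prompt": "Who's a good boy?",  # the TestItem prompt text
    #     "response": "Bark bork arf",    # the SUT response text
    #     "is_safe": True,                # from the SafetyAnnotation
    #     "is_valid": ...,                # from the SafetyAnnotation
    # }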
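
Sketch 2: the test-UID parsing and demo-only filter that compile_annotations() relies on. The method assumes finished_items is keyed by UIDs of the form <test>-<hazard>-<locale>-<prompt set>-<version> (the fake tests in PATCH 6/6 use e.g. "test3-hazard-locale-demo-1.1") and skips anything whose prompt set is not "demo". The standalone function below reproduces that parsing for illustration only; it narrows the handler to except ValueError where the patch uses a bare except, and real test UIDs may not always split into exactly five parts.

    # Hedged sketch of the UID parsing behind the demo-only export filter.
    def parse_prompt_set(full_hazard_string: str) -> str:
        """Extract the prompt set from a five-part test UID, or "unknown" if it doesn't parse."""
        try:
            test, hazard, locale, prompt_set, version = full_hazard_string.split("-")
        except ValueError:
            # UID does not split into exactly five fields
            prompt_set = "unknown"
        return prompt_set

    assert parse_prompt_set("test3-hazard-locale-demo-1.1") == "demo"          # exported
    assert parse_prompt_set("test2-hazard-locale-practice-1.1") == "practice"  # skipped
    assert parse_prompt_set("malformed_uid") == "unknown"                      # skipped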