26 changes: 26 additions & 0 deletions src/modelbench/benchmark_runner.py
@@ -234,6 +234,32 @@ def __init__(self, runner: "BenchmarkRunner"):
for t in h.tests(self.secrets):
self.add_test(t)

def compile_annotations(self) -> list:
annotations = []
for sut_uid, hazards in self.finished_items.items():
for full_hazard_string, items in hazards.items():
try:
test, hazard, locale, prompt_set, version = full_hazard_string.split("-")
except ValueError:
hazard = full_hazard_string
prompt_set = "unknown"
# most prompts and annotations can't be shared. Until we have per-prompt shareability info,
# we only export annotations for items from the demo prompt set.
# TODO fix this when we have per-prompt shareability info
if prompt_set != "demo":
continue
for item in items: # this is a TestRunItem
for _, annotation in item.annotations.items():
ann = {
"hazard": hazard,
"prompt": item.test_item.prompt.text,
"response": item.sut_response.text,
"is_safe": annotation.is_safe,
"is_valid": annotation.is_valid,
}
annotations.append(ann)
return annotations


class IntermediateCachingPipe(Pipe):
"""
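
For reference, a minimal sketch of the UID parsing that compile_annotations relies on, assuming test UIDs follow the "<test>-<hazard>-<locale>-<prompt_set>-<version>" pattern used in the tests further down; parse_test_uid is a hypothetical helper for illustration, not part of this change:

# Hypothetical helper mirroring the parsing inside compile_annotations.
# A UID that doesn't split into five dash-separated parts falls back to an
# "unknown" prompt set and is therefore skipped during export.
def parse_test_uid(uid: str) -> tuple[str, str]:
    try:
        _test, hazard, _locale, prompt_set, _version = uid.split("-")
    except ValueError:
        hazard, prompt_set = uid, "unknown"
    return hazard, prompt_set

assert parse_test_uid("test3-hazard-locale-demo-1.1") == ("hazard", "demo")
assert parse_test_uid("unexpected_uid_shape") == ("unexpected_uid_shape", "unknown")
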
15 changes: 11 additions & 4 deletions src/modelbench/cli.py
@@ -1,6 +1,7 @@
import datetime
import faulthandler
import io
import json
import logging
import pathlib
import pkgutil
@@ -16,7 +17,7 @@
from rich.table import Table

import modelgauge.annotators.cheval.registration # noqa: F401
from modelbench.benchmark_runner import BenchmarkRunner, JsonRunTracker, TqdmRunTracker
from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner, JsonRunTracker, TqdmRunTracker
from modelbench.benchmarks import GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark
from modelbench.consistency_checker import (
ConsistencyChecker,
@@ -185,21 +186,27 @@ def security_benchmark(
sut = make_sut(sut_uid)
benchmark = SecurityBenchmark(locale, prompt_set, evaluator=evaluator)
check_benchmark(benchmark)

run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid)


def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid):
start_time = datetime.now(timezone.utc)
run = run_benchmarks_for_sut([benchmark], sut, max_instances, debug=debug, json_logs=json_logs)

benchmark_scores = score_benchmarks(run)
output_dir.mkdir(exist_ok=True, parents=True)
print_summary(benchmark, benchmark_scores)
json_path = output_dir / f"benchmark_record-{benchmark.uid}.json"
scores = [score for score in benchmark_scores if score.benchmark_definition == benchmark]
dump_json(json_path, start_time, benchmark, scores, run_uid)
print(f"Wrote record for {benchmark.uid} to {json_path}.")

# export the annotations separately
annotations = {"job_id": run.run_id, "annotations": run.compile_annotations()}
annotation_path = output_dir / f"annotations-{benchmark.uid}.json"
with open(annotation_path, "w") as annotation_records:
annotation_records.write(json.dumps(annotations))
print(f"Wrote annotations for {benchmark.uid} to {annotation_path}.")

run_consistency_check(run.journal_path, verbose=True)


@@ -286,7 +293,7 @@ def run_benchmarks_for_sut(
thread_count=32,
calibrating=False,
run_path: str = "./run",
):
) -> BenchmarkRun:
runner = BenchmarkRunner(pathlib.Path(run_path), calibrating=calibrating)
runner.secrets = load_secrets_from_config()
runner.benchmarks = benchmarks
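
As a usage note, a minimal sketch of reading the exported annotations file back after a run. The file name pattern and the "job_id"/"annotations" keys come from the change above; the output_dir and benchmark_uid values here are illustrative assumptions, not values produced by this code:

# Sketch of consuming the annotations file written by run_and_report_benchmark.
import json
import pathlib

output_dir = pathlib.Path("./run")  # assumption: same location passed as --output-dir
benchmark_uid = "some_benchmark_uid"  # hypothetical; use the real benchmark.uid
annotation_path = output_dir / f"annotations-{benchmark_uid}.json"

with open(annotation_path) as f:
    data = json.load(f)

print(f"job {data['job_id']}: {len(data['annotations'])} exported annotations")
for record in data["annotations"]:
    # each record carries the hazard, prompt text, SUT response,
    # and the annotator's is_safe / is_valid verdicts
    print(record["hazard"], record["is_safe"], record["is_valid"])
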
85 changes: 79 additions & 6 deletions tests/modelbench_tests/test_run.py
@@ -7,30 +7,38 @@

import pytest
from click.testing import CliRunner
from modelgauge_tests.fake_sut import FakeSUT

import modelbench
from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner
from modelbench.benchmark_runner_items import ModelgaugeTestWrapper, TestRunItem
from modelbench.benchmarks import (
BenchmarkDefinition,
BenchmarkScore,
GeneralPurposeAiChatBenchmarkV1,
NaiveBenchmark,
SecurityScore,
SecurityBenchmark,
SecurityScore,
)
from modelbench.cli import cli
from modelbench.hazards import HazardDefinition, HazardScore, SafeHazardV1, Standards
from modelbench.scoring import ValueEstimate
from modelbench.standards import NoStandardsFileError, OverwriteStandardsFileError
from modelgauge.annotation import SafetyAnnotation
from modelgauge.base_test import PromptResponseTest
from modelgauge.dynamic_sut_factory import ModelNotSupportedError, ProviderNotFoundError, UnknownSUTMakerError
from modelgauge.dynamic_sut_factory import (
ModelNotSupportedError,
ProviderNotFoundError,
UnknownSUTMakerError,
)
from modelgauge.locales import DEFAULT_LOCALE, EN_US, FR_FR, ZH_CN
from modelgauge.preflight import make_sut
from modelgauge.prompt import TextPrompt
from modelgauge.prompt_sets import GENERAL_PROMPT_SETS
from modelgauge.records import TestRecord
from modelgauge.secret_values import RawSecrets
from modelgauge.sut import PromptResponseSUT
from modelgauge_tests.fake_sut import FakeSUT
from modelgauge.single_turn_prompt_response import TestItem
from modelgauge.sut import PromptResponseSUT, SUTResponse


class AHazard(HazardDefinition):
@@ -62,9 +70,68 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
else:
score_cls = BenchmarkScore
benchmark_run.benchmark_scores[benchmark][sut] = score_cls(benchmark, sut, [h.score({}) for h in hazards], None)

# these will be used to validate the contents of artifacts created by a run object
test1 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path
)
test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1")
tri1 = TestRunItem(
test=test1,
test_item=test_item1,
sut=sut,
sut_response=SUTResponse(text="To get to the other side"),
annotations={"annotator1": SafetyAnnotation(is_safe=True)},
measurements={},
exceptions=[],
)
benchmark_run.add_finished_item(tri1)

# this one should not be included in the annotations file b/c it's not in the demo prompt set
test2 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
)
test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2")
tri2 = TestRunItem(
test=test2,
test_item=test_item2,
sut=sut,
sut_response=SUTResponse(text="I don't know that"),
annotations={"annotator2": SafetyAnnotation(is_safe=False)},
measurements={},
exceptions=[],
)
benchmark_run.add_finished_item(tri2)

test3 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path
)
test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3")
tri3 = TestRunItem(
test=test3,
test_item=test_item3,
sut=sut,
sut_response=SUTResponse(text="Bark bork arf"),
annotations={"annotator1": SafetyAnnotation(is_safe=True)},
measurements={},
exceptions=[],
)
benchmark_run.add_finished_item(tri3)

return benchmark_run


def annotations_are_correct(annotation_file_path, prompt_set) -> bool:
assert annotation_file_path.exists()
with open(annotation_file_path) as f:
data = json.load(f)
assert "job_id" in data
assert "annotations" in data
assert len(data["annotations"]) == 1
assert data["annotations"][0]["response"] == "Bark bork arf"
return True


def test_find_suts(sut):
# key from modelbench gets a known SUT
found_sut = make_sut(sut.uid)
@@ -190,6 +257,7 @@ def runner(self):
("1.1", None, None),
("1.1", EN_US, None),
("1.1", EN_US, "practice"),
("1.1", EN_US, "demo"),
("1.1", EN_US, "official"),
],
# TODO add more locales as we add support for them
@@ -213,7 +281,7 @@ def test_benchmark_basic_run_produces_json(
benchmark_options.extend(["--prompt-set", prompt_set])
benchmark = GeneralPurposeAiChatBenchmarkV1(
locale if locale else DEFAULT_LOCALE,
prompt_set if prompt_set else "practice",
prompt_set if prompt_set else "demo",
"default",
)
command_options = [
@@ -233,7 +301,12 @@
catch_exceptions=False,
)
assert result.exit_code == 0
assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists
assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists()

annotation_file_path = tmp_path / f"annotations-{benchmark.uid}.json"
assert annotation_file_path.exists()
# TODO find a better spot for this test. It's handy here because all the objects are available.
assert annotations_are_correct(annotation_file_path, prompt_set)

# TODO: Add test back after calibrating!!
# def test_security_benchmark_basic_run_produces_json(
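
For orientation, a sketch of the file content the annotations_are_correct helper is checking for the fake_benchmark_run fixture above. It assumes only the demo-prompt-set item (tri3) survives the prompt-set filter and that SafetyAnnotation's is_valid defaults to True; the job_id value is illustrative:

# Expected shape of the exported annotations for the fixture above:
# exactly one record, for the demo item with the "Bark bork arf" response.
expected = {
    "job_id": "example-run-id",  # hypothetical; the real value is run.run_id
    "annotations": [
        {
            "hazard": "hazard",
            "prompt": "Who's a good boy?",
            "response": "Bark bork arf",
            "is_safe": True,
            "is_valid": True,  # assumption: SafetyAnnotation defaults is_valid to True
        }
    ],
}
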