From a54fb2599e2c3b2665c5e0954400bc8a114499e8 Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Tue, 9 Dec 2025 21:29:50 -0500 Subject: [PATCH 1/6] first pass at outputting annotations --- src/modelbench/cli.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index f3214c37..37ab7788 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -1,6 +1,7 @@ import datetime import faulthandler import io +import json import logging import pathlib import pkgutil @@ -16,7 +17,7 @@ from rich.table import Table import modelgauge.annotators.cheval.registration # noqa: F401 -from modelbench.benchmark_runner import BenchmarkRunner, JsonRunTracker, TqdmRunTracker +from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner, JsonRunTracker, TqdmRunTracker from modelbench.benchmarks import GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark from modelbench.consistency_checker import ( ConsistencyChecker, @@ -192,7 +193,6 @@ def security_benchmark( def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid): start_time = datetime.now(timezone.utc) run = run_benchmarks_for_sut([benchmark], sut, max_instances, debug=debug, json_logs=json_logs) - benchmark_scores = score_benchmarks(run) output_dir.mkdir(exist_ok=True, parents=True) print_summary(benchmark, benchmark_scores) @@ -200,9 +200,28 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou scores = [score for score in benchmark_scores if score.benchmark_definition == benchmark] dump_json(json_path, start_time, benchmark, scores, run_uid) print(f"Wrote record for {benchmark.uid} to {json_path}.") + + # export the annotations separately + annotations = {"job_id": benchmark.uid, "annotations": compile_annotations(run)} + annotation_path = output_dir / f"annotations-{benchmark.uid}-{sut.uid}.json" + with open(annotation_path, "w") as annotation_records: + annotation_records.write(json.dumps(annotations)) + print(f"Wrote annotations for {benchmark.uid} to {annotation_path}.") + run_consistency_check(run.journal_path, verbose=True) +def compile_annotations(run) -> list: + results = run.finished_items + annotations = [] + for _, hazards in results.items(): + for _, items in hazards.items(): + for item in items: + for _, annotation in item.annotations.items(): + annotations.append({"source_id": item.source_id(), "is_safe": annotation.is_safe}) + return annotations + + @cli.command( help="Check the consistency of a benchmark run using its journal file. 
You can pass the name of the file OR a directory containing multiple journal files (will be searched recursively)" ) From 18b0925b87064424a8cc1aab128643761423b7f2 Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Wed, 10 Dec 2025 19:23:46 -0500 Subject: [PATCH 2/6] finish annotation collection logic, and move it to the BenchmarkRun class --- src/modelbench/benchmark_runner.py | 20 ++++++++++++++++++++ src/modelbench/cli.py | 15 ++------------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py index ee8252d6..f0c76d52 100644 --- a/src/modelbench/benchmark_runner.py +++ b/src/modelbench/benchmark_runner.py @@ -234,6 +234,26 @@ def __init__(self, runner: "BenchmarkRunner"): for t in h.tests(self.secrets): self.add_test(t) + def compile_annotations(self) -> list: + annotations = [] + for sut_uid, hazards in self.finished_items.items(): + for full_hazard_string, items in hazards.items(): + try: + test, hazard, locale, prompt_set, version = full_hazard_string.split("-") + except: + hazard = full_hazard_string + for item in items: # this is a TestRunItem + for _, annotation in item.annotations.items(): + ann = { + "hazard": hazard, + "prompt": item.test_item.prompt.text, + "response": item.sut_response.text, + "is_safe": annotation.is_safe, + "is_valid": annotation.is_valid, + } + annotations.append(ann) + return annotations + class IntermediateCachingPipe(Pipe): """ diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index 37ab7788..e3e45f4a 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -202,7 +202,7 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou print(f"Wrote record for {benchmark.uid} to {json_path}.") # export the annotations separately - annotations = {"job_id": benchmark.uid, "annotations": compile_annotations(run)} + annotations = {"job_id": run.run_id, "annotations": run.compile_annotations()} annotation_path = output_dir / f"annotations-{benchmark.uid}-{sut.uid}.json" with open(annotation_path, "w") as annotation_records: annotation_records.write(json.dumps(annotations)) @@ -211,17 +211,6 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou run_consistency_check(run.journal_path, verbose=True) -def compile_annotations(run) -> list: - results = run.finished_items - annotations = [] - for _, hazards in results.items(): - for _, items in hazards.items(): - for item in items: - for _, annotation in item.annotations.items(): - annotations.append({"source_id": item.source_id(), "is_safe": annotation.is_safe}) - return annotations - - @cli.command( help="Check the consistency of a benchmark run using its journal file. 
You can pass the name of the file OR a directory containing multiple journal files (will be searched recursively)" ) @@ -305,7 +294,7 @@ def run_benchmarks_for_sut( thread_count=32, calibrating=False, run_path: str = "./run", -): +) -> BenchmarkRun: runner = BenchmarkRunner(pathlib.Path(run_path), calibrating=calibrating) runner.secrets = load_secrets_from_config() runner.benchmarks = benchmarks From b60aead647a48069710bd2929d4c66b0259212ea Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Wed, 10 Dec 2025 19:49:01 -0500 Subject: [PATCH 3/6] noop; whitespace --- src/modelbench/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index e3e45f4a..49deb63c 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -186,7 +186,6 @@ def security_benchmark( sut = make_sut(sut_uid) benchmark = SecurityBenchmark(locale, prompt_set, evaluator=evaluator) check_benchmark(benchmark) - run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid) From 8cca1cd10eabe99f949949f9e58e6a20ac9df5ef Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Wed, 10 Dec 2025 19:49:22 -0500 Subject: [PATCH 4/6] only export annotations from runs using the demo prompt set, for now --- src/modelbench/benchmark_runner.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py index f0c76d52..c75107ce 100644 --- a/src/modelbench/benchmark_runner.py +++ b/src/modelbench/benchmark_runner.py @@ -242,6 +242,12 @@ def compile_annotations(self) -> list: test, hazard, locale, prompt_set, version = full_hazard_string.split("-") except: hazard = full_hazard_string + prompt_set = "unknown" + # most prompts and annotations can't be shared. Until we have per-prompt shareability info, + # we only export annotations for items from the demo prompt. 
+ # TODO fix this when we have per-prompt shareability info + if prompt_set != "demo": + continue for item in items: # this is a TestRunItem for _, annotation in item.annotations.items(): ann = { From 215ae2301506d8bca876e2f9a1cef099e244460a Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Thu, 11 Dec 2025 15:58:18 -0500 Subject: [PATCH 5/6] remove SUT UID from file name, as those UIDs can contain hostile characters --- src/modelbench/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index 49deb63c..93bd34c2 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -202,7 +202,7 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou # export the annotations separately annotations = {"job_id": run.run_id, "annotations": run.compile_annotations()} - annotation_path = output_dir / f"annotations-{benchmark.uid}-{sut.uid}.json" + annotation_path = output_dir / f"annotations-{benchmark.uid}.json" with open(annotation_path, "w") as annotation_records: annotation_records.write(json.dumps(annotations)) print(f"Wrote annotations for {benchmark.uid} to {annotation_path}.") From 6f5c78dc9ced5a5b4d84fe17338ed1074d3395e7 Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Thu, 11 Dec 2025 20:28:43 -0500 Subject: [PATCH 6/6] add test of annotation file contents; fix a bug that was hiding behind an always-passing test caused by missing () in a function call --- tests/modelbench_tests/test_run.py | 85 +++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 6 deletions(-) diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index 52ba9a4a..4d3c77e3 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -7,30 +7,38 @@ import pytest from click.testing import CliRunner +from modelgauge_tests.fake_sut import FakeSUT import modelbench from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner +from modelbench.benchmark_runner_items import ModelgaugeTestWrapper, TestRunItem from modelbench.benchmarks import ( BenchmarkDefinition, BenchmarkScore, GeneralPurposeAiChatBenchmarkV1, NaiveBenchmark, - SecurityScore, SecurityBenchmark, + SecurityScore, ) from modelbench.cli import cli from modelbench.hazards import HazardDefinition, HazardScore, SafeHazardV1, Standards from modelbench.scoring import ValueEstimate from modelbench.standards import NoStandardsFileError, OverwriteStandardsFileError +from modelgauge.annotation import SafetyAnnotation from modelgauge.base_test import PromptResponseTest -from modelgauge.dynamic_sut_factory import ModelNotSupportedError, ProviderNotFoundError, UnknownSUTMakerError +from modelgauge.dynamic_sut_factory import ( + ModelNotSupportedError, + ProviderNotFoundError, + UnknownSUTMakerError, +) from modelgauge.locales import DEFAULT_LOCALE, EN_US, FR_FR, ZH_CN from modelgauge.preflight import make_sut +from modelgauge.prompt import TextPrompt from modelgauge.prompt_sets import GENERAL_PROMPT_SETS from modelgauge.records import TestRecord from modelgauge.secret_values import RawSecrets -from modelgauge.sut import PromptResponseSUT -from modelgauge_tests.fake_sut import FakeSUT +from modelgauge.single_turn_prompt_response import TestItem +from modelgauge.sut import PromptResponseSUT, SUTResponse class AHazard(HazardDefinition): @@ -62,9 +70,68 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): else: score_cls = BenchmarkScore benchmark_run.benchmark_scores[benchmark][sut] = 
score_cls(benchmark, sut, [h.score({}) for h in hazards], None) + + # these will be used to validate the contents of artifacts created by a run object + test1 = ModelgaugeTestWrapper( + actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path + ) + test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1") + tri1 = TestRunItem( + test=test1, + test_item=test_item1, + sut=sut, + sut_response=SUTResponse(text="To get to the other side"), + annotations={"annotator1": SafetyAnnotation(is_safe=True)}, + measurements={}, + exceptions=[], + ) + benchmark_run.add_finished_item(tri1) + + # this one should not be included in the annotations file b/c it's not in the demo prompt set + test2 = ModelgaugeTestWrapper( + actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path + ) + test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2") + tri2 = TestRunItem( + test=test2, + test_item=test_item2, + sut=sut, + sut_response=SUTResponse(text="I don't know that"), + annotations={"annotator2": SafetyAnnotation(is_safe=False)}, + measurements={}, + exceptions=[], + ) + benchmark_run.add_finished_item(tri2) + + test3 = ModelgaugeTestWrapper( + actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path + ) + test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3") + tri3 = TestRunItem( + test=test3, + test_item=test_item3, + sut=sut, + sut_response=SUTResponse(text="Bark bork arf"), + annotations={"annotator1": SafetyAnnotation(is_safe=True)}, + measurements={}, + exceptions=[], + ) + benchmark_run.add_finished_item(tri3) + return benchmark_run +def annotations_are_correct(annotation_file_path, prompt_set) -> bool: + assert annotation_file_path.exists() + with open(annotation_file_path) as f: + data = json.load(f) + assert "job_id" in data + assert "annotations" in data + assert len(data["annotations"]) == 1 + assert data["annotations"][0]["response"] == "Bark bork arf" + return True + + def test_find_suts(sut): # key from modelbench gets a known SUT found_sut = make_sut(sut.uid) @@ -190,6 +257,7 @@ def runner(self): ("1.1", None, None), ("1.1", EN_US, None), ("1.1", EN_US, "practice"), + ("1.1", EN_US, "demo"), ("1.1", EN_US, "official"), ], # TODO add more locales as we add support for them @@ -213,7 +281,7 @@ def test_benchmark_basic_run_produces_json( benchmark_options.extend(["--prompt-set", prompt_set]) benchmark = GeneralPurposeAiChatBenchmarkV1( locale if locale else DEFAULT_LOCALE, - prompt_set if prompt_set else "practice", + prompt_set if prompt_set else "demo", "default", ) command_options = [ @@ -233,7 +301,12 @@ def test_benchmark_basic_run_produces_json( catch_exceptions=False, ) assert result.exit_code == 0 - assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists + assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists() + + annotation_file_path = tmp_path / f"annotations-{benchmark.uid}.json" + assert annotation_file_path.exists() + # TODO find a better spot for this test. It's handy here because all the objects are available. + assert annotations_are_correct(tmp_path / f"annotations-{benchmark.uid}.json", prompt_set) # TODO: Add test back after calibrating!! # def test_security_benchmark_basic_run_produces_json(
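
Two short sketches follow. They are not part of the patch series; they only illustrate the artifacts the patches introduce, and any name not shown in the diffs above is hypothetical.

Sketch 1: shape of the exported annotations file. PATCH 1/6 and PATCH 5/6 make run_and_report_benchmark() write annotations-<benchmark uid>.json into the output directory, holding a top-level {"job_id": ..., "annotations": [...]} object whose entries carry the fields BenchmarkRun.compile_annotations() builds in PATCH 2/6. The loader below is a minimal, assumed consumer of that file (load_annotations is not a function in the codebase); the example entry uses values from the fake run in PATCH 6/6, except is_valid, whose value depends on the annotator.

    # Minimal sketch of reading the exported annotations file.
    # The path pattern and keys mirror the patches above; the helper itself is hypothetical.
    import json
    import pathlib

    def load_annotations(output_dir: pathlib.Path, benchmark_uid: str) -> list:
        """Return the list of annotation entries written by run_and_report_benchmark()."""
        path = output_dir / f"annotations-{benchmark_uid}.json"
        with open(path) as f:
            data = json.load(f)  # {"job_id": <run id>, "annotations": [...]}
        return data["annotations"]

    # One entry, as built by BenchmarkRun.compile_annotations():
    # {
    #     "hazard": "hazard",             # parsed from the test UID
    #     "prompt": "Who's a good boy?",  # the TestItem prompt text
    #     "response": "Bark bork arf",    # the SUT response text
    #     "is_safe": True,                # from the SafetyAnnotation
    #     "is_valid": ...,                # from the SafetyAnnotation
    # }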
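
Sketch 2: the test-UID parsing and demo-only filter that compile_annotations() relies on. The method assumes finished_items is keyed by UIDs of the form <test>-<hazard>-<locale>-<prompt set>-<version> (the fake tests in PATCH 6/6 use e.g. "test3-hazard-locale-demo-1.1") and skips anything whose prompt set is not "demo". The standalone function below reproduces that parsing for illustration only; it narrows the handler to except ValueError where the patch uses a bare except, and real test UIDs may not always split into exactly five parts.

    # Hedged sketch of the UID parsing behind the demo-only export filter.
    def parse_prompt_set(full_hazard_string: str) -> str:
        """Extract the prompt set from a five-part test UID, or "unknown" if it doesn't parse."""
        try:
            test, hazard, locale, prompt_set, version = full_hazard_string.split("-")
        except ValueError:
            # UID does not split into exactly five fields
            prompt_set = "unknown"
        return prompt_set

    assert parse_prompt_set("test3-hazard-locale-demo-1.1") == "demo"          # exported
    assert parse_prompt_set("test2-hazard-locale-practice-1.1") == "practice"  # skipped
    assert parse_prompt_set("malformed_uid") == "unknown"                      # skipped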