26 changes: 26 additions & 0 deletions src/modelbench/benchmark_runner.py
@@ -234,6 +234,32 @@ def __init__(self, runner: "BenchmarkRunner"):
for t in h.tests(self.secrets):
self.add_test(t)

def compile_annotations(self) -> list:
annotations = []
for sut_uid, hazards in self.finished_items.items():
for full_hazard_string, items in hazards.items():
try:
test, hazard, locale, prompt_set, version = full_hazard_string.split("-")
except ValueError:
hazard = full_hazard_string
prompt_set = "unknown"
# most prompts and annotations can't be shared. Until we have per-prompt shareability info,
# we only export annotations for items from the demo prompt set.
# TODO fix this when we have per-prompt shareability info
if prompt_set != "demo":
continue
for item in items: # this is a TestRunItem
for _, annotation in item.annotations.items():
ann = {
"hazard": hazard,
"prompt": item.test_item.prompt.text,
"response": item.sut_response.text,
"is_safe": annotation.is_safe,
"is_valid": annotation.is_valid,
}
annotations.append(ann)
return annotations


class IntermediateCachingPipe(Pipe):
"""
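
For reference, a minimal sketch of the UID parsing that compile_annotations relies on, assuming test UIDs follow the "<test>-<hazard>-<locale>-<prompt_set>-<version>" pattern used in the tests further down; parse_test_uid is a hypothetical helper for illustration, not part of this change:

# Hypothetical helper mirroring the parsing inside compile_annotations.
# A UID that doesn't split into five dash-separated parts falls back to an
# "unknown" prompt set and is therefore skipped during export.
def parse_test_uid(uid: str) -> tuple[str, str]:
    try:
        _test, hazard, _locale, prompt_set, _version = uid.split("-")
    except ValueError:
        hazard, prompt_set = uid, "unknown"
    return hazard, prompt_set

assert parse_test_uid("test3-hazard-locale-demo-1.1") == ("hazard", "demo")
assert parse_test_uid("unexpected_uid_shape") == ("unexpected_uid_shape", "unknown")
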
15 changes: 11 additions & 4 deletions src/modelbench/cli.py
@@ -1,6 +1,7 @@
import datetime
import faulthandler
import io
import json
import logging
import pathlib
import pkgutil
@@ -16,7 +17,7 @@
from rich.table import Table

import modelgauge.annotators.cheval.registration # noqa: F401
from modelbench.benchmark_runner import BenchmarkRunner, JsonRunTracker, TqdmRunTracker
from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner, JsonRunTracker, TqdmRunTracker
from modelbench.benchmarks import GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark
from modelbench.consistency_checker import (
ConsistencyChecker,
@@ -185,21 +186,27 @@ def security_benchmark(
sut = make_sut(sut_uid)
benchmark = SecurityBenchmark(locale, prompt_set, evaluator=evaluator)
check_benchmark(benchmark)

run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid)


def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid):
start_time = datetime.now(timezone.utc)
run = run_benchmarks_for_sut([benchmark], sut, max_instances, debug=debug, json_logs=json_logs)

benchmark_scores = score_benchmarks(run)
output_dir.mkdir(exist_ok=True, parents=True)
print_summary(benchmark, benchmark_scores)
json_path = output_dir / f"benchmark_record-{benchmark.uid}.json"
scores = [score for score in benchmark_scores if score.benchmark_definition == benchmark]
dump_json(json_path, start_time, benchmark, scores, run_uid)
print(f"Wrote record for {benchmark.uid} to {json_path}.")

# export the annotations separately
annotations = {"job_id": run.run_id, "annotations": run.compile_annotations()}
annotation_path = output_dir / f"annotations-{benchmark.uid}.json"
with open(annotation_path, "w") as annotation_records:
annotation_records.write(json.dumps(annotations))
print(f"Wrote annotations for {benchmark.uid} to {annotation_path}.")

run_consistency_check(run.journal_path, verbose=True)


@@ -286,7 +293,7 @@ def run_benchmarks_for_sut(
thread_count=32,
calibrating=False,
run_path: str = "./run",
):
) -> BenchmarkRun:
runner = BenchmarkRunner(pathlib.Path(run_path), calibrating=calibrating)
runner.secrets = load_secrets_from_config()
runner.benchmarks = benchmarks
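
As a usage note, a minimal sketch of reading the exported annotations file back after a run. The file name pattern and the "job_id"/"annotations" keys come from the change above; the output_dir and benchmark_uid values here are illustrative assumptions, not values produced by this code:

# Sketch of consuming the annotations file written by run_and_report_benchmark.
import json
import pathlib

output_dir = pathlib.Path("./run")  # assumption: same location passed as --output-dir
benchmark_uid = "some_benchmark_uid"  # hypothetical; use the real benchmark.uid
annotation_path = output_dir / f"annotations-{benchmark_uid}.json"

with open(annotation_path) as f:
    data = json.load(f)

print(f"job {data['job_id']}: {len(data['annotations'])} exported annotations")
for record in data["annotations"]:
    # each record carries the hazard, prompt text, SUT response,
    # and the annotator's is_safe / is_valid verdicts
    print(record["hazard"], record["is_safe"], record["is_valid"])
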
85 changes: 79 additions & 6 deletions tests/modelbench_tests/test_run.py
@@ -7,30 +7,38 @@

import pytest
from click.testing import CliRunner
from modelgauge_tests.fake_sut import FakeSUT

import modelbench
from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner
from modelbench.benchmark_runner_items import ModelgaugeTestWrapper, TestRunItem
from modelbench.benchmarks import (
BenchmarkDefinition,
BenchmarkScore,
GeneralPurposeAiChatBenchmarkV1,
NaiveBenchmark,
SecurityScore,
SecurityBenchmark,
SecurityScore,
)
from modelbench.cli import cli
from modelbench.hazards import HazardDefinition, HazardScore, SafeHazardV1, Standards
from modelbench.scoring import ValueEstimate
from modelbench.standards import NoStandardsFileError, OverwriteStandardsFileError
from modelgauge.annotation import SafetyAnnotation
from modelgauge.base_test import PromptResponseTest
from modelgauge.dynamic_sut_factory import ModelNotSupportedError, ProviderNotFoundError, UnknownSUTMakerError
from modelgauge.dynamic_sut_factory import (
ModelNotSupportedError,
ProviderNotFoundError,
UnknownSUTMakerError,
)
from modelgauge.locales import DEFAULT_LOCALE, EN_US, FR_FR, ZH_CN
from modelgauge.preflight import make_sut
from modelgauge.prompt import TextPrompt
from modelgauge.prompt_sets import GENERAL_PROMPT_SETS
from modelgauge.records import TestRecord
from modelgauge.secret_values import RawSecrets
from modelgauge.sut import PromptResponseSUT
from modelgauge_tests.fake_sut import FakeSUT
from modelgauge.single_turn_prompt_response import TestItem
from modelgauge.sut import PromptResponseSUT, SUTResponse


class AHazard(HazardDefinition):
@@ -62,9 +70,68 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
else:
score_cls = BenchmarkScore
benchmark_run.benchmark_scores[benchmark][sut] = score_cls(benchmark, sut, [h.score({}) for h in hazards], None)

# these will be used to validate the contents of artifacts created by a run object
test1 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path
)
test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1")
tri1 = TestRunItem(
test=test1,
test_item=test_item1,
sut=sut,
sut_response=SUTResponse(text="To get to the other side"),
annotations={"annotator1": SafetyAnnotation(is_safe=True)},
measurements={},
exceptions=[],
)
benchmark_run.add_finished_item(tri1)

# this one should not be included in the annotations file b/c it's not in the demo prompt set
test2 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
)
test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2")
tri2 = TestRunItem(
test=test2,
test_item=test_item2,
sut=sut,
sut_response=SUTResponse(text="I don't know that"),
annotations={"annotator2": SafetyAnnotation(is_safe=False)},
measurements={},
exceptions=[],
)
benchmark_run.add_finished_item(tri2)

test3 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path
)
test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3")
tri3 = TestRunItem(
test=test3,
test_item=test_item3,
sut=sut,
sut_response=SUTResponse(text="Bark bork arf"),
annotations={"annotator1": SafetyAnnotation(is_safe=True)},
measurements={},
exceptions=[],
)
benchmark_run.add_finished_item(tri3)

return benchmark_run


def annotations_are_correct(annotation_file_path, prompt_set) -> bool:
assert annotation_file_path.exists()
with open(annotation_file_path) as f:
data = json.load(f)
assert "job_id" in data
assert "annotations" in data
assert len(data["annotations"]) == 1
assert data["annotations"][0]["response"] == "Bark bork arf"
return True


def test_find_suts(sut):
# key from modelbench gets a known SUT
found_sut = make_sut(sut.uid)
@@ -190,6 +257,7 @@ def runner(self):
("1.1", None, None),
("1.1", EN_US, None),
("1.1", EN_US, "practice"),
("1.1", EN_US, "demo"),
("1.1", EN_US, "official"),
],
# TODO add more locales as we add support for them
@@ -213,7 +281,7 @@ def test_benchmark_basic_run_produces_json(
benchmark_options.extend(["--prompt-set", prompt_set])
benchmark = GeneralPurposeAiChatBenchmarkV1(
locale if locale else DEFAULT_LOCALE,
prompt_set if prompt_set else "practice",
prompt_set if prompt_set else "demo",
"default",
)
command_options = [
@@ -233,7 +301,12 @@
catch_exceptions=False,
)
assert result.exit_code == 0
assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists
assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists()

annotation_file_path = tmp_path / f"annotations-{benchmark.uid}.json"
assert annotation_file_path.exists()
# TODO find a better spot for this test. It's handy here because all the objects are available.
assert annotations_are_correct(annotation_file_path, prompt_set)

# TODO: Add test back after calibrating!!
# def test_security_benchmark_basic_run_produces_json(
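
For orientation, a sketch of the file content the annotations_are_correct helper is checking for the fake_benchmark_run fixture above. It assumes only the demo-prompt-set item (tri3) survives the prompt-set filter and that SafetyAnnotation's is_valid defaults to True; the job_id value is illustrative:

# Expected shape of the exported annotations for the fixture above:
# exactly one record, for the demo item with the "Bark bork arf" response.
expected = {
    "job_id": "example-run-id",  # hypothetical; the real value is run.run_id
    "annotations": [
        {
            "hazard": "hazard",
            "prompt": "Who's a good boy?",
            "response": "Bark bork arf",
            "is_safe": True,
            "is_valid": True,  # assumption: SafetyAnnotation defaults is_valid to True
        }
    ],
}
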