23 changes: 21 additions & 2 deletions src/modelbench/cli.py
@@ -1,6 +1,7 @@
import datetime
import faulthandler
import io
import json
import logging
import pathlib
import pkgutil
@@ -16,7 +17,7 @@
from rich.table import Table

import modelgauge.annotators.cheval.registration # noqa: F401
from modelbench.benchmark_runner import BenchmarkRunner, JsonRunTracker, TqdmRunTracker
from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner, JsonRunTracker, TqdmRunTracker
from modelbench.benchmarks import GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark
from modelbench.consistency_checker import (
ConsistencyChecker,
@@ -192,17 +193,35 @@ def security_benchmark(
def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid):
    start_time = datetime.now(timezone.utc)
    run = run_benchmarks_for_sut([benchmark], sut, max_instances, debug=debug, json_logs=json_logs)

    benchmark_scores = score_benchmarks(run)
    output_dir.mkdir(exist_ok=True, parents=True)
    print_summary(benchmark, benchmark_scores)
    json_path = output_dir / f"benchmark_record-{benchmark.uid}.json"
    scores = [score for score in benchmark_scores if score.benchmark_definition == benchmark]
    dump_json(json_path, start_time, benchmark, scores, run_uid)
    print(f"Wrote record for {benchmark.uid} to {json_path}.")

    # export the annotations separately
    annotations = {"job_id": benchmark.uid, "annotations": compile_annotations(run)}
    annotation_path = output_dir / f"annotations-{benchmark.uid}-{sut.uid}.json"
    with open(annotation_path, "w") as annotation_records:
        annotation_records.write(json.dumps(annotations))
    print(f"Wrote annotations for {benchmark.uid} to {annotation_path}.")

    run_consistency_check(run.journal_path, verbose=True)


def compile_annotations(run) -> list:
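    # Walk every finished item in the run and collect one flat record per annotation.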
    results = run.finished_items
    annotations = []
    for _, hazards in results.items():
        for _, items in hazards.items():
            for item in items:
                for _, annotation in item.annotations.items():
                    annotations.append({"source_id": item.source_id(), "is_safe": annotation.is_safe})
    return annotations


@cli.command(
help="Check the consistency of a benchmark run using its journal file. You can pass the name of the file OR a directory containing multiple journal files (will be searched recursively)"
)
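For context, here is a minimal sketch of how the exported annotations file could be consumed downstream. The payload shape (`{"job_id": ..., "annotations": [...]}`) and the per-record fields (`source_id`, `is_safe`) come from the diff above; the file path placeholder and the `load_annotation_records` helper are hypothetical, not part of this PR.

```python
# Minimal sketch, not part of the PR: read back an annotations file written by
# run_and_report_benchmark. Payload shape and record fields are taken from the
# diff above; the path and this helper's name are illustrative only.
import json
from pathlib import Path


def load_annotation_records(path: Path) -> list[dict]:
    """Return the per-item annotation records from an exported annotations file."""
    with open(path) as f:
        payload = json.load(f)
    # payload looks like {"job_id": <benchmark uid>, "annotations": [...]}
    return payload["annotations"]


if __name__ == "__main__":
    # Replace the placeholder path with a real annotations-<benchmark-uid>-<sut-uid>.json file.
    records = load_annotation_records(Path("output/annotations-<benchmark-uid>-<sut-uid>.json"))
    unsafe = [r["source_id"] for r in records if not r["is_safe"]]
    print(f"{len(unsafe)} of {len(records)} items were annotated unsafe")
```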