Commit 3ff80dc

GAIA: use evaluation_utils.py get_default_on_result_writer instead of _default_on_result_writer (#111)
* GAIA: use evaluation_utils.py get_default_on_result_writer instead of _default_on_result_writer

  - Replace local _default_on_result_writer function with shared get_default_on_result_writer from evaluation_utils.py
  - Remove unused fcntl import from GAIA run_infer.py
  - Add comprehensive tests to verify the integration works correctly
  - Ensures consistency across all benchmarks in using shared evaluation utilities

  Fixes #110

  Co-authored-by: openhands <[email protected]>

* Delete tests/test_gaia_evaluation_utils.py

---------

Co-authored-by: openhands <[email protected]>
1 parent 293778a commit 3ff80dc

File tree

1 file changed (+5, -13 lines)

benchmarks/gaia/run_infer.py

Lines changed: 5 additions & 13 deletions
@@ -1,4 +1,3 @@
-import fcntl
 import os
 import re
 import tempfile
@@ -17,7 +16,10 @@
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.evaluation import Evaluation
-from benchmarks.utils.evaluation_utils import construct_eval_output_dir
+from benchmarks.utils.evaluation_utils import (
+    construct_eval_output_dir,
+    get_default_on_result_writer,
+)
 from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
 from openhands.sdk import (
     LLM,
@@ -435,18 +437,8 @@ def main() -> None:
     # Create evaluator
     evaluator = GAIAEvaluation(metadata=metadata, num_workers=args.num_workers)
 
-    # Define result writer
-    def _default_on_result_writer(eval_output_dir: str):
-        def _cb(instance: EvalInstance, out: EvalOutput) -> None:
-            with open(evaluator.output_path, "a") as f:
-                fcntl.flock(f, fcntl.LOCK_EX)
-                f.write(out.model_dump_json() + "\n")
-                fcntl.flock(f, fcntl.LOCK_UN)
-
-        return _cb
-
     # Run evaluation
-    evaluator.run(on_result=_default_on_result_writer(metadata.eval_output_dir))
+    evaluator.run(on_result=get_default_on_result_writer(evaluator.output_path))
 
     logger.info("Evaluation completed!")
     logger.info(f"Results written to: {evaluator.output_path}")
