-
Notifications
You must be signed in to change notification settings - Fork 18
Add syntax based generated test for Inferredbugs dataset #23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,129 @@ | ||
| #!/usr/bin/env python3 | ||
| """ | ||
| Generate verified InferredBugs dataset by adding LLM-authored verifiers to an existing tasks dataset. | ||
| """ | ||
|
|
||
| import sys | ||
| import argparse | ||
| from pathlib import Path | ||
|
|
||
| # Add project root to sys.path | ||
| PROJECT_ROOT = Path(__file__).resolve().parents[2] | ||
| if str(PROJECT_ROOT) not in sys.path: | ||
| sys.path.insert(0, str(PROJECT_ROOT)) | ||
|
|
||
| # Import from parent package | ||
| from data.commons import ( | ||
| upload_tasks_to_hf, | ||
| download_hf_dataset | ||
| ) | ||
| from scripts.harbor import tasks_parquet_converter as tpc | ||
| from data.inferredbugs.structural_verifier import inject_inferredbugs_verifier | ||
|
|
||
def update_instructions_with_requirement(dataset_dir: str) -> None:
    """Append a "Deliverable Requirement" section to each task's instruction.md.

    The appended section tells the agent exactly where to write the corrected
    file (`/app/<target_file>`), which is the path the structural verifier
    later inspects.  The operation is idempotent: a file that already
    contains a "Deliverable Requirement" section is left untouched.

    Args:
        dataset_dir: Root directory containing one sub-directory per task,
            each expected to hold an ``instruction.md`` file.
    """
    import re  # local import kept to match the module's lazy-import style

    tasks_root = Path(dataset_dir)
    print(f"Updating instructions in: {tasks_root}")

    # Compile once instead of re-compiling per instruction file.
    bold_file_pat = re.compile(r"\*\*File:\*\* `(.*?)`")
    heading_file_pat = re.compile(r"###\s*File:\s*\n(.*)")

    for task_dir in sorted(tasks_root.iterdir()):
        if not task_dir.is_dir():
            continue

        instr_path = task_dir / "instruction.md"
        if not instr_path.exists():
            continue
        content = instr_path.read_text()

        # Simple heuristic to find the target file from the instruction text.
        file_match = bold_file_pat.search(content) or heading_file_pat.search(content)
        if not file_match:
            continue

        # The heading-style pattern captures to end of line; strip whitespace.
        target_file = file_match.group(1).strip()

        # BUG FIX: the original always emitted `mkdir -p` on
        # target_file.rsplit('/', 1)[0], which for a bare filename (no '/')
        # is the filename itself -- creating a *directory* with the file's
        # name and breaking the subsequent `cat >` redirect.  Only emit the
        # mkdir line when the target actually has a parent directory.
        parent = target_file.rsplit("/", 1)[0] if "/" in target_file else ""
        mkdir_line = f"mkdir -p /app/{parent}\n" if parent else ""
        requirement = f"""

## Deliverable Requirement
Write the complete, corrected version of the file to: `/app/{target_file}`.
The directory does not exist yet. You must create it and write the file:
```bash
{mkdir_line}cat > /app/{target_file} << 'ENDOFFILE'
<full corrected file content here>
ENDOFFILE
```
"""
        # Idempotent: never append the section twice.
        if "Deliverable Requirement" not in content:
            instr_path.write_text(content + requirement)
|
|
||
def main() -> None:
    """Generate the verified InferredBugs dataset.

    Pipeline:
      1. Download the source tasks dataset from Hugging Face.
      2. Extract tasks from the parquet snapshot into a fixed workdir
         (skipped when the workdir is non-empty, to support resumption).
      3. Append the Deliverable Requirement to every instruction.md.
      4. Author and inject structural verifiers via an LLM.
      5. Upload the verified tasks to Hugging Face (unless --skip_upload).
    """
    parser = argparse.ArgumentParser(description="Generate Verified InferredBugs dataset")
    parser.add_argument("--skip_upload", action="store_true", help="Skip upload to Hugging Face")
    parser.add_argument("--model", type=str, default="gpt-5-nano", help="LLM to use for authoring verifiers")
    parser.add_argument("--limit", type=int, default=None, help="Limit the number of tasks to process")
    # Previously hard-coded values, now CLI-configurable (defaults unchanged).
    parser.add_argument("--source_repo", type=str, default="mlfoundations-dev/inferredbugs-sandboxes",
                        help="Source Hugging Face repository for InferredBugs tasks")
    parser.add_argument("--target_repo", type=str, default="DCAgent/inferredbugs-sandboxes-verifier",
                        help="Target Hugging Face repository for verified InferredBugs tasks")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Directory to store extracted tasks. Defaults to data/inferredbugs/workdir.")
    args = parser.parse_args()

    source_repo = args.source_repo
    target_repo = args.target_repo

    # 1. Download
    print(f"Step 1: Downloading source tasks from {source_repo}...")
    snapshot_dir = Path(download_hf_dataset(source_repo))

    # 2. Extract tasks
    print("Step 2: Extracting tasks from parquet files...")
    parquet_files = sorted(snapshot_dir.rglob("*.parquet"))
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {snapshot_dir}")

    # Use a FIXED default directory to allow for resumption and skip logic.
    output_dir = Path(args.output_dir) if args.output_dir else PROJECT_ROOT / "data" / "inferredbugs" / "workdir"
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Working directory: {output_dir}")

    # Extract only if the directory is empty, to support resumption.
    if not any(output_dir.iterdir()):
        print(f"Extracting tasks to: {output_dir}")
        tpc.from_parquet(
            parquet_path=str(parquet_files[0]),
            base=str(output_dir),
            on_exist="overwrite",
        )
    else:
        print(f"Directory {output_dir} not empty. Skipping extraction to support resumption.")

    # 3. Update Instructions
    print("Step 3: Appending Deliverable Requirements to instructions...")
    update_instructions_with_requirement(str(output_dir))

    # 4. Inject Authored Verifiers
    print(f"Step 4: Authoring and injecting structural verifiers using {args.model}...")
    # Collect the instruction texts again to pass to the authoring engine.
    task_dirs = sorted(d for d in output_dir.iterdir() if d.is_dir())

    # Apply limit if specified.
    if args.limit:
        print(f"Limiting processing to first {args.limit} tasks.")
        task_dirs = task_dirs[:args.limit]

    instructions = []
    for task_dir in task_dirs:
        instr_file = task_dir / "instruction.md"
        instructions.append(instr_file.read_text() if instr_file.exists() else "")

    inject_inferredbugs_verifier(str(output_dir), instructions, model_name=args.model)

    # 5. Upload
    if not args.skip_upload:
        print(f"Step 5: Uploading verified tasks to {target_repo}...")
        upload_tasks_to_hf(str(output_dir), target_repo)
        print(f"Success! Tasks uploaded to: https://huggingface.co/datasets/{target_repo}")
    else:
        print(f"Upload skipped. Local tasks available in: {output_dir}")

    print("Verified InferredBugs Generation Complete!")


if __name__ == "__main__":
    main()
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,206 @@ | ||||||||||||||||||||
| """ | ||||||||||||||||||||
| Logic for authoring and injecting structural Python verifiers for InferredBugs. | ||||||||||||||||||||
| """ | ||||||||||||||||||||
|
|
||||||||||||||||||||
| import os | ||||||||||||||||||||
| import re | ||||||||||||||||||||
| import json | ||||||||||||||||||||
| from pathlib import Path | ||||||||||||||||||||
| from typing import List, Tuple | ||||||||||||||||||||
|
|
||||||||||||||||||||
# Prompt for the verifier-authoring LLM call.
# NOTE: this template is rendered with str.format(instruction=...), so every
# literal brace in the embedded reference example must be doubled ("{{"/"}}");
# {instruction} is the ONLY real substitution field.
AUTHORING_PROMPT_TEMPLATE = """
You are authoring a verification harness for a bug fix task. The harness consists of two files:

- `test.sh`: bash entrypoint — installs ONLY the Python packages actually imported by `test_state.py` (if `test_state.py` only uses stdlib modules like `re`, `os`, `pathlib`, no pip installs are needed), then runs `python3 -u /tests/test_state.py`. Nothing else.
- `test_state.py`: Python script — reads the agent's submitted file, checks whether the bug is fixed, and writes a reward score

The harness must verify whether the bug described in the "Bug Report" below has been correctly fixed, by statically inspecting the source code (no compilation or execution).

================ CORE PRINCIPLES ================
1. Analyze Structure, Not Runtime: The environment lacks the .NET SDK. Use Python's `re` module or string parsing to inspect the source code files directly.
2. Target File: The bug report specifies the target file path. Read this file from the candidate paths (relative and `/app/` prefixed).
3. Analyze the Bug First: Read the "Before (Buggy File)" section in the bug report. Identify the exact buggy pattern and what class of fixes would address it. Document this in a comment in your verifier code.
4. The Contract: The Python script MUST write a scalar score (1.0 for success, 0.0 for failure) to the file `/logs/verifier/reward.txt`.
5. `test.sh` contains ONLY: environment setup (apt-get, pip install) and the single command `python3 -u /tests/test_state.py`. Nothing else — no Python code, no heredocs, no file creation, no cat commands.
6. `test_state.py` contains ALL verification logic: reading the target file, extracting the method body, checking the fix, and writing the reward. The target file is written by the agent being evaluated — do NOT create or modify it in either script.

================ VERIFICATION QUALITY RULES ================
These rules are CRITICAL. Violating them produces a useless verifier.

7. Scope checks to the specific buggy method/function named in the bug report. Do NOT check the entire file — patterns that already exist elsewhere in the file will cause false positives on unfixed code.

8. Design a check that would return 0.0 on the ORIGINAL buggy code and 1.0 on any correct fix. You decide the best strategy — presence of a fix construct, absence of a bug pattern, or a combination — whichever is most reliable for this specific bug type and language. Before finalizing, mentally run your check against the original buggy code shown in the bug report: if it would return 1.0, revise it.

9. Accept multiple valid fix patterns. There is rarely only one correct fix. Your check should pass for any reasonable fix, not just one specific implementation.

10. Use a brace-counting parser to extract the exact method body, not a regex that stops at the first closing brace. The reference example shows a simple brace-counter approach.

================ REQUIRED OUTPUT FORMAT ================
Emit ONLY the two XML blocks below — no prose before, between, or after them.

<bash_script>
test.sh content here
</bash_script>

<python_script>
test_state.py content here
</python_script>

================ REFERENCE EXAMPLE ================

<bash_script>
#!/bin/bash
# Ensure standard setup
apt-get update > /dev/null 2>&1
apt-get install -y python3-pip > /dev/null 2>&1
# Run the judge with unbuffered output for real-time logs
python3 -u /tests/test_state.py
</bash_script>

<python_script>
import re
import os
import sys
import traceback
from pathlib import Path

RELATIVE_TARGET = "src/Storage/Database.cs"
BUGGY_METHOD = "void SaveData("

def extract_method_body(code, method_signature):
    # Brace-counting extractor -- handles nested braces correctly.
    idx = code.find(method_signature)
    if idx == -1:
        return None
    brace_start = code.find('{{', idx)
    if brace_start == -1:
        return None
    depth = 0
    for i in range(brace_start, len(code)):
        if code[i] == '{{':
            depth += 1
        elif code[i] == '}}':
            depth -= 1
            if depth == 0:
                return code[brace_start:i+1]
    return None

def verify():
    reward_file = Path("/logs/verifier/reward.txt")
    reward_file.parent.mkdir(parents=True, exist_ok=True)

    candidate_paths = [Path(RELATIVE_TARGET), Path("/app") / RELATIVE_TARGET]
    target_path = next((p for p in candidate_paths if p.exists()), None)
    if not target_path:
        print(f"ERROR: target file not found. Searched: {{[str(p) for p in candidate_paths]}}")
        reward_file.write_text("0.0")
        print("VERIFIER: FAIL")
        return

    code = target_path.read_text()
    body = extract_method_body(code, BUGGY_METHOD)
    if body is None:
        print("ERROR: buggy method not found in file.")
        reward_file.write_text("0.0")
        print("VERIFIER: FAIL")
        return

    # Check for presence of fix construct (using or finally+Dispose)
    has_using = bool(re.search(r'\busing\s*\(', body))
    has_finally_dispose = bool(re.search(r'finally', body)) and bool(re.search(r'\.Dispose\(', body))
    fixed = has_using or has_finally_dispose

    reward_file.write_text("1.0" if fixed else "0.0")
    print("VERIFIER: PASS" if fixed else "VERIFIER: FAIL")

if __name__ == "__main__":
    try:
        verify()
    except Exception:
        traceback.print_exc()
        Path("/logs/verifier/reward.txt").write_text("0.0")
        sys.exit(1)
</python_script>

---
### Bug Report to Process:
{instruction}
"""
|
|
||||||||||||||||||||
| def author_verifier_harness(instruction: str, task_name: str, model_name: str = "gpt-5-nano") -> Tuple[str, str]: | ||||||||||||||||||||
| """Calls the LLM to author both test.sh and test_state.py.""" | ||||||||||||||||||||
| try: | ||||||||||||||||||||
| from openai import OpenAI | ||||||||||||||||||||
| from openai.types.chat import ChatCompletionUserMessageParam | ||||||||||||||||||||
| client = OpenAI() | ||||||||||||||||||||
|
|
||||||||||||||||||||
| prompt = AUTHORING_PROMPT_TEMPLATE.format(instruction=instruction) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| messages: List[ChatCompletionUserMessageParam] = [ | ||||||||||||||||||||
| { | ||||||||||||||||||||
| "role": "user", | ||||||||||||||||||||
| "content": prompt, | ||||||||||||||||||||
| } | ||||||||||||||||||||
| ] | ||||||||||||||||||||
|
|
||||||||||||||||||||
| response = client.chat.completions.create( | ||||||||||||||||||||
| model=model_name, | ||||||||||||||||||||
| messages=messages, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| content = response.choices[0].message.content | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # Extract Bash script | ||||||||||||||||||||
| bash_match = re.search(r"<bash_script>(.*?)</bash_script>", content, re.DOTALL) | ||||||||||||||||||||
| if not bash_match: | ||||||||||||||||||||
| print(f"ERROR [{task_name}]: Model response is missing <bash_script> tags.") | ||||||||||||||||||||
| bash_script = bash_match.group(1).strip() if bash_match else "# Error: No bash script authored" | ||||||||||||||||||||
|
Comment on lines
+156
to
+157
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When the |
||||||||||||||||||||
|
|
||||||||||||||||||||
| # Extract Python script | ||||||||||||||||||||
| python_match = re.search(r"<python_script>(.*?)</python_script>", content, re.DOTALL) | ||||||||||||||||||||
| if not python_match: | ||||||||||||||||||||
|
Comment on lines
+160
to
+161
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||||||||||||||
| print(f"ERROR [{task_name}]: Model response is missing <python_script> tags.") | ||||||||||||||||||||
| python_script = python_match.group(1).strip() if python_match else "# Error: No python script authored" | ||||||||||||||||||||
|
|
||||||||||||||||||||
| return bash_script, python_script | ||||||||||||||||||||
|
|
||||||||||||||||||||
| except Exception as e: | ||||||||||||||||||||
| print(f"Error calling LLM for verifier authoring on {task_name}: {e}") | ||||||||||||||||||||
| return f"#!/bin/bash\\necho 'Error: {e}'\\nexit 1", f"# Error: {str(e)}" | ||||||||||||||||||||
|
|
||||||||||||||||||||
def inject_inferredbugs_verifier(dataset_dir: str, questions: List[str], model_name: str = "gpt-5-nano", max_workers: int = 30):
    """Author and inject a verifier harness into each task directory.

    For every task directory under ``dataset_dir`` (sorted by name, aligned
    positionally with ``questions``), writes ``tests/test.sh`` and
    ``tests/test_state.py`` authored by the LLM.  Existing harnesses are
    left untouched so interrupted runs can resume.

    Args:
        dataset_dir: Root directory containing one sub-directory per task.
        questions: Instruction text per task, in the same sorted order as
            the task directories.
        model_name: LLM used to author the harnesses.
        max_workers: Thread-pool size for concurrent LLM calls.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    tasks_root = Path(dataset_dir)
    task_dirs = sorted((d for d in tasks_root.iterdir() if d.is_dir()), key=lambda d: d.name)

    print(f"Authoring verifier harnesses for {len(task_dirs)} tasks using {model_name} (workers={max_workers})...")

    def process_task(task_dir: Path, instruction: str) -> None:
        tests_dir = task_dir / "tests"
        tests_dir.mkdir(exist_ok=True)
        test_sh_path = tests_dir / "test.sh"
        test_py_path = tests_dir / "test_state.py"

        # Resumption: never re-author an existing harness.
        if test_sh_path.exists() and test_py_path.exists():
            print(f" - Verifier already exists for {task_dir.name}. Skipping.")
            return

        bash_code, python_code = author_verifier_harness(instruction, task_dir.name, model_name)

        # ROBUSTNESS: detect authoring failure whether the bash error marker
        # uses a real newline or the legacy escaped "\\n" form (the original
        # startswith check on a real newline never matched the latter, so
        # broken scripts were silently written to disk).
        authoring_failed = (
            python_code.startswith("# Error:")
            or bash_code.startswith("#!/bin/bash\necho 'Error:")
            or bash_code.startswith("#!/bin/bash\\necho 'Error:")
        )
        if authoring_failed:
            print(f" - Skipping {task_dir.name}: LLM authoring failed.")
            return

        # Write with a trailing newline: POSIX text files end with one, and
        # some shells mishandle scripts whose last line is unterminated.
        test_py_path.write_text(python_code.rstrip("\n") + "\n")
        test_sh_path.write_text(bash_code.rstrip("\n") + "\n")
        os.chmod(test_sh_path, 0o755)  # harness must be executable
        print(f" - Harness generated for {task_dir.name}")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_task, td, instr) for td, instr in zip(task_dirs, questions)]
        for future in as_completed(futures):
            future.result()  # re-raise any worker exception
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
import restatement is placed inside theupdate_instructions_with_requirementfunction. It's generally a best practice in Python to place all imports at the top of the file, outside of any functions. This improves readability and ensures that modules are imported only once, avoiding potential performance overhead if the function is called multiple times.