Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions data/inferredbugs/generate_with_verifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Generate verified InferredBugs dataset by adding LLM-authored verifiers to an existing tasks dataset.
"""

import argparse
import re
import sys
from pathlib import Path

# Add project root to sys.path
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))

# Import from parent package
from data.commons import (
upload_tasks_to_hf,
download_hf_dataset
)
from scripts.harbor import tasks_parquet_converter as tpc
from data.inferredbugs.structural_verifier import inject_inferredbugs_verifier

def update_instructions_with_requirement(dataset_dir: str) -> None:
    """
    Append a "Deliverable Requirement" section to each task's instruction.md.

    The structural verifier reads the agent's fix from a fixed path under
    /app, so every instruction must spell out exactly where to write the
    corrected file. The section is appended at most once per instruction
    (idempotent), which keeps resumed runs safe.

    Args:
        dataset_dir: Root directory containing one sub-directory per task.
    """
    tasks_root = Path(dataset_dir)
    print(f"Updating instructions in: {tasks_root}")

    for task_dir in sorted(tasks_root.iterdir()):
        if not task_dir.is_dir():
            continue

        instr_path = task_dir / "instruction.md"
        if not instr_path.exists():
            continue

        content = instr_path.read_text(encoding="utf-8")

        # Simple heuristic to find the target file from the instruction text.
        file_match = re.search(r"\*\*File:\*\* `(.*?)`", content) or re.search(r"###\s*File:\s*\n(.*)", content)
        if not file_match:
            continue

        target_file = file_match.group(1).strip()
        # Guard against a bare filename: rsplit('/', 1)[0] on a path with no
        # '/' returns the filename itself, so the suggested `mkdir -p` would
        # create a DIRECTORY named like the file and the `cat >` would fail.
        parent_dir = target_file.rsplit('/', 1)[0] if '/' in target_file else "."
        requirement = f"""

## Deliverable Requirement
Write the complete, corrected version of the file to: `/app/{target_file}`.
The directory does not exist yet. You must create it and write the file:
```bash
mkdir -p /app/{parent_dir}
cat > /app/{target_file} << 'ENDOFFILE'
<full corrected file content here>
ENDOFFILE
```
"""
        if "Deliverable Requirement" not in content:
            instr_path.write_text(content + requirement, encoding="utf-8")

def main() -> None:
    """Generate the verified InferredBugs dataset end to end.

    Pipeline: download source tasks -> extract from parquet -> append
    deliverable requirements -> author/inject structural verifiers -> upload.
    The working directory is fixed (or user-supplied) so interrupted runs
    can resume without re-extracting or re-authoring finished tasks.
    """
    parser = argparse.ArgumentParser(description="Generate Verified InferredBugs dataset")
    parser.add_argument("--skip_upload", action="store_true", help="Skip upload to Hugging Face")
    parser.add_argument("--model", type=str, default="gpt-5-nano", help="LLM to use for authoring verifiers")
    parser.add_argument("--limit", type=int, default=None, help="Limit the number of tasks to process")
    # Repositories and workdir are configurable; defaults preserve the
    # previously hardcoded values, so existing invocations are unchanged.
    parser.add_argument("--source_repo", type=str, default="mlfoundations-dev/inferredbugs-sandboxes",
                        help="Source Hugging Face repository for InferredBugs tasks")
    parser.add_argument("--target_repo", type=str, default="DCAgent/inferredbugs-sandboxes-verifier",
                        help="Target Hugging Face repository for verified InferredBugs tasks")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Directory to store extracted tasks. Defaults to a workdir inside data/inferredbugs.")
    args = parser.parse_args()

    source_repo = args.source_repo
    target_repo = args.target_repo

    # 1. Download
    print(f"Step 1: Downloading source tasks from {source_repo}...")
    snapshot_dir = Path(download_hf_dataset(source_repo))

    # 2. Extract tasks
    print("Step 2: Extracting tasks from parquet files...")
    parquet_files = sorted(snapshot_dir.rglob("*.parquet"))
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {snapshot_dir}")

    # Use a fixed (or user-supplied) directory to allow for resumption.
    output_dir = Path(args.output_dir) if args.output_dir else PROJECT_ROOT / "data" / "inferredbugs" / "workdir"
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Working directory: {output_dir}")

    # Extract only if the directory is empty, to support resumption.
    if not any(output_dir.iterdir()):
        print(f"Extracting tasks to: {output_dir}")
        tpc.from_parquet(
            parquet_path=str(parquet_files[0]),
            base=str(output_dir),
            on_exist="overwrite"
        )
    else:
        print(f"Directory {output_dir} not empty. Skipping extraction to support resumption.")

    # 3. Update Instructions
    print("Step 3: Appending Deliverable Requirements to instructions...")
    update_instructions_with_requirement(str(output_dir))

    # 4. Inject Authored Verifiers
    print(f"Step 4: Authoring and injecting structural verifiers using {args.model}...")
    # Re-collect the instruction texts to pass to the authoring engine.
    task_dirs = sorted([d for d in output_dir.iterdir() if d.is_dir()])

    # Apply limit if specified (None/0 both mean "no limit", matching the
    # original truthiness check).
    if args.limit:
        print(f"Limiting processing to first {args.limit} tasks.")
        task_dirs = task_dirs[:args.limit]

    instructions = []
    for d in task_dirs:
        instr_file = d / "instruction.md"
        instructions.append(instr_file.read_text() if instr_file.exists() else "")

    inject_inferredbugs_verifier(str(output_dir), instructions, model_name=args.model)

    # 5. Upload
    if not args.skip_upload:
        print(f"Step 5: Uploading verified tasks to {target_repo}...")
        upload_tasks_to_hf(str(output_dir), target_repo)
        print(f"Success! Tasks uploaded to: https://huggingface.co/datasets/{target_repo}")
    else:
        print(f"Upload skipped. Local tasks available in: {output_dir}")

    print("Verified InferredBugs Generation Complete!")

if __name__ == "__main__":
    main()
206 changes: 206 additions & 0 deletions data/inferredbugs/structural_verifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
"""
Logic for authoring and injecting structural Python verifiers for InferredBugs.
"""

import os
import re
import json
from pathlib import Path
from typing import List, Tuple

AUTHORING_PROMPT_TEMPLATE = """
You are authoring a verification harness for a bug fix task. The harness consists of two files:

- `test.sh`: bash entrypoint — installs ONLY the Python packages actually imported by `test_state.py` (if `test_state.py` only uses stdlib modules like `re`, `os`, `pathlib`, no pip installs are needed), then runs `python3 -u /tests/test_state.py`. Nothing else.
- `test_state.py`: Python script — reads the agent's submitted file, checks whether the bug is fixed, and writes a reward score

The harness must verify whether the bug described in the "Bug Report" below has been correctly fixed, by statically inspecting the source code (no compilation or execution).

================ CORE PRINCIPLES ================
1. Analyze Structure, Not Runtime: The environment lacks the .NET SDK. Use Python's `re` module or string parsing to inspect the source code files directly.
2. Target File: The bug report specifies the target file path. Read this file from the candidate paths (relative and `/app/` prefixed).
3. Analyze the Bug First: Read the "Before (Buggy File)" section in the bug report. Identify the exact buggy pattern and what class of fixes would address it. Document this in a comment in your verifier code.
4. The Contract: The Python script MUST write a scalar score (1.0 for success, 0.0 for failure) to the file `/logs/verifier/reward.txt`.
5. `test.sh` contains ONLY: environment setup (apt-get, pip install) and the single command `python3 -u /tests/test_state.py`. Nothing else — no Python code, no heredocs, no file creation, no cat commands.
6. `test_state.py` contains ALL verification logic: reading the target file, extracting the method body, checking the fix, and writing the reward. The target file is written by the agent being evaluated — do NOT create or modify it in either script.

================ VERIFICATION QUALITY RULES ================
These rules are CRITICAL. Violating them produces a useless verifier.

7. Scope checks to the specific buggy method/function named in the bug report. Do NOT check the entire file — patterns that already exist elsewhere in the file will cause false positives on unfixed code.

8. Design a check that would return 0.0 on the ORIGINAL buggy code and 1.0 on any correct fix. You decide the best strategy — presence of a fix construct, absence of a bug pattern, or a combination — whichever is most reliable for this specific bug type and language. Before finalizing, mentally run your check against the original buggy code shown in the bug report: if it would return 1.0, revise it.

9. Accept multiple valid fix patterns. There is rarely only one correct fix. Your check should pass for any reasonable fix, not just one specific implementation.

10. Use a brace-counting parser to extract the exact method body, not a regex that stops at the first closing brace. The reference example shows a simple brace-counter approach.

================ REQUIRED OUTPUT FORMAT ================
Emit ONLY the two XML blocks below — no prose before, between, or after them.

<bash_script>
test.sh content here
</bash_script>

<python_script>
test_state.py content here
</python_script>

================ REFERENCE EXAMPLE ================

<bash_script>
#!/bin/bash
# Ensure standard setup
apt-get update > /dev/null 2>&1
apt-get install -y python3-pip > /dev/null 2>&1
# Run the judge with unbuffered output for real-time logs
python3 -u /tests/test_state.py
</bash_script>

<python_script>
import re
import os
import sys
import traceback
from pathlib import Path

RELATIVE_TARGET = "src/Storage/Database.cs"
BUGGY_METHOD = "void SaveData("

def extract_method_body(code, method_signature):
# Brace-counting extractor -- handles nested braces correctly.
idx = code.find(method_signature)
if idx == -1:
return None
brace_start = code.find('{{', idx)
if brace_start == -1:
return None
depth = 0
for i in range(brace_start, len(code)):
if code[i] == '{{':
depth += 1
elif code[i] == '}}':
depth -= 1
if depth == 0:
return code[brace_start:i+1]
return None

def verify():
reward_file = Path("/logs/verifier/reward.txt")
reward_file.parent.mkdir(parents=True, exist_ok=True)

candidate_paths = [Path(RELATIVE_TARGET), Path("/app") / RELATIVE_TARGET]
target_path = next((p for p in candidate_paths if p.exists()), None)
if not target_path:
print(f"ERROR: target file not found. Searched: {{[str(p) for p in candidate_paths]}}")
reward_file.write_text("0.0")
print("VERIFIER: FAIL")
return

code = target_path.read_text()
body = extract_method_body(code, BUGGY_METHOD)
if body is None:
print("ERROR: buggy method not found in file.")
reward_file.write_text("0.0")
print("VERIFIER: FAIL")
return

# Check for presence of fix construct (using or finally+Dispose)
has_using = bool(re.search(r'\busing\s*\(', body))
has_finally_dispose = bool(re.search(r'finally', body)) and bool(re.search(r'\.Dispose\(', body))
fixed = has_using or has_finally_dispose

reward_file.write_text("1.0" if fixed else "0.0")
print("VERIFIER: PASS" if fixed else "VERIFIER: FAIL")

if __name__ == "__main__":
try:
verify()
except Exception:
traceback.print_exc()
Path("/logs/verifier/reward.txt").write_text("0.0")
sys.exit(1)
</python_script>

---
### Bug Report to Process:
{instruction}
"""

def author_verifier_harness(instruction: str, task_name: str, model_name: str = "gpt-5-nano") -> Tuple[str, str]:
    """Call the LLM to author both test.sh and test_state.py.

    Args:
        instruction: Full bug-report text substituted into the authoring prompt.
        task_name: Task identifier, used only in log messages.
        model_name: Chat-completions model used to author the harness.

    Returns:
        (bash_script, python_script). On any failure (API error, missing XML
        tags in the response) the corresponding entry is a sentinel string
        ("#!/bin/bash\\necho 'Error: ..." with real newlines / "# Error: ...")
        so callers can detect and skip failed authorings.
    """
    try:
        # Imported lazily so this module stays importable when the optional
        # `openai` dependency is absent; the failure is caught below.
        from openai import OpenAI
        from openai.types.chat import ChatCompletionUserMessageParam
        client = OpenAI()

        prompt = AUTHORING_PROMPT_TEMPLATE.format(instruction=instruction)

        messages: List[ChatCompletionUserMessageParam] = [
            {
                "role": "user",
                "content": prompt,
            }
        ]

        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
        )

        # The API can return None content (e.g. refusals); normalize to ""
        # so the regex searches below cannot raise TypeError.
        content = response.choices[0].message.content or ""

        # Extract Bash script
        bash_match = re.search(r"<bash_script>(.*?)</bash_script>", content, re.DOTALL)
        if not bash_match:
            print(f"ERROR [{task_name}]: Model response is missing <bash_script> tags.")
        bash_script = bash_match.group(1).strip() if bash_match else "# Error: No bash script authored"

        # Extract Python script
        python_match = re.search(r"<python_script>(.*?)</python_script>", content, re.DOTALL)
        if not python_match:
            print(f"ERROR [{task_name}]: Model response is missing <python_script> tags.")
        python_script = python_match.group(1).strip() if python_match else "# Error: No python script authored"

        return bash_script, python_script

    except Exception as e:
        print(f"Error calling LLM for verifier authoring on {task_name}: {e}")
        # Use real newlines (not a literal backslash-n) so the sentinel matches
        # the caller's startswith("#!/bin/bash\necho 'Error:") check and the
        # fallback is actually runnable shell.
        return f"#!/bin/bash\necho 'Error: {e}'\nexit 1", f"# Error: {str(e)}"

def inject_inferredbugs_verifier(dataset_dir: str, questions: List[str], model_name: str = "gpt-5-nano", max_workers: int = 30):
    """Orchestrate the authoring and injection of verifier harnesses into tasks.

    Args:
        dataset_dir: Root directory containing one sub-directory per task.
        questions: Instruction texts aligned positionally with the sorted task
            directories. If shorter than the directory list (e.g. a --limit was
            applied upstream), zip() truncates and only the paired prefix is
            processed.
        model_name: LLM used to author each harness.
        max_workers: Thread-pool size for concurrent LLM calls.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    tasks_root = Path(dataset_dir)
    task_dirs = sorted([d for d in tasks_root.iterdir() if d.is_dir()], key=lambda x: x.name)

    print(f"Authoring verifier harnesses for {len(task_dirs)} tasks using {model_name} (workers={max_workers})...")

    def process_task(task_dir, instruction):
        # Author and write the two harness files for one task. Skips tasks
        # that already have both files, so interrupted runs can resume.
        tests_dir = task_dir / "tests"
        tests_dir.mkdir(exist_ok=True)
        test_sh_path = tests_dir / "test.sh"
        test_py_path = tests_dir / "test_state.py"

        if test_sh_path.exists() and test_py_path.exists():
            print(f" - Verifier already exists for {task_dir.name}. Skipping.")
            return

        bash_code, python_code = author_verifier_harness(instruction, task_dir.name, model_name)

        # Error sentinels mean the LLM call or response parsing failed;
        # leave the task without a harness rather than writing garbage.
        if python_code.startswith("# Error:") or bash_code.startswith("#!/bin/bash\necho 'Error:"):
            print(f" - Skipping {task_dir.name}: LLM authoring failed.")
            return

        # Explicit UTF-8 keeps the written files stable across host locales.
        test_py_path.write_text(python_code, encoding="utf-8")
        test_sh_path.write_text(bash_code, encoding="utf-8")
        os.chmod(test_sh_path, 0o755)
        print(f" - Harness generated for {task_dir.name}")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_task, td, instr) for td, instr in zip(task_dirs, questions)]
        for f in as_completed(futures):
            # Re-raise any worker exception instead of swallowing it.
            f.result()