# NOTE(review): This artifact was recovered as a git patch whose newlines were
# collapsed and whose angle-bracket tags were stripped (the tag-extraction
# regexes and the prompt "OUTPUT FORMAT" markers were destroyed). Reconstructed
# below as the four new Python files the patch introduced, in patch order, with
# the harness tags restored as <test_sh>/<test_state_py> — TODO confirm the
# original tag names against the authoring prompt actually deployed.

# ============================================================================
# file: data/inferredbugs/generate_with_verifier.py (new file)
# ============================================================================
#!/usr/bin/env python3
"""
Generate verified InferredBugs dataset by adding LLM-authored verifiers to an
existing tasks dataset.
"""

import argparse
import re
import sys
from pathlib import Path

# Add project root to sys.path so sibling packages resolve when run as a script.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Import from parent package
from data.commons import (
    upload_tasks_to_hf,
    download_hf_dataset,
)
from scripts.harbor import tasks_parquet_converter as tpc
from data.inferredbugs.structural_verifier import inject_inferredbugs_verifier

# Patterns locating the target file path inside an instruction.md.
# Hoisted to module level (the original re-imported `re` inside the loop).
_FILE_BOLD_RE = re.compile(r"\*\*File:\*\* `(.*?)`")
_FILE_HEADER_RE = re.compile(r"###\s*File:\s*\n(.*)")


def update_instructions_with_requirement(dataset_dir: str) -> None:
    """Append the Deliverable Requirement to each task's instruction.md.

    This ensures the agent knows where to save the fix so the structural
    verifier can find it. Idempotent: tasks whose instructions already
    contain a "Deliverable Requirement" section are left untouched.
    """
    tasks_root = Path(dataset_dir)
    print(f"Updating instructions in: {tasks_root}")

    for task_dir in sorted(tasks_root.iterdir()):
        if not task_dir.is_dir():
            continue

        instr_path = task_dir / "instruction.md"
        if not instr_path.exists():
            continue
        content = instr_path.read_text()

        # Simple heuristic to find the target file from the instruction text.
        file_match = _FILE_BOLD_RE.search(content) or _FILE_HEADER_RE.search(content)
        if not file_match:
            continue

        target_file = file_match.group(1).strip()
        # BUG FIX: rsplit('/', 1)[0] on a bare filename returned the filename
        # itself, producing `mkdir -p /app/<filename>`; guard for that case.
        target_dir = target_file.rsplit("/", 1)[0] if "/" in target_file else "."
        requirement = f"""

## Deliverable Requirement
Write the complete, corrected version of the file to: `/app/{target_file}`.
The directory does not exist yet. You must create it and write the file:
```bash
mkdir -p /app/{target_dir}
cat > /app/{target_file} << 'ENDOFFILE'

ENDOFFILE
```
"""
        if "Deliverable Requirement" not in content:
            instr_path.write_text(content + requirement)


def main() -> None:
    """Download InferredBugs tasks, inject LLM-authored verifiers, upload."""
    parser = argparse.ArgumentParser(description="Generate Verified InferredBugs dataset")
    parser.add_argument("--skip_upload", action="store_true", help="Skip upload to Hugging Face")
    parser.add_argument("--model", type=str, default="gpt-5-nano", help="LLM to use for authoring verifiers")
    parser.add_argument("--limit", type=int, default=None, help="Limit the number of tasks to process")
    args = parser.parse_args()

    source_repo = "mlfoundations-dev/inferredbugs-sandboxes"
    target_repo = "DCAgent/inferredbugs-sandboxes-verifier"

    # 1. Download
    print(f"Step 1: Downloading source tasks from {source_repo}...")
    snapshot_dir = Path(download_hf_dataset(source_repo))

    # 2. Extract tasks
    print("Step 2: Extracting tasks from parquet files...")
    parquet_files = sorted(snapshot_dir.rglob("*.parquet"))
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {snapshot_dir}")

    # Use a FIXED directory to allow for resumption and skip logic.
    output_dir = PROJECT_ROOT / "data" / "inferredbugs" / "workdir"
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Working directory: {output_dir}")

    # Extract only if the directory is empty, to support resumption.
    if not any(output_dir.iterdir()):
        print(f"Extracting tasks to: {output_dir}")
        tpc.from_parquet(
            parquet_path=str(parquet_files[0]),
            base=str(output_dir),
            on_exist="overwrite",
        )
    else:
        print(f"Directory {output_dir} not empty. Skipping extraction to support resumption.")

    # 3. Update Instructions
    print("Step 3: Appending Deliverable Requirements to instructions...")
    update_instructions_with_requirement(str(output_dir))

    # 4. Inject Authored Verifiers
    print(f"Step 4: Authoring and injecting structural verifiers using {args.model}...")
    # Collect the instruction texts again to pass to the authoring engine.
    task_dirs = sorted(d for d in output_dir.iterdir() if d.is_dir())

    # Apply limit if specified.
    if args.limit:
        print(f"Limiting processing to first {args.limit} tasks.")
        task_dirs = task_dirs[: args.limit]

    instructions = []
    for d in task_dirs:
        instr_file = d / "instruction.md"
        instructions.append(instr_file.read_text() if instr_file.exists() else "")

    inject_inferredbugs_verifier(str(output_dir), instructions, model_name=args.model)

    # 5. Upload
    if not args.skip_upload:
        print(f"Step 5: Uploading verified tasks to {target_repo}...")
        upload_tasks_to_hf(str(output_dir), target_repo)
        print(f"Success! Tasks uploaded to: https://huggingface.co/datasets/{target_repo}")
    else:
        print(f"Upload skipped. Local tasks available in: {output_dir}")

    print("Verified InferredBugs Generation Complete!")


if __name__ == "__main__":
    main()


# ============================================================================
# file: data/inferredbugs/structural_verifier.py (new file)
# ============================================================================
"""
Logic for authoring and injecting structural Python verifiers for InferredBugs.
"""

import os
import re
from pathlib import Path
from typing import List, Tuple

# NOTE(review): the <test_sh>/<test_state_py> tag names below are reconstructed;
# the recovered source had its tags stripped. Confirm against the deployed prompt.
AUTHORING_PROMPT_TEMPLATE = """
You are authoring a verification harness for a bug fix task. The harness consists of two files:

- `test.sh`: bash entrypoint — installs ONLY the Python packages actually imported by `test_state.py` (if `test_state.py` only uses stdlib modules like `re`, `os`, `pathlib`, no pip installs are needed), then runs `python3 -u /tests/test_state.py`. Nothing else.
- `test_state.py`: Python script — reads the agent's submitted file, checks whether the bug is fixed, and writes a reward score

The harness must verify whether the bug described in the "Bug Report" below has been correctly fixed, by statically inspecting the source code (no compilation or execution).

================ CORE PRINCIPLES ================
1. Analyze Structure, Not Runtime: The environment lacks the .NET SDK. Use Python's `re` module or string parsing to inspect the source code files directly.
2. Target File: The bug report specifies the target file path. Read this file from the candidate paths (relative and `/app/` prefixed).
3. Analyze the Bug First: Read the "Before (Buggy File)" section in the bug report. Identify the exact buggy pattern and what class of fixes would address it. Document this in a comment in your verifier code.
4. The Contract: The Python script MUST write a scalar score (1.0 for success, 0.0 for failure) to the file `/logs/verifier/reward.txt`.
5. `test.sh` contains ONLY: environment setup (apt-get, pip install) and the single command `python3 -u /tests/test_state.py`. Nothing else — no Python code, no heredocs, no file creation, no cat commands.
6. `test_state.py` contains ALL verification logic: reading the target file, extracting the method body, checking the fix, and writing the reward. The target file is written by the agent being evaluated — do NOT create or modify it in either script.

================ VERIFICATION QUALITY RULES ================
These rules are CRITICAL. Violating them produces a useless verifier.

7. Scope checks to the specific buggy method/function named in the bug report. Do NOT check the entire file — patterns that already exist elsewhere in the file will cause false positives on unfixed code.

8. Design a check that would return 0.0 on the ORIGINAL buggy code and 1.0 on any correct fix. You decide the best strategy — presence of a fix construct, absence of a bug pattern, or a combination — whichever is most reliable for this specific bug type and language. Before finalizing, mentally run your check against the original buggy code shown in the bug report: if it would return 1.0, revise it.

9. Accept multiple valid fix patterns. There is rarely only one correct fix. Your check should pass for any reasonable fix, not just one specific implementation.

10. Use a brace-counting parser to extract the exact method body, not a regex that stops at the first closing brace. The reference example shows a simple brace-counter approach.

================ REQUIRED OUTPUT FORMAT ================
Emit ONLY the two XML blocks below — no prose before, between, or after them.

<test_sh>
test.sh content here
</test_sh>

<test_state_py>
test_state.py content here
</test_state_py>

================ REFERENCE EXAMPLE ================

<test_sh>
#!/bin/bash
# Ensure standard setup
apt-get update > /dev/null 2>&1
apt-get install -y python3-pip > /dev/null 2>&1
# Run the judge with unbuffered output for real-time logs
python3 -u /tests/test_state.py
</test_sh>

<test_state_py>
import re
import os
import sys
import traceback
from pathlib import Path

RELATIVE_TARGET = "src/Storage/Database.cs"
BUGGY_METHOD = "void SaveData("

def extract_method_body(code, method_signature):
    # Brace-counting extractor -- handles nested braces correctly.
    idx = code.find(method_signature)
    if idx == -1:
        return None
    brace_start = code.find('{{', idx)
    if brace_start == -1:
        return None
    depth = 0
    for i in range(brace_start, len(code)):
        if code[i] == '{{':
            depth += 1
        elif code[i] == '}}':
            depth -= 1
            if depth == 0:
                return code[brace_start:i+1]
    return None

def verify():
    reward_file = Path("/logs/verifier/reward.txt")
    reward_file.parent.mkdir(parents=True, exist_ok=True)

    candidate_paths = [Path(RELATIVE_TARGET), Path("/app") / RELATIVE_TARGET]
    target_path = next((p for p in candidate_paths if p.exists()), None)
    if not target_path:
        print(f"ERROR: target file not found. Searched: {{[str(p) for p in candidate_paths]}}")
        reward_file.write_text("0.0")
        print("VERIFIER: FAIL")
        return

    code = target_path.read_text()
    body = extract_method_body(code, BUGGY_METHOD)
    if body is None:
        print("ERROR: buggy method not found in file.")
        reward_file.write_text("0.0")
        print("VERIFIER: FAIL")
        return

    # Check for presence of fix construct (using or finally+Dispose)
    has_using = bool(re.search(r'\\busing\\s*\\(', body))
    has_finally_dispose = bool(re.search(r'finally', body)) and bool(re.search(r'\\.Dispose\\(', body))
    fixed = has_using or has_finally_dispose

    reward_file.write_text("1.0" if fixed else "0.0")
    print("VERIFIER: PASS" if fixed else "VERIFIER: FAIL")

if __name__ == "__main__":
    try:
        verify()
    except Exception:
        traceback.print_exc()
        Path("/logs/verifier/reward.txt").write_text("0.0")
        sys.exit(1)
</test_state_py>

---
### Bug Report to Process:
{instruction}
"""


def author_verifier_harness(instruction: str, task_name: str, model_name: str = "gpt-5-nano") -> Tuple[str, str]:
    """Call the LLM once and return (test.sh content, test_state.py content).

    On any failure the returned scripts start with an "# Error" / error-echo
    sentinel so callers can detect the failure and skip the task.
    """
    try:
        # Imported lazily so the module can be loaded without the SDK installed.
        from openai import OpenAI

        client = OpenAI()
        prompt = AUTHORING_PROMPT_TEMPLATE.format(instruction=instruction)

        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
        )

        # BUG FIX: message.content may be None (e.g. refusals); guard before regex.
        content = response.choices[0].message.content or ""

        # Extract Bash script.
        bash_match = re.search(r"<test_sh>(.*?)</test_sh>", content, re.DOTALL)
        if not bash_match:
            print(f"ERROR [{task_name}]: Model response is missing <test_sh> tags.")
        bash_script = bash_match.group(1).strip() if bash_match else "# Error: No bash script authored"

        # Extract Python script.
        python_match = re.search(r"<test_state_py>(.*?)</test_state_py>", content, re.DOTALL)
        if not python_match:
            print(f"ERROR [{task_name}]: Model response is missing <test_state_py> tags.")
        python_script = python_match.group(1).strip() if python_match else "# Error: No python script authored"

        return bash_script, python_script

    except Exception as e:
        print(f"Error calling LLM for verifier authoring on {task_name}: {e}")
        # BUG FIX: the original emitted literal "\\n" characters here, so the
        # caller's startswith("#!/bin/bash\necho 'Error:") check never matched.
        return f"#!/bin/bash\necho 'Error: {e}'\nexit 1", f"# Error: {e}"


def inject_inferredbugs_verifier(dataset_dir: str, questions: List[str], model_name: str = "gpt-5-nano", max_workers: int = 30) -> None:
    """Author and write a verifier harness into each task directory.

    Tasks that already have both harness files are skipped (resumption),
    as are tasks whose LLM authoring failed.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    tasks_root = Path(dataset_dir)
    task_dirs = sorted((d for d in tasks_root.iterdir() if d.is_dir()), key=lambda x: x.name)

    print(f"Authoring verifier harnesses for {len(task_dirs)} tasks using {model_name} (workers={max_workers})...")

    def process_task(task_dir: Path, instruction: str) -> None:
        tests_dir = task_dir / "tests"
        tests_dir.mkdir(exist_ok=True)
        test_sh_path = tests_dir / "test.sh"
        test_py_path = tests_dir / "test_state.py"

        if test_sh_path.exists() and test_py_path.exists():
            print(f" - Verifier already exists for {task_dir.name}. Skipping.")
            return

        bash_code, python_code = author_verifier_harness(instruction, task_dir.name, model_name)

        # Detect every sentinel the author function can produce (the original
        # missed the "# Error: No bash script authored" case).
        if python_code.startswith("# Error") or bash_code.startswith("# Error") \
                or bash_code.startswith("#!/bin/bash\necho 'Error:"):
            print(f" - Skipping {task_dir.name}: LLM authoring failed.")
            return

        test_py_path.write_text(python_code)
        test_sh_path.write_text(bash_code)
        os.chmod(test_sh_path, 0o755)
        print(f" - Harness generated for {task_dir.name}")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_task, td, instr) for td, instr in zip(task_dirs, questions)]
        for f in as_completed(futures):
            f.result()


# ============================================================================
# file: data/self_instruct/generate_with_verifier.py (new file)
# ============================================================================
#!/usr/bin/env python3
"""
Generate verified Self-Instruct dataset by adding LLM-authored functional verifiers.
"""

import argparse
import sys
from pathlib import Path

# Add project root to sys.path so sibling packages resolve when run as a script.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Import from parent package
from data.commons import (
    upload_tasks_to_hf,
    download_hf_dataset,
)
from scripts.harbor import tasks_parquet_converter as tpc
from data.self_instruct.self_instruct_verifier import inject_self_instruct_verifier


def main() -> None:
    """Download Self-Instruct tasks, inject functional verifiers, upload."""
    parser = argparse.ArgumentParser(description="Generate Verified Self-Instruct dataset")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of tasks")
    parser.add_argument("--skip_upload", action="store_true", help="Skip HF upload")
    parser.add_argument("--model", type=str, default="gpt-5-nano", help="Authoring model")
    args = parser.parse_args()

    source_repo = "DCAgent/selfinstruct-naive-sandboxes-2"
    target_repo = "DCAgent/selfinstruct-naive-sandboxes-2-verified"

    # 1. Download
    print(f"Step 1: Downloading source tasks from {source_repo}...")
    snapshot_dir = Path(download_hf_dataset(source_repo))

    # 2. Extract tasks (fixed workdir supports resumption, like inferredbugs).
    output_dir = PROJECT_ROOT / "data" / "self_instruct" / "workdir"
    output_dir.mkdir(parents=True, exist_ok=True)

    if not any(output_dir.iterdir()):
        print(f"Step 2: Extracting tasks to: {output_dir}")
        parquet_files = sorted(snapshot_dir.rglob("*.parquet"))
        if not parquet_files:
            raise FileNotFoundError(f"No parquet files found in {snapshot_dir}")
        tpc.from_parquet(parquet_path=str(parquet_files[0]), base=str(output_dir), on_exist="overwrite")
    else:
        print(f"Step 2: Reusing tasks in {output_dir}")

    # 3. Inject Verifiers
    print(f"Step 3: Authoring functional verifiers using {args.model}...")
    task_dirs = sorted(d for d in output_dir.iterdir() if d.is_dir())
    if args.limit:
        task_dirs = task_dirs[: args.limit]

    # BUG FIX: guard missing instruction.md (the original raised FileNotFoundError),
    # consistent with the inferredbugs pipeline.
    instructions = [
        (d / "instruction.md").read_text() if (d / "instruction.md").exists() else ""
        for d in task_dirs
    ]

    inject_self_instruct_verifier(str(output_dir), instructions, model_name=args.model)

    # 4. Upload
    if not args.skip_upload:
        print(f"Step 4: Uploading to {target_repo}...")
        upload_tasks_to_hf(str(output_dir), target_repo)
    else:
        print(f"Upload skipped. Tasks available in: {output_dir}")


if __name__ == "__main__":
    main()


# ============================================================================
# file: data/self_instruct/self_instruct_verifier.py (new file)
# ============================================================================
"""
Logic for authoring and injecting functional Python verifiers for Self-Instruct tasks.
"""

import os
import re
from pathlib import Path
from typing import List, Tuple

# NOTE(review): tag names reconstructed after corruption stripped them — TODO confirm.
AUTHORING_PROMPT_TEMPLATE = """
You are authoring a verification harness for a task completion evaluation. The harness consists of two files:

- `test.sh`: bash entrypoint — installs the Python packages imported by `test_state.py` (if `test_state.py` only uses stdlib modules like `re`, `os`, `pathlib`, no pip installs are needed), then runs `python3 -u /tests/test_state.py`.
- `test_state.py`: Python script — verifies the functional outcome of the task and writes a reward score.

The harness must verify whether the agent successfully completed the "Task Instruction" below by checking the state of the sandbox (files created, content of files, permissions, etc.).

================ CORE PRINCIPLES ================
1. Verify Outcomes, Not Just Code: Focus on the side effects. If the task asks to create a file, check if that file exists and has the correct content.
2. Deliverables: The instruction often lists specific "Deliverables". Your verifier MUST check for these files at their specified paths.
3. Exhaustive Constraint Matching & Comprehensive Testing: Identify every technical constraint (e.g., 'recursive', 'case-insensitive', 'human-readable'). Your Python script MUST independently verify each requirement by creating specific test scenarios (e.g., nested folders for recursion, varied cases for case-insensitivity). The final score must be 1.0 only if **all** checks pass.
4. Descriptive Logging: Print clear "CHECK PASSED" or "CHECK FAILED" messages for every specific requirement you verify.
5. Robustness: Handle whitespace, case sensitivity (if appropriate), and varied formatting in the agent's output.
6. The Contract: The Python script MUST write a scalar score (1.0 for success, 0.0 for failure) to the file `/logs/verifier/reward.txt`.
7. Environment: The `test.sh` bash script must install any dependencies required by your `test_state.py` Python script.
8. Path Handling: The agent will be working in a sandbox. Focus your checks on the file names and relative paths specifically mentioned in the "Task Instruction." Do not assume absolute paths (like /app) unless they are explicitly stated in the instruction.

================ REQUIRED OUTPUT FORMAT ================
Emit ONLY the two XML blocks below — no prose before, between, or after them.

<test_sh>
test.sh content here
</test_sh>

<test_state_py>
test_state.py content here
</test_state_py>

================ REFERENCE EXAMPLE ================

--- EXAMPLE INSTRUCTION ---
# Replace default with custom in /tmp .log files

Goal: Find all files in the /tmp directory with a .log extension and replace the word "default" with "custom" in each file, editing them in place.

Steps:

1. Create a script named fix_tmp_logs.sh that will:
   - Enumerate all .log files under /tmp (for example using find /tmp -type f -name "*.log" -print0).
   - For every file found, apply an in-place substitution that replaces the word default with custom, matching whole words only.
   - Print a short summary of how many files were updated.

2. Example implementation outline (you may adapt the exact syntax):
   - #!/usr/bin/env bash
     set -euo pipefail
     count=0
     while IFS= read -r -d '' file; do
       perl -0777 -pe 's/\\bdefault\\b/custom/g' -i "$file" || exit 1
       ((count++))
     done < <(find /tmp -type f -name "*.log" -print0)
     echo "$count file(s) updated."

3. Run the script and verify:
   - bash fix_tmp_logs.sh
   - Verify there are no remaining 'default' occurrences in /tmp/*.log:
     grep -R --word-regexp -n 'default' /tmp/*.log 2>/dev/null || true

Deliverables:
- A single executable script named fix_tmp_logs.sh that performs the task when run. Include executable permissions (chmod +x fix_tmp_logs.sh) as part of instructions. Optionally provide a short log of the changes.

--- EXAMPLE VERIFIER ---
<test_sh>
#!/bin/bash
# 1. Standard setup
apt-get update > /dev/null 2>&1
apt-get install -y python3 > /dev/null 2>&1
# 2. Run the functional judge
python3 -u /tests/test_state.py
</test_sh>

<test_state_py>
import os
import subprocess
from pathlib import Path

def verify():
    print("--- Starting Functional Verification ---")
    reward_file = Path("/logs/verifier/reward.txt")
    reward_file.parent.mkdir(parents=True, exist_ok=True)

    # 1. Locate the agent's deliverable
    script_name = "fix_tmp_logs.sh"
    # Robust search for the script (check both current dir and /app)
    candidate_paths = [Path(script_name), Path("/app") / script_name]
    script_path = next((p for p in candidate_paths if p.exists()), None)

    if not script_path:
        print(f"FAIL: {{script_name}} not found.")
        reward_file.write_text("0.0")
        return

    # 2. Setup a test case (Prepare dummy data)
    test_log = Path("/tmp/test_verifier_dummy.log")
    test_log.write_text("This is a default value. Also, another default here.")
    print(f"Created test log at {{test_log}}")

    # 3. Execute the agent's work
    try:
        os.chmod(script_path, 0o755)
        result = subprocess.run(["bash", str(script_path)], capture_output=True, text=True, timeout=30)
        print(f"STDOUT: {{result.stdout}}")
    except Exception as e:
        print(f"ERROR: Script failed to run: {{e}}")
        reward_file.write_text("0.0")
        return

    # 4. Validate the outcome (Side effects)
    content = test_log.read_text()
    if "custom" in content and "default" not in content:
        print("PASS: Substitution successful.")
        reward_file.write_text("1.0")
    else:
        print("FAIL: Substitution failed or 'default' remains.")
        reward_file.write_text("0.0")

    if test_log.exists(): test_log.unlink()

if __name__ == "__main__":
    verify()
</test_state_py>

---
### Task Instruction to Process:
{instruction}
"""


def author_verifier_harness(instruction: str, task_name: str, model_name: str = "gpt-5-nano") -> Tuple[str, str]:
    """Call the LLM once and return (test.sh content, test_state.py content).

    On any failure the returned scripts carry an "# Error" sentinel so callers
    can detect the failure and skip the task.
    """
    try:
        # Imported lazily so the module can be loaded without the SDK installed.
        from openai import OpenAI

        client = OpenAI()
        prompt = AUTHORING_PROMPT_TEMPLATE.format(instruction=instruction)

        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=1.0,
        )

        # BUG FIX: message.content may be None; guard before regex.
        content = response.choices[0].message.content or ""

        bash_match = re.search(r"<test_sh>(.*?)</test_sh>", content, re.DOTALL)
        if not bash_match:
            print(f"ERROR [{task_name}]: Missing <test_sh> tags")
        bash_script = bash_match.group(1).strip() if bash_match else "# Error: No bash script authored"

        python_match = re.search(r"<test_state_py>(.*?)</test_state_py>", content, re.DOTALL)
        if not python_match:
            print(f"ERROR [{task_name}]: Missing <test_state_py> tags")
        python_script = python_match.group(1).strip() if python_match else "# Error: No python script authored"

        return bash_script, python_script

    except Exception as e:
        print(f"Error calling LLM for verifier authoring on {task_name}: {e}")
        return "#!/bin/bash\nexit 1", f"# Error: {e}"


def inject_self_instruct_verifier(dataset_dir: str, questions: List[str], model_name: str = "gpt-5-nano", max_workers: int = 30) -> None:
    """Author and write a functional verifier harness into each task directory.

    Skips tasks that already have a full harness (resumption) and — BUG FIX:
    unlike the original, which wrote sentinel scripts to disk — skips tasks
    whose LLM authoring failed.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    tasks_root = Path(dataset_dir)
    task_dirs = sorted((d for d in tasks_root.iterdir() if d.is_dir()), key=lambda x: x.name)

    print(f"Authoring functional verifiers for {len(task_dirs)} tasks using {model_name}...")

    def process_task(task_dir: Path, instruction: str) -> None:
        tests_dir = task_dir / "tests"
        tests_dir.mkdir(exist_ok=True)
        test_sh_path = tests_dir / "test.sh"
        test_py_path = tests_dir / "test_state.py"

        # Check BOTH files (the original only checked test_state.py, so an
        # interrupted run could leave a task without test.sh forever).
        if test_sh_path.exists() and test_py_path.exists():
            print(f"[{task_dir.name}] Skipping - verifier already exists.")
            return

        bash_code, python_code = author_verifier_harness(instruction, task_dir.name, model_name)

        if python_code.startswith("# Error") or bash_code.startswith("# Error") \
                or bash_code == "#!/bin/bash\nexit 1":
            print(f"[{task_dir.name}] Skipping - LLM authoring failed.")
            return

        test_py_path.write_text(python_code)
        test_sh_path.write_text(bash_code)
        os.chmod(test_sh_path, 0o755)
        print(f"[{task_dir.name}] Verifier generated.")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_task, td, instr) for td, instr in zip(task_dirs, questions)]
        for f in as_completed(futures):
            f.result()