From 9451634596828fcea1168000ad739b475961ab1e Mon Sep 17 00:00:00 2001 From: Hanwen Xing <77495133+harvenstar@users.noreply.github.com> Date: Sun, 15 Mar 2026 16:13:16 -0700 Subject: [PATCH 1/4] data: add patcher for Nemotron-Terminal-Synthetic-Tasks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patches nvidia/Nemotron-Terminal-Synthetic-Tasks to use shared base images instead of per-task unique images, enabling RL training on Daytona. Two sources of unique images are fixed: - Removes `docker_image` from task.toml (pointed to private Nvidia Gitlab registry, inaccessible outside Nvidia) - Removes `COPY files/ /app/` from Dockerfiles (baked per-task data files into images, making every image unique) Task-specific data files are moved from environment/files/ to setup_files/, which Harbor uploads to /setup_files/ in the container before the agent runs. A setup preamble is prepended to instruction.md for tasks that have data files. Result: ~10 unique Dockerfiles (one per category: data_science, security, debugging, etc.) — within Daytona's snapshot limit. --- .../patch_nemotron_synthetic_tasks.py | 403 ++++++++++++++++++ 1 file changed, 403 insertions(+) create mode 100644 data/patchers/patch_nemotron_synthetic_tasks.py diff --git a/data/patchers/patch_nemotron_synthetic_tasks.py b/data/patchers/patch_nemotron_synthetic_tasks.py new file mode 100644 index 00000000..a9584562 --- /dev/null +++ b/data/patchers/patch_nemotron_synthetic_tasks.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Patch nvidia/Nemotron-Terminal-Synthetic-Tasks to use shared base images +instead of per-task unique images, making it compatible with Daytona's +snapshot system for RL training. + +The original dataset has two sources of unique images per task: + 1. task.toml specifies `docker_image` pointing to a private Nvidia Gitlab + registry (inaccessible outside Nvidia), causing failures on Daytona. + 2. 
Dockerfile contains `COPY files/ /app/` which embeds task-specific data + files into the image, making every image unique. + +After patching, all tasks within the same category share one Dockerfile, +resulting in ~10 unique images total (one per category: data_science, +security, debugging, etc.) — well within Daytona's snapshot limit. + +Changes made per task: + 1. environment/Dockerfile — Remove `COPY files/ /app/` line; fix WORKDIR to /app + 2. task.toml — Remove `docker_image` (use Dockerfile build instead) + 3. environment/files/ — Moved to setup_files/ (Harbor uploads these to + /setup_files/ in the container before agent runs) + 4. instruction.md — Prepend setup preamble for tasks that have data files + +Usage: + # Patch a local directory of extracted tasks: + python patch_nemotron_synthetic_tasks.py /path/to/tasks + + # Write to a separate output directory (leaves originals untouched): + python patch_nemotron_synthetic_tasks.py /path/to/tasks --output-dir /path/to/patched + + # Download, extract, and patch directly from HuggingFace: + python patch_nemotron_synthetic_tasks.py --hf-dataset nvidia/Nemotron-Terminal-Synthetic-Tasks --output-dir /path/to/patched + + # Dry run (show what would change without writing): + python patch_nemotron_synthetic_tasks.py /path/to/tasks --dry-run +""" + +from __future__ import annotations + +import argparse +import io +import re +import shutil +import tarfile +from pathlib import Path + + +# --------------------------------------------------------------------------- +# Templates +# --------------------------------------------------------------------------- + +# Preamble added to instruction.md when task has data files in setup_files/. +# Harbor uploads setup_files/* to /setup_files/ in the container before +# the agent runs. This preamble tells the agent to copy them to /app/. +SETUP_PREAMBLE = """\ +## Setup + +Data files for this task have been pre-loaded into `/setup_files/`. 
# Preamble prepended to instruction.md when a task ships data files. Harbor
# uploads setup_files/* to /setup_files/ in the container before the agent
# runs; the preamble tells the agent to copy them into /app/.
SETUP_PREAMBLE = """\
## Setup

Data files for this task have been pre-loaded into `/setup_files/`.
Before starting, copy them to your working directory:

```bash
cp -r /setup_files/. /app/
```

---

"""

# Substring of SETUP_PREAMBLE used to recognise an already-patched instruction.md.
ALREADY_PATCHED_MARKER = "Data files for this task have been pre-loaded into"

# Dockerfile instruction keywords are case-insensitive; the paths are not,
# so only the keyword is wrapped in a scoped (?i:...) group.
_COPY_FILES_RE = re.compile(r"(?i:COPY)\s+files/\s+/app/")
_ROOT_WORKDIR_RE = re.compile(r"(?i:WORKDIR)\s+/$")
_DOCKER_IMAGE_RE = re.compile(r"\s*docker_image\s*=")


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def patch_dockerfile(content: str) -> tuple[str, bool]:
    """Drop the ``COPY files/ /app/`` line and rewrite ``WORKDIR /`` to ``/app``.

    Args:
        content: Full text of a Dockerfile.

    Returns:
        ``(patched_content, was_changed)``. Lines that are not touched are
        returned exactly as given, including their line endings.
    """
    new_lines: list[str] = []
    changed = False

    for line in content.splitlines(keepends=True):
        stripped = line.strip()
        # Per-task data baked in at build time is what makes every image unique.
        if _COPY_FILES_RE.match(stripped):
            changed = True
            continue
        # Only a bare root WORKDIR is rewritten; "WORKDIR /app" is left alone.
        if _ROOT_WORKDIR_RE.match(stripped):
            new_lines.append("WORKDIR /app\n")
            changed = True
            continue
        new_lines.append(line)

    return "".join(new_lines), changed


def patch_task_toml(content: str) -> tuple[str, bool]:
    """Remove every ``docker_image = ...`` line from a task.toml document.

    Args:
        content: Full text of task.toml.

    Returns:
        ``(patched_content, was_changed)``.
    """
    new_lines: list[str] = []
    changed = False

    for line in content.splitlines(keepends=True):
        if _DOCKER_IMAGE_RE.match(line):
            changed = True
            continue
        new_lines.append(line)

    return "".join(new_lines), changed


def _apply_text_patch(path: Path, patcher, dry_run: bool) -> bool:
    """Run *patcher* over the file at *path*; write back unless *dry_run*.

    Returns True when the patcher reports a change, False when the file is
    missing or already clean.
    """
    if not path.exists():
        return False
    patched, changed = patcher(path.read_text(encoding="utf-8"))
    if changed and not dry_run:
        path.write_text(patched, encoding="utf-8")
    return changed


def _move_data_files(env_files_dir: Path, setup_files_dir: Path) -> None:
    """Copy everything under *env_files_dir* into *setup_files_dir*, then delete the source."""
    setup_files_dir.mkdir(exist_ok=True)
    for item in env_files_dir.iterdir():
        dest = setup_files_dir / item.name
        if item.is_dir():
            shutil.copytree(item, dest, dirs_exist_ok=True)
        else:
            shutil.copy2(item, dest)
    shutil.rmtree(env_files_dir)


# ---------------------------------------------------------------------------
# Per-task patching
# ---------------------------------------------------------------------------

def patch_task(
    task_dir: Path,
    output_dir: Path | None = None,
    dry_run: bool = False,
) -> dict[str, bool | str]:
    """Patch a single task directory.

    Args:
        task_dir: Task folder; must contain ``instruction.md``.
        output_dir: When given, the task is first copied to
            ``output_dir / task_dir.name`` and the copy is patched, leaving
            the original untouched. When ``None`` the task is patched in place.
        dry_run: When True nothing is written or copied; the returned dict
            describes what *would* change.

    Returns:
        A dict with boolean entries ``"Dockerfile"``, ``"task.toml"``,
        ``"setup_files"`` and ``"instruction.md"`` telling which parts were
        (or would be) changed. If the task has no ``instruction.md`` the dict
        is ``{"error": True, "reason": "no instruction.md"}`` instead.
    """
    if not (task_dir / "instruction.md").exists():
        return {"error": True, "reason": "no instruction.md"}

    # In a dry run the copy is never created, so inspect the original files;
    # otherwise every check below would look at a non-existent directory and
    # report "no changes".
    target = task_dir
    if output_dir is not None and not dry_run:
        candidate = output_dir / task_dir.name
        # If the destination *is* the source, wiping it before copying would
        # destroy the task. Fall back to patching in place.
        if candidate.resolve() != task_dir.resolve():
            if candidate.exists():
                shutil.rmtree(candidate)
            shutil.copytree(task_dir, candidate)
            target = candidate

    changes: dict[str, bool | str] = {}

    # 1. Dockerfile: remove COPY files/ /app/, fix WORKDIR
    changes["Dockerfile"] = _apply_text_patch(
        target / "environment" / "Dockerfile", patch_dockerfile, dry_run
    )

    # 2. task.toml: remove docker_image line
    changes["task.toml"] = _apply_text_patch(
        target / "task.toml", patch_task_toml, dry_run
    )

    # 3. environment/files/ -> setup_files/
    env_files_dir = target / "environment" / "files"
    has_data_files = env_files_dir.is_dir() and any(env_files_dir.iterdir())
    changes["setup_files"] = has_data_files
    if has_data_files and not dry_run:
        _move_data_files(env_files_dir, target / "setup_files")

    # 4. instruction.md: prepend setup preamble if the task has data files
    instruction_path = target / "instruction.md"
    needs_preamble = False
    if has_data_files and instruction_path.exists():
        original = instruction_path.read_text(encoding="utf-8")
        needs_preamble = ALREADY_PATCHED_MARKER not in original
        if needs_preamble and not dry_run:
            instruction_path.write_text(SETUP_PREAMBLE + original, encoding="utf-8")
    changes["instruction.md"] = needs_preamble

    return changes
+ """ + task_dirs: list[Path] = [] + with tarfile.open(tar_path, "r:gz") as tar: + members = tar.getmembers() + + # Identify leaf task dirs (contain instruction.md) + instruction_paths = { + m.name for m in members if m.name.endswith("/instruction.md") + } + + for instr_path in instruction_paths: + task_rel = instr_path[: -len("/instruction.md")] # e.g. ./easy_5000/data_science/task_1766 + task_name = Path(task_rel).name # e.g. task_1766 + + task_out = out_dir / task_name + task_out.mkdir(parents=True, exist_ok=True) + + # Extract all members belonging to this task + prefix = task_rel + "/" + for member in members: + if not (member.name == task_rel or member.name.startswith(prefix)): + continue + # Compute relative path within the task + rel = member.name[len(task_rel):].lstrip("/") + if not rel: + continue + + dest = task_out / rel + if member.isdir(): + dest.mkdir(parents=True, exist_ok=True) + elif member.isfile(): + dest.parent.mkdir(parents=True, exist_ok=True) + file_obj = tar.extractfile(member) + if file_obj: + dest.write_bytes(file_obj.read()) + + task_dirs.append(task_out) + + return task_dirs + + +def download_and_extract_from_hf(dataset_name: str, out_dir: Path) -> list[Path]: + """Download all tar.gz shards from a HuggingFace dataset and extract tasks. + + Returns list of extracted task directories. + """ + try: + from huggingface_hub import HfApi + except ImportError: + raise SystemExit("huggingface_hub is required. 
def download_and_extract_from_hf(dataset_name: str, out_dir: Path) -> list[Path]:
    """Fetch every ``.tar.gz`` shard of a HuggingFace dataset repo and unpack its tasks.

    Shard number ``i`` (1-based, in repo listing order) is extracted into
    ``out_dir / "_extracted_<i>"`` via ``_iter_tasks_from_tar``.

    Args:
        dataset_name: HuggingFace dataset repo id.
        out_dir: Directory that receives the per-shard extraction folders;
            created if missing.

    Returns:
        The extracted task directories of all shards, concatenated.

    Raises:
        SystemExit: If ``huggingface_hub`` is not installed or the repo lists
            no ``.tar.gz`` file.
    """
    try:
        from huggingface_hub import HfApi
    except ImportError:
        raise SystemExit("huggingface_hub is required. Run: pip install huggingface_hub")

    hub = HfApi()
    repo_listing = list(hub.list_repo_files(dataset_name, repo_type="dataset"))
    shards = [name for name in repo_listing if name.endswith(".tar.gz")]
    shard_total = len(shards)

    if shard_total == 0:
        raise SystemExit(f"No .tar.gz files found in {dataset_name}")

    print(f"Found {shard_total} tar.gz shards in {dataset_name}")

    out_dir.mkdir(parents=True, exist_ok=True)
    extracted: list[Path] = []

    for index, shard in enumerate(shards, start=1):
        print(f" [{index}/{shard_total}] Downloading {shard} ...")
        cached_archive = hub.hf_hub_download(
            repo_id=dataset_name,
            filename=shard,
            repo_type="dataset",
        )
        shard_dir = out_dir / f"_extracted_{index}"
        shard_dir.mkdir(exist_ok=True)
        shard_tasks = _iter_tasks_from_tar(Path(cached_archive), shard_dir)
        extracted += shard_tasks
        print(f" Extracted {len(shard_tasks)} tasks")

    return extracted
def main() -> None:
    """Command-line entry point: locate tasks, patch each one, print a summary.

    Tasks come either from a local directory (positional ``tasks_dir``) or
    from a HuggingFace dataset (``--hf-dataset``, which requires
    ``--output-dir``; the raw extraction is placed in ``<output-dir>/_raw``).
    After patching, the number of distinct Dockerfile contents among the
    patched tasks is reported.

    Raises:
        SystemExit: On invalid arguments or when no task is found.
    """
    parser = argparse.ArgumentParser(
        description="Patch Nemotron-Terminal-Synthetic-Tasks for shared Docker images",
    )
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument(
        "tasks_dir",
        nargs="?",
        help="Local directory containing extracted task folders",
    )
    source.add_argument(
        "--hf-dataset",
        metavar="REPO_ID",
        help="Download and extract from HuggingFace (e.g. nvidia/Nemotron-Terminal-Synthetic-Tasks)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="Write patched tasks here (default: patch in-place)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would change without writing",
    )
    args = parser.parse_args()

    # ---- Resolve task directories ----
    if args.hf_dataset:
        if args.output_dir is None:
            raise SystemExit("--output-dir is required when using --hf-dataset")
        extract_dir = args.output_dir / "_raw"
        print(f"Downloading from HuggingFace: {args.hf_dataset}")
        task_dirs = download_and_extract_from_hf(args.hf_dataset, extract_dir)
        tasks_root = extract_dir
    else:
        tasks_root = Path(args.tasks_dir)
        if not tasks_root.is_dir():
            raise SystemExit(f"Not a directory: {tasks_root}")
        task_dirs = sorted(
            d for d in tasks_root.iterdir()
            if d.is_dir() and (d / "instruction.md").exists()
        )

    if not task_dirs:
        raise SystemExit(f"No tasks found in {tasks_root}")

    print(f"Found {len(task_dirs)} tasks")

    if args.output_dir and not args.dry_run:
        args.output_dir.mkdir(parents=True, exist_ok=True)

    # ---- Patch each task ----
    totals: dict[str, int] = {}
    errors = 0

    for td in task_dirs:
        result = patch_task(td, output_dir=args.output_dir, dry_run=args.dry_run)
        if result.get("error"):
            errors += 1
            continue
        for key, value in result.items():
            if value:
                totals[key] = totals.get(key, 0) + 1

    # ---- Report ----
    action = "Would patch" if args.dry_run else "Patched"
    print(f"\n{action}:")
    for filename, count in sorted(totals.items()):
        print(f" {filename}: {count}/{len(task_dirs)}")
    if errors:
        print(f" Errors (skipped): {errors}")

    # Count unique Dockerfiles among the patched tasks only. Walking the whole
    # output tree would also pick up the unpatched copies under
    # <output-dir>/_raw when --hf-dataset is used and inflate the count.
    if not args.dry_run:
        dockerfiles: set[str] = set()
        for td in task_dirs:
            patched_dir = args.output_dir / td.name if args.output_dir else td
            dockerfile = patched_dir / "environment" / "Dockerfile"
            if dockerfile.is_file():
                dockerfiles.add(dockerfile.read_text())
        print(f"\nUnique Dockerfiles after patching: {len(dockerfiles)}")
        if len(dockerfiles) <= 10:
            print("✓ Within Daytona's snapshot limit (≤10)")
        else:
            print("⚠ Exceeds recommended limit of 10 — consider consolidating Dockerfiles")
print(f"\nUnique Dockerfiles after patching: {len(dockerfiles)}") + if len(dockerfiles) <= 10: + print("✓ Within Daytona's snapshot limit (≤10)") + else: + print(f"⚠ Exceeds recommended limit of 10 — consider consolidating Dockerfiles") + + +if __name__ == "__main__": + main() From 9eb1707b23165ecc566a37d76bcbe8d4eb5a75a4 Mon Sep 17 00:00:00 2001 From: Hanwen Xing <77495133+harvenstar@users.noreply.github.com> Date: Tue, 17 Mar 2026 13:06:25 -0700 Subject: [PATCH 2/4] Add oracle solver synthesis script for Nemotron-Terminal-Synthetic-Tasks Generates solution/solve.sh for each task using GPT-5-mini, reading only instruction.md (never test files) to prevent LLM from cheating by hardcoding expected outputs or modifying verifiers. --- data/patchers/synthesize_oracle_solvers.py | 296 +++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 data/patchers/synthesize_oracle_solvers.py diff --git a/data/patchers/synthesize_oracle_solvers.py b/data/patchers/synthesize_oracle_solvers.py new file mode 100644 index 00000000..c8e92c62 --- /dev/null +++ b/data/patchers/synthesize_oracle_solvers.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +""" +Synthesize oracle solvers (solution/solve.sh) for Nemotron-Terminal-Synthetic-Tasks. + +For each task directory, reads ONLY instruction.md (never test files) and uses +an LLM to generate a bash script that correctly completes the task. The verifier +files (test.sh, test_outputs.py) are intentionally withheld to prevent the LLM +from cheating by hardcoding expected outputs or modifying the test suite. 
+ +After running this script, validate the generated solvers with: + python scripts/daytona/validate_and_upload_from_hf.py \ + --extract_dir /path/to/patched_tasks \ + --oracle_check_only \ + --target_repo open-thoughts/nemotron-synthetic-tasks-rl + +Usage: + # Generate oracle solvers for a directory of patched tasks: + python synthesize_oracle_solvers.py /path/to/patched_tasks + + # Limit to first N tasks (useful for testing): + python synthesize_oracle_solvers.py /path/to/patched_tasks --limit 20 + + # Use a specific model and concurrency: + python synthesize_oracle_solvers.py /path/to/patched_tasks --model gpt-5-mini --workers 32 + + # Dry run (show what would be generated without writing): + python synthesize_oracle_solvers.py /path/to/patched_tasks --dry-run +""" + +from __future__ import annotations + +import argparse +import os +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Optional + +# --------------------------------------------------------------------------- +# Prompt templates +# --------------------------------------------------------------------------- + +SYSTEM_PROMPT = """\ +You are an expert Linux terminal engineer. Your job is to write a bash script \ +that correctly completes a given task inside a Docker container running Ubuntu. + +Rules: +- Output ONLY a bash script (starting with #!/bin/bash), no markdown fences, \ +no explanation. +- The script must actually solve the task — do NOT hardcode expected outputs \ +or cheat. +- Use standard Unix tools, Python 3, or whatever is appropriate for the task. +- Assume the working directory is /app. Write output files to /app/ unless \ +the task specifies otherwise. +- If data files are needed, they are available in /setup_files/ — copy them \ +to /app/ first with: cp -r /setup_files/. /app/ +- Keep the script concise and correct. 
+""" + +USER_PROMPT_TEMPLATE = """\ +Complete the following task by writing a bash script (solve.sh). + +--- TASK --- +{instruction} +--- END TASK --- + +Output ONLY the bash script, starting with #!/bin/bash. +""" + + +# --------------------------------------------------------------------------- +# Core generation logic +# --------------------------------------------------------------------------- + +def _call_llm(instruction: str, model: str, client) -> str: + """Call the OpenAI API and return the generated script text.""" + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": USER_PROMPT_TEMPLATE.format(instruction=instruction)}, + ], + temperature=0.2, + max_tokens=2048, + ) + return response.choices[0].message.content.strip() + + +def _clean_script(raw: str) -> str: + """Strip markdown code fences if the LLM added them despite instructions.""" + if raw.startswith("```"): + lines = raw.splitlines() + # Drop first line (```bash or ```) and last line (```) + inner = lines[1:] if lines[-1].strip() == "```" else lines[1:] + if inner and inner[-1].strip() == "```": + inner = inner[:-1] + raw = "\n".join(inner) + if not raw.startswith("#!"): + raw = "#!/bin/bash\n" + raw + return raw + + +def synthesize_one( + task_dir: Path, + *, + model: str, + client, + overwrite: bool = False, + dry_run: bool = False, + max_retries: int = 3, +) -> dict[str, object]: + """Generate solution/solve.sh for a single task directory. + + Returns a result dict with keys: task, status, error. 
+ Possible statuses: "skipped", "dry_run", "ok", "error" + """ + result: dict[str, object] = {"task": task_dir.name} + + instruction_path = task_dir / "instruction.md" + if not instruction_path.exists(): + result["status"] = "error" + result["error"] = "no instruction.md" + return result + + solution_dir = task_dir / "solution" + solve_path = solution_dir / "solve.sh" + + if solve_path.exists() and not overwrite: + result["status"] = "skipped" + return result + + if dry_run: + result["status"] = "dry_run" + return result + + instruction = instruction_path.read_text(encoding="utf-8") + + # Retry loop with exponential backoff + last_error: Optional[Exception] = None + for attempt in range(max_retries): + try: + raw = _call_llm(instruction, model, client) + script = _clean_script(raw) + solution_dir.mkdir(exist_ok=True) + solve_path.write_text(script, encoding="utf-8") + solve_path.chmod(0o755) + result["status"] = "ok" + return result + except Exception as exc: + last_error = exc + wait = 2 ** attempt + time.sleep(wait) + + result["status"] = "error" + result["error"] = str(last_error) + return result + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser( + description="Synthesize oracle solvers for Nemotron-Terminal-Synthetic-Tasks" + ) + parser.add_argument( + "tasks_dir", + type=Path, + help="Directory containing patched task folders (each with instruction.md)", + ) + parser.add_argument( + "--model", + default="gpt-5-mini", + help="OpenAI model to use (default: gpt-5-mini)", + ) + parser.add_argument( + "--workers", + type=int, + default=32, + help="Number of parallel API calls (default: 32)", + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Process only the first N tasks (for testing)", + ) + parser.add_argument( + "--overwrite", + action="store_true", + 
def main() -> None:
    """Command-line entry point: generate ``solution/solve.sh`` for every task.

    Collects the sub-directories of ``tasks_dir`` that contain
    ``instruction.md``, runs ``synthesize_one`` for each of them on a thread
    pool, prints progress every 100 tasks and a final summary that lists up
    to 20 failures.

    Raises:
        SystemExit: On invalid arguments, when no task is found, when the
            ``openai`` package is missing, or when no API key is available.
    """
    parser = argparse.ArgumentParser(
        description="Synthesize oracle solvers for Nemotron-Terminal-Synthetic-Tasks"
    )
    parser.add_argument(
        "tasks_dir",
        type=Path,
        help="Directory containing patched task folders (each with instruction.md)",
    )
    parser.add_argument(
        "--model",
        default="gpt-5-mini",
        help="OpenAI model to use (default: gpt-5-mini)",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=32,
        help="Number of parallel API calls (default: 32)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Process only the first N tasks (for testing)",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing solution/solve.sh files",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be generated without calling the API",
    )
    parser.add_argument(
        "--api-key",
        default=None,
        help="OpenAI API key (defaults to OPENAI_API_KEY env var)",
    )
    args = parser.parse_args()

    # Reject values that would otherwise be silently ignored (--limit 0 is
    # falsy) or blow up deep inside ThreadPoolExecutor (--workers 0).
    if args.workers < 1:
        parser.error("--workers must be at least 1")
    if args.limit is not None and args.limit < 1:
        parser.error("--limit must be at least 1")

    # ---- Validate input directory ----
    if not args.tasks_dir.is_dir():
        raise SystemExit(f"Not a directory: {args.tasks_dir}")

    task_dirs = sorted(
        d for d in args.tasks_dir.iterdir()
        if d.is_dir() and (d / "instruction.md").exists()
    )
    if not task_dirs:
        raise SystemExit(f"No tasks found in {args.tasks_dir}")

    if args.limit is not None:
        task_dirs = task_dirs[: args.limit]

    print(f"Found {len(task_dirs)} tasks in {args.tasks_dir}")
    if args.dry_run:
        print("Dry run — no API calls will be made")

    # ---- Set up OpenAI client ----
    if not args.dry_run:
        try:
            from openai import OpenAI
        except ImportError:
            raise SystemExit("openai is required. Run: pip install openai")

        api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise SystemExit(
                "OpenAI API key not found. Set OPENAI_API_KEY env var or pass --api-key."
            )
        client = OpenAI(api_key=api_key)
    else:
        client = None

    # ---- Run generation in parallel ----
    counts = {"ok": 0, "skipped": 0, "dry_run": 0, "error": 0}
    errors: list[str] = []

    with ThreadPoolExecutor(max_workers=args.workers) as pool:
        futures = {
            pool.submit(
                synthesize_one,
                td,
                model=args.model,
                client=client,
                overwrite=args.overwrite,
                dry_run=args.dry_run,
            ): td
            for td in task_dirs
        }

        completed = 0
        total = len(futures)
        for future in as_completed(futures):
            completed += 1
            try:
                result = future.result()
            except Exception as exc:
                # One misbehaving task must not abort the whole batch.
                result = {
                    "task": futures[future].name,
                    "status": "error",
                    "error": str(exc),
                }
            status = result["status"]
            counts[status] = counts.get(status, 0) + 1
            if status == "error":
                errors.append(f" {result['task']}: {result.get('error', '?')}")
            if completed % 100 == 0 or completed == total:
                print(
                    f" [{completed}/{total}] ok={counts['ok']} "
                    f"skipped={counts['skipped']} error={counts['error']}"
                )

    # ---- Report ----
    print("\nDone:")
    print(f" Generated : {counts['ok']}")
    print(f" Skipped : {counts['skipped']} (already had solve.sh)")
    if args.dry_run:
        print(f" Dry run : {counts['dry_run']}")
    print(f" Errors : {counts['error']}")
    if errors:
        print("\nFailed tasks:")
        for e in errors[:20]:
            print(e)
        if len(errors) > 20:
            print(f" ... and {len(errors) - 20} more")

    print(
        "\nNext step: validate with\n"
        " python scripts/daytona/validate_and_upload_from_hf.py \\\n"
        f" --extract_dir {args.tasks_dir} \\\n"
        " --oracle_check_only \\\n"
        " --target_repo open-thoughts/nemotron-synthetic-tasks-rl"
    )
and {len(errors) - 20} more") + + print( + "\nNext step: validate with\n" + " python scripts/daytona/validate_and_upload_from_hf.py \\\n" + f" --extract_dir {args.tasks_dir} \\\n" + " --oracle_check_only \\\n" + " --target_repo open-thoughts/nemotron-synthetic-tasks-rl" + ) + + +if __name__ == "__main__": + main() From 0d79396649eef45d56ef71cb5c2485143b4193ce Mon Sep 17 00:00:00 2001 From: Hanwen Xing <77495133+harvenstar@users.noreply.github.com> Date: Wed, 18 Mar 2026 14:37:36 -0700 Subject: [PATCH 3/4] Fix gpt-5-mini API compatibility in oracle solver - Remove temperature param (gpt-5-mini only supports default) - Use max_completion_tokens instead of max_tokens --- data/patchers/synthesize_oracle_solvers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/data/patchers/synthesize_oracle_solvers.py b/data/patchers/synthesize_oracle_solvers.py index c8e92c62..91cd499e 100644 --- a/data/patchers/synthesize_oracle_solvers.py +++ b/data/patchers/synthesize_oracle_solvers.py @@ -80,8 +80,7 @@ def _call_llm(instruction: str, model: str, client) -> str: {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": USER_PROMPT_TEMPLATE.format(instruction=instruction)}, ], - temperature=0.2, - max_tokens=2048, + max_completion_tokens=2048, ) return response.choices[0].message.content.strip() From 4322544a0c4e2f1d46425961d8440c28025b2334 Mon Sep 17 00:00:00 2001 From: Hanwen Xing <77495133+harvenstar@users.noreply.github.com> Date: Wed, 18 Mar 2026 16:49:15 -0700 Subject: [PATCH 4/4] Increase max_completion_tokens to 128k for reasoning models gpt-5-mini uses reasoning tokens that count against max_completion_tokens. With 2048 the model often exhausted the budget on thinking, leaving zero tokens for output. 
--- data/patchers/synthesize_oracle_solvers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/patchers/synthesize_oracle_solvers.py b/data/patchers/synthesize_oracle_solvers.py index 91cd499e..518df17f 100644 --- a/data/patchers/synthesize_oracle_solvers.py +++ b/data/patchers/synthesize_oracle_solvers.py @@ -80,7 +80,7 @@ def _call_llm(instruction: str, model: str, client) -> str: {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": USER_PROMPT_TEMPLATE.format(instruction=instruction)}, ], - max_completion_tokens=2048, + max_completion_tokens=128000, ) return response.choices[0].message.content.strip()