From 9451634596828fcea1168000ad739b475961ab1e Mon Sep 17 00:00:00 2001
From: Hanwen Xing <77495133+harvenstar@users.noreply.github.com>
Date: Sun, 15 Mar 2026 16:13:16 -0700
Subject: [PATCH 1/4] data: add patcher for Nemotron-Terminal-Synthetic-Tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patches nvidia/Nemotron-Terminal-Synthetic-Tasks to use shared base
images instead of per-task unique images, enabling RL training on
Daytona.

Two sources of unique images are fixed:
- Removes `docker_image` from task.toml (pointed to private Nvidia
  Gitlab registry, inaccessible outside Nvidia)
- Removes `COPY files/ /app/` from Dockerfiles (baked per-task data
  files into images, making every image unique)

Task-specific data files are moved from environment/files/ to
setup_files/, which Harbor uploads to /setup_files/ in the container
before the agent runs. A setup preamble is prepended to instruction.md
for tasks that have data files.

Result: ~10 unique Dockerfiles (one per category: data_science,
security, debugging, etc.) — within Daytona's snapshot limit.
---
 .../patch_nemotron_synthetic_tasks.py         | 403 ++++++++++++++++++
 1 file changed, 403 insertions(+)
 create mode 100644 data/patchers/patch_nemotron_synthetic_tasks.py

diff --git a/data/patchers/patch_nemotron_synthetic_tasks.py b/data/patchers/patch_nemotron_synthetic_tasks.py
new file mode 100644
index 00000000..a9584562
--- /dev/null
+++ b/data/patchers/patch_nemotron_synthetic_tasks.py
@@ -0,0 +1,403 @@
+#!/usr/bin/env python3
+"""
+Patch nvidia/Nemotron-Terminal-Synthetic-Tasks to use shared base images
+instead of per-task unique images, making it compatible with Daytona's
+snapshot system for RL training.
+
+The original dataset has two sources of unique images per task:
+  1. task.toml specifies `docker_image` pointing to a private Nvidia Gitlab
+     registry (inaccessible outside Nvidia), causing failures on Daytona.
+  2. Dockerfile contains `COPY files/ /app/` which embeds task-specific data
+     files into the image, making every image unique.
+
+After patching, all tasks within the same category share one Dockerfile,
+resulting in ~10 unique images total (one per category: data_science,
+security, debugging, etc.) — well within Daytona's snapshot limit.
+
+Changes made per task:
+  1. environment/Dockerfile  — Remove `COPY files/ /app/` line; fix WORKDIR to /app
+  2. task.toml               — Remove `docker_image` (use Dockerfile build instead)
+  3. environment/files/      — Moved to setup_files/ (Harbor uploads these to
+                               /setup_files/ in the container before agent runs)
+  4. instruction.md          — Prepend setup preamble for tasks that have data files
+
+Usage:
+    # Patch a local directory of extracted tasks:
+    python patch_nemotron_synthetic_tasks.py /path/to/tasks
+
+    # Write to a separate output directory (leaves originals untouched):
+    python patch_nemotron_synthetic_tasks.py /path/to/tasks --output-dir /path/to/patched
+
+    # Download, extract, and patch directly from HuggingFace:
+    python patch_nemotron_synthetic_tasks.py --hf-dataset nvidia/Nemotron-Terminal-Synthetic-Tasks --output-dir /path/to/patched
+
+    # Dry run (show what would change without writing):
+    python patch_nemotron_synthetic_tasks.py /path/to/tasks --dry-run
+"""
+
+from __future__ import annotations
+
+import argparse
+import io
+import re
+import shutil
+import tarfile
+from pathlib import Path
+
+
+# ---------------------------------------------------------------------------
+# Templates
+# ---------------------------------------------------------------------------
+
+# Preamble added to instruction.md when task has data files in setup_files/.
+# Harbor uploads setup_files/* to /setup_files/ in the container before
+# the agent runs. This preamble tells the agent to copy them to /app/.
+SETUP_PREAMBLE = """\
+## Setup
+
+Data files for this task have been pre-loaded into `/setup_files/`.
+Before starting, copy them to your working directory:
+
+```bash
+cp -r /setup_files/. /app/
+```
+
+---
+
+"""
+# Substring of SETUP_PREAMBLE; finding it in instruction.md means the preamble is already there.
+ALREADY_PATCHED_MARKER = "Data files for this task have been pre-loaded into"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def patch_dockerfile(content: str) -> tuple[str, bool]:
+    """Remove the `COPY files/ /app/` line and rewrite `WORKDIR /` to `WORKDIR /app`.
+
+    Keywords match case-insensitively, as Docker parses them; the WORKDIR line keeps its line ending.
+    Returns (patched_content, was_changed).
+    """
+    new_lines = []
+    changed = False
+
+    for line in content.splitlines(keepends=True):
+        stripped = line.strip()
+        # Drop the COPY that bakes per-task data files into the image at build time
+        if re.match(r"(?i:COPY)\s+files/\s+/app/", stripped):
+            changed = True
+            continue
+        # Fix WORKDIR / → WORKDIR /app (Nvidia's Dockerfiles set WORKDIR / which is wrong)
+        if re.match(r"(?i:WORKDIR)\s+/$", stripped):
+            new_lines.append("WORKDIR /app" + line[len(line.rstrip()):])
+            changed = True
+            continue
+        new_lines.append(line)
+
+    return "".join(new_lines), changed
+
+
+def patch_task_toml(content: str) -> tuple[str, bool]:
+    """Strip every `docker_image = ...` assignment from a task.toml document.
+
+    Upstream tasks reference an image in a private Nvidia Gitlab registry;
+    without the key, Harbor builds the image from the Dockerfile instead.
+
+    Returns (patched_content, was_changed).
+    """
+    rows = content.splitlines(keepends=True)
+    kept = [
+        row
+        for row in rows
+        if not re.match(r"\s*docker_image\s*=", row)
+    ]
+
+    # A shorter list means at least one docker_image line was dropped.
+    was_changed = len(kept) != len(rows)
+    patched = "".join(kept)
+    return patched, was_changed
+
+
+# ---------------------------------------------------------------------------
+# Per-task patching
+# ---------------------------------------------------------------------------
+
+def patch_task(
+    task_dir: Path,
+    output_dir: Path | None = None,
+    dry_run: bool = False,
+) -> dict[str, bool | str]:
+    """Patch a single task directory.
+
+    With output_dir, the task is copied to output_dir/<name> and the copy is patched;
+    otherwise it is patched in place. With dry_run, nothing is written.
+
+    Returns a dict describing what was changed. Special keys:
+      "error" / "reason" — present when the task was skipped.
+    """
+    changes: dict[str, bool | str] = {}
+
+    # Validate basic structure
+    if not (task_dir / "instruction.md").exists():
+        return {"error": True, "reason": "no instruction.md"}
+
+    # Determine target (copy-then-patch vs in-place)
+    target = task_dir if output_dir is None else output_dir / task_dir.name
+    if dry_run or target.resolve() == task_dir.resolve():
+        # A dry run never writes, so inspect the source itself (the copy does not
+        # exist yet). Likewise never wipe the source when the target maps onto it.
+        target = task_dir
+    else:
+        if target.exists():
+            shutil.rmtree(target)
+        shutil.copytree(task_dir, target)
+
+    # ------------------------------------------------------------------
+    # 1. Dockerfile — remove COPY files/ /app/, fix WORKDIR
+    # ------------------------------------------------------------------
+    dockerfile_path = target / "environment" / "Dockerfile"
+    changes["Dockerfile"] = False
+    if dockerfile_path.exists():
+        patched, changed = patch_dockerfile(dockerfile_path.read_text())
+        changes["Dockerfile"] = changed
+        if changed and not dry_run:
+            dockerfile_path.write_text(patched)
+
+    # ------------------------------------------------------------------
+    # 2. task.toml — remove docker_image line
+    # ------------------------------------------------------------------
+    toml_path = target / "task.toml"
+    changes["task.toml"] = False
+    if toml_path.exists():
+        patched, changed = patch_task_toml(toml_path.read_text())
+        changes["task.toml"] = changed
+        if changed and not dry_run:
+            toml_path.write_text(patched)
+
+    # ------------------------------------------------------------------
+    # 3. environment/files/ → setup_files/
+    # ------------------------------------------------------------------
+    env_files_dir = target / "environment" / "files"
+    has_data_files = env_files_dir.is_dir() and any(env_files_dir.iterdir())
+    changes["setup_files"] = has_data_files
+
+    if has_data_files and not dry_run:
+        setup_files_dir = target / "setup_files"
+        setup_files_dir.mkdir(exist_ok=True)
+        for item in env_files_dir.iterdir():
+            dest = setup_files_dir / item.name
+            if item.is_dir():
+                shutil.copytree(item, dest, dirs_exist_ok=True)
+            else:
+                shutil.copy2(item, dest)
+        # Remove original files dir (it's now in setup_files/)
+        shutil.rmtree(env_files_dir)
+
+    # ------------------------------------------------------------------
+    # 4. instruction.md — prepend setup preamble if task has data files
+    # ------------------------------------------------------------------
+    instruction_path = target / "instruction.md"
+    if has_data_files and instruction_path.exists():
+        original = instruction_path.read_text()
+        if ALREADY_PATCHED_MARKER in original:
+            changes["instruction.md"] = False  # already patched
+        elif dry_run:
+            changes["instruction.md"] = True
+        else:
+            instruction_path.write_text(SETUP_PREAMBLE + original)
+            changes["instruction.md"] = True
+    else:
+        changes["instruction.md"] = False
+
+    return changes
+
+
+# ---------------------------------------------------------------------------
+# HuggingFace download + extraction helpers
+# ---------------------------------------------------------------------------
+
+def _iter_tasks_from_tar(tar_path: Path, out_dir: Path) -> list[Path]:
+    """Extract tasks from a .tar.gz file, flattening the category subdirectory.
+
+    The HF tarballs have structure:
+        ./easy_5000/data_science/data_science_task_1766/...
+
+    We extract each leaf task directory directly into out_dir as:
+        out_dir/data_science_task_1766/...
+
+    Entries with a `..` path component are skipped so a crafted archive cannot
+    write outside out_dir. Returns the task directories sorted by archive path.
+    """
+    task_dirs: list[Path] = []
+    with tarfile.open(tar_path, "r:gz") as tar:
+        members = tar.getmembers()
+        # Identify leaf task dirs (contain instruction.md); sorted for a stable order
+        task_roots = sorted(
+            {m.name[: -len("/instruction.md")] for m in members if m.name.endswith("/instruction.md")}
+        )
+
+        for task_rel in task_roots:  # e.g. ./easy_5000/data_science/task_1766
+            task_name = Path(task_rel).name  # e.g. task_1766
+            if task_name == "..":
+                continue  # would resolve to the parent of out_dir
+            task_out = out_dir / task_name
+            task_out.mkdir(parents=True, exist_ok=True)
+
+            # Extract all members belonging to this task
+            prefix = task_rel + "/"
+            for member in members:
+                if not member.name.startswith(prefix):
+                    continue
+                # Path relative to the task root; reject anything that climbs out of it
+                rel = member.name[len(prefix):].lstrip("/")
+                if not rel or ".." in Path(rel).parts:
+                    continue
+                dest = task_out / rel
+                if member.isdir():
+                    dest.mkdir(parents=True, exist_ok=True)
+                elif member.isfile():
+                    dest.parent.mkdir(parents=True, exist_ok=True)
+                    file_obj = tar.extractfile(member)
+                    if file_obj:
+                        dest.write_bytes(file_obj.read())
+            task_dirs.append(task_out)
+
+    return task_dirs
+
+
+def download_and_extract_from_hf(dataset_name: str, out_dir: Path) -> list[Path]:
+    """Fetch every .tar.gz shard of a HuggingFace dataset repo and unpack its tasks.
+
+    Shard number i (1-based, in repo listing order) is unpacked into
+    out_dir/_extracted_<i>/. Returns the extracted task directories.
+    """
+    try:
+        from huggingface_hub import HfApi
+    except ImportError:
+        raise SystemExit("huggingface_hub is required. Run: pip install huggingface_hub")
+
+    hub = HfApi()
+    tar_files = [
+        name
+        for name in hub.list_repo_files(dataset_name, repo_type="dataset")
+        if name.endswith(".tar.gz")
+    ]
+    if not tar_files:
+        raise SystemExit(f"No .tar.gz files found in {dataset_name}")
+
+    print(f"Found {len(tar_files)} tar.gz shards in {dataset_name}")
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    extracted: list[Path] = []
+    for i, hf_path in enumerate(tar_files, start=1):
+        print(f"  [{i}/{len(tar_files)}] Downloading {hf_path} ...")
+        cached = hub.hf_hub_download(repo_id=dataset_name, filename=hf_path, repo_type="dataset")
+
+        # Every shard is unpacked into its own numbered directory under out_dir
+        shard_dir = out_dir / f"_extracted_{i}"
+        shard_dir.mkdir(exist_ok=True)
+        task_dirs = _iter_tasks_from_tar(Path(cached), shard_dir)
+        extracted += task_dirs
+        print(f"    Extracted {len(task_dirs)} tasks")
+
+    return extracted
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """CLI entry point: resolve the task directories, patch each one, print a summary."""
+    parser = argparse.ArgumentParser(
+        description="Patch Nemotron-Terminal-Synthetic-Tasks for shared Docker images",
+    )
+    source = parser.add_mutually_exclusive_group(required=True)
+    source.add_argument(
+        "tasks_dir",
+        nargs="?",
+        help="Local directory containing extracted task folders",
+    )
+    source.add_argument(
+        "--hf-dataset",
+        metavar="REPO_ID",
+        help="Download and extract from HuggingFace (e.g. nvidia/Nemotron-Terminal-Synthetic-Tasks)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=None,
+        help="Write patched tasks here (default: patch in-place)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would change without writing",
+    )
+    args = parser.parse_args()
+
+    # ---- Resolve task directories ----
+    if args.hf_dataset:
+        if args.output_dir is None:
+            raise SystemExit("--output-dir is required when using --hf-dataset")
+        # Raw shards are unpacked under <output-dir>/_raw; patched copies go to <output-dir>
+        extract_dir = args.output_dir / "_raw"
+        print(f"Downloading from HuggingFace: {args.hf_dataset}")
+        task_dirs = download_and_extract_from_hf(args.hf_dataset, extract_dir)
+        tasks_root = extract_dir
+    else:
+        tasks_root = Path(args.tasks_dir)
+        if not tasks_root.is_dir():
+            raise SystemExit(f"Not a directory: {tasks_root}")
+        task_dirs = sorted(
+            d for d in tasks_root.iterdir()
+            if d.is_dir() and (d / "instruction.md").exists()
+        )
+
+    if not task_dirs:
+        raise SystemExit(f"No tasks found in {tasks_root}")
+
+    print(f"Found {len(task_dirs)} tasks")
+
+    if args.output_dir and not args.dry_run:
+        args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    # ---- Patch each task ----
+    totals: dict[str, int] = {}
+    errors = 0
+
+    for td in task_dirs:
+        result = patch_task(td, output_dir=args.output_dir, dry_run=args.dry_run)
+        if result.get("error"):
+            errors += 1
+            continue
+        # Tally, per result key, how many tasks reported a change
+        for k, v in result.items():
+            if v:
+                totals[k] = totals.get(k, 0) + 1
+
+    # ---- Report ----
+    action = "Would patch" if args.dry_run else "Patched"
+    print(f"\n{action}:")
+    for filename, count in sorted(totals.items()):
+        print(f"  {filename}: {count}/{len(task_dirs)}")
+    if errors:
+        print(f"  Errors (skipped): {errors}")
+
+    # Count unique Dockerfiles of the tasks just patched. Scanning the whole output
+    # tree would also pick up the unpatched copies that --hf-dataset extracts into
+    # <output-dir>/_raw/ and inflate the count.
+    if not args.dry_run:
+        out_root = args.output_dir or tasks_root
+        candidates = (out_root / td.name / "environment" / "Dockerfile" for td in task_dirs)
+        dockerfiles = {df.read_text() for df in candidates if df.is_file()}
+        print(f"\nUnique Dockerfiles after patching: {len(dockerfiles)}")
+        if len(dockerfiles) <= 10:
+            print("✓ Within Daytona's snapshot limit (≤10)")
+        else:
+            print("⚠ Exceeds recommended limit of 10 — consider consolidating Dockerfiles")
+
+
+if __name__ == "__main__":
+    main()

From 9eb1707b23165ecc566a37d76bcbe8d4eb5a75a4 Mon Sep 17 00:00:00 2001
From: Hanwen Xing <77495133+harvenstar@users.noreply.github.com>
Date: Tue, 17 Mar 2026 13:06:25 -0700
Subject: [PATCH 2/4] Add oracle solver synthesis script for
 Nemotron-Terminal-Synthetic-Tasks

Generates solution/solve.sh for each task using GPT-5-mini, reading only
instruction.md (never test files) to prevent the LLM from cheating by
hardcoding expected outputs or modifying the verifiers.
---
 data/patchers/synthesize_oracle_solvers.py | 296 +++++++++++++++++++++
 1 file changed, 296 insertions(+)
 create mode 100644 data/patchers/synthesize_oracle_solvers.py

diff --git a/data/patchers/synthesize_oracle_solvers.py b/data/patchers/synthesize_oracle_solvers.py
new file mode 100644
index 00000000..c8e92c62
--- /dev/null
+++ b/data/patchers/synthesize_oracle_solvers.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+"""
+Synthesize oracle solvers (solution/solve.sh) for Nemotron-Terminal-Synthetic-Tasks.
+
+For each task directory, reads ONLY instruction.md (never test files) and uses
+an LLM to generate a bash script that correctly completes the task.  The verifier
+files (test.sh, test_outputs.py) are intentionally withheld to prevent the LLM
+from cheating by hardcoding expected outputs or modifying the test suite.
+
+After running this script, validate the generated solvers with:
+    python scripts/daytona/validate_and_upload_from_hf.py \
+        --extract_dir /path/to/patched_tasks \
+        --oracle_check_only \
+        --target_repo open-thoughts/nemotron-synthetic-tasks-rl
+
+Usage:
+    # Generate oracle solvers for a directory of patched tasks:
+    python synthesize_oracle_solvers.py /path/to/patched_tasks
+
+    # Limit to first N tasks (useful for testing):
+    python synthesize_oracle_solvers.py /path/to/patched_tasks --limit 20
+
+    # Use a specific model and concurrency:
+    python synthesize_oracle_solvers.py /path/to/patched_tasks --model gpt-5-mini --workers 32
+
+    # Dry run (show what would be generated without writing):
+    python synthesize_oracle_solvers.py /path/to/patched_tasks --dry-run
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Optional
+
+# ---------------------------------------------------------------------------
+# Prompt templates
+# ---------------------------------------------------------------------------
+
+SYSTEM_PROMPT = """\
+You are an expert Linux terminal engineer. Your job is to write a bash script \
+that correctly completes a given task inside a Docker container running Ubuntu.
+
+Rules:
+- Output ONLY a bash script (starting with #!/bin/bash), no markdown fences, \
+no explanation.
+- The script must actually solve the task — do NOT hardcode expected outputs \
+or cheat.
+- Use standard Unix tools, Python 3, or whatever is appropriate for the task.
+- Assume the working directory is /app. Write output files to /app/ unless \
+the task specifies otherwise.
+- If data files are needed, they are available in /setup_files/ — copy them \
+to /app/ first with: cp -r /setup_files/. /app/
+- Keep the script concise and correct.
+"""
+
+USER_PROMPT_TEMPLATE = """\
+Complete the following task by writing a bash script (solve.sh).
+
+--- TASK ---
+{instruction}
+--- END TASK ---
+
+Output ONLY the bash script, starting with #!/bin/bash.
+"""
+
+
+# ---------------------------------------------------------------------------
+# Core generation logic
+# ---------------------------------------------------------------------------
+
+def _call_llm(instruction: str, model: str, client) -> str:
+    """Send the system and user prompts to the chat-completions API and return the stripped reply."""
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": USER_PROMPT_TEMPLATE.format(instruction=instruction)},
+        ],
+        temperature=0.2,
+        max_tokens=2048,
+    )
+    return response.choices[0].message.content.strip()
+
+
+def _clean_script(raw: str) -> str:
+    """Strip markdown code fences the LLM may have added; return a shebang-led script ending in a newline."""
+    body = raw.strip()
+    if body.startswith("```"):
+        # Drop the opening fence line (```bash or ```) and, when present, the closing one
+        lines = body.splitlines()[1:]
+        if lines and lines[-1].strip() == "```":
+            lines.pop()
+        body = "\n".join(lines).strip()
+    if not body.startswith("#!"):
+        body = "#!/bin/bash\n" + body
+    return body.rstrip("\n") + "\n"
+
+
+def synthesize_one(
+    task_dir: Path,
+    *,
+    model: str,
+    client,
+    overwrite: bool = False,
+    dry_run: bool = False,
+    max_retries: int = 3,
+) -> dict[str, object]:
+    """Generate solution/solve.sh for a single task directory.
+
+    Returns a result dict with keys: task, status, error (set when status is "error").
+    Possible statuses: "skipped", "dry_run", "ok", "error"
+    """
+    result: dict[str, object] = {"task": task_dir.name}
+    instruction_path = task_dir / "instruction.md"
+    if not instruction_path.exists():
+        result["status"] = "error"
+        result["error"] = "no instruction.md"
+        return result
+
+    solution_dir = task_dir / "solution"
+    solve_path = solution_dir / "solve.sh"
+
+    if solve_path.exists() and not overwrite:
+        result["status"] = "skipped"
+        return result
+
+    if dry_run:
+        result["status"] = "dry_run"
+        return result
+
+    instruction = instruction_path.read_text(encoding="utf-8")
+    # Retry with exponential backoff; an empty reply counts as a failed attempt
+    last_error: Optional[Exception] = None
+    for attempt in range(max_retries):
+        try:
+            raw = _call_llm(instruction, model, client)
+            if not raw:
+                raise ValueError("model returned an empty script")  # never save a bare shebang
+            script = _clean_script(raw)
+            solution_dir.mkdir(exist_ok=True)
+            solve_path.write_text(script, encoding="utf-8")
+            solve_path.chmod(0o755)
+            result["status"] = "ok"
+            return result
+        except Exception as exc:
+            last_error = exc
+            if attempt + 1 < max_retries:
+                time.sleep(2 ** attempt)  # skipped after the final attempt
+
+    result["status"] = "error"
+    result["error"] = str(last_error)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """CLI entry point: collect task directories, generate solvers in parallel, print a summary."""
+    parser = argparse.ArgumentParser(
+        description="Synthesize oracle solvers for Nemotron-Terminal-Synthetic-Tasks"
+    )
+    parser.add_argument(
+        "tasks_dir",
+        type=Path,
+        help="Directory containing patched task folders (each with instruction.md)",
+    )
+    parser.add_argument(
+        "--model",
+        default="gpt-5-mini",
+        help="OpenAI model to use (default: gpt-5-mini)",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=32,
+        help="Number of parallel API calls (default: 32)",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Process only the first N tasks (for testing)",
+    )
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Overwrite existing solution/solve.sh files",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be generated without calling the API",
+    )
+    parser.add_argument(
+        "--api-key",
+        default=None,
+        help="OpenAI API key (defaults to OPENAI_API_KEY env var)",
+    )
+    args = parser.parse_args()
+
+    # ---- Validate input directory ----
+    if not args.tasks_dir.is_dir():
+        raise SystemExit(f"Not a directory: {args.tasks_dir}")
+
+    task_dirs = sorted(
+        d for d in args.tasks_dir.iterdir()
+        if d.is_dir() and (d / "instruction.md").exists()
+    )
+    if not task_dirs:
+        raise SystemExit(f"No tasks found in {args.tasks_dir}")
+
+    if args.limit is not None:  # `--limit 0` means "no tasks", not "no limit"
+        task_dirs = task_dirs[: args.limit]
+
+    print(f"Found {len(task_dirs)} tasks in {args.tasks_dir}")
+    if args.dry_run:
+        print("Dry run — no API calls will be made")
+
+    # ---- Set up OpenAI client ----
+    client = None  # a dry run never reaches the API, so no client is needed
+    if not args.dry_run:
+        try:
+            from openai import OpenAI
+        except ImportError:
+            raise SystemExit("openai is required. Run: pip install openai")
+        api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
+        if not api_key:
+            raise SystemExit(
+                "OpenAI API key not found. Set OPENAI_API_KEY env var or pass --api-key."
+            )
+        client = OpenAI(api_key=api_key)
+
+    # ---- Run generation in parallel ----
+    counts = {"ok": 0, "skipped": 0, "dry_run": 0, "error": 0}
+    errors: list[str] = []
+
+    with ThreadPoolExecutor(max_workers=args.workers) as pool:
+        futures = {
+            pool.submit(
+                synthesize_one,
+                td,
+                model=args.model,
+                client=client,
+                overwrite=args.overwrite,
+                dry_run=args.dry_run,
+            ): td
+            for td in task_dirs
+        }
+
+        total = len(futures)
+        for completed, future in enumerate(as_completed(futures), 1):
+            try:
+                result = future.result()
+            except Exception as exc:  # keep going: one crashed worker must not abort the run
+                result = {"task": futures[future].name, "status": "error", "error": str(exc)}
+            status = result["status"]
+            counts[status] = counts.get(status, 0) + 1
+            if status == "error":
+                errors.append(f"  {result['task']}: {result.get('error', '?')}")
+            if completed % 100 == 0 or completed == total:
+                print(
+                    f"  [{completed}/{total}] ok={counts['ok']} "
+                    f"skipped={counts['skipped']} error={counts['error']}"
+                )
+
+    # ---- Report ----
+    print("\nDone:")
+    print(f"  Generated : {counts['ok']}")
+    print(f"  Skipped   : {counts['skipped']} (already had solve.sh)")
+    if args.dry_run:
+        print(f"  Dry run   : {counts['dry_run']}")
+    print(f"  Errors    : {counts['error']}")
+    if errors:
+        print("\nFailed tasks:")
+        for e in errors[:20]:
+            print(e)
+        if len(errors) > 20:
+            print(f"  ... and {len(errors) - 20} more")
+
+    print(
+        "\nNext step: validate with\n"
+        "  python scripts/daytona/validate_and_upload_from_hf.py \\\n"
+        f"    --extract_dir {args.tasks_dir} \\\n"
+        "    --oracle_check_only \\\n"
+        "    --target_repo open-thoughts/nemotron-synthetic-tasks-rl"
+    )
+
+
+if __name__ == "__main__":
+    main()

From 0d79396649eef45d56ef71cb5c2485143b4193ce Mon Sep 17 00:00:00 2001
From: Hanwen Xing <77495133+harvenstar@users.noreply.github.com>
Date: Wed, 18 Mar 2026 14:37:36 -0700
Subject: [PATCH 3/4] Fix gpt-5-mini API compatibility in oracle solver

- Remove temperature param (gpt-5-mini only supports default)
- Use max_completion_tokens instead of max_tokens
---
 data/patchers/synthesize_oracle_solvers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/data/patchers/synthesize_oracle_solvers.py b/data/patchers/synthesize_oracle_solvers.py
index c8e92c62..91cd499e 100644
--- a/data/patchers/synthesize_oracle_solvers.py
+++ b/data/patchers/synthesize_oracle_solvers.py
@@ -80,8 +80,7 @@ def _call_llm(instruction: str, model: str, client) -> str:
             {"role": "system", "content": SYSTEM_PROMPT},
             {"role": "user", "content": USER_PROMPT_TEMPLATE.format(instruction=instruction)},
         ],
-        temperature=0.2,
-        max_tokens=2048,
+        max_completion_tokens=2048,
     )
     return response.choices[0].message.content.strip()
 

From 4322544a0c4e2f1d46425961d8440c28025b2334 Mon Sep 17 00:00:00 2001
From: Hanwen Xing <77495133+harvenstar@users.noreply.github.com>
Date: Wed, 18 Mar 2026 16:49:15 -0700
Subject: [PATCH 4/4] Increase max_completion_tokens to 128k for reasoning
 models

gpt-5-mini uses reasoning tokens that count against
max_completion_tokens. With 2048 the model often exhausted
the budget on thinking, leaving zero tokens for output.
---
 data/patchers/synthesize_oracle_solvers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/patchers/synthesize_oracle_solvers.py b/data/patchers/synthesize_oracle_solvers.py
index 91cd499e..518df17f 100644
--- a/data/patchers/synthesize_oracle_solvers.py
+++ b/data/patchers/synthesize_oracle_solvers.py
@@ -80,7 +80,7 @@ def _call_llm(instruction: str, model: str, client) -> str:
             {"role": "system", "content": SYSTEM_PROMPT},
             {"role": "user", "content": USER_PROMPT_TEMPLATE.format(instruction=instruction)},
         ],
-        max_completion_tokens=2048,
+        max_completion_tokens=128000,
     )
     return response.choices[0].message.content.strip()