From 33f58a9c4cacf3da6c9b8c82d591146aa925087c Mon Sep 17 00:00:00 2001 From: Jay Scambler Date: Tue, 9 Jun 2026 14:38:06 -0500 Subject: [PATCH 1/3] fix(adapter-backends): make game scenarios trainable/servable + meaningful in-training assessment Surfaced by a live recursive-loop demo on local MLX (train -> publish -> auto-resolve -> serve). Two real gaps on the mlxlm/opd adapter path, both blocking game scenarios: 1. resolve_scenario_context returned '' for ScenarioInterface (game) scenarios -- they expose describe_rules/strategy_interface/evaluation_criteria but no get_task_prompt or description, so every game scenario trained/served on an EMPTY prompt. Now composes the describe_* methods into a task instruction. 2. _assess_mlxlm fed the model a RAW prompt, but mlx-lm's LoRA trainer and the serving path (MLXLMProvider.format_mlxlm_prompt) both apply the instruct chat template. An instruct model given a raw prompt emits prose, not verifier-scorable JSON, so the in-training metric read ~0 even for a good adapter. Extracted format_assess_prompt (chat-template + optional quality prefix, raw fallback for non-instruct tokenizers) and use it in assess. Adds scripts/demo_recursive_loop.py (self-contained end-to-end loop on grid_ctf) and docs/case-study-recursive-loop.md. Live result: base 0.58 -> auto-served adapter 0.82 (+41.9%) in 43s; in-training assessment (0.857) now agrees with the served measurement. Tests: resolve_scenario_context describe_* composition + precedence + empty fallback; format_assess_prompt chat-template + quality-prefix + raw fallback. CI-safe (no mlx). 
--- autocontext/docs/case-study-recursive-loop.md | 75 +++++++ autocontext/scripts/demo_recursive_loop.py | 205 ++++++++++++++++++ .../training/autoresearch/mlxlm_backend.py | 22 +- .../training/autoresearch/sequence_format.py | 22 +- autocontext/tests/test_mlxlm_backend.py | 31 +++ autocontext/tests/test_sequence_format.py | 41 ++++ 6 files changed, 392 insertions(+), 4 deletions(-) create mode 100644 autocontext/docs/case-study-recursive-loop.md create mode 100644 autocontext/scripts/demo_recursive_loop.py diff --git a/autocontext/docs/case-study-recursive-loop.md b/autocontext/docs/case-study-recursive-loop.md new file mode 100644 index 00000000..b74b007a --- /dev/null +++ b/autocontext/docs/case-study-recursive-loop.md @@ -0,0 +1,75 @@ +# Case study: the recursive loop, closed end to end on local MLX + +autocontext's premise is a loop: an agent attempts a task, the verifier scores the attempts, +the best trajectories train a model, and the _next_ run uses that trained model — with no human +in the middle. This is that loop running end to end on a single Mac: train a small LoRA adapter +on a scenario's verifier-scored strategies, publish and auto-activate it in the model registry, +and have the agent provider auto-resolve and serve it on the next run. The served model proposes +**41.9% better** strategies than the untrained base, and nothing about which model to serve is +hardcoded — it is resolved from the registry the training run wrote to. 
+ +## Result + +`grid_ctf` scenario, base model `mlx-community/Qwen2.5-0.5B-Instruct-4bit`, 8 strategies +sampled per measurement and scored by the scenario's own verifier: + +| Stage | Mean verifier score | Valid JSON rate | +| -------------------------------------- | -------------------- | --------------- | +| **run N** — base model as the agent | 0.5809 | 75% | +| **run N+1** — auto-served LoRA adapter | **0.8241** | 100% | +| delta | **+0.2432 (+41.9%)** | | + +The adapter was fine-tuned for 80 LoRA steps on the 60 highest-scoring strategies the loop +accumulated (mean verifier score 0.849). The whole loop — train, publish, auto-resolve, serve, +re-measure — ran in **43 seconds**. The in-training assessment (0.8565) independently agreed +with the served-adapter measurement (0.8241), so the metric the training run reports is the +score the served model actually delivers. + +## What "closed loop" means here + +The point is not that fine-tuning improves a model — that is expected. The point is that the +next run picks up the trained model **on its own**: + +``` +run N base Qwen2.5-0.5B-Instruct proposes grid_ctf strategies -> 0.58 +train LoRA SFT on the elite verifier-scored strategies (38s) +publish register + activate the adapter; record base_model on it -> state=active +bridge scenario_bound resolver -> plan_local_client -> MLXLMClient -> auto-selected +run N+1 AUTOCONTEXT_AGENT_PROVIDER=mlx serves base + adapter -> 0.82 +``` + +The `bridge` step is the load-bearing one. The serving run is given no model path. It calls +`_resolve_local_record(settings, scenario)`, which finds the active record the training run +published, and `plan_local_client(record)`, which routes an `mlxlm`/`opd` adapter to +`MLXLMClient(base=record.metadata["base_model"], adapter_path=record.checkpoint_path, ...)`. 
+That is why the registry record has to carry the base model the adapter was trained against — +an adapter checkpoint is useless without it — and why the publish step records it. + +## Reproduce + +Requires Apple Silicon with the mlx extra plus mlx-lm (`uv pip install mlx mlx-lm`). The base +model downloads once from the `mlx-community` Hugging Face repo. + +```bash +uv run python scripts/demo_recursive_loop.py +``` + +The script is self-contained: it builds the elite training set from the scenario's verifier, +calls `run_mlxlm_training`, publishes via `publish_training_output(..., auto_activate=True)`, +then resolves and serves the adapter through the exact code path the agent provider uses +(`scenario_bound_clients`), and prints the before/after verifier scores. + +## Two fixes this surfaced + +Running the loop on a game scenario exposed two real gaps, both fixed alongside this demo: + +1. **Game scenarios produced an empty task prompt.** `ScenarioInterface` scenarios expose + `describe_rules` / `describe_strategy_interface` / `describe_evaluation_criteria` but no + `get_task_prompt` or `description`, so `resolve_scenario_context` returned `""` — every game + scenario was untrainable on the adapter backends. It now composes the `describe_*` methods + into a task instruction. +2. **The in-training assessment fed the model a raw prompt.** `_assess_mlxlm` passed the bare + task string to `generate()`, but mlx-lm's LoRA trainer and the serving path both apply the + instruct chat template. An instruct model given a raw prompt emits prose, not scorable JSON, + so the in-training metric read ~0 even when the adapter was good. Assessment now applies the + chat template (`format_assess_prompt`), matching training and serving. 
diff --git a/autocontext/scripts/demo_recursive_loop.py b/autocontext/scripts/demo_recursive_loop.py new file mode 100644 index 00000000..9c79a1d9 --- /dev/null +++ b/autocontext/scripts/demo_recursive_loop.py @@ -0,0 +1,205 @@ +"""Live end-to-end demo of the autocontext recursive loop on local MLX. + +Closes the loop the PRs built: train a small mlx-lm LoRA adapter on a scenario's +verifier-scored strategies, PUBLISH + AUTO-ACTIVATE it in the model registry, then have +the scenario-bound resolver AUTO-SERVE it as the agent (no hardcoded path) and show the +served model proposes better strategies than the untrained base. + + run N = base Qwen2.5-0.5B-Instruct proposing grid_ctf strategies + train = LoRA SFT on the elite (verifier-scored) strategies the loop accumulates + publish = register + activate the adapter (records base_model + score_conditioned) + bridge = scenario_bound resolver -> plan_local_client -> MLXLMClient(base, adapter) + run N+1 = the AUTO-RESOLVED served adapter proposing grid_ctf strategies + +Requires the mlx extra + mlx-lm: uv pip install mlx mlx-lm +Run (from the package root): uv run python scripts/demo_recursive_loop.py +""" + +from __future__ import annotations + +import json +import random +import statistics +import tempfile +import time +from pathlib import Path + +from autocontext.agents.llm_client import MLXLMClient +from autocontext.agents.scenario_bound_clients import _build_planned_client, _resolve_local_record, plan_local_client +from autocontext.config.settings import AppSettings +from autocontext.scenarios import SCENARIO_REGISTRY +from autocontext.training.autoresearch.mlxlm_backend import ( + DEFAULT_BASE_MODEL, + run_mlxlm_training, + scenario_task_prompt, +) +from autocontext.training.autoresearch.sequence_format import extract_json_object +from autocontext.training.backends import default_backend_registry +from autocontext.training.model_registry import ( + ModelRegistry, + TrainingCompletionOutput, + 
def banner(msg: str) -> None:
    """Print ``msg`` framed above and below by a 78-character ``=`` rule."""
    print(f"\n{'=' * 78}\n{msg}\n{'=' * 78}", flush=True)


def print_measure(label: str, m: dict) -> None:
    """Print a one-line summary of a result dict produced by ``measure``.

    ``m`` must carry the ``mean``, ``best``, ``valid_rate`` and ``scores`` keys.
    """
    print(
        f"{label}: mean={m['mean']:.4f} best={m['best']:.4f} valid={m['valid_rate']:.0%} scores={m['scores']}",
        flush=True,
    )


def measure(client, scenario, task_prompt: str, *, n: int) -> dict:
    """Generate n strategies through the REAL agent client, score each via the verifier.

    ``client`` is an MLXLMClient (base-only for run N, base+adapter for run N+1) so the demo
    exercises the actual serving path -- including format_mlxlm_prompt's chat-template wrap,
    which an instruct model needs to emit parseable JSON.

    Returns a dict with ``mean`` and ``best`` (0.0 when nothing scored), ``valid_rate``
    (validated strategies / ``n``; 0.0 when ``n`` is not positive) and ``scores`` (rounded
    to 3 decimals). A sample that raises is reported on stdout and skipped, never re-raised.
    """
    scores: list[float] = []
    valid = 0
    for i in range(n):
        try:
            resp = client.generate(model="", prompt=task_prompt, max_tokens=128, temperature=0.7)
            strategy = extract_json_object(resp.text)
            if strategy is None:
                continue
            ok, _ = scenario.validate_actions(scenario.initial_state(seed=0), "challenger", strategy)
            if not ok:
                continue
            valid += 1
            scores.append(scenario.execute_match(strategy, seed=i).score)
        except Exception as exc:  # noqa: BLE001 - demo: surface and continue
            print(f"  (sample {i}: {type(exc).__name__})", flush=True)
            continue
    return {
        "mean": statistics.fmean(scores) if scores else 0.0,
        "best": max(scores) if scores else 0.0,
        # Guard the denominator: n == 0 runs no samples, so the rate is 0.0 rather than
        # a ZeroDivisionError (mirrors the empty-list fallbacks for mean/best above).
        "valid_rate": valid / n if n > 0 else 0.0,
        "scores": [round(s, 3) for s in scores],
    }
Returns + the mean score of the kept elite (what the adapter is taught to reproduce).""" + rng = random.Random(0) + candidates = [] + for _ in range(n_records * 8): + a = round(rng.uniform(0.0, 1.0), 3) + d = round(rng.uniform(0.0, min(1.0, 1.4 - a)), 3) # honor aggression + defense <= 1.4 + p = round(rng.uniform(0.0, 1.0), 3) + strat = {"aggression": a, "defense": d, "path_bias": p} + score = scenario.execute_match(strat, seed=0).score + candidates.append((score, strat)) + candidates.sort(key=lambda x: x[0], reverse=True) + elite = candidates[:n_records] + with path.open("w") as f: + for i, (score, strat) in enumerate(elite): + f.write( + json.dumps({"run_id": f"elite_{i // 10}", "scenario": SCENARIO, "strategy": strat, "score": score, "context": {}}) + + "\n" + ) + return statistics.fmean(s for s, _ in elite) + + +def main() -> None: + t0 = time.time() + scenario = SCENARIO_REGISTRY[SCENARIO]() + task_prompt = scenario_task_prompt(scenario) + base = DEFAULT_BASE_MODEL + + banner(f"autocontext recursive loop — live demo on local MLX\nscenario={SCENARIO} base={base}") + print(f"task prompt:\n {task_prompt}", flush=True) + + # --- run N: the untrained base model is the agent --------------------------------------- + banner("RUN N — baseline: base model proposes strategies (no trained model yet)") + before = measure(MLXLMClient(base), scenario, task_prompt, n=N_SAMPLES) + print_measure("base model", before) + + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + data_path = tmp_path / "training_data.jsonl" + out_dir = tmp_path / "mlxlm_run" + knowledge_root = tmp_path / "knowledge" + knowledge_root.mkdir() + + # --- accumulate the verifier-scored elite, then train an adapter on it --------------- + banner(f"TRAIN — LoRA SFT on the elite {N_TRAIN_RECORDS} verifier-scored strategies") + elite_mean = build_elite_training_set(scenario, data_path, n_records=N_TRAIN_RECORDS) + print(f"elite training set: {N_TRAIN_RECORDS} strategies, mean verifier 
score={elite_mean:.4f}", flush=True) + print(f"fine-tuning {base} for {TRAIN_STEPS} steps ...", flush=True) + metrics = run_mlxlm_training( + scenario_name=SCENARIO, + data_path=data_path, + output_dir=out_dir, + time_budget=900, + memory_limit_mb=16384, + train_steps=TRAIN_STEPS, + base_model=base, + assess_samples=N_SAMPLES, + assess_temperature=0.7, + ) + adapter_dir = out_dir / "adapters" + print( + f"trained. in-training assessment: avg_score={metrics['avg_score']:.4f} " + f"valid_rate={metrics['valid_rate']:.0%} ({metrics['training_seconds']:.0f}s)", + flush=True, + ) + + # --- publish + auto-activate (the recursive loop's hand-off) ------------------------- + banner("PUBLISH — register + auto-activate the adapter in the model registry") + registry = ModelRegistry(knowledge_root) + completion = TrainingCompletionOutput( + run_id="demo-run", + checkpoint_path=str(adapter_dir), + backend="mlxlm", + scenario=SCENARIO, + scenario_family="game", + runtime_types=default_backend_registry().get("mlxlm").supported_runtime_types(), + training_metrics={"avg_score": metrics["avg_score"]}, + metadata={"base_model": base, "score_conditioned": False}, + ) + record = publish_training_output(completion, registry, artifacts_root=None, auto_activate=True) + print(f"published: artifact={record.artifact_id} backend={record.backend} state={record.activation_state}", flush=True) + print(f" runtime_types={record.runtime_types} base_model={record.metadata.get('base_model')!r}", flush=True) + + # --- the BRIDGE: resolve + route purely from the registry (no hardcoded path) -------- + banner("BRIDGE — scenario_bound resolver auto-selects the trained adapter") + settings = AppSettings(agent_provider="mlx", mlx_model_path="", knowledge_root=knowledge_root) + resolved = _resolve_local_record(settings, SCENARIO) + assert resolved is not None, "resolver failed to find the active adapter" + plan = plan_local_client(resolved) + assert plan is not None, "router could not plan a client for the 
def format_assess_prompt(tokenizer: Any, task_prompt: str, *, score_conditioned: bool) -> str:
    """Render the assessment prompt through the model's chat template.

    mlx-lm's LoRA trainer applies the instruct chat template to prompt/completion records, and
    the serving path (``MLXLMProvider.format_mlxlm_prompt``) does the same. Assessment must match:
    feeding an instruct model a RAW prompt yields prose, not the JSON the verifier can score, so
    the in-training metric reads ~0.

    Args:
        tokenizer: Object expected to expose ``apply_chat_template(messages,
            add_generation_prompt=..., tokenize=...)``.
        task_prompt: The task instruction to present as the single user turn.
        score_conditioned: When true, the top-bucket quality prefix is prepended to the
            task prompt before templating.

    Returns:
        The templated prompt string. Falls back to the raw (optionally prefixed) text when
        the tokenizer has no usable chat template (a base, non-instruct model) -- i.e. when
        ``apply_chat_template`` is missing or raises, or yields anything other than a
        non-empty string.
    """
    prefix = _quality_prefix(NUM_QUALITY_BUCKETS - 1, NUM_QUALITY_BUCKETS) if score_conditioned else ""
    content = prefix + task_prompt
    try:
        rendered = tokenizer.apply_chat_template(
            [{"role": "user", "content": content}], add_generation_prompt=True, tokenize=False
        )
    except Exception:  # deliberate best-effort: any template failure means "use the raw prompt"
        return content
    # Do not coerce with str(): a None or token-id-list result would become the literal
    # prompt text "None" / "[1, 2, ...]" and silently wreck the assessment.
    if isinstance(rendered, str) and rendered:
        return rendered
    return content
sampler = make_sampler(temp=max(float(temperature), 0.0), top_k=int(top_k)) diff --git a/autocontext/src/autocontext/training/autoresearch/sequence_format.py b/autocontext/src/autocontext/training/autoresearch/sequence_format.py index 17c3de21..c9095151 100644 --- a/autocontext/src/autocontext/training/autoresearch/sequence_format.py +++ b/autocontext/src/autocontext/training/autoresearch/sequence_format.py @@ -245,9 +245,27 @@ def resolve_scenario_context(scenario: Any) -> str: if isinstance(prompt, str): return prompt description = getattr(scenario, "description", None) - if isinstance(description, str): + if isinstance(description, str) and description: return description - return "" + # Game scenarios (ScenarioInterface) expose no get_task_prompt/description but do + # describe their rules, strategy interface, and evaluation criteria -- compose those + # into a task instruction so the adapter backends train/serve on a real prompt rather + # than an empty string (otherwise every game scenario is untrainable on this path). + parts = [] + for attr, label in ( + ("describe_rules", "Rules"), + ("describe_strategy_interface", "Response format"), + ("describe_evaluation_criteria", "Evaluation"), + ): + fn = getattr(scenario, attr, None) + if callable(fn): + try: + text = fn() + except Exception: + text = None + if isinstance(text, str) and text: + parts.append(f"{label}: {text}") + return "\n".join(parts) # Backward-compatible private aliases (prepare/cuda historically used underscored names). 
diff --git a/autocontext/tests/test_mlxlm_backend.py b/autocontext/tests/test_mlxlm_backend.py index cdd3b3f8..21cbd899 100644 --- a/autocontext/tests/test_mlxlm_backend.py +++ b/autocontext/tests/test_mlxlm_backend.py @@ -160,3 +160,34 @@ def test_mlxlm_end_to_end_smoke(tmp_path: Path) -> None: backend="mlxlm", ) assert "avg_score" in metrics and "num_records" in metrics + + +def test_format_assess_prompt_applies_chat_template() -> None: + """Assessment must wrap the prompt in the instruct chat template (matching training + serving); + a raw prompt makes an instruct model emit prose, not verifier-scorable JSON (the ~0-metric bug).""" + + class _Tok: + def apply_chat_template(self, messages, add_generation_prompt, tokenize): + assert add_generation_prompt is True and tokenize is False + return f"<|user|>{messages[0]['content']}<|assistant|>" + + out = mb.format_assess_prompt(_Tok(), "do the task", score_conditioned=False) + assert out == "<|user|>do the task<|assistant|>" + + +def test_format_assess_prompt_prefixes_quality_when_score_conditioned() -> None: + class _Tok: + def apply_chat_template(self, messages, add_generation_prompt, tokenize): + return messages[0]["content"] # echo content so we can assert the prefix + + out = mb.format_assess_prompt(_Tok(), "do the task", score_conditioned=True) + assert out.startswith("Target quality:") + assert out.endswith("do the task") + + +def test_format_assess_prompt_falls_back_to_raw_without_chat_template() -> None: + class _Tok: + def apply_chat_template(self, *a, **k): + raise ValueError("no chat template") + + assert mb.format_assess_prompt(_Tok(), "raw text", score_conditioned=False) == "raw text" diff --git a/autocontext/tests/test_sequence_format.py b/autocontext/tests/test_sequence_format.py index 84465acd..dcb9c1c9 100644 --- a/autocontext/tests/test_sequence_format.py +++ b/autocontext/tests/test_sequence_format.py @@ -353,3 +353,44 @@ def test_score_loss_weights_softmax_low_temperature_concentrates_on_max() -> 
Non w = score_loss_weights([0.0, 0.5, 1.0], mode="softmax", temperature=0.01) assert w[2] > w[0] and w[2] > w[1] assert w[2] > 2.5 # near n=3 (all mass on the max) + + +def test_resolve_scenario_context_prefers_get_task_prompt() -> None: + from autocontext.training.autoresearch.sequence_format import resolve_scenario_context + + class _Scn: + def get_task_prompt(self) -> str: + return "explicit task prompt" + + def describe_rules(self) -> str: # must NOT be used when get_task_prompt exists + return "rules" + + assert resolve_scenario_context(_Scn()) == "explicit task prompt" + + +def test_resolve_scenario_context_composes_describe_methods_for_game_scenarios() -> None: + """Game scenarios (ScenarioInterface) have no get_task_prompt/description; the resolver must + compose their describe_* methods so the adapter backends get a real prompt, not an empty one.""" + from autocontext.training.autoresearch.sequence_format import resolve_scenario_context + + class _GameScn: + def describe_rules(self) -> str: + return "20x20 CTF map." + + def describe_strategy_interface(self) -> str: + return "Return JSON with aggression, defense, path_bias." + + def describe_evaluation_criteria(self) -> str: + return "Maximize capture progress." + + out = resolve_scenario_context(_GameScn()) + assert out # not empty (the bug) + assert "Rules: 20x20 CTF map." in out + assert "Response format: Return JSON with aggression, defense, path_bias." in out + assert "Evaluation: Maximize capture progress." 
in out + + +def test_resolve_scenario_context_empty_when_nothing_available() -> None: + from autocontext.training.autoresearch.sequence_format import resolve_scenario_context + + assert resolve_scenario_context(object()) == "" From 8a0ccfedfedd58ac52b99d472fc42334bef6c455 Mon Sep 17 00:00:00 2001 From: Jay Scambler Date: Tue, 9 Jun 2026 14:57:47 -0500 Subject: [PATCH 2/3] docs(recursive-loop): link the case study from the docs indexes (review #1061) The new case-study-recursive-loop.md was orphaned from the public docs path. Link it beside the MLX-training / OPD-case-study entries in the root README, docs/README, and the package README so users discover it. Root README re-applied without the formatter to avoid mangling the synced whats-new block + Surfaces table. --- README.md | 1 + autocontext/README.md | 1 + docs/README.md | 1 + 3 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 27c312fe..4bc95e85 100644 --- a/README.md +++ b/README.md @@ -332,6 +332,7 @@ Yes. Wire `autoctx mcp-serve` (or `bunx autoctx mcp-serve`) into Claude Code, Cu - Repo layout for coding agents: [AGENTS.md](AGENTS.md) - Local + cross-platform model training (MLX and TRL backends): [autocontext/docs/mlx-training.md](autocontext/docs/mlx-training.md) - Validated training result (on-policy distillation vs RLVR on GSM8K): [autocontext/docs/case-study-on-policy-distillation.md](autocontext/docs/case-study-on-policy-distillation.md) +- Recursive loop closed end to end on local MLX (train -> auto-serve -> improve): [autocontext/docs/case-study-recursive-loop.md](autocontext/docs/case-study-recursive-loop.md) - Sandbox and executor notes: [autocontext/docs/sandbox.md](autocontext/docs/sandbox.md) - Persistent host worker: [autocontext/docs/persistent-host.md](autocontext/docs/persistent-host.md) - License: [LICENSE](LICENSE) diff --git a/autocontext/README.md b/autocontext/README.md index 3f5b2043..d1ca309b 100644 --- a/autocontext/README.md +++ b/autocontext/README.md @@ 
-699,6 +699,7 @@ For the TypeScript equivalent, see `ts/src/integrations/anthropic/STABILITY.md`. - [Sandbox modes](docs/sandbox.md) - [Persistent host worker](docs/persistent-host.md) - [MLX host training](docs/mlx-training.md) +- [Case study: recursive loop closed on local MLX](docs/case-study-recursive-loop.md) - [TypeScript package guide](../ts/README.md) — `analyze`, mission control, and interactive TUI surfaces - [Demo data notes](demo_data/README.md) - [Copy-paste examples](../examples/README.md) diff --git a/docs/README.md b/docs/README.md index 64faf5b4..ee157075 100644 --- a/docs/README.md +++ b/docs/README.md @@ -23,6 +23,7 @@ This directory is the maintainer-facing landing page for repository docs. Use it - [Sandbox and executor notes](../autocontext/docs/sandbox.md) - [Persistent host worker](../autocontext/docs/persistent-host.md) - [MLX host training notes](../autocontext/docs/mlx-training.md) +- [Case study: recursive loop closed on local MLX](../autocontext/docs/case-study-recursive-loop.md) ## Contributing And Support From b891c72470211032c518c27bb9278996ceba9d98 Mon Sep 17 00:00:00 2001 From: Jay Scambler Date: Tue, 9 Jun 2026 15:09:58 -0500 Subject: [PATCH 3/3] feat(recursive-loop): multi-generation self-improvement demo + case-study trajectory Take the loop further: scripts/demo_recursive_loop_multigen.py runs the genuine recursive loop, where each generation trains ONLY on the model's own verifier-curated proposals (no external data, no human). The served model proposes -> verifier scores -> best-so-far becomes the next adapter's training set -> publish + auto-activate -> next gen is served by it. Live result (grid_ctf, Qwen2.5-0.5B, 3 generations, 33s): mean 0.577 -> 0.619 and best 0.689 -> 0.737, monotonic, valid-JSON 13/20 -> 20/20. 
The +7.4% is smaller than the single step's +41.9% and honestly so -- bootstrapping from a weak cold-start distribution is slower than training on a globally-curated elite; the point is the shape (it compounds on its own output), not the magnitude. Documented as a new section in the case study. --- autocontext/docs/case-study-recursive-loop.md | 26 ++- .../scripts/demo_recursive_loop_multigen.py | 175 ++++++++++++++++++ 2 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 autocontext/scripts/demo_recursive_loop_multigen.py diff --git a/autocontext/docs/case-study-recursive-loop.md b/autocontext/docs/case-study-recursive-loop.md index b74b007a..9745d109 100644 --- a/autocontext/docs/case-study-recursive-loop.md +++ b/autocontext/docs/case-study-recursive-loop.md @@ -45,13 +45,37 @@ published, and `plan_local_client(record)`, which routes an `mlxlm`/`opd` adapte That is why the registry record has to carry the base model the adapter was trained against — an adapter checkpoint is useless without it — and why the publish step records it. +## Does it compound? Multi-generation self-improvement + +The single step above trains on a curated near-optimal elite (strategies sampled across the +space and scored by the verifier), so it shows the ceiling the loop can reach with good data. +The stronger claim is that the loop improves on its OWN output with no external data: each +generation, the currently-served model proposes, the verifier scores, the best of everything +proposed so far becomes the next adapter's training set, and the next generation is served by +that adapter. 
Three generations bootstrapping from the base model's cold-start proposals: + +| Generation | Mean | Best | Valid JSON | +| -------------- | ------ | ------ | ---------- | +| gen 0 (base) | 0.5767 | 0.6885 | 13/20 | +| gen 1 (served) | 0.5952 | 0.7254 | 20/20 | +| gen 2 (served) | 0.5998 | 0.7351 | 20/20 | +| gen 3 (served) | 0.6194 | 0.7369 | 20/20 | + +Mean and best both rise monotonically, valid-JSON rate goes 13 -> 20/20, and the whole 3-gen +run takes 33s. The gain (+7.4%) is far smaller than the single-step +41.9%, and honestly so: +bootstrapping from the base model's own weak, low-diversity cold-start distribution is slower +than training on a globally-curated elite. The point is not the magnitude but the shape — the +loop compounds on its own verifier-scored output, generation over generation, with no human and +no external data. The only external signal is the verifier, which scores but never generates. + ## Reproduce Requires Apple Silicon with the mlx extra plus mlx-lm (`uv pip install mlx mlx-lm`). The base model downloads once from the `mlx-community` Hugging Face repo. ```bash -uv run python scripts/demo_recursive_loop.py +uv run python scripts/demo_recursive_loop.py # single step (train -> serve -> +41.9%) +uv run python scripts/demo_recursive_loop_multigen.py # multi-generation self-improvement ``` The script is self-contained: it builds the elite training set from the scenario's verifier, diff --git a/autocontext/scripts/demo_recursive_loop_multigen.py b/autocontext/scripts/demo_recursive_loop_multigen.py new file mode 100644 index 00000000..0ad90418 --- /dev/null +++ b/autocontext/scripts/demo_recursive_loop_multigen.py @@ -0,0 +1,175 @@ +"""Multi-generation self-improvement: the recursive loop compounding on its OWN output. + +The single-step demo (scripts/demo_recursive_loop.py) trains one adapter on a fixed elite set. 
+This one runs the genuine recursive loop: every generation, the CURRENTLY-SERVED model proposes +strategies, the verifier (the only external signal) scores them, the best of everything proposed +so far becomes the next adapter's training set, that adapter is published + auto-activated, and the +next generation is served by it. No hand-authored training data, no human in the loop -- the agent +bootstraps from its own cold-start proposals toward the verifier's optimum. + + gen 0 base Qwen2.5-0.5B-Instruct proposes; verifier scores; pool = its valid proposals + gen g train on the elite of the pool -> publish + auto-activate -> bridge serves it + -> it proposes -> verifier scores -> add to pool -> repeat + +Requires the mlx extra + mlx-lm: uv pip install mlx mlx-lm +Run (from the package root): uv run python scripts/demo_recursive_loop_multigen.py +""" + +from __future__ import annotations + +import json +import statistics +import tempfile +import time +from pathlib import Path + +from autocontext.agents.llm_client import MLXLMClient +from autocontext.agents.scenario_bound_clients import _build_planned_client, _resolve_local_record, plan_local_client +from autocontext.config.settings import AppSettings +from autocontext.scenarios import SCENARIO_REGISTRY +from autocontext.training.autoresearch.mlxlm_backend import DEFAULT_BASE_MODEL, run_mlxlm_training, scenario_task_prompt +from autocontext.training.autoresearch.sequence_format import extract_json_object +from autocontext.training.backends import default_backend_registry +from autocontext.training.model_registry import ModelRegistry, TrainingCompletionOutput, publish_training_output + +SCENARIO = "grid_ctf" +GENERATIONS = 3 +PROPOSALS_PER_GEN = 20 +ELITE_FRACTION = 0.5 +MIN_ELITE = 16 +TRAIN_STEPS = 80 +# batch_size=1: the gen-0 pool is small (only the base model's valid cold-start proposals), +# and mlx-lm requires the validation split to hold >= batch_size examples. 
def propose_and_score(client, scenario, task_prompt: str, *, n: int) -> list[tuple[dict, float]]:
    """The served model proposes n strategies; the verifier scores the valid ones.

    Returns ``(strategy, score)`` pairs for proposals that parse as JSON and pass
    ``scenario.validate_actions``. A proposal that raises is reported on stdout and skipped.
    """
    out: list[tuple[dict, float]] = []
    for i in range(n):
        try:
            resp = client.generate(model="", prompt=task_prompt, max_tokens=128, temperature=0.8)
            strategy = extract_json_object(resp.text)
            if strategy is None:
                continue
            ok, _ = scenario.validate_actions(scenario.initial_state(seed=0), "challenger", strategy)
            if not ok:
                continue
            out.append((strategy, scenario.execute_match(strategy, seed=i).score))
        except Exception as exc:  # noqa: BLE001 - demo: surface and continue
            print(f"  (proposal {i}: {type(exc).__name__})", flush=True)
    return out


def write_elite(pool: list[tuple[dict, float]], path: Path) -> tuple[int, float]:
    """Write the top strategies the agents have proposed so far as the next training set.

    Ranks ``pool`` by score (descending), keeps the larger of ``MIN_ELITE`` and
    ``ELITE_FRACTION`` of the pool (capped by the pool size), and writes one JSON record
    per line to ``path``.

    Returns:
        ``(number of records written, mean score of those records)``.

    Raises:
        ValueError: if ``pool`` is empty -- there is nothing to train on. Raised before
            ``path`` is touched, instead of truncating the file and then failing inside
            ``statistics.fmean`` with an opaque error.
    """
    if not pool:
        raise ValueError("write_elite: proposal pool is empty; no verifier-scored strategies to train on")
    ranked = sorted(pool, key=lambda x: x[1], reverse=True)
    keep = max(MIN_ELITE, int(len(ranked) * ELITE_FRACTION))
    elite = ranked[:keep]
    # Explicit encoding so the JSONL bytes do not depend on the platform locale.
    with path.open("w", encoding="utf-8") as f:
        for i, (strat, score) in enumerate(elite):
            f.write(
                json.dumps({"run_id": f"gen_{i // 12}", "scenario": SCENARIO, "strategy": strat, "score": score, "context": {}})
                + "\n"
            )
    return len(elite), statistics.fmean(s for _, s in elite)


def summarize(label: str, scored: list[tuple[dict, float]]) -> dict:
    """Print and return the count, mean and best score of one generation's proposals.

    ``mean`` and ``best`` are 0.0 when ``scored`` is empty.
    """
    scores = [s for _, s in scored]
    row = {
        "label": label,
        "n": len(scores),
        "mean": statistics.fmean(scores) if scores else 0.0,
        "best": max(scores) if scores else 0.0,
    }
    print(f"{label}: proposals_valid={row['n']} mean={row['mean']:.4f} best={row['best']:.4f}", flush=True)
    return row
banner(f"recursive self-improvement — {GENERATIONS} generations on local MLX\nscenario={SCENARIO} base={base}") + + pool: list[tuple[dict, float]] = [] + history: list[dict] = [] + + # --- gen 0: the cold-start base model is the agent -------------------------------------- + banner("GEN 0 — base model proposes (cold start, no training yet)") + gen0 = propose_and_score(MLXLMClient(base), scenario, task_prompt, n=PROPOSALS_PER_GEN) + pool.extend(gen0) + history.append(summarize("gen 0 (base)", gen0)) + + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + knowledge_root = tmp_path / "knowledge" + knowledge_root.mkdir() + registry = ModelRegistry(knowledge_root) + settings = AppSettings(agent_provider="mlx", mlx_model_path="", knowledge_root=knowledge_root) + runtime_types = default_backend_registry().get("mlxlm").supported_runtime_types() + + for g in range(1, GENERATIONS + 1): + banner(f"GEN {g} — train on the elite of {len(pool)} accumulated proposals, then self-serve") + data_path = tmp_path / f"train_gen{g}.jsonl" + out_dir = tmp_path / f"mlxlm_gen{g}" + n_elite, elite_mean = write_elite(pool, data_path) + print( + f" elite={n_elite} strategies (mean verifier score={elite_mean:.4f}) -> fine-tuning {TRAIN_STEPS} steps ...", + flush=True, + ) + + run_mlxlm_training( + scenario_name=SCENARIO, + data_path=data_path, + output_dir=out_dir, + time_budget=900, + memory_limit_mb=16384, + train_steps=TRAIN_STEPS, + batch_size=BATCH_SIZE, + base_model=base, + assess_samples=2, # we run our own measurement below; keep the internal assess cheap + assess_temperature=0.7, + ) + completion = TrainingCompletionOutput( + run_id=f"gen-{g}", + checkpoint_path=str(out_dir / "adapters"), + backend="mlxlm", + scenario=SCENARIO, + scenario_family="game", + runtime_types=runtime_types, + metadata={"base_model": base, "score_conditioned": False}, + ) + record = publish_training_output(completion, registry, artifacts_root=None, auto_activate=True) + print(f" published + 
activated: {record.artifact_id}", flush=True) + + # The bridge: resolve the just-activated adapter purely from the registry and serve it. + resolved = _resolve_local_record(settings, SCENARIO) + assert resolved is not None and resolved.artifact_id == record.artifact_id + client = _build_planned_client(plan_local_client(resolved), settings) + + geng = propose_and_score(client, scenario, task_prompt, n=PROPOSALS_PER_GEN) + pool.extend(geng) + history.append(summarize(f"gen {g} (served)", geng)) + + # --- trajectory --------------------------------------------------------------------------- + banner("TRAJECTORY — does the loop compound on its own output?") + base_mean = history[0]["mean"] + for row in history: + delta = row["mean"] - base_mean + bar = "#" * int(row["mean"] * 50) + print(f" {row['label']:<16} mean={row['mean']:.4f} best={row['best']:.4f} ({delta:+.4f} vs base) {bar}", flush=True) + final = history[-1]["mean"] + gain = (final - base_mean) / max(base_mean, 1e-9) + print(f"\n base -> final: {base_mean:.4f} -> {final:.4f} ({gain:+.1%}) in {time.time() - t0:.0f}s", flush=True) + + +if __name__ == "__main__": + main()