autonomousproductstudio-ai · rajatnagda45 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,35 @@
+name: CI
+
+on:  # triggers: pushes to main and pull requests targeting main
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  test:  # single job: install deps, lint, then run the tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"  # quoted so YAML reads a string, not the float 3.11
+
+      - name: Install deps
+        # Real orchestration stack (LangGraph + langchain-core + FastAPI) plus the light
+        # test deps. No LLM provider package / key is needed: the Research node degrades
+        # to a fixture offline. `pythonpath=src` in pyproject lets `import aps` work
+        # without an editable install. See memory.md §2.
+        run: |
+          python -m pip install --upgrade pip
+          pip install pydantic pydantic-settings requests structlog tenacity \
+                      langgraph langchain-core fastapi httpx cachetools \
+                      pytest pytest-asyncio ruff
+
+      - name: Lint (ruff)
+        run: ruff check src tests  # lints src/ and tests/ only
+
+      - name: Test (pytest)
+        run: python -m pytest  # no paths given; discovery is left to pytest's configuration
diff --git a/scripts/demo_run.py b/scripts/demo_run.py
@@ -0,0 +1,128 @@
+"""demo_run.py — clean full-vertical demo on any idea (Phase 6 defense / repro entry point).
+
+Runs Idea -> Research(fan-out) -> Product -> Architecture -> Execution -> Presentation,
+persists every artifact to the file store (.artifacts/<run_id>/), and prints a human
+summary. With an LLM key + free source keys it runs fully live; with no keys it degrades to
+the fixture brief and still completes end-to-end (so a judge can reproduce either way).
+
+    python scripts/demo_run.py "a privacy-first personal finance tracker for couples"
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))  # ../src first
+try:  # best-effort: emit UTF-8 and replace unencodable characters instead of raising
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+except Exception:
+    pass  # reconfigure unavailable or failed: keep stdout as it is
+
+
+def _parse_args(argv: list[str]) -> tuple[str, str | None]:
+    """Return (idea, model). `--model NAME` overrides the NIM model for verification runs
+    (e.g. qwen3.5-122b-a10b / glm-5.1); the positional arg is the idea."""
+    idea, model, rest = None, None, []
+    it = iter(argv)
+    for a in it:
+        if a == "--model":
+            model = next(it, None)
+        elif a.startswith("--model="):
+            model = a.split("=", 1)[1]
+        else:
+            rest.append(a)
+    if rest:
+        idea = rest[0]
+    return (idea or "a privacy-first personal finance tracker for couples", model)
+
+
+def main() -> int:
+    idea, model = _parse_args(sys.argv[1:])
+    # Must set the model env BEFORE importing settings (get_settings is lru_cached at import).
+    if model:
+        os.environ["APS_NIM_MODEL"] = model
+
+    from aps.orchestrator.events import EventBus
+    from aps.orchestrator.graph import run_sync
+    from aps.infra import artifact_store
+    from aps.config.settings import describe_runtime
+
+    run_id = "demo"
+    print(f"{describe_runtime()} fanout={os.getenv('APS_RESEARCH_FANOUT', 'true')}")
+    print(f">>> {idea!r}\n")
+
+    bus = EventBus()
+    state = run_sync(idea, bus, run_id=run_id)
+    path = artifact_store.save_run(run_id, state)
+
+    ev_types = [e.type for e in bus.history(run_id)]
+    produced = [a for a in ("research", "prd", "trd", "execution", "pitch")
+                if getattr(state, a) is not None]
+
+    # W6: drop a human-readable Markdown render of each artifact beside its JSON, so a judge
+    # running the demo gets readable documents (the pipeline still persists JSON only).
+    from aps.render import render_artifact
+    for name in produced:
+        (path / f"{name}.md").write_text(
+            render_artifact(name, getattr(state, name)), encoding="utf-8")
+    # T2.2: drop the TRD's Mermaid architecture diagrams alongside the JSON/MD
+    if state.trd is not None:
+        from aps.render import architecture_mmd
+        (path / "trd.mermaid.md").write_text(
+            architecture_mmd.render(state.trd), encoding="utf-8")
+    r, prd, trd, ex = state.research, state.prd, state.trd, state.execution
+
+    print(f"status        : {state.status.value}")
+    print(f"artifacts     : {', '.join(produced)}")
+    print(f"events        : {len(ev_types)}  (fan-out: "
+          f"{ev_types.count('research_unit_start')} sub-researchers)")
+    if r:
+        print(f"research      : {len(r.evidence)} evidence, {len(r.competitors)} competitors, "
+              f"{len(r.pain_points)} pains")
+        print(f"market_size   : {(r.market_size or '')[:90]}")
+    if prd:
+        print(f"prd           : {len(prd.personas)} personas, {len(prd.features)} features, "
+              f"{len(prd.requirements)} requirements, {len(prd.sources)} sources")
+    if trd:
+        print(f"trd           : OpenAPI {trd.api_spec.get('openapi')}, "
+              f"{len(trd.api_spec.get('paths', {}))} paths, stack {trd.stack[:4]}")
+    if ex:
+        print(f"execution     : {len(ex.backlog)} backlog items, {len(ex.sprints)} sprints")
+    print(f"pitch         : {'yes' if state.pitch else 'no'}")
+
+    if state.research:
+        from aps.scoring import score_startup
+        sc = score_startup(state.research, state.prd)
+        print(f"\nStartup Score : {sc.overall}/10 — {sc.verdict}")
+        for d in sc.dimensions:
+            print(f"  {d.name:24} {d.score:>4}/10  ({d.rationale})")
+
+        from aps.debate import run_debate
+        dbt = run_debate(state.research, state.prd)
+        print(f"\nDebate verdict: {dbt.verdict}  (confidence {int(dbt.confidence * 100)}%)")
+        print(f"  FOR : {len(dbt.build_case)} point(s) · AGAINST: {len(dbt.risk_case)} risk(s)")
+
+    if state.prd:
+        from aps.explain import explain_prd
+        ex = explain_prd(state.prd, state.research)
+        print(f"\nExplain-Why   : {int(ex.overall_confidence * 100)}% avg confidence "
+              f"across {len(ex.features)} feature(s) (every feature traced to its evidence)")
+
+    if state.prd:
+        # GitHub Launch preview (dry-run — creates nothing; set APS_GITHUB_PAT + run the
+        # live smoke / POST /launch/github to create the repo for real).
+        from aps.launch import build_launch_plan, launch_github
+        plan = build_launch_plan(state.idea, state.prd, state.execution, state.pitch)
+        prev = launch_github(plan, dry_run=True)
+        print(f"\nGitHub Launch : repo '{plan.repo_name}' — {len(plan.issues)} issues, "
+              f"{len(plan.milestones)} milestones (preview; set APS_GITHUB_PAT to create)")
+
+    print(f"\nartifacts saved to: {path}")
+
+    ok = state.status.value == "complete" and len(produced) == 5
+    print("\n" + ("PASS — full vertical reproduced end-to-end." if ok else "INCOMPLETE"))
+    return 0 if ok else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/eval_g01_live.py b/scripts/eval_g01_live.py
@@ -0,0 +1,69 @@
+"""Live single-idea eval (gold g01) for the real MEMO numbers.
+
+Runs the full orchestrator once (research fan-out + downstream agents) against a live
+model, scores it with the eval scorers, writes tests/evals/report.md, and prints the
+numbers to paste into MEMO.md. One idea on purpose — the full 8-idea gold set runs offline
+in CI (test_eval_runner.py); running all 8 live would burn ~240 model calls.
+
+    python scripts/eval_g01_live.py
+"""
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))  # ../src
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "tests", "evals"))  # run_eval
+try:  # best-effort: emit UTF-8 and replace unencodable characters instead of raising
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+except Exception:
+    pass  # reconfigure unavailable or failed: keep stdout as it is
+
+
+def _tool_counts() -> dict[str, float]:
+    from aps.infra.metrics import TOOL_CALLS
+    out: dict[str, float] = {}
+    collect = getattr(TOOL_CALLS, "collect", None)
+    if not collect:
+        return out
+    for fam in collect():
+        for s in fam.samples:
+            if s.name.endswith("_total") and s.value:
+                out[s.labels.get("tool")] = out.get(s.labels.get("tool"), 0.0) + s.value
+    return out
+
+
+def main() -> int:
+    # `--model NAME` overrides the NIM model for a verification run; set BEFORE importing
+    # settings/run_eval (get_settings is lru_cached at import).
+    argv = sys.argv[1:]
+    if "--model" in argv:
+        i = argv.index("--model")
+        if i + 1 < len(argv):
+            os.environ["APS_NIM_MODEL"] = argv[i + 1]
+
+    from aps.config.settings import describe_runtime
+    print(f"runtime: {describe_runtime()}")
+
+    import run_eval  # tests/evals/run_eval.py
+
+    g01 = [{"id": "g01", "idea": "Build an AI SaaS for resume screening",
+            "expect_sources": ["github", "hackernews", "reddit"], "min_evidence": 5}]
+    rows = run_eval.evaluate(g01)
+    report = Path(__file__).resolve().parents[1] / "tests" / "evals" / "report.md"
+    report.write_text(run_eval.to_markdown(rows), encoding="utf-8")
+
+    tools = _tool_counts()
+    r = rows[0]
+    print("=== g01 LIVE eval ===")
+    print(json.dumps(r, indent=2))
+    print("distinct tools called :", len(tools))
+    print("total tool calls      :", int(sum(tools.values())))
+    print("report.md written     :", report)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/live_fanout_smoke.py b/scripts/live_fanout_smoke.py
@@ -0,0 +1,85 @@
+"""live_fanout_smoke.py — Phase-3 fan-out verification (live).
+
+Runs the research fan-out supervisor on an idea and prints the plan, per-unit trace, the
+distinct retrieval tools the parallel sub-researchers selected, total tool calls, and the
+merged brief. Confirms the deliverable: >= 2 units, evidence > 0, ~15-20 tool calls.
+
+    python scripts/live_fanout_smoke.py "an AI resume builder that beats ATS filters"
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))  # ../src first
+try:  # best-effort: emit UTF-8 and replace unencodable characters instead of raising
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+except Exception:
+    pass  # reconfigure unavailable or failed: keep stdout as it is
+
+
+def _tool_counts(namespace: str | None = None) -> dict[str, float]:
+    from aps.infra.metrics import TOOL_CALLS
+    out: dict[str, float] = {}
+    collect = getattr(TOOL_CALLS, "collect", None)
+    if not collect:
+        return out
+    for fam in collect():
+        for s in fam.samples:
+            if s.name.endswith("_total") and s.value:
+                ns = s.labels.get("namespace")
+                tool = s.labels.get("tool")
+                if namespace and ns != namespace:
+                    continue
+                out[tool] = out.get(tool, 0.0) + s.value
+    return out
+
+
+def main() -> int:  # exit code: 0 when the fan-out check passes, 1 otherwise
+    idea = sys.argv[1] if len(sys.argv) > 1 else \
+        "an AI resume builder that beats ATS filters"  # default when no CLI argument is given
+
+    from aps.config.settings import get_settings
+    s = get_settings()
+    model = s.nim_model if s.model_provider == "nim" else s.gemini_model  # shown in the banner
+    print(f"provider={s.model_provider} model={model} "
+          f"max_concurrent={s.max_concurrent_researchers}")
+
+    events: list = []  # every (type, payload) pair on_event receives, in arrival order
+
+    def on_event(t: str, d: dict) -> None:  # callback passed to run_research_fanout below
+        events.append((t, d))
+        if t == "research_plan":
+            print("PLAN:")
+            for st in d["subtopics"]:
+                print(f"   - {st}")
+        elif t == "research_unit_start":
+            print(f"  unit START : {d['focus'][:60]}")
+        elif t == "research_unit_end":
+            print(f"  unit END   : {d['focus'][:55]} -> {d['evidence']} evidence")
+        elif t == "error":
+            print(f"  ERROR      : {d.get('error', '')[:90]}")  # assumes a str payload — confirm
+
+    from aps.agents.research.supervisor import run_research_fanout
+    print(f"\n>>> fan-out research on: {idea!r}\n")
+    r = run_research_fanout(idea, on_event=on_event)
+
+    retrieval = _tool_counts("retrieval")  # per-tool totals, namespace label "retrieval" only
+    units = [e for e in events if e[0] == "research_unit_start"]  # one entry per started unit
+    print("\n--- RESULT ---")
+    print("units spawned         :", len(units))
+    print("distinct retrieval    :", retrieval)
+    print("total retrieval calls :", int(sum(retrieval.values())))
+    print("evidence (merged)     :", len(r.evidence))
+    print("competitors           :", len(r.competitors))
+    print("pain_points           :", len(r.pain_points))
+    print("market_size           :", (r.market_size or "")[:80])
+
+    ok = len(units) >= 2 and len(r.evidence) > 0  # tool-call total is printed, not checked
+    print("\n" + ("PASS — fan-out produced a real merged brief; safe to ship Phase 3."
+                  if ok else "FAIL — see errors above."))
+    return 0 if ok else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # propagate main()'s result as the process exit status
diff --git a/scripts/live_github_launch_smoke.py b/scripts/live_github_launch_smoke.py
@@ -0,0 +1,48 @@
+"""Live GitHub Launch smoke — creates a REAL repo from a run (needs APS_GITHUB_PAT, repo scope).
+
+    APS_GITHUB_PAT=ghp_xxx python scripts/live_github_launch_smoke.py "your idea"
+
+Runs the full vertical, then launches the execution package to GitHub for real and prints
+the repo URL + created issues. This is NOT run in CI (it makes live calls and creates a repo).
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))  # ../src first
+
+
+def main() -> int:  # exit code: 0 when result.created is truthy, 1 on any failure path
+    idea = sys.argv[1] if len(sys.argv) > 1 else "A privacy-first habit tracker for couples"
+    # Relies on importing aps.config.settings to load .env into os.environ, so the PAT check
+    # below also sees a key set only in .env, not just one exported in the shell.
+    import aps.config.settings  # noqa: F401
+    if not os.getenv("APS_GITHUB_PAT"):  # NOTE(review): confirm .env values reach os.environ
+        print("FAIL: set APS_GITHUB_PAT (a repo-scoped PAT) to create the repo for real.")
+        return 1
+
+    from aps.orchestrator.events import EventBus
+    from aps.orchestrator.graph import run_sync
+    from aps.launch import build_launch_plan, launch_github
+
+    bus = EventBus()
+    state = run_sync(idea, bus, run_id="launch_smoke")
+    plan = build_launch_plan(state.idea, state.prd, state.execution, state.pitch)  # prd unchecked
+    print(f">>> launching repo '{plan.repo_name}' "
+          f"({len(plan.issues)} issues, {len(plan.milestones)} milestones)...")
+
+    result = launch_github(plan, dry_run=False)  # live call, not a dry run
+    print(result.message)
+    if result.created:
+        print("repo:", result.repo_url)
+        for u in result.issue_urls[:5]:  # show at most the first five issue URLs
+            print("  issue:", u)
+        print("\nPASS — real GitHub repo created.")
+        return 0
+    print("\nFAIL — see message above.")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # propagate main()'s result as the process exit status