Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Continuous-integration workflow: lint with ruff, then run the pytest suite.
name: CI

# Triggered by pushes to main and by pull requests that target main.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install deps
        # Real orchestration stack (LangGraph + langchain-core + FastAPI) plus the light
        # test deps. No LLM provider package / key is needed: the Research node degrades
        # to a fixture offline. `pythonpath=src` in pyproject lets `import aps` work
        # without an editable install. See memory.md §2.
        run: |
          python -m pip install --upgrade pip
          pip install pydantic pydantic-settings requests structlog tenacity \
            langgraph langchain-core fastapi httpx cachetools \
            pytest pytest-asyncio ruff

      # Only `src` and `tests` are linted; other directories are not passed to ruff.
      - name: Lint (ruff)
        run: ruff check src tests

      - name: Test (pytest)
        run: python -m pytest
128 changes: 128 additions & 0 deletions scripts/demo_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""demo_run.py — clean full-vertical demo on any idea (Phase 6 defense / repro entry point).

Runs Idea -> Research(fan-out) -> Product -> Architecture -> Execution -> Presentation,
persists every artifact to the file store (.artifacts/<run_id>/), and prints a human
summary. With an LLM key + free source keys it runs fully live; with no keys it degrades to
the fixture brief and still completes end-to-end (so a judge can reproduce either way).

python scripts/demo_run.py "a privacy-first personal finance tracker for couples"
"""
from __future__ import annotations

import os
import sys

# Put the repo's `src/` directory (a sibling of this script's directory) at the front of
# sys.path so the `aps` imports below resolve when the script is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
# Best-effort: ask stdout to encode as UTF-8 and replace unencodable characters, since the
# summary prints non-ASCII punctuation. Any failure to reconfigure is deliberately ignored.
try:
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
except Exception:
    pass


def _parse_args(argv: list[str]) -> tuple[str, str | None]:
"""Return (idea, model). `--model NAME` overrides the NIM model for verification runs
(e.g. qwen3.5-122b-a10b / glm-5.1); the positional arg is the idea."""
idea, model, rest = None, None, []
it = iter(argv)
for a in it:
if a == "--model":
model = next(it, None)
elif a.startswith("--model="):
model = a.split("=", 1)[1]
else:
rest.append(a)
if rest:
idea = rest[0]
return (idea or "a privacy-first personal finance tracker for couples", model)


def main() -> int:
    """Run the full vertical on one idea, persist the artifacts, and print a summary.

    The idea and the optional ``--model`` override are read from ``sys.argv``. The run is
    executed under the fixed run id ``"demo"``, saved through ``artifact_store.save_run``,
    and each produced artifact is additionally rendered to Markdown next to the saved run.

    Returns:
        0 when the run status is ``"complete"`` and all five artifacts (research, prd, trd,
        execution, pitch) were produced; 1 otherwise.
    """
    idea, model = _parse_args(sys.argv[1:])
    # Must set the model env BEFORE importing settings (get_settings is lru_cached at import).
    if model:
        os.environ["APS_NIM_MODEL"] = model

    from aps.orchestrator.events import EventBus
    from aps.orchestrator.graph import run_sync
    from aps.infra import artifact_store
    from aps.config.settings import describe_runtime

    run_id = "demo"
    print(f"{describe_runtime()} fanout={os.getenv('APS_RESEARCH_FANOUT', 'true')}")
    print(f">>> {idea!r}\n")

    bus = EventBus()
    state = run_sync(idea, bus, run_id=run_id)
    path = artifact_store.save_run(run_id, state)

    ev_types = [e.type for e in bus.history(run_id)]
    produced = [a for a in ("research", "prd", "trd", "execution", "pitch")
                if getattr(state, a) is not None]

    # W6: drop a human-readable Markdown render of each artifact beside its JSON, so a judge
    # running the demo gets readable documents (the pipeline still persists JSON only).
    from aps.render import render_artifact
    for name in produced:
        (path / f"{name}.md").write_text(
            render_artifact(name, getattr(state, name)), encoding="utf-8")
    # T2.2: drop the TRD's Mermaid architecture diagrams alongside the JSON/MD
    if state.trd is not None:
        from aps.render import architecture_mmd
        (path / "trd.mermaid.md").write_text(
            architecture_mmd.render(state.trd), encoding="utf-8")
    r, prd, trd, execution = state.research, state.prd, state.trd, state.execution

    print(f"status : {state.status.value}")
    print(f"artifacts : {', '.join(produced)}")
    print(f"events : {len(ev_types)} (fan-out: "
          f"{ev_types.count('research_unit_start')} sub-researchers)")
    if r:
        print(f"research : {len(r.evidence)} evidence, {len(r.competitors)} competitors, "
              f"{len(r.pain_points)} pains")
        print(f"market_size : {(r.market_size or '')[:90]}")
    if prd:
        print(f"prd : {len(prd.personas)} personas, {len(prd.features)} features, "
              f"{len(prd.requirements)} requirements, {len(prd.sources)} sources")
    if trd:
        print(f"trd : OpenAPI {trd.api_spec.get('openapi')}, "
              f"{len(trd.api_spec.get('paths', {}))} paths, stack {trd.stack[:4]}")
    if execution:
        print(f"execution : {len(execution.backlog)} backlog items, "
              f"{len(execution.sprints)} sprints")
    print(f"pitch : {'yes' if state.pitch else 'no'}")

    if state.research:
        from aps.scoring import score_startup
        sc = score_startup(state.research, state.prd)
        print(f"\nStartup Score : {sc.overall}/10 — {sc.verdict}")
        for d in sc.dimensions:
            print(f" {d.name:24} {d.score:>4}/10 ({d.rationale})")

        from aps.debate import run_debate
        dbt = run_debate(state.research, state.prd)
        print(f"\nDebate verdict: {dbt.verdict} (confidence {int(dbt.confidence * 100)}%)")
        print(f" FOR : {len(dbt.build_case)} point(s) · AGAINST: {len(dbt.risk_case)} risk(s)")

    if state.prd:
        from aps.explain import explain_prd
        # Distinct name: the execution artifact is already bound above and must not be
        # shadowed by the explain result.
        explained = explain_prd(state.prd, state.research)
        print(f"\nExplain-Why : {int(explained.overall_confidence * 100)}% avg confidence "
              f"across {len(explained.features)} feature(s) (every feature traced to its evidence)")

    if state.prd:
        # GitHub Launch preview (dry-run — creates nothing; set APS_GITHUB_PAT + run the
        # live smoke / POST /launch/github to create the repo for real).
        from aps.launch import build_launch_plan, launch_github
        plan = build_launch_plan(state.idea, state.prd, state.execution, state.pitch)
        # The dry-run result is not needed for the summary; the call is kept so the preview
        # path is still exercised.
        launch_github(plan, dry_run=True)
        print(f"\nGitHub Launch : repo '{plan.repo_name}' — {len(plan.issues)} issues, "
              f"{len(plan.milestones)} milestones (preview; set APS_GITHUB_PAT to create)")

    print(f"\nartifacts saved to: {path}")

    ok = state.status.value == "complete" and len(produced) == 5
    print("\n" + ("PASS — full vertical reproduced end-to-end." if ok else "INCOMPLETE"))
    return 0 if ok else 1


if __name__ == "__main__":
    sys.exit(main())
69 changes: 69 additions & 0 deletions scripts/eval_g01_live.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Live single-idea eval (gold g01) for the real MEMO numbers.

Runs the full orchestrator once (research fan-out + downstream agents) against a live
model, scores it with the eval scorers, writes tests/evals/report.md, and prints the
numbers to paste into MEMO.md. One idea on purpose — the full 8-idea gold set runs offline
in CI (test_eval_runner.py); running all 8 live would burn ~240 model calls.

python scripts/eval_g01_live.py
"""
from __future__ import annotations

import json
import os
import sys
from pathlib import Path

# Put the repo's `src/` directory on sys.path so the `aps` imports resolve when the script
# is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
# Also expose `tests/evals/` so `import run_eval` inside main() finds tests/evals/run_eval.py.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "tests", "evals"))
# Best-effort: ask stdout to encode as UTF-8 and replace unencodable characters; any
# failure to reconfigure is deliberately ignored.
try:
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
except Exception:
    pass


def _tool_counts() -> dict[str, float]:
    """Sum the non-zero ``*_total`` samples of ``TOOL_CALLS`` per ``tool`` label.

    Returns an empty dict when ``TOOL_CALLS`` exposes no ``collect`` attribute.
    """
    from aps.infra.metrics import TOOL_CALLS

    totals: dict[str, float] = {}
    collector = getattr(TOOL_CALLS, "collect", None)
    if collector:
        for family in collector():
            for sample in family.samples:
                # Only counter totals with a non-zero value contribute.
                if not (sample.name.endswith("_total") and sample.value):
                    continue
                tool = sample.labels.get("tool")
                totals[tool] = totals.get(tool, 0.0) + sample.value
    return totals


def main() -> int:
    """Run the single-idea (g01) eval, write the Markdown report, and print the numbers.

    ``--model NAME`` or ``--model=NAME`` on the command line overrides the NIM model by
    setting ``APS_NIM_MODEL`` before any settings are imported. The space-separated form
    takes precedence when both are present.

    Returns:
        Always 0.
    """
    # `--model NAME` overrides the NIM model for a verification run; set BEFORE importing
    # settings/run_eval (get_settings is lru_cached at import).
    argv = sys.argv[1:]
    if "--model" in argv:
        i = argv.index("--model")
        if i + 1 < len(argv):
            os.environ["APS_NIM_MODEL"] = argv[i + 1]
    else:
        # Also honour `--model=NAME`, which would otherwise be silently ignored and leave
        # the live eval running against the default model.
        inline = next((a.split("=", 1)[1] for a in argv if a.startswith("--model=")), "")
        if inline:
            os.environ["APS_NIM_MODEL"] = inline

    from aps.config.settings import describe_runtime
    print(f"runtime: {describe_runtime()}")

    import run_eval  # tests/evals/run_eval.py

    g01 = [{"id": "g01", "idea": "Build an AI SaaS for resume screening",
            "expect_sources": ["github", "hackernews", "reddit"], "min_evidence": 5}]
    rows = run_eval.evaluate(g01)
    # The report lands in tests/evals/ under the repo root (one level above scripts/).
    report = Path(__file__).resolve().parents[1] / "tests" / "evals" / "report.md"
    report.write_text(run_eval.to_markdown(rows), encoding="utf-8")

    tools = _tool_counts()
    r = rows[0]
    print("=== g01 LIVE eval ===")
    print(json.dumps(r, indent=2))
    print("distinct tools called :", len(tools))
    print("total tool calls :", int(sum(tools.values())))
    print("report.md written :", report)
    return 0


if __name__ == "__main__":
    sys.exit(main())
85 changes: 85 additions & 0 deletions scripts/live_fanout_smoke.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""live_fanout_smoke.py — Phase-3 fan-out verification (live).

Runs the research fan-out supervisor on an idea and prints the plan, per-unit trace, the
distinct retrieval tools the parallel sub-researchers selected, total tool calls, and the
merged brief. Confirms the deliverable: >= 2 units, evidence > 0, ~15-20 tool calls.

python scripts/live_fanout_smoke.py "an AI resume builder that beats ATS filters"
"""
from __future__ import annotations

import os
import sys

# Put the repo's `src/` directory on sys.path so the `aps` imports resolve when the script
# is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
# Best-effort: ask stdout to encode as UTF-8 and replace unencodable characters, since the
# final verdict line prints non-ASCII punctuation. Any failure to reconfigure is
# deliberately ignored.
try:
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
except Exception:
    pass


def _tool_counts(namespace: str | None = None) -> dict[str, float]:
    """Sum the non-zero ``*_total`` samples of ``TOOL_CALLS`` per ``tool`` label.

    When ``namespace`` is given (and non-empty), only samples whose ``namespace`` label
    equals it are counted. Returns an empty dict when ``TOOL_CALLS`` exposes no ``collect``
    attribute.
    """
    from aps.infra.metrics import TOOL_CALLS

    collector = getattr(TOOL_CALLS, "collect", None)
    totals: dict[str, float] = {}
    if not collector:
        return totals

    for family in collector():
        for sample in family.samples:
            # Only counter totals with a non-zero value contribute.
            if not sample.name.endswith("_total") or not sample.value:
                continue
            labels = sample.labels
            if namespace and labels.get("namespace") != namespace:
                continue
            tool = labels.get("tool")
            totals[tool] = totals.get(tool, 0.0) + sample.value
    return totals


def main() -> int:
    """Run the research fan-out on one idea and print the plan, unit trace, and result.

    The idea is the first command-line argument, or a built-in default when absent.

    Returns:
        0 when at least two research units were started and the merged brief has evidence;
        1 otherwise.
    """
    default_idea = "an AI resume builder that beats ATS filters"
    idea = sys.argv[1] if len(sys.argv) > 1 else default_idea

    from aps.config.settings import get_settings
    settings = get_settings()
    if settings.model_provider == "nim":
        model = settings.nim_model
    else:
        model = settings.gemini_model
    print(f"provider={settings.model_provider} model={model} "
          f"max_concurrent={settings.max_concurrent_researchers}")

    events: list = []

    def on_event(t: str, d: dict) -> None:
        # Record every event, then echo the ones worth showing on the console.
        events.append((t, d))
        if t == "research_plan":
            print("PLAN:")
            for subtopic in d["subtopics"]:
                print(f" - {subtopic}")
            return
        if t == "research_unit_start":
            print(f" unit START : {d['focus'][:60]}")
            return
        if t == "research_unit_end":
            print(f" unit END : {d['focus'][:55]} -> {d['evidence']} evidence")
            return
        if t == "error":
            print(f" ERROR : {d.get('error', '')[:90]}")

    from aps.agents.research.supervisor import run_research_fanout
    print(f"\n>>> fan-out research on: {idea!r}\n")
    brief = run_research_fanout(idea, on_event=on_event)

    retrieval = _tool_counts("retrieval")
    # One `research_unit_start` event is recorded per spawned sub-researcher.
    unit_count = sum(1 for kind, _ in events if kind == "research_unit_start")
    print("\n--- RESULT ---")
    print("units spawned :", unit_count)
    print("distinct retrieval :", retrieval)
    print("total retrieval calls :", int(sum(retrieval.values())))
    print("evidence (merged) :", len(brief.evidence))
    print("competitors :", len(brief.competitors))
    print("pain_points :", len(brief.pain_points))
    print("market_size :", (brief.market_size or "")[:80])

    ok = unit_count >= 2 and len(brief.evidence) > 0
    if ok:
        verdict = "PASS — fan-out produced a real merged brief; safe to ship Phase 3."
    else:
        verdict = "FAIL — see errors above."
    print("\n" + verdict)
    return 0 if ok else 1


if __name__ == "__main__":
    sys.exit(main())
48 changes: 48 additions & 0 deletions scripts/live_github_launch_smoke.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Live GitHub Launch smoke — creates a REAL repo from a run (needs APS_GITHUB_PAT, repo scope).

APS_GITHUB_PAT=ghp_xxx python scripts/live_github_launch_smoke.py "your idea"

Runs the full vertical, then launches the execution package to GitHub for real and prints
the repo URL + created issues. This is NOT run in CI (it makes live calls and creates a repo).
"""
from __future__ import annotations

import os
import sys

# Put the repo's `src/` directory on sys.path so the `aps` imports resolve when the script
# is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))


def main() -> int:
    """Run the full vertical on one idea and launch the result to GitHub for real.

    The idea is the first command-line argument, or a built-in default when absent.
    Requires ``APS_GITHUB_PAT``; without it nothing is run.

    Returns:
        0 when ``launch_github`` reports the repo as created; 1 when the PAT is missing,
        the run produced no PRD, or the launch did not create the repo.
    """
    idea = sys.argv[1] if len(sys.argv) > 1 else "A privacy-first habit tracker for couples"
    # Importing aps.config loads .env into os.environ (pydantic-settings side effect), so the PAT
    # check below sees a key set in .env — not only one exported in the shell.
    import aps.config.settings  # noqa: F401
    if not os.getenv("APS_GITHUB_PAT"):
        print("FAIL: set APS_GITHUB_PAT (a repo-scoped PAT) to create the repo for real.")
        return 1

    from aps.orchestrator.events import EventBus
    from aps.orchestrator.graph import run_sync
    from aps.launch import build_launch_plan, launch_github

    bus = EventBus()
    state = run_sync(idea, bus, run_id="launch_smoke")
    # This script makes live, non-dry-run calls: refuse to build a launch plan from a run
    # that produced no PRD rather than failing (or publishing) half-way through.
    if not state.prd:
        print("FAIL: the run produced no PRD, so there is nothing to launch.")
        return 1

    plan = build_launch_plan(state.idea, state.prd, state.execution, state.pitch)
    print(f">>> launching repo '{plan.repo_name}' "
          f"({len(plan.issues)} issues, {len(plan.milestones)} milestones)...")

    result = launch_github(plan, dry_run=False)
    print(result.message)
    if result.created:
        print("repo:", result.repo_url)
        # Show at most the first five created issues.
        for u in result.issue_urls[:5]:
            print(" issue:", u)
        print("\nPASS — real GitHub repo created.")
        return 0
    print("\nFAIL — see message above.")
    return 1


if __name__ == "__main__":
    sys.exit(main())
Loading
Loading