From 933285b9c46c9d059396dc12315572a4b645b281 Mon Sep 17 00:00:00 2001 From: rajatnagda45 Date: Sun, 14 Jun 2026 17:38:14 +0530 Subject: [PATCH 1/2] test: unit, integration & eval suites with developer scripts and CI Brings in the full pytest suite (unit, integration, and eval coverage), the live smoke plus research and load-test developer scripts, and the GitHub Actions CI workflow. --- .github/workflows/ci.yml | 35 +++ scripts/demo_run.py | 128 +++++++++ scripts/eval_g01_live.py | 69 +++++ scripts/live_fanout_smoke.py | 85 ++++++ scripts/live_github_launch_smoke.py | 48 ++++ scripts/live_providers_smoke.py | 59 ++++ scripts/live_research_smoke.py | 109 ++++++++ scripts/loadtest.py | 66 +++++ scripts/run_research.py | 52 ++++ scripts/smoke_test.py | 54 ++++ tests/__init__.py | 0 tests/conftest.py | 54 ++++ tests/evals/fixtures/offtopic.json | 18 ++ tests/evals/fixtures/sample_run.json | 22 ++ tests/evals/gold/gold.json | 10 + tests/evals/run_eval.py | 86 ++++++ tests/evals/scorers.py | 145 ++++++++++ tests/integration/test_api.py | 131 +++++++++ tests/integration/test_api_wiring.py | 104 +++++++ tests/integration/test_composition_chain.py | 89 ++++++ tests/integration/test_eval_runner.py | 35 +++ tests/integration/test_noisy_idea_quality.py | 72 +++++ tests/integration/test_orchestrator.py | 89 ++++++ tests/integration/test_v1_real_data.py | 164 +++++++++++ tests/unit/__init__.py | 0 tests/unit/test_agent_tools.py | 150 ++++++++++ tests/unit/test_agents.py | 62 +++++ tests/unit/test_analysis_quality.py | 136 ++++++++++ tests/unit/test_analysis_tools.py | 113 ++++++++ tests/unit/test_api_v1.py | 272 +++++++++++++++++++ tests/unit/test_architecture_mermaid.py | 65 +++++ tests/unit/test_artifact_quality.py | 49 ++++ tests/unit/test_artifact_store.py | 43 +++ tests/unit/test_availability_agent.py | 44 +++ tests/unit/test_availability_graph.py | 55 ++++ tests/unit/test_availability_tools.py | 66 +++++ tests/unit/test_brand_agent.py | 45 +++ 
tests/unit/test_brand_graph.py | 50 ++++ tests/unit/test_brand_tools.py | 88 ++++++ tests/unit/test_breaker.py | 41 +++ tests/unit/test_competitor_filter.py | 74 +++++ tests/unit/test_compliance_agent.py | 42 +++ tests/unit/test_compliance_graph.py | 61 +++++ tests/unit/test_compliance_tools.py | 77 ++++++ tests/unit/test_data_model_entities.py | 68 +++++ tests/unit/test_debate.py | 70 +++++ tests/unit/test_diversification.py | 83 ++++++ tests/unit/test_evidence_relevance.py | 99 +++++++ tests/unit/test_explain.py | 70 +++++ tests/unit/test_failover.py | 152 +++++++++++ tests/unit/test_feature_naming.py | 135 +++++++++ tests/unit/test_feature_synthesis.py | 58 ++++ tests/unit/test_firebase_auth.py | 64 +++++ tests/unit/test_funding_agent.py | 50 ++++ tests/unit/test_funding_graph.py | 48 ++++ tests/unit/test_funding_tools.py | 64 +++++ tests/unit/test_github_issues.py | 15 + tests/unit/test_github_launch.py | 120 ++++++++ tests/unit/test_health_lane.py | 19 ++ tests/unit/test_http.py | 100 +++++++ tests/unit/test_infra.py | 82 ++++++ tests/unit/test_legal_agent.py | 44 +++ tests/unit/test_legal_graph.py | 52 ++++ tests/unit/test_legal_tools.py | 77 ++++++ tests/unit/test_llm_ratelimit.py | 43 +++ tests/unit/test_pain_noise_filter.py | 209 ++++++++++++++ tests/unit/test_phase_a.py | 65 +++++ tests/unit/test_provider_polish.py | 110 ++++++++ tests/unit/test_provider_resolution.py | 91 +++++++ tests/unit/test_providers.py | 112 ++++++++ tests/unit/test_query_planning.py | 94 +++++++ tests/unit/test_registry.py | 65 +++++ tests/unit/test_relevance_eval.py | 79 ++++++ tests/unit/test_relevance_judge.py | 77 ++++++ tests/unit/test_render.py | 177 ++++++++++++ tests/unit/test_research_loop.py | 100 +++++++ tests/unit/test_research_mode.py | 22 ++ tests/unit/test_retrieval_tools.py | 48 ++++ tests/unit/test_run_control.py | 87 ++++++ tests/unit/test_scorers.py | 72 +++++ tests/unit/test_scoring_grounding.py | 84 ++++++ tests/unit/test_self_hosted.py | 57 ++++ 
tests/unit/test_startup_score.py | 86 ++++++ tests/unit/test_story_and_stack_quality.py | 64 +++++ tests/unit/test_thin_prd.py | 50 ++++ tests/unit/test_tool_cache.py | 81 ++++++ tests/unit/test_tool_fallback.py | 46 ++++ tests/unit/test_tool_trace.py | 27 ++ 88 files changed, 6673 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 scripts/demo_run.py create mode 100644 scripts/eval_g01_live.py create mode 100644 scripts/live_fanout_smoke.py create mode 100644 scripts/live_github_launch_smoke.py create mode 100644 scripts/live_providers_smoke.py create mode 100644 scripts/live_research_smoke.py create mode 100644 scripts/loadtest.py create mode 100644 scripts/run_research.py create mode 100644 scripts/smoke_test.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/evals/fixtures/offtopic.json create mode 100644 tests/evals/fixtures/sample_run.json create mode 100644 tests/evals/gold/gold.json create mode 100644 tests/evals/run_eval.py create mode 100644 tests/evals/scorers.py create mode 100644 tests/integration/test_api.py create mode 100644 tests/integration/test_api_wiring.py create mode 100644 tests/integration/test_composition_chain.py create mode 100644 tests/integration/test_eval_runner.py create mode 100644 tests/integration/test_noisy_idea_quality.py create mode 100644 tests/integration/test_orchestrator.py create mode 100644 tests/integration/test_v1_real_data.py create mode 100644 tests/unit/__init__.py create mode 100644 tests/unit/test_agent_tools.py create mode 100644 tests/unit/test_agents.py create mode 100644 tests/unit/test_analysis_quality.py create mode 100644 tests/unit/test_analysis_tools.py create mode 100644 tests/unit/test_api_v1.py create mode 100644 tests/unit/test_architecture_mermaid.py create mode 100644 tests/unit/test_artifact_quality.py create mode 100644 tests/unit/test_artifact_store.py create mode 100644 tests/unit/test_availability_agent.py create mode 
100644 tests/unit/test_availability_graph.py create mode 100644 tests/unit/test_availability_tools.py create mode 100644 tests/unit/test_brand_agent.py create mode 100644 tests/unit/test_brand_graph.py create mode 100644 tests/unit/test_brand_tools.py create mode 100644 tests/unit/test_breaker.py create mode 100644 tests/unit/test_competitor_filter.py create mode 100644 tests/unit/test_compliance_agent.py create mode 100644 tests/unit/test_compliance_graph.py create mode 100644 tests/unit/test_compliance_tools.py create mode 100644 tests/unit/test_data_model_entities.py create mode 100644 tests/unit/test_debate.py create mode 100644 tests/unit/test_diversification.py create mode 100644 tests/unit/test_evidence_relevance.py create mode 100644 tests/unit/test_explain.py create mode 100644 tests/unit/test_failover.py create mode 100644 tests/unit/test_feature_naming.py create mode 100644 tests/unit/test_feature_synthesis.py create mode 100644 tests/unit/test_firebase_auth.py create mode 100644 tests/unit/test_funding_agent.py create mode 100644 tests/unit/test_funding_graph.py create mode 100644 tests/unit/test_funding_tools.py create mode 100644 tests/unit/test_github_issues.py create mode 100644 tests/unit/test_github_launch.py create mode 100644 tests/unit/test_health_lane.py create mode 100644 tests/unit/test_http.py create mode 100644 tests/unit/test_infra.py create mode 100644 tests/unit/test_legal_agent.py create mode 100644 tests/unit/test_legal_graph.py create mode 100644 tests/unit/test_legal_tools.py create mode 100644 tests/unit/test_llm_ratelimit.py create mode 100644 tests/unit/test_pain_noise_filter.py create mode 100644 tests/unit/test_phase_a.py create mode 100644 tests/unit/test_provider_polish.py create mode 100644 tests/unit/test_provider_resolution.py create mode 100644 tests/unit/test_providers.py create mode 100644 tests/unit/test_query_planning.py create mode 100644 tests/unit/test_registry.py create mode 100644 
tests/unit/test_relevance_eval.py create mode 100644 tests/unit/test_relevance_judge.py create mode 100644 tests/unit/test_render.py create mode 100644 tests/unit/test_research_loop.py create mode 100644 tests/unit/test_research_mode.py create mode 100644 tests/unit/test_retrieval_tools.py create mode 100644 tests/unit/test_run_control.py create mode 100644 tests/unit/test_scorers.py create mode 100644 tests/unit/test_scoring_grounding.py create mode 100644 tests/unit/test_self_hosted.py create mode 100644 tests/unit/test_startup_score.py create mode 100644 tests/unit/test_story_and_stack_quality.py create mode 100644 tests/unit/test_thin_prd.py create mode 100644 tests/unit/test_tool_cache.py create mode 100644 tests/unit/test_tool_fallback.py create mode 100644 tests/unit/test_tool_trace.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..cdc6fd1 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,35 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install deps + # Real orchestration stack (LangGraph + langchain-core + FastAPI) plus the light + # test deps. No LLM provider package / key is needed: the Research node degrades + # to a fixture offline. `pythonpath=src` in pyproject lets `import aps` work + # without an editable install. See memory.md §2. 
+ run: | + python -m pip install --upgrade pip + pip install pydantic pydantic-settings requests structlog tenacity \ + langgraph langchain-core fastapi httpx cachetools \ + pytest pytest-asyncio ruff + + - name: Lint (ruff) + run: ruff check src tests + + - name: Test (pytest) + run: python -m pytest diff --git a/scripts/demo_run.py b/scripts/demo_run.py new file mode 100644 index 0000000..72a7012 --- /dev/null +++ b/scripts/demo_run.py @@ -0,0 +1,128 @@ +"""demo_run.py — clean full-vertical demo on any idea (Phase 6 defense / repro entry point). + +Runs Idea -> Research(fan-out) -> Product -> Architecture -> Execution -> Presentation, +persists every artifact to the file store (.artifacts/{run_id}/), and prints a human +summary. With an LLM key + free source keys it runs fully live; with no keys it degrades to +the fixture brief and still completes end-to-end (so a judge can reproduce either way). + + python scripts/demo_run.py "a privacy-first personal finance tracker for couples" +""" +from __future__ import annotations + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) +try: + sys.stdout.reconfigure(encoding="utf-8", errors="replace") +except Exception: + pass + + +def _parse_args(argv: list[str]) -> tuple[str, str | None]: + """Return (idea, model). `--model NAME` overrides the NIM model for verification runs + (e.g. qwen3.5-122b-a10b / glm-5.1); the positional arg is the idea.""" + idea, model, rest = None, None, [] + it = iter(argv) + for a in it: + if a == "--model": + model = next(it, None) + elif a.startswith("--model="): + model = a.split("=", 1)[1] + else: + rest.append(a) + if rest: + idea = rest[0] + return (idea or "a privacy-first personal finance tracker for couples", model) + + +def main() -> int: + idea, model = _parse_args(sys.argv[1:]) + # Must set the model env BEFORE importing settings (get_settings is lru_cached at import).
+ if model: + os.environ["APS_NIM_MODEL"] = model + + from aps.orchestrator.events import EventBus + from aps.orchestrator.graph import run_sync + from aps.infra import artifact_store + from aps.config.settings import describe_runtime + + run_id = "demo" + print(f"{describe_runtime()} fanout={os.getenv('APS_RESEARCH_FANOUT', 'true')}") + print(f">>> {idea!r}\n") + + bus = EventBus() + state = run_sync(idea, bus, run_id=run_id) + path = artifact_store.save_run(run_id, state) + + ev_types = [e.type for e in bus.history(run_id)] + produced = [a for a in ("research", "prd", "trd", "execution", "pitch") + if getattr(state, a) is not None] + + # W6: drop a human-readable Markdown render of each artifact beside its JSON, so a judge + # running the demo gets readable documents (the pipeline still persists JSON only). + from aps.render import render_artifact + for name in produced: + (path / f"{name}.md").write_text( + render_artifact(name, getattr(state, name)), encoding="utf-8") + # T2.2: drop the TRD's Mermaid architecture diagrams alongside the JSON/MD + if state.trd is not None: + from aps.render import architecture_mmd + (path / "trd.mermaid.md").write_text( + architecture_mmd.render(state.trd), encoding="utf-8") + r, prd, trd, ex = state.research, state.prd, state.trd, state.execution + + print(f"status : {state.status.value}") + print(f"artifacts : {', '.join(produced)}") + print(f"events : {len(ev_types)} (fan-out: " + f"{ev_types.count('research_unit_start')} sub-researchers)") + if r: + print(f"research : {len(r.evidence)} evidence, {len(r.competitors)} competitors, " + f"{len(r.pain_points)} pains") + print(f"market_size : {(r.market_size or '')[:90]}") + if prd: + print(f"prd : {len(prd.personas)} personas, {len(prd.features)} features, " + f"{len(prd.requirements)} requirements, {len(prd.sources)} sources") + if trd: + print(f"trd : OpenAPI {trd.api_spec.get('openapi')}, " + f"{len(trd.api_spec.get('paths', {}))} paths, stack {trd.stack[:4]}") + if ex: + 
print(f"execution : {len(ex.backlog)} backlog items, {len(ex.sprints)} sprints") + print(f"pitch : {'yes' if state.pitch else 'no'}") + + if state.research: + from aps.scoring import score_startup + sc = score_startup(state.research, state.prd) + print(f"\nStartup Score : {sc.overall}/10 — {sc.verdict}") + for d in sc.dimensions: + print(f" {d.name:24} {d.score:>4}/10 ({d.rationale})") + + from aps.debate import run_debate + dbt = run_debate(state.research, state.prd) + print(f"\nDebate verdict: {dbt.verdict} (confidence {int(dbt.confidence * 100)}%)") + print(f" FOR : {len(dbt.build_case)} point(s) · AGAINST: {len(dbt.risk_case)} risk(s)") + + if state.prd: + from aps.explain import explain_prd + ex = explain_prd(state.prd, state.research) + print(f"\nExplain-Why : {int(ex.overall_confidence * 100)}% avg confidence " + f"across {len(ex.features)} feature(s) (every feature traced to its evidence)") + + if state.prd: + # GitHub Launch preview (dry-run — creates nothing; set APS_GITHUB_PAT + run the + # live smoke / POST /launch/github to create the repo for real). + from aps.launch import build_launch_plan, launch_github + plan = build_launch_plan(state.idea, state.prd, state.execution, state.pitch) + prev = launch_github(plan, dry_run=True) + print(f"\nGitHub Launch : repo '{plan.repo_name}' — {len(plan.issues)} issues, " + f"{len(plan.milestones)} milestones (preview; set APS_GITHUB_PAT to create)") + + print(f"\nartifacts saved to: {path}") + + ok = state.status.value == "complete" and len(produced) == 5 + print("\n" + ("PASS — full vertical reproduced end-to-end." if ok else "INCOMPLETE")) + return 0 if ok else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/eval_g01_live.py b/scripts/eval_g01_live.py new file mode 100644 index 0000000..2a936df --- /dev/null +++ b/scripts/eval_g01_live.py @@ -0,0 +1,69 @@ +"""Live single-idea eval (gold g01) for the real MEMO numbers. 
+ +Runs the full orchestrator once (research fan-out + downstream agents) against a live +model, scores it with the eval scorers, writes tests/evals/report.md, and prints the +numbers to paste into MEMO.md. One idea on purpose — the full 8-idea gold set runs offline +in CI (test_eval_runner.py); running all 8 live would burn ~240 model calls. + + python scripts/eval_g01_live.py +""" +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "tests", "evals")) +try: + sys.stdout.reconfigure(encoding="utf-8", errors="replace") +except Exception: + pass + + +def _tool_counts() -> dict[str, float]: + from aps.infra.metrics import TOOL_CALLS + out: dict[str, float] = {} + collect = getattr(TOOL_CALLS, "collect", None) + if not collect: + return out + for fam in collect(): + for s in fam.samples: + if s.name.endswith("_total") and s.value: + out[s.labels.get("tool")] = out.get(s.labels.get("tool"), 0.0) + s.value + return out + + +def main() -> int: + # `--model NAME` overrides the NIM model for a verification run; set BEFORE importing + # settings/run_eval (get_settings is lru_cached at import). 
+ argv = sys.argv[1:] + if "--model" in argv: + i = argv.index("--model") + if i + 1 < len(argv): + os.environ["APS_NIM_MODEL"] = argv[i + 1] + + from aps.config.settings import describe_runtime + print(f"runtime: {describe_runtime()}") + + import run_eval # tests/evals/run_eval.py + + g01 = [{"id": "g01", "idea": "Build an AI SaaS for resume screening", + "expect_sources": ["github", "hackernews", "reddit"], "min_evidence": 5}] + rows = run_eval.evaluate(g01) + report = Path(__file__).resolve().parents[1] / "tests" / "evals" / "report.md" + report.write_text(run_eval.to_markdown(rows), encoding="utf-8") + + tools = _tool_counts() + r = rows[0] + print("=== g01 LIVE eval ===") + print(json.dumps(r, indent=2)) + print("distinct tools called :", len(tools)) + print("total tool calls :", int(sum(tools.values()))) + print("report.md written :", report) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/live_fanout_smoke.py b/scripts/live_fanout_smoke.py new file mode 100644 index 0000000..9ee1d8c --- /dev/null +++ b/scripts/live_fanout_smoke.py @@ -0,0 +1,85 @@ +"""live_fanout_smoke.py — Phase-3 fan-out verification (live). + +Runs the research fan-out supervisor on an idea and prints the plan, per-unit trace, the +distinct retrieval tools the parallel sub-researchers selected, total tool calls, and the +merged brief. Confirms the deliverable: >= 2 units, evidence > 0, ~15-20 tool calls. 
+ + python scripts/live_fanout_smoke.py "an AI resume builder that beats ATS filters" +""" +from __future__ import annotations + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) +try: + sys.stdout.reconfigure(encoding="utf-8", errors="replace") +except Exception: + pass + + +def _tool_counts(namespace: str | None = None) -> dict[str, float]: + from aps.infra.metrics import TOOL_CALLS + out: dict[str, float] = {} + collect = getattr(TOOL_CALLS, "collect", None) + if not collect: + return out + for fam in collect(): + for s in fam.samples: + if s.name.endswith("_total") and s.value: + ns = s.labels.get("namespace") + tool = s.labels.get("tool") + if namespace and ns != namespace: + continue + out[tool] = out.get(tool, 0.0) + s.value + return out + + +def main() -> int: + idea = sys.argv[1] if len(sys.argv) > 1 else \ + "an AI resume builder that beats ATS filters" + + from aps.config.settings import get_settings + s = get_settings() + model = s.nim_model if s.model_provider == "nim" else s.gemini_model + print(f"provider={s.model_provider} model={model} " + f"max_concurrent={s.max_concurrent_researchers}") + + events: list = [] + + def on_event(t: str, d: dict) -> None: + events.append((t, d)) + if t == "research_plan": + print("PLAN:") + for st in d["subtopics"]: + print(f" - {st}") + elif t == "research_unit_start": + print(f" unit START : {d['focus'][:60]}") + elif t == "research_unit_end": + print(f" unit END : {d['focus'][:55]} -> {d['evidence']} evidence") + elif t == "error": + print(f" ERROR : {d.get('error', '')[:90]}") + + from aps.agents.research.supervisor import run_research_fanout + print(f"\n>>> fan-out research on: {idea!r}\n") + r = run_research_fanout(idea, on_event=on_event) + + retrieval = _tool_counts("retrieval") + units = [e for e in events if e[0] == "research_unit_start"] + print("\n--- RESULT ---") + print("units spawned :", len(units)) + print("distinct retrieval :", retrieval) + 
print("total retrieval calls :", int(sum(retrieval.values()))) + print("evidence (merged) :", len(r.evidence)) + print("competitors :", len(r.competitors)) + print("pain_points :", len(r.pain_points)) + print("market_size :", (r.market_size or "")[:80]) + + ok = len(units) >= 2 and len(r.evidence) > 0 + print("\n" + ("PASS — fan-out produced a real merged brief; safe to ship Phase 3." + if ok else "FAIL — see errors above.")) + return 0 if ok else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/live_github_launch_smoke.py b/scripts/live_github_launch_smoke.py new file mode 100644 index 0000000..ec9386c --- /dev/null +++ b/scripts/live_github_launch_smoke.py @@ -0,0 +1,48 @@ +"""Live GitHub Launch smoke — creates a REAL repo from a run (needs APS_GITHUB_PAT, repo scope). + + APS_GITHUB_PAT=ghp_xxx python scripts/live_github_launch_smoke.py "your idea" + +Runs the full vertical, then launches the execution package to GitHub for real and prints +the repo URL + created issues. This is NOT run in CI (it makes live calls and creates a repo). +""" +from __future__ import annotations + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + + +def main() -> int: + idea = sys.argv[1] if len(sys.argv) > 1 else "A privacy-first habit tracker for couples" + # Importing aps.config loads .env into os.environ (pydantic-settings side effect), so the PAT + # check below sees a key set in .env — not only one exported in the shell. 
+ import aps.config.settings # noqa: F401 + if not os.getenv("APS_GITHUB_PAT"): + print("FAIL: set APS_GITHUB_PAT (a repo-scoped PAT) to create the repo for real.") + return 1 + + from aps.orchestrator.events import EventBus + from aps.orchestrator.graph import run_sync + from aps.launch import build_launch_plan, launch_github + + bus = EventBus() + state = run_sync(idea, bus, run_id="launch_smoke") + plan = build_launch_plan(state.idea, state.prd, state.execution, state.pitch) + print(f">>> launching repo '{plan.repo_name}' " + f"({len(plan.issues)} issues, {len(plan.milestones)} milestones)...") + + result = launch_github(plan, dry_run=False) + print(result.message) + if result.created: + print("repo:", result.repo_url) + for u in result.issue_urls[:5]: + print(" issue:", u) + print("\nPASS — real GitHub repo created.") + return 0 + print("\nFAIL — see message above.") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/live_providers_smoke.py b/scripts/live_providers_smoke.py new file mode 100644 index 0000000..ce99e79 --- /dev/null +++ b/scripts/live_providers_smoke.py @@ -0,0 +1,59 @@ +"""Live multi-provider smoke — verify tool-calling on each provider you have a key for. + + APS_PROVIDER_CHAIN=groq,gemini,nim GROQ_API_KEY=... GEMINI_API_KEY=... \ + python scripts/live_providers_smoke.py "a privacy-first habit tracker" + +For every available provider it runs ONE real research turn (in isolation, that provider +only) and reports whether the model selected tools and gathered evidence — a provider × +tool-calling support matrix. Makes live network calls; NOT run in CI. 
+""" +from __future__ import annotations + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) +try: + sys.stdout.reconfigure(encoding="utf-8", errors="replace") +except Exception: + pass + + +def main() -> int: + idea = sys.argv[1] if len(sys.argv) > 1 else "a privacy-first habit tracker for couples" + + from aps.config.providers import REGISTRY, provider_available + from aps.agents.research.agent import gather_evidence + + available = [n for n in REGISTRY if provider_available(n)] + if not available: + print("No provider keys found. Set e.g. GROQ_API_KEY / GEMINI_API_KEY / NVIDIA_API_KEY " + "(see .env.example) and re-run.") + return 1 + + print(f">>> idea: {idea!r}") + print(f">>> testing {len(available)} provider(s): {', '.join(available)}\n") + print(f"{'provider':<14}{'tools':<8}{'evidence':<10}{'calls':<7}note") + print("-" * 60) + + results = {} + for name in available: + # isolate this provider: a single-provider chain so the loop talks ONLY to it + os.environ["APS_PROVIDER_CHAIN"] = name + try: + ev, n = gather_evidence(idea) + ok = n > 0 and len(ev) > 0 + results[name] = ok + print(f"{name:<14}{('YES' if n > 0 else 'no'):<8}{len(ev):<10}{n:<7}" + f"{'' if ok else 'no tool-calls/evidence — verify model supports tools'}") + except Exception as e: # noqa: BLE001 + results[name] = False + print(f"{name:<14}{'ERR':<8}{'-':<10}{'-':<7}{type(e).__name__}: {str(e)[:60]}") + + passed = sum(1 for v in results.values() if v) + print(f"\n{passed}/{len(available)} provider(s) selected tools and gathered evidence.") + return 0 if passed else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/live_research_smoke.py b/scripts/live_research_smoke.py new file mode 100644 index 0000000..f617209 --- /dev/null +++ b/scripts/live_research_smoke.py @@ -0,0 +1,109 @@ +"""live_research_smoke.py — foundation check for the p1/orchestrator-fanout branch. 
+ +Runs the REAL research tool-loop against a LIVE model (no stubs) and asserts the +foundation that Send fan-out will sit on top of: + + 1. the model selects >= 2 DISTINCT retrieval tools (model-driven selection, Req-1), + 2. the loop terminates cleanly and returns a typed ResearchReturn, + 3. real evidence was collected. + +Why retrieval-tool count is the right signal: the compression step only ever calls +ANALYSIS tools deterministically, so any RETRIEVAL call must have come from the model +choosing it. Distinct retrieval tools > 1 ⇒ the model is genuinely selecting. + +This is meaningful even with NO source API keys: the no-key tools (HN, arXiv, Wikipedia, +PyPI, npm, Stack Exchange, jobs) return real data — you only need the LLM key. + +Recommended dev model: NIM `nvidia/nvidia-nemotron-nano-9b-v2` (free, agentic, cheap). + +Usage: + # .env: APS_MODEL_PROVIDER=nim NVIDIA_API_KEY=nvapi-... + python scripts/live_research_smoke.py "an AI resume builder that beats ATS filters" + +Exit code 0 = PASS (safe to build fan-out), 1 = FAIL (fix on the linear base first). 
+""" +from __future__ import annotations + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + + +def _tool_counts(namespace: str | None = None) -> dict[str, float]: + """Distinct tools with >0 calls this process, from the Prometheus counter.""" + from aps.infra.metrics import TOOL_CALLS + out: dict[str, float] = {} + collect = getattr(TOOL_CALLS, "collect", None) + if collect is None: # prometheus_client absent -> metrics are no-ops + return out + for fam in collect(): + for s in fam.samples: + if not s.name.endswith("_total"): + continue + ns = s.labels.get("namespace") + tool = s.labels.get("tool") + if namespace and ns != namespace: + continue + if s.value and s.value > 0: + out[tool] = out.get(tool, 0.0) + s.value + return out + + +def main() -> int: + idea = sys.argv[1] if len(sys.argv) > 1 else \ + "an AI resume builder that beats ATS filters" + + from aps.config.settings import get_settings + s = get_settings() + model = s.nim_model if s.model_provider == "nim" else s.gemini_model + print(f"provider = {s.model_provider}") + print(f"model = {model}") + print(f"tool-call cap/agent = {s.max_tool_calls_per_agent}") + + # fail fast on a missing key rather than a confusing 401 mid-loop + if s.model_provider == "nim" and not os.getenv("NVIDIA_API_KEY"): + print("\nFAIL: APS_MODEL_PROVIDER=nim but NVIDIA_API_KEY is not set.") + return 1 + if s.model_provider == "gemini" and not (os.getenv("GEMINI_API_KEY") + or os.getenv("GOOGLE_API_KEY")): + print("\nFAIL: APS_MODEL_PROVIDER=gemini but GEMINI_API_KEY/GOOGLE_API_KEY not set.") + return 1 + + from aps.agents.research.agent import run_research + print(f"\nrunning live research loop on: {idea!r}\n") + try: + research = run_research(idea) + except Exception as e: # the loop should never raise; if it does, that's the finding + print(f"FAIL: research loop raised {type(e).__name__}: {e}") + return 1 + + retrieval = _tool_counts("retrieval") + analysis = 
_tool_counts("analysis") + print("model-selected retrieval tools :", retrieval or "(none)") + print("analysis tools fired (compress):", analysis or "(none)") + print("evidence collected :", len(research.evidence)) + print("pain points :", len(research.pain_points)) + print("competitors :", len(research.competitors)) + print("market_size :", (research.market_size or "")[:80]) + + ok = True + if len(retrieval) < 2: + print("\nFAIL: model selected <2 distinct retrieval tools — selection unproven.") + print(" check: tools bound, descriptions specific, temperature not too low.") + ok = False + if not research.evidence: + print("\nFAIL: no evidence collected.") + ok = False + + print() + if ok: + print("PASS — linear research loop works against a live model. " + "Safe to build Send fan-out on this engine.") + return 0 + print("FAIL — fix the linear loop before layering fan-out on it.") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/loadtest.py b/scripts/loadtest.py new file mode 100644 index 0000000..341a710 --- /dev/null +++ b/scripts/loadtest.py @@ -0,0 +1,66 @@ +"""Concurrency load test (plan §4) — prove the bounded queue holds under burst. + +Fires N concurrent POST /runs at a running API and reports admission latency p50/p95, the +status spread (202 admitted vs 503 back-pressure), and the live queue depth from /stats. This +is the "before claiming multi-user reliability" check the execution plan calls for — the +in-process analog of k6/Locust, with zero extra deps (uses the `requests` already in the env). + +Usage (start the API first): + uvicorn aps.api.main:app + python scripts/loadtest.py --n 10 --url http://127.0.0.1:8000 --key dev-key + +It does NOT wait for runs to finish — it measures the admission path (queue + worker pool), +which is what determines whether a flood stays fair and bounded. 
+""" +from __future__ import annotations + +import argparse +import statistics +import time +from concurrent.futures import ThreadPoolExecutor + +import requests + + +def _one(url: str, key: str, i: int) -> tuple[int, float]: + t0 = time.perf_counter() + r = requests.post(f"{url}/runs", headers={"X-APS-Key": key}, + json={"idea": f"load-test idea #{i}"}, timeout=30) + return r.status_code, (time.perf_counter() - t0) * 1000 + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--n", type=int, default=10, help="concurrent POST /runs") + ap.add_argument("--url", default="http://127.0.0.1:8000") + ap.add_argument("--key", default="dev-key") + args = ap.parse_args() + + print(f"firing {args.n} concurrent POST {args.url}/runs ...") + with ThreadPoolExecutor(max_workers=args.n) as pool: + results = list(pool.map(lambda i: _one(args.url, args.key, i), range(args.n))) + + codes = [c for c, _ in results] + lat = sorted(ms for _, ms in results) + admitted = sum(1 for c in codes if c == 202) + throttled = sum(1 for c in codes if c == 503) + p50 = statistics.median(lat) + p95 = lat[min(len(lat) - 1, int(len(lat) * 0.95))] + + print(f" admitted (202): {admitted}") + print(f" back-pressure (503): {throttled}") + print(f" other codes: {[c for c in codes if c not in (202, 503)]}") + print(f" admission p50/p95: {p50:.1f} ms / {p95:.1f} ms") + + try: + s = requests.get(f"{args.url}/stats", headers={"X-APS-Key": args.key}, timeout=10).json() + print(f" queue_depth: {s.get('queue_depth')} " + f"(cap {s.get('max_concurrent_runs')} concurrent)") + print(f" by_status: {s.get('by_status')}") + print(f" tool_cache: {s.get('tool_cache')}") + except Exception as e: + print(f" (could not read /stats: {e})") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_research.py b/scripts/run_research.py new file mode 100644 index 0000000..0cc1ab5 --- /dev/null +++ b/scripts/run_research.py @@ -0,0 +1,52 @@ +"""run_research.py — Phase-2 deliverable: run the Research 
def main() -> int:
    """Run the research agent for one idea and print the resulting brief.

    The idea is taken from the first command-line argument, falling back to a
    built-in example. Prints a short human-readable summary (at most 8 competitors,
    8 pain points, 12 evidence items) followed by the first 1200 characters of the
    brief as JSON. Always returns 0.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        idea = cli_args[0]
    else:
        idea = "a self-hosted note-taking app for developers"

    # Imported here, after the module has put src/ on sys.path.
    from aps.agents.research.agent import run_research
    brief = run_research(idea)

    print("\n================ RESEARCH BRIEF ================")
    print(f"idea : {brief.idea}")
    print(f"market_size : {brief.market_size}")

    print(f"competitors : {len(brief.competitors)}")
    for competitor in brief.competitors[:8]:
        if competitor.pricing:
            price = f" — {competitor.pricing}"
        else:
            price = ""
        print(f" • {competitor.name}{price} ({len(competitor.features)} features)")

    print(f"pain_points : {len(brief.pain_points)}")
    for pain in brief.pain_points[:8]:
        print(f" • [{pain.severity.value}] {pain.text[:100]}")

    print(f"evidence : {len(brief.evidence)}")
    for item in brief.evidence[:12]:
        title = (item.title or "")[:55]
        print(f" [{item.source}] {title}")
        print(f" {item.snippet[:110]}")

    print("\n================ TYPED JSON (first 1200 chars) ================")
    as_json = json.dumps(brief.model_dump(), default=str, indent=2)
    print(as_json[:1200])
    return 0
def _round_trip(banner: str, factory, prompt: str) -> bool:
    """Build a model with *factory*, send it *prompt*, and print the reply.

    Args:
        banner: Progress line printed before the call.
        factory: Zero-argument callable returning a model exposing ``.invoke``.
        prompt: Text wrapped in a single ``HumanMessage``.

    Returns:
        True if the call succeeded; False if anything raised (the error is
        printed to stderr so the caller can exit non-zero).
    """
    print(banner)
    try:
        model = factory()
        reply = model.invoke([HumanMessage(prompt)])
        text = reply.content if hasattr(reply, "content") else str(reply)
        print(f" response: {text!r}")
    except Exception as exc:  # smoke test boundary: report any failure, never crash
        print(f" FAILED: {exc}", file=sys.stderr)
        return False
    return True


def main() -> int:
    """Round-trip the main and compression models; return 0 on success, 1 on failure."""
    s = get_settings()
    print(f"provider : {s.model_provider}")
    print(f"model : {s.gemini_model if s.model_provider == 'gemini' else s.nim_model}")

    # Same order as before: stop at the first model that fails.
    steps = (
        ("\n[1/2] invoking main model …", get_chat_model,
         "Reply with exactly one word: ready"),
        ("[2/2] invoking compression model …", get_compression_model,
         "Reply with exactly one word: compressed"),
    )
    for banner, factory, prompt in steps:
        if not _round_trip(banner, factory, prompt):
            return 1

    print("\nPhase-0 smoke test PASSED — model factory is wired.")
    return 0
+ +Everything here is offline and deterministic: retrieval tools fall back to fixtures +(no API keys), analysis/agent tools are pure functions, agents are deterministic +pipelines. The suite must stay green on py3.10 with only pydantic + pytest installed. +""" +from __future__ import annotations + +import os + +import pytest + +# Force fixture fallback so any retrieval tool that *is* exercised never makes a live call. +os.environ.setdefault("APS_ALLOW_FIXTURE_FALLBACK", "true") + +from aps.state.models import ( + Evidence, Competitor, PainPoint, Severity, ResearchReturn, +) + + +@pytest.fixture +def rich_research() -> ResearchReturn: + """A realistic ResearchReturn so downstream agents have real data to chew on.""" + ev = [ + Evidence(source="github", url="https://github.com/acme/ats/issues/1", + title="Parser drops PDF resumes", + snippet="The parser is broken and keeps dropping valid PDF resumes."), + Evidence(source="reddit", url="https://reddit.com/r/recruiting/abc", + title="ATS keyword matching is dumb", + snippet="Keyword matching is confusing and misses qualified candidates."), + Evidence(source="web", url="https://acme.io/pricing", + title="Acme pricing", + snippet="Acme supports PDF export and integrates with Slack. 
Pricing $49/mo."), + Evidence(source="web", url="https://marketreport.example.com/ats", + title="ATS market", + snippet="The ATS market is worth $3 billion and growing fast."), + ] + return ResearchReturn( + idea="Build an AI SaaS for resume screening", + market_size="~$3B ATS market, growing", + competitors=[ + Competitor(name="Acme", url="https://acme.io", + features=["PDF export", "Slack integration"], pricing="$49/mo"), + Competitor(name="ScreenAI", features=["keyword match", "ranking"]), + ], + pain_points=[ + PainPoint(text="Parser drops valid PDF resumes", severity=Severity.HIGH, + source_evidence=[ev[0]]), + PainPoint(text="Keyword matching misses qualified candidates", + severity=Severity.MED, source_evidence=[ev[1]]), + PainPoint(text="Pricing is too high for small teams", severity=Severity.LOW), + ], + evidence=ev, + ) diff --git a/tests/evals/fixtures/offtopic.json b/tests/evals/fixtures/offtopic.json new file mode 100644 index 0000000..7cd4bee --- /dev/null +++ b/tests/evals/fixtures/offtopic.json @@ -0,0 +1,18 @@ +{ + "idea": "Private Activity Tracker", + "junk": [ + {"source": "github", "title": "Stake bonus cannot be reached", "snippet": "the stake bonus cannot be reached after the deposit"}, + {"source": "jobs", "title": "High-Ticket Financial Sales Specialist & Team Lead Track @ FSE LLC", "snippet": "high-ticket financial sales role, commission and bonus"}, + {"source": "jobs", "title": "Senior Data Scientist @ Lemon.io", "snippet": "hiring a senior data scientist contractor, remote"}, + {"source": "jobs", "title": "Freelance Writer @ IAPWE", "snippet": "freelance writing gig, paid per article"}, + {"source": "github", "title": "API: sun position", "snippet": "endpoint returns the sun position for a given coordinate and time"}, + {"source": "github", "title": "Google Container Breaks UBlock YouTube Filters", "snippet": "youtube adblock filters break inside the google container extension"}, + {"source": "jobs", "title": "Inside Sales Contractor @ 
Credit Wellness", "snippet": "inside sales contractor, mortgage and loan leads"} + ], + "relevant": [ + {"source": "hackernews", "title": "Ask HN: privacy-respecting activity trackers?", "snippet": "looking for a private activity tracker that does not sell my data"}, + {"source": "github", "title": "TakaTime privacy-first activity tracker", "snippet": "self-hosted privacy-first coding activity tracking, local only"}, + {"source": "github", "title": "ActivityWatch does not detect idle state", "snippet": "the activity tracker fails to detect idle time on linux"}, + {"source": "hackernews", "title": "Show HN: a private-by-design activity tracker", "snippet": "automatic activity tracking on your phone, privacy by design"} + ] +} diff --git a/tests/evals/fixtures/sample_run.json b/tests/evals/fixtures/sample_run.json new file mode 100644 index 0000000..9cdf73b --- /dev/null +++ b/tests/evals/fixtures/sample_run.json @@ -0,0 +1,22 @@ +{ + "run_id": "run_mock1", + "idea": "Build an AI SaaS for resume screening", + "events": [ + {"type":"agent_start","data":{"agent":"research"}}, + {"type":"tool_call","data":{"agent":"research","tool":"github_list_issues","args":{"repo":"example/ats"}}}, + {"type":"tool_result","data":{"tool":"github_list_issues","ok":true,"evidence_count":7}}, + {"type":"tool_call","data":{"agent":"research","tool":"hn_search","args":{"query":"resume ai"}}}, + {"type":"tool_result","data":{"tool":"hn_search","ok":true,"evidence_count":12}}, + {"type":"tool_call","data":{"agent":"research","tool":"reddit_search","args":{"query":"ats pain"}}}, + {"type":"tool_result","data":{"tool":"reddit_search","ok":false,"evidence_count":0}}, + {"type":"tool_call","data":{"agent":"research","tool":"extract_pain_points","args":{}}}, + {"type":"tool_result","data":{"tool":"extract_pain_points","ok":true,"evidence_count":0}}, + {"type":"artifact_ready","data":{"name":"research"}}, + {"type":"agent_end","data":{"agent":"research"}}, + 
{"type":"agent_start","data":{"agent":"product"}}, + {"type":"tool_call","data":{"agent":"product","tool":"assemble_prd","args":{}}}, + {"type":"artifact_ready","data":{"name":"prd"}}, + {"type":"agent_end","data":{"agent":"product"}}, + {"type":"run_complete","data":{"status":"complete","tool_calls":31}} + ] +} diff --git a/tests/evals/gold/gold.json b/tests/evals/gold/gold.json new file mode 100644 index 0000000..65c2eca --- /dev/null +++ b/tests/evals/gold/gold.json @@ -0,0 +1,10 @@ +[ + {"id":"g01","idea":"Build an AI SaaS for resume screening","expect_sources":["github","hackernews","reddit"],"min_evidence":5}, + {"id":"g02","idea":"A marketplace for renting camera gear between creators","expect_sources":["reddit","web"],"min_evidence":4}, + {"id":"g03","idea":"An open-source observability tool for LangGraph agents","expect_sources":["github","hackernews","arxiv"],"min_evidence":5}, + {"id":"g04","idea":"A mobile app that turns receipts into expense reports","expect_sources":["web","reddit"],"min_evidence":4}, + {"id":"g05","idea":"A Chrome extension that summarizes long GitHub issues","expect_sources":["github","hackernews"],"min_evidence":4}, + {"id":"g06","idea":"A B2B tool for automated SOC2 evidence collection","expect_sources":["web","reddit"],"min_evidence":4}, + {"id":"g07","idea":"A privacy-first habit tracker with local-only data","expect_sources":["reddit","web"],"min_evidence":3}, + {"id":"g08","idea":"A platform connecting clinical trials to eligible patients","expect_sources":["web","arxiv"],"min_evidence":4} +] diff --git a/tests/evals/run_eval.py b/tests/evals/run_eval.py new file mode 100644 index 0000000..a6548d4 --- /dev/null +++ b/tests/evals/run_eval.py @@ -0,0 +1,86 @@ +"""Eval harness — run each gold idea through the orchestrator and score it. + +Usage: python tests/evals/run_eval.py --gold tests/evals/gold --out tests/evals/report.md + +Runs the real LangGraph pipeline (Idea → Research → … → Pitch). 
def evaluate(gold: list[dict]) -> list[dict]:
    """Run each gold item through the orchestrator and return a scored row per item.

    Args:
        gold: Gold items; each needs ``id`` and ``idea`` and may carry
            ``min_evidence`` (default 0) and ``min_relevance`` (default 0.8).

    Returns:
        One dict per gold item with the scorer outputs consumed by ``to_markdown``.
    """
    from aps.orchestrator.events import EventBus
    from aps.orchestrator.graph import run_sync

    rows: list[dict] = []
    for g in gold:
        bus = EventBus()
        state = run_sync(g["idea"], bus, run_id=g["id"])
        research, prd = state.research, state.prd
        ev = list(research.evidence) if research else []
        trace = [{"tool": "research",
                  "evidence": [e.model_dump() for e in ev]}]
        # Score relevance once; it feeds both the rate and the pass/fail flag.
        relevance = scorers.evidence_relevance_rate(g["idea"], ev)
        min_relevance = g.get("min_relevance", 0.8)
        rows.append({
            "id": g["id"],
            "idea": g["idea"],
            "e2e": all([state.research, state.prd, state.trd, state.execution, state.pitch]),
            "prd_valid": scorers.prd_schema_valid(prd) if prd else False,
            "coverage": scorers.evidence_coverage(prd) if prd else 0.0,
            "diversity": scorers.source_diversity(trace),
            "evidence": len(ev),
            "min_evidence_met": len(ev) >= g.get("min_evidence", 0),
            "features": scorers.prd_feature_count(prd) if prd else 0,
            "feature_floor_met": scorers.meets_feature_floor(prd) if prd else False,
            "relevance_rate": relevance,  # E12
            "relevance_met": relevance >= min_relevance,
            "min_relevance": min_relevance,  # lets the report show the real threshold
            "titles_clean": scorers.feature_titles_clean(prd) if prd else False,  # E14
        })
    return rows


def _cell(text: str, limit: int = 40) -> str:
    """Truncate *text* and make it safe for one Markdown table cell."""
    # A raw "|" or newline would split the row into extra cells/lines.
    return " ".join(text[:limit].split()).replace("|", "\\|")


def to_markdown(rows: list[dict]) -> str:
    """Render scored rows (as produced by ``evaluate``) as a Markdown table report."""
    head = ("# Eval report\n\n"
            "| id | idea | e2e (E7) | prd_valid (E6) | coverage (E4) | sources (E3) "
            "| evidence | features (E11) | relevance (E12) | titles (E14) |\n"
            "|---|---|---|---|---|---|---|---|---|---|\n")
    body = "\n".join(
        f"| {r['id']} | {_cell(r['idea'])} | {'✓' if r['e2e'] else '✗'} | "
        f"{'✓' if r['prd_valid'] else '✗'} | {r['coverage']} | {r['diversity']} | "
        f"{r['evidence']}{'' if r['min_evidence_met'] else ' (below min)'} | "
        f"{r['features']}{' ✓' if r['feature_floor_met'] else ' (<3)'} | "
        f"{r['relevance_rate']}"
        f"{' ✓' if r['relevance_met'] else ' (<' + str(r.get('min_relevance', 0.8)) + ')'} | "
        f"{'✓' if r['titles_clean'] else '✗ fragment'} |"
        for r in rows
    )
    return head + body + "\n"


def main() -> None:
    """CLI entry point: load ``gold.json``, evaluate it, and write the Markdown report."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--gold", default=str(_HERE / "gold"))
    ap.add_argument("--out", default=str(_HERE / "report.md"))
    a = ap.parse_args()
    # Read with the same explicit encoding used for the write below, so the run does
    # not depend on the platform's default code page.
    gold = json.loads((Path(a.gold) / "gold.json").read_text(encoding="utf-8"))
    rows = evaluate(gold)
    Path(a.out).write_text(to_markdown(rows), encoding="utf-8")
    passed = sum(1 for r in rows if r["e2e"] and r["prd_valid"])
    print(f"wrote {a.out}: {passed}/{len(rows)} items passed e2e+prd_valid")
def _tokens(text: str) -> set[str]:
    """Return the set of lowercase alphanumeric words of 4+ characters in *text*."""
    return set(re.findall(r"[a-z0-9]{4,}", (text or "").lower()))


def _evidence_iter(trace):
    """Yield every evidence item from the dict records of a tool-call trace."""
    for call in trace or []:
        for ev in (call.get("evidence") if isinstance(call, dict) else []) or []:
            yield ev


def _ev_field(ev, name: str):
    """Read *name* from an evidence item that is either a dict or an object."""
    return ev.get(name) if isinstance(ev, dict) else getattr(ev, name, None)


def _prd_field(prd, name: str):
    """Read *name* from a PRD (or PRD feature) that is either a dict or an object."""
    if isinstance(prd, dict):
        return prd.get(name)
    return getattr(prd, name, None)


def selection_validity(trace) -> float:  # E1
    """Fraction of tool calls that selected a real, known tool name."""
    from aps.tools.registry import all_tools
    known = {t.name for t in all_tools()}
    calls = [c.get("tool") for c in (trace or []) if isinstance(c, dict)]
    if not calls:
        return 0.0
    return round(sum(1 for name in calls if name in known) / len(calls), 3)


def source_diversity(trace) -> int:  # E3
    """Number of distinct, non-empty evidence sources gathered across the run."""
    return len({src for ev in _evidence_iter(trace) if (src := _ev_field(ev, "source"))})


def evidence_coverage(prd) -> float:  # E4
    """Fraction of PRD features whose wording overlaps some cited source snippet.

    Returns 0.0 when the PRD has no features or its sources yield no tokens.
    """
    features = _prd_field(prd, "features") or []
    sources = _prd_field(prd, "sources") or []
    if not features:
        return 0.0
    source_toks: set[str] = set()
    for s in sources:
        source_toks |= _tokens((_ev_field(s, "title") or "") + " " + (_ev_field(s, "snippet") or ""))
    if not source_toks:
        return 0.0
    covered = 0
    for f in features:
        # `or ""` matters: formatting a missing field would produce the literal
        # word "None", which tokenizes to "none" and can falsely match a source.
        title = _prd_field(f, "title") or ""
        desc = _prd_field(f, "description") or ""
        if _tokens(f"{title} {desc}") & source_toks:
            covered += 1
    return round(covered / len(features), 3)


def prd_schema_valid(prd) -> bool:  # E6
    """True iff the PRD validates against the contract and carries real content."""
    from aps.state.models import PRD
    try:
        obj = prd if isinstance(prd, PRD) else PRD.model_validate(prd)
    except Exception:  # any validation failure simply means "not valid"
        return False
    return bool(obj.idea) and bool(obj.features) and bool(obj.requirements)


def prd_feature_count(prd) -> int:  # E11 (W3/W5 regression guard)
    """Number of features in the PRD. The eval guards `>= 3` on rich-signal ideas so the
    thin-PRD problem (a one-feature doc) can't regress unnoticed."""
    features = _prd_field(prd, "features") or []
    return len(features)


def meets_feature_floor(prd, floor: int = 3) -> bool:
    """Whether the PRD clears the feature floor (W3). Reported per gold idea by run_eval."""
    return prd_feature_count(prd) >= floor


# --------------------------------------------------------------------------- #
# Relevance metrics (E12–E14) — lock the research-quality work so it can't regress.
# --------------------------------------------------------------------------- #
def evidence_relevance_rate(idea: str, evidence, threshold: float = 0.15) -> float:  # E12
    """Fraction of evidence that scores at/above the relevance threshold for the idea.

    The headline guard: on-topic research should keep almost only on-topic evidence (target
    >= 0.8). A drop means the gate/query-planning regressed and junk is flowing back in.
    Returns 0.0 for empty evidence."""
    from aps.tools.analysis.score_evidence_relevance import idea_profile, relevance_score
    items = list(evidence or [])
    if not items:
        return 0.0
    prof = idea_profile(idea)
    on = sum(1 for e in items if relevance_score(prof, e) >= threshold)
    return round(on / len(items), 3)


def off_topic_rejection_rate(idea: str, junk_evidence, threshold: float = 0.15) -> float:  # E13
    """Fraction of KNOWN-JUNK items the gate would reject (score < threshold). Target 1.0 —
    seed this with the off-topic fixtures (sales jobs, "Stake bonus", sun-position API).
    Returns 1.0 for an empty junk list (nothing to let through)."""
    from aps.tools.analysis.score_evidence_relevance import idea_profile, relevance_score
    items = list(junk_evidence or [])
    if not items:
        return 1.0
    prof = idea_profile(idea)
    rejected = sum(1 for e in items if relevance_score(prof, e) < threshold)
    return round(rejected / len(items), 3)


# A feature title that LEADS with a conjunction/subordinator is an orphaned sentence fragment.
# The trailing class includes "," so the most common form ("However, ...") is caught too.
_FRAGMENT_TITLE = re.compile(
    r"^(however|therefore|moreover|furthermore|meanwhile|nevertheless|thus|hence|otherwise"
    r"|besides|although|though|whereas|while|when|where|because|since|unless|and|but|so|or|yet"
    r"|implement|solve|fix|todo)\b[\s:.,\-]", re.I)
# Template/scaffolding or truncation markers that should never appear in a clean feature title.
# Both the three-dot and the single-character ellipsis count as truncation.
_BAD_TITLE_MARKERS = ("]", "[", "feature request", "steps to reproduce", "describe the",
                      "documentation request", "...", "…")


def feature_titles_clean(prd) -> bool:  # E14
    """True iff no PRD feature title is a raw fragment — never leads with a conjunction
    ("However…/When…/Implement:"), never carries a stray bracket or template/truncation marker.
    A PRD with no features is trivially clean."""
    features = _prd_field(prd, "features") or []
    for f in features:
        t = (_prd_field(f, "title") or "").strip()
        if _FRAGMENT_TITLE.match(t):
            return False
        low = t.lower()
        if any(m in low for m in _BAD_TITLE_MARKERS):
            return False
    return True
+""" +from __future__ import annotations + +import time + +import pytest +from starlette.testclient import TestClient + +from aps.api.main import app +from aps.config.settings import get_settings + +KEY = get_settings().api_key +HDR = {"X-APS-Key": KEY} + + +@pytest.fixture(scope="module") +def client(): + with TestClient(app) as c: + yield c + + +def _wait_complete(client, run_id, tries=100): + for _ in range(tries): + r = client.get(f"/runs/{run_id}", headers=HDR) + if r.json().get("status") in ("complete", "degraded", "failed"): + return r.json() + time.sleep(0.05) + raise AssertionError("run did not finish in time") + + +def test_auth_required(): + with TestClient(app) as c: + assert c.post("/runs", json={"idea": "x"}).status_code == 401 + assert c.get("/runs/nope").status_code == 401 + + +def test_full_run_via_api(client): + r = client.post("/runs", json={"idea": "Build an AI SaaS for resume screening"}, + headers=HDR) + assert r.status_code == 202 + run_id = r.json()["run_id"] + # admission-control queue (2.1): submit_run returns "queued"; a worker thread may have + # already flipped it to "running" — both are valid immediately after submission (the race + # that made this assert flaky when it demanded "running"). The terminal state is checked below. + assert r.json()["status"] in ("queued", "running") + + done = _wait_complete(client, run_id) + # No LLM key in CI -> honest "degraded" (ran on fixture), still all five artifacts. 
+ assert done["status"] == "degraded" + assert set(done["artifacts"]) >= {"research", "prd", "trd", "execution", "pitch"} + + # artifact endpoint returns a real PRD + prd = client.get(f"/runs/{run_id}/artifacts/prd", headers=HDR) + assert prd.status_code == 200 + assert prd.json()["idea"] == "Build an AI SaaS for resume screening" + assert prd.json()["features"] + + # OpenAPI carried in the TRD artifact + trd = client.get(f"/runs/{run_id}/artifacts/trd", headers=HDR) + assert trd.status_code == 200 + assert trd.json()["api_spec"]["openapi"].startswith("3.") + + # W6: ?format=md returns Markdown; the plain JSON path is unchanged + md = client.get(f"/runs/{run_id}/artifacts/prd?format=md", headers=HDR) + assert md.status_code == 200 + assert md.headers["content-type"].startswith("text/markdown") + assert "# Product Requirements Document" in md.text + assert "Build an AI SaaS for resume screening" in md.text + # default (no format) is still JSON + assert client.get(f"/runs/{run_id}/artifacts/prd", headers=HDR).json()["idea"] + + # Startup Score (T1.4): derived endpoint, JSON + Markdown + sc = client.get(f"/runs/{run_id}/score", headers=HDR) + assert sc.status_code == 200 + body = sc.json() + assert 0 <= body["overall"] <= 10 and body["verdict"] and len(body["dimensions"]) == 5 + scmd = client.get(f"/runs/{run_id}/score?format=md", headers=HDR) + assert scmd.status_code == 200 and scmd.headers["content-type"].startswith("text/markdown") + assert "Startup Score" in scmd.text + + # Architecture Mermaid (T2.2): TRD only + mm = client.get(f"/runs/{run_id}/artifacts/trd?format=mermaid", headers=HDR) + assert mm.status_code == 200 and mm.headers["content-type"].startswith("text/markdown") + assert "```mermaid" in mm.text and "flowchart TD" in mm.text + # mermaid is not offered for non-trd artifacts + assert client.get(f"/runs/{run_id}/artifacts/prd?format=mermaid", headers=HDR).status_code == 404 + + # Autonomous Debate (T2.3): verdict + both sides, JSON + Markdown + db = 
client.get(f"/runs/{run_id}/debate", headers=HDR) + assert db.status_code == 200 + dbody = db.json() + assert dbody["verdict"] and dbody["build_case"] and dbody["risk_case"] + dbmd = client.get(f"/runs/{run_id}/debate?format=md", headers=HDR) + assert dbmd.status_code == 200 and "Verdict" in dbmd.text + + # GitHub Launch Mode (T2.4): dry-run preview creates nothing, returns the plan + lr = client.post(f"/runs/{run_id}/launch/github", json={"dry_run": True}, headers=HDR) + assert lr.status_code == 200 + lbody = lr.json() + assert lbody["dry_run"] is True and lbody["created"] is False + assert "Preview" in lbody["message"] + + # Explain-Why (T2.5): per-feature provenance, JSON + Markdown + ex = client.get(f"/runs/{run_id}/explain", headers=HDR) + assert ex.status_code == 200 + ebody = ex.json() + assert 0 <= ebody["overall_confidence"] <= 1 and isinstance(ebody["features"], list) + exmd = client.get(f"/runs/{run_id}/explain?format=md", headers=HDR) + assert exmd.status_code == 200 and "Explain-Why" in exmd.text + + +def test_unknown_artifact_and_run(client): + assert client.get("/runs/does_not_exist", headers=HDR).status_code == 404 + r = client.post("/runs", json={"idea": "x"}, headers=HDR) + rid = r.json()["run_id"] + _wait_complete(client, rid) + assert client.get(f"/runs/{rid}/artifacts/bogus", headers=HDR).status_code == 404 + + +def test_event_stream(client): + rid = client.post("/runs", json={"idea": "A privacy-first habit tracker"}, + headers=HDR).json()["run_id"] + _wait_complete(client, rid) + body = client.get(f"/runs/{rid}/events").text + assert "event: run_start" in body + assert "event: run_complete" in body + assert "event: agent_start" in body diff --git a/tests/integration/test_api_wiring.py b/tests/integration/test_api_wiring.py new file mode 100644 index 0000000..0735857 --- /dev/null +++ b/tests/integration/test_api_wiring.py @@ -0,0 +1,104 @@ +"""Frontend-wiring endpoints: /health, /models, /providers, /stats, /runs list, and the +per-run model 
override plumbing. Hermetic — no live LLM calls, no model construction (which +would need a key/provider package CI lacks); we assert plumbing, not provider I/O. +""" +from __future__ import annotations + +import time +import contextvars +from concurrent.futures import ThreadPoolExecutor + +import pytest +from starlette.testclient import TestClient + +from aps.api.main import app +from aps.config.settings import get_settings, set_run_model, reset_run_model, run_model + +KEY = get_settings().api_key +HDR = {"X-APS-Key": KEY} + + +@pytest.fixture(scope="module") +def client(): + with TestClient(app) as c: + yield c + + +def _wait(client, rid, tries=100): + for _ in range(tries): + if client.get(f"/runs/{rid}", headers=HDR).json().get("status") in ( + "complete", "degraded", "failed"): + return + time.sleep(0.05) + raise AssertionError("run did not finish") + + +# ── read-only metric/catalog endpoints ───────────────────────────────────── +def test_health_no_auth(client): + b = client.get("/health").json() + assert b["status"] == "ok" and isinstance(b["uptime_seconds"], (int, float)) + + +def test_models_catalog(client): + b = client.get("/models", headers=HDR).json() + ids = [p["id"] for p in b["providers"]] + assert "nim" in ids and "gemini" in ids + nim = next(p for p in b["providers"] if p["id"] == "nim") + assert any(m["id"] == "nvidia/nvidia-nemotron-nano-9b-v2" for m in nim["models"]) + assert b["default"]["provider"] and b["default"]["model"] + + +def test_providers_requires_auth_and_shape(client): + assert client.get("/providers").status_code == 401 + b = client.get("/providers", headers=HDR).json() + assert b["resolved"] and all("enabled" in r for r in b["providers"]) + + +def test_stats_shape(client): + assert client.get("/stats").status_code == 401 + b = client.get("/stats", headers=HDR).json() + for k in ("total_runs", "by_status", "in_flight", "total_evidence", + "total_tool_calls", "uptime_seconds"): + assert k in b + + +def 
test_runs_list_includes_started_run(client): + rid = client.post("/runs", json={"idea": "x"}, headers=HDR).json()["run_id"] + _wait(client, rid) + listing = client.get("/runs", headers=HDR).json() + assert listing["count"] >= 1 + assert any(r["run_id"] == rid for r in listing["runs"]) + + +def test_post_run_echoes_model_choice(client): + r = client.post("/runs", json={"idea": "x", + "config": {"provider": "nim", "model": "openai/gpt-oss-120b"}}, + headers=HDR) + assert r.status_code == 202 + body = r.json() + assert body["provider"] == "nim" and body["model"] == "openai/gpt-oss-120b" + _wait(client, body["run_id"]) + + +# ── per-run override plumbing (the contextvar + fan-out mechanism) ────────── +def test_run_model_contextvar_roundtrip(): + assert run_model() is None + tok = set_run_model("nim", "openai/gpt-oss-120b") + assert run_model() == {"provider": "nim", "model": "openai/gpt-oss-120b"} + reset_run_model(tok) + assert run_model() is None + + +def test_override_propagates_into_threadpool_workers(): + """Mirrors the supervisor: copy the context once per unit on this thread, .run() each in a + worker — the per-run override must be visible inside the worker (ThreadPoolExecutor does not + inherit context on its own).""" + tok = set_run_model("nim", "qwen/qwen3.5-122b-a10b") + try: + ctxs = [contextvars.copy_context() for _ in range(3)] + with ThreadPoolExecutor(max_workers=3) as pool: + seen = list(pool.map(lambda c: c.run(lambda: (run_model() or {}).get("model")), ctxs)) + assert seen == ["qwen/qwen3.5-122b-a10b"] * 3 + finally: + reset_run_model(tok) + assert run_model() is None diff --git a/tests/integration/test_composition_chain.py b/tests/integration/test_composition_chain.py new file mode 100644 index 0000000..c0ba919 --- /dev/null +++ b/tests/integration/test_composition_chain.py @@ -0,0 +1,89 @@ +"""Req-5 end-to-end: idea → Research → PRD → TRD → ExecutionPlan → Pitch, all offline. 
+ +Proves the typed composition chain: each agent consumes the previous typed object and +the idea propagates the whole way. Uses the existing research stub as the upstream +(the Research agent itself is LLM-driven / P1 and out of scope here). +""" +from __future__ import annotations + +from aps.agents.research.stub import stub_research +from aps.agents.product.agent import run_product +from aps.agents.architecture.agent import run_architecture +from aps.agents.execution.agent import run_execution +from aps.agents.presentation.agent import run_presentation +from aps.state.models import StudioState, PRD, TRD, ExecutionPlan, PitchPackage + +IDEA = "Build an AI SaaS for resume screening" + + +def _run_chain(idea: str) -> StudioState: + research = stub_research(idea) + prd = run_product(research) + trd = run_architecture(prd) + plan = run_execution(trd, prd=prd) + state = StudioState(idea=idea, research=research, prd=prd, trd=trd, execution=plan) + state.pitch = run_presentation(state) + return state + + +def test_full_chain_produces_schema_valid_artifacts(): + s = _run_chain(IDEA) + assert isinstance(s.prd, PRD) + assert isinstance(s.trd, TRD) + assert isinstance(s.execution, ExecutionPlan) + assert isinstance(s.pitch, PitchPackage) + + +def test_idea_propagates_through_the_chain(): + s = _run_chain(IDEA) + assert s.idea == IDEA + assert s.research.idea == IDEA + assert s.prd.idea == IDEA + assert IDEA.split()[-1].lower() in (s.trd.api_spec["info"]["title"] + s.pitch.investor_memo).lower() + + +def test_handoffs_are_non_trivial(): + s = _run_chain(IDEA) + # PRD grounded in research + assert s.prd.features and s.prd.sources + # TRD's API derived from PRD's features (entities beyond just User) + assert len(s.trd.data_model["entities"]) >= 2 + assert s.trd.api_spec["paths"] + # Execution backlog derived from features/endpoints, with effort + sprints + assert len(s.execution.backlog) >= 3 + assert s.execution.sprints + # Pitch references the real market + competitors + 
assert s.pitch.investor_memo and s.pitch.pitch_outline + + +def test_chain_is_deterministic(): + # The pipeline is deterministic; the only non-deterministic field is each Evidence's + # `retrieved_at` timestamp, so we compare the structural artifacts that exclude it. + a = _run_chain(IDEA) + b = _run_chain(IDEA) + assert a.prd.model_dump(exclude={"sources"}) == b.prd.model_dump(exclude={"sources"}) + assert a.trd.api_spec == b.trd.api_spec + assert a.trd.stack == b.trd.stack + assert a.execution.model_dump() == b.execution.model_dump() + + +def test_chain_works_for_a_different_idea(): + s = _run_chain("A marketplace for freelance illustrators") + assert isinstance(s.pitch, PitchPackage) + assert s.trd.api_spec["openapi"].startswith("3.") + + +def test_typed_handoff_research_to_prd(): + """Req-5 (3c): research's typed pains/competitors flow INTO the PRD as typed objects, + never via a re-prompt. assemble_prd validates/assembles over what upstream produced.""" + research = stub_research(IDEA) + assert research.pain_points and research.competitors # upstream actually has signal + + prd = run_product(research) + + # the PRD is grounded in the research object, not regenerated from the idea string + assert prd.idea == research.idea + assert prd.features # pains + competitors -> prioritized features + assert prd.requirements # user stories + acceptance criteria + # evidence is carried through verbatim as the PRD's sources (typed arrow, by URL) + assert [s.url for s in prd.sources] == [e.url for e in research.evidence] diff --git a/tests/integration/test_eval_runner.py b/tests/integration/test_eval_runner.py new file mode 100644 index 0000000..14f7fdc --- /dev/null +++ b/tests/integration/test_eval_runner.py @@ -0,0 +1,35 @@ +"""The orchestrator-driven eval runner scores gold ideas end-to-end (offline).""" +from __future__ import annotations + +import importlib.util +from pathlib import Path + +_RUN_EVAL = Path(__file__).resolve().parents[1] / "evals" / "run_eval.py" 
+_spec = importlib.util.spec_from_file_location("aps_run_eval", _RUN_EVAL) +run_eval = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(run_eval) + + +GOLD = [ + {"id": "g01", "idea": "Build an AI SaaS for resume screening", "min_evidence": 1}, + {"id": "g02", "idea": "A marketplace for renting camera gear", "min_evidence": 1}, +] + + +def test_evaluate_runs_each_gold_item_through_the_graph(): + rows = run_eval.evaluate(GOLD) + assert len(rows) == 2 + for r in rows: + assert r["e2e"] is True # all five artifacts produced + assert r["prd_valid"] is True # PRD validates against the contract + assert 0.0 <= r["coverage"] <= 1.0 + assert r["evidence"] >= 1 + # W5: the feature-count regression guard is recorded per idea + assert isinstance(r["features"], int) + assert isinstance(r["feature_floor_met"], bool) + + +def test_report_markdown_renders(): + md = run_eval.to_markdown(run_eval.evaluate(GOLD)) + assert md.startswith("# Eval report") + assert "g01" in md and "g02" in md diff --git a/tests/integration/test_noisy_idea_quality.py b/tests/integration/test_noisy_idea_quality.py new file mode 100644 index 0000000..e8a9e69 --- /dev/null +++ b/tests/integration/test_noisy_idea_quality.py @@ -0,0 +1,72 @@ +"""End-to-end: the contributor's noisy idea yields a CLEAN PRD (compression → PRD). + +Feeds the exact noise classes that polluted the PR-review/security run (nav chrome, emoji +issue-templates, greetings, directory/social domains) through the real compression + Product +agent, and asserts the resulting PRD features / competitors are credible — no nav text as the +headline feature, no LinkedIn-as-competitor. 
+""" +from __future__ import annotations + +from aps.state.models import Evidence +from aps.agents.research.agent import _compress +from aps.agents.product.agent import run_product + +IDEA = "AI tool that reviews PRs for security vulnerabilities" + +NOISY_EVIDENCE = [ + Evidence(source="web", url="https://greptile.io/", title="Greptile", + snippet="Log inGet StartedBook a Demo. The current manual code review is broken and slow."), + Evidence(source="github", url="https://github.com/x/y/issues/1", title="issue", + snippet="\U0001F4DA Documentation Request Description I noticed that scanning is missing."), + Evidence(source="web", url="https://www.linkedin.com/posts/someone", title="post", + snippet="Hi everyone! Sharing thoughts — supports lots of integrations and a dashboard."), + Evidence(source="web", url="https://crozdesk.com/security", title="directory", + snippet="Compare the best code review tools. Offers analytics and reporting."), + Evidence(source="web", url="https://zeropath.com/pricing", title="Zeropath", + snippet="Zeropath offers SAST scanning and integrates with GitHub. 
Pricing $40/mo."), + Evidence(source="reddit", url="https://reddit.com/r/x/2", title="rant", + snippet="Manual PR security review is painful and we waste hours every single sprint."), +] + + +def test_noisy_evidence_produces_clean_prd(): + research = _compress(IDEA, NOISY_EVIDENCE) + prd = run_product(research) + + # competitors: real product kept; social / directory dropped + comp_names = {c.name.lower() for c in research.competitors} + assert any("zeropath" in n for n in comp_names), comp_names + assert "linkedin" not in comp_names and "crozdesk" not in comp_names + + # pains are real complaints, not page chrome + for p in research.pain_points: + low = p.text.lower() + assert not low.startswith(("log in", "documentation request", "hi ")) + assert "book a demo" not in low + + # PRD features (derived from pains) are credible — never nav/greeting/template chrome + titles = [f.title.lower() for f in prd.features] + assert titles, "PRD should still produce features" + for t in titles: + assert "book a demo" not in t and "get started" not in t + assert "documentation request" not in t + assert not t.startswith("solve: hi ") + # the genuine complaint made it through to a feature + assert any("review" in t or "scan" in t or "security" in t or "manual" in t for t in titles) + + +def test_off_topic_complaint_does_not_become_a_pain(): + # An on-topic complaint + an off-topic-but-valid complaint (shares no idea vocabulary). + # The relevance gate must keep the on-topic pain and reject the off-topic one — even though + # both are syntactically real complaints the noise filter alone would pass. 
+ evidence = [ + Evidence(source="reddit", url="https://reddit.com/r/x/1", + title="rant", snippet="Manual PR security review is painful and slow every sprint."), + Evidence(source="reddit", url="https://reddit.com/r/x/2", + title="rant", snippet="My espresso machine is broken and the milk frother keeps clogging."), + ] + research = _compress(IDEA, evidence) + pains = " ".join(p.text.lower() for p in research.pain_points) + assert "espresso" not in pains and "frother" not in pains # off-topic complaint gated out + assert research.pain_points, "the on-topic security-review complaint should survive" + assert "review" in pains or "security" in pains or "manual" in pains diff --git a/tests/integration/test_orchestrator.py b/tests/integration/test_orchestrator.py new file mode 100644 index 0000000..69c5bfa --- /dev/null +++ b/tests/integration/test_orchestrator.py @@ -0,0 +1,89 @@ +"""Orchestrator: the real LangGraph pipeline runs end-to-end offline + emits events. + +Research has no LLM key here, so the research node degrades to the fixture brief; the +deterministic downstream agents run for real. The whole graph still reaches run_complete. +""" +from __future__ import annotations + +import asyncio + +from aps.orchestrator.events import EventBus +from aps.orchestrator.graph import run_sync +from aps.state.models import RunStatus, PRD, TRD, ExecutionPlan, PitchPackage, Event + +IDEA = "Build an AI SaaS for resume screening" + + +def _run(): + bus = EventBus() + return bus, run_sync(IDEA, bus, run_id="t_run") + + +def test_full_pipeline_produces_all_artifacts(): + _, state = _run() + # No LLM key in the test env -> research degrades to the fixture, so the run is honestly + # DEGRADED (not COMPLETE) but still produces all five downstream artifacts. 
+ assert state.status == RunStatus.DEGRADED + assert state.idea == IDEA + assert isinstance(state.prd, PRD) + assert isinstance(state.trd, TRD) + assert isinstance(state.execution, ExecutionPlan) + assert isinstance(state.pitch, PitchPackage) + assert state.research is not None + # real downstream work + assert state.trd.api_spec.get("openapi", "").startswith("3.") + assert state.execution.backlog + + +def test_event_lifecycle_is_complete_and_ordered(): + bus, state = _run() + history = bus.history("t_run") + types = [e.type for e in history] + assert types[0] == "run_start" + assert types[-1] == "run_complete" + # The core 5-agent spine always runs; the Launch Studio parallel branches (brand/legal/ + # funding, default on) add more, so assert the spine is present rather than a fixed count. + starts = [e.data.get("agent") for e in history if e.type == "agent_start"] + ends = [e.data.get("agent") for e in history if e.type == "agent_end"] + for agent in ("research", "product", "architecture", "execution", "presentation"): + assert agent in starts and agent in ends + # every agent that starts also ends — a balanced lifecycle + assert sorted(starts) == sorted(ends) + # the lifecycle is bracketed by run_start … run_complete + assert types.index("run_start") < types.index("agent_start") < types.index("run_complete") + # state carries the full trace for no-loop consumers + assert state.events and len(state.events) == len(history) + + +def test_research_degrades_to_stub_without_keys(): + bus, state = _run() + errors = [e for e in bus.history("t_run") + if e.type == "error" and e.data.get("agent") == "research"] + # no LLM key/dep here -> the fan-out emits informative "no evidence" diagnostics, then + # the orchestrator records exactly one graceful stub fallback; the run still succeeds. 
+ fallbacks = [e for e in errors if e.data.get("fallback") == "stub"] + assert len(fallbacks) == 1 + assert state.research.idea == IDEA + + +def test_eventbus_history_and_replay(): + bus = EventBus() + bus.publish("r", Event(type="agent_start", data={"a": 1})) + bus.publish("r", Event(type="run_complete", data={})) + assert [e.type for e in bus.history("r")] == ["agent_start", "run_complete"] + assert bus.is_complete("r") is True + + # a late subscriber still receives the full history via replay + async def drain(): + q = bus.subscribe("r") + return [q.get_nowait().type for _ in range(q.qsize())] + + assert asyncio.run(drain()) == ["agent_start", "run_complete"] + + +def test_two_runs_are_isolated(): + bus = EventBus() + run_sync("idea one", bus, run_id="a") + run_sync("idea two", bus, run_id="b") + assert bus.history("a") and bus.history("b") + assert all(e.type != "run_start" or e.data["idea"] == "idea one" for e in bus.history("a")) diff --git a/tests/integration/test_v1_real_data.py b/tests/integration/test_v1_real_data.py new file mode 100644 index 0000000..5db8734 --- /dev/null +++ b/tests/integration/test_v1_real_data.py @@ -0,0 +1,164 @@ +"""The /v1 endpoints that were wired from MOCK → REAL backend data. + +evidence-graph now shows real pain text + real source→pain edges; system/models lists the +real provider/model catalog with live availability; /v1/models exposes the selector catalog; +and POST /v1/runs accepts a per-run model/provider. 
+""" +from __future__ import annotations + +import pytest +from fastapi.testclient import TestClient + +from aps.api.main import app +from aps.api import main as main_mod +from aps.api.v1 import idmap +from aps.state.models import ( + StudioState, RunStatus, ResearchReturn, PRD, Competitor, PainPoint, Feature, Evidence, Severity, +) + +client = TestClient(app) + + +@pytest.fixture +def auth(): + r = client.post("/v1/auth/login", json={"email": "operator@aps.io", "password": "demo1234"}) + return {"Authorization": f"Bearer {r.json()['data']['token']}"} + + +def _seed() -> str: + ev = [Evidence(source="github", url="https://g/1", title="bug", + snippet="the resume parser drops valid pdfs"), + Evidence(source="reddit", url="https://r/2", title="rant", + snippet="ranking misses good candidates")] + research = ResearchReturn( + idea="AI resume screening", evidence=ev, + competitors=[Competitor(name="Acme", features=["x"])], + pain_points=[PainPoint(text="Parser drops valid PDF resumes", severity=Severity.HIGH, + source_evidence=ev)]) + prd = PRD(idea="AI resume screening", + features=[Feature(title="Reliable PDF parsing", description="x", priority="Must")], + sources=ev) + st = StudioState(idea="AI resume screening", status=RunStatus.COMPLETE, + research=research, prd=prd) + main_mod._STATES["run_real01"] = st + main_mod._RUNS["run_real01"] = {"run_id": "run_real01", "idea": st.idea, + "status": "complete", "artifacts": ["research", "prd"]} + return idmap.alias_for("run_real01") + + +def test_evidence_graph_uses_real_pain_text_and_edges(auth): + g = client.get(f"/v1/runs/{_seed()}/evidence-graph", headers=auth).json()["data"] + pains = [n for n in g["nodes"] if n["type"] == "pain"] + assert pains and "parser drops valid pdf" in pains[0]["label"].lower() # REAL pain text + assert not pains[0]["label"].startswith("Pain #") + ids = {n["id"] for n in g["nodes"]} + assert all(a in ids and b in ids for a, b in g["edges"]) + # the pain's github+reddit evidence → real 
source→pain edges + assert ["github", "pain1"] in g["edges"] and ["reddit", "pain1"] in g["edges"] + # the requirement node is labeled from the real PRD feature + assert any(n["id"] == "req1" and "Reliable" in n["label"] for n in g["nodes"]) + + +def test_system_models_are_real_catalog(auth): + rows = client.get("/v1/system/models", headers=auth).json()["data"] + assert len(rows) == 4 and sum(1 for m in rows if m["primary"]) == 1 + provs = {m["provider"] for m in rows} + assert provs & {"NVIDIA NIM", "Google Gemini"} # real providers, not Claude/GPT-4o + assert all(isinstance(m["available"], bool) for m in rows) + + +def test_v1_models_catalog_endpoint(auth): + d = client.get("/v1/models", headers=auth).json()["data"] + assert "providers" in d and "default" in d + assert d["default"]["provider"] in {"gemini", "nim"} + + +def test_start_run_accepts_model_and_provider(auth): + r = client.post("/v1/runs", json={"prompt": "an idea", "provider": "gemini", + "model": "gemini-2.0-flash"}, headers=auth) + assert r.status_code == 201 and r.json()["data"]["runId"].startswith("RUN_") + + +def test_explain_why_is_per_feature_with_confidence(auth): + d = client.get(f"/v1/runs/{_seed()}/explain", headers=auth).json()["data"] + assert 0 <= d["overallConfidence"] <= 100 + feats = d["features"] + assert feats and any("Reliable" in f["title"] for f in feats) # real PRD feature + f0 = feats[0] + assert set(f0) >= {"title", "priority", "why", "confidence", "evidence"} + assert isinstance(f0["confidence"], int) and 0 <= f0["confidence"] <= 100 + + +def test_github_launch_preview_without_token(auth): + d = client.post(f"/v1/runs/{_seed()}/launch", json={"dryRun": True}, headers=auth).json()["data"] + assert d["dryRun"] is True and d["created"] is False + assert d["repoName"] and d["issueCount"] >= 0 and "Preview" in d["message"] + + +def test_launch_404_when_no_prd(auth): + main_mod._STATES["run_noprd"] = StudioState(idea="x", status=RunStatus.RUNNING) + main_mod._RUNS["run_noprd"] = 
{"run_id": "run_noprd", "idea": "x", "status": "running", + "artifacts": []} + alias = idmap.alias_for("run_noprd") + r = client.post(f"/v1/runs/{alias}/launch", json={"dryRun": True}, headers=auth) + assert r.status_code == 404 and r.json()["error"]["code"] == "RUN_NOT_FOUND" + + +def test_launch_studio_artifacts_listed_and_render(auth): + # Brand/Legal/Funding/Availability/Compliance must surface in the /v1 catalog + render. + from aps.state.models import (BrandPackage, LegalPackage, FundingPackage, + AvailabilityReport, ComplianceReport) + st = StudioState(idea="AI resume screening", status=RunStatus.COMPLETE, + brand=BrandPackage(name="Acme"), legal=LegalPackage(), + funding=FundingPackage(), availability=AvailabilityReport(), + compliance=ComplianceReport()) + main_mod._STATES["run_ls01"] = st + main_mod._RUNS["run_ls01"] = {"run_id": "run_ls01", "idea": st.idea, + "status": "complete", "artifacts": []} + alias = idmap.alias_for("run_ls01") + rows = client.get(f"/v1/runs/{alias}/artifacts", headers=auth).json()["data"] + ids = {a["id"]: a for a in rows} + for aid in ("brand", "legal", "funding", "availability", "compliance"): + assert aid in ids and ids[aid]["status"] == "complete", f"{aid} missing/not complete" + assert ids[aid]["agents"] # has a producing-agent label + body = client.get(f"/v1/artifacts/{aid}/content?run={alias}", headers=auth).json()["data"] + assert body["format"] == "markdown" and body["body"] # renders to markdown + + +def test_disabled_branch_is_not_a_phantom_artifact(auth): + # compliance is OFF by default — when not produced it must NOT appear as a forever-queued card. 
+ st = StudioState(idea="x", status=RunStatus.RUNNING) + main_mod._STATES["run_noLS"] = st + main_mod._RUNS["run_noLS"] = {"run_id": "run_noLS", "idea": "x", "status": "running", + "artifacts": []} + alias = idmap.alias_for("run_noLS") + ids = {a["id"] for a in client.get(f"/v1/runs/{alias}/artifacts", headers=auth).json()["data"]} + assert "compliance" not in ids # disabled + absent → not shown + + +def test_system_providers_is_real_failover_chain(auth): + d = client.get("/v1/system/providers", headers=auth).json()["data"] + assert isinstance(d["chain"], list) and d["chain"] # ordered failover path + names = {p["name"] for p in d["registry"]} + assert {"gemini", "nim", "groq"} <= names # real registry, not GPT-4o + p0 = d["chain"][0] + assert p0["primary"] is True + assert set(p0) >= {"name", "model", "available", "breakerOpen", "signup"} + assert all(isinstance(p["available"], bool) and isinstance(p["breakerOpen"], bool) + for p in d["registry"]) + + +def test_trd_mermaid_artifact_content(auth): + from aps.state.models import TRD + trd = TRD(stack=["FastAPI", "Postgres"], + data_model={"architecture": {"components": ["API Gateway", "Worker"], + "data_flow": ["API Gateway -> Worker"]}, + "entities": {"User": {"fields": {"id": "int", "email": "str"}}}}) + st = StudioState(idea="AI resume screening", status=RunStatus.COMPLETE, trd=trd) + main_mod._STATES["run_trd01"] = st + main_mod._RUNS["run_trd01"] = {"run_id": "run_trd01", "idea": st.idea, + "status": "complete", "artifacts": ["trd"]} + alias = idmap.alias_for("run_trd01") + d = client.get(f"/v1/artifacts/trd/content?run={alias}&format=mermaid", + headers=auth).json()["data"] + assert d["format"] == "mermaid" and "```mermaid" in d["body"] diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_agent_tools.py b/tests/unit/test_agent_tools.py new file mode 100644 index 0000000..df384b5 --- /dev/null +++ b/tests/unit/test_agent_tools.py 
@@ -0,0 +1,150 @@ +"""Product / Architecture / Execution / Presentation tools: shapes + OpenAPI validity.""" +from __future__ import annotations + +from aps.state.models import PainPoint, Competitor, Persona, Feature, Severity, PRD, TRD +from aps.tools.product import ( + generate_personas, generate_user_stories, prioritize_features, + define_mvp_scope, acceptance_criteria, assemble_prd, +) +from aps.tools.architecture import ( + design_data_model, design_api_contract, choose_tech_stack, + estimate_scale, design_architecture, assemble_trd, +) +from aps.tools.execution import ( + plan_repo_structure, generate_backlog, estimate_effort, + plan_sprints, generate_roadmap, estimate_infra_cost, +) +from aps.tools.presentation import ( + generate_pitch_outline, generate_demo_script, + generate_investor_memo, generate_judge_brief, +) + + +PAINS = [PainPoint(text="parser drops PDFs", severity=Severity.HIGH), + PainPoint(text="matching misses candidates", severity=Severity.MED)] + + +# ---- product ---------------------------------------------------------------- +def test_personas_from_pains(): + out = generate_personas.TOOL.run(idea="x", pain_points=PAINS) + assert out.ok and out.payload and isinstance(out.payload[0], Persona) + assert out.payload[0].frustrations + + +def test_persona_goals_are_clean_capabilities_not_raw_pain(): + # goals = the positive inverse (a capability), NOT a "Resolve: " paste + pains = [PainPoint(text="It is unusable", severity=Severity.HIGH), + PainPoint(text="no way to bulk delete", severity=Severity.MED)] + personas = generate_personas.TOOL.run(idea="x", pain_points=pains).payload + all_goals = [g for p in personas for g in p.goals] + assert all_goals + assert not any(g.startswith("Resolve:") for g in all_goals) + assert not any("it is unusable" in g.lower() or "no way to" in g.lower() for g in all_goals) + # frustrations still carry the raw pains (they ARE the frustrations) + all_frust = [f for p in personas for f in p.frustrations] + assert 
any("unusable" in f.lower() for f in all_frust) + + +def test_prioritize_maps_severity_to_moscow(): + out = prioritize_features.TOOL.run(pain_points=PAINS, competitors=[]) + pri = {f.priority for f in out.payload} + assert "Must" in pri # high severity -> Must + assert out.payload[0].priority == "Must" # sorted Must-first + + +def test_user_stories_and_scope_and_ac(): + personas = generate_personas.TOOL.run(idea="x", pain_points=PAINS).payload + stories = generate_user_stories.TOOL.run(personas=personas, pain_points=PAINS).payload + assert stories and stories[0].lower().startswith("as a") + feats = prioritize_features.TOOL.run(pain_points=PAINS, competitors=[]).payload + scope = define_mvp_scope.TOOL.run(features=feats).payload + assert "MVP includes" in scope + ac = acceptance_criteria.TOOL.run(features=feats).payload + assert ac["requirements"] and ac["rows"][0]["criteria"] + + +def test_assemble_prd_validates(): + feats = [Feature(title="Parse PDFs", description="d", priority="Must")] + out = assemble_prd.TOOL.run(idea="resume", features=feats, requirements=["r"]) + assert out.ok and isinstance(out.payload, PRD) and out.payload.idea == "resume" + + +# ---- architecture ----------------------------------------------------------- +def _data_model(): + feats = [Feature(title="Resume parsing engine", description="d", priority="Must"), + Feature(title="Candidate ranking", description="d", priority="Should")] + return design_data_model.TOOL.run(features=feats, personas=[]).payload + + +def test_data_model_has_user_and_feature_entities(): + dm = _data_model() + ents = dm["entities"] + assert "User" in ents + assert len(ents) >= 2 + assert "id" in ents["User"]["fields"] + + +def test_design_api_contract_emits_valid_openapi(): + dm = _data_model() + doc = design_api_contract.TOOL.run(data_model=dm, idea="resume screening").payload + # OpenAPI 3.0 structural validity + assert doc["openapi"].startswith("3.") + assert "title" in doc["info"] and "version" in doc["info"] + 
assert doc["paths"], "must declare paths" + assert doc["components"]["schemas"], "must declare component schemas" + for path, ops in doc["paths"].items(): + assert path.startswith("/") + verbs = [v for v in ("get", "post", "put", "delete") if v in ops] + assert verbs, f"{path}: at least one operation" + for v in verbs: + assert "responses" in ops[v] + + +def test_stack_scale_arch_and_trd(): + scale = estimate_scale.TOOL.run(idea="resume saas", features=[], personas=[]).payload + assert "scale" in scale.lower() + stack = choose_tech_stack.TOOL.run(requirements=["AI scoring", "search match"], + scale_estimate=scale).payload + assert any("FastAPI" in s for s in stack) + assert any("ML" in s or "search" in s.lower() for s in stack) + arch = design_architecture.TOOL.run(stack=stack, data_model=_data_model()).payload + assert arch["components"] and arch["data_flow"] + trd = assemble_trd.TOOL.run(data_model=_data_model(), + api_spec=design_api_contract.TOOL.run(data_model=_data_model()).payload, + stack=stack, scale_estimate=scale).payload + assert isinstance(trd, TRD) and trd.stack + + +# ---- execution -------------------------------------------------------------- +def test_execution_pipeline_tools(): + feats = [Feature(title="Parse PDFs", description="d", priority="Must")] + repo = plan_repo_structure.TOOL.run(idea="x", stack=["FastAPI", "Redis + worker", "ML"]).payload + assert "backend/app/workers" in repo["dirs"] and "backend/app/ml" in repo["dirs"] + backlog = generate_backlog.TOOL.run(features=feats, api_spec={"paths": {"/a": {}, "/b": {}}}).payload + assert len(backlog) >= 3 and backlog[0]["id"].startswith("APS-") + est = estimate_effort.TOOL.run(backlog=backlog).payload + assert est["total_points"] > 0 and all("points" in b for b in est["backlog"]) + sprints = plan_sprints.TOOL.run(backlog=est["backlog"], velocity=8).payload + assert sprints and all(s["points"] <= 8 or len(s["items"]) == 1 for s in sprints) + roadmap = 
generate_roadmap.TOOL.run(sprints=sprints).payload + assert "MVP" in roadmap + cost = estimate_infra_cost.TOOL.run(stack=["FastAPI", "ML inference", "Redis"], + scale_estimate="10k users").payload + assert "$" in cost and "/mo" in cost + + +# ---- presentation ----------------------------------------------------------- +def test_presentation_tools_produce_text(): + outline = generate_pitch_outline.TOOL.run(idea="resume", market_size="$3B", + pain_points=PAINS, mvp_scope="MVP x").payload + assert "Problem" in outline and "Ask" in outline + demo = generate_demo_script.TOOL.run(idea="resume", + features=[Feature(title="Parse", description="d")], + personas=[Persona(name="R", role="recruiter")]).payload + assert "Demo" in demo + memo = generate_investor_memo.TOOL.run(idea="resume", market_size="$3B", + competitors=[Competitor(name="Acme")]).payload + assert "INVESTOR MEMO" in memo and "Acme" in memo + brief = generate_judge_brief.TOOL.run(idea="resume", tool_count=52, + artifacts=["PRD", "TRD"]).payload + assert "Req1" in brief and "52" in brief diff --git a/tests/unit/test_agents.py b/tests/unit/test_agents.py new file mode 100644 index 0000000..0554656 --- /dev/null +++ b/tests/unit/test_agents.py @@ -0,0 +1,62 @@ +"""Each downstream agent returns its exact typed object, populated from real upstream data.""" +from __future__ import annotations + +from aps.state.models import PRD, TRD, ExecutionPlan, PitchPackage, StudioState +from aps.agents.product.agent import run_product +from aps.agents.architecture.agent import run_architecture +from aps.agents.execution.agent import run_execution +from aps.agents.presentation.agent import run_presentation + + +def test_product_agent_returns_populated_prd(rich_research): + prd = run_product(rich_research) + assert isinstance(prd, PRD) + assert prd.idea == rich_research.idea + assert prd.personas and prd.features and prd.requirements + assert prd.mvp_scope + # features trace back to the pains + assert any("PDF" in f.title or 
"Parser" in f.title or "parser" in f.title.lower() + for f in prd.features) + # top pain (HIGH) yields a Must feature + assert any(f.priority == "Must" for f in prd.features) + # sources carried from research evidence + assert prd.sources + + +def test_architecture_agent_returns_trd_with_valid_openapi(rich_research): + prd = run_product(rich_research) + trd = run_architecture(prd) + assert isinstance(trd, TRD) + assert trd.api_spec.get("openapi", "").startswith("3.") + assert trd.api_spec.get("paths") + assert "entities" in trd.data_model and "User" in trd.data_model["entities"] + assert trd.stack and trd.scale_estimate + + +def test_execution_agent_returns_plan(rich_research): + prd = run_product(rich_research) + trd = run_architecture(prd) + plan = run_execution(trd, prd=prd) + assert isinstance(plan, ExecutionPlan) + assert plan.backlog and plan.sprints + assert plan.roadmap and plan.infra_cost + assert all("points" in item for item in plan.backlog) + + +def test_presentation_agent_returns_pitch(rich_research): + prd = run_product(rich_research) + trd = run_architecture(prd) + plan = run_execution(trd, prd=prd) + state = StudioState(idea=rich_research.idea, research=rich_research, + prd=prd, trd=trd, execution=plan) + pitch = run_presentation(state) + assert isinstance(pitch, PitchPackage) + assert pitch.pitch_outline and pitch.demo_script and pitch.investor_memo + assert "JUDGE BRIEF" in pitch.investor_memo # judge brief folded in (decision.md D4) + + +def test_product_agent_handles_empty_research(): + from aps.state.models import ResearchReturn + prd = run_product(ResearchReturn(idea="bare idea")) + assert isinstance(prd, PRD) and prd.idea == "bare idea" + assert prd.personas # always at least one persona diff --git a/tests/unit/test_analysis_quality.py b/tests/unit/test_analysis_quality.py new file mode 100644 index 0000000..8315b80 --- /dev/null +++ b/tests/unit/test_analysis_quality.py @@ -0,0 +1,136 @@ +"""Analysis-layer quality guards: no 
job/market-report contamination, real competitors +surfaced, and demand evidence yields pains. Regression cover for the live failures where +job postings became PRD features and pain extraction returned nothing. +""" +from __future__ import annotations + +from aps.tools.analysis import build_competitor_matrix as cm +from aps.tools.analysis import extract_pain_points as pp +from aps.tools.product import prioritize_features as pf +from aps.tools.analysis._sources import evidence_kind, is_extractable +from aps.state.models import Evidence, Competitor + + +# ── source-type tagging + the extraction gate ────────────────────────────── +def test_evidence_kind_classifies_each_source_type(): + cases = { + "job": Evidence(source="jobs", url="https://remotive.com/job/1", title="Copywriter", snippet="role"), + "market_report": Evidence(source="web", url="https://x.com/r", title="Report", + snippet="Market size expected to reach $5B by 2030, CAGR 12%."), + "news": Evidence(source="web", url="https://finance.yahoo.com/x", title="N", snippet="story"), + "reference": Evidence(source="arxiv", url="https://arxiv.org/abs/1", title="paper", snippet="study"), + "discussion": Evidence(source="reddit", url="https://reddit.com/r/x", title="t", snippet="post"), + "product": Evidence(source="web", url="https://habitshare.app/", title="HabitShare", snippet="app"), + "fixture": Evidence(source="web", url="https://x", title="[fixture] X", snippet="placeholder"), + } + for expected, ev in cases.items(): + assert evidence_kind(ev) == expected, f"{expected} misclassified" + + +def test_only_substantive_kinds_are_extractable(): + barred = ["job", "market_report", "news", "fixture"] + allowed = ["reference", "discussion", "product"] + samples = { + "job": Evidence(source="jobs", url="https://remotive.com/j", title="t", snippet="s"), + "market_report": Evidence(source="web", url="https://x", title="t", snippet="CAGR forecast to 2031"), + "news": Evidence(source="web", url="https://yahoo.com/x", 
title="t", snippet="s"), + "fixture": Evidence(source="web", url="https://x", title="[fixture] t", snippet="s"), + "reference": Evidence(source="wikipedia", url="https://wikipedia.org/x", title="t", snippet="s"), + "discussion": Evidence(source="hackernews", url="https://news.ycombinator.com/x", title="t", snippet="s"), + "product": Evidence(source="web", url="https://acme.io/", title="Acme", snippet="s"), + } + for k in barred: + assert is_extractable(samples[k]) is False + for k in allowed: + assert is_extractable(samples[k]) is True + + +def _comp(ev: list[Evidence]) -> list[Competitor]: + return cm.TOOL.run(evidence=[e.model_dump() for e in ev]).payload + + +# ── build_competitor_matrix ──────────────────────────────────────────────── +def test_job_postings_are_not_competitors(): + ev = [Evidence(source="jobs", url="https://remotive.com/job/1", + title="Copywriter @ Coalition Technologies", + snippet="We offer remote work and support the team. Copywriter @ Coalition Technologies.")] + names = [c.name.lower() for c in _comp(ev)] + assert names == [] or all("coalition" not in n and "copywriter" not in n for n in names) + + +def test_market_report_and_job_hosts_excluded(): + ev = [ + Evidence(source="web", url="https://yahoo.com/finance/habit", + title="Habit market", snippet="Market size expected to reach $5B by 2030, CAGR 12%."), + Evidence(source="web", url="https://wiseguyreports.com/r/1", + title="Report", snippet="This market research report offers forecast to 2031."), + Evidence(source="web", url="https://remotive.com/remote-jobs/x", + title="Job", snippet="We offer a great role and support growth."), + ] + names = {c.name.lower() for c in _comp(ev)} + assert not ({"yahoo", "wiseguyreports", "remotive"} & names) + + +def test_producthunt_title_is_surfaced_as_competitor(): + ev = [Evidence(source="producthunt", url="https://www.producthunt.com/posts/twinbit", + title="TwinBit", snippet="TwinBit lets couples share habits and sync streaks.")] + names = {c.name 
for c in _comp(ev)} + assert "TwinBit" in names # real product surfaced despite producthunt.com being a research host + + +def test_show_hn_title_is_surfaced(): + ev = [Evidence(source="hackernews", url="https://news.ycombinator.com/item?id=1", + title="Show HN: HabitPair – shared habits for couples", + snippet="I built HabitPair so my partner and I can share habit streaks.")] + names = {c.name for c in _comp(ev)} + assert "HabitPair" in names + + +def test_real_product_domain_still_kept(): + ev = [Evidence(source="web", url="https://habitshare.app/", + title="HabitShare", snippet="HabitShare offers shared tracking. Free plan available.")] + names = {c.name.lower() for c in _comp(ev)} + assert any("habitshare" in n for n in names) + + +# ── extract_pain_points ──────────────────────────────────────────────────── +def test_demand_evidence_yields_a_pain(): + ev = [Evidence(source="reddit", url="https://r/1", title="ask", + snippet="I was looking for a privacy-first habit tracker for couples but couldn't find one.")] + pains = pp.TOOL.run(evidence=[e.model_dump() for e in ev]).payload + assert len(pains) >= 1 # unmet-need is a pain (was 0 before the demand tier) + + +def test_html_entities_are_decoded_in_pains(): + """A snippet with HTML entities ('I'm looking … couldn't find') must decode to + real text, not leak as junk like 'I& x27' after punctuation stripping.""" + ev = [Evidence(source="reddit", url="https://r/1", title="ask", + snippet="I'm looking for a privacy-first habit tracker but couldn't find one.")] + pains = pp.TOOL.run(evidence=[e.model_dump() for e in ev]).payload + assert pains, "demand pain should still be extracted" + joined = " ".join(p.text for p in pains).lower() + assert "x27" not in joined and "&#" not in joined and "&" not in joined + + +def test_nav_and_template_chrome_still_rejected(): + ev = [ + Evidence(source="web", url="https://x/1", title="nav", snippet="Log in Get Started Book a Demo"), + Evidence(source="github", 
url="https://github.com/x/y/issues/1", title="t", + snippet="Steps to reproduce: open the app. Expected behavior: it works."), + ] + pains = pp.TOOL.run(evidence=[e.model_dump() for e in ev]).payload + assert pains == [] + + +# ── end-to-end cascade guard ─────────────────────────────────────────────── +def test_job_text_never_becomes_a_feature(): + """The reported bug: a Remotive job posting flowed into the PRD as + 'Differentiator: copywriter @ coalition technologies'. With job evidence excluded from + the competitor matrix, no such feature can be derived.""" + ev = [Evidence(source="jobs", url="https://remotive.com/job/1", + title="Copywriter @ Coalition Technologies", + snippet="We offer remote work and support the team.")] + comps = _comp(ev) + feats = pf.TOOL.run(pain_points=[], competitors=[c.model_dump() for c in comps]).payload + titles = " ".join(f.title.lower() for f in feats) + assert "copywriter" not in titles and "coalition" not in titles diff --git a/tests/unit/test_analysis_tools.py b/tests/unit/test_analysis_tools.py new file mode 100644 index 0000000..b006398 --- /dev/null +++ b/tests/unit/test_analysis_tools.py @@ -0,0 +1,113 @@ +"""Analysis tools: deterministic behavior on crafted evidence (the 4 finished stubs +).""" +from __future__ import annotations + +from aps.state.models import Evidence, ToolResult +from aps.tools.analysis import ( + extract_pain_points, dedupe_and_rank_evidence, build_competitor_matrix, + estimate_market_size, rank_opportunities, detect_trend_signal, + cluster_themes, sentiment_breakdown, extract_competitor_features, + validate_with_sources, +) + + +def _ev(): + return [ + Evidence(source="reddit", url="https://reddit.com/r/x/1", title="rant", + snippet="The parser is broken and slow, I hate it."), + Evidence(source="web", url="https://acme.io/pricing", title="Acme", + snippet="Acme supports PDF export and integrates with Slack. 
Pricing $29/mo."), + Evidence(source="web", url="https://acme.io/features", title="Acme f", + snippet="Offers real-time analytics and a dashboard."), + Evidence(source="web", url="https://report.example.com", title="market", + snippet="The market is worth $3 billion and growing."), + ] + + +def _dump(ev): + return [e.model_dump() for e in ev] + + +def test_extract_pain_points_finds_high_severity(): + out = extract_pain_points.TOOL.run(evidence=_dump(_ev())) + assert out.ok and out.payload + assert any(p.severity.value == "high" for p in out.payload) + + +def test_dedupe_collapses_duplicate_urls(): + e = _ev() + dupe = Evidence(source="reddit", url="https://reddit.com/r/x/1?utm=1", + title="rant", snippet="dup") + out = dedupe_and_rank_evidence.TOOL.run(evidence=_dump(e + [dupe])) + urls = [x.url for x in out.payload] + assert len(urls) == len(set(_norm(u) for u in urls)) + + +def _norm(u): + return u.split("?")[0] + + +def test_build_competitor_matrix_skips_research_sources(): + out = build_competitor_matrix.TOOL.run(evidence=_dump(_ev())) + assert out.ok + names = [c.name for c in out.payload] + # acme.io is a competitor; reddit/report are not rivals + assert any("Acme" in n for n in names) + assert not any(n.lower().startswith("reddit") for n in names) + + +def test_estimate_market_size_extracts_figure(): + out = estimate_market_size.TOOL.run(evidence=_dump(_ev()), topic="resumes") + assert out.ok and isinstance(out.payload, str) + assert "$3.0B" in out.payload or "$3B" in out.payload + + +def test_estimate_market_size_no_figure_is_graceful(): + e = [Evidence(source="web", url="https://x.com/a", title="t", + snippet="lots of hiring demand and growing adoption")] + out = estimate_market_size.TOOL.run(evidence=_dump(e)) + assert out.ok and "No explicit market figure" in out.payload + + +def test_estimate_market_size_floors_implausible_figures(): + # a sub-$1M "$" mention (a price/salary, not a market) must NOT be reported as a TAM + e = [Evidence(source="web", 
url="https://x.com/s", title="pay", + snippet="median pay is $340 thousand for this role")] + out = estimate_market_size.TOOL.run(evidence=_dump(e)) + assert "No explicit market figure" in out.payload # not asserted as a TAM + assert "credible-TAM floor" in out.payload # flagged with provenance + + +def test_rank_opportunities_orders_by_score(): + out = rank_opportunities.TOOL.run(evidence=_dump(_ev())) + assert out.ok and out.payload + scores = [o["score"] for o in out.payload] + assert scores == sorted(scores, reverse=True) + + +def test_detect_trend_signal_directions(): + assert detect_trend_signal.TOOL.run(series=[10, 14, 18, 25, 31, 40]).payload["direction"] == "rising" + assert detect_trend_signal.TOOL.run(series=[40, 31, 25, 18, 10]).payload["direction"] == "declining" + assert detect_trend_signal.TOOL.run(series=[20, 20, 20, 20]).payload["direction"] == "flat" + assert detect_trend_signal.TOOL.run(series=[5]).payload["direction"] == "unknown" + + +def test_cluster_themes_and_sentiment_and_features_run(): + ev = _dump(_ev()) + assert cluster_themes.TOOL.run(evidence=ev).ok + sb = sentiment_breakdown.TOOL.run(evidence=ev) + assert sb.ok and sb.payload["total"] == len(ev) + feats = extract_competitor_features.TOOL.run(evidence=ev) + assert feats.ok and any("support" in f.lower() or "offer" in f.lower() for f in feats.payload) + + +def test_validate_with_sources_drops_bad_urls(): + e = [Evidence(source="web", url="https://x.com/a", title="A", snippet="real content here"), + Evidence(source="web", url="not-a-url", title="B", snippet="x")] + out = validate_with_sources.TOOL.run(evidence=_dump(e)) + assert out.ok and len(out.payload) == 1 + + +def test_all_analysis_return_toolresult(): + for mod in (extract_pain_points, dedupe_and_rank_evidence, build_competitor_matrix, + estimate_market_size, rank_opportunities, detect_trend_signal): + assert isinstance(mod.TOOL.run(evidence=_dump(_ev())), ToolResult) diff --git a/tests/unit/test_api_v1.py 
b/tests/unit/test_api_v1.py new file mode 100644 index 0000000..a4ed2cd --- /dev/null +++ b/tests/unit/test_api_v1.py @@ -0,0 +1,272 @@ +"""The /v1 Frontend Data Contract (docs/backenddatacontract.md) — envelope, auth, every +endpoint's required keys present (§0.8), deterministic mocks, and the websocket stream. + +Hermetic: starts no real orchestrator run for the data-shape tests — it injects a fully-formed +StudioState straight into the shared engine's in-memory store and aliases it, so the mappers +run against realistic data with zero network. One test does start a real run to prove the +POST→dashboard path (the orchestrator degrades to the deterministic stub without keys). +""" +from __future__ import annotations + +import pytest +from fastapi.testclient import TestClient + +from aps.api.main import app +from aps.api import main as main_mod +from aps.api.v1 import idmap +from aps.state.models import ( + StudioState, RunStatus, ResearchReturn, PRD, TRD, ExecutionPlan, PitchPackage, + Competitor, PainPoint, Persona, Feature, Evidence, Severity, +) + +client = TestClient(app) + + +# --------------------------------------------------------------------------- # +# Fixtures +# --------------------------------------------------------------------------- # +@pytest.fixture +def token() -> str: + r = client.post("/v1/auth/login", json={"email": "operator@aps.io", "password": "demo1234"}) + assert r.status_code == 200 + return r.json()["data"]["token"] + + +@pytest.fixture +def auth(token): + return {"Authorization": f"Bearer {token}"} + + +def _seed_state() -> str: + """Inject a complete StudioState into the engine and return its RUN_ alias (no network).""" + ev = [Evidence(source="github", url="https://g/1", title="ATS drops PDFs", + snippet="The parser keeps dropping valid resumes"), + Evidence(source="reddit", url="https://r/2", title="cant find tracker", + snippet="I can't find a privacy-respecting habit tracker")] + research = ResearchReturn( + idea="privacy habit 
tracker", market_size="$8.4B", + competitors=[Competitor(name="Habitica", features=["streaks", "reminders"]), + Competitor(name="Streaks", features=["reminders"])], + pain_points=[PainPoint(text="Can't find a privacy-respecting tracker", + severity=Severity.HIGH, source_evidence=ev)], + evidence=ev, tool_calls=12) + prd = PRD(idea="privacy habit tracker", + personas=[Persona(name="Sam", role="user", goals=["track offline"])], + features=[Feature(title="Offline Sync", description="x", priority="Must")], + sources=ev) + trd = TRD(stack=["FastAPI", "React"], api_spec={"openapi": "3.0.3"}) + state = StudioState(idea="privacy habit tracker", status=RunStatus.COMPLETE, + current_agent=None, research=research, prd=prd, trd=trd, + execution=ExecutionPlan(roadmap="Q1"), pitch=PitchPackage(demo_script="x")) + backend_id = "run_seed01" + main_mod._STATES[backend_id] = state + main_mod._RUNS[backend_id] = {"run_id": backend_id, "idea": state.idea, + "status": "complete", "artifacts": ["research", "prd"]} + return idmap.alias_for(backend_id) + + +# --------------------------------------------------------------------------- # +# Envelope + auth +# --------------------------------------------------------------------------- # +def test_login_returns_token_and_user(): + r = client.post("/v1/auth/login", json={"email": "operator@aps.io", "password": "demo1234"}) + body = r.json() + assert body["success"] is True + assert set(body["meta"]) >= {"requestId", "timestamp"} + assert body["data"]["token"] and body["data"]["user"]["email"] == "operator@aps.io" + + +def test_login_bad_password_error_envelope(): + r = client.post("/v1/auth/login", json={"email": "operator@aps.io", "password": "wrong"}) + assert r.status_code == 401 + body = r.json() + assert body["success"] is False + assert body["error"]["code"] == "INVALID_CREDENTIALS" + + +def test_signup_then_login_flow(): + email = "new.operator@aps.io" + r = client.post("/v1/auth/signup", json={"name": "New Op", "email": email, + 
"password": "secret12", "role": "Investor"}) + assert r.status_code == 201 and r.json()["data"]["user"]["role"] == "Investor" + # duplicate → 422 EMAIL_ALREADY_EXISTS + r2 = client.post("/v1/auth/signup", json={"name": "New Op", "email": email, + "password": "secret12", "role": "Investor"}) + assert r2.status_code == 422 and r2.json()["error"]["code"] == "EMAIL_ALREADY_EXISTS" + # login with the new account + r3 = client.post("/v1/auth/login", json={"email": email, "password": "secret12"}) + assert r3.status_code == 200 + + +def test_protected_route_requires_bearer(): + r = client.get("/v1/system/status") + assert r.status_code == 401 and r.json()["error"]["code"] == "UNAUTHORIZED" + + +def test_signup_validation_error_has_fields(): + r = client.post("/v1/auth/signup", json={"name": "x", "email": "bad", "password": "short", + "role": "Nope"}) + assert r.status_code == 422 + assert r.json()["error"]["code"] == "VALIDATION_ERROR" + + +# --------------------------------------------------------------------------- # +# System page — every contract-required key present (§0.8 "never omit a key") +# --------------------------------------------------------------------------- # +def test_system_status_keys(auth): + d = client.get("/v1/system/status", headers=auth).json()["data"] + assert set(d) >= {"status", "agentCount", "activeSwarms", "uptimePct", "apiStatus", "version"} + + +def test_system_health_keys(auth): + d = client.get("/v1/system/health", headers=auth).json()["data"] + assert set(d) >= {"agentsActive", "toolsOnline", "memoryLoad", "modelsReady", + "evidenceItems", "runsToday", "tokensUsed", "runtimeSec", "uptimePct", + "systemVersion", "statusLabel", "activeRunId"} + + +def test_system_models_shape(auth): + rows = client.get("/v1/system/models", headers=auth).json()["data"] + assert len(rows) == 4 and sum(1 for m in rows if m["primary"]) == 1 + for m in rows: + assert set(m) >= {"id", "name", "provider", "icon", "available", "latencyMs", + "tokensM", "costUSD", 
"successRate", "primary", "color"} + + +def test_system_observability_20_points(auth): + d = client.get("/v1/system/observability", headers=auth).json()["data"] + assert all(len(d[k]) == 20 for k in ("latency", "tokens", "errors", "runs")) + + +def test_system_heatmap_168_cells(auth): + d = client.get("/v1/system/activity-heatmap", headers=auth).json()["data"] + assert len(d["values"]) == 168 and all(0.0 <= v <= 1.0 for v in d["values"]) + + +def test_system_memory_six_layers(auth): + rows = client.get("/v1/system/memory", headers=auth).json()["data"] + assert [r["id"] for r in rows] == ["working", "run", "artifact", "evidence", "kg", "longterm"] + + +def test_mocks_are_deterministic(auth): + a = client.get("/v1/system/models", headers=auth).json()["data"] + b = client.get("/v1/system/models", headers=auth).json()["data"] + assert a == b # no randomness — stable across calls + + +def test_telemetry_no_auth_and_grows(): + a = client.get("/v1/system/telemetry/live").json()["data"] + b = client.get("/v1/system/telemetry/live").json()["data"] + assert b["memoryIndex"] > a["memoryIndex"] + + +# --------------------------------------------------------------------------- # +# Dashboard / Artifacts against a seeded run +# --------------------------------------------------------------------------- # +def test_dashboard_run_shape(auth): + alias = _seed_state() + d = client.get(f"/v1/runs/{alias}", headers=auth).json()["data"] + assert set(d) >= {"id", "label", "phase", "progressPct", "startedAt", "elapsedSec", + "viabilityScore", "status", "activeAgentId", "systemHealth"} + assert d["id"] == alias and d["status"] == "complete" + assert 0 <= d["viabilityScore"] <= 10 + + +def test_run_agents_five_fixed(auth): + alias = _seed_state() + rows = client.get(f"/v1/runs/{alias}/agents", headers=auth).json()["data"] + assert [a["id"] for a in rows] == ["research", "product", "arch", "execution", "present"] + + +def test_run_artifacts_detail(auth): + alias = _seed_state() + rows = 
client.get(f"/v1/runs/{alias}/artifacts", headers=auth).json()["data"] + research = next(a for a in rows if a["id"] == "research-brief") + assert research["status"] == "complete" and research["evidenceCount"] == 2 + assert research["sourceCount"] == 2 + + +def test_run_viability_radar(auth): + alias = _seed_state() + d = client.get(f"/v1/runs/{alias}/viability", headers=auth).json()["data"] + assert len(d["radarAxes"]) == 5 and len(d["scenarios"]) == 3 + assert all(len(s["values"]) == 5 for s in d["scenarios"]) + + +def test_run_debate_sides(auth): + alias = _seed_state() + rows = client.get(f"/v1/runs/{alias}/debate", headers=auth).json()["data"] + assert rows and all(r["side"] in ("Build", "Don't Build") for r in rows) + + +def test_evidence_graph_edges_reference_nodes(auth): + alias = _seed_state() + d = client.get(f"/v1/runs/{alias}/evidence-graph", headers=auth).json()["data"] + ids = {n["id"] for n in d["nodes"]} + assert all(a in ids and b in ids for a, b in d["edges"]) + github = next(n for n in d["nodes"] if n["id"] == "github") + assert github["count"] == 1 # one github evidence in the seed + + +def test_dna_and_timeline(auth): + alias = _seed_state() + dna = client.get(f"/v1/runs/{alias}/dna", headers=auth).json()["data"] + assert sum(1 for n in dna["nodes"] if n["core"]) == 1 + tl = client.get(f"/v1/runs/{alias}/timeline", headers=auth).json()["data"] + assert tl[0]["start"] == 0 and tl[-1]["end"] == 100 + + +def test_artifact_content_markdown(auth): + alias = _seed_state() + d = client.get("/v1/artifacts/research-brief/content", + params={"run": alias}, headers=auth).json()["data"] + assert d["format"] == "markdown" and "#" in d["body"] + + +def test_artifact_evidence_traces(auth): + alias = _seed_state() + rows = client.get("/v1/artifacts/research-brief/evidence-traces", + params={"run": alias}, headers=auth).json()["data"] + assert rows and rows[0]["sources"] + + +def test_unknown_run_404(auth): + r = client.get("/v1/runs/RUN_9999", headers=auth) + 
assert r.status_code == 404 and r.json()["error"]["code"] == "RUN_NOT_FOUND" + + +# --------------------------------------------------------------------------- # +# Run lifecycle (real orchestrator, degrades to stub without keys) + websocket +# --------------------------------------------------------------------------- # +def test_start_run_and_poll(auth): + r = client.post("/v1/runs", json={"prompt": "a habit tracker for couples"}, headers=auth) + assert r.status_code == 201 + alias = r.json()["data"]["runId"] + assert alias.startswith("RUN_") + # dashboard immediately resolvable (running shell or finished) + d = client.get(f"/v1/runs/{alias}", headers=auth) + assert d.status_code == 200 and d.json()["data"]["id"] == alias + + +def test_websocket_run_stream_seed_and_metric(auth, token): + alias = _seed_state() + with client.websocket_connect(f"/v1/ws/runs/{alias}/stream?token={token}") as ws: + # first frame is either a seeded event or the immediate metric_tick + first = ws.receive_json() + assert first["type"] in ("event", "metric_tick") + # drain until we see a metric_tick (seed has 0 events here, so it's immediate) + got_metric = first["type"] == "metric_tick" + for _ in range(3): + if got_metric: + break + msg = ws.receive_json() + got_metric = msg["type"] == "metric_tick" + assert got_metric + + +def test_websocket_rejects_bad_token(): + with client.websocket_connect("/v1/ws/runs/global/stream?token=bogus") as ws: + # server accepts then closes 1008; the close arrives as a WebSocketDisconnect on receive + import starlette.websockets + with pytest.raises(starlette.websockets.WebSocketDisconnect): + ws.receive_json() diff --git a/tests/unit/test_architecture_mermaid.py b/tests/unit/test_architecture_mermaid.py new file mode 100644 index 0000000..f3bd8b5 --- /dev/null +++ b/tests/unit/test_architecture_mermaid.py @@ -0,0 +1,65 @@ +"""T2.2 — TRD → Mermaid architecture diagrams: valid, complete, graceful, deterministic.""" +from __future__ import annotations + 
+from aps.state.models import TRD +from aps.render import architecture_mmd + + +def _trd(): + return TRD( + data_model={ + "entities": { + "User": {"fields": {"id": "uuid", "email": "string"}}, + "Resume": {"fields": {"id": "uuid", "owner_id": "uuid", "score": "float"}}, + }, + "architecture": { + "components": ["API gateway", "App service", "PostgreSQL", "Inference service"], + "services": ["auth", "scoring"], + "data_flow": ["Client → API gateway → App service (authn)", + "App service → Inference service → result persisted"], + }, + }, + api_spec={"openapi": "3.0.3", "paths": {"/resumes": {"get": {"summary": "List"}}}}, + stack=["Backend: FastAPI", "DB: PostgreSQL"], + ) + + +def test_emits_two_mermaid_blocks(): + md = architecture_mmd.render(_trd()) + assert md.count("```mermaid") == 2 + assert "flowchart TD" in md and "erDiagram" in md + + +def test_flowchart_has_components_and_edges(): + md = architecture_mmd.render(_trd()) + assert "API gateway" in md and "Inference service" in md + assert "-->" in md # at least one data-flow edge + + +def test_er_has_entities_fields_and_relationship(): + md = architecture_mmd.render(_trd()) + assert "User {" in md and "Resume {" in md + assert "uuid id" in md + # owner_id foreign key becomes a User--Resume relationship + assert "User ||--o{ Resume" in md + + +def test_node_ids_are_mermaid_safe(): + md = architecture_mmd.render(_trd()) + flow = md.split("flowchart TD", 1)[1].split("```", 1)[0] + for line in flow.splitlines(): + line = line.strip() + if line.startswith(("%", "")) and "[" in line and "-->" not in line: + nid = line.split("[", 1)[0] + assert nid.replace("_", "").isalnum(), f"unsafe node id: {nid!r}" + + +def test_empty_trd_is_graceful(): + md = architecture_mmd.render(TRD()) + assert md and "None" not in md + assert "_— none identified —_" in md + + +def test_deterministic(): + t = _trd() + assert architecture_mmd.render(t) == architecture_mmd.render(t) diff --git a/tests/unit/test_artifact_quality.py 
b/tests/unit/test_artifact_quality.py new file mode 100644 index 0000000..79c5d76 --- /dev/null +++ b/tests/unit/test_artifact_quality.py @@ -0,0 +1,49 @@ +"""Artifact-quality cascade fix: clean labels, domain-noun entities, competitor deny-list.""" +from __future__ import annotations + +from aps.tools.analysis._text import clean_label +from aps.tools.analysis import build_competitor_matrix as cm +from aps.tools.architecture import design_data_model as ddm +from aps.tools.architecture import design_api_contract as dac +from aps.state.models import Evidence, Feature + + +def test_clean_label_strips_boilerplate_and_markdown(): + out = clean_label("Solve: ## Feature Request: Scheduled Auto-Export for integrations…Please descr") + assert out == "Scheduled Auto-Export for integrations" + assert "##" not in out and "solve" not in out.lower() + assert "descr" not in out.lower() # no mid-word fragment leaks + + +def test_clean_label_is_short_and_capitalized(): + out = clean_label("the parser is broken and keeps dropping data and lots more text follows here") + assert 0 < len(out.split()) <= 8 + assert out[0].isupper() + + +def test_competitor_deny_excludes_integrations_and_categories(): + ev = [ + Evidence(source="web", url="https://zapier.com/apps", title="Zapier", snippet="integrates apps"), + Evidence(source="web", url="https://productivity.com/blog", title="p", snippet="productivity tips"), + Evidence(source="web", url="https://api.github.io/x", title="gh", snippet="code sample"), + Evidence(source="web", url="https://habitbox.com", title="Habitbox", + snippet="A habit tracker that supports reminders and shared goals. 
$5/mo."), + ] + names = {c.name.lower() for c in + cm.TOOL.run(evidence=[e.model_dump() for e in ev]).payload} + assert "habitbox" in names # real product kept + assert names.isdisjoint({"zapier", "productivity", "github"}) # noise excluded + + +def test_entities_are_domain_nouns_no_fragments(): + dm = ddm.TOOL.run( + idea="a privacy-first habit tracker for couples", + features=[Feature(title="Scheduled export for integrations", description="x", priority="Should").model_dump(), + Feature(title="Reminder notifications", description="x", priority="Must").model_dump()], + ).payload + names = {n.lower() for n in dm["entities"]} + assert "habit" in names # clean domain noun from the idea + for bad in ("descr", "scheduled", "external", "tool", "integration", "export"): + assert bad not in names + paths = list(dac.TOOL.run(data_model=dm, idea="x").payload["paths"].keys()) + assert not any(p.endswith("ss") for p in paths) # sane pluralization diff --git a/tests/unit/test_artifact_store.py b/tests/unit/test_artifact_store.py new file mode 100644 index 0000000..63fa376 --- /dev/null +++ b/tests/unit/test_artifact_store.py @@ -0,0 +1,43 @@ +"""File artifact store persists a run and serves it read-through (offline, deterministic).""" +from __future__ import annotations + +from aps.infra import artifact_store +from aps.agents.research.stub import stub_research +from aps.agents.product.agent import run_product +from aps.state.models import StudioState, RunStatus, PRD + + +def _state() -> StudioState: + research = stub_research("Build an AI SaaS for resume screening") + prd = run_product(research) + return StudioState(idea=research.idea, status=RunStatus.COMPLETE, + research=research, prd=prd) + + +def test_save_then_load_roundtrip(tmp_path, monkeypatch): + monkeypatch.setenv("APS_ARTIFACT_DIR", str(tmp_path)) + state = _state() + artifact_store.save_run("run_x", state) + + # artifacts written to disk + assert (tmp_path / "run_x" / "prd.json").exists() + assert (tmp_path / 
"run_x" / "meta.json").exists() + assert (tmp_path / "run_x" / "state.json").exists() + + # read-through (simulates a fresh process: only the files exist) + meta = artifact_store.load_meta("run_x") + assert meta["idea"] == state.idea and "prd" in meta["artifacts"] + + prd = artifact_store.load_artifact("run_x", "prd") + assert PRD.model_validate(prd).idea == state.idea + + reloaded = artifact_store.load_state("run_x") + assert reloaded.idea == state.idea and reloaded.prd is not None + assert "run_x" in artifact_store.list_runs() + + +def test_missing_run_returns_none(tmp_path, monkeypatch): + monkeypatch.setenv("APS_ARTIFACT_DIR", str(tmp_path)) + assert artifact_store.load_meta("nope") is None + assert artifact_store.load_artifact("nope", "prd") is None + assert artifact_store.load_state("nope") is None diff --git a/tests/unit/test_availability_agent.py b/tests/unit/test_availability_agent.py new file mode 100644 index 0000000..e3d5d44 --- /dev/null +++ b/tests/unit/test_availability_agent.py @@ -0,0 +1,44 @@ +"""Availability agent pipeline: AvailabilityReport with/without Brand; renders to Markdown.""" +from __future__ import annotations + +from aps.agents.availability.agent import run_availability +from aps.state.models import StudioState, BrandPackage, AvailabilityReport +from aps.infra import http +from aps.render import render_artifact + + +class _Resp: + def __init__(self, code): + self.status_code = code + + +def _stub_rdap(monkeypatch, available_first=True): + # first candidate (.com) available, the rest registered + def fake_get(url, **kw): + return _Resp(404 if url.endswith(".com") else 200) + monkeypatch.setattr(http, "get", fake_get) + + +def test_run_availability_uses_brand_name(monkeypatch): + _stub_rdap(monkeypatch) + state = StudioState(idea="a privacy-first habit tracker", brand=BrandPackage(name="Habitly")) + rep = run_availability(state) + assert isinstance(rep, AvailabilityReport) + assert rep.company_name == "Habitly" + assert 
rep.recommended_domain == "habitly.com" + assert rep.trademarks and rep.summary + + +def test_run_availability_idea_only_derives_name(monkeypatch): + _stub_rdap(monkeypatch) + rep = run_availability(StudioState(idea="a privacy-first habit tracker")) + assert rep.company_name # derived + assert rep.domains and len(rep.domains) >= 3 + + +def test_availability_renders_to_markdown(monkeypatch): + _stub_rdap(monkeypatch) + rep = run_availability(StudioState(idea="a habit tracker", brand=BrandPackage(name="Habitly"))) + md = render_artifact("availability", rep) + assert "# Name Availability" in md and "Domains" in md and "Trademark" in md + assert render_artifact("availability", rep.model_dump()) == md diff --git a/tests/unit/test_availability_graph.py b/tests/unit/test_availability_graph.py new file mode 100644 index 0000000..2ebd9e3 --- /dev/null +++ b/tests/unit/test_availability_graph.py @@ -0,0 +1,55 @@ +"""Availability graph wiring: flag off = unchanged; flag on = parallel branch off product, +no concurrent-write error, existing artifacts still produced.""" +from __future__ import annotations + +from aps.orchestrator import graph as g +from aps.orchestrator.events import EventBus +from aps.state.models import RunStatus +from aps.infra import http + +_ALL = ("research", "prd", "trd", "execution", "pitch", "brand", "legal", "funding", + "availability") + + +class _Resp: + def __init__(self, code): + self.status_code = code + + +def _run(monkeypatch, enabled: bool, run_id: str): + monkeypatch.setattr(g, "USE_STUBS", True) + # keep RDAP lookups hermetic/fast — no real network in the suite + monkeypatch.setattr(http, "get", lambda url, **kw: _Resp(404 if url.endswith(".com") else 200)) + monkeypatch.setenv("APS_ENABLE_TRADEMARK", "true" if enabled else "false") + bus = EventBus() + state = g.run_sync("a privacy-first habit tracker", bus, run_id=run_id) + return state, [e.type for e in bus.history(run_id)] + + +def _names(state) -> set: + return {a for a in _ALL if 
getattr(state, a, None) is not None} + + +def test_flag_off_no_availability(monkeypatch): + state, _ = _run(monkeypatch, enabled=False, run_id="av_off") + assert state.availability is None + assert "availability" not in _names(state) + assert state.prd is not None and state.pitch is not None + + +def test_flag_on_runs_availability_in_parallel(monkeypatch): + state, types = _run(monkeypatch, enabled=True, run_id="av_on") + assert state.status in (RunStatus.COMPLETE, RunStatus.DEGRADED) # no InvalidUpdateError + assert state.availability is not None and state.availability.company_name + assert state.availability.recommended_domain.endswith(".com") + assert {"prd", "trd", "execution", "pitch", "availability"} <= _names(state) + assert "artifact_ready" in types + + +def test_compiled_graph_node_set_reflects_flag(monkeypatch): + monkeypatch.setenv("APS_ENABLE_TRADEMARK", "false") + nodes_off = set(g.build_graph(EventBus(), "n1").get_graph().nodes) + monkeypatch.setenv("APS_ENABLE_TRADEMARK", "true") + nodes_on = set(g.build_graph(EventBus(), "n2").get_graph().nodes) + assert "availability" not in nodes_off + assert "availability" in nodes_on diff --git a/tests/unit/test_availability_tools.py b/tests/unit/test_availability_tools.py new file mode 100644 index 0000000..e21ef5d --- /dev/null +++ b/tests/unit/test_availability_tools.py @@ -0,0 +1,66 @@ +"""Availability tools (Launch Studio Phase 4): RDAP status mapping, trademark links.""" +from __future__ import annotations + +from aps.infra import http +from aps.tools.availability.check_domain_availability import TOOL as DOMAIN +from aps.tools.availability.search_trademark import TOOL as TM + + +class _Resp: + def __init__(self, code): + self.status_code = code + + +def test_registry_exposes_availability_namespace(): + from aps.tools.registry import load_registry + reg = load_registry() + assert len(reg["availability"]) == 2 + assert sum(len(v) for v in reg.values()) == 69 + + +def 
test_domain_status_maps_from_rdap_codes(monkeypatch): + # .com -> 404 (available), .io -> 200 (registered), rest -> 500 (unknown) + codes = {"habitly.com": 404, "habitly.io": 200} + + def fake_get(url, **kw): + domain = url.rsplit("/", 1)[-1] + return _Resp(codes.get(domain, 500)) + + monkeypatch.setattr(http, "get", fake_get) + out = DOMAIN.run(name="Habitly") + by = {d["domain"]: d["status"] for d in out.payload["domains"]} + assert by["habitly.com"] == "available" + assert by["habitly.io"] == "registered" + assert by["habitly.app"] == "unknown" + + +def test_domain_all_unknown_falls_back_to_fixture(monkeypatch): + def boom(url, **kw): + raise RuntimeError("offline") + + monkeypatch.setattr(http, "get", boom) + out = DOMAIN.run(name="Habitly") + assert out.ok # fixture fallback (allow_fixture_fallback) + assert any(d["status"] == "available" for d in out.payload["domains"]) + + +def test_domain_slug_strips_nonalnum(monkeypatch): + seen = [] + monkeypatch.setattr(http, "get", + lambda url, **kw: seen.append(url) or _Resp(404)) + DOMAIN.run(name="Privacy-First Tracker!") + assert any("privacyfirsttracker.com" in u for u in seen) + + +def test_trademark_returns_registry_link_per_jurisdiction(): + india = TM.run(mark="Habitly", jurisdiction="India").payload["trademarks"][0] + assert "ipindia" in india["search_url"].lower() and india["status"] == "check_required" + us = TM.run(mark="Habitly", jurisdiction="Delaware, USA").payload["trademarks"][0] + assert "uspto" in us["search_url"].lower() + eu = TM.run(mark="Habitly", jurisdiction="European Union").payload["trademarks"][0] + assert "euipo" in eu["search_url"].lower() + + +def test_trademark_is_indicative_only(): + tm = TM.run(mark="Habitly", jurisdiction="India").payload["trademarks"][0] + assert "indicative" in tm["note"].lower() diff --git a/tests/unit/test_brand_agent.py b/tests/unit/test_brand_agent.py new file mode 100644 index 0000000..10bebbf --- /dev/null +++ b/tests/unit/test_brand_agent.py @@ -0,0 +1,45 @@ 
+"""Brand agent pipeline: populated BrandPackage with and without a PRD.""" +from __future__ import annotations + +from aps.agents.brand.agent import run_brand +from aps.state.models import StudioState, PRD, Persona, Feature, BrandPackage +from aps.render import render_artifact + + +def test_run_brand_idea_only(): + state = StudioState(idea="a privacy-first habit tracker") + brand = run_brand(state) + assert isinstance(brand, BrandPackage) + assert brand.name + assert brand.logo_svg.startswith(" set: + return {a for a in ("research", "prd", "trd", "execution", "pitch", "brand") + if getattr(state, a, None) is not None} diff --git a/tests/unit/test_brand_tools.py b/tests/unit/test_brand_tools.py new file mode 100644 index 0000000..a8c380d --- /dev/null +++ b/tests/unit/test_brand_tools.py @@ -0,0 +1,88 @@ +"""Brand tools (Launch Studio Phase 1): determinism, valid SVG, clean copy, campaign shape.""" +from __future__ import annotations + +from aps.tools.brand.generate_logo_svg import TOOL as LOGO +from aps.tools.brand.generate_brand_sheet_svg import TOOL as SHEET +from aps.tools.brand.generate_brand_identity import TOOL as IDENTITY +from aps.tools.brand.generate_brand_campaign import TOOL as CAMPAIGN +from aps.tools.brand import _svg + + +def test_registry_exposes_brand_namespace(): + from aps.tools.registry import load_registry + reg = load_registry() + assert len(reg["brand"]) == 4 + assert sum(len(v) for v in reg.values()) == 69 + + +def test_logo_is_valid_svg_and_deterministic(): + a = LOGO.run(name="FinPilot", tagline="Ship faster.") + b = LOGO.run(name="FinPilot", tagline="Ship faster.") + assert a.ok and a.payload == b.payload # same input → identical SVG + assert a.payload.startswith("" in a.payload + assert "FinPilot" in a.payload + + +def test_logo_mark_only_omits_wordmark_box(): + mark = LOGO.run(name="FinPilot", lockup=False).payload + assert mark.startswith("= 2 and all(e.url.startswith("http") for e in out.evidence) + + +def 
test_guidance_fixture_fallback_offline(monkeypatch): + def boom(url, **kw): + raise RuntimeError("offline") + monkeypatch.setattr(http, "get", boom) + out = GUIDANCE.run(regimes=["DPDP Act (India)"]) + assert out.ok # fixture fallback (still labelled links) + assert out.evidence diff --git a/tests/unit/test_data_model_entities.py b/tests/unit/test_data_model_entities.py new file mode 100644 index 0000000..492322a --- /dev/null +++ b/tests/unit/test_data_model_entities.py @@ -0,0 +1,68 @@ +"""Adversarial hardening: the data model must mint DOMAIN-NOUN entities, never adjectives, +adverbs, or pure-verb gerunds. + +Before this, ideas like "a privacy-first PERSONAL finance tracker", "REALTIME MULTIPLAYER chess", +or "platform for MANAGING social media posts" produced database entities named `Personal`, +`Realtime`, `Multiplayer`, `Managing`, `Decentralized`, `Quickly`, `Damn` — which then become +OpenAPI schemas and API paths. These pin the denylist + morphological (-ly / -ized) guards. +""" +from __future__ import annotations + +from aps.tools.architecture.design_data_model import TOOL, _candidate_nouns + + +def _entities(idea: str) -> set[str]: + return set(TOOL.run(idea=idea).payload["entities"].keys()) + + +# adjectives / adverbs / pure gerunds that previously leaked, mapped to the head noun that should win +_LEAK_CASES = [ + ("a privacy-first personal finance tracker for couples", {"Personal"}, {"Finance", "Tracker"}), + ("realtime multiplayer chess with ELO ranking", {"Realtime", "Multiplayer"}, {"Chess", "Ranking"}), + ("the best damn app to quickly delete annoying spam emails", {"Damn", "Quickly"}, {"Email"}), + ("platform for managing scheduled social media posts", {"Managing", "Social"}, {"Media", "Post"}), + ("blockchain-based decentralized voting system", {"Decentralized"}, {"Voting", "System"}), + ("app for optimizing personalized workout plans", {"Optimizing", "Personalized"}, {"Workout", "Plan"}), +] + + +def 
test_modifiers_never_become_entities_but_head_nouns_do(): + for idea, forbidden, expected in _LEAK_CASES: + ents = _entities(idea) + assert not (ents & forbidden), f"{idea!r} leaked {ents & forbidden}" + assert expected <= ents, f"{idea!r} lost head nouns {expected - ents}" + + +def test_nominal_ing_and_ly_nouns_are_preserved(): + # -ing words that are genuine entities (not pure-verb gerunds) survive + assert {"Planning", "Screening", "Ranking"} <= _entities( + "resume screening with candidate ranking and sprint planning") + # -ly words that are real nouns survive the adverb rule + ents = _entities("family meal supply tracker") + assert "Family" in ents and "Supply" in ents + + +def test_candidate_nouns_drops_adverbs_and_participles(): + toks = _candidate_nouns("quickly decentralized personalized optimizing finance tracker") + assert "quickly" not in toks and "decentralized" not in toks + assert "personalized" not in toks and "optimizing" not in toks + assert "finance" in toks and "tracker" in toks + + +def test_user_entity_always_present_and_model_non_trivial(): + ents = _entities("app") # degenerate idea → still a usable model + assert "User" in ents and len(ents) >= 2 + + +def test_continuation_conjunctions_never_become_entities(): + # the /howevers bug: a fragment leading with "However/Therefore/Meanwhile" must not mint an + # entity (which would become a /howevers OpenAPI path). Head nouns still survive. 
+ from aps.state.models import Feature + for lead in ("However", "Therefore", "Meanwhile", "Moreover", "Furthermore"): + ents = {e.lower() for e in TOOL.run( + idea=f"{lead} the activity tracker leaks user data", + features=[Feature(title=f"{lead} about a week the sync failed", + description="x", priority="Should").model_dump()], + ).payload["entities"]} + assert lead.lower() not in ents, f"{lead!r} leaked as an entity: {ents}" + assert "tracker" in ents or "activity" in ents diff --git a/tests/unit/test_debate.py b/tests/unit/test_debate.py new file mode 100644 index 0000000..14944a0 --- /dev/null +++ b/tests/unit/test_debate.py @@ -0,0 +1,70 @@ +"""T2.3 — Autonomous Debate: grounded risk flags, build case, verdict logic, determinism.""" +from __future__ import annotations + +from aps.state.models import ResearchReturn, Competitor, PainPoint, Evidence, Severity, PRD, Feature +from aps.debate import run_risk, run_debate, RiskAssessment, Debate +from aps.render import debate_md + + +def _strong(): + return ResearchReturn( + idea="A B2B SaaS for resume screening", + market_size="TAM ~$3B (cited at https://x.com/r)", + competitors=[Competitor(name="Acme", url="https://acme.io", pricing="$49/mo", + features=["pdf export"])], + pain_points=[PainPoint(text="parser drops PDFs", severity=Severity.HIGH)], + evidence=[Evidence(source=s, url=f"https://{s}/1", title="t", snippet="s") + for s in ("github", "reddit", "hackernews", "stackexchange")], + ) + + +def _weak(): + return ResearchReturn( + idea="A realtime ML video platform", + market_size="", + competitors=[Competitor(name=f"C{i}", features=["a", "b"]) for i in range(5)], + pain_points=[PainPoint(text="minor annoyance", severity=Severity.LOW)], + evidence=[], + degraded=True, + ) + + +def test_risk_flags_are_grounded_and_scored(): + ra = run_risk(_weak()) + assert isinstance(ra, RiskAssessment) + cats = {f.category for f in ra.flags} + assert {"Competition", "Monetization"} <= cats # 5 comps, no pricing + assert 
any(f.category == "Evidence" and f.severity == "high" for f in ra.flags) # degraded + assert ra.risk_score > run_risk(_strong()).risk_score + + +def test_strong_idea_builds_weak_idea_does_not(): + strong = run_debate(_strong()) + weak = run_debate(_weak()) + assert isinstance(strong, Debate) + assert strong.verdict == "Build" + assert weak.verdict == "Don't build (yet)" + assert strong.startup_score > weak.startup_score + assert strong.risk_score < weak.risk_score + + +def test_build_case_cites_real_positives(): + d = run_debate(_strong()) + joined = " ".join(d.build_case).lower() + assert "pain" in joined and "evidence" in joined + assert 0.0 <= d.confidence <= 1.0 + + +def test_technical_risk_flag_from_complex_idea(): + ra = run_risk(_weak(), prd=PRD(idea="x", features=[Feature(title="realtime ml scoring", description="d")])) + assert any(f.category == "Technical" for f in ra.flags) + + +def test_deterministic(): + r = _strong() + assert run_debate(r).model_dump() == run_debate(r).model_dump() + + +def test_debate_md_has_both_sides_and_verdict(): + md = debate_md.render(run_debate(_strong())) + assert "Verdict:" in md and "case FOR" in md and "case AGAINST" in md diff --git a/tests/unit/test_diversification.py b/tests/unit/test_diversification.py new file mode 100644 index 0000000..d2e6b9b --- /dev/null +++ b/tests/unit/test_diversification.py @@ -0,0 +1,83 @@ +"""Parallel diversification (multipleAPIplan P10) — fan-out units spread across providers.""" +from __future__ import annotations + +import pytest + +from aps.agents.research.supervisor import unit_providers +from aps.config.failover import build_failover_model, FailoverChatModel +from aps.config.settings import get_chat_model + +_CHAIN_KEYS = ("APS_PROVIDER_CHAIN", "GROQ_API_KEY", "CEREBRAS_API_KEY", + "GEMINI_API_KEY", "NVIDIA_API_KEY") + + +@pytest.fixture(autouse=True) +def _clean(monkeypatch): + for v in _CHAIN_KEYS: + monkeypatch.delenv(v, raising=False) + + +# ── unit_providers (the round-robin 
assignment) ─────────────────────────────── +def test_no_diversification_without_chain(): + assert unit_providers(3) == [None, None, None] + + +def test_no_diversification_with_single_provider(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq,gemini") + monkeypatch.setenv("GROQ_API_KEY", "k") # only groq available → 1-provider pool + assert unit_providers(3) == [None, None, None] + + +def test_three_units_get_three_distinct_providers(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq,cerebras,gemini") + for k in ("GROQ_API_KEY", "CEREBRAS_API_KEY", "GEMINI_API_KEY"): + monkeypatch.setenv(k, "k") + assigned = unit_providers(3) + # router may reorder by fit, but all three are distinct → 3 quotas in parallel + assert len(set(assigned)) == 3 + assert set(assigned) == {"groq", "cerebras", "gemini"} + + +def test_diversify_off_makes_all_units_use_chain_head(monkeypatch): + # APS_RESEARCH_DIVERSIFY=false → every unit uses the default chain head (e.g. paid OpenAI) + + # failover, instead of spreading across (possibly exhausted) free providers. 
+ monkeypatch.setenv("APS_PROVIDER_CHAIN", "openai,nim,gemini") + for k in ("OPENAI_API_KEY", "NVIDIA_API_KEY", "GEMINI_API_KEY"): + monkeypatch.setenv(k, "k") + monkeypatch.setenv("APS_RESEARCH_DIVERSIFY", "false") + assert unit_providers(3) == [None, None, None] + monkeypatch.setenv("APS_RESEARCH_DIVERSIFY", "true") # default behavior still diversifies + assert len(set(unit_providers(3))) == 3 + + +def test_more_units_than_providers_round_robin(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq,gemini") + monkeypatch.setenv("GROQ_API_KEY", "k") + monkeypatch.setenv("GEMINI_API_KEY", "k") + assigned = unit_providers(5) + assert set(assigned) == {"groq", "gemini"} # round-robin over the routed 2-provider pool + assert assigned[0] != assigned[1] and assigned[0] == assigned[2] == assigned[4] + + +# ── prefer (the per-unit head-of-chain) ─────────────────────────────────────── +def test_prefer_moves_provider_to_head(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq,gemini,cerebras") + for k in ("GROQ_API_KEY", "GEMINI_API_KEY", "CEREBRAS_API_KEY"): + monkeypatch.setenv(k, "k") + m = build_failover_model(prefer="gemini") + assert m.providers == ["gemini", "groq", "cerebras"] # preferred first, rest as backup + + +def test_prefer_not_in_chain_is_ignored(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq,gemini") + monkeypatch.setenv("GROQ_API_KEY", "k") + monkeypatch.setenv("GEMINI_API_KEY", "k") + assert build_failover_model(prefer="nim").providers == ["groq", "gemini"] + + +def test_get_chat_model_prefer_threads_through(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq,gemini") + monkeypatch.setenv("GROQ_API_KEY", "k") + monkeypatch.setenv("GEMINI_API_KEY", "k") + m = get_chat_model(prefer="gemini") + assert isinstance(m, FailoverChatModel) and m.providers[0] == "gemini" diff --git a/tests/unit/test_evidence_relevance.py b/tests/unit/test_evidence_relevance.py new file mode 100644 index 0000000..700bc9f --- /dev/null 
+++ b/tests/unit/test_evidence_relevance.py @@ -0,0 +1,99 @@ +"""Research relevance gate — score evidence against the idea and keep pains on-topic. + +The defect this guards: an off-topic-but-syntactically-valid complaint ("YouTube AdBlock is +missing" for a "Private Activity Tracker") passes the noise filter and seeds a bogus pain/feature. +The deterministic lexical scorer + the `_compress` pain gate must drop it, while keeping genuinely +on-topic evidence — and never silently emitting zero pains. +""" +from __future__ import annotations + +from aps.tools.analysis.score_evidence_relevance import idea_profile, relevance_score, TOOL +from aps.agents.research.agent import _compress +from aps.config.settings import get_settings +from aps.state.models import Evidence + +IDEA = "Private Activity Tracker" + + +def _ev(title, snippet, source="web", url="https://x/1"): + return Evidence(source=source, url=url, title=title, snippet=snippet) + + +def test_on_topic_scores_high_off_topic_scores_zero(): + prof = idea_profile(IDEA) + on = _ev("Activity trackers", "this activity tracker leaks location data to advertisers") + off = _ev("YouTube AdBlock", "the adblock popup is missing in the new youtube ui") + assert relevance_score(prof, on) >= 0.3 + assert relevance_score(prof, off) == 0.0 + + +def test_morphology_match_catches_inflections(): + # private~privacy, tracker~tracking — a singular-stem intersection would miss these + prof = idea_profile(IDEA) + morph = _ev("Privacy-first tracking", "a private activity tracking app that respects users") + assert relevance_score(prof, morph) >= 0.5 + + +def test_off_domain_junk_is_rejected(): + # off-domain spam that shares one incidental word is hard-rejected by the junk lexicon + prof = idea_profile(IDEA) + assert relevance_score(prof, _ev("Stake bonus", "Stake bonus cannot be reached")) == 0.0 + assert relevance_score(prof, _ev("Sales role", "High-ticket financial sales specialist hiring now")) == 0.0 + + +def 
test_degenerate_idea_does_not_gate_everything(): + # an all-stopword idea has no profile → never zero out evidence (returns 1.0) + prof = idea_profile("the a an of to") + assert prof == set() + assert relevance_score(prof, _ev("x", "anything at all")) == 1.0 + + +def test_tool_tags_and_optionally_filters(): + rows = [_ev("Activity trackers", "activity tracker privacy leak"), + _ev("YouTube AdBlock", "adblock popup missing youtube")] + out = TOOL.run(idea=IDEA, evidence=[r.model_dump() for r in rows], min_score=0.15).evidence + # min_score drops the off-topic item; the kept one carries a populated relevance score + assert len(out) == 1 and out[0].title == "Activity trackers" + assert out[0].relevance and out[0].relevance > 0.15 + + +def test_compress_gates_off_topic_pain_but_keeps_on_topic(): + s = get_settings() + assert s.enable_relevance_gate # default on + evidence = [ + _ev("Activity tracker rant", "the activity tracker is slow and keeps crashing on every sync", + source="reddit", url="https://r/1"), + _ev("YouTube AdBlock", "youtube adblock is broken and missing in the new ui", + source="github", url="https://g/1"), + ] + res = _compress(IDEA, evidence) + pain_text = " ".join(p.text.lower() for p in res.pain_points) + assert "youtube" not in pain_text and "adblock" not in pain_text # off-topic pain gated out + assert res.pain_points, "the on-topic complaint should still yield a pain" + assert 0.0 <= res.evidence_relevance <= 1.0 + # every evidence item got scored + assert all(e.relevance is not None for e in res.evidence) + + +def test_compress_degrades_when_nothing_relevant(): + # all evidence off-topic for the idea → floor guard keeps top-K but marks the brief degraded + evidence = [ + _ev("YouTube AdBlock", "youtube adblock popup is missing", source="github", url="https://g/2"), + _ev("Gmail addon", "the gmail addon keeps crashing on send", source="web", url="https://w/2"), + ] + res = _compress(IDEA, evidence) + assert res.degraded is True and 
res.degrade_reason == "low_relevance" + + +def test_flag_off_disables_gate(monkeypatch): + get_settings.cache_clear() + monkeypatch.setenv("APS_ENABLE_RELEVANCE_GATE", "false") + try: + evidence = [_ev("YouTube AdBlock", "youtube adblock is broken and missing", + source="github", url="https://g/3")] + res = _compress(IDEA, evidence) + # gate off ⇒ the off-topic complaint is NOT filtered; relevance stays unscored + assert res.degraded is False + assert all(e.relevance is None for e in res.evidence) + finally: + get_settings.cache_clear() diff --git a/tests/unit/test_explain.py b/tests/unit/test_explain.py new file mode 100644 index 0000000..9c0aa31 --- /dev/null +++ b/tests/unit/test_explain.py @@ -0,0 +1,70 @@ +"""T2.5 — Explain-Why: every feature traced to its pain/competitor/evidence + confidence.""" +from __future__ import annotations + +from aps.state.models import PRD, ResearchReturn, Competitor, Evidence, Feature +from aps.explain import explain_prd, Explanation +from aps.render import explain_md + + +def _setup(): + ev = [Evidence(source="github", url="https://github.com/x/1", title="parser bug", + snippet="the resume parser drops valid pdf files"), + Evidence(source="reddit", url="https://reddit.com/r/2", title="ranking", + snippet="candidate ranking quality is poor")] + prd = PRD( + idea="AI resume screening", + features=[Feature(title="Solve: parser drops PDFs", description="reliable pdf parsing", priority="Must"), + Feature(title="Table stakes: ranking", description="rank candidates", priority="Should"), + Feature(title="Differentiator: analytics", description="dashboards", priority="Could")], + sources=ev, + ) + research = ResearchReturn( + idea="AI resume screening", evidence=ev, + competitors=[Competitor(name="Acme", features=["ranking", "analytics"])], + ) + return prd, research + + +def test_explains_every_feature(): + prd, research = _setup() + x = explain_prd(prd, research) + assert isinstance(x, Explanation) + assert len(x.features) == 3 + assert 
0.0 <= x.overall_confidence <= 1.0 + for fe in x.features: + assert fe.why and 0.0 <= fe.confidence <= 1.0 + + +def test_pain_feature_cites_matching_evidence(): + prd, research = _setup() + pdf = next(f for f in explain_prd(prd, research).features if "parser" in f.feature_title.lower()) + assert "pain" in pdf.why.lower() + assert any("github.com/x/1" in e.url for e in pdf.evidence) # matched the parser source + + +def test_competitor_feature_names_its_inspiration(): + prd, research = _setup() + feats = {f.feature_title: f for f in explain_prd(prd, research).features} + assert feats["Table stakes: ranking"].inspired_by == "Acme" + assert feats["Differentiator: analytics"].inspired_by == "Acme" + + +def test_confidence_rewards_evidence_and_must_priority(): + prd, research = _setup() + x = explain_prd(prd, research) + must = next(f for f in x.features if f.priority == "Must") + could = next(f for f in x.features if f.priority == "Could") + assert must.confidence >= could.confidence + + +def test_works_without_research_using_prd_sources(): + prd, _ = _setup() + x = explain_prd(prd) # no research -> falls back to prd.sources + assert len(x.features) == 3 + + +def test_deterministic_and_renders(): + prd, research = _setup() + assert explain_prd(prd, research).model_dump() == explain_prd(prd, research).model_dump() + md = explain_md.render(explain_prd(prd, research)) + assert "Explain-Why" in md and "confidence" in md.lower() and "Acme" in md diff --git a/tests/unit/test_failover.py b/tests/unit/test_failover.py new file mode 100644 index 0000000..45c7a06 --- /dev/null +++ b/tests/unit/test_failover.py @@ -0,0 +1,152 @@ +"""FailoverChatModel (multipleAPIplan P2) — try → next on retryable errors, offline + mocked.""" +from __future__ import annotations + +import pytest + +from aps.config.failover import FailoverChatModel, _is_retryable, build_failover_model + + +# ── fake provider runtimes ──────────────────────────────────────────────────── +class _FakeModel: + def 
__init__(self, result=None, raises=None): + self._result = result + self._raises = raises + self.bound = None + + def bind_tools(self, tools, **kwargs): + self.bound = tools + return self + + def invoke(self, messages, **kwargs): + if self._raises is not None: + raise self._raises + return self._result + + +class _FakeRuntime: + def __init__(self, name, model): + self.name = name + self._model = model + + def chat_model(self): + return self._model + + +@pytest.fixture(autouse=True) +def _no_throttle(monkeypatch): + # keep tests instant + deterministic (don't exercise the real rate limiter here) + import aps.infra.llm as llm + monkeypatch.setattr(llm, "acquire_llm", lambda *a, **k: 0.0) + # reset the global circuit breaker so chain order isn't reordered by prior tests' trips + from aps.config import quota + quota.BREAKER.reset() + yield + quota.BREAKER.reset() + + +def _fail(msg): + return RuntimeError(msg) + + +# ── retryability classification ─────────────────────────────────────────────── +def test_is_retryable_classifies(): + assert _is_retryable(_fail("HTTP 429 rate limit exceeded")) + assert _is_retryable(_fail("503 Service Unavailable")) + assert _is_retryable(_fail("Connection timed out")) + assert _is_retryable(_fail("401 Unauthorized")) + assert _is_retryable(ImportError("no langchain_anthropic")) + assert not _is_retryable(ValueError("malformed tool schema")) # real bug → don't mask + + +# ── failover behavior ───────────────────────────────────────────────────────── +def test_fails_over_to_next_on_retryable(): + a = _FakeRuntime("groq", _FakeModel(raises=_fail("429 rate limit"))) + b = _FakeRuntime("gemini", _FakeModel(result="OK")) + m = FailoverChatModel([a, b]) + assert m.invoke(["hi"]) == "OK" + assert m.last_provider == "gemini" + + +def test_non_retryable_raises_immediately_no_failover(): + a = _FakeRuntime("groq", _FakeModel(raises=ValueError("bad prompt"))) + b = _FakeRuntime("gemini", _FakeModel(result="OK")) + m = FailoverChatModel([a, b]) + 
with pytest.raises(ValueError): + m.invoke(["hi"]) + assert m.last_provider is None # never reached provider b + + +def test_all_retryable_fail_raises_last(): + a = _FakeRuntime("groq", _FakeModel(raises=_fail("429"))) + b = _FakeRuntime("gemini", _FakeModel(raises=_fail("503"))) + m = FailoverChatModel([a, b]) + with pytest.raises(RuntimeError, match="503"): + m.invoke(["hi"]) + + +def test_bind_tools_propagates_to_the_chosen_provider(): + a = _FakeRuntime("groq", _FakeModel(raises=_fail("timeout"))) + okmodel = _FakeModel(result="OK") + b = _FakeRuntime("gemini", okmodel) + m = FailoverChatModel([a, b]).bind_tools(["TOOL_A", "TOOL_B"]) + assert m.invoke(["hi"]) == "OK" + assert okmodel.bound == ["TOOL_A", "TOOL_B"] # tools bound on the provider that answered + + +def test_providers_property(): + m = FailoverChatModel([_FakeRuntime("groq", _FakeModel()), _FakeRuntime("nim", _FakeModel())]) + assert m.providers == ["groq", "nim"] + + +# ── build_failover_model + wiring ───────────────────────────────────────────── +def test_build_failover_model_from_chain(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq,gemini") + monkeypatch.setenv("GROQ_API_KEY", "k") + monkeypatch.setenv("GEMINI_API_KEY", "k") + m = build_failover_model(0.2) + assert isinstance(m, FailoverChatModel) + assert m.providers == ["groq", "gemini"] # built lazily — no network + + +def test_build_failover_model_empty_chain_raises(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq") # no key → not available + monkeypatch.delenv("GROQ_API_KEY", raising=False) + with pytest.raises(RuntimeError, match="No LLM provider"): + build_failover_model() + + +def test_get_chat_model_returns_failover_when_chain_set(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq") + monkeypatch.setenv("GROQ_API_KEY", "k") + from aps.config.settings import get_chat_model + assert isinstance(get_chat_model(), FailoverChatModel) + + +def test_has_llm_key_uses_chain_when_set(monkeypatch): + from 
aps.infra.llm import has_llm_key + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq,cerebras") + monkeypatch.delenv("GROQ_API_KEY", raising=False) + monkeypatch.delenv("CEREBRAS_API_KEY", raising=False) + assert has_llm_key() is False + monkeypatch.setenv("CEREBRAS_API_KEY", "k") + assert has_llm_key() is True + + +def test_ui_pin_routes_through_failover_not_a_hard_lock(monkeypatch): + """A per-run/UI provider pin becomes the PREFERRED chain head but STILL fails over — it must + not return a single-provider model that dies when that provider is exhausted (the demo bug).""" + from aps.config import settings + monkeypatch.setenv("APS_PROVIDER_CHAIN", "openai,nim,gemini") + for k in ("OPENAI_API_KEY", "NVIDIA_API_KEY", "GEMINI_API_KEY"): + monkeypatch.setenv(k, "k") + settings.get_settings.cache_clear() + tok = settings.set_run_model("gemini", "gemini-2.5-flash") # user picks the exhausted provider + try: + m = settings.get_chat_model() + assert isinstance(m, FailoverChatModel) # failover, NOT a single gemini model + order = [rt.name for rt in m._runtimes] + assert order[0] == "gemini" # pin honored as the PREFERRED head + assert set(order) == {"openai", "nim", "gemini"} # …but the rest stay as failover + finally: + settings.reset_run_model(tok) + settings.get_settings.cache_clear() diff --git a/tests/unit/test_feature_naming.py b/tests/unit/test_feature_naming.py new file mode 100644 index 0000000..3815b6d --- /dev/null +++ b/tests/unit/test_feature_naming.py @@ -0,0 +1,135 @@ +"""Feature-title quality: pain phrasing → noun phrase; labels are properly cased.""" +from __future__ import annotations + +from aps.tools.analysis._text import pain_to_feature_title +from aps.state.models import PainPoint, Competitor, Severity +from aps.tools.product.prioritize_features import TOOL as prioritize + + +def test_complaint_framing_stripped(): + cases = [ + ("The resume parser is broken and keeps dropping valid PDFs", "resume parser"), + ("Candidate ranking is slow and confusing", 
"candidate ranking"), + ("Integration with ATS platforms doesn't work", "integration with ats"), + ("parser drops PDFs", "parser"), + ] + for raw, expected_substr in cases: + out = pain_to_feature_title(raw).lower() + assert expected_substr in out, f"{raw!r} → {out!r}" + + +def test_no_complaint_words_in_title(): + complaint_words = {"broken", "slow", "confusing", "painful", "doesn't", "can't", + "drops", "crashes", "fails", "frustrating", "annoying"} + for raw in [ + "The parser is broken", + "Auth is slow and painful", + "Export fails to handle large files", + "The dashboard crashes on load", + ]: + title = pain_to_feature_title(raw).lower() + assert not any(b in title for b in complaint_words), ( + f"complaint word in {title!r} (from {raw!r})" + ) + + +def test_leading_article_stripped(): + assert not pain_to_feature_title("The resume parser is broken").lower().startswith("the ") + assert not pain_to_feature_title("A candidate ranking is slow").lower().startswith("a ") + assert not pain_to_feature_title("An integration with ATS doesn't work").lower().startswith("an ") + + +def test_fallback_when_only_complaint(): + # pure complaint with no subject noun → should still return a non-empty string + result = pain_to_feature_title("Is broken") + assert isinstance(result, str) and len(result) > 0 + + +# ── adversarial hardening: pronoun-subject complaints must NOT become a feature titled "It"/"I", +# demand pains name the WANTED capability, and shouts/fragments get a clean theme. 
────────── +def test_pronoun_subject_never_becomes_the_title(): + for raw in ["It is unusable", "I can't find a good app", "it is unusable and i hate it", + "Is broken", "this is slow"]: + title = pain_to_feature_title(raw) + toks = title.lower().split() + # never a bare pronoun / stopword, and never starts with a complaint/aux verb + assert title.lower() not in {"it", "i", "this", "that", "is", "are"} + assert toks and toks[0] not in {"it", "i", "is", "are", "cant", "cannot", "doesnt"} + assert len(title) >= 3 + + +def test_demand_pain_extracts_the_wanted_capability(): + assert "bulk delete" in pain_to_feature_title("no way to bulk delete").lower() + assert "habit tracker" in pain_to_feature_title("looking for a privacy-first habit tracker").lower() + + +def test_subjectless_complaint_maps_to_a_theme(): + assert pain_to_feature_title("It is unusable") == "Reliability & stability" + assert pain_to_feature_title("THIS APP IS USELESS") == "Reliability & stability" + + +def test_no_dangling_trailing_preposition(): + title = pain_to_feature_title("can't export my data to csv") + assert not title.lower().rstrip().endswith((" to", " with", " for", " of", " and", " my")) + + +def test_clean_noun_phrases_are_preserved(): + # the good path must be untouched by the new guards + assert pain_to_feature_title("The resume parser is broken").lower() == "resume parser" + assert pain_to_feature_title("Candidate ranking is slow").lower() == "candidate ranking" + assert "integration with ats" in pain_to_feature_title("Integration with ATS doesn't work").lower() + + +# ── fragment hardening: orphaned conjunctions, subordinate-clause leads, relative clauses, +# and stray brackets must not survive into a feature title (the "However about a week", +# "When following a Google", "API that gives me", "Maintainer]" class of garbage). 
───────── +def test_orphaned_leading_conjunction_is_dropped(): + for raw in ["However about a week the sync kept failing", + "Therefore the dashboard never loaded", + "Moreover the export was incomplete"]: + title = pain_to_feature_title(raw).lower() + assert not title.startswith(("however", "therefore", "moreover", "and ", "but ")), title + + +def test_subordinate_clause_lead_skipped_for_real_subject(): + # the clause split orphaned "When following a Google…"; the real content follows the ellipsis + title = pain_to_feature_title("When following a Google… Ads setup is terrible and slow").lower() + assert not title.startswith(("when ", "where ", "while ", "if ")), title + assert "ads" in title or "setup" in title, title + + +def test_relative_clause_reduced_to_head_noun(): + # "API that gives me the wrong totals" → the feature is the head noun phrase, not the clause + title = pain_to_feature_title("API that gives me the wrong totals").lower() + assert "that" not in title.split() and "which" not in title.split(), title + assert "api" in title, title + + +def test_stray_bracket_is_stripped(): + assert pain_to_feature_title("Maintainer]") == "Maintainer" + assert "]" not in pain_to_feature_title("Export]") and "[" not in pain_to_feature_title("[Export") + + +def test_table_stakes_are_title_cased(): + pains = [PainPoint(text="slow export", severity=Severity.HIGH)] + comps = [ + Competitor(name="A", features=["pdf export support", "slack sync"]), + Competitor(name="B", features=["pdf export support", "analytics dashboard"]), + ] + feats = prioritize.run(pain_points=pains, competitors=comps).payload + ts = [f for f in feats if f.title.startswith("Table stakes:")] + assert ts, "expected at least one table-stakes feature" + label = ts[0].title.split(":", 1)[1].strip() + assert label[0].isupper(), f"table-stakes label should be title-cased, got: {label!r}" + assert label != label.lower(), f"label should not be all-lowercase: {label!r}" + + +def 
test_differentiator_are_title_cased(): + # one pain, one competitor with one feature → promotes as Differentiator + pains = [PainPoint(text="slow export", severity=Severity.HIGH)] + comps = [Competitor(name="A", features=["analytics dashboard"])] + feats = prioritize.run(pain_points=pains, competitors=comps, min_features=2).payload + diff = [f for f in feats if f.title.startswith("Differentiator:")] + if diff: + label = diff[0].title.split(":", 1)[1].strip() + assert label[0].isupper(), f"differentiator label should be title-cased, got: {label!r}" diff --git a/tests/unit/test_feature_synthesis.py b/tests/unit/test_feature_synthesis.py new file mode 100644 index 0000000..45980e7 --- /dev/null +++ b/tests/unit/test_feature_synthesis.py @@ -0,0 +1,58 @@ +"""Phase 4b — feature SYNTHESIS: cluster pains into themed features, don't paste one-per-pain. + +Pins the new behavior of `prioritize_features`: overlapping pains collapse into a single themed +feature (priority = max severity, grounding aggregated), while genuinely distinct pains stay +separate so the W3 feature floor still holds (see also test_thin_prd.py). 
+""" +from __future__ import annotations + +from aps.state.models import PainPoint, Severity +from aps.tools.product.prioritize_features import synthesize_pain_features, TOOL + + +def _titles(feats): + return [f.title for f in feats] + + +def test_overlapping_pains_collapse_into_one_theme(): + pains = [PainPoint(text="export is slow", severity=Severity.LOW), + PainPoint(text="can't export quickly to csv", severity=Severity.HIGH)] + feats = synthesize_pain_features(pains) + assert len(feats) == 1, _titles(feats) + assert feats[0].title == "Export" # the general label wins + assert feats[0].priority == "Must" # MAX severity across the cluster + assert "2 related user pains" in feats[0].description # grounded in both + + +def test_distinct_pains_stay_distinct(): + pains = [PainPoint(text="the parser drops PDFs", severity=Severity.HIGH), + PainPoint(text="ranking is slow and confusing", severity=Severity.MED), + PainPoint(text="no way to self-host the data", severity=Severity.MED)] + feats = synthesize_pain_features(pains) + assert len(feats) == 3, _titles(feats) + + +def test_plural_and_inflection_variants_merge(): + pains = [PainPoint(text="the export is broken"), PainPoint(text="exports keep failing")] + feats = synthesize_pain_features(pains) + assert len(feats) == 1 and feats[0].title.lower().startswith("export") + + +def test_single_pain_keeps_the_original_description_format(): + feats = synthesize_pain_features([PainPoint(text="parser drops PDFs", severity=Severity.HIGH)]) + assert len(feats) == 1 + assert feats[0].description == "Addresses the user pain: 'parser drops PDFs'." 
+ + +def test_floor_still_holds_through_the_tool(): + # three degenerate-but-distinct pains, no competitors → three features (W3 floor preserved) + pains = [PainPoint(text=f"pain {i}", severity=Severity.HIGH).model_dump() for i in range(3)] + feats = TOOL.run(pain_points=pains, competitors=[]).payload + assert len(feats) >= 3 + + +def test_synthesis_titles_are_clean_noun_phrases_not_complaints(): + # the synthesized label is a capability noun phrase, never a complaint sentence/fragment + feats = synthesize_pain_features([PainPoint(text="However the dashboard keeps crashing badly")]) + assert feats and not feats[0].title.lower().startswith(("however", "the ")) + assert not any(w in feats[0].title.lower() for w in ("crashing", "badly", "keeps")) diff --git a/tests/unit/test_firebase_auth.py b/tests/unit/test_firebase_auth.py new file mode 100644 index 0000000..cfb4c2c --- /dev/null +++ b/tests/unit/test_firebase_auth.py @@ -0,0 +1,64 @@ +"""The /v1 API accepts Firebase ID tokens (Google/GitHub/email login via the frontend's Firebase +SDK) in addition to the built-in demo JWT — the 'proper fix' for the frontend↔backend token +mismatch that made the Start button 401. Network verification is gated on APS_FIREBASE_PROJECT_ID +(off by default → suite stays hermetic) and mocked here for the accept path. +""" +from __future__ import annotations + +from fastapi.testclient import TestClient + +from aps.api.main import app +from aps.api.v1 import firebase_auth, auth as auth_mod + +client = TestClient(app) + + +def _demo_token() -> str: + return client.post("/v1/auth/login", + json={"email": "operator@aps.io", "password": "demo1234"}).json()["data"]["token"] + + +def test_firebase_disabled_by_default_is_hermetic(monkeypatch): + # No APS_FIREBASE_PROJECT_ID → verify() returns None immediately (no network, no google-auth). 
+ monkeypatch.delenv("APS_FIREBASE_PROJECT_ID", raising=False) + assert firebase_auth.configured() is False + assert firebase_auth.verify("a.b.c") is None + + +def test_non_firebase_token_returns_none_even_when_configured(monkeypatch): + monkeypatch.setenv("APS_FIREBASE_PROJECT_ID", "demo-proj") + # the demo HMAC JWT is not a Firebase token → google verify raises/returns None → None + assert firebase_auth.verify(_demo_token()) is None + assert firebase_auth.verify("not-a-jwt") is None + + +def test_demo_jwt_still_authenticates(): + r = client.get("/v1/system/status", headers={"Authorization": f"Bearer {_demo_token()}"}) + assert r.status_code == 200 + + +def test_firebase_token_is_accepted(monkeypatch): + # simulate a verified Firebase user (Google login) — current_user must accept it + provision. + fake = {"id": "fb_uid_1", "name": "Ada", "email": "ada@gmail.com", "avatarUrl": "", + "role": "Founder / CEO", "password_hash": ""} + monkeypatch.setattr(firebase_auth, "verify", lambda tok: fake if tok == "FIREBASE_TOK" else None) + r = client.get("/v1/system/status", headers={"Authorization": "Bearer FIREBASE_TOK"}) + assert r.status_code == 200 + assert auth_mod._USERS.get("ada@gmail.com", {}).get("id") == "fb_uid_1" # auto-provisioned + + +def test_bad_token_still_401(monkeypatch): + monkeypatch.setattr(firebase_auth, "verify", lambda tok: None) + r = client.get("/v1/system/status", headers={"Authorization": "Bearer garbage"}) + assert r.status_code == 401 + + +def test_ws_accepts_firebase_token(monkeypatch): + from aps.api.v1 import ws + monkeypatch.setattr(ws.firebase_auth, "verify", lambda tok: {"email": "x@y.z"} if tok == "FB" else None) + + class _WS: + query_params = {"token": "FB"} + assert ws._authed(_WS()) is True + _WS.query_params = {"token": "nope"} + assert ws._authed(_WS()) is False diff --git a/tests/unit/test_funding_agent.py b/tests/unit/test_funding_agent.py new file mode 100644 index 0000000..b65fc57 --- /dev/null +++ 
b/tests/unit/test_funding_agent.py @@ -0,0 +1,50 @@ +"""Funding agent pipeline: full FundingPackage from Research/PRD/Execution; renders to MD.""" +from __future__ import annotations + +from aps.agents.funding.agent import run_funding +from aps.state.models import ( + StudioState, ResearchReturn, PRD, Feature, ExecutionPlan, BrandPackage, FundingPackage, +) +from aps.render import render_artifact + + +def _rich_state() -> StudioState: + return StudioState( + idea="a privacy-first habit tracker", + brand=BrandPackage(name="Habitly"), + research=ResearchReturn(idea="x", market_size="~$1.2B market"), + prd=PRD(idea="x", features=[Feature(title="Streak tracking", description="x")], + mvp_scope="Track habits privately"), + execution=ExecutionPlan(infra_cost="$400/mo", roadmap="Sprint 1: auth"), + ) + + +def test_run_funding_full(): + pkg = run_funding(_rich_state()) + assert isinstance(pkg, FundingPackage) + assert pkg.company_name == "Habitly" + assert pkg.deck_slides and pkg.financials.get("years") + assert len(pkg.rounds) == 3 and pkg.use_of_funds + assert pkg.ask # headline raise set + # financials grounded in the research TAM + execution infra + assert pkg.financials["tam"] == 1_200_000_000 + + +def test_run_funding_idea_only_degrades_gracefully(): + pkg = run_funding(StudioState(idea="a habit tracker")) + assert pkg.company_name and len(pkg.deck_slides) >= 8 + assert pkg.financials["tam"] is None # no market size → no TAM, still a model + assert len(pkg.financials["years"]) == 3 + + +def test_run_funding_is_deterministic(): + s = _rich_state() + assert run_funding(s).model_dump() == run_funding(s).model_dump() + + +def test_funding_renders_to_markdown(): + pkg = run_funding(_rich_state()) + md = render_artifact("funding", pkg) + assert "# Funding Pack" in md and "Pitch Deck Outline" in md + assert "Fundraising Roadmap" in md and "Use of Funds" in md + assert render_artifact("funding", pkg.model_dump()) == md diff --git a/tests/unit/test_funding_graph.py 
b/tests/unit/test_funding_graph.py new file mode 100644 index 0000000..b0492ba --- /dev/null +++ b/tests/unit/test_funding_graph.py @@ -0,0 +1,48 @@ +"""Funding graph wiring: flag off = unchanged graph; flag on = parallel branch off execution, +no concurrent-write error, reuses upstream artifacts, existing artifacts still produced.""" +from __future__ import annotations + +from aps.orchestrator import graph as g +from aps.orchestrator.events import EventBus +from aps.state.models import RunStatus + +_ALL = ("research", "prd", "trd", "execution", "pitch", "brand", "legal", "funding") + + +def _run(monkeypatch, enabled: bool, run_id: str): + monkeypatch.setattr(g, "USE_STUBS", True) + monkeypatch.setenv("APS_ENABLE_FUNDING", "true" if enabled else "false") + bus = EventBus() + state = g.run_sync("a privacy-first habit tracker", bus, run_id=run_id) + return state, [e.type for e in bus.history(run_id)] + + +def _artifact_names(state) -> set: + return {a for a in _ALL if getattr(state, a, None) is not None} + + +def test_flag_off_no_funding(monkeypatch): + state, _ = _run(monkeypatch, enabled=False, run_id="fund_off") + assert state.funding is None + assert "funding" not in _artifact_names(state) + assert state.execution is not None and state.pitch is not None # vertical intact + + +def test_flag_on_runs_funding_in_parallel(monkeypatch): + state, types = _run(monkeypatch, enabled=True, run_id="fund_on") + assert state.status in (RunStatus.COMPLETE, RunStatus.DEGRADED) # no InvalidUpdateError + assert state.funding is not None and state.funding.company_name + assert len(state.funding.rounds) == 3 and state.funding.deck_slides + assert {"prd", "trd", "execution", "pitch", "funding"} <= _artifact_names(state) + # financials model exists (3 years) — reuses upstream execution infra estimate + assert len(state.funding.financials.get("years", [])) == 3 + assert "artifact_ready" in types + + +def test_compiled_graph_node_set_reflects_flag(monkeypatch): + 
monkeypatch.setenv("APS_ENABLE_FUNDING", "false") + nodes_off = set(g.build_graph(EventBus(), "n1").get_graph().nodes) + monkeypatch.setenv("APS_ENABLE_FUNDING", "true") + nodes_on = set(g.build_graph(EventBus(), "n2").get_graph().nodes) + assert "funding" not in nodes_off + assert "funding" in nodes_on diff --git a/tests/unit/test_funding_tools.py b/tests/unit/test_funding_tools.py new file mode 100644 index 0000000..1a1fbd0 --- /dev/null +++ b/tests/unit/test_funding_tools.py @@ -0,0 +1,64 @@ +"""Funding tools (Launch Studio Phase 3): deck outline, grounded financials, roadmap.""" +from __future__ import annotations + +from aps.tools.funding.generate_pitch_deck_outline import TOOL as DECK +from aps.tools.funding.generate_financial_projections import TOOL as FIN +from aps.tools.funding.generate_fundraising_roadmap import TOOL as ROADMAP +from aps.tools.funding import _finance + + +def test_registry_exposes_funding_namespace(): + from aps.tools.registry import load_registry + reg = load_registry() + assert len(reg["funding"]) == 3 + assert sum(len(v) for v in reg.values()) == 69 + + +def test_parse_tam_picks_largest_figure(): + assert _finance.parse_tam("~$3B ATS market, SOM $5M") == 3_000_000_000 + assert _finance.parse_tam("no money here") is None + assert _finance.fmt_usd(3_000_000_000) == "$3.0B" + assert _finance.fmt_usd(120_000) == "$120.0K" + + +def test_infra_monthly_vs_annual(): + assert _finance.annual_infra("$400/mo") == 4800 + assert _finance.annual_infra("$10,000 per year") == 10000 + assert _finance.annual_infra("") == 6000 # floor when unparseable + + +def test_projection_is_grounded_and_deterministic(): + a = FIN.run(market_size="~$3B market", infra_cost="$400/mo").payload + b = FIN.run(market_size="~$3B market", infra_cost="$400/mo").payload + assert a == b + assert len(a["years"]) == 3 + assert a["years"][0]["customers"] == 120 and a["years"][2]["customers"] == 900 + assert a["tam"] == 3_000_000_000 + # revenue grows with the customer ramp + revs 
= [y["revenue"] for y in a["years"]] + assert revs[0] < revs[1] < revs[2] + assert any("NOT a forecast" in n for n in a["notes"]) + + +def test_deck_has_standard_slides_grounded_in_inputs(): + from aps.state.models import PainPoint, Competitor, Feature + deck = DECK.run(company_name="Habitly", idea="a privacy-first habit tracker", + market_size="~$1.2B market", + pain_points=[PainPoint(text="can't find a private tracker")], + competitors=[Competitor(name="Streaks")], + features=[Feature(title="Streak tracking", description="x")]).payload + titles = [s["title"] for s in deck] + assert "Problem" in titles and "Market" in titles and "The Ask" in titles + problem = next(s for s in deck if s["title"] == "Problem") + assert "can't find a private tracker" in problem["bullets"] + market = next(s for s in deck if s["title"] == "Market") + assert any("$1.2B" in b for b in market["bullets"]) + + +def test_roadmap_has_three_rounds_and_use_of_funds(): + out = ROADMAP.run(company_name="Habitly", + roadmap="Sprint 1: auth\nSprint 2: tracking").payload + rounds = [r["round"] for r in out["rounds"]] + assert rounds == ["Pre-seed", "Seed", "Series A"] + assert sum(u["pct"] for u in out["use_of_funds"]) == 100 + assert "auth" in out["rounds"][0]["milestones"].lower() diff --git a/tests/unit/test_github_issues.py b/tests/unit/test_github_issues.py new file mode 100644 index 0000000..80e5c07 --- /dev/null +++ b/tests/unit/test_github_issues.py @@ -0,0 +1,15 @@ +"""Reference unit test. 
Runs against fixture fallback — no live call in CI (EVALUATION §6).""" +from aps.tools.retrieval.github_issues import TOOL + + +def test_returns_valid_toolresult_via_fixture(monkeypatch): + monkeypatch.delenv("APS_GITHUB_PAT", raising=False) # force fixture path + monkeypatch.setenv("APS_ALLOW_FIXTURE_FALLBACK", "true") + res = TOOL.run(repo="example/repo") + assert res.ok + assert res.evidence and res.evidence[0].source == "github" + + +def test_bad_args_rejected(): + res = TOOL.run(repo="x", limit=999) # limit>50 -> schema rejects + assert not res.ok and "bad_args" in (res.error or "") diff --git a/tests/unit/test_github_launch.py b/tests/unit/test_github_launch.py new file mode 100644 index 0000000..fb057ad --- /dev/null +++ b/tests/unit/test_github_launch.py @@ -0,0 +1,120 @@ +"""T2.4 — GitHub Launch Mode: deterministic plan, safe preview, REAL API path (mocked HTTP). + +The live calls are exercised through a fake `infra.http` so the real code path is tested +without touching GitHub. With a real PAT the same path creates an actual repo (see +scripts/live_github_launch_smoke.py). 
+""" +from __future__ import annotations + +from aps.state.models import PRD, ExecutionPlan, Feature, PitchPackage +from aps.launch import build_launch_plan, launch_github +import aps.infra.http as http + + +def _prd(): + return PRD(idea="Build an AI SaaS for resume screening", + features=[Feature(title="Reliable PDF parsing", description="handle pdfs", priority="Must")], + mvp_scope="Parse reliably.", requirements=["[Must] parse"]) + + +def _execution(): + return ExecutionPlan( + backlog=[{"id": "APS-001", "title": "Parse PDFs", "type": "story", "priority": "Must", "points": 5}, + {"id": "APS-002", "title": "Auth", "type": "task", "priority": "Must", "points": 3}], + sprints=[{"sprint": 1, "items": [{"id": "APS-001", "title": "Parse PDFs"}], "points": 5}], + roadmap="MVP", infra_cost="$200/mo") + + +def test_build_launch_plan_is_deterministic_and_grounded(): + plan = build_launch_plan(_prd().idea, _prd(), _execution(), PitchPackage(pitch_outline="1. Problem")) + assert plan.repo_name == "build-an-ai-saas-for-resume-screening" + assert len(plan.issues) == 2 and plan.issues[0].title == "Parse PDFs" + assert plan.milestones == ["Sprint 1"] + assert plan.issues[0].sprint == 1 # mapped to its sprint + assert "Reliable PDF parsing" in plan.readme and "# Build an AI SaaS" in plan.readme + # determinism + assert build_launch_plan(_prd().idea, _prd(), _execution()).model_dump() == \ + build_launch_plan(_prd().idea, _prd(), _execution()).model_dump() + + +def test_preview_without_token_makes_no_network(monkeypatch): + monkeypatch.delenv("APS_GITHUB_PAT", raising=False) + # blow up if any HTTP is attempted + monkeypatch.setattr(http, "post", lambda *a, **k: (_ for _ in ()).throw(AssertionError("network!"))) + plan = build_launch_plan(_prd().idea, _prd(), _execution()) + res = launch_github(plan) + assert res.created is False and res.dry_run is True + assert "Preview" in res.message and res.full_name.endswith(plan.repo_name) + + +class _Resp: + def __init__(self, payload, 
status=201): + self._p = payload + self.status_code = status + def raise_for_status(self): + if self.status_code >= 400: + raise RuntimeError(f"HTTP {self.status_code}") + def json(self): + return self._p + + +def test_real_launch_path_with_mocked_github(monkeypatch): + calls = [] + + def fake_post(url, **kw): + calls.append(("POST", url)) + if url.endswith("/user/repos"): + return _Resp({"full_name": "me/build-an-ai-saas-for-resume-screening", + "html_url": "https://github.com/me/build-an-ai-saas-for-resume-screening"}) + if url.endswith("/milestones"): + return _Resp({"number": 1}) + if url.endswith("/issues"): + return _Resp({"html_url": "https://github.com/me/x/issues/1", "number": 1}) + return _Resp({}, 404) + + def fake_request(method, url, **kw): + calls.append((method, url)) + return _Resp({}, 201) # README PUT + + monkeypatch.setattr(http, "post", fake_post) + monkeypatch.setattr(http, "request", fake_request) + + plan = build_launch_plan(_prd().idea, _prd(), _execution()) + res = launch_github(plan, token="ghp_fake") + + assert res.created is True and res.dry_run is False + assert res.repo_url.startswith("https://github.com/") + assert res.full_name == "me/build-an-ai-saas-for-resume-screening" + assert len(res.issue_urls) == 2 and res.milestones_created == 1 + # the real sequence happened: create repo → PUT README → milestone → issues + assert ("POST", "https://api.github.com/user/repos") in calls + assert any(m == "PUT" for m, _ in calls) + assert sum(1 for m, u in calls if u.endswith("/issues")) == 2 + + +def test_launch_failure_is_reported_not_raised(monkeypatch): + def boom(url, **kw): + return _Resp({}, 500) + monkeypatch.setattr(http, "post", boom) + res = launch_github(build_launch_plan(_prd().idea, _prd(), _execution()), token="ghp_fake") + assert res.created is False and "failed" in res.message.lower() + + +def test_permission_denied_gives_actionable_message(monkeypatch): + # the real live failure: a fine-grained PAT without Administration can't 
create repos (403). + def forbidden(url, **kw): + return _Resp({"message": "Resource not accessible by personal access token"}, 403) + monkeypatch.setattr(http, "post", forbidden) + res = launch_github(build_launch_plan(_prd().idea, _prd(), _execution()), token="github_pat_x") + assert res.created is False + low = res.message.lower() + assert "403" in res.message and "repo" in low + assert "classic" in low or "administration" in low # tells the user how to fix it + + +def test_repo_name_conflict_gives_422_message(monkeypatch): + def conflict(url, **kw): + return _Resp({"message": "name already exists on this account"}, 422) + monkeypatch.setattr(http, "post", conflict) + res = launch_github(build_launch_plan(_prd().idea, _prd(), _execution()), token="ghp_fake") + assert res.created is False and "422" in res.message diff --git a/tests/unit/test_health_lane.py b/tests/unit/test_health_lane.py new file mode 100644 index 0000000..72f5595 --- /dev/null +++ b/tests/unit/test_health_lane.py @@ -0,0 +1,19 @@ +"""Health/ping lane (plan 2.6): cheap dependency-free liveness, separate from /system/health.""" +from __future__ import annotations + +from fastapi.testclient import TestClient + +from aps.api.main import app + +client = TestClient(app) + + +def test_v1_ping_needs_no_auth_and_is_trivial(): + r = client.get("/v1/system/ping") + assert r.status_code == 200 + assert r.json()["data"] == {"ok": True} + + +def test_root_health_is_dependency_free(): + r = client.get("/health") + assert r.status_code == 200 and r.json()["status"] == "ok" diff --git a/tests/unit/test_http.py b/tests/unit/test_http.py new file mode 100644 index 0000000..bacf06b --- /dev/null +++ b/tests/unit/test_http.py @@ -0,0 +1,100 @@ +"""infra.http: rate-limit + retry + logging wrapper, and retrieval tools routed through it. + +No real network: we monkeypatch `requests.request` / `http.get` with fakes. 
This also +gives the retrieval tools their first *live-path* coverage (previously only the fixture +fallback was tested).""" +from __future__ import annotations + +import pytest + +from aps.infra import http +from aps.state.models import ToolResult, Evidence + + +class _Resp: + def __init__(self, payload, status=200): + self._payload = payload + self.status_code = status + self.text = "ok" + + def raise_for_status(self): + if self.status_code >= 400: + raise RuntimeError(f"HTTP {self.status_code}") + + def json(self): + return self._payload + + +def test_request_retries_transient_then_succeeds(monkeypatch): + calls = {"n": 0} + + def fake_request(method, url, **kw): + calls["n"] += 1 + if calls["n"] < 3: + raise http.requests.exceptions.ConnectionError("boom") + return _Resp({"ok": True}) + + monkeypatch.setattr(http.requests, "request", fake_request) + r = http.get("https://api.example.com/x", attempts=3) + assert r.json() == {"ok": True} + assert calls["n"] == 3 # retried twice, succeeded on the third + + +def test_request_gives_up_after_attempts(monkeypatch): + def always_fail(method, url, **kw): + raise http.requests.exceptions.Timeout("slow") + + monkeypatch.setattr(http.requests, "request", always_fail) + with pytest.raises(http.requests.exceptions.Timeout): + http.get("https://api.example.com/x", attempts=2) + + +def test_get_and_post_delegate_with_method(monkeypatch): + seen = {} + + def fake_request(method, url, **kw): + seen["method"] = method + return _Resp({}) + + monkeypatch.setattr(http.requests, "request", fake_request) + http.get("https://h/x") + assert seen["method"] == "GET" + http.post("https://h/x") + assert seen["method"] == "POST" + + +def test_host_is_derived_for_rate_key(): + assert http._host("https://api.github.com/repos/x") == "api.github.com" + assert http._host("not a url") == "unknown" + + +def test_github_issues_live_path_through_http(monkeypatch): + """With a token set, the tool takes the live branch and parses a faked response.""" + 
from aps.tools.retrieval import github_issues as gi + monkeypatch.setenv("APS_GITHUB_PAT", "fake-token") + + issues = [ + {"html_url": "https://github.com/x/y/issues/1", "title": "Crash on PDF", + "body": "parser dies"}, + {"html_url": "https://github.com/x/y/pull/2", "title": "a PR", + "pull_request": {}, "body": "ignore me"}, + ] + monkeypatch.setattr(http, "get", lambda *a, **k: _Resp(issues)) + + out = gi.TOOL.run(repo="x/y") + assert isinstance(out, ToolResult) and out.ok + # the PR entry is filtered out; only the real issue becomes evidence + assert len(out.evidence) == 1 + assert isinstance(out.evidence[0], Evidence) + assert out.evidence[0].title == "Crash on PDF" + + +def test_tool_call_is_metered(monkeypatch): + """BaseTool.run records every call centrally (no-op shim or real prometheus).""" + import aps.tools.base as base + seen = [] + monkeypatch.setattr(base, "record_tool_call", + lambda name, ns, ok: seen.append((name, ns, ok))) + from aps.tools.analysis import sentiment_breakdown as sb + sb.TOOL.run(evidence=[]) + assert seen and seen[-1][0] == "sentiment_breakdown" and seen[-1][1] == "analysis" diff --git a/tests/unit/test_infra.py b/tests/unit/test_infra.py new file mode 100644 index 0000000..efe45f2 --- /dev/null +++ b/tests/unit/test_infra.py @@ -0,0 +1,82 @@ +"""Infra works whether or not the optional deps (structlog/tenacity/prometheus) exist.""" +from __future__ import annotations + + +import pytest + +from aps.infra.logging import configure_logging, get_logger +from aps.infra.retry import with_retry +from aps.infra.metrics import record_tool_call, setup_metrics, TOOL_CALLS +from aps.infra.rate_limiter import RateLimiter + + +def test_logging_configures_and_logs(): + configure_logging() + log = get_logger("test") + log.info("hello", k=1) # must not raise on either backend + + +def test_retry_succeeds_after_transient_failures(): + calls = {"n": 0} + + @with_retry(attempts=3, base_delay=0.001) + def flaky(): + calls["n"] += 1 + if calls["n"] < 
3: + raise ValueError("transient") + return "ok" + + assert flaky() == "ok" + assert calls["n"] == 3 + + +def test_retry_reraises_after_exhausting_attempts(): + calls = {"n": 0} + + @with_retry(attempts=2, base_delay=0.001) + def always_fail(): + calls["n"] += 1 + raise RuntimeError("boom") + + with pytest.raises(RuntimeError): + always_fail() + assert calls["n"] == 2 + + +def test_retry_bare_decorator_form(): + @with_retry + def ok(): + return 42 + + assert ok() == 42 + + +def test_metrics_record_is_safe(): + # no-op shim or real prometheus — either way these must not raise + record_tool_call("web_search", "retrieval", ok=True) + record_tool_call("web_search", "retrieval", ok=False) + TOOL_CALLS.labels(tool="x", namespace="y").inc() + + +def test_setup_metrics_noop_without_app(): + class _App: + def mount(self, *_a, **_k): + self.mounted = True + + app = _App() + setup_metrics(app) # mounts if prometheus present, no-op otherwise; never raises + + +def test_rate_limiter_allows_burst_then_throttles(): + rl = RateLimiter(rpm=6000) # 100/sec -> tiny waits, fast test + waits = [rl.acquire("github", block=False) for _ in range(10)] + assert waits[0] == 0.0 # first token always free + assert all(w >= 0 for w in waits) + + +def test_rate_limiter_isolates_sources(): + rl = RateLimiter(rpm=60) + rl.configure("slow", rpm=60) + # different sources draw from different buckets + assert rl.acquire("a", block=False) == 0.0 + assert rl.acquire("b", block=False) == 0.0 diff --git a/tests/unit/test_legal_agent.py b/tests/unit/test_legal_agent.py new file mode 100644 index 0000000..b6241eb --- /dev/null +++ b/tests/unit/test_legal_agent.py @@ -0,0 +1,44 @@ +"""Legal agent pipeline: full LegalPackage with/without TRD and Brand; renders to Markdown.""" +from __future__ import annotations + +from aps.agents.legal.agent import run_legal +from aps.state.models import ( + StudioState, TRD, BrandPackage, LegalPackage, +) +from aps.render import render_artifact + +DM = {"entities": {"User": 
{"fields": {"email": "string", "created_at": "datetime"}}}} + + +def test_run_legal_idea_only(): + pkg = run_legal(StudioState(idea="a privacy-first habit tracker")) + assert isinstance(pkg, LegalPackage) + assert pkg.company_name and pkg.jurisdiction and pkg.governing_law + assert "NOT LEGAL ADVICE" in pkg.disclaimer + kinds = {d.kind for d in pkg.documents} + assert kinds == {"privacy_policy", "tos", "nda", "founders_agreement", "employment"} + + +def test_run_legal_uses_brand_name_and_trd_data_model(): + state = StudioState( + idea="a privacy-first habit tracker", + brand=BrandPackage(name="Habitly"), + trd=TRD(data_model=DM), + ) + pkg = run_legal(state) + assert pkg.company_name == "Habitly" + privacy = next(d for d in pkg.documents if d.kind == "privacy_policy") + assert "Email address" in privacy.body # came from the TRD data model + + +def test_run_legal_is_deterministic(): + state = StudioState(idea="AI-powered accounting for SMEs") + assert run_legal(state).model_dump() == run_legal(state).model_dump() + + +def test_legal_renders_to_markdown(): + pkg = run_legal(StudioState(idea="a privacy-first habit tracker")) + md = render_artifact("legal", pkg) + assert "# Legal Documents" in md and "Placeholders to complete" in md + # dict path (artifact-store read-through) matches the model path + assert render_artifact("legal", pkg.model_dump()) == md diff --git a/tests/unit/test_legal_graph.py b/tests/unit/test_legal_graph.py new file mode 100644 index 0000000..a074b00 --- /dev/null +++ b/tests/unit/test_legal_graph.py @@ -0,0 +1,52 @@ +"""Legal graph wiring: flag off = unchanged graph; flag on = parallel branch off architecture, +no concurrent-write error, sees the TRD data model, existing artifacts still produced.""" +from __future__ import annotations + +from aps.orchestrator import graph as g +from aps.orchestrator.events import EventBus +from aps.state.models import RunStatus + +_ALL = ("research", "prd", "trd", "execution", "pitch", "brand", "legal") + + 
+def _run(monkeypatch, enabled: bool, run_id: str): + monkeypatch.setattr(g, "USE_STUBS", True) + monkeypatch.setenv("APS_ENABLE_LEGAL", "true" if enabled else "false") + bus = EventBus() + state = g.run_sync("a privacy-first habit tracker", bus, run_id=run_id) + return state, [e.type for e in bus.history(run_id)] + + +def _artifact_names(state) -> set: + return {a for a in _ALL if getattr(state, a, None) is not None} + + +def test_flag_off_no_legal(monkeypatch): + state, _ = _run(monkeypatch, enabled=False, run_id="legal_off") + assert state.legal is None + assert "legal" not in _artifact_names(state) + assert state.prd is not None and state.pitch is not None # vertical intact + + +def test_flag_on_runs_legal_in_parallel(monkeypatch): + state, types = _run(monkeypatch, enabled=True, run_id="legal_on") + # parallel branch completed without LangGraph InvalidUpdateError (would have raised) + assert state.status in (RunStatus.COMPLETE, RunStatus.DEGRADED) + assert state.legal is not None and state.legal.company_name + assert len(state.legal.documents) == 5 + # existing artifacts still produced alongside legal + assert {"prd", "trd", "execution", "pitch", "legal"} <= _artifact_names(state) + # privacy policy is grounded in the TRD data model produced upstream by architecture + privacy = next(d for d in state.legal.documents if d.kind == "privacy_policy") + assert "Data we collect" in privacy.body + # traceable + assert "artifact_ready" in types + + +def test_compiled_graph_node_set_reflects_flag(monkeypatch): + monkeypatch.setenv("APS_ENABLE_LEGAL", "false") + nodes_off = set(g.build_graph(EventBus(), "n1").get_graph().nodes) + monkeypatch.setenv("APS_ENABLE_LEGAL", "true") + nodes_on = set(g.build_graph(EventBus(), "n2").get_graph().nodes) + assert "legal" not in nodes_off + assert "legal" in nodes_on diff --git a/tests/unit/test_legal_tools.py b/tests/unit/test_legal_tools.py new file mode 100644 index 0000000..0014bcf --- /dev/null +++ 
b/tests/unit/test_legal_tools.py @@ -0,0 +1,77 @@ +"""Legal tools (Launch Studio Phase 2): valid documents, disclaimer, placeholders, +determinism, jurisdiction adaptivity, data-model-grounded privacy policy.""" +from __future__ import annotations + +from aps.tools.legal.generate_privacy_policy import TOOL as PRIVACY +from aps.tools.legal.generate_terms_of_service import TOOL as TOS +from aps.tools.legal.generate_nda import TOOL as NDA +from aps.tools.legal.generate_founders_agreement import TOOL as FOUNDERS +from aps.tools.legal.generate_employment_contract import TOOL as EMPLOYMENT +from aps.tools.legal import _legal + +ALL = [PRIVACY, TOS, NDA, FOUNDERS, EMPLOYMENT] +DM = {"entities": {"User": {"fields": {"email": "string", "owner_id": "uuid", + "created_at": "datetime"}}}} + + +def test_registry_exposes_legal_namespace(): + from aps.tools.registry import load_registry + reg = load_registry() + assert len(reg["legal"]) == 5 + assert sum(len(v) for v in reg.values()) == 69 + + +def test_every_doc_has_disclaimer_company_and_kind(): + for tool in ALL: + out = tool.run(company_name="Habitly", jurisdiction="India") + assert out.ok + d = out.payload + assert d["kind"] and d["title"] + assert "NOT LEGAL ADVICE" in d["body"] + assert "Habitly" in d["body"] + assert isinstance(d["placeholders"], list) and d["placeholders"] + + +def test_documents_are_deterministic(): + for tool in ALL: + a = tool.run(company_name="Habitly", jurisdiction="India").payload["body"] + b = tool.run(company_name="Habitly", jurisdiction="India").payload["body"] + assert a == b + + +def test_privacy_policy_reflects_data_model_and_dpdp(): + out = PRIVACY.run(company_name="Habitly", jurisdiction="India", data_model=DM).payload + assert "DPDP" in out["body"] or "Digital Personal Data Protection" in out["body"] + assert "Email address" in out["body"] # from the data model + assert "Usage and activity data" in out["body"] + + +def test_privacy_policy_jurisdiction_adaptive(): + eu = 
PRIVACY.run(company_name="Habitly", jurisdiction="European Union", data_model=DM).payload + assert "GDPR" in eu["body"] + us = PRIVACY.run(company_name="Habitly", jurisdiction="Delaware, USA", data_model=DM).payload + assert "CCPA" in us["body"] + + +def test_employment_framing_adapts_to_jurisdiction(): + india = EMPLOYMENT.run(company_name="Habitly", jurisdiction="India").payload["body"] + us = EMPLOYMENT.run(company_name="Habitly", jurisdiction="Delaware, USA").payload["body"] + assert "notice" in india.lower() and "at-will" not in india.lower() + assert "at-will" in us.lower() + + +def test_founders_agreement_lists_each_founder(): + out = FOUNDERS.run(company_name="Habitly", jurisdiction="India", num_founders=3).payload + assert "[FOUNDER 1 NAME]" in out["body"] and "[FOUNDER 3 NAME]" in out["body"] + assert "vest" in out["body"].lower() and "cliff" in out["body"].lower() + + +def test_missing_company_surfaces_placeholder(): + out = NDA.run(jurisdiction="India").payload # no company_name + assert "[COMPANY NAME]" in out["body"] + assert "[COMPANY NAME]" in out["placeholders"] + + +def test_data_categories_fallback_when_empty(): + assert _legal.data_categories({}) == _legal.data_categories(None) + assert "Email address" in _legal.data_categories({}) # sensible generic set diff --git a/tests/unit/test_llm_ratelimit.py b/tests/unit/test_llm_ratelimit.py new file mode 100644 index 0000000..6fdaa51 --- /dev/null +++ b/tests/unit/test_llm_ratelimit.py @@ -0,0 +1,43 @@ +"""Per-provider LLM rate limiting (multipleAPIplan P3) — each provider its own RPM bucket.""" +from __future__ import annotations + +import pytest + +import aps.infra.llm as llm + + +@pytest.fixture(autouse=True) +def _reset(monkeypatch): + # fresh limiter + configured-set per test so env overrides take effect deterministically + monkeypatch.setattr(llm, "_LIMITER", None) + monkeypatch.setattr(llm, "_CONFIGURED", set()) + for v in ("APS_GROQ_RPM", "APS_GEMINI_RPM"): + monkeypatch.delenv(v, 
raising=False) + + +def test_provider_rpm_from_registry(): + assert llm._provider_rpm("groq") == 30 + assert llm._provider_rpm("gemini") == 15 + assert llm._provider_rpm("nim") == 40 + assert llm._provider_rpm("llm") is None # generic source → default bucket + assert llm._provider_rpm("bogus") is None + + +def test_provider_rpm_env_override(monkeypatch): + monkeypatch.setenv("APS_GROQ_RPM", "7") + assert llm._provider_rpm("groq") == 7 + + +def test_acquire_configures_provider_bucket_once(): + assert llm.acquire_llm("groq") == 0.0 # first token free, no error + # the provider's bucket now exists, sized to its rpm (30), separate from "gemini" + assert "groq" in llm._CONFIGURED + lim = llm._limiter() + assert lim._buckets["groq"].capacity == 30.0 + + +def test_providers_have_isolated_buckets(): + # draining one provider's bucket does not throttle another (different keys) + for _ in range(5): + assert llm.acquire_llm("groq", ) >= 0.0 + assert llm.acquire_llm("gemini") == 0.0 # untouched bucket → free diff --git a/tests/unit/test_pain_noise_filter.py b/tests/unit/test_pain_noise_filter.py new file mode 100644 index 0000000..1b3a59d --- /dev/null +++ b/tests/unit/test_pain_noise_filter.py @@ -0,0 +1,209 @@ +"""Pain noise filter — the contributor's exact polluted snippets must NOT become pains. + +Closes finding (a): nav/CTA chrome, greetings, and issue-template scaffolding were ending up +as the PRD's headline 'Must' feature on noisy ideas (PR-review/security idea). +""" +from __future__ import annotations + +from aps.state.models import Evidence +from aps.tools.analysis.extract_pain_points import TOOL, _pick_pain, _looks_like_noise + + +# the exact junk the contributor reported (each contains a cue further in, so it slipped through) +_NOISE = [ + "Log inGet StartedBook a Demo. 
Honestly the whole thing is broken.", + "📚 Documentation Request Description I noticed that some features are missing here.", + "Hi Claude autonomous plugin maintainer, I was looking but it doesn't work for me.", +] +_REAL = [ + "The resume parser is broken and keeps dropping valid PDFs.", + "Candidate ranking is slow and confusing, I can't trust it.", +] + + +def test_noise_sentences_are_rejected(): + for snippet in _NOISE: + # the leading chrome sentence is flagged; the whole item yields no clean pain + ev = Evidence(source="web", url="https://x.com/a", title="", snippet=snippet) + out = TOOL.run(evidence=[ev.model_dump()]) + for p in out.payload: + # whatever (if anything) is extracted must NOT be the nav/greeting/template chrome + low = p.text.lower() + assert not low.startswith(("log in", "documentation request", "hi ", "📚")) + assert "book a demo" not in low and "get started" not in low + + +def test_pure_chrome_yields_no_pain(): + ev = Evidence(source="web", url="https://x.com/a", title="Home", + snippet="Log in · Get Started · Book a Demo · View Pricing · Contact Sales") + assert TOOL.run(evidence=[ev.model_dump()]).payload == [] + + +def test_real_complaints_still_extracted(): + evs = [Evidence(source="reddit", url=f"https://reddit.com/{i}", title="rant", snippet=s) + for i, s in enumerate(_REAL)] + pains = TOOL.run(evidence=[e.model_dump() for e in evs]).payload + assert len(pains) == 2 + assert any(p.severity.value == "high" for p in pains) + assert all(not _looks_like_noise(p.text) for p in pains) + + +def test_complaint_after_chrome_extracts_the_complaint_not_chrome(): + # a real complaint sentence AFTER nav chrome → the complaint is what's kept + snippet = ("Home Features Pricing Login. 
The export feature is completely broken and " + "I waste hours every week.") + ev = Evidence(source="web", url="https://acme.io/x", title="", snippet=snippet) + pains = TOOL.run(evidence=[ev.model_dump()]).payload + assert pains and "export" in pains[0].text.lower() + assert "pricing" not in pains[0].text.lower() + + +def test_helper_classifies_examples(): + assert _looks_like_noise("Hi there, just wondering about this") + assert _looks_like_noise("Documentation Request: add more") + assert _looks_like_noise("Get Started Book a Demo today") + assert not _looks_like_noise("the dashboard is painfully slow to load") + assert _pick_pain("The app is broken and crashes constantly.")[1].value == "high" + + +def test_github_feature_request_title_does_not_block_snippet_pain(): + ev = Evidence( + source="github", + url="https://github.com/x/y/issues/42", + title="Feature request: offline/privacy mode", + snippet="I can't find a good privacy-first habit tracker that works offline.", + ) + pains = TOOL.run(evidence=[ev.model_dump()]).payload + assert pains, "pain in snippet must survive noisy GitHub title" + assert any("privacy" in p.text.lower() or "find" in p.text.lower() for p in pains) + + +def test_demand_signal_cant_find_extracted_as_med(): + ev = Evidence( + source="reddit", + url="https://reddit.com/r/privacy/1", + title="Looking for a privacy-respecting habit tracker", + snippet="Can't find a single app that works offline and doesn't send data to the cloud.", + ) + pains = TOOL.run(evidence=[ev.model_dump()]).payload + assert pains, "demand-type pain must be extracted" + assert pains[0].severity.value in ("med", "high") + + +# ── adversarial hardening: a URL fragment / space-separated nav bar carries a pain cue but +# is not a complaint. (Both slipped through before — see the deep-hardening pass.) 
────── +def test_bare_url_with_cue_word_is_not_a_pain(): + # the path "/broken-links-guide" carries the cue "broken" but it's a link, not prose + ev = Evidence(source="web", url="https://x.com/a", title="", + snippet="https://example.com/broken-links-guide") + assert TOOL.run(evidence=[ev.model_dump()]).payload == [] + + +def test_space_separated_navbar_with_cue_is_not_a_pain(): + ev = Evidence(source="web", url="https://x.com/a", title="", + snippet="Home Products Pricing About Login broken") + assert TOOL.run(evidence=[ev.model_dump()]).payload == [] + + +def test_helper_rejects_url_and_navbar_keeps_short_real_pain(): + assert _looks_like_noise("https://example.com/broken-links-guide") + assert _looks_like_noise("Home Products Pricing About Login broken") + assert not _looks_like_noise("it is unusable") # short, but genuine prose + + +# ── second live-data pass: forum solicitations, marketing/article titles, positive idioms, +# and "born out of" pitches still leaked on the subscription-tracker run. 
───────────────── +def test_opinion_solicitation_question_is_not_a_pain(): + assert _looks_like_noise("What are your thoughts or pain points on subscription charges?") + assert _looks_like_noise("Anyone else frustrated with this, or am I the only one?") + # but a rhetorical COMPLAINT question is still a pain + assert not _looks_like_noise("Why do companies make it so hard to cancel subscriptions?") + + +def test_marketing_and_title_case_headlines_are_not_pains(): + assert _looks_like_noise("Why You Need a Subscription Tracker App") + assert _looks_like_noise("The 7 Best Subscription Management Apps in 2026") + assert _looks_like_noise("When Websites Make It Hard to Cancel") + # a lowercase complaint that names a couple of products is NOT a headline + assert not _looks_like_noise("the Slack and Notion integration is broken and loses data") + + +def test_positive_idiom_is_not_a_pain(): + assert _looks_like_noise("Currently in pre-release and honestly can't believe this worked") + assert _looks_like_noise("This works great and I highly recommend it") + + +def test_born_out_pitch_and_marketing_effort_are_not_pains(): + assert _looks_like_noise("SpeechPro was born out of my frustration during university") + assert _looks_like_noise("The market, we work hard to share a wide range of offers") + + +# ── live-data hardening: real GitHub/HN/web snippets that leaked garbage pains before. Each +# cascaded into junk feature titles, persona goals, and TRD entities. (Found during live testing.) 
+def test_product_pitch_is_not_a_pain(): + # a Show-HN founder pitch ("we built this because…") is not a user complaint + ev = Evidence(source="hackernews", url="https://h/1", title="Show HN: our hiring tool", + snippet="Couple friends and I built this cause we hated the direction hiring is going.") + assert TOOL.run(evidence=[ev.model_dump()]).payload == [] + + +def test_repo_description_with_star_prefix_is_not_a_pain(): + ev = Evidence(source="github", url="https://g/1", title="org/FairHiringProtocol", + snippet="4★ The Fair Hiring Protocol (FHP) is an open, community standard designed to fix hiring.") + assert TOOL.run(evidence=[ev.model_dump()]).payload == [] + + +def test_listing_metadata_is_not_a_pain(): + ev = Evidence(source="hackernews", url="https://h/2", + title="Looking for Employers for the job fair", snippet="1 points, 0 comments") + assert TOOL.run(evidence=[ev.model_dump()]).payload == [] + + +def test_vcs_missing_file_gripe_is_not_a_pain(): + # the dot-split used to fragment "resume.txt" so the VCS filter missed it; now it doesn't + ev = Evidence(source="github", url="https://g/2", title="Missing resume.txt", + snippet="Where can we find resume.txt? It does not exist into the repo.") + assert TOOL.run(evidence=[ev.model_dump()]).payload == [] + + +def test_real_market_pain_survives_and_is_not_truncated(): + # a genuine multi-clause complaint stays a complete thought (no dangling "… and") + ev = Evidence(source="web", url="https://x/1", title="AI recruiting review", + snippet="Sourcing is slower, candidate competition is fiercer, and the old " + "keyword playbook is failing recruiters everywhere.") + pains = TOOL.run(evidence=[ev.model_dump()]).payload + assert pains and not pains[0].text.rstrip().endswith((" and", " the", " is", ",")) + + +def test_plain_snippet_demand_pain_no_title_noise(): + ev = Evidence( + source="reddit", + url="https://reddit.com/r/privacy/2", + title="", + snippet="Can't find a privacy-respecting habit tracker. 
Would love an offline-first option.", + ) + pains = TOOL.run(evidence=[ev.model_dump()]).payload + assert pains, "bare demand snippet must yield at least one pain" + + +def test_product_description_is_not_a_pain(): + # Phase 4a: a repo/product blurb (generic "X is a <…> tool/app/platform/…") masquerading as a + # pain is rejected — it describes a product, it doesn't voice a user frustration. + for blurb in [ + "ZeroTrace is a powerful ethical hacking tool for anonymization via Tor.", + "ActivityWatch is an open-source automated time-tracking app.", + "Foo is a fast self-hosted analytics platform for teams.", + ]: + assert _looks_like_noise(blurb), f"blurb slipped through: {blurb!r}" + + +def test_product_description_with_real_complaint_survives(): + # …but a product mention FOLLOWED by an actual complaint is still a pain. + for s in [ + "ActivityWatch is a free time-tracking app but it is broken and crashes constantly.", + "Toggl is a popular tracking tool, however it can't export and the sync is slow.", + ]: + assert not _looks_like_noise(s), f"real complaint wrongly dropped: {s!r}" + ev = Evidence(source="hackernews", url="https://h/1", title="", + snippet="ActivityWatch is a free time-tracking app but it is broken and crashes constantly.") + assert TOOL.run(evidence=[ev.model_dump()]).payload, "complaint after a product mention must yield a pain" diff --git a/tests/unit/test_phase_a.py b/tests/unit/test_phase_a.py new file mode 100644 index 0000000..fcfc96d --- /dev/null +++ b/tests/unit/test_phase_a.py @@ -0,0 +1,65 @@ +"""Phase-A credibility fixes: idea-agnostic stub + noun entities / correct pluralization.""" +from __future__ import annotations + +from aps.agents.research.stub import stub_research +from aps.tools.architecture import design_data_model, design_api_contract +from aps.state.models import Feature + + +def test_stub_is_idea_agnostic_and_degraded(): + r = stub_research("a privacy-first habit tracker") + assert r.degraded is True + # the fixture 
references the actual idea and never claims a different domain (no ATS bleed) + blob = (r.market_size + " " + " ".join(p.text for p in r.pain_points) + + " " + " ".join(e.snippet for e in r.evidence)).lower() + assert "ats" not in blob and "resume" not in blob + assert "habit tracker" in blob + assert r.evidence and all(e.source == "stub_fallback" for e in r.evidence) + + +def test_arch_entities_are_domain_nouns_not_verbs(): + # idea is the clean source; the feature title is raw pain text that used to mint + # verb/adjective entities (`Rejects`, `Great`) and `/rejectss`. + dm = design_data_model.TOOL.run( + idea="a privacy-first habit tracker for couples", + features=[Feature(title="Solve: ATS rejects qualified candidates", + description="x", priority="High").model_dump()], + ).payload + names = {n.lower() for n in dm["entities"]} + assert "habit" in names or "tracker" in names # clean domain noun from the idea + for bad in ("rejects", "great", "inconvenient", "solve", "resolve", "qualified"): + assert bad not in names # no verbs/adjectives/filler + assert len(dm["entities"]) >= 2 + + +def test_api_contract_pluralization_has_no_double_s(): + dm = {"entities": {"Class": {"fields": {"id": "uuid"}}, + "Category": {"fields": {"id": "uuid"}}}} + doc = design_api_contract.TOOL.run(data_model=dm, idea="x").payload + paths = list(doc["paths"].keys()) + assert "/classes" in paths and "/categories" in paths + assert not any(p.endswith("ss") for p in paths) + assert doc["paths"]["/classes"]["get"]["operationId"] == "listClasses" + + +def test_keyless_research_returns_real_evidence_not_stub(monkeypatch): + # Phase C: with no LLM key, the no-key tools are called directly and compressed into a + # REAL ResearchReturn (degraded=False) — not the labeled stub. Tools are monkeypatched + # so the unit test stays offline/hermetic. 
+ import importlib + from aps.agents.research import keyless + from aps.state.models import ToolResult, Evidence + + def fake_run(**kwargs): + return ToolResult(ok=True, evidence=[Evidence( + source="hackernews", url="https://news.ycombinator.com/item?id=1", + title="habit tracker friction", + snippet="people say existing habit trackers are broken and hard to stick with")]) + + for mod_path, _extra in keyless._KEYLESS_TOOLS: + monkeypatch.setattr(importlib.import_module(mod_path).TOOL, "run", fake_run) + + r = keyless.keyless_research("a privacy-first habit tracker") + assert r.idea == "a privacy-first habit tracker" + assert r.degraded is False # genuine evidence, not the stub fallback + assert r.evidence # compressed from the no-key tools' output diff --git a/tests/unit/test_provider_polish.py b/tests/unit/test_provider_polish.py new file mode 100644 index 0000000..54d5e1e --- /dev/null +++ b/tests/unit/test_provider_polish.py @@ -0,0 +1,110 @@ +"""multipleAPIplan P5/P7/P8/P9 — metrics, circuit breaker, ledger, router, portable context.""" +from __future__ import annotations + +from aps.config.quota import Ledger, CircuitBreaker +from aps.config.router import route, TaskProfile, RESEARCH, COMPRESSION +from aps.config.portable import normalize_history +from aps.config.failover import FailoverChatModel + + +# ── P9: ledger + circuit breaker ────────────────────────────────────────────── +def test_ledger_counts_per_provider(): + led = Ledger() + for p in ("groq", "groq", "gemini"): + led.record(p) + assert led.count("groq") == 2 and led.count("gemini") == 1 + assert led.snapshot() == {"groq": 2, "gemini": 1} + + +def test_circuit_breaker_trips_and_restores(): + t = {"now": 0.0} + cb = CircuitBreaker(cooldown=60.0, clock=lambda: t["now"]) + assert cb.is_open("groq") is False + cb.trip("groq") + assert cb.is_open("groq") is True # benched + t["now"] = 59.9 + assert cb.is_open("groq") is True + t["now"] = 60.1 + assert cb.is_open("groq") is False # auto-restored after 
cooldown + + +# ── P8: router ──────────────────────────────────────────────────────────────── +def test_route_excludes_no_tool_providers_for_tool_task(): + # ollama caps tools=2 (ok), but a hypothetical no-tool provider would be dropped; + # here verify a tool task keeps tool-capable providers and orders deterministically + order = route(RESEARCH, ["gemini", "groq", "cerebras"]) + assert set(order) == {"gemini", "groq", "cerebras"} + assert order == route(RESEARCH, ["gemini", "groq", "cerebras"]) # deterministic + + +def test_route_low_complexity_prefers_fast_cheap(): + # COMPRESSION (low complexity, long context) — Gemini (context 3) should rank for long ctx + order = route(COMPRESSION, ["groq", "gemini"]) + assert order[0] == "gemini" # only provider meeting context=long requirement + + +def test_route_quota_headroom_demotes_busy_provider(): + fresh = route(RESEARCH, ["groq", "cerebras"], load={}) + busy = route(RESEARCH, ["groq", "cerebras"], load={fresh[0]: 1000}) + assert busy[0] != fresh[0] # the heavily-used one sinks + + +def test_route_no_eligible_falls_back_to_input_order(): + profile = TaskProfile(needs_tools=True) + # unknown providers default to caps tools=2 (eligible) → returns them + assert route(profile, ["x", "y"]) == ["x", "y"] or set(route(profile, ["x", "y"])) == {"x", "y"} + + +# ── P7: portable context ────────────────────────────────────────────────────── +def test_normalize_history_canonicalizes_tool_call_ids(): + msgs = [ + {"role": "assistant", "tool_calls": [{"id": "abc123", "name": "t", "args": {}}]}, + {"role": "tool", "tool_call_id": "abc123", "content": "ok"}, + ] + out = normalize_history(msgs) + assert out[0]["tool_calls"][0]["id"] == "call_0" + assert out[1]["tool_call_id"] == "call_0" # matched pair stays consistent + + +def test_normalize_history_noop_without_tools(): + msgs = [{"role": "user", "content": "hi"}] + assert normalize_history(msgs) is msgs # fast no-op returns same object + + +def 
test_normalize_history_survives_garbage(): + assert normalize_history(["not a message"]) == ["not a message"] + + +# ── P9 wired into failover: a tripped provider is tried last ─────────────────── +class _M: + def __init__(self, result=None, raises=None): + self._r, self._e = result, raises + def bind_tools(self, t, **k): + return self + def invoke(self, m, **k): + if self._e: + raise self._e + return self._r + + +class _RT: + def __init__(self, name, model): + self.name = name + self.spec = type("S", (), {"name": name})() + self._m = model + def chat_model(self): + return self._m + + +def test_failover_records_metrics_and_ledger(monkeypatch): + import aps.infra.llm as llm + monkeypatch.setattr(llm, "acquire_llm", lambda *a, **k: 0.0) + from aps.config import quota + quota.BREAKER.reset() + before = quota.LEDGER.count("gemini") + m = FailoverChatModel([_RT("groq", _M(raises=RuntimeError("429"))), + _RT("gemini", _M(result="OK"))]) + assert m.invoke(["hi"]) == "OK" + assert quota.LEDGER.count("gemini") == before + 1 + assert quota.BREAKER.is_open("groq") is True # the 429'd provider got benched + quota.BREAKER.reset() diff --git a/tests/unit/test_provider_resolution.py b/tests/unit/test_provider_resolution.py new file mode 100644 index 0000000..d7a8eba --- /dev/null +++ b/tests/unit/test_provider_resolution.py @@ -0,0 +1,91 @@ +"""Provider/key resolution + honest degradation reasons. + +Covers the fix for the silent-401 bug: empty keys count as unset, the NIM factory raises +instead of sending a placeholder, the provider auto-detects from the available key, a +provider/key mismatch is a loud message, and every degraded brief records WHY. 
+""" +from __future__ import annotations + +import pytest + +from aps.config.settings import ( + nvidia_key, resolved_provider, get_chat_model, describe_runtime, +) +from aps.infra.llm import has_llm_key, key_mismatch +from aps.agents.research.stub import stub_research +from aps.state.models import ResearchReturn + +_KEYS = ("NVIDIA_API_KEY", "GEMINI_API_KEY", "GOOGLE_API_KEY", "APS_MODEL_PROVIDER") + + +@pytest.fixture +def clean_env(monkeypatch): + for k in _KEYS: + monkeypatch.delenv(k, raising=False) + return monkeypatch + + +def test_empty_or_whitespace_key_counts_as_unset(clean_env): + clean_env.setenv("NVIDIA_API_KEY", " ") + assert nvidia_key() == "" # whitespace stripped to empty + clean_env.setenv("NVIDIA_API_KEY", "nvapi-real") + assert nvidia_key() == "nvapi-real" + + +def test_resolved_provider_autodetects_from_single_key(clean_env): + clean_env.setenv("NVIDIA_API_KEY", "nvapi-x") + assert resolved_provider() == "nim" # NVIDIA-only env → nim, no switch needed + clean_env.delenv("NVIDIA_API_KEY") + clean_env.setenv("GEMINI_API_KEY", "g-x") + assert resolved_provider() == "gemini" + + +def test_explicit_provider_always_wins(clean_env): + clean_env.setenv("APS_MODEL_PROVIDER", "gemini") + clean_env.setenv("NVIDIA_API_KEY", "nvapi-x") # only NVIDIA key present + assert resolved_provider() == "gemini" # but explicit setting wins (a real misconfig) + assert key_mismatch() is not None # ...and is surfaced loudly + assert "NVIDIA key IS set" in key_mismatch() + + +def test_nim_factory_raises_without_key_no_placeholder(clean_env): + clean_env.setenv("APS_MODEL_PROVIDER", "nim") + # no NVIDIA_API_KEY → must raise, never construct a client with a bogus "placeholder" + with pytest.raises(RuntimeError, match="NVIDIA_API_KEY"): + get_chat_model() + + +def test_has_llm_key_respects_empty(clean_env): + clean_env.setenv("APS_MODEL_PROVIDER", "nim") + clean_env.setenv("NVIDIA_API_KEY", "") + assert has_llm_key() is False + clean_env.setenv("NVIDIA_API_KEY", 
"nvapi-real") + assert has_llm_key() is True + + +def test_key_mismatch_specific_remedy(clean_env): + clean_env.setenv("APS_MODEL_PROVIDER", "nim") + clean_env.setenv("GEMINI_API_KEY", "g-x") # only a Gemini key, but provider=nim + msg = key_mismatch() + assert msg and "NVIDIA_API_KEY" in msg and "APS_MODEL_PROVIDER=gemini" in msg + + +def test_describe_runtime_never_leaks_key(clean_env): + clean_env.setenv("APS_MODEL_PROVIDER", "nim") + clean_env.setenv("NVIDIA_API_KEY", "nvapi-secret") + rt = describe_runtime() + assert "provider=nim" in rt and "key=present" in rt + assert "nvapi-secret" not in rt # presence only, never the value + + +def test_stub_research_records_reason(): + r = stub_research("a habit tracker", reason="no_llm_key") + assert r.degraded is True + assert r.degrade_reason == "no_llm_key" + assert "no_llm_key" in r.evidence[0].snippet # self-diagnosing artifact + + +def test_degrade_reason_roundtrips_through_json(): + r = ResearchReturn(idea="x", degraded=True, degrade_reason="llm_auth_401") + again = ResearchReturn.model_validate_json(r.model_dump_json()) + assert again.degrade_reason == "llm_auth_401" diff --git a/tests/unit/test_providers.py b/tests/unit/test_providers.py new file mode 100644 index 0000000..43b415b --- /dev/null +++ b/tests/unit/test_providers.py @@ -0,0 +1,112 @@ +"""Multi-provider registry + chain resolution (multipleAPIplan P1) — offline, deterministic.""" +from __future__ import annotations + +import pytest + +from aps.config.providers import REGISTRY, DEFAULT_CHAIN, provider_keys, provider_available, \ + resolved_provider_chain + +# env vars the tests touch — cleared before each test so the host env can't leak in +_KEY_VARS = [v for spec in REGISTRY.values() for v in spec.env_keys] + \ + [f"{v}_2" for spec in REGISTRY.values() for v in spec.env_keys] + \ + ["APS_PROVIDER_CHAIN", "APS_MODEL_PROVIDER", "APS_ENABLE_OLLAMA"] + + +@pytest.fixture(autouse=True) +def _clean_env(monkeypatch): + for v in _KEY_VARS: + 
monkeypatch.delenv(v, raising=False) + + +# ── registry integrity ─────────────────────────────────────────────────────── +def test_registry_specs_are_well_formed(): + assert {"gemini", "nim", "groq", "cerebras", "openrouter"} <= set(REGISTRY) + for name, spec in REGISTRY.items(): + assert spec.name == name + assert spec.kind in ("openai", "gemini", "anthropic") + assert spec.default_model + if spec.kind == "openai": + assert spec.base_url, f"{name}: openai-kind needs a base_url" + if not spec.keyless: + assert spec.env_keys, f"{name}: needs env_keys unless keyless" + + +def test_default_chain_is_known(): + assert all(n in REGISTRY for n in DEFAULT_CHAIN) + + +def test_registry_matches_settings_for_existing_providers(): + # drift guard: gemini/nim defaults mirror config.settings + from aps.config.settings import get_settings + s = get_settings() + assert REGISTRY["gemini"].default_model == s.gemini_model + assert REGISTRY["nim"].default_model == s.nim_model + assert REGISTRY["nim"].base_url == s.nim_base_url + + +# ── key resolution + rotation ──────────────────────────────────────────────── +def test_provider_keys_collects_and_rotates(monkeypatch): + monkeypatch.setenv("GROQ_API_KEY", "k1") + monkeypatch.setenv("GROQ_API_KEY_2", "k2") + assert provider_keys("groq") == ["k1", "k2"] + + +def test_provider_keys_dedupes_and_trims(monkeypatch): + monkeypatch.setenv("GROQ_API_KEY", " k1 ") + monkeypatch.setenv("GROQ_API_KEY_2", "k1") # duplicate value + assert provider_keys("groq") == ["k1"] + + +def test_provider_keys_empty_without_env(): + assert provider_keys("groq") == [] + assert provider_keys("not_a_provider") == [] + + +def test_gemini_accepts_either_key(monkeypatch): + monkeypatch.setenv("GOOGLE_API_KEY", "g") + assert provider_keys("gemini") == ["g"] + + +# ── availability ───────────────────────────────────────────────────────────── +def test_available_iff_key_present(monkeypatch): + assert provider_available("groq") is False + 
monkeypatch.setenv("GROQ_API_KEY", "k") + assert provider_available("groq") is True + + +def test_keyless_ollama_needs_explicit_optin(monkeypatch): + assert provider_available("ollama") is False + monkeypatch.setenv("APS_ENABLE_OLLAMA", "true") + assert provider_available("ollama") is True + + +# ── chain resolution ───────────────────────────────────────────────────────── +def test_explicit_chain_parsed_and_filtered(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq, gemini , nim") + monkeypatch.setenv("GROQ_API_KEY", "k") + monkeypatch.setenv("GEMINI_API_KEY", "k") + # nim has no key → dropped; order preserved + assert resolved_provider_chain() == ["groq", "gemini"] + + +def test_unknown_names_dropped_and_deduped(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "groq,bogus,groq") + monkeypatch.setenv("GROQ_API_KEY", "k") + assert resolved_provider_chain() == ["groq"] + + +def test_back_compat_single_provider(monkeypatch): + monkeypatch.setenv("APS_MODEL_PROVIDER", "nim") + monkeypatch.setenv("NVIDIA_API_KEY", "k") + assert resolved_provider_chain() == ["nim"] + + +def test_default_chain_when_unset_filtered_to_available(monkeypatch): + monkeypatch.setenv("CEREBRAS_API_KEY", "k") + monkeypatch.setenv("GEMINI_API_KEY", "k") + # DEFAULT_CHAIN = groq,cerebras,gemini,nim,openrouter → only the two with keys, in order + assert resolved_provider_chain() == ["cerebras", "gemini"] + + +def test_empty_chain_when_no_keys(): + assert resolved_provider_chain() == [] # hermetic env → degrades (back-compat) diff --git a/tests/unit/test_query_planning.py b/tests/unit/test_query_planning.py new file mode 100644 index 0000000..f5daaf6 --- /dev/null +++ b/tests/unit/test_query_planning.py @@ -0,0 +1,94 @@ +"""Intent-based query planning — idea-anchored search phrases + sharp fan-out sub-questions. 
+ +Under pytest there's no LLM key, so `plan_queries`/`plan_subtopics` exercise their DETERMINISTIC +fallbacks — which is exactly what must carry the "ask on-topic questions" behavior. These tests +pin the fallback paths (idea-anchored, deduped, deterministic) and the keyless wiring. +""" +from __future__ import annotations + +from aps.agents.research import supervisor as sup +from aps.agents.research import keyless as kl +from aps.config.settings import get_settings + +IDEA = "Private Activity Tracker" + + +def test_plan_queries_fallback_is_idea_anchored_and_deduped(): + qs = sup.plan_queries(IDEA) + assert len(qs) >= 5 + assert len(qs) == len({q.lower() for q in qs}) # deduped + assert all("activity" in q.lower() or "tracker" in q.lower() for q in qs) # anchored to idea + assert qs == sup.plan_queries(IDEA) # deterministic + + +def test_plan_queries_respects_count(): + assert len(sup.plan_queries(IDEA, n=3)) <= 3 + + +def test_fallback_subtopics_name_the_idea_not_a_bare_category(): + subs = sup._fallback_subtopics(IDEA, 3) + assert len(subs) == 3 + # every sub-question names the idea (sharp), not the old generic category labels + assert all("activity tracker" in s.lower() for s in subs) + assert subs != sup._GENERIC_SUBTOPICS[:3] + assert "user pain points & complaints with existing solutions" not in subs + + +def test_flag_off_restores_generic_subtopics(monkeypatch): + get_settings.cache_clear() + monkeypatch.setenv("APS_ENABLE_QUERY_PLANNING", "false") + try: + assert sup._fallback_subtopics(IDEA, 3) == sup._GENERIC_SUBTOPICS[:3] + finally: + get_settings.cache_clear() + + +def test_plan_subtopics_uses_idea_anchored_fallback_without_key(): + # no key under pytest → plan_subtopics returns the idea-anchored fallback + subs = sup.plan_subtopics(IDEA, k=3) + assert subs and all("activity tracker" in s.lower() for s in subs) + + +def test_keyless_issues_planned_phrases_across_tools(monkeypatch): + # capture the query= each no-key tool is asked; assert it's the 
idea-anchored phrase set, + # not a single raw-idea query. + get_settings.cache_clear() + seen_queries: list[str] = [] + + class _Res: + ok = True + evidence: list = [] + + class _Tool: + def run(self, *, query, **extra): + seen_queries.append(query) + return _Res() + + import importlib + monkeypatch.setattr(importlib, "import_module", lambda _p: type("M", (), {"TOOL": _Tool()})) + monkeypatch.setattr(kl, "_compress", lambda idea, ev: ("compressed", idea, ev)[0]) + + kl.keyless_research(IDEA) + try: + assert len(set(seen_queries)) >= 2 # multiple distinct planned phrases + assert any("activity" in q.lower() or "tracker" in q.lower() for q in seen_queries) + assert seen_queries != [IDEA] # not just the bare idea + finally: + get_settings.cache_clear() + + +def test_keyless_flag_off_uses_single_token_query(monkeypatch): + get_settings.cache_clear() + monkeypatch.setenv("APS_ENABLE_QUERY_PLANNING", "false") + try: + qs = kl._keyless_queries(IDEA) + assert len(qs) == 1 and "activity" in qs[0].lower() # the prior single token-query path + finally: + get_settings.cache_clear() + + +def test_gather_evidence_accepts_seed_queries(): + # signature/contract check: seed_queries is an accepted keyword (the single-unit path passes it) + import inspect + from aps.agents.research.agent import gather_evidence + assert "seed_queries" in inspect.signature(gather_evidence).parameters diff --git a/tests/unit/test_registry.py b/tests/unit/test_registry.py new file mode 100644 index 0000000..039edd0 --- /dev/null +++ b/tests/unit/test_registry.py @@ -0,0 +1,65 @@ +"""Registry & Req-1 invariants: exactly 69 model-callable tools, cleanly scoped. + +(52 core + Launch Studio: 4 brand (P1) + 5 legal (P2) + 3 funding (P3) + 2 availability (P4) ++ 2 compliance (P5); +1 analysis score_evidence_relevance for the research relevance gate.) 
+""" +from __future__ import annotations + +import pytest + +from aps.tools.registry import load_registry, all_tools, tools_for +from aps.state.models import ToolResult + +EXPECTED = { + "retrieval": 20, "analysis": 11, "product": 6, "architecture": 6, + "execution": 6, "presentation": 4, "brand": 4, "legal": 5, "funding": 3, + "availability": 2, "compliance": 2, +} + + +def test_total_is_69(): + assert len(all_tools()) == 69 + + +def test_namespace_counts(): + reg = load_registry() + assert {k: len(v) for k, v in reg.items()} == EXPECTED + + +def test_no_duplicate_tool_names(): + names = [t.name for t in all_tools()] + assert len(names) == len(set(names)), "tool names must be globally unique" + + +@pytest.mark.parametrize("tool", all_tools(), ids=[t.name for t in all_tools()]) +def test_every_tool_is_model_grade(tool): + # snake_case name, a real description the model reads, a typed args schema, namespace + assert tool.name and tool.name == tool.name.lower() + assert tool.namespace in EXPECTED + desc = (tool.description or "").strip() + assert len(desc) >= 30 and "TODO" not in desc, f"{tool.name}: weak description" + assert hasattr(tool.args_schema, "model_fields"), f"{tool.name}: args_schema not a model" + + +def test_scoping_returns_only_namespace(): + for ns in EXPECTED: + assert all(t.namespace == ns for t in tools_for(ns)) + + +def test_no_agent_sees_more_than_20_tools(): + # ADR-0005: per-agent scoping keeps selection coherent. 
+ for ns in EXPECTED: + assert len(tools_for(ns)) <= 20 + + +def test_run_returns_toolresult_type(): + # contract: every tool's run() yields a ToolResult (sample one per namespace) + for ns in EXPECTED: + tool = tools_for(ns)[0] + # build empty/default args where possible; tools tolerate empties by design + try: + out = tool.run() + except TypeError: + out = None # required args — covered in per-namespace tests + if out is not None: + assert isinstance(out, ToolResult) diff --git a/tests/unit/test_relevance_eval.py b/tests/unit/test_relevance_eval.py new file mode 100644 index 0000000..d3cbd4d --- /dev/null +++ b/tests/unit/test_relevance_eval.py @@ -0,0 +1,79 @@ +"""Phase 5 — lock the research-quality work with an eval that runs in CI (hermetic). + +Three guards so the relevance gate / pain validation / feature synthesis can never silently +regress: (E12) on-topic evidence stays >= 0.8, (E13) known junk fixtures are all rejected, and +(E14) no PRD feature title is a raw fragment. +""" +from __future__ import annotations + +import importlib.util +import json +from pathlib import Path + +from aps.state.models import Evidence, PainPoint, Severity, PRD, Feature, Persona +from aps.agents.research.agent import _compress +from aps.agents.product.agent import run_product +from aps.agents.research.stub import stub_research + +# scorers.py lives under tests/evals (not an importable package) — load by path. 
+_SPEC = importlib.util.spec_from_file_location( + "aps_eval_scorers", Path(__file__).resolve().parents[1] / "evals" / "scorers.py") +scorers = importlib.util.module_from_spec(_SPEC) +_SPEC.loader.exec_module(scorers) + +_FIX = json.loads((Path(__file__).resolve().parents[1] / "evals" / "fixtures" / "offtopic.json").read_text()) + + +# ── E13: off-topic rejection — the headline guard ──────────────────────────── +def test_all_known_junk_is_rejected(): + rate = scorers.off_topic_rejection_rate(_FIX["idea"], _FIX["junk"]) + assert rate == 1.0, f"junk leaked through the gate: rejection rate {rate}" + + +def test_relevant_fixtures_score_above_threshold(): + rate = scorers.evidence_relevance_rate(_FIX["idea"], _FIX["relevant"]) + assert rate >= 0.8, f"on-topic evidence relevance rate too low: {rate}" + + +def test_gate_drops_junk_from_pains_end_to_end(): + # mix junk + a real complaint through the real compression gate → no junk in pains + evidence = [Evidence(url=f"https://x/{i}", **j) for i, j in enumerate(_FIX["junk"])] + evidence.append(Evidence(source="reddit", url="https://r/1", title="rant", + snippet="the activity tracker is broken and keeps crashing on sync")) + research = _compress(_FIX["idea"], evidence) + joined = " ".join(p.text.lower() for p in research.pain_points) + for bad in ("stake", "bonus", "sales", "freelance", "sun position", "youtube"): + assert bad not in joined, f"junk term {bad!r} reached the pains: {joined!r}" + + +# ── E12: evidence relevance rate ───────────────────────────────────────────── +def test_relevance_rate_is_high_for_clean_set_low_for_dirty(): + clean = scorers.evidence_relevance_rate(_FIX["idea"], _FIX["relevant"]) + dirty = scorers.evidence_relevance_rate(_FIX["idea"], _FIX["junk"]) + assert clean >= 0.8 and dirty <= 0.2 and clean > dirty + + +# ── E14: feature-title sanity ──────────────────────────────────────────────── +def _prd_with_titles(titles): + return PRD(idea="x", personas=[Persona(name="P", role="r")], + 
features=[Feature(title=t, description="d", priority="Should") for t in titles], + requirements=["r"], mvp_scope="m") + + +def test_feature_titles_clean_flags_fragments(): + assert scorers.feature_titles_clean(_prd_with_titles(["Resume Parser", "Export"])) is True + for bad in ["However about a week", "When following a Google", "Maintainer]", + "Implement: bulk delete", "Feature request: offline mode", "API that gives..."]: + assert scorers.feature_titles_clean(_prd_with_titles([bad])) is False, bad + + +def test_real_product_agent_yields_clean_titles(): + # the actual pipeline (stub research → product agent) must produce only clean feature titles + research = stub_research("a privacy-first activity tracker") + research.pain_points = [ + PainPoint(text="However the activity tracker keeps crashing", severity=Severity.HIGH), + PainPoint(text="no way to self-host the data", severity=Severity.MED), + ] + prd = run_product(research) + assert prd.features, "expected synthesized features" + assert scorers.feature_titles_clean(prd), [f.title for f in prd.features] diff --git a/tests/unit/test_relevance_judge.py b/tests/unit/test_relevance_judge.py new file mode 100644 index 0000000..59dea3a --- /dev/null +++ b/tests/unit/test_relevance_judge.py @@ -0,0 +1,77 @@ +"""Phase 3 — the language-level relevance judge (research/_relevance.judge). + +The deterministic lexical gate can't disambiguate word senses (a particle-physics "tracker" paper +shares the word with an activity-tracker app). The LLM judge is the second pass that discards such +false-positives and rescues borderline true-positives. It is gated hard (enabled + key + not-pytest), +so under pytest it must be a NO-OP — these tests pin that, plus the keep/discard behavior with the +model call monkeypatched. 
+""" +from __future__ import annotations + +from types import SimpleNamespace + +from aps.agents.research import _relevance as rel +from aps.state.models import Evidence + + +def _ev(title, score): + e = Evidence(source="web", url=f"https://x/{title}", title=title, snippet=title + " details") + e.relevance = score + return e + + +def test_judge_is_noop_under_pytest_even_when_enabled(): + # enabled flag on, but "pytest" in sys.modules ⇒ deterministic set returned unchanged (hermetic) + s = SimpleNamespace(enable_relevance_llm=True) + det = [_ev("on-topic", 0.6)] + assert rel.judge("idea", det, det, s, min_score=0.15) is det + + +def test_judge_disabled_returns_deterministic_set(): + s = SimpleNamespace(enable_relevance_llm=False) + det = [_ev("a", 0.5), _ev("b", 0.4)] + assert rel.judge("idea", det, det, s, min_score=0.15) == det + + +def test_judge_discards_and_rescues_when_active(monkeypatch): + # force the gate open and stub the model so no network/key is needed + monkeypatch.setattr(rel, "_enabled", lambda settings: True) + + on = _ev("Activity tracker privacy leak", 0.6) # det-relevant, truly on-topic + false_pos = _ev("CMS Strip Tracker physics paper", 0.4) # det-relevant but off-topic (word sense) + borderline = _ev("self-hosted activity logger", 0.10) # below cutoff → candidate for rescue + det_relevant = [on, false_pos] + all_ev = [on, false_pos, borderline] + + # the model keeps #1 (on) and #3 (borderline rescue), drops #2 (physics false-positive) + class _Resp: + content = "1, 3" + + # judge imports these lazily from their home modules — patch there, not on `rel` + import aps.config.settings as settings + import aps.infra.llm as llm + monkeypatch.setattr(settings, "get_chat_model", + lambda **k: SimpleNamespace(invoke=lambda msgs: _Resp()), raising=False) + monkeypatch.setattr(llm, "acquire_llm", lambda *a, **k: None, raising=False) + + out = rel.judge("Private Activity Tracker", all_ev, det_relevant, SimpleNamespace(), min_score=0.15) + titles = {e.title 
for e in out} + assert "Activity tracker privacy leak" in titles # kept + assert "self-hosted activity logger" in titles # rescued from borderline + assert "CMS Strip Tracker physics paper" not in titles # discarded false-positive + + +def test_judge_empty_verdict_falls_back_to_deterministic(monkeypatch): + monkeypatch.setattr(rel, "_enabled", lambda settings: True) + det = [_ev("on-topic", 0.6)] + + class _Resp: + content = "none" + + import aps.config.settings as settings + import aps.infra.llm as llm + monkeypatch.setattr(settings, "get_chat_model", + lambda **k: SimpleNamespace(invoke=lambda m: _Resp()), raising=False) + monkeypatch.setattr(llm, "acquire_llm", lambda *a, **k: None, raising=False) + # a 'none'/garbage verdict must NOT zero out the brief — fall back to the deterministic set + assert rel.judge("idea", det, det, SimpleNamespace(), min_score=0.15) == det diff --git a/tests/unit/test_render.py b/tests/unit/test_render.py new file mode 100644 index 0000000..e7a8961 --- /dev/null +++ b/tests/unit/test_render.py @@ -0,0 +1,177 @@ +"""Renderer layer (plan.md W1): completeness, empty-input, citation integrity, determinism.""" +from __future__ import annotations + +import pytest + +from aps.state.models import ( + ResearchReturn, PRD, TRD, ExecutionPlan, PitchPackage, + Evidence, Competitor, PainPoint, Persona, Feature, Severity, +) +from aps.render import render_artifact, base +from aps.render import research_md, prd_md, trd_md, execution_md, pitch_md + + +# ── fixtures ──────────────────────────────────────────────────────────────── +def _evidence(): + return [ + Evidence(source="github", url="https://github.com/x/y/issues/1", + title="Parser drops PDFs", snippet="the resume parser drops valid pdf files"), + Evidence(source="reddit", url="https://reddit.com/r/x/2", + title="ranking complaint", snippet="keyword ranking misses good candidates"), + ] + + +def _research(): + ev = _evidence() + return ResearchReturn( + idea="AI resume screening", + 
market_size="TAM ~$3B (cited at https://x.com/report)", + competitors=[Competitor(name="Acme", url="https://acme.io", + features=["PDF export", "Slack"], pricing="$49/mo", notes="incumbent")], + pain_points=[PainPoint(text="parser drops PDFs", severity=Severity.HIGH, + source_evidence=[ev[0]])], + evidence=ev, + ) + + +def _prd(): + ev = _evidence() + return PRD( + idea="AI resume screening", + personas=[Persona(name="Recruiter", role="recruiter", + goals=["screen faster"], frustrations=["parser drops PDFs"])], + features=[Feature(title="Reliable PDF parsing", description="handle pdf resumes", + priority="Must")], + mvp_scope="MVP: reliable parsing.", + requirements=["[Must] Reliable PDF parsing: handle pdf resumes", "Keyword ranking quality"], + sources=ev, + ) + + +def _trd(): + return TRD( + data_model={"entities": {"User": {"fields": {"id": "uuid", "email": "string"}}, + "Resume": {"fields": {"id": "uuid", "score": "float"}}}, + "architecture": {"components": ["API gateway", "worker"], + "data_flow": ["client → api → db"]}}, + api_spec={"openapi": "3.0.3", "info": {"title": "X API", "version": "1.0.0"}, + "paths": {"/resumes": {"get": {"summary": "List Resumes"}, + "post": {"summary": "Create Resume"}}}, + "components": {"schemas": {}}}, + stack=["Backend: FastAPI", "DB: PostgreSQL"], + scale_estimate="10k-100k users; p95 < 300ms", + ) + + +def _execution(): + return ExecutionPlan( + repo_plan={"dirs": ["backend/app", "frontend/src"], "key_files": ["README.md"]}, + backlog=[{"id": "APS-001", "title": "Parse PDFs", "type": "story", + "priority": "Must", "points": 5}], + sprints=[{"sprint": 1, "items": [{"title": "Parse PDFs"}], "points": 5}], + roadmap="MVP then Beta.", + infra_cost="~$235/mo", + ) + + +def _pitch(): + return PitchPackage(pitch_outline="1. Problem\n5. 
Ask", + demo_script="Demo steps", + investor_memo="INVESTOR MEMO\n\n---\nJUDGE BRIEF") + + +# ── completeness: every field's content appears in the output ─────────────── +def test_research_render_is_complete(): + r = _research() + md = research_md.render(r) + assert r.market_size in md + assert "Acme" in md and "$49/mo" in md + for e in r.evidence: + assert e.url in md # citation integrity: every evidence URL linked + assert "HIGH" in md # severity badge + + +def test_prd_render_is_complete_with_citations(): + p = _prd() + md = prd_md.render(p) + assert "Recruiter" in md + assert "Reliable PDF parsing" in md and "[Must]" in md + assert p.mvp_scope in md + # requirement citations: the PDF requirement overlaps the github source → linked + assert "github.com/x/y/issues/1" in md + + +def test_trd_render_has_tables_and_spec(): + md = trd_md.render(_trd()) + assert "FastAPI" in md and "PostgreSQL" in md + assert "User" in md and "Resume" in md # entity tables + assert "/resumes" in md and "GET" in md # endpoint summary + assert "```json" in md and "openapi" in md # fenced spec + + +def test_execution_render_is_complete(): + md = execution_md.render(_execution()) + assert "APS-001" in md and "Parse PDFs" in md + assert "Sprint 1" in md and "~$235/mo" in md + + +def test_pitch_render_sections(): + md = pitch_md.render(_pitch()) + assert "Pitch Outline" in md and "Demo Script" in md and "Investor Memo" in md + assert "JUDGE BRIEF" in md + + +# ── empty / degenerate input: graceful, no exception, no literal None/null ── +@pytest.mark.parametrize("name,obj", [ + ("research", ResearchReturn(idea="x")), + ("prd", PRD(idea="x")), + ("trd", TRD()), + ("execution", ExecutionPlan()), + ("pitch", PitchPackage()), +]) +def test_empty_artifacts_render_gracefully(name, obj): + md = render_artifact(name, obj) + assert md and base.PLACEHOLDER in md + # no raw None/null leaking into the document + assert "None" not in md + assert ": null" not in md.lower() + + +def 
test_degraded_research_is_flagged(): + r = _research() + r.degraded = True + assert "Degraded run" in research_md.render(r) + + +# ── determinism: render twice → byte-identical ────────────────────────────── +@pytest.mark.parametrize("name,factory", [ + ("research", _research), ("prd", _prd), ("trd", _trd), + ("execution", _execution), ("pitch", _pitch), +]) +def test_render_is_deterministic(name, factory): + obj = factory() + assert render_artifact(name, obj) == render_artifact(name, obj) + + +# ── registry: dict (artifact-store JSON) renders identically to the model ──── +def test_render_artifact_accepts_dict(): + p = _prd() + assert render_artifact("prd", p.model_dump()) == render_artifact("prd", p) + + +def test_render_artifact_unknown_name_raises(): + with pytest.raises(KeyError): + render_artifact("bogus", {}) + + +# ── base helpers ──────────────────────────────────────────────────────────── +def test_evidence_link_graceful_without_url(): + e = Evidence(source="hn", url="", title="t", snippet="s") + assert base.evidence_link(e) == "hn · t" # no broken link + assert base.citation_refs([]) == base.PLACEHOLDER + + +def test_table_escapes_pipes_and_handles_empty(): + assert base.table(["A"], []) .strip() == base.PLACEHOLDER + t = base.table(["A"], [["x|y"]]) + assert "x\\|y" in t diff --git a/tests/unit/test_research_loop.py b/tests/unit/test_research_loop.py new file mode 100644 index 0000000..0c41736 --- /dev/null +++ b/tests/unit/test_research_loop.py @@ -0,0 +1,100 @@ +"""W2 — research tool-loop: Gemini-safe binding, real tool execution, key-gated live check. + +Offline and hermetic: a fake model scripts tool calls; key-gated tools take their fixture +path (no network). The live test is skipped unless an LLM key is present. 
+""" +from __future__ import annotations + +import os + +import pytest +from langchain_core.messages import AIMessage + +import aps.agents.research.agent as R +from aps.tools.registry import tools_for + +# JSON-schema primitive types Gemini's function-calling reliably accepts. +_SIMPLE = {"string", "integer", "number", "boolean", "null"} + + +def _is_gemini_safe(schema: dict) -> bool: + """A tool arg schema is Gemini-safe if it's flat: no nested model ($defs/$ref) and + every property is a primitive or an array of primitives (optionally wrapped in anyOf).""" + if "$defs" in schema or "$ref" in str(schema): + return False + for prop in schema.get("properties", {}).values(): + t = prop.get("type") + if t == "array": + if (prop.get("items") or {}).get("type") not in _SIMPLE: + return False + elif t in _SIMPLE: + continue + elif "anyOf" in prop: # Optional[...] -> anyOf of simple types + if not all(o.get("type") in _SIMPLE or o.get("type") == "array" + for o in prop["anyOf"]): + return False + else: + return False + return True + + +@pytest.mark.parametrize("tool", tools_for("retrieval"), + ids=[t.name for t in tools_for("retrieval")]) +def test_retrieval_tool_schemas_are_gemini_safe(tool): + # the model only ever SELECTS retrieval tools, so these must be Gemini-compatible + assert _is_gemini_safe(tool.args_schema.model_json_schema()), tool.name + + +def test_analysis_tools_are_not_model_bound(): + # analysis tools carry nested list[Evidence] schemas (not Gemini-safe) — which is exactly + # why the research loop binds retrieval ONLY and runs analysis in _compress (W2). 
+ from aps.tools.analysis import extract_pain_points as pp + assert not _is_gemini_safe(pp.TOOL.args_schema.model_json_schema()) + + +class _FakeBound: + def __init__(self, scripts): + self.scripts = scripts + self.i = 0 + + def invoke(self, messages): + msg = self.scripts[min(self.i, len(self.scripts) - 1)] + self.i += 1 + return msg + + +class _FakeModel: + def __init__(self, scripts): + self.scripts = scripts + + def bind_tools(self, lc_tools): + return _FakeBound(self.scripts) + + +def test_loop_executes_selected_tools_and_collects_evidence(monkeypatch): + # no keys -> github/web take their fixture path (no network); fully hermetic + monkeypatch.delenv("APS_GITHUB_PAT", raising=False) + monkeypatch.delenv("TAVILY_API_KEY", raising=False) + monkeypatch.setattr(R, "acquire_llm", lambda *a, **k: 0.0) + scripts = [ + AIMessage(content="", tool_calls=[ + {"name": "github_list_issues", "args": {"repo": "x/y"}, "id": "c1"}, + {"name": "web_search", "args": {"query": "demand"}, "id": "c2"}, + ]), + AIMessage(content="done", tool_calls=[]), + ] + monkeypatch.setattr(R, "get_chat_model", lambda *a, **k: _FakeModel(scripts)) + ev, n_calls = R.gather_evidence("a privacy-first habit tracker") + assert ev, "loop must collect evidence from the tools the model selected" + assert {e.source for e in ev} # real Evidence objects with sources + assert n_calls >= 1 # tool-call counter reflects the tools the model selected + + +@pytest.mark.live +@pytest.mark.skipif( + not (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or os.getenv("NVIDIA_API_KEY")), + reason="no LLM key — live tool-selection check (W2) requires GEMINI_API_KEY or NVIDIA_API_KEY", +) +def test_live_research_selects_tools_and_gathers_evidence(): + ev, _ = R.gather_evidence("a privacy-first habit tracker app") + assert ev, "live model must select tools and gather real evidence" diff --git a/tests/unit/test_research_mode.py b/tests/unit/test_research_mode.py new file mode 100644 index 0000000..1b16451 --- 
/dev/null +++ b/tests/unit/test_research_mode.py @@ -0,0 +1,22 @@ +"""Research depth knob (plan 1.7): fast vs deep scaling of fan-out + tool budget.""" +from __future__ import annotations + +from aps.config.settings import Settings + + +def test_fast_mode_uses_base_limits(): + s = Settings(research_mode="fast") + assert s.research_units() == s.max_concurrent_researchers + assert s.tool_budget() == s.max_tool_calls_per_agent + + +def test_deep_mode_widens_fanout_and_budget(): + s = Settings(research_mode="deep") + assert s.research_units() == s.deep_concurrent_researchers + assert s.tool_budget() == s.deep_tool_calls_per_agent + assert s.research_units() > s.max_concurrent_researchers + assert s.tool_budget() > s.max_tool_calls_per_agent + + +def test_default_is_fast(): + assert Settings().research_mode == "fast" diff --git a/tests/unit/test_retrieval_tools.py b/tests/unit/test_retrieval_tools.py new file mode 100644 index 0000000..b0e638e --- /dev/null +++ b/tests/unit/test_retrieval_tools.py @@ -0,0 +1,48 @@ +"""Retrieval tools: structural checks over all 20 + offline fixture-path for key-gated ones. + +We do NOT make live calls here. Tools that need a key (github, web_search) take the +fixture-fallback path with no key set; no-key tools are checked structurally only, so the +suite stays offline and deterministic. 
+""" +from __future__ import annotations + +import pytest + +from aps.tools.registry import tools_for +from aps.state.models import ToolResult, Evidence + +RETRIEVAL = tools_for("retrieval") + + +@pytest.mark.parametrize("tool", RETRIEVAL, ids=[t.name for t in RETRIEVAL]) +def test_retrieval_tool_shape(tool): + assert tool.namespace == "retrieval" + fields = tool.args_schema.model_fields + assert fields, f"{tool.name}: must declare typed args" + + +def test_github_issues_fixture_path(monkeypatch): + from aps.tools.retrieval import github_issues as gi + monkeypatch.delenv("APS_GITHUB_PAT", raising=False) + out = gi.TOOL.run(repo="langchain-ai/langgraph") + assert isinstance(out, ToolResult) + assert out.ok and out.evidence + assert all(isinstance(e, Evidence) for e in out.evidence) + assert out.evidence[0].source == "github" + + +def test_web_search_fixture_path(monkeypatch): + from aps.tools.retrieval import web_search as ws + monkeypatch.delenv("TAVILY_API_KEY", raising=False) + out = ws.TOOL.run(query="resume screening market size") + assert isinstance(out, ToolResult) + assert out.ok and out.evidence + assert out.evidence[0].url.startswith("http") + + +def test_bad_args_return_typed_error_not_crash(): + from aps.tools.retrieval import github_issues as gi + # missing required `repo` -> BaseTool turns the ValidationError into ok=False + out = gi.TOOL.run() + assert isinstance(out, ToolResult) + assert out.ok is False and out.error and out.error.startswith("bad_args") diff --git a/tests/unit/test_run_control.py b/tests/unit/test_run_control.py new file mode 100644 index 0000000..22bfe32 --- /dev/null +++ b/tests/unit/test_run_control.py @@ -0,0 +1,87 @@ +"""Concurrency / cancellation control plane (plan §2): cooperative cancel, deadline plumbing, +idempotency, and the cancel endpoints.""" +from __future__ import annotations + +import pytest +from fastapi.testclient import TestClient + +from aps.api.main import app +from aps.api import main as m +from 
aps.orchestrator import cancel +from aps.orchestrator.events import EventBus +from aps.orchestrator.graph import run_sync +from aps.state.models import RunStatus + +client = TestClient(app) +KEY = {"X-APS-Key": "dev-key"} + + +# ── cancellation primitive ──────────────────────────────────────────────────── +def test_checkpoint_raises_only_when_cancelled(): + assert cancel.is_cancelled() is False # no check installed + tok = cancel.set_check(lambda: True) + try: + assert cancel.is_cancelled() is True + with pytest.raises(cancel.RunCancelled): + cancel.checkpoint() + finally: + cancel.reset(tok) + assert cancel.is_cancelled() is False # reset restores "never cancelled" + + +def test_run_cancelled_settles_into_cancelled_terminal_state(): + bus = EventBus() + # should_cancel is already true → the run unwinds at the first stage boundary, no network. + st = run_sync("a privacy habit tracker", bus, run_id="cx1", should_cancel=lambda: True) + assert st.status == RunStatus.CANCELLED + types = [e.type for e in bus.history("cx1")] + assert "run_cancelled" in types and "run_complete" in types + + +# ── cancel signal store ──────────────────────────────────────────────────────── +def test_cancel_run_unknown_is_false(): + assert m.cancel_run("run_does_not_exist") is False + + +# ── idempotency (2.4) ──────────────────────────────────────────────────────────── +def test_submit_run_is_idempotent_while_in_flight(monkeypatch): + monkeypatch.setattr(m, "_ensure_workers", lambda: None) # don't drain → stays in-flight + r1 = r2 = None + try: + r1 = m.submit_run("dedup-idea-unique-7731", None) + r2 = m.submit_run("dedup-idea-unique-7731", None) + assert r1["run_id"] == r2["run_id"] # collapsed to one run + assert r1["status"] == RunStatus.QUEUED.value + finally: + # drain the parked queue item + clear state so other tests are unaffected + while not m._RUN_QUEUE.empty(): + m._RUN_QUEUE.get_nowait() + m._RUN_QUEUE.task_done() + if r1: + for store in (m._RUNS, m._BUSES, m._CANCEL): + 
store.pop(r1["run_id"], None) + m._IDEM.clear() + + +# ── cancel endpoints ────────────────────────────────────────────────────────── +def test_cancel_endpoint_404_for_unknown_run(): + r = client.post("/runs/run_nope42/cancel", headers=KEY) + assert r.status_code == 404 + + +def test_cancel_endpoint_accepts_known_run(monkeypatch): + monkeypatch.setattr(m, "_ensure_workers", lambda: None) + rec = None + try: + rec = m.submit_run("cancel-me-idea-9920", None) + r = client.post(f"/runs/{rec['run_id']}/cancel", headers=KEY) + assert r.status_code == 202 and r.json()["cancelling"] is True + assert m._CANCEL[rec["run_id"]].is_set() # cooperative flag tripped + finally: + while not m._RUN_QUEUE.empty(): + m._RUN_QUEUE.get_nowait() + m._RUN_QUEUE.task_done() + if rec: + for store in (m._RUNS, m._BUSES, m._CANCEL): + store.pop(rec["run_id"], None) + m._IDEM.clear() diff --git a/tests/unit/test_scorers.py b/tests/unit/test_scorers.py new file mode 100644 index 0000000..93d507a --- /dev/null +++ b/tests/unit/test_scorers.py @@ -0,0 +1,72 @@ +"""Eval scorers (tests/evals/scorers.py) — deterministic, run against real artifacts.""" +from __future__ import annotations + +import importlib.util +from pathlib import Path + +from aps.state.models import Evidence +from aps.agents.research.stub import stub_research +from aps.agents.product.agent import run_product + +# scorers.py lives under tests/evals (not importable as a package) — load by path. 
+_SPEC = importlib.util.spec_from_file_location( + "aps_eval_scorers", + Path(__file__).resolve().parents[1] / "evals" / "scorers.py", +) +scorers = importlib.util.module_from_spec(_SPEC) +_SPEC.loader.exec_module(scorers) + + +def _trace(): + return [ + {"tool": "github_list_issues", "namespace": "retrieval", + "evidence": [Evidence(source="github", url="https://github.com/x/1", + title="t", snippet="parser drops PDFs").model_dump()]}, + {"tool": "hn_search", "namespace": "retrieval", + "evidence": [Evidence(source="hackernews", url="https://h/2", + title="t", snippet="ranking misses people").model_dump()]}, + {"tool": "not_a_real_tool", "namespace": "retrieval", "evidence": []}, + ] + + +def test_selection_validity_counts_known_tools(): + # 2 of 3 calls are real registry tools + assert scorers.selection_validity(_trace()) == round(2 / 3, 3) + assert scorers.selection_validity([]) == 0.0 + + +def test_source_diversity_counts_distinct_sources(): + assert scorers.source_diversity(_trace()) == 2 # github + hackernews + + +def test_prd_schema_valid_true_for_real_prd(): + prd = run_product(stub_research("resume screening")) + assert scorers.prd_schema_valid(prd) is True + assert scorers.prd_schema_valid({"idea": ""}) is False + + +def test_evidence_coverage_in_unit_range(): + prd = run_product(stub_research("resume screening")) + cov = scorers.evidence_coverage(prd) + assert 0.0 <= cov <= 1.0 + + +def test_prd_feature_count_and_floor(): + from aps.state.models import PRD, Feature + prd = PRD(idea="x", requirements=["r"], + features=[Feature(title=f"f{i}", description="d") for i in range(3)]) + assert scorers.prd_feature_count(prd) == 3 + assert scorers.meets_feature_floor(prd) is True + assert scorers.meets_feature_floor(PRD(idea="x")) is False + # works on a plain dict too (artifact-store JSON) + assert scorers.prd_feature_count(prd.model_dump()) == 3 + + +def test_evidence_coverage_detects_overlap(): + from aps.state.models import PRD, Feature + prd = PRD(idea="x", + 
features=[Feature(title="resume parser fix", description="handle pdf")], + requirements=["r"], + sources=[Evidence(source="github", url="https://g/1", title="parser", + snippet="the resume parser drops pdf files")]) + assert scorers.evidence_coverage(prd) == 1.0 diff --git a/tests/unit/test_scoring_grounding.py b/tests/unit/test_scoring_grounding.py new file mode 100644 index 0000000..bc3f2ef --- /dev/null +++ b/tests/unit/test_scoring_grounding.py @@ -0,0 +1,84 @@ +"""Adversarial hardening: the Startup Score must not reward ABSENCE of evidence. + +Before this, an empty / degraded research brief scored ~7/10 "Promising — worth a focused MVP" +because Competitive Whitespace maxed at 10 (no competitors found) and Founder Velocity sat at 9 +(no features defined). A judge typing a nonsense idea would get an encouraging verdict. These +tests pin the grounding gate: thin/degraded evidence yields a low, honestly-captioned score. +""" +from __future__ import annotations + +from aps.state.models import ResearchReturn, Competitor, PainPoint, Feature, Evidence, Severity, PRD +from aps.scoring import score_startup + + +def _dim(s, name): + return next(d.score for d in s.dimensions if d.name == name) + + +def _empty(): + return ResearchReturn(idea="a vague idea with no research behind it") + + +def test_empty_research_is_not_promising(): + s = score_startup(_empty()) + assert s.overall <= 5.5 # not "Promising" (>=6.5) or "Strong" (>=8.0) + low = s.verdict.lower() + assert "build it" not in low and "promising" not in low + assert "evidence" in low # says WHY it's low + + +def test_whitespace_not_maxed_without_competitor_data(): + # no competitors found + thin evidence ⇒ "unknown", NOT maximum opportunity + s = score_startup(ResearchReturn(idea="x", evidence=[ + Evidence(source="reddit", url="https://r/1", title="t", snippet="s")])) + assert _dim(s, "Competitive Whitespace") < 8.0 + + +def test_well_researched_greenfield_beats_unresearched(): + # genuine greenfield (lots of 
evidence, still no competitors) should out-rank no-data + ev = [Evidence(source="hn", url=f"https://h/{i}", title="t", snippet="s") for i in range(20)] + researched = score_startup(ResearchReturn(idea="x", evidence=ev)) + unresearched = score_startup(ResearchReturn(idea="x")) + assert _dim(researched, "Competitive Whitespace") > _dim(unresearched, "Competitive Whitespace") + + +def test_no_prd_velocity_is_neutral_not_max(): + s = score_startup(_empty()) # no PRD ⇒ unscoped, not "ships fast" + assert _dim(s, "Founder Velocity") == 6.0 + + +def test_velocity_rewards_small_prd_over_no_prd(): + prd = PRD(idea="x", features=[Feature(title="one thing", description="d", priority="Must")]) + scoped = score_startup(_empty(), prd=prd) + assert _dim(scoped, "Founder Velocity") > 6.0 # a tight, defined scope beats "unknown" + + +def test_degraded_brief_caps_overall_even_with_rich_stub_data(): + # a DEGRADED run carries stub fixtures that LOOK rich — they must not earn a confident score + rich_stub = ResearchReturn( + idea="Build a B2B SaaS for X", + market_size="TAM ~$5B (cited)", + competitors=[Competitor(name="Acme", features=["a", "b"], pricing="$49/mo")], + pain_points=[PainPoint(text="p", severity=Severity.HIGH)], + evidence=[Evidence(source="github", url=f"https://g/{i}", title="t", snippet="s") + for i in range(5)], + degraded=True, + ) + s = score_startup(rich_stub) + assert s.overall <= 4.5 and s.grounded is False + assert "degraded" in s.verdict.lower() or "evidence" in s.verdict.lower() + + +def test_grounded_real_idea_can_still_score_well(): + # the gate must not punish a genuinely well-evidenced idea + s = score_startup(ResearchReturn( + idea="Build a B2B SaaS for resume screening", + market_size="TAM ~$3B (cited at https://x/report)", + competitors=[Competitor(name="Acme", features=["pdf export"], pricing="$49/mo")], + pain_points=[PainPoint(text="parser drops PDFs", severity=Severity.HIGH), + PainPoint(text="ranking misses people", severity=Severity.MED)], + 
evidence=[Evidence(source=s_, url=f"https://{s_}/1", title="t", snippet="s") + for s_ in ("github", "reddit", "hn", "ph")], + )) + assert s.overall >= 5.0 and s.grounded is True + assert "evidence" not in s.verdict.lower() # not the thin/degraded caption diff --git a/tests/unit/test_self_hosted.py b/tests/unit/test_self_hosted.py new file mode 100644 index 0000000..6d28b3d --- /dev/null +++ b/tests/unit/test_self_hosted.py @@ -0,0 +1,57 @@ +"""Self-hosted / local OpenAI-compatible providers (LM Studio · vLLM · LocalAI · llama.cpp).""" +from __future__ import annotations + +import pytest + +from aps.config.providers import REGISTRY, provider_available, resolved_provider_chain +from aps.config.failover import base_url_for + +_LOCAL = ("ollama", "lmstudio", "vllm", "localai", "llamacpp") +_ENV = [f"APS_ENABLE_{p.upper()}" for p in _LOCAL] + \ + [f"APS_{p.upper()}_BASE_URL" for p in _LOCAL] + ["APS_PROVIDER_CHAIN", "GROQ_API_KEY"] + + +@pytest.fixture(autouse=True) +def _clean(monkeypatch): + for v in _ENV: + monkeypatch.delenv(v, raising=False) + + +def test_local_providers_registered(): + for name in _LOCAL: + spec = REGISTRY[name] + assert spec.kind == "openai" and spec.keyless and spec.base_url.startswith("http://localhost") + + +def test_local_default_ports(): + assert REGISTRY["lmstudio"].base_url.endswith(":1234/v1") + assert REGISTRY["vllm"].base_url.endswith(":8000/v1") + assert REGISTRY["localai"].base_url.endswith(":8080/v1") + + +@pytest.mark.parametrize("name", _LOCAL) +def test_local_needs_explicit_optin(name, monkeypatch): + assert provider_available(name) is False + monkeypatch.setenv(f"APS_ENABLE_{name.upper()}", "true") + assert provider_available(name) is True + + +def test_base_url_override_per_machine(monkeypatch): + assert base_url_for(REGISTRY["vllm"]) == "http://localhost:8000/v1" # default + monkeypatch.setenv("APS_VLLM_BASE_URL", "http://192.168.1.50:8000/v1") + assert base_url_for(REGISTRY["vllm"]) == "http://192.168.1.50:8000/v1" + + +def 
test_local_provider_joins_the_chain(monkeypatch): + monkeypatch.setenv("APS_PROVIDER_CHAIN", "vllm,groq") + monkeypatch.setenv("APS_ENABLE_VLLM", "true") + monkeypatch.setenv("GROQ_API_KEY", "k") + assert resolved_provider_chain() == ["vllm", "groq"] # local first, cloud failover behind it + + +def test_build_failover_includes_local(monkeypatch): + from aps.config.failover import build_failover_model, FailoverChatModel + monkeypatch.setenv("APS_PROVIDER_CHAIN", "lmstudio") + monkeypatch.setenv("APS_ENABLE_LMSTUDIO", "true") + m = build_failover_model() + assert isinstance(m, FailoverChatModel) and m.providers == ["lmstudio"] diff --git a/tests/unit/test_startup_score.py b/tests/unit/test_startup_score.py new file mode 100644 index 0000000..2312432 --- /dev/null +++ b/tests/unit/test_startup_score.py @@ -0,0 +1,86 @@ +"""Startup Score (remaining.md T1.4): bounded, grounded, deterministic, explainable.""" +from __future__ import annotations + +from aps.state.models import ResearchReturn, Competitor, PainPoint, Feature, Evidence, Severity, PRD +from aps.scoring import score_startup, StartupScore +from aps.render import score_md + + +def _research(**kw): + base = dict( + idea="Build a B2B SaaS for resume screening", + market_size="TAM ~$3B (cited at https://x.com/report)", + competitors=[Competitor(name="Acme", features=["pdf export", "ranking"], pricing="$49/mo")], + pain_points=[PainPoint(text="parser drops PDFs", severity=Severity.HIGH), + PainPoint(text="ranking misses people", severity=Severity.MED)], + evidence=[Evidence(source="github", url="https://g/1", title="t", snippet="s"), + Evidence(source="reddit", url="https://r/2", title="t", snippet="s")], + ) + base.update(kw) + return ResearchReturn(**base) + + +def test_score_shape_and_bounds(): + s = score_startup(_research()) + assert isinstance(s, StartupScore) + assert {d.name for d in s.dimensions} == { + "Market Opportunity", "Competitive Whitespace", "Technical Feasibility", + "Monetization Potential", 
"Founder Velocity", + } + for d in s.dimensions: + assert 0.0 <= d.score <= 10.0 and d.rationale + assert 0.0 <= s.overall <= 10.0 + assert s.verdict + + +def test_more_competitors_lowers_whitespace(): + few = score_startup(_research(competitors=[Competitor(name="A")])) + many = score_startup(_research(competitors=[Competitor(name=f"C{i}", features=["x", "y"]) + for i in range(6)])) + + def ws(s): + return next(d.score for d in s.dimensions if d.name == "Competitive Whitespace") + assert ws(few) > ws(many) + + +def test_more_evidence_raises_market_opportunity(): + thin = score_startup(_research(evidence=[])) + rich = score_startup(_research(evidence=[Evidence(source="hn", url=f"https://h/{i}", + title="t", snippet="s") for i in range(20)])) + + def mo(s): + return next(d.score for d in s.dimensions if d.name == "Market Opportunity") + assert mo(rich) > mo(thin) + + +def test_verdict_thresholds_are_monotonic(): + # a strong idea outscores a weak one and earns a better verdict + strong = score_startup(_research()) + weak = score_startup(_research(market_size="", competitors=[Competitor(name=f"C{i}", + features=["a", "b", "c"]) for i in range(8)], pain_points=[], evidence=[])) + assert strong.overall > weak.overall + + +def test_deterministic(): + r = _research() + assert score_startup(r).model_dump() == score_startup(r).model_dump() + + +def test_degraded_research_flag_propagates(): + s = score_startup(_research(degraded=True)) + assert s.grounded is False + assert "degraded" in score_md.render(s).lower() + + +def test_prd_features_feed_feasibility_and_velocity(): + prd = PRD(idea="x", features=[Feature(title="realtime ML scoring", description="d", priority="Must"), + Feature(title="dashboard", description="d", priority="Should")]) + s = score_startup(_research(), prd=prd) + feas = next(d.score for d in s.dimensions if d.name == "Technical Feasibility") + assert feas < 9.0 # complex cues + features reduce feasibility + + +def test_score_md_renders_scorecard(): + md = 
score_md.render(score_startup(_research())) + assert "# Startup Score" in md and "Overall:" in md + assert "Market Opportunity" in md and "/ 10" in md diff --git a/tests/unit/test_story_and_stack_quality.py b/tests/unit/test_story_and_stack_quality.py new file mode 100644 index 0000000..e25e6d1 --- /dev/null +++ b/tests/unit/test_story_and_stack_quality.py @@ -0,0 +1,64 @@ +"""Adversarial hardening for user-story phrasing and tech-stack cue matching. + +- User stories must name a clean CAPABILITY ("I want bulk delete"), not quote a raw pain + ("I want to overcome 'no way to bulk delete'"). +- Tech-stack cues must match at word boundaries, not as substrings — "blockchain"/"email"/"html" + must NOT trigger ML serving (the 'ai' in blockch-ai-n / the 'ml' in ht-ml). +""" +from __future__ import annotations + +from aps.state.models import Persona, PainPoint, Severity +from aps.tools.product.generate_user_stories import TOOL as STORIES +from aps.tools.architecture.choose_tech_stack import TOOL as STACK + + +def _stories(pains): + p = [Persona(name="Recruiter", role="hiring manager", goals=["hire fast"]).model_dump()] + pp = [PainPoint(text=t, severity=Severity.HIGH).model_dump() for t in pains] + return STORIES.run(personas=p, pain_points=pp).payload + + +def test_user_story_names_capability_not_raw_pain(): + out = _stories(["no way to bulk delete", "Candidate ranking is slow"]) + assert all(s.lower().startswith("as a") for s in out) + assert any("i want bulk delete" in s.lower() for s in out) + assert any("i want candidate ranking" in s.lower() for s in out) + # the clumsy "overcome ''" phrasing is gone + assert not any("overcome '" in s for s in out) + + +def test_user_stories_dedupe_shared_capability(): + # two pains that map to the same capability theme → one story, not two identical ones + out = _stories(["It is unusable", "Reliability & stability"]) + assert len(out) == 1 + + +def test_user_stories_handle_empty_pains(): + p = [Persona(name="U", 
role="user").model_dump()] + out = STORIES.run(personas=p, pain_points=[]).payload + assert out and out[0].lower().startswith("as a") + + +def _stack_adds(reqs, scale=""): + rows = STACK.run(requirements=reqs, scale_estimate=scale).payload + return [r.split(":")[0] for r in rows[4:]] # drop the 4 baseline rows + + +def test_substring_cues_do_not_false_trigger_ml(): + # 'ai' inside blockchain / email / training, 'ml' inside html → NOT ML serving + assert "ML serving" not in _stack_adds(["blockchain ledger"], "10k users") + assert "ML serving" not in _stack_adds(["user training portal"]) + assert "ML serving" not in _stack_adds(["html email templates"]) + + +def test_real_cues_still_add_components(): + adds = _stack_adds(["AI scoring of resumes", "search and match candidates"], "high scale") + assert "ML serving" in adds and "Search" in adds + assert "Realtime" in _stack_adds(["live streaming dashboard"]) + # prefix/stem matching preserved: 'notif' → 'notifications' + assert "Realtime" in _stack_adds(["email notifications"]) + + +def test_baseline_always_present(): + rows = STACK.run(requirements=[], scale_estimate="").payload + assert len(rows) == 4 and rows[0].startswith("Backend") diff --git a/tests/unit/test_thin_prd.py b/tests/unit/test_thin_prd.py new file mode 100644 index 0000000..ae85f84 --- /dev/null +++ b/tests/unit/test_thin_prd.py @@ -0,0 +1,50 @@ +"""W3 — the feature floor prevents thin PRDs without fabricating features.""" +from __future__ import annotations + +from aps.state.models import PainPoint, Competitor, Severity, ResearchReturn, Evidence +from aps.tools.product.prioritize_features import TOOL as prioritize +from aps.agents.product.agent import run_product + + +def test_three_pains_yield_three_features(): + pains = [PainPoint(text=f"pain {i}", severity=Severity.HIGH) for i in range(3)] + feats = prioritize.run(pain_points=pains, competitors=[]).payload + assert len(feats) >= 3 + + +def test_floor_promotes_competitor_signal_when_thin(): + # one 
pain but a rich competitive set -> floor lifts to >=3 from REAL competitor features + pains = [PainPoint(text="parser drops PDFs", severity=Severity.HIGH)] + comps = [Competitor(name="A", features=["pdf export", "slack sync"]), + Competitor(name="B", features=["analytics dashboard"])] + feats = prioritize.run(pain_points=pains, competitors=comps).payload + assert len(feats) >= 3 + # every promoted feature traces to real competitor wording (no fabrication) + promoted = [f for f in feats if f.title.startswith("Differentiator:")] + pool_lower = " ".join(f.lower() for c in comps for f in c.features) + assert all(f.title.split(":", 1)[1].strip().lower() in pool_lower for f in promoted) + + +def test_no_signal_stays_honestly_short(): + # one pain, no competitors -> cannot reach the floor honestly; stays at 1 (not faked) + feats = prioritize.run(pain_points=[PainPoint(text="only pain")], competitors=[]).payload + assert len(feats) == 1 + + +def test_floor_never_exceeds_max(): + pains = [PainPoint(text=f"pain {i}") for i in range(2)] + comps = [Competitor(name="A", features=[f"feat{i}" for i in range(20)])] + feats = prioritize.run(pain_points=pains, competitors=comps, max_features=5).payload + assert len(feats) <= 5 + + +def test_product_agent_prd_meets_floor_with_real_research(): + research = ResearchReturn( + idea="resume screening", + pain_points=[PainPoint(text="parser drops PDFs", severity=Severity.HIGH)], + competitors=[Competitor(name="A", features=["pdf export", "ranking"]), + Competitor(name="B", features=["analytics"])], + evidence=[Evidence(source="github", url="https://g/1", title="t", snippet="s")], + ) + prd = run_product(research) + assert len(prd.features) >= 3 diff --git a/tests/unit/test_tool_cache.py b/tests/unit/test_tool_cache.py new file mode 100644 index 0000000..d18ae3b --- /dev/null +++ b/tests/unit/test_tool_cache.py @@ -0,0 +1,81 @@ +"""Unit tests for the read-through tool-result cache (plan 1.2).""" +from __future__ import annotations + +from 
aps.infra import cache + + +def setup_function(_): + cache.clear() + + +def test_second_call_is_a_hit_and_skips_compute(): + calls = {"n": 0} + + def compute(): + calls["n"] += 1 + return f"result-{calls['n']}" + + first = cache.get_or_call("github_list_issues", {"query": "x"}, compute) + second = cache.get_or_call("github_list_issues", {"query": "x"}, compute) + + assert first == "result-1" + assert second == "result-1" # served from cache, compute ran only once + assert calls["n"] == 1 + s = cache.stats() + assert s["hits"] == 1 and s["misses"] == 1 + + +def test_distinct_args_miss_independently(): + seen = [] + cache.get_or_call("hn_search", {"q": "a"}, lambda: seen.append("a") or "a") + cache.get_or_call("hn_search", {"q": "b"}, lambda: seen.append("b") or "b") + assert seen == ["a", "b"] + assert cache.stats()["misses"] == 2 + + +def test_key_is_order_independent(): + cache.get_or_call("t", {"a": 1, "b": 2}, lambda: "v") + # same args, different dict insertion order → same key → a hit + cache.get_or_call("t", {"b": 2, "a": 1}, lambda: "SHOULD_NOT_RUN") + assert cache.stats()["hits"] == 1 + + +def test_clear_resets_entries_and_counters(): + cache.get_or_call("t", {"a": 1}, lambda: "v") + cache.clear() + s = cache.stats() + assert s == {"hits": 0, "misses": 0, "size": 0, "hit_rate": 0.0} + + +def test_disabled_under_pytest(): + # The hermetic suite must not let the read-through path share state across cases. 
+ assert cache.enabled() is False + + +# ── per-TTL buckets (Phase 4/5: long TTL for slow-changing domain/trademark/compliance) ── +def test_ttl_bucket_caches_and_hits(): + calls = {"n": 0} + + def compute(): + calls["n"] += 1 + return f"r-{calls['n']}" + + a = cache.get_or_call("check_domain_availability", {"d": "x.com"}, compute, ttl=21600) + b = cache.get_or_call("check_domain_availability", {"d": "x.com"}, compute, ttl=21600) + assert a == b == "r-1" and calls["n"] == 1 + assert cache.stats()["hits"] == 1 + + +def test_same_key_different_ttl_is_a_separate_bucket(): + # A long-TTL entry must not be served to a default-TTL lookup (different bucket). + cache.get_or_call("t", {"a": 1}, lambda: "long", ttl=86400) + cache.get_or_call("t", {"a": 1}, lambda: "default") # default bucket → its own miss + s = cache.stats() + assert s["misses"] == 2 and s["size"] == 2 + + +def test_clear_drops_all_buckets(): + cache.get_or_call("t", {"a": 1}, lambda: "v", ttl=21600) + cache.get_or_call("t", {"a": 2}, lambda: "v", ttl=86400) + cache.clear() + assert cache.stats() == {"hits": 0, "misses": 0, "size": 0, "hit_rate": 0.0} diff --git a/tests/unit/test_tool_fallback.py b/tests/unit/test_tool_fallback.py new file mode 100644 index 0000000..1bf91ba --- /dev/null +++ b/tests/unit/test_tool_fallback.py @@ -0,0 +1,46 @@ +"""W4 — token-gated tools degrade loudly: fixture evidence is [fixture]-stamped + logged.""" +from __future__ import annotations + +import aps.tools.base as base +from aps.state.models import Evidence + + +def test_fixture_fallback_stamps_and_logs(monkeypatch): + class _FakeLog: + def __init__(self): + self.warnings = [] + def warning(self, *a, **k): + self.warnings.append((a, k)) + def debug(self, *a, **k): + pass + + fake = _FakeLog() + monkeypatch.setattr(base, "_LOG", fake) + + res = base.fixture_or_error( + "TAVILY_API_KEY not set", + evidence=[Evidence(source="web", url="https://x.com/a", title="Live title", snippet="s")], + ) + assert res.ok + assert 
res.evidence[0].title.startswith("[fixture]") # judge can see it's fixture + assert fake.warnings and fake.warnings[0][0][0] == "tool_fixture_fallback" # and it's logged + + +def test_token_gated_tool_returns_stamped_fixture(monkeypatch): + # web_search with no key takes the fixture path with NO network call + monkeypatch.delenv("TAVILY_API_KEY", raising=False) + from aps.tools.retrieval import web_search as ws + out = ws.TOOL.run(query="resume screening market") + assert out.ok and out.evidence + assert all(e.title.startswith("[fixture]") for e in out.evidence) + + +def test_no_fallback_when_disabled(monkeypatch): + from aps.config.settings import get_settings + get_settings.cache_clear() + monkeypatch.setenv("APS_ALLOW_FIXTURE_FALLBACK", "false") + try: + res = base.fixture_or_error("boom", evidence=[]) + assert res.ok is False and res.error == "boom" + finally: + get_settings.cache_clear() diff --git a/tests/unit/test_tool_trace.py b/tests/unit/test_tool_trace.py new file mode 100644 index 0000000..dfc0f4d --- /dev/null +++ b/tests/unit/test_tool_trace.py @@ -0,0 +1,27 @@ +"""Per-tool event sink (plan §4): tools emit tool_call/tool_result with timing through the +run's sink, and are silent (no-op) outside a run.""" +from __future__ import annotations + +from aps.infra import trace +from aps.tools.analysis import dedupe_and_rank_evidence as dd + + +def test_tool_emits_call_and_result_through_sink(): + events: list[tuple[str, dict]] = [] + tok = trace.set_sink(lambda t, d: events.append((t, d))) + try: + dd.TOOL.run(evidence=[]) + finally: + trace.reset(tok) + types = [t for t, _ in events] + assert types == ["tool_call", "tool_result"] + result = events[1][1] + assert result["tool"] == dd.TOOL.name + assert "ms" in result and result["ms"] >= 0 + assert result["ok"] is True + + +def test_emit_is_noop_without_a_sink(): + # No sink installed → running a tool must not raise (CLI / bare-call path). 
+ out = dd.TOOL.run(evidence=[]) + assert out.ok is True From f7256d4652cb22a5dbc890087b8600fc7652ca22 Mon Sep 17 00:00:00 2001 From: rajatnagda45 Date: Sun, 14 Jun 2026 19:58:14 +0530 Subject: [PATCH 2/2] ci: retrigger checks against updated main