From 5767eb82e12d34d888ba506d9276cf45230baffb Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Wed, 25 Feb 2026 20:11:49 +1000 Subject: [PATCH 01/16] feat: add autopilot mode + Agent Teams execution engine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds /gsd:autopilot — one command to run the full pipeline (discuss → plan → execute → verify) for remaining phases automatically. New workflows: - auto-discuss.md: synthetic multi-agent discuss spawns 3/5/7/9 expert agents that debate gray areas and converge via majority consensus, producing CONTEXT.md without human Q&A - autopilot.md: thin loop orchestrator that ensures context exists then chains the existing auto-advance pipeline per phase New execution engine option: - Agent Teams mode (Claude Code only) as alternative to subagents - Graceful degradation to subagents when unavailable - Configurable teammate model (sonnet/opus/haiku) Config additions: - execution.engine: "subagents" (default) or "agent-teams" - execution.teammate_model: model for teammates - autopilot.discuss_agents: odd number 3-9 for consensus - autopilot.discuss_model: model for discuss agents - Validation enforces odd numbers and valid engine/model values Also updates settings.md (3 new questions), progress.md (autopilot in next-action suggestions), and execute-phase.md (engine routing + auto- chain verification). 
--- commands/gsd/autopilot.md | 41 +++ get-shit-done/bin/lib/config.cjs | 32 +++ get-shit-done/bin/lib/init.cjs | 4 + get-shit-done/templates/config.json | 8 + get-shit-done/workflows/auto-discuss.md | 314 +++++++++++++++++++++++ get-shit-done/workflows/autopilot.md | 277 ++++++++++++++++++++ get-shit-done/workflows/execute-phase.md | 28 ++ get-shit-done/workflows/progress.md | 2 + get-shit-done/workflows/settings.md | 53 +++- 9 files changed, 757 insertions(+), 2 deletions(-) create mode 100644 commands/gsd/autopilot.md create mode 100644 get-shit-done/workflows/auto-discuss.md create mode 100644 get-shit-done/workflows/autopilot.md diff --git a/commands/gsd/autopilot.md b/commands/gsd/autopilot.md new file mode 100644 index 0000000000..9b2aa1eb9f --- /dev/null +++ b/commands/gsd/autopilot.md @@ -0,0 +1,41 @@ +--- +name: gsd:autopilot +description: Run full pipeline (discuss, plan, execute) for remaining phases automatically +argument-hint: "[phase] [start-end]" +allowed-tools: + - Read + - Write + - Edit + - Glob + - Grep + - Bash + - Task + - AskUserQuestion +--- + +Run the full GSD pipeline for remaining phases in the current milestone — automatically. + +For each incomplete phase: generate context via synthetic multi-agent discuss, then chain through plan → execute → verify → transition. One command, full autopilot. + +**Usage:** +- `/gsd:autopilot` — Run from current phase through end of milestone +- `/gsd:autopilot 5` — Run starting from phase 5 +- `/gsd:autopilot 3-7` — Run phases 3 through 7 + + + +@~/.claude/get-shit-done/workflows/autopilot.md +@~/.claude/get-shit-done/workflows/auto-discuss.md +@~/.claude/get-shit-done/references/ui-brand.md + + + +Arguments: $ARGUMENTS (optional phase number or range) + +Context files are resolved in-workflow using `gsd-tools init progress` and `roadmap analyze`. + + + +Execute the autopilot workflow from @~/.claude/get-shit-done/workflows/autopilot.md end-to-end. 
+Preserve all workflow gates (phase loop, synthetic discuss, auto-advance chain, stop conditions). + diff --git a/get-shit-done/bin/lib/config.cjs b/get-shit-done/bin/lib/config.cjs index 0d9a9260df..2737d98bdf 100644 --- a/get-shit-done/bin/lib/config.cjs +++ b/get-shit-done/bin/lib/config.cjs @@ -58,11 +58,21 @@ function cmdConfigEnsureSection(cwd, raw) { }, parallelization: true, brave_search: hasBraveSearch, + execution: { + engine: 'subagents', + teammate_model: 'sonnet', + }, + autopilot: { + discuss_agents: 5, + discuss_model: 'sonnet', + }, }; const defaults = { ...hardcoded, ...userDefaults, workflow: { ...hardcoded.workflow, ...(userDefaults.workflow || {}) }, + execution: { ...hardcoded.execution, ...(userDefaults.execution || {}) }, + autopilot: { ...hardcoded.autopilot, ...(userDefaults.autopilot || {}) }, }; try { @@ -97,6 +107,28 @@ function cmdConfigSet(cwd, keyPath, value, raw) { error('Failed to read config.json: ' + err.message); } + // Validate autopilot.discuss_agents must be odd + if (keyPath === 'autopilot.discuss_agents') { + if (typeof parsedValue !== 'number' || parsedValue % 2 === 0 || parsedValue < 3 || parsedValue > 9) { + error('discuss_agents must be odd (3/5/7/9) for consensus.'); + } + } + + // Validate execution.engine values + if (keyPath === 'execution.engine') { + if (parsedValue !== 'subagents' && parsedValue !== 'agent-teams') { + error('execution.engine must be "subagents" or "agent-teams".'); + } + } + + // Validate model values + const validModels = ['opus', 'sonnet', 'haiku']; + if (keyPath === 'execution.teammate_model' || keyPath === 'autopilot.discuss_model') { + if (!validModels.includes(parsedValue)) { + error(`${keyPath} must be one of: ${validModels.join(', ')}`); + } + } + // Set nested value using dot notation (e.g., "workflow.research") const keys = keyPath.split('.'); let current = config; diff --git a/get-shit-done/bin/lib/init.cjs b/get-shit-done/bin/lib/init.cjs index 7e551a01fb..6a6cf5e940 100644 --- 
a/get-shit-done/bin/lib/init.cjs +++ b/get-shit-done/bin/lib/init.cjs @@ -35,6 +35,7 @@ function cmdInitExecutePhase(cwd, phase, raw) { phase_branch_template: config.phase_branch_template, milestone_branch_template: config.milestone_branch_template, verifier_enabled: config.verifier, + execution_engine: (config.execution && config.execution.engine) || 'subagents', // Phase info phase_found: !!phaseInfo, @@ -663,6 +664,9 @@ function cmdInitProgress(cwd, raw) { // Config commit_docs: config.commit_docs, + auto_advance: (config.workflow && config.workflow.auto_advance) || false, + execution_engine: (config.execution && config.execution.engine) || 'subagents', + discuss_agents: (config.autopilot && config.autopilot.discuss_agents) || 5, // Milestone milestone_version: milestone.version, diff --git a/get-shit-done/templates/config.json b/get-shit-done/templates/config.json index d67ef30266..d928de7423 100644 --- a/get-shit-done/templates/config.json +++ b/get-shit-done/templates/config.json @@ -30,6 +30,14 @@ "issues_review": true, "confirm_transition": true }, + "execution": { + "engine": "subagents", + "teammate_model": "sonnet" + }, + "autopilot": { + "discuss_agents": 5, + "discuss_model": "sonnet" + }, "safety": { "always_confirm_destructive": true, "always_confirm_external_services": true diff --git a/get-shit-done/workflows/auto-discuss.md b/get-shit-done/workflows/auto-discuss.md new file mode 100644 index 0000000000..e20355a755 --- /dev/null +++ b/get-shit-done/workflows/auto-discuss.md @@ -0,0 +1,314 @@ + +Generate implementation context for a phase without human input. Spawns an odd number of agents (3/5/7/9) that analyze gray areas from different expert perspectives and converge on decisions via majority consensus. + +Produces CONTEXT.md in the same format as discuss-phase — downstream agents (researcher, planner) consume it identically. + + + +discuss-phase captures the user's vision through Q&A. In autopilot, there's no user. 
The solution: synthetic debate from multiple expert perspectives. + +This is better than skipping discuss — you get *debated* defaults from multiple expert perspectives, informed by existing project context. It won't capture the user's personal vision, but it will catch things a single agent would miss (edge cases, accessibility, performance tradeoffs). + + + + + +Phase number from argument (required). + +```bash +INIT=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs init phase-op "${PHASE}") +``` + +Parse JSON for: `phase_found`, `phase_dir`, `phase_number`, `phase_name`, `phase_slug`, `padded_phase`, `has_context`, `roadmap_exists`. + +**If `phase_found` is false:** Return error — phase not found. + +**If `has_context` is true:** Skip — CONTEXT.md already exists, return early with status. + +Read agent count and model from config: +```bash +AGENT_COUNT=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get autopilot.discuss_agents 2>/dev/null || echo "5") +DISCUSS_MODEL=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get autopilot.discuss_model 2>/dev/null || echo "sonnet") +``` + + + +Collect the context each agent needs to make informed decisions. + +**Read these files (paths only — agents read them with fresh context):** + +1. Phase goal from ROADMAP.md: +```bash +ROADMAP_PHASE=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs roadmap get-phase "${PHASE}") +``` + +2. Identify available context files: +- `.planning/PROJECT.md` — project vision +- `.planning/REQUIREMENTS.md` — requirements +- `.planning/codebase/*.md` — codebase patterns (if exists) +- Previous phases' CONTEXT.md files — decisions already made (consistency) + +```bash +PREV_CONTEXTS=$(ls .planning/phases/*-CONTEXT.md 2>/dev/null | sort) +``` + + + +Same logic as discuss-phase's `analyze_phase` step. Read the phase goal and identify domain-specific gray areas. 
+ +**Determine the domain:** +- Something users SEE → layout, density, interactions, states +- Something users CALL → responses, errors, auth, versioning +- Something users RUN → output format, flags, modes, error handling +- Something users READ → structure, tone, depth, flow +- Something being ORGANIZED → criteria, grouping, naming, exceptions + +**Generate 4-6 specific gray areas** for this phase. These become the debate topics. + +Example for "User Authentication" phase: +``` +1. Session handling — JWT lifetime, refresh strategy, concurrent sessions +2. Error responses — generic vs specific, rate limiting feedback +3. Multi-device policy — allow all, limit count, kick oldest +4. Recovery flow — email-only, SMS option, security questions +``` + + + +Each agent gets a distinct expert role based on the configured count: + +**3 agents:** +1. UX Designer — user experience, interaction patterns, accessibility +2. Engineer — technical feasibility, performance, maintainability +3. Product Owner — business value, scope, user impact + +**5 agents (default):** +1-3 above, plus: +4. QA / Edge Cases — error states, boundary conditions, failure modes +5. Devil's Advocate — challenges assumptions, identifies risks + +**7 agents:** +1-5 above, plus: +6. Domain Expert — industry conventions, standards compliance +7. Accessibility Specialist — WCAG, screen readers, keyboard navigation + +**9 agents:** +1-7 above, plus: +8. Security — attack vectors, data protection, auth concerns +9. Performance — load times, caching, resource usage + + + +Spawn all agents in parallel. Each agent independently analyzes gray areas from their perspective. 
+ +**Read execution engine config:** +```bash +ENGINE=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get execution.engine 2>/dev/null || echo "subagents") +``` + +**Agent Teams mode (`ENGINE = "agent-teams"`):** + +Create an Agent Team where teammates can message each other for real debate: +- Lead = this orchestrator (collects consensus) +- Teammates = one per perspective role +- Teammates debate gray areas, challenge each other, refine positions +- Lead collects final positions after debate concludes + +**Subagents mode (`ENGINE = "subagents"`, default):** + +Spawn each agent via Task(). Each independently returns recommendations. + +``` +For each agent (1 to AGENT_COUNT): + Task( + subagent_type="general-purpose", + model="${DISCUSS_MODEL}", + prompt=" + + You are a ${ROLE_NAME} reviewing implementation decisions for Phase ${PHASE}: ${PHASE_NAME}. + For each gray area, recommend a specific decision and justify it from your perspective. + + + + ${ROLE_DESCRIPTION} + Focus on: ${ROLE_FOCUS_AREAS} + + + + Phase goal: ${PHASE_GOAL} + Phase requirements: ${PHASE_REQUIREMENTS} + + + + Read these files for context: + - .planning/PROJECT.md (project vision) + - .planning/REQUIREMENTS.md (requirements, if exists) + - ${PREV_CONTEXT_FILES} (previous phase decisions, if any) + - .planning/codebase/*.md (codebase patterns, if exists) + + + + For EACH of these gray areas, provide: + 1. Your recommended decision (be specific, not vague) + 2. Your justification from ${ROLE_NAME} perspective (2-3 sentences) + 3. Any caveats or conditions + + ${GRAY_AREAS_LIST} + + + + Return your recommendations as structured text: + + ## Gray Area: [name] + **Decision:** [specific recommendation] + **Justification:** [why, from your perspective] + **Caveats:** [conditions or risks, if any] + + Repeat for each gray area. + + ", + description="${ROLE_NAME} review" + ) +``` + +Spawn all agents in parallel (single message, multiple Task calls). + + + +Tally votes per gray area. 
Majority wins (an odd agent count rules out ties only when votes split two ways; three-way or wider splits are handled by the plurality rule below). + +**For each gray area:** + +1. Collect all agent recommendations +2. Group similar recommendations (agents may phrase differently but mean the same thing) +3. Count votes for each distinct position +4. **Majority (>50%)** → Locked Decision +5. **Plurality but no majority** → Claude's Discretion (note the split) +6. **Any agent flags scope creep** → Deferred Idea + +**Build the decision summary:** + +``` +Gray Area: Session Handling +- JWT with 15min access + 7d refresh (3 votes: Engineer, Product, QA) +- JWT with 1hr access + 30d refresh (2 votes: UX, Devil's Advocate) +→ DECISION: JWT with 15min access + 7d refresh (consensus) +→ RATIONALE: Security-performance balance, standard practice + +Gray Area: Error Responses +- Specific error codes with user-friendly messages (2 votes) +- Generic errors in production, specific in dev (2 votes) +- Specific everywhere with rate limit headers (1 vote) +→ DISCRETION: Split between specific and environment-based. Note: majority favors specific errors. +``` + + + +Create CONTEXT.md in the same format as discuss-phase output. + +**Find or create phase directory:** + +Use values from init: `phase_dir`, `phase_slug`, `padded_phase`.
+ +If `phase_dir` is null: +```bash +mkdir -p ".planning/phases/${padded_phase}-${phase_slug}" +``` + +**File location:** `${phase_dir}/${padded_phase}-CONTEXT.md` + +```markdown +# Phase [X]: [Name] - Context + +**Generated:** [date] +**Method:** Synthetic discuss (${AGENT_COUNT} agents) +**Status:** Ready for planning + + +## Phase Boundary + +[Clear statement of what this phase delivers — from ROADMAP.md goal] + + + + +## Implementation Decisions + +### [Gray Area 1] +- **Decision:** [consensus recommendation] +- **Rationale:** [combined justification from voting agents] +- **Consensus:** [N]/[AGENT_COUNT] agents agreed + +### [Gray Area 2] +- **Decision:** [consensus recommendation] +- **Rationale:** [combined justification] +- **Consensus:** [N]/[AGENT_COUNT] agents agreed + +### Claude's Discretion +[Areas where votes were split — note both positions and let planner decide] +- [Gray Area X]: Split between [option A] and [option B]. [Brief context on tradeoffs.] + + + + +## Specific Ideas + +[Notable recommendations from individual agents worth preserving — e.g., a QA agent flagging an important edge case, or a UX agent suggesting a specific interaction pattern] + + + + +## Deferred Ideas + +[Any scope creep flagged by agents — captured for future phases] + +[If none: "None — all recommendations stayed within phase scope"] + + + +--- + +*Phase: ${padded_phase}-${phase_slug}* +*Context generated: [date] via synthetic discuss* +``` + + + +Commit the generated context: + +```bash +node ~/.claude/get-shit-done/bin/gsd-tools.cjs commit "docs(${padded_phase}): generate synthetic phase context" --files "${phase_dir}/${padded_phase}-CONTEXT.md" +``` + +Update STATE.md: +```bash +node ~/.claude/get-shit-done/bin/gsd-tools.cjs state record-session \ + --stopped-at "Phase ${PHASE} synthetic context generated" \ + --resume-file "${phase_dir}/${padded_phase}-CONTEXT.md" +``` + +Return status to caller: +``` +## CONTEXT GENERATED + +Phase: ${PHASE_NUMBER} - ${PHASE_NAME} 
+Method: Synthetic discuss (${AGENT_COUNT} agents) +Decisions: [N] locked, [M] discretion, [K] deferred +File: ${phase_dir}/${padded_phase}-CONTEXT.md +``` + + + + + +- [ ] Phase validated against roadmap +- [ ] Gray areas identified through domain analysis (not generic) +- [ ] Correct number of agents spawned (from config) +- [ ] Each agent assigned distinct perspective +- [ ] Majority consensus calculated per gray area +- [ ] CONTEXT.md matches discuss-phase output format +- [ ] Split votes noted as Claude's Discretion (not arbitrary picks) +- [ ] Scope creep flagged as Deferred Ideas +- [ ] Previous phase decisions considered for consistency +- [ ] File committed and state updated + diff --git a/get-shit-done/workflows/autopilot.md b/get-shit-done/workflows/autopilot.md new file mode 100644 index 0000000000..9bdfa632e0 --- /dev/null +++ b/get-shit-done/workflows/autopilot.md @@ -0,0 +1,277 @@ + +Run the full GSD pipeline for remaining phases in the current milestone — automatically. Thin loop that ensures context exists (via synthetic discuss) then spawns the existing auto-advance chain (plan → execute → verify → transition) for each phase. + +This workflow does NOT duplicate logic from discuss-phase, plan-phase, or execute-phase. It's a coordinator that: (a) ensures context exists via synthetic discuss, (b) spawns the existing auto-advance chain, (c) catches return status, and (d) loops. + + + +One command. Full autopilot. The complexity is in the existing workflows — this just chains them together with fresh context between phases. 
+ + + + + +Parse arguments to determine phase range: + +- No argument → run from current phase through end of milestone +- Single number (e.g., `5`) → run starting from phase 5 +- Range (e.g., `3-7`) → run phases 3 through 7 + +```bash +INIT=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs init progress) +``` + +Parse JSON for: `phases`, `current_phase`, `milestone_version`, `milestone_name`, `phase_count`, `completed_count`, `roadmap_exists`, `state_exists`. + +**If `roadmap_exists` is false:** Error — no roadmap found. Run `/gsd:new-project` first. + +```bash +ROADMAP=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs roadmap analyze) +``` + +Parse roadmap analysis for all phases with goals, status, and completion info. + +**Determine phase list:** + +Filter to incomplete phases within the requested range. Skip phases already marked complete. + +If no incomplete phases remain: +``` +All phases in range are complete. + +Run /gsd:progress to see status, or /gsd:complete-milestone if milestone is done. +``` +Exit. + + + +Persist auto-advance to config so the entire chain honors it: + +```bash +node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-set workflow.auto_advance true +``` + +Display launch banner: +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + GSD ► AUTOPILOT + Milestone: ${milestone_version} ${milestone_name} + Phases: ${start_phase} → ${end_phase} (${remaining_count} remaining) +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +``` + + + +For each incomplete phase in the range: + +``` +For phase in ${INCOMPLETE_PHASES}: + + 1. Display phase header: + ┌─────────────────────────────────────┐ + │ Phase ${N}: ${NAME} │ + │ ${GOAL} │ + └─────────────────────────────────────┘ + + 2. Check if CONTEXT.md exists for this phase + → Use init phase-op or check disk directly + + 3. If no CONTEXT.md → run synthetic discuss (step: run_auto_discuss) + + 4. Spawn auto-advance chain (step: run_phase_chain) + + 5. 
Check result → continue, stop, or handle error (step: handle_result) +``` + + + +Spawn synthetic discuss when a phase has no CONTEXT.md. + +``` +Task( + prompt=" + + Generate implementation context for Phase ${PHASE}: ${PHASE_NAME} using synthetic multi-agent discuss. + + + + @~/.claude/get-shit-done/workflows/auto-discuss.md + + + + PHASE=${PHASE} + + + + 1. Read auto-discuss.md for your complete workflow + 2. Follow ALL steps: initialize, gather inputs, analyze gray areas, assign perspectives, spawn debate, synthesize, write context + 3. Return: CONTEXT GENERATED (success) or ERROR (failure with details) + + ", + subagent_type="general-purpose", + description="Auto-discuss Phase ${PHASE}" +) +``` + +**Handle result:** +- **CONTEXT GENERATED** → Continue to phase chain +- **ERROR** → Stop autopilot, report which phase failed and why + + + +Spawn the existing auto-advance chain starting from plan-phase (which chains to execute → verify; the transition itself is run by this orchestrator when it handles the result, because the chain is spawned with --no-transition). + +Since CONTEXT.md now exists (either pre-existing or just generated), we start from plan-phase with auto-advance: + +``` +Task( + prompt=" + + You are the plan-phase orchestrator. Create executable plans for Phase ${PHASE}: ${PHASE_NAME}, then auto-advance to execution. + + + + @~/.claude/get-shit-done/workflows/plan-phase.md + @~/.claude/get-shit-done/references/ui-brand.md + @~/.claude/get-shit-done/references/model-profile-resolution.md + + + + PHASE=${PHASE} + ARGUMENTS='${PHASE} --auto' + + + + 1. Read plan-phase.md from execution_context for your complete workflow + 2. Follow ALL steps: initialize, validate, load context, research, plan, verify, auto-advance + 3. When spawning agents (gsd-phase-researcher, gsd-planner, gsd-plan-checker), use Task with specified subagent_type and model + 4. For step 14 (auto-advance to execute): spawn execute-phase as a Task with DIRECT file reference — tell it to read execute-phase.md.
Include @file refs to execute-phase.md, checkpoints.md, tdd.md, model-profile-resolution.md. Pass --no-transition flag so execute-phase returns results instead of chaining further. + 5. Do NOT use the Skill tool or /gsd: commands. Read workflow .md files directly. + 6. Return: PHASE COMPLETE (full pipeline success), PLANNING COMPLETE (planning done but execute failed/skipped), PLANNING INCONCLUSIVE, or GAPS FOUND + + ", + subagent_type="general-purpose", + description="Plan+Execute Phase ${PHASE}" +) +``` + + + +Process the return from the phase chain: + +**PHASE COMPLETE:** +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + ✓ Phase ${PHASE} Complete +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +``` + +Run transition to mark complete and advance state: +```bash +# Transition is needed since we used --no-transition in the chain +TRANSITION=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs phase complete "${PHASE}") +``` + +Commit transition: +```bash +node ~/.claude/get-shit-done/bin/gsd-tools.cjs commit "docs(phase-${PHASE}): complete phase execution" --files .planning/ROADMAP.md .planning/STATE.md .planning/REQUIREMENTS.md +``` + +Continue to next phase in the loop. + +**PLANNING COMPLETE (execution didn't finish):** +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + ⚠ Phase ${PHASE}: Planning complete, execution incomplete + Stopping autopilot. +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +Continue manually: /gsd:execute-phase ${PHASE} +``` +Stop the loop. + +**GAPS FOUND:** +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + ⚠ Phase ${PHASE}: Verification gaps found + Stopping autopilot. +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +Fix gaps: /gsd:plan-phase ${PHASE} --gaps +Then resume: /gsd:autopilot ${NEXT_PHASE} +``` +Stop the loop. + +**CHECKPOINT (human-action):** +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + ⏸ Phase ${PHASE}: Human action required + Stopping autopilot. 
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +Complete the required action, then resume: /gsd:autopilot ${PHASE} +``` +Stop the loop. + +**Any other failure:** +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + ✗ Phase ${PHASE}: Unexpected failure + Stopping autopilot. +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +Check /gsd:progress for current state. +``` +Stop the loop. + + + +When all phases in range complete successfully: + +```bash +# Clear auto-advance at milestone boundary +node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-set workflow.auto_advance false +``` + +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + GSD ► AUTOPILOT COMPLETE + Milestone: ${milestone_version} ${milestone_name} + Phases completed: ${completed_list} +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +All phases in range finished successfully. + +Next: /gsd:complete-milestone — archive and prepare for next +``` + + + + + +Autopilot stops when ANY of these occur: +1. **Milestone complete** — all phases in range finished +2. **human-action checkpoint** — requires manual intervention (auth gates, etc.) +3. **Gaps found** — verification failed, needs gap closure +4. **Execution failure** — plan or execution didn't complete +5. **Critical error** — unexpected failure in any step + + + +Each phase gets fresh context via Task() subagents. The autopilot orchestrator stays lean — it only tracks: +- Which phases remain +- The result status from each phase +- Whether to continue or stop + +All heavy lifting happens in subagents with their own 200k context windows. 
+ + + +- [ ] Phase range parsed correctly from arguments +- [ ] Auto-advance config enabled for the chain +- [ ] Each phase gets CONTEXT.md (existing or synthetic) +- [ ] Auto-advance chain spawned correctly per phase +- [ ] Results handled: continue on success, stop on failure/gaps/checkpoint +- [ ] Milestone boundary stops autopilot and clears auto-advance +- [ ] User knows exactly where things stopped and how to resume + diff --git a/get-shit-done/workflows/execute-phase.md b/get-shit-done/workflows/execute-phase.md index 5149594ce0..ea8bcdb0dd 100644 --- a/get-shit-done/workflows/execute-phase.md +++ b/get-shit-done/workflows/execute-phase.md @@ -74,6 +74,16 @@ Report: Execute each wave in sequence. Within a wave: parallel if `PARALLELIZATION=true`, sequential if `false`. +**Execution engine selection:** + +```bash +ENGINE=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get execution.engine 2>/dev/null || echo "subagents") +TEAMMATE_MODEL=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get execution.teammate_model 2>/dev/null || echo "sonnet") +``` + +If `ENGINE` is `"agent-teams"`: use Agent Teams mode for wave execution (see below). Teammates use `${TEAMMATE_MODEL}`. +If `ENGINE` is `"subagents"` (default): use current Task-based execution. + **For each wave:** 1. **Describe what's being built (BEFORE spawning):** @@ -99,6 +109,8 @@ Execute each wave in sequence. Within a wave: parallel if `PARALLELIZATION=true` Pass paths only — executors read files themselves with their fresh 200k context. This keeps orchestrator context lean (~10-15%). + **Subagents mode (default):** + ``` Task( subagent_type="gsd-executor", @@ -136,6 +148,20 @@ Execute each wave in sequence. 
Within a wave: parallel if `PARALLELIZATION=true` ) ``` + **Agent Teams mode (`ENGINE = "agent-teams"`):** + + If the execution engine is set to `"agent-teams"`, create an Agent Team instead of individual Task() calls: + - Lead = this execute-phase orchestrator + - Teammates = one per plan in the current wave, using `${TEAMMATE_MODEL}` (e.g., "Use Sonnet for each teammate" or "Use Opus for each teammate") + - Each teammate gets the same prompt as the subagent executor above (plan file + state + config) + - Wave boundaries still enforced (wave 2 waits for wave 1 to complete) + - Lead handles STATE.md/ROADMAP.md updates after each wave (teammates don't touch shared state) + + **Graceful degradation:** If Agent Teams is not available (IDE doesn't support it, or runtime error), fall back to subagents with a warning: + ``` + ⚠ Agent Teams not available — falling back to subagent execution. + ``` + 3. **Wait for all agents in wave to complete.** 4. **Report completion — spot-check claims first:** @@ -412,6 +438,8 @@ STOP. Do not proceed to auto-advance or transition. **If `--auto` flag present OR `AUTO_CFG` is true (AND verification passed with no gaps):** +**Auto-chain verify-work:** If `config.workflow.verifier` is true and verification hasn't already run (no VERIFICATION.md), run the automated verification before transitioning. Skip the manual UAT portion — the automated `verify_phase_goal` step already checks that the codebase delivers what the phase promised. + ``` ╔══════════════════════════════════════════╗ ║ AUTO-ADVANCING → TRANSITION ║ diff --git a/get-shit-done/workflows/progress.md b/get-shit-done/workflows/progress.md index e1dcc2eb1c..7e7fedd10f 100644 --- a/get-shit-done/workflows/progress.md +++ b/get-shit-done/workflows/progress.md @@ -226,6 +226,7 @@ Check if `{phase_num}-CONTEXT.md` exists in phase directory. 
**Also available:** - `/gsd:plan-phase {phase}` — skip discussion, plan directly +- `/gsd:autopilot` — run full pipeline automatically (discuss → plan → execute) for remaining phases - `/gsd:list-phase-assumptions {phase}` — see Claude's assumptions --- @@ -299,6 +300,7 @@ Read ROADMAP.md to get the next phase's name and goal. **Also available:** - `/gsd:plan-phase {Z+1}` — skip discussion, plan directly +- `/gsd:autopilot {Z+1}` — run full pipeline automatically for remaining phases - `/gsd:verify-work {Z}` — user acceptance test before continuing --- diff --git a/get-shit-done/workflows/settings.md b/get-shit-done/workflows/settings.md index 9677001db0..052dc4be02 100644 --- a/get-shit-done/workflows/settings.md +++ b/get-shit-done/workflows/settings.md @@ -102,6 +102,36 @@ AskUserQuestion([ { label: "Per Phase", description: "Create branch for each phase (gsd/phase-{N}-{name})" }, { label: "Per Milestone", description: "Create branch for entire milestone (gsd/{version}-{name})" } ] + }, + { + question: "Execution engine for parallel agents?", + header: "Engine", + multiSelect: false, + options: [ + { label: "Subagents (Recommended)", description: "Standard Task() subagents — works in all IDEs" }, + { label: "Agent Teams", description: "Claude Code Agent Teams — teammates can collaborate (Claude Code only)" } + ] + }, + { + question: "How many agents for synthetic discuss in autopilot?", + header: "Discuss", + multiSelect: false, + options: [ + { label: "3 agents", description: "UX Designer, Engineer, Product Owner — fastest" }, + { label: "5 agents (Recommended)", description: "Adds QA/Edge Cases + Devil's Advocate" }, + { label: "7 agents", description: "Adds Domain Expert + Accessibility" }, + { label: "9 agents", description: "Adds Security + Performance — most thorough" } + ] + }, + { + question: "Model for Agent Teams teammates / autopilot discuss agents?", + header: "Agent Model", + multiSelect: false, + options: [ + { label: "Sonnet (Recommended)", 
description: "Best cost/quality balance for parallel agents" }, + { label: "Opus", description: "Highest quality — significantly higher token cost with many agents" }, + { label: "Haiku", description: "Fastest and cheapest — good for simple phases" } + ] } ]) ``` @@ -123,6 +153,14 @@ Merge new settings into existing config.json: }, "git": { "branching_strategy": "none" | "phase" | "milestone" + }, + "execution": { + "engine": "subagents" | "agent-teams", + "teammate_model": "sonnet" | "opus" | "haiku" + }, + "autopilot": { + "discuss_agents": 3 | 5 | 7 | 9, + "discuss_model": "sonnet" | "opus" | "haiku" } } ``` @@ -168,6 +206,14 @@ Write `~/.gsd/defaults.json` with: "verifier": , "auto_advance": , "nyquist_validation": + }, + "execution": { + "engine": , + "teammate_model": + }, + "autopilot": { + "discuss_agents": , + "discuss_model": } } ``` @@ -190,6 +236,9 @@ Display: | Auto-Advance | {On/Off} | | Nyquist Validation | {On/Off} | | Git Branching | {None/Per Phase/Per Milestone} | +| Execution Engine | {Subagents/Agent Teams} | +| Discuss Agents | {3/5/7/9} | +| Agent Model | {Sonnet/Opus/Haiku} | | Saved as Defaults | {Yes/No} | These settings apply to future /gsd:plan-phase and /gsd:execute-phase runs. 
@@ -206,8 +255,8 @@ Quick commands: - [ ] Current config read -- [ ] User presented with 7 settings (profile + 5 workflow toggles + git branching) -- [ ] Config updated with model_profile, workflow, and git sections +- [ ] User presented with 9 settings (profile + 5 workflow toggles + git branching + engine + discuss agents) +- [ ] Config updated with model_profile, workflow, git, execution, and autopilot sections - [ ] User offered to save as global defaults (~/.gsd/defaults.json) - [ ] Changes confirmed to user From a41d27e05c271a4ffae84ce86888d3f81b56e239 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Wed, 25 Feb 2026 21:14:20 +1000 Subject: [PATCH 02/16] refactor: remove Agent Teams engine, simplify to subagents-only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent Teams can't set per-teammate models (teammates inherit lead's model), defeating GSD's profile-based model differentiation. Since wave-based plans are independent (no inter-agent communication needed), Agent Teams adds complexity with no benefit. 
- Remove execution.engine config key and validation - Rename teammate_model → agent_model across config, templates, workflows - Strip Agent Teams sections from execute-phase, auto-discuss, settings - Drop engine question from settings (9 → 8 settings) --- get-shit-done/bin/lib/config.cjs | 12 ++--------- get-shit-done/bin/lib/init.cjs | 3 --- get-shit-done/templates/config.json | 3 +-- get-shit-done/workflows/auto-discuss.md | 17 +--------------- get-shit-done/workflows/execute-phase.md | 26 ------------------------ get-shit-done/workflows/settings.md | 20 ++++-------------- 6 files changed, 8 insertions(+), 73 deletions(-) diff --git a/get-shit-done/bin/lib/config.cjs b/get-shit-done/bin/lib/config.cjs index 2737d98bdf..a7f8293175 100644 --- a/get-shit-done/bin/lib/config.cjs +++ b/get-shit-done/bin/lib/config.cjs @@ -59,8 +59,7 @@ function cmdConfigEnsureSection(cwd, raw) { parallelization: true, brave_search: hasBraveSearch, execution: { - engine: 'subagents', - teammate_model: 'sonnet', + agent_model: 'sonnet', }, autopilot: { discuss_agents: 5, @@ -114,16 +113,9 @@ function cmdConfigSet(cwd, keyPath, value, raw) { } } - // Validate execution.engine values - if (keyPath === 'execution.engine') { - if (parsedValue !== 'subagents' && parsedValue !== 'agent-teams') { - error('execution.engine must be "subagents" or "agent-teams".'); - } - } - // Validate model values const validModels = ['opus', 'sonnet', 'haiku']; - if (keyPath === 'execution.teammate_model' || keyPath === 'autopilot.discuss_model') { + if (keyPath === 'execution.agent_model' || keyPath === 'autopilot.discuss_model') { if (!validModels.includes(parsedValue)) { error(`${keyPath} must be one of: ${validModels.join(', ')}`); } diff --git a/get-shit-done/bin/lib/init.cjs b/get-shit-done/bin/lib/init.cjs index 6a6cf5e940..c25ce026ff 100644 --- a/get-shit-done/bin/lib/init.cjs +++ b/get-shit-done/bin/lib/init.cjs @@ -35,8 +35,6 @@ function cmdInitExecutePhase(cwd, phase, raw) { phase_branch_template: 
config.phase_branch_template, milestone_branch_template: config.milestone_branch_template, verifier_enabled: config.verifier, - execution_engine: (config.execution && config.execution.engine) || 'subagents', - // Phase info phase_found: !!phaseInfo, phase_dir: phaseInfo?.directory || null, @@ -665,7 +663,6 @@ function cmdInitProgress(cwd, raw) { // Config commit_docs: config.commit_docs, auto_advance: (config.workflow && config.workflow.auto_advance) || false, - execution_engine: (config.execution && config.execution.engine) || 'subagents', discuss_agents: (config.autopilot && config.autopilot.discuss_agents) || 5, // Milestone diff --git a/get-shit-done/templates/config.json b/get-shit-done/templates/config.json index d928de7423..05cee28dbf 100644 --- a/get-shit-done/templates/config.json +++ b/get-shit-done/templates/config.json @@ -31,8 +31,7 @@ "confirm_transition": true }, "execution": { - "engine": "subagents", - "teammate_model": "sonnet" + "agent_model": "sonnet" }, "autopilot": { "discuss_agents": 5, diff --git a/get-shit-done/workflows/auto-discuss.md b/get-shit-done/workflows/auto-discuss.md index e20355a755..f7c1d6a6ba 100644 --- a/get-shit-done/workflows/auto-discuss.md +++ b/get-shit-done/workflows/auto-discuss.md @@ -101,22 +101,7 @@ Each agent gets a distinct expert role based on the configured count: Spawn all agents in parallel. Each agent independently analyzes gray areas from their perspective. 
-**Read execution engine config:** -```bash -ENGINE=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get execution.engine 2>/dev/null || echo "subagents") -``` - -**Agent Teams mode (`ENGINE = "agent-teams"`):** - -Create an Agent Team where teammates can message each other for real debate: -- Lead = this orchestrator (collects consensus) -- Teammates = one per perspective role -- Teammates debate gray areas, challenge each other, refine positions -- Lead collects final positions after debate concludes - -**Subagents mode (`ENGINE = "subagents"`, default):** - -Spawn each agent via Task(). Each independently returns recommendations. +**Spawn each agent via Task():** ``` For each agent (1 to AGENT_COUNT): diff --git a/get-shit-done/workflows/execute-phase.md b/get-shit-done/workflows/execute-phase.md index ea8bcdb0dd..5ee50b628c 100644 --- a/get-shit-done/workflows/execute-phase.md +++ b/get-shit-done/workflows/execute-phase.md @@ -74,16 +74,6 @@ Report: Execute each wave in sequence. Within a wave: parallel if `PARALLELIZATION=true`, sequential if `false`. -**Execution engine selection:** - -```bash -ENGINE=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get execution.engine 2>/dev/null || echo "subagents") -TEAMMATE_MODEL=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get execution.teammate_model 2>/dev/null || echo "sonnet") -``` - -If `ENGINE` is `"agent-teams"`: use Agent Teams mode for wave execution (see below). Teammates use `${TEAMMATE_MODEL}`. -If `ENGINE` is `"subagents"` (default): use current Task-based execution. - **For each wave:** 1. **Describe what's being built (BEFORE spawning):** @@ -109,8 +99,6 @@ If `ENGINE` is `"subagents"` (default): use current Task-based execution. Pass paths only — executors read files themselves with their fresh 200k context. This keeps orchestrator context lean (~10-15%). 
- **Subagents mode (default):** - ``` Task( subagent_type="gsd-executor", @@ -148,20 +136,6 @@ If `ENGINE` is `"subagents"` (default): use current Task-based execution. ) ``` - **Agent Teams mode (`ENGINE = "agent-teams"`):** - - If the execution engine is set to `"agent-teams"`, create an Agent Team instead of individual Task() calls: - - Lead = this execute-phase orchestrator - - Teammates = one per plan in the current wave, using `${TEAMMATE_MODEL}` (e.g., "Use Sonnet for each teammate" or "Use Opus for each teammate") - - Each teammate gets the same prompt as the subagent executor above (plan file + state + config) - - Wave boundaries still enforced (wave 2 waits for wave 1 to complete) - - Lead handles STATE.md/ROADMAP.md updates after each wave (teammates don't touch shared state) - - **Graceful degradation:** If Agent Teams is not available (IDE doesn't support it, or runtime error), fall back to subagents with a warning: - ``` - ⚠ Agent Teams not available — falling back to subagent execution. - ``` - 3. **Wait for all agents in wave to complete.** 4. 
**Report completion — spot-check claims first:** diff --git a/get-shit-done/workflows/settings.md b/get-shit-done/workflows/settings.md index 052dc4be02..af3c19318f 100644 --- a/get-shit-done/workflows/settings.md +++ b/get-shit-done/workflows/settings.md @@ -103,15 +103,6 @@ AskUserQuestion([ { label: "Per Milestone", description: "Create branch for entire milestone (gsd/{version}-{name})" } ] }, - { - question: "Execution engine for parallel agents?", - header: "Engine", - multiSelect: false, - options: [ - { label: "Subagents (Recommended)", description: "Standard Task() subagents — works in all IDEs" }, - { label: "Agent Teams", description: "Claude Code Agent Teams — teammates can collaborate (Claude Code only)" } - ] - }, { question: "How many agents for synthetic discuss in autopilot?", header: "Discuss", @@ -124,7 +115,7 @@ AskUserQuestion([ ] }, { - question: "Model for Agent Teams teammates / autopilot discuss agents?", + question: "Model for execution and discuss subagents?", header: "Agent Model", multiSelect: false, options: [ @@ -155,8 +146,7 @@ Merge new settings into existing config.json: "branching_strategy": "none" | "phase" | "milestone" }, "execution": { - "engine": "subagents" | "agent-teams", - "teammate_model": "sonnet" | "opus" | "haiku" + "agent_model": "sonnet" | "opus" | "haiku" }, "autopilot": { "discuss_agents": 3 | 5 | 7 | 9, @@ -208,8 +198,7 @@ Write `~/.gsd/defaults.json` with: "nyquist_validation": }, "execution": { - "engine": , - "teammate_model": + "agent_model": }, "autopilot": { "discuss_agents": , @@ -236,7 +225,6 @@ Display: | Auto-Advance | {On/Off} | | Nyquist Validation | {On/Off} | | Git Branching | {None/Per Phase/Per Milestone} | -| Execution Engine | {Subagents/Agent Teams} | | Discuss Agents | {3/5/7/9} | | Agent Model | {Sonnet/Opus/Haiku} | | Saved as Defaults | {Yes/No} | @@ -255,7 +243,7 @@ Quick commands: - [ ] Current config read -- [ ] User presented with 9 settings (profile + 5 workflow toggles + git branching 
+ engine + discuss agents) +- [ ] User presented with 8 settings (profile + 5 workflow toggles + git branching + discuss agents) - [ ] Config updated with model_profile, workflow, git, execution, and autopilot sections - [ ] User offered to save as global defaults (~/.gsd/defaults.json) - [ ] Changes confirmed to user From 9fdbaa0046745f6dd0a9141c4ba2c42ef4d2a6c1 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Wed, 25 Feb 2026 21:24:59 +1000 Subject: [PATCH 03/16] refactor: remove dead execution section, consolidate to autopilot config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit execution.agent_model was configured and validated but never consumed by any workflow. Remove the entire execution config section — discuss model lives under autopilot.discuss_model where it's actually read. --- get-shit-done/bin/lib/config.cjs | 6 +----- get-shit-done/templates/config.json | 3 --- get-shit-done/workflows/settings.md | 16 +++++----------- 3 files changed, 6 insertions(+), 19 deletions(-) diff --git a/get-shit-done/bin/lib/config.cjs b/get-shit-done/bin/lib/config.cjs index a7f8293175..2756ef8731 100644 --- a/get-shit-done/bin/lib/config.cjs +++ b/get-shit-done/bin/lib/config.cjs @@ -58,9 +58,6 @@ function cmdConfigEnsureSection(cwd, raw) { }, parallelization: true, brave_search: hasBraveSearch, - execution: { - agent_model: 'sonnet', - }, autopilot: { discuss_agents: 5, discuss_model: 'sonnet', @@ -70,7 +67,6 @@ function cmdConfigEnsureSection(cwd, raw) { ...hardcoded, ...userDefaults, workflow: { ...hardcoded.workflow, ...(userDefaults.workflow || {}) }, - execution: { ...hardcoded.execution, ...(userDefaults.execution || {}) }, autopilot: { ...hardcoded.autopilot, ...(userDefaults.autopilot || {}) }, }; @@ -115,7 +111,7 @@ function cmdConfigSet(cwd, keyPath, value, raw) { // Validate model values const validModels = ['opus', 'sonnet', 'haiku']; - if (keyPath === 'execution.agent_model' || keyPath === 
'autopilot.discuss_model') { + if (keyPath === 'autopilot.discuss_model') { if (!validModels.includes(parsedValue)) { error(`${keyPath} must be one of: ${validModels.join(', ')}`); } diff --git a/get-shit-done/templates/config.json b/get-shit-done/templates/config.json index 05cee28dbf..67a7a47cfb 100644 --- a/get-shit-done/templates/config.json +++ b/get-shit-done/templates/config.json @@ -30,9 +30,6 @@ "issues_review": true, "confirm_transition": true }, - "execution": { - "agent_model": "sonnet" - }, "autopilot": { "discuss_agents": 5, "discuss_model": "sonnet" diff --git a/get-shit-done/workflows/settings.md b/get-shit-done/workflows/settings.md index af3c19318f..2b100219b7 100644 --- a/get-shit-done/workflows/settings.md +++ b/get-shit-done/workflows/settings.md @@ -115,8 +115,8 @@ AskUserQuestion([ ] }, { - question: "Model for execution and discuss subagents?", - header: "Agent Model", + question: "Model for autopilot discuss subagents?", + header: "Discuss Model", multiSelect: false, options: [ { label: "Sonnet (Recommended)", description: "Best cost/quality balance for parallel agents" }, @@ -145,9 +145,6 @@ Merge new settings into existing config.json: "git": { "branching_strategy": "none" | "phase" | "milestone" }, - "execution": { - "agent_model": "sonnet" | "opus" | "haiku" - }, "autopilot": { "discuss_agents": 3 | 5 | 7 | 9, "discuss_model": "sonnet" | "opus" | "haiku" @@ -197,9 +194,6 @@ Write `~/.gsd/defaults.json` with: "auto_advance": , "nyquist_validation": }, - "execution": { - "agent_model": - }, "autopilot": { "discuss_agents": , "discuss_model": @@ -226,7 +220,7 @@ Display: | Nyquist Validation | {On/Off} | | Git Branching | {None/Per Phase/Per Milestone} | | Discuss Agents | {3/5/7/9} | -| Agent Model | {Sonnet/Opus/Haiku} | +| Discuss Model | {Sonnet/Opus/Haiku} | | Saved as Defaults | {Yes/No} | These settings apply to future /gsd:plan-phase and /gsd:execute-phase runs. 
@@ -243,8 +237,8 @@ Quick commands: - [ ] Current config read -- [ ] User presented with 8 settings (profile + 5 workflow toggles + git branching + discuss agents) -- [ ] Config updated with model_profile, workflow, git, execution, and autopilot sections +- [ ] User presented with 8 settings (profile + 5 workflow toggles + git branching + discuss agents + discuss model) +- [ ] Config updated with model_profile, workflow, git, and autopilot sections - [ ] User offered to save as global defaults (~/.gsd/defaults.json) - [ ] Changes confirmed to user From 7ee63ad6c422447872fb07d6d6771fe121847514 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Thu, 26 Feb 2026 10:04:04 +1000 Subject: [PATCH 04/16] docs: start milestone v2.0 MoE Panels --- .planning/PROJECT.md | 100 +++++++++++++++++++++++++++++++++++++++++++ .planning/STATE.md | 28 ++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 .planning/PROJECT.md create mode 100644 .planning/STATE.md diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md new file mode 100644 index 0000000000..236d81f049 --- /dev/null +++ b/.planning/PROJECT.md @@ -0,0 +1,100 @@ +# get-shit-done + +## What This Is + +An open-source npm package that orchestrates AI coding agents for software development workflows. Development includes both the core tool (agent orchestration, workflow routing, config management) and its test infrastructure (433 tests, 94.01% coverage, CI pipeline). + +## Core Value + +Reliable AI agent orchestration with quality gates that catch bad plans before execution burns context. Every module has tests that catch regressions before they reach users. 
+ +## Requirements + +### Validated + +- ✓ Node.js built-in `node:test` + `node:assert` test framework — v1.0 +- ✓ CLI integration test pattern via `execSync` with temp directory isolation — v1.0 +- ✓ Test helpers (`createTempProject`, `runGsdTools`, `cleanup`, `createTempGitProject`) — v1.0 +- ✓ Tests for all 11 modules: phase, state, commands, init, roadmap, core, frontmatter, verify, config, template, milestone — v1.0 +- ✓ 355 tests passing, 0 failures — v1.0 +- ✓ 4 regression tests (REG-01 through REG-04) — v1.0 +- ✓ GitHub Actions CI pipeline with 3x3 OS/Node matrix — v1.0 +- ✓ commands.cjs from 59% to 88.86% line coverage — v1.1 +- ✓ init.cjs from 42% to 98.59% line coverage — v1.1 +- ✓ state.cjs from 40% to 96.16% line coverage — v1.1 +- ✓ gsd-tools.cjs dispatcher from 76% to 94.35% line coverage — v1.1 +- ✓ roadmap.cjs from 71% to 99.32% line coverage — v1.1 +- ✓ c8 devDependency with `npm run test:coverage` script — v1.1 +- ✓ Coverage thresholds enforced in CI (fail if any module drops below 70%) — v1.1 +- ✓ VERIFICATION.md audit trail for each coverage phase — v1.1 + +### Active (v2.0 — MoE Panels) + +- MoE panel infrastructure: 3 config keys (`plan_check_panel`, `verifier_panel`, `research_panel`), all default `false` +- Plan Checker Panel: 3 parallel specialists (Structural, Semantic, Compliance) with consensus logic +- Verifier Panel: 3 domain specialists (Artifacts, Requirements, Human) with domain-partitioned assembly +- Research Panel: 3 domain researchers (Stack, Architecture, Pitfalls) with inline synthesis +- Workflow routing: conditional panel dispatch based on config keys +- Output contract preservation: panel output identical to single-agent output (same headers, frontmatter, patterns) + +*Full requirements in `.planning/REQUIREMENTS.md` (pending creation)* + +### Out of Scope + +- Changing user-facing commands or output formats +- Performance/benchmark testing — not needed at current scale +- TypeScript migration — different milestone 
entirely +- Performance tests for large ROADMAP.md files (PERF-01) — future candidate +- Windows-specific path separator tests (WIN-01) — future candidate +- Windows CRLF line ending handling tests (CRLF-01) — future candidate + +## Current Milestone: v2.0 — MoE Panels + +**Phase:** Defining requirements (research pending) +**Starting phase:** 14 (continuing from v1.1's Phase 13) + +Adds Mixture of Experts panels for the three highest-variance quality gates: plan checking, verification, and phase research. Each panel replaces a single-agent step with 3 parallel specialists, improving coverage without changing user-facing commands or output formats. + +## Current State + +Shipped v1.1 with 433 tests, 94.01% overall line coverage, all 11 modules above 70%. Coverage is enforced in CI on every PR. All requirements have VERIFICATION.md audit trails. + +- **Test count:** 433 +- **Overall coverage:** 94.01% line coverage +- **Lowest module:** commands.cjs at 88.86% (target: 75%) +- **CI matrix:** Ubuntu, macOS, Windows × Node 18, 20, 22 (9 jobs) +- **Coverage enforcement:** c8 v11 on Node 20+; plain `npm test` on Node 18 + +Known bugs documented and tested (tests assert current behavior, production code not modified): +- `getRoadmapPhaseInternal` goal regex format mismatch (`**Goal:**` vs `**Goal**:`) +- `verify.cjs:82` — `content.search()` returns -1 handled by guard +- `frontmatter.cjs` — comma splitting doesn't handle quoted values (REG-04 documents limitation) +- `commands.cjs` — all git errors treated as "nothing to commit" + +**Codebase map:** `.planning/codebase/` (7 documents, analyzed 2026-02-25) + +## Constraints + +- **No new dependencies** (except c8 as devDependency): Follow existing lightweight convention +- **Backwards compatible**: Panel config keys default `false` — existing behavior unchanged +- **Output contract preservation**: Panel output must be identical to single-agent output +- **Existing patterns**: Tests use `node:test` + `node:assert`, CLI 
integration via `execSync`, temp directories +- **Cross-platform**: Tests must work on macOS, Linux, and Windows (CI matrix) +- **Not our repo**: We're contributing PRs, not merging directly + +## Key Decisions + +| Decision | Rationale | Outcome | +|----------|-----------|---------| +| Use node:test (no Jest/Vitest) | Match existing convention, zero dependencies | ✓ Good — 433 tests, fast execution | +| Integration tests for cmd* functions | process.exit() in output/error prevents direct require() | ✓ Good — consistent pattern across all modules | +| Unit tests for pure functions | comparePhaseNum, extractFrontmatter etc. can be require()'d directly | ✓ Good — faster, more granular | +| One PR per module | Keeps reviews focused, allows parallel submission | ✓ Good — each phase is an independent PR | +| createTempGitProject helper | Git-dependent tests need isolated repos with config | ✓ Good — used by verify-summary and verify-commits | +| Concurrency groups in CI | Cancel stale runs on same branch using head_ref \|\| run_id | ✓ Good — prevents queue buildup | +| c8 for coverage (not nyc) | Works natively with node:test via V8 coverage | ✓ Good — 94.01% overall, clean per-file report | +| Node 18 skip for c8 v11 | c8 v11 declares engines Node 20+, Node 18 EOL April 2025 | ✓ Good — CI stable, plain npm test still runs on Node 18 | +| VERIFICATION.md per coverage phase | Audit trail for orphaned requirements found in milestone audit | ✓ Good — all requirements now have 3-source verification | + +--- +*Last updated: 2026-02-26 — v2.0 MoE Panels milestone started* diff --git a/.planning/STATE.md b/.planning/STATE.md new file mode 100644 index 0000000000..60d7dca3ab --- /dev/null +++ b/.planning/STATE.md @@ -0,0 +1,28 @@ +# Project State + +## Current Milestone: v2.0 — MoE Panels + +**Phase:** Pre-phase (defining requirements) +**Status:** Research pending, then requirements definition + +## Milestone History + +### v1.0 — Test Infrastructure (Phases 1–9) +- 355 tests 
across all 11 modules +- GitHub Actions CI with 3x3 OS/Node matrix +- 4 regression tests (REG-01 through REG-04) + +### v1.1 — Coverage Hardening (Phases 10–13) +- 433 tests, 94.01% overall line coverage +- c8 coverage enforcement in CI +- All modules above 70% threshold +- VERIFICATION.md audit trails for every phase + +## Context + +- Codebase: 11 JavaScript modules in `bin/` directory +- Test framework: `node:test` + `node:assert` +- Config: `get-shit-done/templates/config.json` +- Agent definitions: `agents/` directory (markdown files) +- Workflow definitions: `get-shit-done/workflows/` directory +- Codebase map: `.planning/codebase/` (7 documents, analyzed 2026-02-25) From adc321be44fd3c7819fbab637381eb1f80c29451 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Thu, 26 Feb 2026 10:12:26 +1000 Subject: [PATCH 05/16] wip: v2.0 milestone setup paused at research synthesis --- .planning/.continue-here.md | 70 ++++ .planning/research/ARCHITECTURE.md | 638 +++++++++++++++++++++++++++++ .planning/research/FEATURES.md | 337 +++++++++++++++ .planning/research/PITFALLS.md | 352 ++++++++++++++++ .planning/research/STACK.md | 291 +++++++++++++ 5 files changed, 1688 insertions(+) create mode 100644 .planning/.continue-here.md create mode 100644 .planning/research/ARCHITECTURE.md create mode 100644 .planning/research/FEATURES.md create mode 100644 .planning/research/PITFALLS.md create mode 100644 .planning/research/STACK.md diff --git a/.planning/.continue-here.md b/.planning/.continue-here.md new file mode 100644 index 0000000000..4a716186fe --- /dev/null +++ b/.planning/.continue-here.md @@ -0,0 +1,70 @@ +--- +phase: pre-phase (milestone v2.0 setup) +task: 6 +total_tasks: 8 +status: in_progress +last_updated: 2026-02-26T00:11:19.319Z +--- + + +Milestone v2.0 MoE Panels setup is in progress. Steps 1-5 complete. Step 6 (research) nearly done — all 4 research files written to disk, all 4 agents completed. Synthesis not yet run. 
Steps 7 (requirements) and 8 (roadmap) not started. + + + + +- Step 1: Update PROJECT.md — Done. Broadened scope from test infrastructure to GSD tool development, added v2.0 milestone section, moved v1.0/v1.1 to validated, set active requirements to MoE panel scope. +- Step 2: Create STATE.md — Done. Fresh STATE.md for v2.0 with milestone history context. +- Step 3: Commit milestone start — Done. Commit `ade3945 docs: start milestone v2.0 MoE Panels` +- Step 4: Run init + config — Done. `gsd-tools init new-milestone` and `config-set workflow.research true` +- Step 5: Research agents — All 4 completed: + - STACK.md (291 lines) — HIGH confidence. Scatter-gather with LLM synthesizer pattern. + - FEATURES.md (337 lines) — MEDIUM-HIGH confidence. Domain-partitioned assembly, not voting. + - ARCHITECTURE.md (638 lines) — HIGH confidence. Transparent substitution, inline specialist prompts, config routing. + - PITFALLS.md (352 lines) — HIGH confidence. 13 pitfalls catalogued, output contract drift is #1 risk. + + + + +- Step 6 (finish): Run gsd-research-synthesizer to merge 4 research files into SUMMARY.md, then commit research artifacts +- Step 7: Define requirements — Derive REQ-IDs from user spec across 5 categories (Panel Infrastructure, Plan Checker Panel, Verifier Panel, Research Panel, Testing). Scope with user via AskUserQuestion per category. Commit REQUIREMENTS.md. +- Step 8: Create roadmap — Spawn gsd-roadmapper with phase numbering starting at 14, all requirements + research. Present for approval, commit ROADMAP.md. + + + + +- Architecture researcher found: panel agents should use inline specialist prompts (no separate specialist files). This contradicts Stack researcher's suggestion of 12 separate files. Architecture finding is higher confidence (based on direct codebase analysis of auto-discuss.md pattern). +- All researchers agree: domain-partitioned assembly over voting for consensus mechanism. 
+- Config routing: 3 new keys (`workflow.plan_check_panel`, `workflow.verifier_panel`, `workflow.research_panel`), all default `false`. Must nest under existing `workflow.*` to avoid collision with `workflow.plan_check`/`workflow.verifier`. +- Plan checker panel should be built first (most well-defined output contract). +- Output contract drift is the #1 risk — orchestrators must own the output template, not specialists. +- Graceful degradation: 2/3 specialists succeed = usable output; 1/3 or 0/3 = fallback to single-agent. + + + +- None. All research agents completed successfully. + + + +This is the v2.0 milestone setup following the plan from a previous planning session. The plan has 8 execution steps. Key architectural insight: panel agents are transparent substitutes — they produce identical output to single agents, with config flags controlling dispatch. The existing auto-discuss.md workflow already demonstrates the parallel-spawn-and-synthesize pattern. + +Research disagreement resolved: Stack says 12 separate specialist files, Architecture says inline prompts in 3 panel agent files. Architecture recommendation wins (follows existing codebase conventions, specialists not independently useful). + +Key research files: +- `.planning/research/STACK.md` — orchestration patterns +- `.planning/research/FEATURES.md` — consensus mechanisms, panel designs +- `.planning/research/ARCHITECTURE.md` — integration with GSD codebase +- `.planning/research/PITFALLS.md` — failure modes, prevention strategies + + + +1. Run gsd-research-synthesizer to create .planning/research/SUMMARY.md from the 4 research files +2. Commit all research artifacts +3. Begin Step 7: Requirements definition — present 5 categories to user for scoping via AskUserQuestion: + - Panel Infrastructure (PANEL-XX) + - Plan Checker Panel (PCHK-XX) + - Verifier Panel (VRFY-XX) + - Research Panel (RSRCH-XX) + - Testing (TEST-XX) +4. Write and commit REQUIREMENTS.md +5. 
Step 8: Spawn gsd-roadmapper for ROADMAP.md (phases starting at 14) + diff --git a/.planning/research/ARCHITECTURE.md b/.planning/research/ARCHITECTURE.md new file mode 100644 index 0000000000..a36bdb4766 --- /dev/null +++ b/.planning/research/ARCHITECTURE.md @@ -0,0 +1,638 @@ +# Architecture Patterns: MoE Panel Integration + +**Domain:** Agent orchestration panel dispatch within CLI workflow engine +**Researched:** 2026-02-26 +**Confidence:** HIGH (based on direct codebase analysis of existing patterns) + +## Recommended Architecture + +MoE Panels are **orchestrator agents that replace single agents at dispatch points**. Three panels replace three existing single agents: `gsd-plan-checker` (plan verification), `gsd-verifier` (phase verification), and `gsd-phase-researcher` (research). Each panel spawns 3 specialist subagents in parallel, synthesizes their outputs, and returns the **exact same structured output contract** as the single agent it replaces. + +The key architectural insight: panels are a **transparent substitution**. The workflow files (`plan-phase.md`, `execute-phase.md`) dispatch to either a single agent or a panel agent based on a config flag. The orchestrator workflow never knows or cares whether a single agent or a panel produced the output -- the return contract is identical. 
+ +### Component Boundaries + +| Component | Responsibility | Communicates With | +|-----------|---------------|-------------------| +| `config.json` workflow section | Stores `plan_check_panel`, `verifier_panel`, `research_panel` booleans | Read by `init.cjs` commands | +| `init.cjs` init commands | Resolves panel flags + panel model into INIT JSON for workflows | Consumed by workflow orchestrators | +| `core.cjs` MODEL_PROFILES | Maps panel agent names to model tiers | Called by `resolveModelInternal` | +| Workflow files (plan-phase, execute-phase) | Conditional dispatch: panel flag true -> spawn panel agent, false -> spawn single agent | Spawn panel or single agents via Task() | +| Panel agent `.md` files | Orchestrate 3 parallel specialists, synthesize, return structured output | Spawn specialist agents via Task(), return to workflow | +| Specialist prompts | Inline within panel agent (NOT separate agent files) | Spawned by panel orchestrator, return findings | + +### Data Flow + +``` +config.json + | + v +init.cjs (resolves flags + models into INIT JSON) + | + v +workflow.md (plan-phase / execute-phase) + | + +--> [if panel=false] --> single agent (gsd-plan-checker / gsd-verifier / gsd-phase-researcher) + | | + | v + | structured return (## VERIFICATION PASSED / ## ISSUES FOUND / etc.) 
+ | + +--> [if panel=true] --> panel agent (gsd-plan-checker-panel / gsd-verifier-panel / gsd-researcher-panel) + | + +--> Task(specialist-1, model=panel_model) --| + +--> Task(specialist-2, model=panel_model) --+--> parallel + +--> Task(specialist-3, model=panel_model) --| + | + v + synthesize (majority consensus / union of findings) + | + v + structured return (SAME contract as single agent) +``` + +## Integration Point 1: Config Routing + +### Current Config Structure + +The existing config uses `workflow.*` keys for boolean agent toggles: + +```json +{ + "workflow": { + "research": true, + "plan_check": true, + "verifier": true, + "auto_advance": false, + "nyquist_validation": false + } +} +``` + +### Recommended Config Extension + +Add `*_panel` keys alongside existing toggles. A panel flag is only meaningful when its parent toggle is `true`. + +```json +{ + "workflow": { + "research": true, + "research_panel": false, + "plan_check": true, + "plan_check_panel": false, + "verifier": true, + "verifier_panel": false, + "auto_advance": false, + "nyquist_validation": false + } +} +``` + +**Why parallel booleans instead of a mode enum:** The existing pattern is per-feature booleans (`research: true/false`, `plan_check: true/false`). Adding `research_panel: true/false` follows the same convention. A user can disable research entirely (`research: false`) or enable it with a panel (`research: true, research_panel: true`). These are independent toggles -- `research_panel: true` with `research: false` is a no-op (the panel flag is only checked when the parent feature is enabled). + +**Why NOT a global `panels: true` toggle:** Different panels have different cost/value tradeoffs. A user may want panel verification (catches more bugs) but not panel research (overkill for simple phases). Per-feature panel toggles give that control. + +### Config Resolution in `init.cjs` + +The `cmdInitPlanPhase` function already resolves `research_enabled`, `plan_checker_enabled`. 
Add panel resolution: + +```javascript +// In cmdInitPlanPhase result object: +{ + // Existing + research_enabled: config.research, + plan_checker_enabled: config.plan_checker, + researcher_model: resolveModelInternal(cwd, 'gsd-phase-researcher'), + checker_model: resolveModelInternal(cwd, 'gsd-plan-checker'), + + // New panel flags + research_panel: config.research_panel || false, + plan_check_panel: config.plan_check_panel || false, + + // New panel models (only resolved when panel enabled) + researcher_panel_model: config.research_panel + ? resolveModelInternal(cwd, 'gsd-researcher-panel') + : null, + checker_panel_model: config.plan_check_panel + ? resolveModelInternal(cwd, 'gsd-plan-checker-panel') + : null, +} +``` + +Similarly for `cmdInitExecutePhase`: + +```javascript +{ + verifier_model: resolveModelInternal(cwd, 'gsd-verifier'), + verifier_panel: config.verifier_panel || false, + verifier_panel_model: config.verifier_panel + ? resolveModelInternal(cwd, 'gsd-verifier-panel') + : null, +} +``` + +### Config Loading in `core.cjs` loadConfig + +Add three new fields to the config loader: + +```javascript +// In loadConfig return: +{ + // ...existing fields + research_panel: get('research_panel', { section: 'workflow', field: 'research_panel' }) ?? false, + plan_check_panel: get('plan_check_panel', { section: 'workflow', field: 'plan_check_panel' }) ?? false, + verifier_panel: get('verifier_panel', { section: 'workflow', field: 'verifier_panel' }) ?? false, +} +``` + +### Settings UI Addition + +Add three new questions to `settings.md` (one per panel toggle), positioned after their parent toggle: + +``` +{ + question: "Use MoE Panel for plan checking? 
(3 specialist agents instead of 1)", + header: "Plan Check Panel", + multiSelect: false, + options: [ + { label: "No (Recommended)", description: "Single plan-checker agent" }, + { label: "Yes", description: "3 specialists: coverage analyst, scope auditor, dependency checker" } + ] +} +``` + +## Integration Point 2: Model Profile Registration + +### Current MODEL_PROFILES Table + +```javascript +const MODEL_PROFILES = { + 'gsd-plan-checker': { quality: 'sonnet', balanced: 'sonnet', budget: 'haiku' }, + 'gsd-verifier': { quality: 'sonnet', balanced: 'sonnet', budget: 'haiku' }, + 'gsd-phase-researcher': { quality: 'opus', balanced: 'sonnet', budget: 'haiku' }, + // ... +}; +``` + +### Add Panel Orchestrator Entries + +Panel orchestrators do synthesis (moderate reasoning), not execution. They should use the same tier as the single agent they replace: + +```javascript +const MODEL_PROFILES = { + // ...existing entries + 'gsd-plan-checker-panel': { quality: 'sonnet', balanced: 'sonnet', budget: 'haiku' }, + 'gsd-verifier-panel': { quality: 'sonnet', balanced: 'sonnet', budget: 'haiku' }, + 'gsd-researcher-panel': { quality: 'opus', balanced: 'sonnet', budget: 'haiku' }, +}; +``` + +**Specialist subagent models:** Specialists spawned BY the panel use the SAME model as the panel orchestrator. The panel agent passes its own model to each Task() call. This avoids adding 9 more entries (3 specialists x 3 panels) to the profile table. The panel agent `.md` should document: "Pass your own model to specialist Task calls." 
+ +## Integration Point 3: Conditional Dispatch in Workflows + +### Pattern: plan-phase.md Step 10 (Plan Checker) + +Current (single agent): +``` +Task( + prompt=checker_prompt, + subagent_type="gsd-plan-checker", + model="{checker_model}", + description="Verify Phase {phase} plans" +) +``` + +Recommended (conditional dispatch): +``` +# After parsing INIT JSON: +# PLAN_CHECK_PANEL=$(echo "$INIT" | jq -r '.plan_check_panel') +# CHECKER_PANEL_MODEL=$(echo "$INIT" | jq -r '.checker_panel_model // empty') + +if PLAN_CHECK_PANEL is true: + Task( + prompt="First, read ~/.claude/agents/gsd-plan-checker-panel.md for your role and instructions.\n\n" + checker_prompt, + subagent_type="general-purpose", + model="{checker_panel_model}", + description="Panel verify Phase {phase} plans" + ) +else: + Task( + prompt=checker_prompt, + subagent_type="gsd-plan-checker", + model="{checker_model}", + description="Verify Phase {phase} plans" + ) +``` + +**Critical detail:** Panel agents use `subagent_type="general-purpose"` (not a named type) because Claude Code only recognizes a fixed set of subagent types. Named types like `gsd-plan-checker` cause Claude Code to load the matching `agents/*.md` file. Panel agents instead load their own instructions through the `First, read ~/.claude/agents/{panel-agent}.md ...` line prepended to their prompt, as shown in the dispatch example above. This matches the existing pattern used by `auto-discuss.md` and `new-project.md` researcher spawns. 
+ +### Pattern: execute-phase.md verify_phase_goal Step + +Same conditional dispatch pattern: +``` +if VERIFIER_PANEL is true: + Task( + prompt="First, read ~/.claude/agents/gsd-verifier-panel.md for your role and instructions.\n\n" + verifier_prompt, + subagent_type="general-purpose", + model="{verifier_panel_model}", + description="Panel verify phase {phase_number} goal" + ) +else: + Task( + prompt=verifier_prompt, + subagent_type="gsd-verifier", + model="{verifier_model}", + description="Verify phase {phase_number} goal" + ) +``` + +### Pattern: plan-phase.md Step 5 (Researcher) + +``` +if RESEARCH_PANEL is true: + Task( + prompt="First, read ~/.claude/agents/gsd-researcher-panel.md for your role and instructions.\n\n" + research_prompt, + subagent_type="general-purpose", + model="{researcher_panel_model}", + description="Panel research Phase {phase}" + ) +else: + Task( + prompt="First, read ~/.claude/agents/gsd-phase-researcher.md for your role and instructions.\n\n" + research_prompt, + subagent_type="general-purpose", + model="{researcher_model}", + description="Research Phase {phase}" + ) +``` + +## Integration Point 4: Panel Agent File Structure + +### Recommended: 3 New Agent Files + +``` +agents/ + gsd-plan-checker.md # existing single agent + gsd-plan-checker-panel.md # NEW: panel orchestrator + gsd-verifier.md # existing single agent + gsd-verifier-panel.md # NEW: panel orchestrator + gsd-phase-researcher.md # existing single agent + gsd-researcher-panel.md # NEW: panel orchestrator +``` + +### Why NOT Separate Specialist Agent Files + +Specialists should be **inline prompts within the panel agent**, not separate `.md` files. Reasons: + +1. **Context isolation:** Each specialist is spawned via Task() with a specific prompt. The panel agent constructs these prompts dynamically based on the input context (phase goal, plans, requirements). Separate files would be static and unable to adapt. + +2. 
**Coupling:** Specialists only make sense in the context of their panel. A "coverage analyst" is useless outside the plan-checker-panel. Separate files suggest independent reuse that doesn't exist. + +3. **Precedent:** `auto-discuss.md` already uses this pattern -- specialist prompts are constructed inline with role assignments, not loaded from separate files. + +4. **Maintenance:** 3 panel files vs 3 panels + 9 specialist files. The inline approach keeps each panel self-contained. + +### Panel Agent Anatomy (Template) + +```markdown +--- +name: gsd-plan-checker-panel +description: MoE panel that spawns 3 specialist agents to verify plans from different angles. Returns same output contract as gsd-plan-checker. +tools: Read, Bash, Glob, Grep +color: green +--- + + +You are a GSD plan-checker panel. You orchestrate 3 specialist verification agents, +synthesize their findings, and return the same structured output as gsd-plan-checker. + +The workflow that spawned you expects EXACTLY the same return format as a single +gsd-plan-checker agent. Your job is to produce BETTER results through parallel +specialist analysis, but the output contract is non-negotiable. + + + +Your return MUST be one of: +- ## VERIFICATION PASSED (same format as gsd-plan-checker) +- ## ISSUES FOUND (same format as gsd-plan-checker) + +The orchestrator workflow parses these headers. Any other format breaks the pipeline. + + + +## Specialist 1: Coverage Analyst +Focus: Requirement coverage (Dimensions 1, 6, 7 from plan-checker) +Checks: Every requirement has tasks, must_haves trace to goal, context compliance + +## Specialist 2: Scope & Structure Auditor +Focus: Task quality and scope (Dimensions 2, 5) +Checks: Task completeness (files/action/verify/done), scope sanity, context budget + +## Specialist 3: Dependency & Wiring Inspector +Focus: Dependency correctness and key links (Dimensions 3, 4, 8) +Checks: Dependency graph, key links, Nyquist compliance + + + +1. 
Load context (same as gsd-plan-checker Step 1-2) +2. Construct specialist prompts with shared context +3. Spawn all 3 specialists in parallel via Task() +4. Collect results from all 3 +5. Synthesize: union of all issues (deduplicate by plan+task+dimension) +6. Determine overall status (any blocker -> ISSUES FOUND) +7. Return in gsd-plan-checker output format + + + +Task( + subagent_type="general-purpose", + model="{same model as panel}", + prompt=" + You are a {specialist_name} for GSD plan verification. + {specialist_focus} + {same files from orchestrator prompt} + Return a YAML issues list + coverage table for your dimensions. + ", + description="{specialist_name}" +) + + + +1. Parse each specialist's issues list +2. Deduplicate by (plan, task, dimension) tuple +3. If any specialist found blockers -> overall = ISSUES FOUND +4. If no blockers but warnings -> overall = ISSUES FOUND (warnings should still be reviewed) +5. If no issues -> overall = VERIFICATION PASSED +6. Merge coverage tables from all specialists +7. Format into exact gsd-plan-checker return structure + +``` + +## Integration Point 5: Output Contract Preservation + +This is the most critical architectural constraint. Panel agents MUST produce byte-compatible output with single agents. + +### Plan Checker Contract + +Single agent returns one of: +```markdown +## VERIFICATION PASSED +**Phase:** {phase-name} +**Plans verified:** {N} +**Status:** All checks passed +### Coverage Summary +| Requirement | Plans | Status | +### Plan Summary +| Plan | Tasks | Files | Wave | Status | +``` + +OR: + +```markdown +## ISSUES FOUND +**Phase:** {phase-name} +**Plans checked:** {N} +**Issues:** {X} blocker(s), {Y} warning(s), {Z} info +### Blockers (must fix) +### Warnings (should fix) +### Structured Issues +(YAML issues list) +### Recommendation +``` + +The panel MUST return exactly these formats. 
The revision loop in plan-phase.md Steps 11-12 parses `## VERIFICATION PASSED` and `## ISSUES FOUND` headers to determine next action. + +### Verifier Contract + +Single agent returns: +```markdown +## Verification Complete +**Status:** {passed | gaps_found | human_needed} +**Score:** {N}/{M} must-haves verified +**Report:** .planning/phases/{phase_dir}/{phase_num}-VERIFICATION.md +``` + +The execute-phase.md `verify_phase_goal` step reads the VERIFICATION.md status field via grep. The panel must write the same file format. + +### Researcher Contract + +Single agent returns: +```markdown +## RESEARCH COMPLETE +**Phase:** {phase_number} - {phase_name} +**Confidence:** [HIGH/MEDIUM/LOW] +### Key Findings +### File Created +### Confidence Assessment +### Ready for Planning +``` + +OR: + +```markdown +## RESEARCH BLOCKED +**Phase:** {phase_number} - {phase_name} +**Blocked by:** [what] +``` + +The plan-phase.md Step 5 handler checks for these headers. + +### Contract Enforcement Strategy + +Each panel agent `.md` file should include an `<output_contract>` section that: +1. Lists the exact headers the workflow expects +2. Shows the complete output format template +3. States: "Your synthesis MUST produce this exact format. Do not add extra sections or change headers." + +## Integration Point 6: Panel-Specialist Relationship + +### Hierarchy + +``` +Workflow Orchestrator (plan-phase.md / execute-phase.md) + | + v +Panel Agent (gsd-plan-checker-panel.md) <-- has 200K context + | + +--> Specialist 1 (Task, general-purpose) <-- has 200K context + +--> Specialist 2 (Task, general-purpose) <-- has 200K context + +--> Specialist 3 (Task, general-purpose) <-- has 200K context + | + v +Synthesis (panel agent combines results) + | + v +Return to Workflow Orchestrator +``` + +### Context Budget + +Each specialist gets a fresh 200K context window. The panel orchestrator also has 200K. 
This means: +- Panel orchestrator: reads files, constructs prompts, synthesizes (~30-40% context usage) +- Each specialist: reads files, performs focused analysis, returns findings (~50-60% context usage) + +**Total token cost:** 4x a single agent (1 panel + 3 specialists). This is the primary tradeoff. + +### Specialist Prompt Construction + +The panel agent receives the same tagged context block from the workflow that the single agent would receive. It passes this context through to each specialist, adding the specialist's focus area: + +``` +specialist_prompt = f""" +{specialist_role_description} + + +You are responsible for verification dimensions: {dimension_list} +Ignore other dimensions -- other specialists handle them. + + +{original_context_from_workflow} + + +Return your findings as: + +### Findings + +#### Dimension {N}: {Name} +Status: PASS | FAIL +Issues: +```yaml +issues: + - plan: "XX-YY" + dimension: "{dimension_name}" + severity: "blocker|warning|info" + description: "..." + fix_hint: "..." +``` + +If no issues for a dimension, state: "Dimension {N}: PASS - no issues found" + +""" +``` + +### Synthesis Pattern + +The panel agent collects all specialist returns and merges: + +1. **Parse** each specialist's YAML issues list +2. **Union** all issues into a single list +3. **Deduplicate** by (plan, task, dimension) -- if two specialists flag the same issue, keep the higher severity +4. **Aggregate** coverage tables (each specialist reports on their dimensions) +5. **Determine** overall status: any blocker or warning -> ISSUES FOUND (matching the panel template's synthesis rules), else VERIFICATION PASSED +6. **Format** into the exact single-agent output contract + +This is analogous to `auto-discuss.md`'s `synthesize_consensus` step, but for verification findings instead of decisions. + +## Patterns to Follow + +### Pattern 1: Transparent Substitution (from auto-discuss.md) + +**What:** auto-discuss.md produces CONTEXT.md in the exact same format as discuss-phase.md. Downstream agents (researcher, planner) consume it identically. 
+ +**Apply to panels:** Panel agents produce output in the exact same format as single agents. Workflow orchestrators consume it identically. + +**Why this works:** The contract is at the output level, not the agent level. What happens inside the agent (1 agent or 3 specialists) is an implementation detail. + +### Pattern 2: Parallel Task Spawning (from auto-discuss.md) + +**What:** auto-discuss.md spawns N agents in parallel via multiple Task() calls in a single message. + +``` +For each agent (1 to AGENT_COUNT): + Task( + subagent_type="general-purpose", + model="${DISCUSS_MODEL}", + prompt="...", + description="${ROLE_NAME} review" + ) +``` + +**Apply to panels:** Panel agents spawn 3 specialists in parallel the same way. All three use `subagent_type="general-purpose"` with inline role prompts. + +### Pattern 3: Config Flag Gating (from existing workflow toggles) + +**What:** `workflow.research: true/false` gates whether the researcher is spawned. The workflow checks `research_enabled` from INIT JSON. + +**Apply to panels:** `workflow.plan_check_panel: true/false` gates whether the panel version is spawned. The workflow checks `plan_check_panel` from INIT JSON. + +### Pattern 4: INIT JSON Pre-computation (from init.cjs) + +**What:** All config resolution happens in `init.cjs` before the workflow starts. The workflow reads a single JSON blob with all flags and models pre-resolved. + +**Apply to panels:** Panel flags and panel models are resolved in init.cjs and included in the INIT JSON. The workflow never reads config.json directly for panel decisions. + +## Anti-Patterns to Avoid + +### Anti-Pattern 1: Panel Agents Altering Output Format +**What:** Adding extra sections, changing header levels, or renaming sections in the panel output. +**Why bad:** Workflow orchestrators parse specific headers (`## VERIFICATION PASSED`, `## ISSUES FOUND`). Any change breaks the revision loop. 
+**Instead:** Copy the exact output template from the single agent into the panel agent's `<output_contract>` section. Make the panel's synthesis step format its output using this template. + +### Anti-Pattern 2: Separate Agent Files for Specialists +**What:** Creating `agents/gsd-coverage-analyst.md`, `agents/gsd-scope-auditor.md`, etc. +**Why bad:** Specialists are not independently useful. They fragment the panel logic across files. They can't be spawned via `subagent_type` (Claude Code doesn't know about them). They add maintenance burden without benefit. +**Instead:** Inline specialist prompts within the panel agent file, constructed dynamically. + +### Anti-Pattern 3: Double-Reading Files +**What:** Panel agent reads all plan files, then each specialist also reads all plan files. +**Why bad:** Wastes context in the panel orchestrator. The panel only needs enough context to construct specialist prompts and synthesize results. +**Instead:** Panel agent reads file paths (not contents) from INIT JSON, passes paths to specialists via `<files_to_read>` blocks. Specialists read files themselves with their fresh 200K context. Panel agent only reads files for synthesis if needed. + +### Anti-Pattern 4: Global Panel Toggle +**What:** `workflow.panels: true` enables all panels at once. +**Why bad:** Different panels have different cost/benefit profiles. Verification panels catch more bugs (high value). Research panels produce more thorough findings but may be overkill for simple phases (moderate value). Users should control each independently. +**Instead:** Per-feature panel toggles: `plan_check_panel`, `verifier_panel`, `research_panel`. + +### Anti-Pattern 5: Panel Orchestrator Doing Analysis +**What:** Panel agent performs its own verification analysis in addition to spawning specialists. +**Why bad:** Duplicates work, wastes context, creates conflicts between panel and specialist findings. +**Instead:** Panel agent is ONLY an orchestrator. 
It constructs prompts, spawns agents, collects results, synthesizes output. All analytical work is done by specialists. + +## Scalability Considerations + +| Concern | 1 panel active | 2 panels active | All 3 panels active | +|---------|---------------|-----------------|---------------------| +| Token cost | 4x single agent | 8x single agent | 12x single agent | +| Wall-clock time | ~same (parallel) | ~same (sequential between panels) | ~same | +| Quality improvement | Focused analysis per dimension | Comprehensive coverage | Maximum thoroughness | +| Context pressure on parent workflow | Minimal (same return size) | Minimal | Minimal | + +**Token cost is the primary constraint.** Each panel spawns 3 specialists, each with 200K context. For budget-conscious users, panels should default to `false`. For quality-focused users (quality model profile), panels provide significant value. + +**Recommended defaults:** +- `research_panel: false` -- research is already thorough with a single agent +- `plan_check_panel: false` -- single checker catches most issues +- `verifier_panel: false` -- single verifier is sufficient for most phases + +Panels are an opt-in quality boost, not a default. + +## Implementation Sequence + +The recommended build order for this feature: + +1. **Config layer first** -- add panel flags to `config.cjs`, `core.cjs`, `init.cjs` +2. **Model profiles** -- add panel entries to MODEL_PROFILES table +3. **Settings UI** -- add panel toggle questions to `settings.md` +4. **Plan-checker panel** -- build first panel (plan-checker is the most well-defined contract) +5. **Conditional dispatch in plan-phase.md** -- wire the config flag to dispatch +6. **Verifier panel** -- second panel (similar structure) +7. **Conditional dispatch in execute-phase.md** -- wire verifier dispatch +8. **Researcher panel** -- third panel (different synthesis pattern) +9. **Conditional dispatch in plan-phase.md research step** -- wire researcher dispatch +10. 
**Integration testing** -- verify panel output matches single agent contracts + +## Sources + +All findings are from direct codebase analysis (HIGH confidence): + +- `get-shit-done/templates/config.json` -- current config schema +- `get-shit-done/bin/lib/core.cjs` -- MODEL_PROFILES table, loadConfig, resolveModelInternal +- `get-shit-done/bin/lib/config.cjs` -- config CRUD operations +- `get-shit-done/bin/lib/init.cjs` -- INIT JSON pre-computation for all workflow types +- `get-shit-done/workflows/plan-phase.md` -- plan checker and researcher dispatch points +- `get-shit-done/workflows/execute-phase.md` -- verifier dispatch point +- `get-shit-done/workflows/auto-discuss.md` -- existing parallel agent spawn + synthesis pattern +- `get-shit-done/workflows/settings.md` -- settings UI pattern for config toggles +- `agents/gsd-plan-checker.md` -- plan checker output contract (VERIFICATION PASSED / ISSUES FOUND) +- `agents/gsd-verifier.md` -- verifier output contract (Verification Complete + VERIFICATION.md) +- `agents/gsd-phase-researcher.md` -- researcher output contract (RESEARCH COMPLETE / RESEARCH BLOCKED) +- `get-shit-done/references/model-profiles.md` -- model profile philosophy and table structure diff --git a/.planning/research/FEATURES.md b/.planning/research/FEATURES.md new file mode 100644 index 0000000000..489d06b454 --- /dev/null +++ b/.planning/research/FEATURES.md @@ -0,0 +1,337 @@ +# Feature Landscape: MoE Panels & Consensus Mechanisms + +**Domain:** AI agent orchestration quality gates with parallel specialist panels +**Researched:** 2026-02-26 +**Overall Confidence:** MEDIUM-HIGH + +## Executive Summary + +GSD currently uses single-agent quality gates (plan-checker, verifier, phase-researcher) that each bear the full responsibility of their domain. The v2.0 MoE Panels milestone replaces each gate with a panel of 3 parallel specialists, each covering a non-overlapping domain partition. 
This is not a voting system -- it is a domain-partitioned assembly pattern where each specialist owns distinct sections of the output document and a synthesizer merges their non-overlapping contributions. + +The key insight from research: **voting and consensus mechanisms solve a different problem than what GSD panels need.** Voting works when multiple agents evaluate the *same* thing and you need to pick the best answer. Domain-partitioned assembly works when agents evaluate *different things* and you need to combine their non-overlapping findings. GSD panels are the latter -- specialists checking distinct dimensions, not redundant reviewers voting on the same dimensions. + +The auto-discuss workflow already proves the panel pattern works in this codebase: it spawns N agents in parallel, collects structured outputs, and synthesizes them. The MoE panel pattern is a constrained version of auto-discuss where specialist domains are pre-defined (not dynamically generated) and output sections are non-overlapping (not debated). + +## Table Stakes + +Features that must exist for panels to deliver value over the current single-agent gates. 
+ +| Feature | Why Expected | Complexity | Confidence | Notes | +|---------|-------------|------------|------------|-------| +| Parallel specialist spawning | Panels must run specialists concurrently to avoid 3x latency | Low | HIGH | GSD already spawns parallel agents in auto-discuss and wave execution | +| Domain-partitioned output assembly | Each specialist must own distinct document sections to avoid duplication and conflict | Medium | HIGH | Core innovation -- see Architecture section | +| Backward compatibility with orchestrators | plan-phase, execute-phase, and research-phase must consume panel output identically to single-agent output | Medium | HIGH | VERIFICATION.md, RESEARCH.md, and checker returns must keep same format | +| Per-panel configuration | Users must be able to enable/disable panels per gate (config.json) and fall back to single-agent | Low | HIGH | Follows existing `workflow.research`, `workflow.plan_check`, `workflow.verifier` pattern | +| Specialist agent definitions (9 agents) | 3 specialists per panel x 3 panels = 9 new agent .md files | High | HIGH | Largest content effort -- each agent needs focused role, dimensions, and output format | +| Panel synthesizer logic | Orchestrator or synthesizer agent must merge 3 specialist outputs into single output document | Medium | MEDIUM | Similar to gsd-research-synthesizer pattern already in codebase | +| Cross-validation between specialists | When one specialist flags an issue, adjacent specialists should verify (reduces false positives) | Medium | MEDIUM | Inspired by diffray's cross-validation approach (87% fewer false positives) | +| Model-per-specialist configuration | Different specialists may benefit from different model strengths | Low | HIGH | Already supported via model profile resolution | + +## Differentiators + +Features that elevate panels beyond basic parallel execution. 
+ +| Feature | Value Proposition | Complexity | Confidence | Notes | +|---------|-------------------|------------|------------|-------| +| Conflict detection at merge time | When two specialists make contradictory claims about the same artifact, flag for resolution rather than silently including both | Medium | MEDIUM | Only relevant at domain boundaries -- should be rare with good partitioning | +| Specialist confidence weighting | Specialists report confidence per finding; synthesizer weights HIGH findings above LOW | Low | MEDIUM | Lightweight version of attention-based routing from MoE literature | +| Degraded-mode fallback | If one specialist fails/times out, produce partial panel output with explicit gaps rather than blocking entirely | Medium | HIGH | Important for reliability -- single-agent fallback for failed specialist | +| Panel-level scoring | Aggregate specialist scores into panel-level pass/fail with drill-down | Low | HIGH | Verifier already produces scores; extend to per-specialist breakdown | +| Configurable specialist count | Allow 1-specialist (single-agent mode), 3-specialist (standard), or 5-specialist (deep) per panel | Medium | LOW | Premature optimization; start with 3-specialist only, add later | + +## Anti-Features + +Features to explicitly NOT build. + +| Anti-Feature | Why Avoid | What to Do Instead | +|--------------|-----------|-------------------| +| Voting/majority consensus between specialists | Specialists own non-overlapping domains -- there is nothing to vote on. Voting suits overlapping evaluations (like auto-discuss where agents evaluate the same gray areas). | Use domain-partitioned assembly: each specialist contributes its section, synthesizer merges. | +| Multi-round debate between specialists | Research shows increasing discussion rounds *decreases* performance (Kaesberg et al., 2025). Adds latency with diminishing returns. | Single-round parallel execution. 
If cross-validation catches a conflict, the synthesizer resolves it -- no iterative debate. | +| Dynamic specialist routing (MoE-style gating) | True MoE routing requires a trained gating network. Our specialists are pre-assigned -- the "routing" is static by design. Dynamic routing adds complexity without benefit for 3-specialist panels. | Static assignment: each specialist always runs. All 3 always fire. | +| Shared context between parallel specialists | Research shows isolated execution produces better diversity. Claude Code subagents already run in isolated contexts. Sharing context risks groupthink. | Each specialist gets independent context. Synthesizer sees all outputs. | +| Complex weighting/scoring algorithms | Over-engineering the merge. The output is markdown, not numerical predictions. | Simple structured merge with section ownership. | +| Specialist-to-specialist communication during execution | Adds synchronization complexity, defeats the purpose of parallelism, risks cascading failures. | Post-execution cross-validation only (synthesizer reads all outputs, flags contradictions). | + +## Consensus Mechanism Analysis + +The central design question. Based on research into how multi-agent systems combine specialist outputs. + +### Mechanism 1: UNION Assembly (RECOMMENDED) + +**What:** Each specialist owns non-overlapping sections of the output document. Synthesizer concatenates sections, resolves boundary conflicts, and produces the final document. + +**When it works:** When specialists have clearly partitioned domains with minimal overlap (which is the design intent for all 3 panels). + +**Evidence:** This is how diffray's multi-agent code review works -- 11 specialists each own a concern, findings are merged and deduplicated. Google ADK's ParallelAgent pattern also uses this: parallel execution with post-processing merge. 
The existing gsd-research-synthesizer in this codebase is literally a UNION assembler -- it reads 4 parallel researcher outputs (STACK, FEATURES, ARCHITECTURE, PITFALLS) and synthesizes SUMMARY.md. + +**Tradeoffs:** +- PRO: No latency penalty beyond slowest specialist (parallel execution) +- PRO: No information loss (every specialist's findings included) +- PRO: Simple implementation (structured merge, no voting logic) +- CON: Requires clean domain partitioning (overlap = conflicts) +- CON: Synthesizer must handle boundary cases + +**Confidence:** HIGH -- This pattern is already proven in the codebase (research-synthesizer, auto-discuss synthesis step). + +### Mechanism 2: Majority Voting + +**What:** All specialists evaluate the same dimensions. Each votes pass/fail on each dimension. Majority wins. + +**When it works:** When you want redundancy -- multiple agents checking the same thing to reduce error. + +**Evidence:** The Kaesberg et al. (2025) study found voting improves reasoning tasks by 13.2% over consensus, but this applies to tasks where agents solve the *same* problem. Multi-Agent Verification (MAV) scales verifiers, not specialists. + +**Tradeoffs:** +- PRO: Built-in redundancy (3 agents checking same thing = fewer misses) +- CON: 3x the work for marginally better accuracy on the same dimensions +- CON: Loses the benefit of domain specialization (generalist voters, not specialists) +- CON: Still need a tie-breaking mechanism for 3 agents + +**Confidence:** HIGH that this is the WRONG pattern for GSD panels. Voting suits auto-discuss (same gray areas, different perspectives). Panels need specialization, not redundancy. + +### Mechanism 3: Consensus via Iterative Debate + +**What:** Specialists discuss findings, iterate toward agreement, converge on shared output. + +**Evidence:** Research shows consensus reduces hallucination on fact-based tasks (2.8% improvement) but multiple rounds decrease overall performance. 
Debate adds latency proportional to round count. + +**Tradeoffs:** +- PRO: May catch edge cases at domain boundaries +- CON: Significantly slower (2-5x latency per round) +- CON: Research explicitly recommends AGAINST multiple rounds +- CON: Complexity explosion in prompt engineering + +**Confidence:** HIGH that this is overkill for GSD panels. + +### Mechanism 4: Domain-Partitioned Assembly with Cross-Validation (RECOMMENDED VARIANT) + +**What:** UNION assembly (Mechanism 1) plus a lightweight cross-validation step where the synthesizer checks for contradictions between specialist outputs before producing the final document. + +**Example:** Plan-checker structural specialist says "dependencies valid" but semantic specialist says "task 3 references output from task 1 which doesn't produce that artifact." The synthesizer flags this as a cross-validation conflict and elevates severity. + +**Tradeoffs:** +- PRO: Gets the speed of UNION assembly +- PRO: Catches boundary-crossing issues +- PRO: Synthesizer is a natural place for this (already reads all outputs) +- CON: Slightly more complex synthesizer logic + +**Confidence:** HIGH -- This is the recommended approach. 
+ +## Panel Designs: Detailed Feature Maps + +### Panel 1: Plan Checker Panel + +**Current single agent:** gsd-plan-checker (8 verification dimensions, returns "VERIFICATION PASSED" or "ISSUES FOUND") + +**Panel specialists:** + +| Specialist | Domain | Dimensions Owned | Output Section | +|------------|--------|-----------------|----------------| +| **Structural Integrity** | Plan mechanics: frontmatter, task completeness, dependency graphs, wave assignment, scope metrics | Dim 2 (Task Completeness), Dim 3 (Dependency Correctness), Dim 5 (Scope Sanity), Dim 8 (Nyquist Compliance) | `## Structural Analysis` | +| **Semantic Quality** | Goal-backward analysis: requirement coverage, must_haves derivation, key links planned, action specificity | Dim 1 (Requirement Coverage), Dim 4 (Key Links Planned), Dim 6 (Verification Derivation) | `## Semantic Analysis` | +| **Compliance** | External constraints: CONTEXT.md decisions, project skills, CLAUDE.md conventions, deferred ideas exclusion | Dim 7 (Context Compliance), plus project skill rules, plus CLAUDE.md conventions | `## Compliance Analysis` | + +**Why this partition:** Structural checks are mechanical (parseable, countable). Semantic checks require reasoning about intent vs. outcome. Compliance checks require cross-referencing external constraint documents. These are genuinely different cognitive tasks. + +**Synthesizer behavior:** Merge all 3 sections. Cross-validate: if Structural says "3 tasks" but Semantic says "requirement X covered by tasks 1,2,3,4" -- flag inconsistency. Produce unified issue list with severity. Overall pass/fail uses worst-case: any blocker from any specialist = ISSUES FOUND. 
+ +**Output format (backward compatible):** +```markdown +## VERIFICATION PASSED | ISSUES FOUND + +**Phase:** {phase-name} +**Plans verified:** {N} +**Panel:** Structural + Semantic + Compliance + +### Structural Analysis +[Structural specialist output: dimensions 2,3,5,8] + +### Semantic Analysis +[Semantic specialist output: dimensions 1,4,6] + +### Compliance Analysis +[Compliance specialist output: dimension 7 + project rules] + +### Cross-Validation Notes +[Synthesizer's boundary-crossing findings] + +### Unified Issue List +[Merged, deduplicated, severity-ranked issues from all specialists] +``` + +**Confidence:** HIGH -- The 8-dimension structure naturally partitions into these 3 groups. No dimension is ambiguously assigned. + +### Panel 2: Verifier Panel + +**Current single agent:** gsd-verifier (3-level artifact verification, key link checks, anti-pattern scanning, creates VERIFICATION.md) + +**Panel specialists:** + +| Specialist | Domain | Checks Owned | Output Section | +|------------|--------|-------------|----------------| +| **Artifact & Wiring** | File existence, substantive content (not stubs), import/usage wiring, key link verification | Level 1 (exists), Level 2 (substantive), Level 3 (wired), Key Links | `## Artifact Verification` + `## Key Link Verification` | +| **Requirements & Anti-Patterns** | Requirement coverage, anti-pattern scanning (TODO/FIXME/placeholder/empty returns), goal-backward truth verification | Requirement mapping, anti-pattern detection, truth status determination | `## Requirements Coverage` + `## Anti-Patterns Found` | +| **Human Verification** | Items needing human testing, visual/UX concerns, external service integration checks, edge case identification | Human-needed classification, test script generation, uncertainty flagging | `## Human Verification Required` | + +**Why this partition:** Artifact/wiring checks are grep-based (mechanical file analysis). 
Requirement/anti-pattern checks are reasoning-based (does this code satisfy that requirement?). Human verification is judgment-based (what can't be verified programmatically?). Each requires different cognitive approaches and tool usage patterns. + +**Synthesizer behavior:** Merge sections into VERIFICATION.md format. Determine overall status: `passed` (all artifacts verified + all requirements satisfied + no blocker anti-patterns), `gaps_found` (any failure), `human_needed` (automated pass but human items remain). Score = verified truths / total truths. Cross-validate: if Artifact specialist says "file exists and is wired" but Anti-Pattern specialist says "file contains only TODO placeholders" -- elevate to blocker. + +**Output format (backward compatible):** +```yaml +--- +phase: XX-name +verified: YYYY-MM-DDTHH:MM:SSZ +status: passed | gaps_found | human_needed +score: N/M must-haves verified +panel: artifact-wiring + requirements-antipatterns + human-verification +gaps: [...] # Merged from all specialists +human_verification: [...] # From human verification specialist +--- +``` + +**Confidence:** HIGH -- The verifier's existing steps (verify_artifacts, verify_wiring, verify_requirements, scan_antipatterns, identify_human_verification) map cleanly to these 3 specialists. 
+ +### Panel 3: Research Panel + +**Current single agent:** gsd-phase-researcher (produces RESEARCH.md with stack, patterns, pitfalls, code examples) + +**Panel specialists:** + +| Specialist | Domain | Sections Owned | Output Section | +|------------|--------|---------------|----------------| +| **Stack & Ecosystem** | Library recommendations, versions, alternatives, don't-hand-roll, installation commands | Standard Stack, Don't Hand-Roll, State of the Art, Installation | `## Standard Stack` + `## Don't Hand-Roll` + `## State of the Art` | +| **Architecture & Patterns** | Project structure, design patterns, code examples, recommended organization | Architecture Patterns, Code Examples, Recommended Project Structure | `## Architecture Patterns` + `## Code Examples` | +| **Pitfalls & Validation** | Common mistakes, pitfalls, gotchas, validation architecture (Nyquist), open questions | Common Pitfalls, Validation Architecture, Open Questions | `## Common Pitfalls` + `## Validation Architecture` + `## Open Questions` | + +**Why this partition:** These are genuinely different research domains. Stack research requires checking Context7/official docs for current versions. Architecture research requires understanding design patterns and project structure. Pitfall research requires finding community wisdom about what goes wrong. Different tool usage, different sources, different reasoning. + +**Synthesizer behavior:** Merge into single RESEARCH.md. Cross-validate: if Stack specialist recommends library X but Pitfalls specialist warns against library X -- flag conflict, let synthesizer resolve or include both with warning. Add Summary section (synthesized from all 3). Ensure User Constraints section appears first (copied from CONTEXT.md by all specialists independently, deduplicated by synthesizer). 
+ +**Output format (backward compatible):** +```markdown +# Phase [X]: [Name] - Research + +**Researched:** [date] +**Domain:** [domain] +**Confidence:** [level] +**Panel:** Stack + Architecture + Pitfalls + +## User Constraints (from CONTEXT.md) +[Synthesizer deduplicates from all 3 specialists] + +## Summary +[Synthesizer writes this from combined findings] + +## Standard Stack +[From Stack & Ecosystem specialist] + +## Architecture Patterns +[From Architecture & Patterns specialist] + +## Don't Hand-Roll +[From Stack & Ecosystem specialist] + +## Common Pitfalls +[From Pitfalls & Validation specialist] + +## Code Examples +[From Architecture & Patterns specialist] + +## State of the Art +[From Stack & Ecosystem specialist] + +## Validation Architecture +[From Pitfalls & Validation specialist] + +## Open Questions +[From Pitfalls & Validation specialist, augmented by synthesizer] + +## Sources +[Merged from all specialists] +``` + +**Confidence:** HIGH -- This is essentially the same pattern as the existing project research pipeline (4 parallel researchers + synthesizer) but applied at phase level. + +## Feature Dependencies + +``` +Per-panel config (config.json) --> Panel enablement check in orchestrators + | + v +Specialist agent definitions (9 agents/*.md files) + | + v +Panel orchestration logic in workflows (plan-phase.md, execute-phase.md, research-phase.md) + | + v +Synthesizer logic (per-panel merge + cross-validation) + | + v +Backward-compatible output format (same VERIFICATION.md / RESEARCH.md / checker return) + | + v +Degraded-mode fallback (if specialist fails, fall back to single-agent) +``` + +**Critical dependency:** Specialist agent definitions must be complete before orchestration logic can be tested. The 9 agent .md files are the largest work item and the foundation for everything else. 
+ +**Parallel work streams:** +- Stream A: Specialist agent definitions (9 files) -- can be done in parallel across panels +- Stream B: Config schema updates -- independent of agent definitions +- Stream C: Orchestrator workflow updates -- depends on A being at least partially done + +## Implementation Complexity Assessment + +| Component | Effort | Risk | Notes | +|-----------|--------|------|-------| +| 9 specialist agent .md files | HIGH (largest effort) | LOW (well-understood pattern from existing agents) | Each is ~200-400 lines. Total ~1800-3600 lines of prompt engineering. | +| Config schema updates | LOW | LOW | Add `panel` section to config.json with per-gate enable/disable | +| plan-phase.md orchestrator update | MEDIUM | MEDIUM | Replace single plan-checker spawn with 3 parallel + synthesizer | +| execute-phase.md orchestrator update | MEDIUM | MEDIUM | Replace single verifier spawn with 3 parallel + synthesizer | +| research-phase.md / plan-phase.md research step | MEDIUM | LOW | Already has pattern from project research pipeline | +| Synthesizer logic (3 synthesizers) | MEDIUM | MEDIUM | Could be inline in orchestrator or separate agents. Inline is simpler. | +| Cross-validation logic | LOW | LOW | Lightweight post-merge check in synthesizer | +| Degraded-mode fallback | LOW | LOW | If a specialist times out, run single-agent as fallback | +| Testing/validation | HIGH | HIGH | Need to verify panel output matches what downstream consumers expect | + +## MVP Recommendation + +**Phase 1 (Foundation): Agent Definitions + Config** +1. Define 9 specialist agent .md files (3 panels x 3 specialists) +2. Add panel config schema to config.json +3. No orchestrator changes yet -- agents can be tested standalone + +**Phase 2 (Integration): Orchestrator Panel Spawning** +1. Update plan-phase.md to spawn plan-checker panel (3 parallel + inline synthesis) +2. Update execute-phase.md to spawn verifier panel (3 parallel + inline synthesis) +3. 
Update plan-phase.md research step to spawn research panel (3 parallel + inline synthesis) +4. Add backward-compatible output format validation + +**Phase 3 (Hardening): Cross-Validation + Fallback** +1. Add cross-validation logic to synthesizers +2. Add degraded-mode fallback for specialist failures +3. Add panel-level scoring and drill-down +4. Test autopilot end-to-end with panels + +**Defer:** +- Configurable specialist count (1/3/5) -- start with 3 only, add later if needed +- Specialist-to-specialist communication -- anti-feature, don't build +- Complex weighting algorithms -- unnecessary for markdown-based outputs + +## Sources + +### Primary (HIGH confidence) +- GSD codebase analysis: agents/gsd-plan-checker.md, agents/gsd-verifier.md, agents/gsd-phase-researcher.md, agents/gsd-research-synthesizer.md +- GSD workflow analysis: workflows/plan-phase.md, workflows/execute-phase.md, workflows/auto-discuss.md, workflows/new-project.md +- Google ADK Parallel Agent documentation: https://google.github.io/adk-docs/agents/workflow-agents/parallel-agents/ + +### Secondary (MEDIUM confidence) +- Kaesberg et al. (2025) "Voting or Consensus? Decision-Making in Multi-Agent Debate" -- https://arxiv.org/abs/2502.19130 -- Systematic evaluation of 7 decision protocols. Key finding: voting better for reasoning, consensus for knowledge, more agents better than more rounds. +- Qodo "Single-Agent vs Multi-Agent Code Review" -- https://www.qodo.ai/blog/single-agent-vs-multi-agent-code-review/ -- Architecture for domain-partitioned code review with explicit pass/fail signals per specialist. +- Diffray "Multi-Agent Code Review" -- https://diffray.ai/multi-agent-code-review/ -- 11-specialist architecture with cross-validation and deduplication. 87% fewer false positives. 
+- ProofSource "Parallel Sub-Agents in Claude Code" -- https://proofsource.ai/2025/12/parallel-sub-agents-in-claude-code-multiplying-your-development-speed/ -- Claude Code synthesizes subagent findings into coherent responses. Diminishing returns beyond 4-5 parallel agents. + +### Tertiary (LOW confidence) +- General multi-agent system surveys from 2025 (classicinformatics, ioni.ai, neomanex) -- broad patterns, not GSD-specific +- MoE model architecture literature (HuggingFace, NVIDIA) -- neural network MoE patterns; analogy to agent panels is loose diff --git a/.planning/research/PITFALLS.md b/.planning/research/PITFALLS.md new file mode 100644 index 0000000000..4f86902556 --- /dev/null +++ b/.planning/research/PITFALLS.md @@ -0,0 +1,352 @@ +# Domain Pitfalls: MoE Panels for Agent Orchestration + +**Domain:** Parallel agent orchestration with output merging and consensus logic +**Researched:** 2026-02-26 +**Overall confidence:** HIGH (pitfalls derived from codebase analysis + multi-agent system literature) + +--- + +## Critical Pitfalls + +Mistakes that cause rewrites, broken workflows, or silent data loss. + +--- + +### Pitfall 1: Output Contract Drift Between Panel and Single-Agent Mode + +**What goes wrong:** Panel output diverges from the exact string patterns that downstream workflows regex-match. The plan-phase.md workflow matches `## VERIFICATION PASSED` and `## ISSUES FOUND` as literal strings. The execute-phase.md workflow greps `^status:` from VERIFICATION.md frontmatter. The research-phase.md workflow matches `## RESEARCH COMPLETE` and `## RESEARCH BLOCKED`. If a panel's merging logic produces `## Verification Passed` (wrong case), `## ISSUES FOUND\n\n` (extra newline before content), or `status:  passed` (two spaces after the colon instead of one), the downstream orchestrator silently falls through to a default branch and the workflow breaks. + +**Why it happens:** Three specialists each produce markdown independently. 
A synthesizer/merger must reconstruct the exact output, but no specialist "owns" the final header format. The merger may normalize whitespace, adjust casing, or insert its own section headers that subtly differ. LLM agents are nondeterministic -- even with identical prompts, they produce slightly different formatting. + +**Consequences:** +- Orchestrator workflows silently misroute (no error, just wrong branch) +- `plan-phase.md` step 11 fails to detect `## VERIFICATION PASSED` and treats it as inconclusive, entering an infinite revision loop or max-iteration bailout +- `execute-phase.md` step `verify_phase_goal` fails to grep `^status:` from VERIFICATION.md, treating a passed verification as gaps_found +- Users see "gaps found" when everything actually passed -- trust erodes + +**Prevention:** +1. The orchestrator (not the specialists) must own the final output template. Specialists produce structured data (YAML/JSON or structured markdown sections). The orchestrator assembles the final output using a deterministic template with hardcoded headers. +2. Define output contracts as constants in a shared reference file (`references/panel-contracts.md`) that both panel workflows and tests reference. +3. Never let LLM agents write the `## VERIFICATION PASSED` / `## ISSUES FOUND` header -- the orchestrator writes it based on parsed specialist data. +4. Add contract tests: for each panel, assert that output exactly matches a regex set extracted from the consuming workflow's matching patterns. + +**Detection:** Integration test that runs panel mode and single-agent mode on identical input, then diffs the structural elements (headers, frontmatter keys, status values). Any diff is a test failure. + +**Confidence:** HIGH -- directly derived from codebase analysis of `plan-phase.md`, `execute-phase.md`, `verify-phase.md`, and `research-phase.md` workflow routing patterns. 
+ +--- + +### Pitfall 2: Consensus Logic Double-Counting or Dropping Findings + +**What goes wrong:** In the Plan Checker Panel, three specialists (Structural, Semantic, Compliance) may each report the same underlying issue from different angles. UNION of blockers and MAJORITY of warnings sounds simple, but: +- **Double-counting:** Specialist A reports "Task 2 missing `<verify>`" as a structural issue. Specialist B reports "Task 2 cannot be validated" as a semantic issue. These are the same problem counted twice, inflating the blocker count and confusing the planner during revision. +- **Dropping findings:** If dedup is too aggressive (e.g., matching on task number alone), distinct issues on the same task get collapsed. "Task 2 missing `<verify>`" and "Task 2 scope too broad" are different problems that share a task reference. +- **Conflicting severities:** Specialist A says "blocker", Specialist B says "warning" for the same finding. UNION of blockers means any-one-says-blocker wins. But if the blocker assessment is wrong (LLM hallucination), there is no correction mechanism. + +**Why it happens:** Deduplication requires semantic similarity judgment, not exact string matching. The issue descriptions from three LLM agents will never be identical strings even when describing the same problem. Naive dedup (exact match) catches nothing. Aggressive dedup (substring match on plan+task) drops distinct issues. + +**Consequences:** +- Planner receives inflated issue count, over-revises plans (rewrites that introduce new problems) +- Planner receives collapsed issues, misses one of two distinct problems +- Revision loop hits max iterations because "fixed" issues keep reappearing from a different specialist's perspective + +**Prevention:** +1. Normalize findings to a canonical structure BEFORE dedup: `{plan_id, task_id, dimension, severity, description}`. Dedup on `{plan_id, task_id, dimension}` tuple -- same plan, same task, same dimension = same finding, take highest severity. +2. 
If two specialists report the same plan+task but different dimensions (e.g., `task_completeness` vs `scope_sanity`), keep both -- they are genuinely different concerns. +3. For severity conflicts: take the highest severity (conservative). A blocker from any specialist is a blocker. This matches the stated UNION-blockers rule. +4. Include a `reported_by` field in merged output so the planner can see which specialists flagged which issues. Transparency reduces confusion during revision. +5. Add a dedup count: "3 specialists flagged this" vs "1 specialist flagged this" helps the planner prioritize. + +**Detection:** Unit test with three specialist outputs containing known overlapping and distinct issues. Assert merged output has exact expected count, correct severity escalation, and no dropped findings. + +**Confidence:** HIGH -- the Plan Checker agent already defines 8 verification dimensions (requirement_coverage, task_completeness, dependency_correctness, etc.). The dimension field is the natural dedup key. This is directly grounded in `gsd-plan-checker.md`. + +--- + +### Pitfall 3: Verifier Panel Domain Boundary Bleed + +**What goes wrong:** The Verifier Panel uses domain-partitioned assembly (not voting): Artifacts specialist checks file existence/substance, Requirements specialist checks requirement coverage, Human-verification specialist identifies what needs manual testing. The risk is that domains bleed: the Artifacts specialist discovers a missing file that is also a requirement gap, and the Requirements specialist independently discovers the same gap from the requirements side. Or worse, neither specialist covers a cross-cutting concern because each assumes the other handles it. + +**Why it happens:** Clean domain boundaries look clear on paper (artifacts vs requirements vs human) but real verification findings are cross-cutting. 
A missing API route is simultaneously an artifact issue (file does not exist), a requirement issue (REQ-AUTH-01 not satisfied), and potentially a human-verification issue (cannot test login flow). Domain partitioning means each specialist sees only their slice of the problem. + +**Consequences:** +- **Gap between domains:** No specialist checks key_links (wiring between artifacts). The current single-agent verifier checks three levels: exists, substantive, wired. If "wired" falls between Artifacts and Requirements domains, nobody checks it. +- **Redundant findings:** Same missing file appears in both Artifacts and Requirements sections with different descriptions, confusing the gap-closure planner. +- **VERIFICATION.md structural inconsistency:** The single-agent verifier produces a specific YAML frontmatter structure (`gaps:` with `truth`, `status`, `reason`, `artifacts`, `missing`). Domain-partitioned assembly must reconstruct this exact structure from three separate domain reports. + +**Prevention:** +1. Pre-compute shared data BEFORE specialist dispatch. The orchestrator runs `gsd-tools.cjs verify artifacts` and `gsd-tools.cjs verify key-links` once, then distributes the JSON results to all specialists. This eliminates the "discovery" phase overlap. +2. Assign key_links (wiring) verification explicitly to the Artifacts specialist. Make domain ownership unambiguous in the specialist prompts. +3. Assembly logic must deduplicate on `artifact.path` -- if both Artifacts and Requirements report an issue for the same file, merge into one gap entry with evidence from both. +4. The assembly step is deterministic code (not LLM). It reads structured sections from each specialist and templates them into the VERIFICATION.md format. Never have an LLM "synthesize" verification output -- the format contract is too strict. + +**Detection:** Test with a scenario where a missing file satisfies multiple domain concerns. 
Assert the merged VERIFICATION.md contains exactly one gap entry for that file with composite evidence, not duplicates. + +**Confidence:** HIGH -- the verifier agent's VERIFICATION.md format is fully specified in `gsd-verifier.md` with YAML frontmatter schema. Domain partitioning assembly must reconstruct this exact schema. + +--- + +### Pitfall 4: Specialist Timeout or Failure Breaks Entire Panel + +**What goes wrong:** One of three specialists times out (Claude Code has execution time limits), crashes, or produces malformed output. The panel either: (a) fails entirely and produces no output, breaking the workflow, or (b) waits indefinitely for the failed specialist, blocking the entire pipeline. + +**Why it happens:** Claude Code Task subagents can fail due to context limits, model errors, `classifyHandoffIfNeeded` bugs (documented in execute-phase.md), or simple timeouts. With a single agent, failure is straightforward -- the workflow catches it. With three parallel agents, partial failure is the hard case. + +**Consequences:** +- If the panel requires all three specialists: one failure = total panel failure = workflow stops +- If the panel waits for all: one hanging specialist blocks everything +- If the panel proceeds with 2/3: output quality degrades but silently (user does not know one specialist did not contribute) + +**Prevention:** +1. **Graceful degradation rule:** If 2/3 specialists succeed, the panel produces output using available results with a warning header: `Note: {specialist_name} did not complete. Results from {N}/3 specialists.` +2. **Never wait indefinitely.** Set a timeout per specialist. If a specialist has not returned when others have, proceed after a reasonable delay. +3. **Fallback to single-agent mode:** If 2/3 specialists fail, abandon the panel and fall back to the single-agent version of the step. This is the safest degradation path because the single-agent mode is the existing, tested code path. +4. 
**For the Plan Checker Panel:** 2/3 is still valid for UNION blockers (any blocker from any specialist is still a blocker). For MAJORITY warnings, a majority of 3 is still 2, so a warning survives only when both remaining specialists agree -- which still works. +5. **For the Verifier Panel:** If the Artifacts specialist fails, the pre-computed shared data (from gsd-tools.cjs) is still available. The orchestrator can inject it directly into the assembly. If the Requirements specialist fails, the orchestrator can do a simple requirements-to-artifacts cross-reference from the pre-computed data. +6. **For the Research Panel:** If one domain researcher fails, the inline synthesis simply notes the gap: "Stack research not available -- this area needs phase-specific research later." + +**Detection:** Test by mocking one specialist returning an error or empty output. Assert the panel still produces valid output (with degradation warning) that passes the output contract tests from Pitfall 1. + +**Confidence:** HIGH -- the `classifyHandoffIfNeeded` bug is already documented in `execute-phase.md` step 5. Partial failure handling is a well-established pattern in the codebase's wave execution model (where one plan failing in a wave does not necessarily stop other plans). + +--- + +### Pitfall 5: Context Window Bloat From Passing Full Context to All Three Specialists + +**What goes wrong:** Each specialist needs context to do its job. Naively passing the full context (ROADMAP, STATE, REQUIREMENTS, CONTEXT.md, RESEARCH.md, all PLAN.md files, codebase analysis docs) to all three specialists triples the effective context cost. Specialists hit context limits and produce degraded output (hallucinations, missed findings, truncated analysis). + +**Why it happens:** The current single-agent architecture passes context via `<files_to_read>` blocks -- the agent reads files independently using its fresh 200K context window. This works well for one agent. 
For three agents, the problem is not the file reading itself (each gets fresh context) but the API cost and latency of three parallel 200K-context conversations. + +More insidiously: if specialists are given files irrelevant to their domain, they waste context on parsing and may get confused by irrelevant information, producing lower-quality findings. + +**Consequences:** +- 3x API cost per panel invocation (3 specialists each with full context) +- Slower panel execution (3 parallel model calls with large context) +- Lower specialist quality: irrelevant context = attention dilution = worse findings + +**Prevention:** +1. **Scope specialist context to their domain.** The Plan Checker Structural specialist needs PLAN.md files only (not RESEARCH.md, not STATE.md). The Semantic specialist needs PLAN.md + ROADMAP goal + REQUIREMENTS. The Compliance specialist needs PLAN.md + CONTEXT.md (user decisions). Each specialist reads only what it needs. +2. **Pre-compute shared data.** The Verifier Panel should run `gsd-tools.cjs verify artifacts` and `gsd-tools.cjs verify key-links` ONCE and distribute JSON results, not have each specialist independently read and grep the entire codebase. +3. **For the Research Panel:** Domain researchers already have scoped concerns (Stack, Architecture, Pitfalls). Each reads only the files relevant to their domain. The orchestrator should not pass all codebase analysis files to all three researchers. +4. **Measure context efficiency:** Add a diagnostic that logs total tokens consumed per specialist. If any specialist uses more than 60% of its context window, the scoping is too broad. + +**Detection:** Log context window usage per specialist invocation during testing. Alert if any specialist exceeds 60% context utilization on test inputs. + +**Confidence:** HIGH -- Anthropic's own engineering blog recommends scoped context per sub-agent. The GSD architecture already follows this principle (orchestrator passes paths, not content). 
The risk is that panel implementation regresses this pattern by over-sharing. + +--- + +## Moderate Pitfalls + +--- + +### Pitfall 6: Research Panel Inline Synthesis Produces Inconsistent File Structure + +**What goes wrong:** The Research Panel has 3 domain researchers (Stack, Architecture, Pitfalls) with inline synthesis by the orchestrator (no separate synthesizer). The risk: the orchestrator must produce multiple research files (SUMMARY.md, STACK.md, FEATURES.md, ARCHITECTURE.md, PITFALLS.md) from three domain-specific outputs. If each researcher produces different structural conventions (different heading levels, different frontmatter, different confidence level labels), the orchestrator's inline synthesis produces inconsistent files. + +**Why it happens:** The current `gsd-project-researcher.md` agent produces all five files with consistent internal formatting because one agent writes all files. When three specialists each write their domain, they may use different formatting conventions despite having the same output format specification in their prompts. LLM agents are not deterministic -- even identical prompts produce variation. + +**Prevention:** +1. Each domain researcher writes exactly ONE file in their domain (Stack researcher -> STACK.md, Architecture researcher -> ARCHITECTURE.md, Pitfalls researcher -> PITFALLS.md). +2. The orchestrator writes SUMMARY.md and FEATURES.md by reading the three domain files and synthesizing. These two files are cross-cutting (they reference findings from all domains). +3. Provide a strict template for each domain file with exact heading structure, table formats, and frontmatter fields. The template is in `get-shit-done/templates/research-project/`. +4. The orchestrator validates each file's structure before committing: correct frontmatter, expected headings present, confidence levels using correct vocabulary (HIGH/MEDIUM/LOW, not "high"/"medium"/"low"). 
+ +**Detection:** Structure validation test: parse each research output file and assert expected headings, frontmatter keys, and confidence level vocabulary match the template specification. + +**Confidence:** MEDIUM -- the exact research output templates exist in the codebase but are guidance, not strict schemas. The risk is moderate because research files are consumed by humans (roadmap creation) not by regex-matching workflows, so slight format variation is more tolerable than in verification or plan-checking output. + +--- + +### Pitfall 7: Config Migration Breaks Existing Installations + +**What goes wrong:** Three new config keys (`plan_check_panel`, `verifier_panel`, `research_panel`) are added to `config.json`. Existing installations do not have these keys. If the code checks `config.plan_check_panel === true` but the key does not exist, the property is `undefined`, so the comparison evaluates to `false` -- correct behavior (panels disabled by default). But if ANY code path checks `config.plan_check_panel !== false` (testing for explicit opt-out instead of explicit opt-in), missing keys evaluate to `true` and panels activate unexpectedly on existing installations. + +**Why it happens:** JavaScript truthiness gotchas. `undefined !== false` is `true`. This is a classic boolean config default problem that has bitten many CLI tools. + +**Prevention:** +1. **Always check for explicit opt-in:** `config.plan_check_panel === true`, never `config.plan_check_panel !== false`. +2. **Normalize config at load time:** In `core.cjs` config loading, add default values for all panel keys: `{ plan_check_panel: false, verifier_panel: false, research_panel: false }`. Use the `Object.assign({}, defaults, loadedConfig)` pattern so the shared defaults object is not mutated. +3. **Do not update the template config.json** to include panel keys with `true` values. The template should either omit them (relying on defaults) or explicitly set them to `false`. +4. **Add a config schema version.** When new keys are added, bump the schema version. 
The health check (`/gsd:health`) can warn about missing keys and offer to add defaults. +5. **Test with a config.json that has NO panel keys** -- this is the upgrade path for every existing user. + +**Detection:** Unit test in `tests/core.test.cjs` (or wherever config loading is tested): load a config.json without panel keys, assert all panel features are disabled. + +**Confidence:** HIGH -- the existing config template in `get-shit-done/templates/config.json` does not have panel keys. The `workflow` section has `research`, `plan_check`, and `verifier` as boolean flags already. The new panel keys must coexist with these existing keys without conflict (see Pitfall 8). + +--- + +### Pitfall 8: Config Key Naming Collision With Existing Workflow Keys + +**What goes wrong:** The existing config already has `workflow.plan_check: true` (enables the plan checker step) and `workflow.verifier: true` (enables the verifier step). The new MoE keys are `plan_check_panel`, `verifier_panel`, `research_panel`. The risk: confusing interaction between `workflow.plan_check` (enable/disable the step entirely) and `plan_check_panel` (use panel mode vs single-agent mode when the step IS enabled). + +Consider: `workflow.plan_check: false, plan_check_panel: true`. Does this mean: skip plan checking entirely (first key wins) or use panel mode for plan checking (second key wins)? The correct answer is: `workflow.plan_check` gates whether the step runs at all; `plan_check_panel` selects the implementation when it does run. But if this is not documented and tested, bugs will emerge. + +**Why it happens:** Two levels of configuration (feature gate vs implementation selector) is inherently confusing. The naming convention does not make the relationship obvious. + +**Prevention:** +1. **Nest panel config under `workflow`:** `workflow.plan_check_panel`, `workflow.verifier_panel`, `workflow.research_panel`. This keeps all workflow toggles in one place and makes the hierarchy clear. +2. 
**Document the precedence rule explicitly:** `workflow.plan_check` must be `true` for `workflow.plan_check_panel` to have any effect. If the step is disabled, the panel key is ignored. +3. **Init command should enforce this:** `gsd-tools.cjs init plan-phase` already returns `plan_checker_enabled`. It should also return `plan_check_panel_enabled`, computed as `workflow.plan_check === true && workflow.plan_check_panel === true`. +4. **Add a truth table to the config template as a comment or to the user guide:** + +| `workflow.plan_check` | `workflow.plan_check_panel` | Behavior | +|---|---|---| +| `false` | any | Plan checking skipped entirely | +| `true` | `false` or missing | Single-agent plan checker (current behavior) | +| `true` | `true` | 3-specialist panel plan checker | + +**Detection:** Integration test: set `plan_check: false, plan_check_panel: true`, assert plan checking does NOT run. Set `plan_check: true, plan_check_panel: false`, assert single-agent mode runs. Set `plan_check: true, plan_check_panel: true`, assert panel mode runs. + +**Confidence:** HIGH -- directly derived from examining `get-shit-done/templates/config.json` and the `plan-phase.md` workflow which already checks `plan_checker_enabled` from init JSON. + +--- + +### Pitfall 9: Dedup Logic Fails on Semantically Similar But Textually Different Issues + +**What goes wrong:** The Plan Checker Panel's consensus logic requires deduplicating issues across three specialists. Two specialists might report: +- Specialist A: `"Task 2 missing <verify> element"` (dimension: task_completeness) +- Specialist B: `"Task 2 lacks verification step"` (dimension: task_completeness) + +These are the same issue, but string comparison misses the match. If dedup relies on exact description matching, duplicates survive. If dedup relies on `{plan, task, dimension}` tuple (as recommended in Pitfall 2), these correctly deduplicate. 
But edge cases exist: +- Phase-level issues (no task reference): `{plan: null, task: null, dimension: scope_sanity}` -- two specialists both flag scope concerns with different descriptions. Tuple match works but loses the distinct details. +- Multi-task issues: "Plans 02 and 03 have circular dependency" -- this has two plan references, not one. + +**Prevention:** +1. Require specialists to output issues in the existing structured YAML format (from `gsd-plan-checker.md`): `{plan, dimension, severity, description, task, fix_hint}`. Dedup on `{plan, task, dimension}`. +2. For phase-level issues (no task), dedup on `{plan: null, task: null, dimension}`. If two specialists flag the same dimension at the phase level, keep the more detailed description. +3. For multi-plan issues (circular dependencies), normalize to the first plan in the cycle as the canonical plan reference. +4. Do NOT attempt semantic similarity matching (no embeddings, no LLM-as-judge for dedup). The structured fields provide sufficient dedup keys. Semantic similarity adds complexity and nondeterminism to what should be a deterministic merge step. + +**Detection:** Unit test with edge cases: phase-level issues from multiple specialists, circular dependency issues, and identical-dimension-different-description pairs. + +**Confidence:** HIGH -- the issue format is already well-defined in `gsd-plan-checker.md`. Structured dedup on defined fields is deterministic and testable. + +--- + +### Pitfall 10: Panel Mode Cannot Be Tested End-to-End Without Expensive LLM Calls + +**What goes wrong:** Panel logic involves spawning 3 LLM agents, collecting their outputs, and merging. Unit tests can mock the specialist outputs and test the merge logic. But end-to-end tests (verifying the full pipeline from input to final output) require actual LLM calls, which are expensive, slow, and nondeterministic. + +**Why it happens:** The core value of panels is that multiple LLM specialists produce diverse findings. 
Mocking them eliminates the very thing being tested (LLM diversity). But real LLM calls make CI unreliable (model output varies, tests flake). + +**Consequences:** +- Tests pass with mocked outputs but fail with real LLM calls due to unexpected formatting +- Output contract violations are only discovered in production (user runs panel, gets malformed output) +- CI becomes slow and expensive if real LLM calls are included + +**Prevention:** +1. **Layer the testing strategy:** + - **Unit tests (CI):** Test merge/consensus/dedup logic with fixed specialist outputs. These are deterministic, fast, and catch logic bugs. This is where 90% of panel bugs will be caught. + - **Contract tests (CI):** Validate that merge output matches output contract patterns (regex from consuming workflows). Use fixed inputs, assert structural correctness. + - **Integration tests (manual/nightly):** Run full panel with real LLM calls. Compare structural output against single-agent output for same input. Flag structural differences (not content differences). +2. **Snapshot testing for output structure:** Capture the structural skeleton of panel output (headers, frontmatter keys, section order) and snapshot it. Content varies, structure must not. +3. **The merge step MUST be deterministic code, not LLM.** This makes the merge logic fully testable without LLM calls. Only specialist dispatch requires LLMs; everything after collection is pure code. + +**Detection:** CI test suite with contract tests that run on every PR. Nightly integration test that runs full panel and diffs structural output. + +**Confidence:** HIGH -- the existing test infrastructure uses `node:test` + `node:assert` with temp directory isolation. Merge logic can be tested as pure functions. The testing pattern is well-established in the codebase (433 tests, 94% coverage). 
+ +--- + +## Minor Pitfalls + +--- + +### Pitfall 11: Parallel Specialist Spawn Order Creates Non-Deterministic Merge Order + +**What goes wrong:** Three specialists are spawned in parallel. The order they complete is nondeterministic (depends on model latency, context size, network conditions). If the merge logic processes results in completion order rather than a fixed order, the output varies between runs even with identical inputs. This makes debugging harder and snapshot tests fragile. + +**Prevention:** Always sort specialist results by specialist name/role before merging. The merge function takes an array sorted by `[structural, semantic, compliance]` (or `[artifacts, requirements, human]`), not by completion time. + +**Confidence:** HIGH -- trivial to implement, easy to miss. + +--- + +### Pitfall 12: Specialist Prompts Drift From Single-Agent Prompts Over Time + +**What goes wrong:** The single-agent versions (gsd-plan-checker.md, gsd-verifier.md, gsd-project-researcher.md) continue to be maintained and updated. The specialist panel prompts are derivatives. Over time, updates to the single agent are not propagated to the specialists, creating behavioral divergence. + +**Prevention:** +1. Specialist prompts should `@include` or reference the base agent prompt and add only their domain-scoping delta. Do not copy-paste the full agent prompt into specialist prompts. +2. If full inclusion is too expensive (context), extract the shared verification dimensions/process steps into a shared reference file (`references/plan-check-dimensions.md`) that both the single-agent and specialist prompts reference. +3. Add a CI check: hash the base agent prompt sections and compare against the specialist prompts. If the base changes, flag the specialists for review. + +**Confidence:** MEDIUM -- this is a maintenance concern, not an implementation bug. It will not cause problems at launch but will accumulate over months. 
+ +--- + +### Pitfall 13: Research Panel Domain Researchers Produce Overlapping Content + +**What goes wrong:** The Stack researcher covers "what technology to use." The Architecture researcher covers "how to structure the system." The Pitfalls researcher covers "what can go wrong." But technology choices (Stack) inform architecture decisions (Architecture), and both inform pitfalls (Pitfalls). Without careful domain boundaries, each researcher partially duplicates the others' work. The inline synthesis then has to reconcile three partially-overlapping narratives. + +**Prevention:** +1. Define hard exclusion rules in each specialist prompt: + - Stack: technology selection and rationale. Does NOT discuss architecture patterns or failure modes. + - Architecture: system structure, component boundaries, data flow. Does NOT discuss technology selection or common mistakes. + - Pitfalls: failure modes, anti-patterns, risks. Does NOT recommend technologies or define architecture. +2. Accept minor overlap as natural. The synthesis step (SUMMARY.md, FEATURES.md) is where cross-cutting concerns are reconciled. The domain files are allowed to reference each other ("see STACK.md for technology choice rationale") without duplicating. + +**Confidence:** MEDIUM -- overlap is inherent in domain decomposition. It is manageable with clear prompt boundaries but cannot be fully eliminated. 
+ +--- + +## Phase-Specific Warnings + +| Phase Topic | Likely Pitfall | Mitigation | +|---|---|---| +| Panel infrastructure (config, routing) | Config key collision with existing workflow keys (Pitfall 8) | Nest under `workflow.*`, test all 4 combinations of enable/panel flags | +| Plan Checker Panel consensus | Double-counting or dropping findings (Pitfall 2) | Structured dedup on `{plan, task, dimension}` tuple with severity escalation | +| Plan Checker Panel output | Output contract drift (Pitfall 1) | Orchestrator writes final headers from structured data, contract tests | +| Verifier Panel assembly | Domain boundary bleed (Pitfall 3) | Pre-compute shared data, assign key_links to Artifacts specialist explicitly | +| Verifier Panel output | VERIFICATION.md format deviation (Pitfall 1) | Deterministic code assembly, not LLM synthesis of final output | +| Research Panel synthesis | Overlapping domain content (Pitfall 13) | Hard exclusion rules in prompts, accept minor overlap, synthesize in SUMMARY.md | +| Specialist failure handling | One specialist timeout breaks panel (Pitfall 4) | Graceful degradation to 2/3, fallback to single-agent mode at 1/3 | +| Testing strategy | Cannot test panels end-to-end cheaply (Pitfall 10) | Layered testing: unit (merge logic) + contract (output structure) + integration (nightly) | +| Backwards compatibility | Config migration breaks existing users (Pitfall 7) | Explicit opt-in (`=== true`), normalize defaults at load time, test without panel keys | + +--- + +## Cross-Cutting Design Principles + +These principles address multiple pitfalls simultaneously: + +### Principle 1: Orchestrator Owns Output, Specialists Own Analysis + +Specialists produce structured findings (YAML issues, JSON verification results, markdown sections with defined headings). The orchestrator/merge-step assembles these into the final output using deterministic templates. Never let an LLM write the structural wrapper -- only the content within sections. 
+ +**Addresses:** Pitfall 1, 3, 6 + +### Principle 2: Pre-Compute, Then Distribute + +Any data that multiple specialists need should be computed once by the orchestrator (using gsd-tools.cjs) and distributed as input, not computed independently by each specialist. This reduces cost, eliminates inconsistency, and makes the pre-computed data available for fallback if a specialist fails. + +**Addresses:** Pitfall 3, 4, 5 + +### Principle 3: Deterministic Merge, Nondeterministic Analysis + +The analysis step (what specialists find) is nondeterministic by nature -- that is the point of using multiple LLM agents. The merge step (how findings are combined) must be deterministic code. This separation makes the merge fully testable without LLM calls and ensures output contract compliance. + +**Addresses:** Pitfall 1, 2, 9, 10 + +### Principle 4: Graceful Degradation to Known-Good Path + +Every panel must have a fallback path to the existing single-agent mode. If panels are enabled but failing, the system should degrade to the working single-agent implementation rather than producing no output or broken output. This is the safety net that makes opt-in panels low-risk. + +**Addresses:** Pitfall 4, 7, 8 + +--- + +## Sources + +- GSD codebase analysis: `agents/gsd-plan-checker.md`, `agents/gsd-verifier.md`, `agents/gsd-project-researcher.md` (output contract patterns) -- HIGH confidence +- GSD codebase analysis: `get-shit-done/workflows/plan-phase.md`, `get-shit-done/workflows/execute-phase.md`, `get-shit-done/workflows/verify-phase.md` (downstream regex matching) -- HIGH confidence +- GSD codebase analysis: `get-shit-done/templates/config.json` (existing config structure) -- HIGH confidence +- GSD codebase analysis: `get-shit-done/bin/lib/verify.cjs`, `get-shit-done/bin/lib/frontmatter.cjs` (output parsing code) -- HIGH confidence +- [ACL 2025: Voting or Consensus? 
Decision-Making in Multi-Agent Debate](https://aclanthology.org/2025.findings-acl.606/) -- MEDIUM confidence (academic; applied indirectly to agent panels) +- [Maxim.ai: Multi-Agent System Reliability](https://www.getmaxim.ai/articles/multi-agent-system-reliability-failure-patterns-root-causes-and-production-validation-strategies/) -- MEDIUM confidence (industry patterns, verified against GSD architecture) +- [Azure Architecture Center: AI Agent Design Patterns](https://learn.microsoft.com/en-us/azure/architecture/ai-ml/guide/ai-agent-design-patterns) -- MEDIUM confidence (general patterns) +- [Google ADK: Parallel Agents](https://google.github.io/adk-docs/agents/workflow-agents/parallel-agents/) -- MEDIUM confidence (framework-specific but pattern applicable) +- [Anthropic: Effective Context Engineering for AI Agents](https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents) -- MEDIUM confidence (authoritative source on context scoping) +- [JetBrains Research: Context Management for LLM-Powered Agents](https://blog.jetbrains.com/research/2025/12/efficient-context-management/) -- LOW confidence (single source, applied indirectly) diff --git a/.planning/research/STACK.md b/.planning/research/STACK.md new file mode 100644 index 0000000000..04e9e09125 --- /dev/null +++ b/.planning/research/STACK.md @@ -0,0 +1,291 @@ +# Technology Stack: MoE Panels for Agent Orchestration + +**Project:** get-shit-done v2.0 -- MoE Panels +**Researched:** 2026-02-26 +**Domain:** Multi-agent panel orchestration for AI coding assistant quality gates + +## Constraint + +No new npm dependencies allowed. 
All patterns must be implementable using: +- Markdown agent definitions (`.md` files in `agents/`) +- Claude Code's Task tool for spawning subagents +- Node.js built-ins for CLI tooling (`bin/lib/*.cjs`) +- Existing `.planning/` state management layer + +## Recommended Architecture: Scatter-Gather with LLM Synthesizer + +**Confidence: HIGH** -- This pattern is the most proven and directly maps to GSD's existing infrastructure. + +The architecture replaces each single quality-gate agent with a **panel of 3 parallel specialists** plus a **synthesizer agent** that merges their outputs. This is the "fan-out/fan-in" or "scatter-gather" pattern documented by Microsoft Azure Architecture Center, AWS, Google ADK, and implemented in CrewAI, LangGraph, and AutoGen. + +### Why 3 Specialists (Not 2, Not 5) + +| Panel Size | Tradeoff | Verdict | +|------------|----------|---------| +| 2 agents | Ties possible, limited diversity | Too few | +| **3 agents** | **Majority possible, diverse enough, manageable token cost** | **Use this** | +| 4 agents | Even number creates ties, diminishing returns | Avoid | +| 5+ agents | Error compounding grows, 60k+ token overhead per panel, coordination tax exceeds gains | Too many | + +**Confidence: MEDIUM** -- The "Coordination Tax" research from Google DeepMind (2025) shows accuracy gains saturate or fluctuate beyond 4 agents. The "17x error trap" paper demonstrates that poorly coordinated multi-agent systems compound errors. 3 is the sweet spot for: (a) majority vote viability, (b) diverse perspectives, (c) affordable token cost (3 x 20k = 60k overhead), (d) matches Claude Code's 10-agent parallel cap. + +### Panel Composition Strategy + +Each panel needs **specialist diversity, not specialist redundancy**. The 3 agents must examine the problem from genuinely different angles. If all 3 use the same approach, you get redundancy not robustness. 
+ +**Pattern: Role-Based Specialization** + +| Specialist Role | Focus | What It Catches | +|----------------|-------|-----------------| +| **Domain Expert** | Does this achieve the functional goal? | Missing requirements, wrong behavior | +| **Quality Auditor** | Does this meet structural standards? | Anti-patterns, missing tests, scope issues | +| **Devil's Advocate** | What could go wrong? | Edge cases, failure modes, hidden assumptions | + +**Confidence: MEDIUM** -- This role triad is derived from multi-agent debate literature (ACL 2025 findings, debate-based consensus patterns) and mirrors effective human review panels. The specific role names and scopes need phase-specific tuning. + +## Panel Types and Specialist Definitions + +### Panel 1: Plan Checker Panel (replaces single gsd-plan-checker) + +Currently one agent checks all 8 dimensions (requirement coverage, task completeness, dependencies, key links, scope, verification derivation, context compliance, Nyquist). Split into: + +| Specialist | Dimensions | Prompt Focus | +|-----------|------------|--------------| +| **Completeness Specialist** | Dims 1, 2, 6 (requirement coverage, task completeness, verification derivation) | "Do these plans cover everything the phase needs to deliver?" | +| **Structure Specialist** | Dims 3, 4, 5 (dependency correctness, key links, scope sanity) | "Are these plans structurally sound and properly connected?" | +| **Compliance Specialist** | Dims 7, 8 (context compliance, Nyquist) | "Do these plans honor user decisions and testing requirements?" | + +**Synthesizer:** Merges issue lists, deduplicates, assigns final severity, produces single ISSUES FOUND / VERIFICATION PASSED result. + +**Confidence: HIGH** -- This maps cleanly to the existing dimension structure in gsd-plan-checker.md. Each specialist gets a focused subset that reduces prompt complexity and improves depth of analysis. 
+ +### Panel 2: Verifier Panel (replaces single gsd-verifier) + +Currently one agent performs 10-step verification (observable truths, artifacts, key links, requirements, anti-patterns, human verification). Split into: + +| Specialist | Steps | Prompt Focus | +|-----------|-------|--------------| +| **Truth Verifier** | Steps 2-3 (establish must-haves, verify observable truths) | "Are the phase goals actually achieved in the codebase?" | +| **Wiring Inspector** | Steps 4-5 (verify artifacts at 3 levels, verify key links) | "Are artifacts substantive and properly connected?" | +| **Quality Scanner** | Steps 6-7 (requirements coverage, anti-pattern scan) | "Are requirements satisfied and is the code clean?" | + +**Synthesizer:** Merges verification results, produces single VERIFICATION.md with unified truth table, artifact table, and gap list. + +**Confidence: HIGH** -- This follows the existing step structure. Each specialist is self-contained and reads different parts of the codebase. + +### Panel 3: Phase Researcher Panel (replaces single gsd-phase-researcher) + +Currently one agent researches all domains (stack, patterns, pitfalls, code examples). Split into: + +| Specialist | Sections | Prompt Focus | +|-----------|----------|--------------| +| **Stack Researcher** | Standard Stack, Don't Hand-Roll, State of the Art | "What libraries and tools does this phase need?" | +| **Pattern Researcher** | Architecture Patterns, Code Examples | "How do experts structure this type of implementation?" | +| **Risk Researcher** | Common Pitfalls, Open Questions | "What commonly goes wrong in this domain?" | + +**Synthesizer:** Merges research sections, resolves contradictions, produces single RESEARCH.md. + +**Confidence: MEDIUM** -- GSD already has a parallel research pattern (new-project spawns 4 researchers). This applies the same pattern at phase level. 
Note: the project-level research already uses 4 parallel agents -- this phase-level panel uses 3 focused specialists instead. + +### Panel 4: Project Research Panel (already exists -- 4 parallel agents) + +The `/gsd:new-project` workflow already spawns 4 parallel researchers (STACK, FEATURES, ARCHITECTURE, PITFALLS) plus a synthesizer. This is already a panel pattern. **No change needed** except possibly adding a synthesizer improvement. + +**Confidence: HIGH** -- Already implemented and working. + +## Consensus/Aggregation Strategy + +### Use: LLM Synthesizer (Not Majority Vote) + +**Confidence: HIGH** -- Majority voting only works for discrete classification tasks (pass/fail, yes/no). Quality gates produce rich structured output (issue lists, verification reports, research findings). An LLM synthesizer is the correct aggregation strategy for complex, non-discrete outputs. + +| Aggregation Method | Works For | Does NOT Work For | +|-------------------|-----------|-------------------| +| Majority Vote | Binary pass/fail decisions | Structured issue lists, research findings | +| Weighted Vote | Ranked recommendations | Complex verification reports | +| **LLM Synthesizer** | **All panel outputs in this system** | Nothing (universal, but costs one extra agent) | + +### Synthesizer Pattern + +``` +Specialists (parallel) → Write to distinct output keys → Synthesizer reads all → Produces unified result +``` + +The synthesizer agent: + +1. Reads all 3 specialist outputs +2. **Deduplicates** findings (same issue reported by 2+ specialists) +3. **Resolves conflicts** (specialist A says pass, specialist B says fail on same item) +4. **Elevates** items flagged by 2+ specialists (consensus = higher confidence) +5. 
**Produces** the final structured output in the format the orchestrator expects + +**Conflict resolution rule:** When specialists disagree on severity or status: +- 2 agree, 1 disagrees = go with majority +- All 3 disagree = synthesizer uses most conservative (highest severity) finding +- Specialist provides reasoning? Synthesizer evaluates reasoning quality, not just position + +**Confidence: HIGH** -- This mirrors the gsd-research-synthesizer pattern already in the codebase, which reads 4 researcher outputs and produces unified SUMMARY.md. The same pattern extends to all panels. + +## Implementation Stack + +### No New Dependencies Required + +Everything needed exists in the current stack: + +| Component | Implementation | Existing Precedent | +|-----------|---------------|-------------------| +| Parallel dispatch | Task tool (3 parallel calls) | new-project.md spawns 4 researchers | +| Specialist definitions | `agents/*.md` files | All 11 existing agents | +| Result collection | Task tool return values | new-project.md collects researcher outputs | +| Synthesizer | Task tool (1 call after parallel batch) | gsd-research-synthesizer.md | +| Panel config | `.planning/config.json` additions | Existing workflow toggles | +| CLI tooling | `bin/lib/*.cjs` modules | Existing verify, frontmatter modules | + +### New Agent Files Needed + +``` +agents/ + # Existing (unchanged) + gsd-plan-checker.md → becomes "solo mode" fallback + gsd-verifier.md → becomes "solo mode" fallback + gsd-phase-researcher.md → becomes "solo mode" fallback + + # New: Panel specialist agents + panels/ + plan-checker/ + completeness-specialist.md + structure-specialist.md + compliance-specialist.md + synthesizer.md + verifier/ + truth-verifier.md + wiring-inspector.md + quality-scanner.md + synthesizer.md + phase-researcher/ + stack-researcher.md + pattern-researcher.md + risk-researcher.md + synthesizer.md +``` + +**Total new agent files:** 12 (9 specialists + 3 synthesizers) + +**Confidence: 
HIGH** -- Follows existing agent definition pattern exactly. No architectural change, just more .md files. + +### Config Schema Extension + +```json +{ + "workflow": { + "research": true, + "plan_check": true, + "verifier": true, + "panels": { + "enabled": true, + "plan_checker": { + "enabled": true, + "specialists": 3 + }, + "verifier": { + "enabled": true, + "specialists": 3 + }, + "phase_researcher": { + "enabled": true, + "specialists": 3 + } + } + } +} +``` + +When `panels.enabled` is false, fall back to single-agent mode (existing behavior). This provides a clean upgrade path and allows users to opt out of higher token costs. + +**Confidence: HIGH** -- Follows existing config pattern. The `workflow` object already has boolean toggles. + +### Orchestrator Changes + +Each workflow that spawns a quality-gate agent needs a panel-aware dispatch function: + +**Before (plan-phase.md, step 10):** +``` +Task(prompt=checker_prompt, subagent_type="gsd-plan-checker", ...) +``` + +**After:** +``` +if (panels.enabled && panels.plan_checker.enabled) { + // Spawn 3 specialists in parallel + Task(prompt=completeness_prompt, subagent_type="general-purpose", ...) + Task(prompt=structure_prompt, subagent_type="general-purpose", ...) + Task(prompt=compliance_prompt, subagent_type="general-purpose", ...) + + // After all 3 complete, spawn synthesizer + Task(prompt=synthesizer_prompt, subagent_type="general-purpose", ...) +} else { + // Fallback to single agent + Task(prompt=checker_prompt, subagent_type="gsd-plan-checker", ...) +} +``` + +**Confidence: HIGH** -- The branching pattern (config check -> conditional spawn) already exists in plan-phase.md for research and plan-check toggles. 
+ +## Token Cost Analysis + +| Gate | Solo Mode | Panel Mode (3 + synthesizer) | Increase | +|------|-----------|------------------------------|----------| +| Plan Checker | ~20k overhead + work | ~80k overhead + 3x work + synthesis | 4x | +| Verifier | ~20k overhead + work | ~80k overhead + 3x work + synthesis | 4x | +| Phase Researcher | ~20k overhead + work | ~80k overhead + 3x work + synthesis | 4x | + +**Mitigation strategies:** +1. Panels are opt-in via config (default: off for budget profile, on for quality profile) +2. Each specialist gets a *narrower* prompt than the solo agent, so work per specialist is ~40% of solo +3. Net increase is roughly **2.5x per gate** (not 4x) because specialists do less work each +4. Model profile applies: use Sonnet for specialists, Opus only for synthesizer if quality profile + +**Confidence: MEDIUM** -- Token estimates are approximations. Real costs depend on phase complexity, codebase size, and model choice. + +## Alternatives Considered + +| Alternative | Why Not | +|------------|---------| +| **Debate pattern** (agents argue in rounds) | Too expensive (multiple rounds), complex to implement, diminishing returns after round 1 for structured analysis | +| **Hierarchical supervisor** (one agent delegates to others) | Adds latency (supervisor must reason before dispatching), unnecessary when specialization is static | +| **Group chat** (agents in shared thread) | Accumulating context bloats rapidly, cross-talk introduces confusion for structured verification | +| **5+ specialists** | Error compounding, coordination tax, 100k+ token overhead | +| **2 specialists** | No tiebreaking, limited diversity | +| **External framework (CrewAI, LangGraph)** | Violates no-new-dependencies constraint, adds complexity, GSD already has Task tool | +| **Majority vote only** | Only works for binary decisions; quality gates produce structured reports | + +## Key Technical Decisions + +| Decision | Choice | Rationale | 
+|----------|--------|-----------| +| Panel size | 3 specialists + 1 synthesizer | Majority possible, affordable, below coordination tax threshold | +| Aggregation | LLM synthesizer agent | Rich structured outputs need synthesis, not counting | +| Specialist diversity | Role-based (domain, structure, risk) | Prevents redundant analysis | +| Config location | `.planning/config.json` | Follows existing pattern | +| Agent definitions | `agents/panels/*.md` | Follows existing agent pattern, scoped in subdirectory | +| Fallback | Single-agent mode when panels disabled | Backward compatible | +| File writes | Specialists return via Task output, synthesizer writes final file | Prevents file conflicts from parallel agents | + +## Sources + +### Primary (HIGH confidence) +- [Microsoft Azure Architecture Center -- AI Agent Orchestration Patterns](https://learn.microsoft.com/en-us/azure/architecture/ai-ml/guide/ai-agent-design-patterns) -- Concurrent orchestration pattern, fan-out/fan-in, aggregation strategies (Updated 2026-02-12) +- [AWS Prescriptive Guidance -- Parallelization and scatter-gather patterns](https://docs.aws.amazon.com/prescriptive-guidance/latest/agentic-ai-patterns/parallelization-and-scatter-gather-patterns.html) -- Scatter-gather implementation details +- GSD codebase (`new-project.md`, `gsd-research-synthesizer.md`) -- Existing 4-agent parallel + synthesizer pattern + +### Secondary (MEDIUM confidence) +- [LangGraph Parallel Agent Pattern](https://dev.to/rosen_hristov/why-i-split-one-langgraph-agent-into-four-running-in-parallel-2c65) -- Send API, state reducers, result merging pattern +- [Parallelization -- Agentic Design Pattern Series](https://datalearningscience.com/p/3-parallelization-agentic-design) -- Dispatch, concurrent execution, aggregation steps +- [Claude Code Task Tool Patterns](https://amitkoth.com/claude-code-task-tool-vs-subagents/) -- Parallel dispatch via Task tool, 10-agent cap, 20k token overhead per agent +- [Claude Code Sub-Agents: 
Parallel vs Sequential Patterns](https://claudefa.st/blog/guide/agents/sub-agent-best-practices) -- Task spawning best practices + +### Tertiary (LOW confidence -- flag for validation) +- [17x Error Trap / Bag of Agents](https://towardsdatascience.com/why-your-multi-agent-system-is-failing-escaping-the-17x-error-trap-of-the-bag-of-agents/) -- Error compounding in multi-agent systems, coordination tax concept +- [Voting or Consensus? Decision-Making in Multi-Agent Debate (ACL 2025)](https://aclanthology.org/2025.findings-acl.606.pdf) -- 7 decision protocols comparison +- [CrewAI Parallel Patterns](https://github.com/apappascs/crewai-parallel-patterns) -- CrewAI-specific patterns, async_execution model +- [Multi-Agent AI Systems Explained: LangGraph vs CrewAI vs AutoGen (2026)](https://www.mayhemcode.com/2026/02/multi-agent-ai-systems-explained.html) -- Framework comparison From 2b613ab633911ac96cc9427d6725341345c396d3 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 12:00:10 +1000 Subject: [PATCH 06/16] docs: start milestone v1.3 PR Review Fixes --- .planning/PROJECT.md | 24 ++++++++++++------------ .planning/STATE.md | 10 +++++++--- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md index 236d81f049..646e292f8b 100644 --- a/.planning/PROJECT.md +++ b/.planning/PROJECT.md @@ -28,16 +28,16 @@ Reliable AI agent orchestration with quality gates that catch bad plans before e - ✓ Coverage thresholds enforced in CI (fail if any module drops below 70%) — v1.1 - ✓ VERIFICATION.md audit trail for each coverage phase — v1.1 -### Active (v2.0 — MoE Panels) +### Active (v1.3 — PR Review Fixes) -- MoE panel infrastructure: 3 config keys (`plan_check_panel`, `verifier_panel`, `research_panel`), all default `false` -- Plan Checker Panel: 3 parallel specialists (Structural, Semantic, Compliance) with consensus logic -- Verifier Panel: 3 domain specialists (Artifacts, Requirements, Human) with 
domain-partitioned assembly -- Research Panel: 3 domain researchers (Stack, Architecture, Pitfalls) with inline synthesis -- Workflow routing: conditional panel dispatch based on config keys -- Output contract preservation: panel output identical to single-agent output (same headers, frontmatter, patterns) +- Split PR #762 into focused PRs: tests+CI, resolve-model fix, autopilot feature +- Remove committed `.planning/` artifacts from PR branch +- Fix auto-advance config mutation — use runtime flag instead of config.json persistence +- Coordinate resolve-model fix with PR #761 to avoid merge conflicts +- Add runtime validation for `discuss_agents` in auto-discuss workflow +- Document `model_overrides` config or remove if premature -*Full requirements in `.planning/REQUIREMENTS.md` (pending creation)* +*Full requirements in `.planning/REQUIREMENTS.md`* ### Out of Scope @@ -48,12 +48,12 @@ Reliable AI agent orchestration with quality gates that catch bad plans before e - Windows-specific path separator tests (WIN-01) — future candidate - Windows CRLF line ending handling tests (CRLF-01) — future candidate -## Current Milestone: v2.0 — MoE Panels +## Current Milestone: v1.3 — PR Review Fixes -**Phase:** Defining requirements (research pending) +**Phase:** Defining requirements **Starting phase:** 14 (continuing from v1.1's Phase 13) -Adds Mixture of Experts panels for the three highest-variance quality gates: plan checking, verification, and phase research. Each panel replaces a single-agent step with 3 parallel specialists, improving coverage without changing user-facing commands or output formats. +Addresses reviewer feedback on PR #762 (autopilot mode). Splits the monolithic PR into focused PRs, removes committed dev artifacts, fixes identified code issues (auto-advance mutation, validation gaps), and coordinates overlapping fixes with PR #761. 
## Current State @@ -97,4 +97,4 @@ Known bugs documented and tested (tests assert current behavior, production code | VERIFICATION.md per coverage phase | Audit trail for orphaned requirements found in milestone audit | ✓ Good — all requirements now have 3-source verification | --- -*Last updated: 2026-02-26 — v2.0 MoE Panels milestone started* +*Last updated: 2026-02-28 — v1.3 PR Review Fixes milestone started* diff --git a/.planning/STATE.md b/.planning/STATE.md index 60d7dca3ab..8d03f1febe 100644 --- a/.planning/STATE.md +++ b/.planning/STATE.md @@ -1,9 +1,11 @@ # Project State -## Current Milestone: v2.0 — MoE Panels +## Current Milestone: v1.3 — PR Review Fixes -**Phase:** Pre-phase (defining requirements) -**Status:** Research pending, then requirements definition +**Phase:** Not started (defining requirements) +**Plan:** — +**Status:** Defining requirements +**Last activity:** 2026-02-28 — Milestone v1.3 started ## Milestone History @@ -26,3 +28,5 @@ - Agent definitions: `agents/` directory (markdown files) - Workflow definitions: `get-shit-done/workflows/` directory - Codebase map: `.planning/codebase/` (7 documents, analyzed 2026-02-25) +- PR #762: open, changes requested by @glittercowboy +- PR #761: overlapping resolve-model fix (closed) From 9f31fa1dbfab1800847f21a4da2e0504cc4df7af Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 12:07:24 +1000 Subject: [PATCH 07/16] docs: complete project research --- .planning/research/ARCHITECTURE.md | 977 ++++++++++++++--------------- .planning/research/FEATURES.md | 448 ++++++------- .planning/research/PITFALLS.md | 414 +++++------- .planning/research/STACK.md | 424 ++++++------- 4 files changed, 1014 insertions(+), 1249 deletions(-) diff --git a/.planning/research/ARCHITECTURE.md b/.planning/research/ARCHITECTURE.md index a36bdb4766..dc68a802a9 100644 --- a/.planning/research/ARCHITECTURE.md +++ b/.planning/research/ARCHITECTURE.md @@ -1,638 +1,609 @@ -# Architecture Patterns: MoE Panel 
Integration - -**Domain:** Agent orchestration panel dispatch within CLI workflow engine -**Researched:** 2026-02-26 -**Confidence:** HIGH (based on direct codebase analysis of existing patterns) - -## Recommended Architecture - -MoE Panels are **orchestrator agents that replace single agents at dispatch points**. Three panels replace three existing single agents: `gsd-plan-checker` (plan verification), `gsd-verifier` (phase verification), and `gsd-phase-researcher` (research). Each panel spawns 3 specialist subagents in parallel, synthesizes their outputs, and returns the **exact same structured output contract** as the single agent it replaces. - -The key architectural insight: panels are a **transparent substitution**. The workflow files (`plan-phase.md`, `execute-phase.md`) dispatch to either a single agent or a panel agent based on a config flag. The orchestrator workflow never knows or cares whether a single agent or a panel produced the output -- the return contract is identical. 
- -### Component Boundaries - -| Component | Responsibility | Communicates With | -|-----------|---------------|-------------------| -| `config.json` workflow section | Stores `plan_check_panel`, `verifier_panel`, `researcher_panel` booleans | Read by `init.cjs` commands | -| `init.cjs` init commands | Resolves panel flags + panel model into INIT JSON for workflows | Consumed by workflow orchestrators | -| `core.cjs` MODEL_PROFILES | Maps panel agent names to model tiers | Called by `resolveModelInternal` | -| Workflow files (plan-phase, execute-phase) | Conditional dispatch: panel flag true -> spawn panel agent, false -> spawn single agent | Spawn panel or single agents via Task() | -| Panel agent `.md` files | Orchestrate 3 parallel specialists, synthesize, return structured output | Spawn specialist agents via Task(), return to workflow | -| Specialist prompts | Inline within panel agent (NOT separate agent files) | Spawned by panel orchestrator, return findings | - -### Data Flow - -``` -config.json - | - v -init.cjs (resolves flags + models into INIT JSON) - | - v -workflow.md (plan-phase / execute-phase) - | - +--> [if panel=false] --> single agent (gsd-plan-checker / gsd-verifier / gsd-phase-researcher) - | | - | v - | structured return (## VERIFICATION PASSED / ## ISSUES FOUND / etc.) 
- | - +--> [if panel=true] --> panel agent (gsd-plan-checker-panel / gsd-verifier-panel / gsd-researcher-panel) - | - +--> Task(specialist-1, model=panel_model) --| - +--> Task(specialist-2, model=panel_model) --+--> parallel - +--> Task(specialist-3, model=panel_model) --| - | - v - synthesize (majority consensus / union of findings) - | - v - structured return (SAME contract as single agent) -``` - -## Integration Point 1: Config Routing - -### Current Config Structure - -The existing config uses `workflow.*` keys for boolean agent toggles: - -```json -{ - "workflow": { - "research": true, - "plan_check": true, - "verifier": true, - "auto_advance": false, - "nyquist_validation": false - } -} -``` +# Architecture Research: PR #762 Fix Integration -### Recommended Config Extension +**Domain:** CLI workflow engine — autopilot mode runtime state, config validation, model resolution +**Researched:** 2026-02-28 +**Confidence:** HIGH (direct codebase analysis of all affected files) -Add `*_panel` keys alongside existing toggles. A panel flag is only meaningful when its parent toggle is `true`. +## Context -```json -{ - "workflow": { - "research": true, - "research_panel": false, - "plan_check": true, - "plan_check_panel": false, - "verifier": true, - "verifier_panel": false, - "auto_advance": false, - "nyquist_validation": false - } -} -``` +This research addresses three specific fixes from PR #762 reviewer feedback: -**Why parallel booleans instead of a mode enum:** The existing pattern is per-feature booleans (`research: true/false`, `plan_check: true/false`). Adding `research_panel: true/false` follows the same convention. A user can disable research entirely (`research: false`) or enable it with a panel (`research: true, research_panel: true`). These are independent toggles -- `research_panel: true` with `research: false` is a no-op (the panel flag is only checked when the parent feature is enabled). +1. 
**Auto-advance config mutation** — `autopilot.md` currently persists `workflow.auto_advance true` to `config.json`. Reviewer wants this to be a runtime flag, not persistent config. +2. **discuss_agents runtime validation** — `auto-discuss.md` reads `autopilot.discuss_agents` without validating the value before spawning N agents. If the value is invalid, the workflow silently uses a bad agent count. +3. **model_overrides undocumented** — `resolveModelInternal` in `core.cjs` checks `config.model_overrides` but `cmdResolveModel` in `commands.cjs` does not. The feature is partially implemented and undocumented. -**Why NOT a global `panels: true` toggle:** Different panels have different cost/value tradeoffs. A user may want panel verification (catches more bugs) but not panel research (overkill for simple phases). Per-feature panel toggles give that control. +--- -### Config Resolution in `init.cjs` +## Standard Architecture + +### System Overview + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ User Commands (/gsd:autopilot, /gsd:plan-phase, etc.) 
│ +├─────────────────────────────────────────────────────────────────┤ +│ Workflow Files (get-shit-done/workflows/*.md) │ +│ Orchestrators: autopilot.md, plan-phase.md, auto-discuss.md │ +│ Read config via: config-get, config-set, init commands │ +├─────────────────────────────────────────────────────────────────┤ +│ Node CLI Toolkit (get-shit-done/bin/lib/*.cjs) │ +│ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌──────────┐ │ +│ │ core.cjs │ │ config.cjs │ │ init.cjs │ │commands │ │ +│ │ loadConfig │ │config-set │ │ init cmds │ │ .cjs │ │ +│ │ resolveModel│ │config-get │ │ (INIT JSON)│ │resolve- │ │ +│ │ Internal │ │ │ │ │ │ model │ │ +│ └────────────┘ └────────────┘ └────────────┘ └──────────┘ │ +├─────────────────────────────────────────────────────────────────┤ +│ State Layer (.planning/) │ +│ config.json — persistent settings │ +│ ROADMAP.md, STATE.md — planning documents │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Component Responsibilities + +| Component | Responsibility | Key Functions | +|-----------|----------------|---------------| +| `core.cjs` | Config loading, model resolution, shared utilities | `loadConfig()`, `resolveModelInternal()` | +| `config.cjs` | Config CRUD (read/write/validate) | `cmdConfigSet()`, `cmdConfigGet()`, `cmdConfigEnsureSection()` | +| `commands.cjs` | Standalone utility commands | `cmdResolveModel()` (the broken one), `cmdCommit()`, etc. 
| +| `init.cjs` | Pre-computed INIT JSON for each workflow type | `cmdInitPlanPhase()`, `cmdInitExecutePhase()`, `cmdInitProgress()` | +| `auto-discuss.md` | Synthetic phase context via N-agent debate | Reads `autopilot.discuss_agents`, spawns agent panel | +| `autopilot.md` | Full pipeline orchestration across phases | Sets/clears `workflow.auto_advance` in config | +| `plan-phase.md` / `execute-phase.md` | Per-phase orchestration | Read `workflow.auto_advance` via `config-get` | -The `cmdInitPlanPhase` function already resolves `research_enabled`, `plan_checker_enabled`. Add panel resolution: +--- -```javascript -// In cmdInitPlanPhase result object: -{ - // Existing - research_enabled: config.research, - plan_checker_enabled: config.plan_checker, - researcher_model: resolveModelInternal(cwd, 'gsd-phase-researcher'), - checker_model: resolveModelInternal(cwd, 'gsd-plan-checker'), - - // New panel flags - research_panel: config.research_panel || false, - plan_check_panel: config.plan_check_panel || false, - - // New panel models (only resolved when panel enabled) - researcher_panel_model: config.research_panel - ? resolveModelInternal(cwd, 'gsd-researcher-panel') - : null, - checker_panel_model: config.plan_check_panel - ? resolveModelInternal(cwd, 'gsd-plan-checker-panel') - : null, -} -``` +## Fix 1: Auto-Advance Runtime Flag -Similarly for `cmdInitExecutePhase`: +### Current Behavior (the problem) -```javascript -{ - verifier_model: resolveModelInternal(cwd, 'gsd-verifier'), - verifier_panel: config.verifier_panel || false, - verifier_panel_model: config.verifier_panel - ? resolveModelInternal(cwd, 'gsd-verifier-panel') - : null, -} +`autopilot.md` step `ensure_auto_advance` runs: + +```bash +node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-set workflow.auto_advance true ``` -### Config Loading in `core.cjs` loadConfig +This persists the flag to `.planning/config.json`. Two consequences: -Add three new fields to the config loader: +1. 
If autopilot crashes or is interrupted, `workflow.auto_advance` stays `true` in config.json permanently. The next manual `plan-phase` or `execute-phase` invocation auto-advances without the user asking for it. +2. The `config-set` writes config.json, which gets committed to git (if `commit_docs: true`). The flag then appears in git history as a persistent config change, not an ephemeral session flag. -```javascript -// In loadConfig return: -{ - // ...existing fields - research_panel: get('research_panel', { section: 'workflow', field: 'research_panel' }) ?? false, - plan_check_panel: get('plan_check_panel', { section: 'workflow', field: 'plan_check_panel' }) ?? false, - verifier_panel: get('verifier_panel', { section: 'workflow', field: 'verifier_panel' }) ?? false, -} +`plan-phase.md` and `execute-phase.md` check auto-advance at runtime via: + +```bash +AUTO_CFG=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get workflow.auto_advance 2>/dev/null || echo "false") ``` -### Settings UI Addition +`discuss-phase.md` also checks and conditionally sets it: + +```bash +# If --auto flag present AND AUTO_CFG is not true: persist to config +node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-set workflow.auto_advance true +``` + +### Where Auto-Advance is Currently Read + +These four workflows check `workflow.auto_advance`: -Add three new questions to `settings.md` (one per panel toggle), positioned after their parent toggle: +- `plan-phase.md` — Step 14: spawns execute-phase if true +- `execute-phase.md` — checkpoint handler: skips pause if true; Step (transition): chains verify if true +- `discuss-phase.md` — final step: chains plan-phase if true +- `transition.md` — milestone boundary: clears it +All four use the same `config-get workflow.auto_advance` pattern. + +### Option A: Environment Variable (RECOMMENDED) + +**Mechanism:** Pass `GSD_AUTO_ADVANCE=true` in the orchestrator context. Each subprocess inherits it. 
+ +**How autopilot.md would set it:** +```bash +# In the bash environment of the autopilot orchestrator session +export GSD_AUTO_ADVANCE=true ``` -{ - question: "Use MoE Panel for plan checking? (3 specialist agents instead of 1)", - header: "Plan Check Panel", - multiSelect: false, - options: [ - { label: "No (Recommended)", description: "Single plan-checker agent" }, - { label: "Yes", description: "3 specialists: coverage analyst, scope auditor, dependency checker" } - ] -} + +**How plan-phase.md / execute-phase.md would read it:** +```bash +AUTO_CFG="${GSD_AUTO_ADVANCE:-false}" ``` -## Integration Point 2: Model Profile Registration +**Pros:** +- Zero filesystem writes — no config.json mutation +- Automatic cleanup — env var dies when the terminal session ends +- No git pollution — nothing to commit +- No reset step needed at milestone boundary -### Current MODEL_PROFILES Table +**Cons:** +- Environment variable must be exported by the orchestrator, not just set locally +- If Claude Code subagents don't inherit the parent environment (they typically do via `execSync`), this breaks +- Requires changing the check pattern in plan-phase.md, execute-phase.md, and discuss-phase.md -```javascript -const MODEL_PROFILES = { - 'gsd-plan-checker': { quality: 'sonnet', balanced: 'sonnet', budget: 'haiku' }, - 'gsd-verifier': { quality: 'sonnet', balanced: 'sonnet', budget: 'haiku' }, - 'gsd-phase-researcher': { quality: 'opus', balanced: 'sonnet', budget: 'haiku' }, - // ... -}; +**Confidence:** MEDIUM — Claude Code's Task() subagents do inherit environment variables from the parent process, but this behavior should be verified before relying on it. + +### Option B: Temp File / Session File + +**Mechanism:** Write a session marker file, e.g., `.planning/.autopilot-session`. Workflows check for file existence instead of a config key. 
+ +**How autopilot.md would set it:** +```bash +touch .planning/.autopilot-session ``` -### Add Panel Orchestrator Entries +**How plan-phase.md / execute-phase.md would check it:** +```bash +AUTO_CFG=$([[ -f .planning/.autopilot-session ]] && echo "true" || echo "false") +``` -Panel orchestrators do synthesis (moderate reasoning), not execution. They should use the same tier as the single agent they replace: +Or via gsd-tools: +```bash +AUTO_CFG=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs verify-path-exists .planning/.autopilot-session --raw) +``` -```javascript -const MODEL_PROFILES = { - // ...existing entries - 'gsd-plan-checker-panel': { quality: 'sonnet', balanced: 'sonnet', budget: 'haiku' }, - 'gsd-verifier-panel': { quality: 'sonnet', balanced: 'sonnet', budget: 'haiku' }, - 'gsd-researcher-panel': { quality: 'opus', balanced: 'sonnet', budget: 'haiku' }, -}; +**How autopilot.md clears it at milestone boundary:** +```bash +rm -f .planning/.autopilot-session ``` -**Specialist subagent models:** Specialists spawned BY the panel use the SAME model as the panel orchestrator. The panel agent passes its own model to each Task() call. This avoids adding 9 more entries (3 specialists x 3 panels) to the profile table. The panel agent `.md` should document: "Pass your own model to specialist Task calls." 
+**Pros:** +- Survives context compaction (unlike env var) — if a subagent loses environment, the file persists +- Easy to check, create, delete — no JSON parsing +- No config.json mutation +- Gitignored naturally if `.planning/` is gitignored, or add `/.planning/.autopilot-session` to `.gitignore` -## Integration Point 3: Conditional Dispatch in Workflows +**Cons:** +- File must be cleaned up on crash/interrupt — if autopilot crashes, the marker file persists until manually deleted +- Requires a new `verify-path-exists` call or shell `[[ -f ]]` check in each workflow +- Slightly less obvious than a config flag -### Pattern: plan-phase.md Step 10 (Plan Checker) +**Confidence:** HIGH — straightforward file-based flag, no new dependencies, consistent with how GSD uses the filesystem for state. -Current (single agent): +### Option C: `--auto` Flag Passed Explicitly (No Persistence) + +**Mechanism:** autopilot.md passes `--auto` to every phase chain invocation. No config.json mutation. No env var. No file. + +**How autopilot.md would pass it:** ``` Task( - prompt=checker_prompt, - subagent_type="gsd-plan-checker", - model="{checker_model}", - description="Verify Phase {phase} plans" + prompt="... ARGUMENTS='${PHASE} --auto' ..." ) ``` -Recommended (conditional dispatch): -``` -# After parsing INIT JSON: -# PLAN_CHECK_PANEL=$(echo "$INIT" | jq -r '.plan_check_panel') -# CHECKER_PANEL_MODEL=$(echo "$INIT" | jq -r '.checker_panel_model // empty') +**Plan-phase.md already checks `--auto`** in its auto-advance step (Step 14). It then passes `--auto` to execute-phase via ARGUMENTS. The entire chain already supports `--auto` flag propagation. 
-if PLAN_CHECK_PANEL is true: - Task( - prompt="First, read ~/.claude/agents/gsd-plan-checker-panel.md for your role and instructions.\n\n" + checker_prompt, - subagent_type="general-purpose", - model="{checker_panel_model}", - description="Panel verify Phase {phase} plans" - ) -else: - Task( - prompt=checker_prompt, - subagent_type="gsd-plan-checker", - model="{checker_model}", - description="Verify Phase {phase} plans" - ) -``` +**Pros:** +- No state at all — flag lives only in the Task() prompt +- No cleanup required — the flag dies with the subagent +- No config mutation +- Consistent with existing `--auto` flag support throughout the chain -**Critical detail:** Panel agents use `subagent_type="general-purpose"` (not a named type) because Claude Code only recognizes a fixed set of subagent types. Named types like `gsd-plan-checker` cause Claude Code to load the matching `agents/*.md` file. Panel agents load their own instructions via `@file` reference instead. This matches the existing pattern used by `auto-discuss.md` and `new-project.md` researcher spawns. +**Cons:** +- autopilot.md already passes `ARGUMENTS='${PHASE} --auto'` in `run_phase_chain` — but intermediate workflows (discuss-phase) may not propagate it further +- Harder to trace "why is auto-advance happening" when debugging — no visible state -### Pattern: execute-phase.md verify_phase_goal Step +**Confidence:** HIGH — this is the cleanest approach architecturally, and the existing `--auto` flag infrastructure already exists throughout the chain. 
-Same conditional dispatch pattern: -``` -if VERIFIER_PANEL is true: - Task( - prompt="First, read ~/.claude/agents/gsd-verifier-panel.md for your role and instructions.\n\n" + verifier_prompt, - subagent_type="general-purpose", - model="{verifier_panel_model}", - description="Panel verify phase {phase_number} goal" - ) -else: - Task( - prompt=verifier_prompt, - subagent_type="gsd-verifier", - model="{verifier_model}", - description="Verify phase {phase_number} goal" - ) -``` +### Recommendation: Option C (Pass --auto Flag) -### Pattern: plan-phase.md Step 5 (Researcher) +The autopilot.md already passes `--auto` via `ARGUMENTS='${PHASE} --auto'` in the `run_phase_chain` step. The fix is: -``` -if RESEARCH_PANEL is true: - Task( - prompt="First, read ~/.claude/agents/gsd-researcher-panel.md for your role and instructions.\n\n" + research_prompt, - subagent_type="general-purpose", - model="{researcher_panel_model}", - description="Panel research Phase {phase}" - ) -else: - Task( - prompt="First, read ~/.claude/agents/gsd-phase-researcher.md for your role and instructions.\n\n" + research_prompt, - subagent_type="general-purpose", - model="{researcher_model}", - description="Research Phase {phase}" - ) -``` +1. Remove the `config-set workflow.auto_advance true` from `autopilot.md` step `ensure_auto_advance` +2. Remove the `config-set workflow.auto_advance false` from `autopilot.md` step `milestone_complete` +3. Remove the `config-set workflow.auto_advance true` persistence from `discuss-phase.md` (direct `--auto` flag usage doesn't need to write config) +4. Keep the `--auto` flag check in plan-phase.md, execute-phase.md, discuss-phase.md as-is +5. 
Keep `workflow.auto_advance` in config.json as a **user-settable persistent preference** (not autopilot-managed) — this allows users who always want auto-advance to set it once + +**Where to look for edge cases:** +- `new-project.md` also sets `workflow.auto_advance true` — check if this is intentional (user chose autopilot in project wizard) or should also be removed +- `transition.md` resets `workflow.auto_advance` to `false` — this can stay (it clears the user's persistent preference at milestone boundary, which is appropriate) or be removed if auto-advance becomes driven entirely by the `--auto` argument + +**Files affected (Option C):** + +| File | Change | Type | +|------|--------|-------| +| `get-shit-done/workflows/autopilot.md` | Remove two `config-set workflow.auto_advance` calls | Modify (workflow) | +| `get-shit-done/workflows/discuss-phase.md` | Remove conditional `config-set workflow.auto_advance true` persistence | Modify (workflow) | +| `get-shit-done/workflows/new-project.md` | Evaluate and possibly remove `config-set workflow.auto_advance true` | Modify (workflow) | + +No JavaScript module changes needed for Option C. + +--- -## Integration Point 4: Panel Agent File Structure +## Fix 2: discuss_agents Runtime Validation -### Recommended: 3 New Agent Files +### Current Behavior (the problem) +`auto-discuss.md` reads `autopilot.discuss_agents` with a fallback: + +```bash +AGENT_COUNT=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get autopilot.discuss_agents 2>/dev/null || echo "5") ``` -agents/ - gsd-plan-checker.md # existing single agent - gsd-plan-checker-panel.md # NEW: panel orchestrator - gsd-verifier.md # existing single agent - gsd-verifier-panel.md # NEW: panel orchestrator - gsd-phase-researcher.md # existing single agent - gsd-researcher-panel.md # NEW: panel orchestrator + +The `|| echo "5"` fallback handles the case where the key is missing.
But it does NOT validate: +- Whether the value is a valid odd number (3, 5, 7, 9) +- Whether the value is within the allowed range +- Whether `config.autopilot` section exists but has an invalid type for `discuss_agents` + +`cmdConfigSet` in `config.cjs` validates at write time (odd, 3-9). But if someone manually edits `config.json` or if a future code path sets an invalid value, `auto-discuss.md` gets a bad AGENT_COUNT and spawns the wrong number of agents. + +### Where Validation Should Live + +**Option A: In auto-discuss.md (workflow-side validation)** + +After reading AGENT_COUNT, add a shell check: + +```bash +# Validate AGENT_COUNT is one of: 3, 5, 7, 9 +case "$AGENT_COUNT" in + 3|5|7|9) ;; # valid + *) echo "Error: discuss_agents must be 3, 5, 7, or 9. Got: $AGENT_COUNT. Fix in .planning/config.json"; exit 1;; +esac ``` -### Why NOT Separate Specialist Agent Files +**Pros:** Catches bad values at execution time before spawning agents. No module changes. Fast fix. +**Cons:** Validation logic is in a markdown workflow file, not testable. -Specialists should be **inline prompts within the panel agent**, not separate `.md` files. Reasons: +**Option B: Add a dedicated `cmdConfigValidateAutopilot` to config.cjs** -1. **Context isolation:** Each specialist is spawned via Task() with a specific prompt. The panel agent constructs these prompts dynamically based on the input context (phase goal, plans, requirements). Separate files would be static and unable to adapt. +Create a new gsd-tools command `config validate autopilot` that reads and validates the autopilot config section, returning errors or the validated values. -2. **Coupling:** Specialists only make sense in the context of their panel. A "coverage analyst" is useless outside the plan-checker-panel. Separate files suggest independent reuse that doesn't exist. +**Pros:** Testable via Node.js tests. Centralized. Could be called by multiple workflows. 
+**Cons:** More code, new command in gsd-tools.cjs router, more test surface. -3. **Precedent:** `auto-discuss.md` already uses this pattern -- specialist prompts are constructed inline with role assignments, not loaded from separate files. +**Option C: Make `config-get` return a structured validation response** -4. **Maintenance:** 3 panel files vs 3 panels + 9 specialist files. The inline approach keeps each panel self-contained. +Modify `cmdConfigGet` to accept a `--validate` flag that checks the value against known constraints. -### Panel Agent Anatomy (Template) +**Pros:** Reusable validation pattern. +**Cons:** Changes existing API of `config-get` which is widely used. + +### Recommendation: Option A for this PR fix + +The reviewer feedback is about a specific validation gap, not a request to redesign config validation. Add an inline shell validation check in `auto-discuss.md` after the config-get. This is: +- A minimal fix that directly addresses the reported issue +- No module changes = no test changes = smaller PR scope +- Consistent with how other workflows handle bad config values (they error out with a message) + +**Additional fix needed:** Validate that `AGENT_COUNT` is actually a number, not a string. The `config-get --raw` returns the raw value, which could be `"null"` or `"undefined"` if the key is missing and the command fails. + +**Files affected:** + +| File | Change | Type | +|------|--------|-------| +| `get-shit-done/workflows/auto-discuss.md` | Add AGENT_COUNT validation after config-get | Modify (workflow) | + +No JavaScript module changes needed. -```markdown ---- -name: gsd-plan-checker-panel -description: MoE panel that spawns 3 specialist agents to verify plans from different angles. Returns same output contract as gsd-plan-checker. -tools: Read, Bash, Glob, Grep -color: green --- - -You are a GSD plan-checker panel. 
You orchestrate 3 specialist verification agents, -synthesize their findings, and return the same structured output as gsd-plan-checker. - -The workflow that spawned you expects EXACTLY the same return format as a single -gsd-plan-checker agent. Your job is to produce BETTER results through parallel -specialist analysis, but the output contract is non-negotiable. - - - -Your return MUST be one of: -- ## VERIFICATION PASSED (same format as gsd-plan-checker) -- ## ISSUES FOUND (same format as gsd-plan-checker) - -The orchestrator workflow parses these headers. Any other format breaks the pipeline. - - - -## Specialist 1: Coverage Analyst -Focus: Requirement coverage (Dimensions 1, 6, 7 from plan-checker) -Checks: Every requirement has tasks, must_haves trace to goal, context compliance - -## Specialist 2: Scope & Structure Auditor -Focus: Task quality and scope (Dimensions 2, 5) -Checks: Task completeness (files/action/verify/done), scope sanity, context budget - -## Specialist 3: Dependency & Wiring Inspector -Focus: Dependency correctness and key links (Dimensions 3, 4, 8) -Checks: Dependency graph, key links, Nyquist compliance - - - -1. Load context (same as gsd-plan-checker Step 1-2) -2. Construct specialist prompts with shared context -3. Spawn all 3 specialists in parallel via Task() -4. Collect results from all 3 -5. Synthesize: union of all issues (deduplicate by plan+task+dimension) -6. Determine overall status (any blocker -> ISSUES FOUND) -7. Return in gsd-plan-checker output format - - - -Task( - subagent_type="general-purpose", - model="{same model as panel}", - prompt=" - You are a {specialist_name} for GSD plan verification. - {specialist_focus} - {same files from orchestrator prompt} - Return a YAML issues list + coverage table for your dimensions. - ", - description="{specialist_name}" -) - +## Fix 3: model_overrides Documentation / Alignment - -1. Parse each specialist's issues list -2. Deduplicate by (plan, task, dimension) tuple -3. 
If any specialist found blockers -> overall = ISSUES FOUND -4. If no blockers but warnings -> overall = ISSUES FOUND (warnings should still be reviewed) -5. If no issues -> overall = VERIFICATION PASSED -6. Merge coverage tables from all specialists -7. Format into exact gsd-plan-checker return structure - -``` +### The Divergence -## Integration Point 5: Output Contract Preservation +Two functions in the codebase handle model resolution, and they behave differently: -This is the most critical architectural constraint. Panel agents MUST produce byte-compatible output with single agents. +**`resolveModelInternal` in `core.cjs` (lines 344-359):** +```javascript +function resolveModelInternal(cwd, agentType) { + const config = loadConfig(cwd); -### Plan Checker Contract + // Check per-agent override FIRST + const override = config.model_overrides?.[agentType]; + if (override) { + return override === 'opus' ? 'inherit' : override; + } -Single agent returns one of: -```markdown -## VERIFICATION PASSED -**Phase:** {phase-name} -**Plans verified:** {N} -**Status:** All checks passed -### Coverage Summary -| Requirement | Plans | Status | -### Plan Summary -| Plan | Tasks | Files | Wave | Status | + // Fall back to profile lookup + const profile = config.model_profile || 'balanced'; + const agentModels = MODEL_PROFILES[agentType]; + ... +} ``` -OR: +**`cmdResolveModel` in `commands.cjs` (lines 200-219):** +```javascript +function cmdResolveModel(cwd, agentType, raw) { + const config = loadConfig(cwd); + const profile = config.model_profile || 'balanced'; -```markdown -## ISSUES FOUND -**Phase:** {phase-name} -**Plans checked:** {N} -**Issues:** {X} blocker(s), {Y} warning(s), {Z} info -### Blockers (must fix) -### Warnings (should fix) -### Structured Issues -(YAML issues list) -### Recommendation + // NO model_overrides check — goes straight to profile + const agentModels = MODEL_PROFILES[agentType]; + ... +} ``` -The panel MUST return exactly these formats. 
The revision loop in plan-phase.md Steps 11-12 parses `## VERIFICATION PASSED` and `## ISSUES FOUND` headers to determine next action. +`resolveModelInternal` is called by `init.cjs` (in `cmdInitPlanPhase`, `cmdInitExecutePhase`, etc.) to populate INIT JSON with per-agent models. `cmdResolveModel` is the CLI-facing `resolve-model` command that workflows call directly. -### Verifier Contract +### Impact -Single agent returns: -```markdown -## Verification Complete -**Status:** {passed | gaps_found | human_needed} -**Score:** {N}/{M} must-haves verified -**Report:** .planning/phases/{phase_dir}/{phase_num}-VERIFICATION.md -``` +Workflows that call `resolve-model` CLI directly (bypassing init.cjs) will NOT honor `model_overrides`. Workflows that use INIT JSON (the majority) are intended to honor them because init.cjs uses `resolveModelInternal` — but, as the next section shows, that override check never fires in practice either. + +The `model_overrides` feature is documented in `get-shit-done/references/model-profiles.md` and exists in the config template. But `loadConfig` in `core.cjs` does NOT include `model_overrides` in its return object, and `resolveModelInternal` reads `config.model_overrides?.[agentType]` from that normalized object rather than from the raw parsed JSON. -The execute-phase.md `verify_phase_goal` step reads the VERIFICATION.md status field via grep. The panel must write the same file format. +### The Real Problem -### Researcher Contract +`loadConfig` returns a normalized object with known keys. `model_overrides` is NOT one of those keys: -Single agent returns: -```markdown -## RESEARCH COMPLETE -**Phase:** {phase_number} - {phase_name} -**Confidence:** [HIGH/MEDIUM/LOW] -### Key Findings -### File Created -### Confidence Assessment -### Ready for Planning +```javascript +// loadConfig return object (lines 95-107) — model_overrides missing! +return { + model_profile: ..., + commit_docs: ..., + // ...
+ brave_search: ..., + // NO model_overrides here +}; ``` -OR: +But `resolveModelInternal` calls `loadConfig` then accesses `config.model_overrides`. Since `loadConfig` drops unknown keys, `config.model_overrides` is always `undefined`. The override check silently no-ops for everyone. + +This is a bug: `model_overrides` is documented but never actually applied. + +### Fix Options -```markdown -## RESEARCH BLOCKED -**Phase:** {phase_number} - {phase_name} -**Blocked by:** [what] +**Option A: Add `model_overrides` to `loadConfig` return** + +```javascript +// In core.cjs loadConfig: +return { + // ...existing fields + model_overrides: get('model_overrides') ?? {}, +}; ``` -The plan-phase.md Step 5 handler checks for these headers. +This makes the feature work as documented. `resolveModelInternal` and `cmdResolveModel` both need to check `config.model_overrides` (cmdResolveModel still needs to be updated too). -### Contract Enforcement Strategy +**Option B: Remove `model_overrides` from documentation and `resolveModelInternal`** -Each panel agent `.md` file should include an `` section that: -1. Lists the exact headers the workflow expects -2. Shows the complete output format template -3. States: "Your synthesis MUST produce this exact format. Do not add extra sections or change headers." +If the feature is premature, remove the dead code path and the documentation. Simplifies the codebase. -## Integration Point 6: Panel-Specialist Relationship +**Option C: Document it as "experimental / not yet wired"** -### Hierarchy +Add a note to `model-profiles.md` that `model_overrides` is not yet active. Defer the fix. +### Recommendation: Option A (Fix the wiring) + +The feature is already documented in user-facing references (`model-profiles.md`), implemented in `resolveModelInternal`, and mentioned in the config schema. The only gap is that `loadConfig` drops it and `cmdResolveModel` ignores it. 
These are one-line fixes: + +**Change 1: `core.cjs` loadConfig return** +```javascript +model_overrides: parsed.model_overrides ?? {}, ``` -Workflow Orchestrator (plan-phase.md / execute-phase.md) - | - v -Panel Agent (gsd-plan-checker-panel.md) <-- has 200K context - | - +--> Specialist 1 (Task, general-purpose) <-- has 200K context - +--> Specialist 2 (Task, general-purpose) <-- has 200K context - +--> Specialist 3 (Task, general-purpose) <-- has 200K context - | - v -Synthesis (panel agent combines results) - | - v -Return to Workflow Orchestrator + +**Change 2: `commands.cjs` cmdResolveModel** +```javascript +function cmdResolveModel(cwd, agentType, raw) { + if (!agentType) { + error('agent-type required'); + } + // Delegate to resolveModelInternal to ensure model_overrides are honored + const model = resolveModelInternal(cwd, agentType); + const config = loadConfig(cwd); + const profile = config.model_profile || 'balanced'; + const unknownAgent = !MODEL_PROFILES[agentType]; + const result = { model, profile, ...(unknownAgent ? { unknown_agent: true } : {}) }; + output(result, raw, model); +} ``` -### Context Budget +This makes `cmdResolveModel` use `resolveModelInternal`, eliminating the divergence. -Each specialist gets a fresh 200K context window. The panel orchestrator also has 200K. This means: -- Panel orchestrator: reads files, constructs prompts, synthesizes (~30-40% context usage) -- Each specialist: reads files, performs focused analysis, returns findings (~50-60% context usage) +**Files affected:** -**Total token cost:** 4x a single agent (1 panel + 3 specialists). This is the primary tradeoff. 
+| File | Change | Type | +|------|--------|-------| +| `get-shit-done/bin/lib/core.cjs` | Add `model_overrides` to `loadConfig` return | Modify (module, needs tests) | +| `get-shit-done/bin/lib/commands.cjs` | Refactor `cmdResolveModel` to delegate to `resolveModelInternal` | Modify (module, needs tests) | +| `tests/commands.test.cjs` | Add tests for `model_overrides` honored by `resolve-model` CLI | Modify (test) | +| `tests/core.test.cjs` or `tests/commands.test.cjs` | Add tests for `loadConfig` returning `model_overrides` | Modify (test) | -### Specialist Prompt Construction +--- -The panel agent receives the same `` or `` block that the single agent would receive. It passes this context through to each specialist, adding the specialist's focus area: +## PR Split Architecture -``` -specialist_prompt = f""" -{specialist_role_description} +### Separation Logic - -You are responsible for verification dimensions: {dimension_list} -Ignore other dimensions -- other specialists handle them. - +The reviewer requested splitting PR #762. The three fixes have distinct dependencies: -{original_context_from_workflow} +**Fix 1 (auto-advance):** Workflow-only changes. No module code. No test changes. Pure markdown edits. - -Return your findings as: +**Fix 2 (discuss_agents):** Workflow-only change. No module code. No test changes. Pure markdown edits. -### Findings +**Fix 3 (model_overrides):** Module code changes + test changes. Touches `core.cjs` and `commands.cjs`, which already have test files. -#### Dimension {N}: {Name} -Status: PASS | FAIL -Issues: -```yaml -issues: - - plan: "XX-YY" - dimension: "{dimension_name}" - severity: "blocker|warning|info" - description: "..." - fix_hint: "..." 
-``` +Additionally, the original PR #762 includes: +- Tests and CI changes (from v1.1 work) +- `.planning/` artifacts that should be removed from the PR branch +- The resolve-model fix that overlaps with PR #761 (closed) -If no issues for a dimension, state: "Dimension {N}: PASS - no issues found" - -""" +### Recommended PR Split + +**PR A: Workflow Fixes (autopilot, auto-discuss)** + +Files: +``` +get-shit-done/workflows/autopilot.md # Remove config-set auto_advance calls +get-shit-done/workflows/discuss-phase.md # Remove conditional config-set persistence +get-shit-done/workflows/auto-discuss.md # Add AGENT_COUNT validation ``` -### Synthesis Pattern +No module changes. No test changes. No risk to CI. Can be reviewed in isolation. -The panel agent collects all specialist returns and merges: +**PR B: resolve-model / model_overrides Fix** -1. **Parse** each specialist's YAML issues list -2. **Union** all issues into a single list -3. **Deduplicate** by (plan, task, dimension) -- if two specialists flag the same issue, keep the higher severity -4. **Aggregate** coverage tables (each specialist reports on their dimensions) -5. **Determine** overall status: any blocker -> ISSUES FOUND, else VERIFICATION PASSED -6. **Format** into the exact single-agent output contract +Files: +``` +get-shit-done/bin/lib/core.cjs # Add model_overrides to loadConfig +get-shit-done/bin/lib/commands.cjs # Refactor cmdResolveModel +tests/commands.test.cjs # New tests for model_overrides +``` + +This is the module fix. Needs to be coordinated with any remaining resolve-model changes from PR #761 context. -This is analogous to `auto-discuss.md`'s `synthesize_consensus` step, but for verification findings instead of decisions. +**PR C: Tests + CI (from original PR #762)** -## Patterns to Follow +This was the bulk of the original PR: test files and CI configuration. Should be reviewed independently of the autopilot feature code. Remove the `.planning/` artifact files before submitting. 
-### Pattern 1: Transparent Substitution (from auto-discuss.md) +### Build Order -**What:** auto-discuss.md produces CONTEXT.md in the exact same format as discuss-phase.md. Downstream agents (researcher, planner) consume it identically. +``` +PR C (tests/CI) ─────────────────────────────────────> merge (no conflicts) +PR A (workflows) ─────────────────────────────────────> merge (no conflicts with C) +PR B (modules) ─── depends on: no conflicts with A/C ─> merge last +``` -**Apply to panels:** Panel agents produce output in the exact same format as single agents. Workflow orchestrators consume it identically. +PR A and PR C have no file overlap and can be submitted and merged in any order. PR B touches `core.cjs` and `commands.cjs` — verify no conflicts with PR #761 changes (PR #761 was closed, but its fix may have landed or been incorporated elsewhere). -**Why this works:** The contract is at the output level, not the agent level. What happens inside the agent (1 agent or 3 specialists) is an implementation detail. +--- -### Pattern 2: Parallel Task Spawning (from auto-discuss.md) +## Recommended Project Structure (Unchanged) -**What:** auto-discuss.md spawns N agents in parallel via multiple Task() calls in a single message. +The existing structure handles these fixes without new directories: ``` -For each agent (1 to AGENT_COUNT): - Task( - subagent_type="general-purpose", - model="${DISCUSS_MODEL}", - prompt="...", - description="${ROLE_NAME} review" - ) +get-shit-done/ +├── bin/lib/ +│ ├── core.cjs # model_overrides fix (loadConfig + resolveModelInternal) +│ └── commands.cjs # cmdResolveModel refactor +├── workflows/ +│ ├── autopilot.md # remove config-set calls +│ ├── discuss-phase.md # remove conditional config-set +│ └── auto-discuss.md # add AGENT_COUNT validation +└── tests/ + └── commands.test.cjs # new model_overrides tests ``` -**Apply to panels:** Panel agents spawn 3 specialists in parallel the same way.
All three use `subagent_type="general-purpose"` with inline role prompts. +No new files needed. All three fixes are modifications to existing files. -### Pattern 3: Config Flag Gating (from existing workflow toggles) - -**What:** `workflow.research: true/false` gates whether the researcher is spawned. The workflow checks `research_enabled` from INIT JSON. +--- -**Apply to panels:** `workflow.plan_check_panel: true/false` gates whether the panel version is spawned. The workflow checks `plan_check_panel` from INIT JSON. +## Data Flow + +### Auto-Advance (After Fix) + +``` +User: /gsd:autopilot 3-7 + | + v +autopilot.md (orchestrator) + | + ├── No config-set (flag not persisted) + | + +--> Task(plan-phase.md, ARGUMENTS='3 --auto') + | + v + plan-phase.md reads --auto from ARGUMENTS + AUTO = true (flag only, no config read) + | + v + Execute → Verify → Transition + (each step receives --auto via ARGUMENTS propagation) +``` + +### Model Resolution (After Fix) + +``` +Workflow or agent calls: resolve-model gsd-executor + | + v +cmdResolveModel → resolveModelInternal(cwd, 'gsd-executor') + | + v +loadConfig(cwd) → returns { model_profile, model_overrides, ... } + | + ├── check config.model_overrides['gsd-executor'] + │ | + │ ├── found: return override value (sonnet/haiku/inherit) + │ └── not found: fall through to profile lookup + | + v +MODEL_PROFILES['gsd-executor'][profile] + | + v +return model string +``` + +### discuss_agents Validation (After Fix) + +``` +auto-discuss.md initialize step: + | + v +AGENT_COUNT=$(config-get autopilot.discuss_agents 2>/dev/null || echo "5") + | + v +[Validate: AGENT_COUNT must be 3, 5, 7, or 9] + | + ├── invalid: error, stop workflow, tell user to fix config.json + └── valid: continue to spawn_debate step +``` -### Pattern 4: INIT JSON Pre-computation (from init.cjs) +--- -**What:** All config resolution happens in `init.cjs` before the workflow starts. The workflow reads a single JSON blob with all flags and models pre-resolved. 
+## Integration Points -**Apply to panels:** Panel flags and panel models are resolved in init.cjs and included in the INIT JSON. The workflow never reads config.json directly for panel decisions. +### Internal Boundaries -## Anti-Patterns to Avoid +| Boundary | Communication | Notes | +|----------|---------------|-------| +| `autopilot.md` → `plan-phase.md` | `--auto` flag in ARGUMENTS | After fix: no config.json writes | +| `plan-phase.md` → `execute-phase.md` | `--auto` flag propagated via ARGUMENTS | Already works | +| `auto-discuss.md` → config | `config-get autopilot.discuss_agents` | Needs validation guard after read | +| `cmdResolveModel` → `resolveModelInternal` | Direct call (after fix) | Eliminates divergence | +| `loadConfig` → callers | Returns normalized config object | `model_overrides` added to return | +| `init.cjs` INIT JSON → workflows | Pre-computed flags + models | Already uses `resolveModelInternal`, benefits from fix automatically | -### Anti-Pattern 1: Panel Agents Altering Output Format -**What:** Adding extra sections, changing header levels, or renaming sections in the panel output. -**Why bad:** Workflow orchestrators parse specific headers (`## VERIFICATION PASSED`, `## ISSUES FOUND`). Any change breaks the revision loop. -**Instead:** Copy the exact output template from the single agent into the panel agent's `` section. Make the panel's synthesis step format its output using this template. +### External Services -### Anti-Pattern 2: Separate Agent Files for Specialists -**What:** Creating `agents/gsd-coverage-analyst.md`, `agents/gsd-scope-auditor.md`, etc. -**Why bad:** Specialists are not independently useful. They fragment the panel logic across files. They can't be spawned via `subagent_type` (Claude Code doesn't know about them). They add maintenance burden without benefit. -**Instead:** Inline specialist prompts within the panel agent file, constructed dynamically. +None. 
All three fixes are internal — filesystem, config, and in-process function calls only. -### Anti-Pattern 3: Double-Reading Files -**What:** Panel agent reads all plan files, then each specialist also reads all plan files. -**Why bad:** Wastes context in the panel orchestrator. The panel only needs enough context to construct specialist prompts and synthesize results. -**Instead:** Panel agent reads file paths (not contents) from INIT JSON, passes paths to specialists via `` blocks. Specialists read files themselves with their fresh 200K context. Panel agent only reads files for synthesis if needed. +--- -### Anti-Pattern 4: Global Panel Toggle -**What:** `workflow.panels: true` enables all panels at once. -**Why bad:** Different panels have different cost/benefit profiles. Verification panels catch more bugs (high value). Research panels produce more thorough findings but may be overkill for simple phases (moderate value). Users should control each independently. -**Instead:** Per-feature panel toggles: `plan_check_panel`, `verifier_panel`, `researcher_panel`. +## Anti-Patterns to Avoid -### Anti-Pattern 5: Panel Orchestrator Doing Analysis -**What:** Panel agent performs its own verification analysis in addition to spawning specialists. -**Why bad:** Duplicates work, wastes context, creates conflicts between panel and specialist findings. -**Instead:** Panel agent is ONLY an orchestrator. It constructs prompts, spawns agents, collects results, synthesizes output. All analytical work is done by specialists. +### Anti-Pattern 1: Using config.json as Session State -## Scalability Considerations +**What:** Writing `workflow.auto_advance true` to config.json during autopilot execution. +**Why bad:** Config.json is user-visible persistent settings. Session flags in config.json persist across invocations, survive crashes, and get committed to git history. +**Instead:** Use the `--auto` flag mechanism that already exists throughout the chain. 
-| Concern | 1 panel active | 2 panels active | All 3 panels active | -|---------|---------------|-----------------|---------------------| -| Token cost | 4x single agent | 8x single agent | 12x single agent | -| Wall-clock time | ~same (parallel) | ~same (sequential between panels) | ~same | -| Quality improvement | Focused analysis per dimension | Comprehensive coverage | Maximum thoroughness | -| Context pressure on parent workflow | Minimal (same return size) | Minimal | Minimal | +### Anti-Pattern 2: Duplicating Resolution Logic -**Token cost is the primary constraint.** Each panel spawns 3 specialists, each with 200K context. For budget-conscious users, panels should default to `false`. For quality-focused users (quality model profile), panels provide significant value. +**What:** Having `cmdResolveModel` reimplement resolution logic that `resolveModelInternal` already handles. +**Why bad:** Two code paths can diverge. `model_overrides` is an example of this happening — `resolveModelInternal` checks it, `cmdResolveModel` doesn't. +**Instead:** `cmdResolveModel` should delegate to `resolveModelInternal` rather than reimplementing the logic. -**Recommended defaults:** -- `research_panel: false` -- research is already thorough with a single agent -- `plan_check_panel: false` -- single checker catches most issues -- `verifier_panel: false` -- single verifier is sufficient for most phases +### Anti-Pattern 3: Write-Time-Only Validation -Panels are an opt-in quality boost, not a default. +**What:** Validating `discuss_agents` only at write time (`cmdConfigSet`) but not at read time in the workflow. +**Why bad:** Users can edit config.json directly, so an invalid value can reach the workflow without ever passing through `cmdConfigSet`. The workflow then silently uses the bad value. +**Instead:** Validate at the point of use. The workflow that reads `discuss_agents` should check the value is valid before acting on it.
-## Implementation Sequence +### Anti-Pattern 4: Silent Fallback Masking Config Errors -The recommended build order for this feature: +**What:** `AGENT_COUNT=$(config-get ... 2>/dev/null || echo "5")` — the `2>/dev/null` and `|| echo "5"` hide errors. +**Why bad:** If `config-get` fails for a legitimate reason (corrupt config, wrong key type), the workflow silently proceeds with the fallback value. The user has no idea their config is broken. +**Instead:** Keep the fallback for the "key not set" case, but add explicit validation of the returned value. -1. **Config layer first** -- add panel flags to `config.cjs`, `core.cjs`, `init.cjs` -2. **Model profiles** -- add panel entries to MODEL_PROFILES table -3. **Settings UI** -- add panel toggle questions to `settings.md` -4. **Plan-checker panel** -- build first panel (plan-checker is the most well-defined contract) -5. **Conditional dispatch in plan-phase.md** -- wire the config flag to dispatch -6. **Verifier panel** -- second panel (similar structure) -7. **Conditional dispatch in execute-phase.md** -- wire verifier dispatch -8. **Researcher panel** -- third panel (different synthesis pattern) -9. **Conditional dispatch in plan-phase.md research step** -- wire researcher dispatch -10. 
**Integration testing** -- verify panel output matches single agent contracts +--- ## Sources -All findings are from direct codebase analysis (HIGH confidence): - -- `get-shit-done/templates/config.json` -- current config schema -- `get-shit-done/bin/lib/core.cjs` -- MODEL_PROFILES table, loadConfig, resolveModelInternal -- `get-shit-done/bin/lib/config.cjs` -- config CRUD operations -- `get-shit-done/bin/lib/init.cjs` -- INIT JSON pre-computation for all workflow types -- `get-shit-done/workflows/plan-phase.md` -- plan checker and researcher dispatch points -- `get-shit-done/workflows/execute-phase.md` -- verifier dispatch point -- `get-shit-done/workflows/auto-discuss.md` -- existing parallel agent spawn + synthesis pattern -- `get-shit-done/workflows/settings.md` -- settings UI pattern for config toggles -- `agents/gsd-plan-checker.md` -- plan checker output contract (VERIFICATION PASSED / ISSUES FOUND) -- `agents/gsd-verifier.md` -- verifier output contract (Verification Complete + VERIFICATION.md) -- `agents/gsd-phase-researcher.md` -- researcher output contract (RESEARCH COMPLETE / RESEARCH BLOCKED) -- `get-shit-done/references/model-profiles.md` -- model profile philosophy and table structure +All findings from direct codebase analysis (HIGH confidence): + +- `/Users/annon/projects/get-shit-done/get-shit-done/bin/lib/core.cjs` — `loadConfig`, `resolveModelInternal` (lines 60-111, 344-359) +- `/Users/annon/projects/get-shit-done/get-shit-done/bin/lib/commands.cjs` — `cmdResolveModel` (lines 200-219) +- `/Users/annon/projects/get-shit-done/get-shit-done/bin/lib/config.cjs` — `cmdConfigSet` validation (lines 105-110) +- `/Users/annon/projects/get-shit-done/get-shit-done/bin/lib/init.cjs` — `discuss_agents` in INIT JSON (line 666) +- `/Users/annon/projects/get-shit-done/get-shit-done/workflows/autopilot.md` — config-set calls (lines 51, 233) +- `/Users/annon/projects/get-shit-done/get-shit-done/workflows/auto-discuss.md` — AGENT_COUNT read (lines 30-32) +- 
`/Users/annon/projects/get-shit-done/get-shit-done/workflows/plan-phase.md` — auto_advance check (lines 444-446) +- `/Users/annon/projects/get-shit-done/get-shit-done/workflows/execute-phase.md` — auto_advance check (lines 184, 408-410) +- `/Users/annon/projects/get-shit-done/get-shit-done/workflows/discuss-phase.md` — auto_advance set/check (lines 444-451) +- `/Users/annon/projects/get-shit-done/get-shit-done/references/model-profiles.md` — model_overrides documentation +- `/Users/annon/projects/get-shit-done/.planning/codebase/ARCHITECTURE.md` — system layer analysis + +--- + +*Architecture research for: PR #762 fix integration* +*Researched: 2026-02-28* diff --git a/.planning/research/FEATURES.md b/.planning/research/FEATURES.md index 489d06b454..b159b45d1e 100644 --- a/.planning/research/FEATURES.md +++ b/.planning/research/FEATURES.md @@ -1,337 +1,285 @@ -# Feature Landscape: MoE Panels & Consensus Mechanisms +# Feature Research: PR Review Fixes -**Domain:** AI agent orchestration quality gates with parallel specialist panels -**Researched:** 2026-02-26 -**Overall Confidence:** MEDIUM-HIGH +**Domain:** OSS contributor workflow — addressing reviewer feedback on autopilot mode PR +**Researched:** 2026-02-28 +**Confidence:** HIGH -## Executive Summary +## Context -GSD currently uses single-agent quality gates (plan-checker, verifier, phase-researcher) that each bear the full responsibility of their domain. The v2.0 MoE Panels milestone replaces each gate with a panel of 3 parallel specialists, each covering a non-overlapping domain partition. This is not a voting system -- it is a domain-partitioned assembly pattern where each specialist owns distinct sections of the output document and a synthesizer merges their non-overlapping contributions. +This research addresses four discrete fix areas from reviewer feedback on PR #762 (autopilot mode). 
The PR was flagged for scope creep (5 distinct efforts bundled together), a config mutation bug, missing validation, and undocumented config. Research below maps each fix area to table stakes vs differentiators, with complexity and dependency notes. -The key insight from research: **voting and consensus mechanisms solve a different problem than what GSD panels need.** Voting works when multiple agents evaluate the *same* thing and you need to pick the best answer. Domain-partitioned assembly works when agents evaluate *different things* and you need to combine their non-overlapping findings. GSD panels are the latter -- specialists checking distinct dimensions, not redundant reviewers voting on the same dimensions. - -The auto-discuss workflow already proves the panel pattern works in this codebase: it spawns N agents in parallel, collects structured outputs, and synthesizes them. The MoE panel pattern is a constrained version of auto-discuss where specialist domains are pre-defined (not dynamically generated) and output sections are non-overlapping (not debated). +--- -## Table Stakes +## Fix Area 1: Runtime Flags vs Config File Mutation -Features that must exist for panels to deliver value over the current single-agent gates. 
+### The Problem -| Feature | Why Expected | Complexity | Confidence | Notes | -|---------|-------------|------------|------------|-------| -| Parallel specialist spawning | Panels must run specialists concurrently to avoid 3x latency | Low | HIGH | GSD already spawns parallel agents in auto-discuss and wave execution | -| Domain-partitioned output assembly | Each specialist must own distinct document sections to avoid duplication and conflict | Medium | HIGH | Core innovation -- see Architecture section | -| Backward compatibility with orchestrators | plan-phase, execute-phase, and research-phase must consume panel output identically to single-agent output | Medium | HIGH | VERIFICATION.md, RESEARCH.md, and checker returns must keep same format | -| Per-panel configuration | Users must be able to enable/disable panels per gate (config.json) and fall back to single-agent | Low | HIGH | Follows existing `workflow.research`, `workflow.plan_check`, `workflow.verifier` pattern | -| Specialist agent definitions (9 agents) | 3 specialists per panel x 3 panels = 9 new agent .md files | High | HIGH | Largest content effort -- each agent needs focused role, dimensions, and output format | -| Panel synthesizer logic | Orchestrator or synthesizer agent must merge 3 specialist outputs into single output document | Medium | MEDIUM | Similar to gsd-research-synthesizer pattern already in codebase | -| Cross-validation between specialists | When one specialist flags an issue, adjacent specialists should verify (reduces false positives) | Medium | MEDIUM | Inspired by diffray's cross-validation approach (87% fewer false positives) | -| Model-per-specialist configuration | Different specialists may benefit from different model strengths | Low | HIGH | Already supported via model profile resolution | +`autopilot.md` calls `config-set workflow.auto_advance true` to enable auto-advance for the duration of the autopilot run. This mutates `.planning/config.json` persistently. 
If autopilot is interrupted (crash, kill, user cancel), the `milestone_complete` cleanup step never fires, and `auto_advance` stays `true` in the user's config file permanently. The reviewer correctly identified this as a correctness bug. -## Differentiators +### Table Stakes (Must Fix) -Features that elevate panels beyond basic parallel execution. +| Feature | Why Expected | Complexity | Notes | +|---------|--------------|------------|-------| +| Auto-advance enabled only for autopilot session | Users expect that running `/gsd:autopilot` does not permanently change their config | LOW | Industry standard: CLI flags are session-scoped; config files are persistent preferences. npm, git, cargo, kubectl all follow this pattern. | +| Cleanup idempotency | If autopilot stops for any reason (gap found, checkpoint, crash), config must not be left in a mutated state | LOW | Session flag eliminates the cleanup problem entirely — no cleanup needed if nothing was mutated | +| No regression for manual `auto_advance` config | Users who have `workflow.auto_advance: true` in their config file manually must see no behavior change | LOW | Session flag is additive — it passes `--auto` argument or reads an in-memory flag, not touching disk config | -| Feature | Value Proposition | Complexity | Confidence | Notes | -|---------|-------------------|------------|------------|-------| -| Conflict detection at merge time | When two specialists make contradictory claims about the same artifact, flag for resolution rather than silently including both | Medium | MEDIUM | Only relevant at domain boundaries -- should be rare with good partitioning | -| Specialist confidence weighting | Specialists report confidence per finding; synthesizer weights HIGH findings above LOW | Low | MEDIUM | Lightweight version of attention-based routing from MoE literature | -| Degraded-mode fallback | If one specialist fails/times out, produce partial panel output with explicit gaps rather than blocking entirely | 
Medium | HIGH | Important for reliability -- single-agent fallback for failed specialist | -| Panel-level scoring | Aggregate specialist scores into panel-level pass/fail with drill-down | Low | HIGH | Verifier already produces scores; extend to per-specialist breakdown | -| Configurable specialist count | Allow 1-specialist (single-agent mode), 3-specialist (standard), or 5-specialist (deep) per panel | Medium | LOW | Premature optimization; start with 3-specialist only, add later | +### Differentiators (Nice to Have) -## Anti-Features +| Feature | Value Proposition | Complexity | Notes | +|---------|-------------------|------------|-------| +| `--no-auto` flag to disable auto-advance per-invocation | Allows users with persistent `auto_advance: true` to run a single manual phase | LOW | Inverse of the runtime flag pattern | +| Explicit autopilot mode banner showing active runtime overrides | User sees what config overrides are active for this run | LOW | Transparency over magic | -Features to explicitly NOT build. +### Anti-Features | Anti-Feature | Why Avoid | What to Do Instead | |--------------|-----------|-------------------| -| Voting/majority consensus between specialists | Specialists own non-overlapping domains -- there is nothing to vote on. Voting suits overlapping evaluations (like auto-discuss where agents evaluate the same gray areas). | Use domain-partitioned assembly: each specialist contributes its section, synthesizer merges. | -| Multi-round debate between specialists | Research shows increasing discussion rounds *decreases* performance (Kaesberg et al., 2025). Adds latency with diminishing returns. | Single-round parallel execution. If cross-validation catches a conflict, the synthesizer resolves it -- no iterative debate. | -| Dynamic specialist routing (MoE-style gating) | True MoE routing requires a trained gating network. Our specialists are pre-assigned -- the "routing" is static by design. 
Dynamic routing adds complexity without benefit for 3-specialist panels. | Static assignment: each specialist always runs. All 3 always fire. | -| Shared context between parallel specialists | Research shows isolated execution produces better diversity. Claude Code subagents already run in isolated contexts. Sharing context risks groupthink. | Each specialist gets independent context. Synthesizer sees all outputs. | -| Complex weighting/scoring algorithms | Over-engineering the merge. The output is markdown, not numerical predictions. | Simple structured merge with section ownership. | -| Specialist-to-specialist communication during execution | Adds synchronization complexity, defeats the purpose of parallelism, risks cascading failures. | Post-execution cross-validation only (synthesizer reads all outputs, flags contradictions). | - -## Consensus Mechanism Analysis +| Writing session state to config.json | Leaks ephemeral state into persistent user preferences; impossible to clean up on crash | Pass `--auto` as argument to the plan-phase subagent Task call instead of persisting to config | +| Using a `.lock` file as session marker | Adds complexity and still leaves cleanup problem | Argument-based activation has no cleanup problem | -The central design question. Based on research into how multi-agent systems combine specialist outputs. +### Implementation Pattern -### Mechanism 1: UNION Assembly (RECOMMENDED) +The standard pattern across CLI tooling (npm, git, curl): command-line arguments override config values for that invocation only. Config files store user preferences; flags store session intent. -**What:** Each specialist owns non-overlapping sections of the output document. Synthesizer concatenates sections, resolves boundary conflicts, and produces the final document. +For GSD: remove the `config-set workflow.auto_advance true` step from `autopilot.md`. Instead, the `run_phase_chain` step passes `ARGUMENTS='${PHASE} --auto'` to plan-phase. 
The plan-phase workflow already has an `auto_advance` check that reads config — extend it to also check for `--auto` in arguments. No config mutation, no cleanup needed. -**When it works:** When specialists have clearly partitioned domains with minimal overlap (which is the design intent for all 3 panels). +**Dependency:** Requires reading `--auto` flag in `plan-phase.md`'s auto_advance logic. Low touch — plan-phase already has this branching. -**Evidence:** This is how diffray's multi-agent code review works -- 11 specialists each own a concern, findings are merged and deduplicated. Google ADK's ParallelAgent pattern also uses this: parallel execution with post-processing merge. The existing gsd-research-synthesizer in this codebase is literally a UNION assembler -- it reads 4 parallel researcher outputs (STACK, FEATURES, ARCHITECTURE, PITFALLS) and synthesizes SUMMARY.md. +--- -**Tradeoffs:** -- PRO: No latency penalty beyond slowest specialist (parallel execution) -- PRO: No information loss (every specialist's findings included) -- PRO: Simple implementation (structured merge, no voting logic) -- CON: Requires clean domain partitioning (overlap = conflicts) -- CON: Synthesizer must handle boundary cases +## Fix Area 2: Input Validation for Config Values at Runtime -**Confidence:** HIGH -- This pattern is already proven in the codebase (research-synthesizer, auto-discuss synthesis step). +### The Problem -### Mechanism 2: Majority Voting +Two validation gaps identified in PR review: -**What:** All specialists evaluate the same dimensions. Each votes pass/fail on each dimension. Majority wins. +1. `config.cjs:cmdConfigSet` validates `discuss_agents` and `discuss_model` only when invoked via the `config-set` CLI command. Direct edits to `config.json` bypass this entirely. +2. `auto-discuss.md` reads `AGENT_COUNT` from config and uses it directly in agent spawning logic without validating the value it received. 
-**When it works:** When you want redundancy -- multiple agents checking the same thing to reduce error. +### Table Stakes (Must Fix) -**Evidence:** The Kaesberg et al. (2025) study found voting improves reasoning tasks by 13.2% over consensus, but this applies to tasks where agents solve the *same* problem. Multi-Agent Verification (MAV) scales verifiers, not specialists. +| Feature | Why Expected | Complexity | Notes | +|---------|--------------|------------|-------| +| Runtime validation of `discuss_agents` in auto-discuss | The workflow must not spawn 0, 2, 4, 6, 8, or 10+ agents just because config.json has a bad value | LOW | Standard defensive pattern: validate at consumption point, not only at write point | +| Fallback to default on invalid config value | Invalid `discuss_agents` (non-odd, out-of-range) should fall back to 5, not crash or spawn wrong count | LOW | Same pattern as the existing `AGENT_COUNT=$(... 2>/dev/null \|\| echo "5")` fallback already in auto-discuss.md — extend it | +| Validation message when falling back | User or caller knows a fallback occurred, not silently swallowing bad config | LOW | Print a warning to stderr: "discuss_agents=4 is invalid (must be odd 3-9), using 5" | -**Tradeoffs:** -- PRO: Built-in redundancy (3 agents checking same thing = fewer misses) -- CON: 3x the work for marginally better accuracy on the same dimensions -- CON: Loses the benefit of domain specialization (generalist voters, not specialists) -- CON: Still need a tie-breaking mechanism for 3 agents +### Differentiators (Nice to Have) -**Confidence:** HIGH that this is the WRONG pattern for GSD panels. Voting suits auto-discuss (same gray areas, different perspectives). Panels need specialization, not redundancy. 
+| Feature | Value Proposition | Complexity | Notes | +|---------|-------------------|------------|-------| +| `config-validate` CLI command | Validates all config values against schema on demand | MEDIUM | Useful but separable — not needed for this PR fix | +| JSON schema for `config.json` with validation on load | Catch all invalid values at config load time | MEDIUM | Adds robustness but is a larger change touching `loadConfig()` in core.cjs | +| Startup validation warning for unknown config keys | Warns user that `model_overrides` or other undocumented keys are present | LOW | Complements documentation fix (Fix Area 3) | -### Mechanism 3: Consensus via Iterative Debate +### Anti-Features -**What:** Specialists discuss findings, iterate toward agreement, converge on shared output. +| Anti-Feature | Why Avoid | What to Do Instead | +|--------------|-----------|-------------------| +| Validating only at `config-set` write time | Direct JSON edits bypass CLI; any agent reading config cannot trust values it receives | Validate at consumption point in the workflow | +| Crashing on invalid config | Breaks autopilot entirely for a recoverable problem | Fall back to documented default with a warning | -**Evidence:** Research shows consensus reduces hallucination on fact-based tasks (2.8% improvement) but multiple rounds decrease overall performance. Debate adds latency proportional to round count. +### Implementation Pattern -**Tradeoffs:** -- PRO: May catch edge cases at domain boundaries -- CON: Significantly slower (2-5x latency per round) -- CON: Research explicitly recommends AGAINST multiple rounds -- CON: Complexity explosion in prompt engineering +The OWASP Input Validation Cheat Sheet (2025) recommends allowlist validation: define exactly what is allowed, reject everything else. For `discuss_agents`: -**Confidence:** HIGH that this is overkill for GSD panels. 
+```bash +# In auto-discuss.md initialize step +AGENT_COUNT=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get autopilot.discuss_agents 2>/dev/null || echo "5") +# Validate: must be odd number 3-9 +case "$AGENT_COUNT" in + 3|5|7|9) ;; # valid + *) echo "Warning: discuss_agents=$AGENT_COUNT invalid (must be 3/5/7/9). Using 5." >&2; AGENT_COUNT=5 ;; +esac +``` -### Mechanism 4: Domain-Partitioned Assembly with Cross-Validation (RECOMMENDED VARIANT) +**Dependency:** Self-contained change to `auto-discuss.md`. No changes to `config.cjs` required for this fix. The existing validation in `cmdConfigSet` is a separate concern (prevents bad values from being written via CLI) and is fine as-is for this milestone. -**What:** UNION assembly (Mechanism 1) plus a lightweight cross-validation step where the synthesizer checks for contradictions between specialist outputs before producing the final document. +--- -**Example:** Plan-checker structural specialist says "dependencies valid" but semantic specialist says "task 3 references output from task 1 which doesn't produce that artifact." The synthesizer flags this as a cross-validation conflict and elevates severity. +## Fix Area 3: Config Documentation Best Practices -**Tradeoffs:** -- PRO: Gets the speed of UNION assembly -- PRO: Catches boundary-crossing issues -- PRO: Synthesizer is a natural place for this (already reads all outputs) -- CON: Slightly more complex synthesizer logic +### The Problem -**Confidence:** HIGH -- This is the recommended approach. +Two documentation gaps: -## Panel Designs: Detailed Feature Maps +1. `model_overrides` was added to `loadConfig()` in `core.cjs` with no validation, no documentation, and no tests. Reviewer flagged it as undocumented arbitrary JSON passthrough. +2. The `autopilot` config section (`discuss_agents`, `discuss_model`) is added to defaults in `config.cjs` but not documented anywhere users can discover it. 
-### Panel 1: Plan Checker Panel +### Table Stakes (Must Fix) -**Current single agent:** gsd-plan-checker (8 verification dimensions, returns "VERIFICATION PASSED" or "ISSUES FOUND") +| Feature | Why Expected | Complexity | Notes | +|---------|--------------|------------|-------| +| Every config key documented in one place | OSS users expect a single authoritative source for all config options | LOW | Industry convention: `README.md` config section, or `docs/config.md` — one file, all keys, types, defaults, description | +| `model_overrides` documented or removed from this PR | If it has no tests and no documentation, it should not be in the PR | LOW | Remove from `loadConfig()` if premature; if intentional, add docs + at least one test for it | +| New `autopilot.*` keys listed with types, defaults, valid values | Users configuring autopilot need to know the valid range for `discuss_agents` | LOW | Short addition to existing config docs section in README | -**Panel specialists:** +### Differentiators (Nice to Have) -| Specialist | Domain | Dimensions Owned | Output Section | -|------------|--------|-----------------|----------------| -| **Structural Integrity** | Plan mechanics: frontmatter, task completeness, dependency graphs, wave assignment, scope metrics | Dim 2 (Task Completeness), Dim 3 (Dependency Correctness), Dim 5 (Scope Sanity), Dim 8 (Nyquist Compliance) | `## Structural Analysis` | -| **Semantic Quality** | Goal-backward analysis: requirement coverage, must_haves derivation, key links planned, action specificity | Dim 1 (Requirement Coverage), Dim 4 (Key Links Planned), Dim 6 (Verification Derivation) | `## Semantic Analysis` | -| **Compliance** | External constraints: CONTEXT.md decisions, project skills, CLAUDE.md conventions, deferred ideas exclusion | Dim 7 (Context Compliance), plus project skill rules, plus CLAUDE.md conventions | `## Compliance Analysis` | +| Feature | Value Proposition | Complexity | Notes | 
+|---------|-------------------|------------|-------| +| Inline comments in the generated `config.json` defaults | Users see documentation when they open their config file | LOW | JSON doesn't support comments natively, but the `cmdConfigEnsureSection` output could include a header comment block as a separate file or README | +| `config-docs` CLI command | Prints all config options with descriptions | MEDIUM | Nice for discoverability, separable from PR fix | +| CHANGELOG.md entry for new config keys | Maintains project-level history of config schema evolution | LOW | Keep A Changelog pattern: add to `[Unreleased]` section under `### Added` | -**Why this partition:** Structural checks are mechanical (parseable, countable). Semantic checks require reasoning about intent vs. outcome. Compliance checks require cross-referencing external constraint documents. These are genuinely different cognitive tasks. +### Anti-Features -**Synthesizer behavior:** Merge all 3 sections. Cross-validate: if Structural says "3 tasks" but Semantic says "requirement X covered by tasks 1,2,3,4" -- flag inconsistency. Produce unified issue list with severity. Overall pass/fail uses worst-case: any blocker from any specialist = ISSUES FOUND. 
+| Anti-Feature | Why Avoid | What to Do Instead | +|--------------|-----------|-------------------| +| Documenting config in PR description only | PR descriptions are not part of the codebase; users won't find them | Put docs in README or a dedicated config reference file | +| Adding config options without tests | Untested config options become technical debt | Pair each new config key with at least one test in `config.test.cjs` | -**Output format (backward compatible):** -```markdown -## VERIFICATION PASSED | ISSUES FOUND +### Documentation Pattern (OSS Standard) -**Phase:** {phase-name} -**Plans verified:** {N} -**Panel:** Structural + Semantic + Compliance +The Keep a Changelog specification and GitHub's own OSS project guidance both establish: every new configuration option added in a release should (a) appear in the changelog under `Added`, (b) appear in the README config reference, and (c) have its validation rules, if any, covered by tests. -### Structural Analysis -[Structural specialist output: dimensions 2,3,5,8] +For `model_overrides` specifically: the OSS principle is "don't ship what you can't support." If it has no validation, no tests, and no documentation, it should be either removed from this PR or explicitly scoped to a follow-up PR with a `TODO:` comment and a failing test marking it as incomplete. -### Semantic Analysis -[Semantic specialist output: dimensions 1,4,6] +**Dependency:** Purely additive changes to README.md and optionally CHANGELOG.md. No source code changes required for basic documentation fix. For `model_overrides`: either a one-line revert in `core.cjs` or a test addition in the test suite.
-### Compliance Analysis -[Compliance specialist output: dimension 7 + project rules] +--- -### Cross-Validation Notes -[Synthesizer's boundary-crossing findings] +## Fix Area 4: PR Splitting Strategies -### Unified Issue List -[Merged, deduplicated, severity-ranked issues from all specialists] -``` +### The Problem -**Confidence:** HIGH -- The 8-dimension structure naturally partitions into these 3 groups. No dimension is ambiguously assigned. +PR #762 bundles 5 distinct efforts into one 8,179-line addition PR: +1. Test suite overhaul (~6,370 lines across 13 files) +2. CI pipeline (~50 lines) +3. Autopilot feature (~620 lines: `autopilot.md`, `auto-discuss.md`, config changes) +4. Resolve-model fix (overlaps with PR #761) +5. `model_overrides` config loading (undocumented, untested) -### Panel 2: Verifier Panel +### Table Stakes (Must Fix) -**Current single agent:** gsd-verifier (3-level artifact verification, key link checks, anti-pattern scanning, creates VERIFICATION.md) +| Feature | Why Expected | Complexity | Notes | +|---------|--------------|------------|-------| +| Test suite + CI as standalone PR | Tests and CI can be reviewed and merged independently; reviewer can verify tests pass without the autopilot feature | LOW | No dependencies on autopilot code — pure test infrastructure | +| Resolve-model fix coordinated with PR #761 | Two PRs fixing the same function will conflict on merge | LOW | One approach: rebase #762's resolve-model change on top of #761 after #761 lands, then drop it from this PR | +| Autopilot feature as focused PR | The actual new feature with only its direct dependencies | LOW | Once tests+CI and resolve-model are extracted, the autopilot PR becomes ~670 lines | +| `.planning/` artifacts removed from branch | Development artifacts (STATE.md, PLAN.md, SUMMARY.md referencing contributor filesystem paths) do not belong in the repo | LOW | `git rm .planning/STATE.md .planning/quick/` from the PR branch | -**Panel specialists:** +### 
Differentiators (Nice to Have) -| Specialist | Domain | Checks Owned | Output Section | -|------------|--------|-------------|----------------| -| **Artifact & Wiring** | File existence, substantive content (not stubs), import/usage wiring, key link verification | Level 1 (exists), Level 2 (substantive), Level 3 (wired), Key Links | `## Artifact Verification` + `## Key Link Verification` | -| **Requirements & Anti-Patterns** | Requirement coverage, anti-pattern scanning (TODO/FIXME/placeholder/empty returns), goal-backward truth verification | Requirement mapping, anti-pattern detection, truth status determination | `## Requirements Coverage` + `## Anti-Patterns Found` | -| **Human Verification** | Items needing human testing, visual/UX concerns, external service integration checks, edge case identification | Human-needed classification, test script generation, uncertainty flagging | `## Human Verification Required` | +| Feature | Value Proposition | Complexity | Notes | +|---------|-------------------|------------|-------| +| Stacked PR approach for future large features | Autopilot + MoE panels will be large; establish a workflow now | MEDIUM | Tools: `git rebase --update-refs`, Graphite, or manual stacking | +| PR template enforcing size and scope checklist | Prevents future scope creep in submissions | LOW | Separable from this milestone's fixes | -**Why this partition:** Artifact/wiring checks are grep-based (mechanical file analysis). Requirement/anti-pattern checks are reasoning-based (does this code satisfy that requirement?). Human verification is judgment-based (what can't be verified programmatically?). Each requires different cognitive approaches and tool usage patterns. +### Anti-Features -**Synthesizer behavior:** Merge sections into VERIFICATION.md format. 
Determine overall status: `passed` (all artifacts verified + all requirements satisfied + no blocker anti-patterns), `gaps_found` (any failure), `human_needed` (automated pass but human items remain). Score = verified truths / total truths. Cross-validate: if Artifact specialist says "file exists and is wired" but Anti-Pattern specialist says "file contains only TODO placeholders" -- elevate to blocker. +| Anti-Feature | Why Avoid | What to Do Instead | +|--------------|-----------|-------------------| +| Squashing everything into one commit before splitting | Loses granular history, makes bisect harder | Use `git cherry-pick` or `git rebase -i` to move commits to new branches | +| Creating split PRs that target main directly without stacking | If PR A depends on PR B, merging order matters; targeting main with dependent PRs risks broken states | Stack PRs on each other with clear dependency labels in PR body | -**Output format (backward compatible):** -```yaml ---- -phase: XX-name -verified: YYYY-MM-DDTHH:MM:SSZ -status: passed | gaps_found | human_needed -score: N/M must-haves verified -panel: artifact-wiring + requirements-antipatterns + human-verification -gaps: [...] # Merged from all specialists -human_verification: [...] # From human verification specialist ---- -``` +### Recommended Split Order -**Confidence:** HIGH -- The verifier's existing steps (verify_artifacts, verify_wiring, verify_requirements, scan_antipatterns, identify_human_verification) map cleanly to these 3 specialists. 
+Based on the dependency graph: -### Panel 3: Research Panel +``` +PR A: tests + CI (no dependencies) + └─> PR B: resolve-model fix (depends on: rebase after #761 lands) + └─> PR C: autopilot feature (depends on: config keys from core, auto-discuss from A) +``` -**Current single agent:** gsd-phase-researcher (produces RESEARCH.md with stack, patterns, pitfalls, code examples) +**Rationale:** Tests+CI can land immediately — it's the least risky and validates the CI setup itself. Resolve-model must coordinate with #761 to avoid conflicts. Autopilot should be last because it depends on the config infrastructure and the resolve-model fix being in main. -**Panel specialists:** +**Dependency on `model_overrides`:** Remove from PR C (autopilot) unless it has documentation and tests. If needed for autopilot, scope it explicitly with validation. -| Specialist | Domain | Sections Owned | Output Section | -|------------|--------|---------------|----------------| -| **Stack & Ecosystem** | Library recommendations, versions, alternatives, don't-hand-roll, installation commands | Standard Stack, Don't Hand-Roll, State of the Art, Installation | `## Standard Stack` + `## Don't Hand-Roll` + `## State of the Art` | -| **Architecture & Patterns** | Project structure, design patterns, code examples, recommended organization | Architecture Patterns, Code Examples, Recommended Project Structure | `## Architecture Patterns` + `## Code Examples` | -| **Pitfalls & Validation** | Common mistakes, pitfalls, gotchas, validation architecture (Nyquist), open questions | Common Pitfalls, Validation Architecture, Open Questions | `## Common Pitfalls` + `## Validation Architecture` + `## Open Questions` | +--- -**Why this partition:** These are genuinely different research domains. Stack research requires checking Context7/official docs for current versions. Architecture research requires understanding design patterns and project structure. 
Pitfall research requires finding community wisdom about what goes wrong. Different tool usage, different sources, different reasoning. +## Feature Dependencies -**Synthesizer behavior:** Merge into single RESEARCH.md. Cross-validate: if Stack specialist recommends library X but Pitfalls specialist warns against library X -- flag conflict, let synthesizer resolve or include both with warning. Add Summary section (synthesized from all 3). Ensure User Constraints section appears first (copied from CONTEXT.md by all specialists independently, deduplicated by synthesizer). +``` +Fix Area 1 (runtime flag for auto-advance) + └──modifies──> autopilot.md (remove config-set call) + └──modifies──> plan-phase.md (read --auto argument) + +Fix Area 2 (input validation for discuss_agents) + └──modifies──> auto-discuss.md (validate AGENT_COUNT at read time) + └──depends on──> Fix Area 1 (same PR: autopilot feature) + +Fix Area 3 (config documentation) + └──adds──> README.md (config reference section) + └──optionally modifies──> CHANGELOG.md (unreleased section) + └──optionally reverts──> core.cjs (remove model_overrides if premature) + +Fix Area 4 (PR splitting) + └──precedes──> all other fix areas (structure work, not code work) + └──depends on──> git branch manipulation (cherry-pick or rebase) +``` -**Output format (backward compatible):** -```markdown -# Phase [X]: [Name] - Research +### Dependency Notes -**Researched:** [date] -**Domain:** [domain] -**Confidence:** [level] -**Panel:** Stack + Architecture + Pitfalls +- **Fix Areas 1 and 2 share a PR (autopilot feature):** They both touch autopilot.md and auto-discuss.md, so they belong together in the same focused PR. +- **Fix Area 3 can land in any PR:** Documentation for `model_overrides` is independent of the runtime flag fix. If `model_overrides` is removed, Fix Area 3 is just a README addition. +- **Fix Area 4 must happen first:** The PR split is the prerequisite for all other fixes to be reviewable as separate PRs. 
+- **Resolve-model fix (PR #761 coordination):** This is not strictly part of v1.3 code changes but is a PR management task. It should be tracked separately. -## User Constraints (from CONTEXT.md) -[Synthesizer deduplicates from all 3 specialists] +--- -## Summary -[Synthesizer writes this from combined findings] +## MVP Definition -## Standard Stack -[From Stack & Ecosystem specialist] +### Do Now (v1.3 — this milestone) -## Architecture Patterns -[From Architecture & Patterns specialist] +- [x] Split PR: extract tests+CI, resolve-model, autopilot into separate PRs — **no code required, git branch work** +- [x] Remove `.planning/` artifacts from autopilot PR branch — **git rm** +- [x] Fix auto-advance config mutation — **remove 1 line from autopilot.md, add --auto to phase chain call** +- [x] Add runtime validation for `discuss_agents` in auto-discuss.md — **~5 lines of shell validation** +- [x] Document `autopilot.*` config keys in README — **~10 lines of docs** +- [x] Decide: remove `model_overrides` from loadConfig() or add tests+docs -## Don't Hand-Roll -[From Stack & Ecosystem specialist] +### Defer to Later -## Common Pitfalls -[From Pitfalls & Validation specialist] +- [ ] `config-validate` CLI command — useful but separable from PR review fixes +- [ ] JSON schema validation on config load — larger refactor, different milestone +- [ ] Stacked PR tooling setup — process improvement, not a code fix +- [ ] PR template for scope checklist — governance, not code -## Code Examples -[From Architecture & Patterns specialist] +--- -## State of the Art -[From Stack & Ecosystem specialist] +## Feature Prioritization Matrix -## Validation Architecture -[From Pitfalls & Validation specialist] +| Feature | User Value | Implementation Cost | Priority | +|---------|------------|---------------------|----------| +| Remove config mutation (auto-advance) | HIGH — prevents silent config corruption | LOW — remove 1 line, add --auto arg | P1 | +| Runtime validation for 
discuss_agents | HIGH — prevents autopilot spawning wrong agent count | LOW — 5 lines of shell validation | P1 | +| PR split (tests+CI separate) | HIGH — unblocks reviewer approval | LOW — git branch work only | P1 | +| Remove .planning/ artifacts | HIGH — removes contributor filesystem path leakage | LOW — git rm | P1 | +| Document autopilot.* config keys | MEDIUM — discoverability for users | LOW — README addition | P2 | +| Decide on model_overrides | MEDIUM — cleanliness of codebase | LOW — revert 1 line OR add tests | P2 | +| config-validate command | LOW — convenience | MEDIUM — new CLI command | P3 | +| Stacked PR workflow docs | LOW — process hygiene | LOW | P3 | -## Open Questions -[From Pitfalls & Validation specialist, augmented by synthesizer] +--- ## Sources -[Merged from all specialists] -``` - -**Confidence:** HIGH -- This is essentially the same pattern as the existing project research pipeline (4 parallel researchers + synthesizer) but applied at phase level. - -## Feature Dependencies -``` -Per-panel config (config.json) --> Panel enablement check in orchestrators - | - v -Specialist agent definitions (9 agents/*.md files) - | - v -Panel orchestration logic in workflows (plan-phase.md, execute-phase.md, research-phase.md) - | - v -Synthesizer logic (per-panel merge + cross-validation) - | - v -Backward-compatible output format (same VERIFICATION.md / RESEARCH.md / checker return) - | - v -Degraded-mode fallback (if specialist fails, fall back to single-agent) -``` +### Runtime Flags vs Config Mutation (HIGH confidence) +- npm config precedence model: [npm-config docs](https://docs.npmjs.com/cli/v6/using-npm/config/) — CLI flags override config files, not the reverse +- node-config library: [Environment Variables wiki](https://github.com/node-config/node-config/wiki/Environment-Variables) — env vars override config files; config files store persistent preferences +- GSD codebase: `autopilot.md` lines 47-52 (`ensure_auto_advance` step) — the mutation 
bug is directly observable -**Critical dependency:** Specialist agent definitions must be complete before orchestration logic can be tested. The 9 agent .md files are the largest work item and the foundation for everything else. - -**Parallel work streams:** -- Stream A: Specialist agent definitions (9 files) -- can be done in parallel across panels -- Stream B: Config schema updates -- independent of agent definitions -- Stream C: Orchestrator workflow updates -- depends on A being at least partially done - -## Implementation Complexity Assessment - -| Component | Effort | Risk | Notes | -|-----------|--------|------|-------| -| 9 specialist agent .md files | HIGH (largest effort) | LOW (well-understood pattern from existing agents) | Each is ~200-400 lines. Total ~2700-3600 lines of prompt engineering. | -| Config schema updates | LOW | LOW | Add `panel` section to config.json with per-gate enable/disable | -| plan-phase.md orchestrator update | MEDIUM | MEDIUM | Replace single plan-checker spawn with 3 parallel + synthesizer | -| execute-phase.md orchestrator update | MEDIUM | MEDIUM | Replace single verifier spawn with 3 parallel + synthesizer | -| research-phase.md / plan-phase.md research step | MEDIUM | LOW | Already has pattern from project research pipeline | -| Synthesizer logic (3 synthesizers) | MEDIUM | MEDIUM | Could be inline in orchestrator or separate agents. Inline is simpler. | -| Cross-validation logic | LOW | LOW | Lightweight post-merge check in synthesizer | -| Degraded-mode fallback | LOW | LOW | If specialist timeout, run single-agent as fallback | -| Testing/validation | HIGH | HIGH | Need to verify panel output matches what downstream consumers expect | - -## MVP Recommendation - -**Phase 1 (Foundation): Agent Definitions + Config** -1. Define 9 specialist agent .md files (3 panels x 3 specialists) -2. Add panel config schema to config.json -3. 
No orchestrator changes yet -- agents can be tested standalone - -**Phase 2 (Integration): Orchestrator Panel Spawning** -1. Update plan-phase.md to spawn plan-checker panel (3 parallel + inline synthesis) -2. Update execute-phase.md to spawn verifier panel (3 parallel + inline synthesis) -3. Update plan-phase.md research step to spawn research panel (3 parallel + inline synthesis) -4. Add backward-compatible output format validation - -**Phase 3 (Hardening): Cross-Validation + Fallback** -1. Add cross-validation logic to synthesizers -2. Add degraded-mode fallback for specialist failures -3. Add panel-level scoring and drill-down -4. Test autopilot end-to-end with panels - -**Defer:** -- Configurable specialist count (1/3/5) -- start with 3 only, add later if needed -- Specialist-to-specialist communication -- anti-feature, don't build -- Complex weighting algorithms -- unnecessary for markdown-based outputs +### Input Validation (HIGH confidence) +- OWASP Input Validation Cheat Sheet: [owasp.org](https://cheatsheetseries.owasp.org/cheatsheets/Input_Validation_Cheat_Sheet.html) — allowlist validation pattern; validate at consumption point +- GitHub security blog: [Validate all the things](https://github.blog/security/application-security/validate-all-things-input-validation/) — validate inputs before use, not only at write time +- GSD codebase: `auto-discuss.md` lines 30-32 — `AGENT_COUNT` read with fallback but no validation of the value received -## Sources +### Config Documentation (HIGH confidence) +- Keep a Changelog: [keepachangelog.com](https://keepachangelog.com/en/1.0.0/) — every new option in `### Added` under `[Unreleased]` +- Changelog best practices: [getbeamer.com](https://www.getbeamer.com/blog/11-best-practices-for-changelogs) — document breaking changes, categorize by type, link to additional material +- GSD codebase: `config.cjs` — `autopilot` section added in defaults but no README section or CHANGELOG entry -### Primary (HIGH confidence) -- GSD 
codebase analysis: agents/gsd-plan-checker.md, agents/gsd-verifier.md, agents/gsd-phase-researcher.md, agents/gsd-research-synthesizer.md -- GSD workflow analysis: workflows/plan-phase.md, workflows/execute-phase.md, workflows/auto-discuss.md, workflows/new-project.md -- Google ADK Parallel Agent documentation: https://google.github.io/adk-docs/agents/workflow-agents/parallel-agents/ +### PR Splitting (HIGH confidence) +- Graphite PR size guide: [graphite.com](https://graphite.com/guides/best-practices-managing-pr-size) — under 200 lines ideal, atomic PRs, no mixed change types +- Stacked pull requests: [michaelagreiler.com](https://www.michaelagreiler.com/stacked-pull-requests/) — stack dependent PRs on each other rather than targeting main directly +- PR splitting strategies: [awesomecodereviews.com](https://www.awesomecodereviews.com/best-practices/stacked-pull-requests/) — separate refactors, features, tests into distinct PRs +- GitHub community discussion: [github.com/orgs/community](https://github.com/orgs/community/discussions/181240) — separation of concerns is the primary split criterion +- Git stacking with --update-refs: [andrewlock.net](https://andrewlock.net/working-with-stacked-branches-in-git-is-easier-with-update-refs/) — native git support for stacked branches without third-party tools -### Secondary (MEDIUM confidence) -- Kaesberg et al. (2025) "Voting or Consensus? Decision-Making in Multi-Agent Debate" -- https://arxiv.org/abs/2502.19130 -- Systematic evaluation of 7 decision protocols. Key finding: voting better for reasoning, consensus for knowledge, more agents better than more rounds. -- Qodo "Single-Agent vs Multi-Agent Code Review" -- https://www.qodo.ai/blog/single-agent-vs-multi-agent-code-review/ -- Architecture for domain-partitioned code review with explicit pass/fail signals per specialist. 
-- Diffray "Multi-Agent Code Review" -- https://diffray.ai/multi-agent-code-review/ -- 11-specialist architecture with cross-validation and deduplication. 87% fewer false positives. -- ProofSource "Parallel Sub-Agents in Claude Code" -- https://proofsource.ai/2025/12/parallel-sub-agents-in-claude-code-multiplying-your-development-speed/ -- Claude Code synthesizes subagent findings into coherent responses. Diminishing returns beyond 4-5 parallel agents. +--- -### Tertiary (LOW confidence) -- General multi-agent system surveys from 2025 (classicinformatics, ioni.ai, neomanex) -- broad patterns, not GSD-specific -- MoE model architecture literature (HuggingFace, NVIDIA) -- neural network MoE patterns; analogy to agent panels is loose +*Feature research for: PR review fixes on autopilot mode (get-shit-done v1.3)* +*Researched: 2026-02-28* diff --git a/.planning/research/PITFALLS.md b/.planning/research/PITFALLS.md index 4f86902556..2aeaf9dd25 100644 --- a/.planning/research/PITFALLS.md +++ b/.planning/research/PITFALLS.md @@ -1,352 +1,238 @@ -# Domain Pitfalls: MoE Panels for Agent Orchestration +# Pitfalls Research -**Domain:** Parallel agent orchestration with output merging and consensus logic -**Researched:** 2026-02-26 -**Overall confidence:** HIGH (pitfalls derived from codebase analysis + multi-agent system literature) +**Domain:** PR splitting, runtime config flags, validation hardening, overlapping PRs, artifact cleanup +**Researched:** 2026-02-28 +**Confidence:** HIGH (derived from codebase analysis + documented PR state + common git/Node.js patterns) --- ## Critical Pitfalls -Mistakes that cause rewrites, broken workflows, or silent data loss. - --- -### Pitfall 1: Output Contract Drift Between Panel and Single-Agent Mode - -**What goes wrong:** Panel output diverges from the exact string patterns that downstream workflows regex-match. The plan-phase.md workflow matches `## VERIFICATION PASSED` and `## ISSUES FOUND` as literal strings. 
The execute-phase.md workflow greps `^status:` from VERIFICATION.md frontmatter. The research-phase.md workflow matches `## RESEARCH COMPLETE` and `## RESEARCH BLOCKED`. If a panel's merging logic produces `## Verification Passed` (wrong case), `## ISSUES FOUND\n\n` (extra newline before content), or `status:  passed` (double space), the downstream orchestrator silently falls through to a default branch and the workflow breaks. +### Pitfall 1: Splitting a PR Leaves Commits Orphaned on the Original Branch -**Why it happens:** Three specialists each produce markdown independently. A synthesizer/merger must reconstruct the exact output, but no specialist "owns" the final header format. The merger may normalize whitespace, adjust casing, or insert its own section headers that subtly differ. LLM agents are nondeterministic -- even with identical prompts, they produce slightly different formatting. +**What goes wrong:** +When splitting PR #762 (autopilot) into focused PRs (tests+CI, resolve-model fix, autopilot feature), commits that belong to split-off branches are often left behind on the original branch. The split looks clean in `git log` on the new branch but the original branch still contains the commits, and when it is eventually merged or rebased, the commits appear twice — once from the split PR and once from the original. GitHub may show them as "already merged" in the PR diff, but if force-push or rebase is involved, commits can reappear unexpectedly.
-**Consequences:** -- Orchestrator workflows silently misroute (no error, just wrong branch) -- `plan-phase.md` step 11 fails to detect `## VERIFICATION PASSED` and treats it as inconclusive, entering an infinite revision loop or max-iteration bailout -- `execute-phase.md` step `verify_phase_goal` fails to grep `^status:` from VERIFICATION.md, treating a passed verification as gaps_found -- Users see "gaps found" when everything actually passed -- trust erodes +**Why it happens:** +Developers create a new branch from the original and cherry-pick the relevant commits, believing the branch is now "clean." They do not rebase the original branch to remove the cherry-picked commits. Now both branches contain the same logical changes but as distinct commit objects (different SHAs). When both branches target main, git sees distinct commits and applies both changes, potentially duplicating lines or creating conflicts. -**Prevention:** -1. The orchestrator (not the specialists) must own the final output template. Specialists produce structured data (YAML/JSON or structured markdown sections). The orchestrator assembles the final output using a deterministic template with hardcoded headers. -2. Define output contracts as constants in a shared reference file (`references/panel-contracts.md`) that both panel workflows and tests reference. -3. Never let LLM agents write the `## VERIFICATION PASSED` / `## ISSUES FOUND` header -- the orchestrator writes it based on parsed specialist data. -4. Add contract tests: for each panel, assert that output exactly matches a regex set extracted from the consuming workflow's matching patterns. +**How to avoid:** +1. Start the split from the base commit (where feat/autopilot diverged from main), not from the tip of feat/autopilot. +2. Create each focused branch from main: `git checkout -b fix/resolve-model main`. +3. Cherry-pick only the commits belonging to that PR's scope into the new branch. +4. 
For the original branch (autopilot), interactively rebase to remove commits that were split into other PRs: `git rebase -i main` on feat/autopilot, dropping the lines for cherry-picked commits. +5. Verify the split is clean: `git diff main...fix/resolve-model` should show only the resolve-model fix. `git diff main...feat/autopilot` should show no resolve-model changes. -**Detection:** Integration test that runs panel mode and single-agent mode on identical input, then diffs the structural elements (headers, frontmatter keys, status values). Any diff is a test failure. +**Warning signs:** +- `git log main...feat/autopilot` shows commits that have also appeared in a merged PR +- PR #762's diff includes the resolve-model fix after PR #761 has merged (should be gone) +- GitHub reports "0 changed files" on a PR after another PR was merged (commits were already on base) -**Confidence:** HIGH -- directly derived from codebase analysis of `plan-phase.md`, `execute-phase.md`, `verify-phase.md`, and `research-phase.md` workflow routing patterns. +**Phase to address:** Phase 14 — PR Restructure (first phase of v1.3). Must happen before any other fix work because subsequent phases add commits on top of a correct branch structure. --- -### Pitfall 2: Consensus Logic Double-Counting or Dropping Findings +### Pitfall 2: Runtime Flag Leaks Into Subsequent Sessions via Config Mutation -**What goes wrong:** In the Plan Checker Panel, three specialists (Structural, Semantic, Compliance) may each report the same underlying issue from different angles. UNION of blockers and MAJORITY of warnings sounds simple, but: -- **Double-counting:** Specialist A reports "Task 2 missing " as a structural issue. Specialist B reports "Task 2 cannot be validated" as a semantic issue. These are the same problem counted twice, inflating the blocker count and confusing the planner during revision. 
-- **Dropping findings:** If dedup is too aggressive (e.g., matching on task number alone), distinct issues on the same task get collapsed. "Task 2 missing " and "Task 2 scope too broad" are different problems that share a task reference. -- **Conflicting severities:** Specialist A says "blocker", Specialist B says "warning" for the same finding. UNION of blockers means any-one-says-blocker wins. But if the blocker assessment is wrong (LLM hallucination), there is no correction mechanism. +**What goes wrong:** +The auto-advance feature (autopilot advancing phases without user confirmation) was implemented by mutating `config.json` to set `auto_advance: true`. The reviewer identified this as a bug: config mutation persists across sessions. If the flag is written to disk, a subsequent unrelated session reads the config, finds `auto_advance: true`, and auto-advances without the user expecting it. The user's "one-time" option becomes a permanent state change. -**Why it happens:** Deduplication requires semantic similarity judgment, not exact string matching. The issue descriptions from three LLM agents will never be identical strings even when describing the same problem. Naive dedup (exact match) catches nothing. Aggressive dedup (substring match on plan+task) drops distinct issues. +**Why it happens:** +The natural pattern in Node.js CLI tools is to persist options by writing to the config file. Developers reach for `loadConfig()` + `fs.writeFileSync(configPath, JSON.stringify(...))` because it is how every other setting works in this codebase. The distinction between session-scoped flags (should not persist) and user preferences (should persist) is easy to miss. 
-**Consequences:** -- Planner receives inflated issue count, over-revises plans (rewrites that introduce new problems) -- Planner receives collapsed issues, misses one of two distinct problems -- Revision loop hits max iterations because "fixed" issues keep reappearing from a different specialist's perspective +**How to avoid:** +1. Use an in-memory runtime flag only: add a module-level variable in the relevant workflow or pass it as a parameter through the call chain. Never write session-scoped flags to `config.json`. +2. For the gsd-tools.cjs CLI, pass the flag as a command-line argument or environment variable (`GSD_AUTO_ADVANCE=1`). The process reads it once at startup and it dies with the process. +3. Add a code comment at the flag's declaration: `// Runtime-only: never persist to config.json`. This makes intent explicit for future contributors. +4. If the config loading code ever re-reads `config.json` during a session, the session-scoped value must be held separately and merged after load: `const runtimeFlags = { auto_advance: cliArgs.autoAdvance }; const effective = { ...config, ...runtimeFlags }`. -**Prevention:** -1. Normalize findings to a canonical structure BEFORE dedup: `{plan_id, task_id, dimension, severity, description}`. Dedup on `{plan_id, task_id, dimension}` tuple -- same plan, same task, same dimension = same finding, take highest severity. -2. If two specialists report the same plan+task but different dimensions (e.g., `task_completeness` vs `scope_sanity`), keep both -- they are genuinely different concerns. -3. For severity conflicts: take the highest severity (conservative). A blocker from any specialist is a blocker. This matches the stated UNION-blockers rule. -4. Include a `reported_by` field in merged output so the planner can see which specialists flagged which issues. Transparency reduces confusion during revision. -5. Add a dedup count: "3 specialists flagged this" vs "1 specialist flagged this" helps the planner prioritize. 
+**Warning signs:** +- `config.json` is modified during a run that the user did not invoke as a settings change +- Tests that run in sequence pass individually but fail together (previous test left config state) +- `git diff` on `.planning/config.json` appears in test runs that should not touch config -**Detection:** Unit test with three specialist outputs containing known overlapping and distinct issues. Assert merged output has exact expected count, correct severity escalation, and no dropped findings. - -**Confidence:** HIGH -- the Plan Checker agent already defines 8 verification dimensions (requirement_coverage, task_completeness, dependency_correctness, etc.). The dimension field is the natural dedup key. This is directly grounded in `gsd-plan-checker.md`. +**Phase to address:** Phase 15 — Auto-Advance Fix. This is an isolated code change in `state.cjs` or the autopilot workflow. Low risk of cross-phase interference if done as its own PR. --- -### Pitfall 3: Verifier Panel Domain Boundary Bleed - -**What goes wrong:** The Verifier Panel uses domain-partitioned assembly (not voting): Artifacts specialist checks file existence/substance, Requirements specialist checks requirement coverage, Human-verification specialist identifies what needs manual testing. The risk is that domains bleed: the Artifacts specialist discovers a missing file that is also a requirement gap, and the Requirements specialist independently discovers the same gap from the requirements side. Or worse, neither specialist covers a cross-cutting concern because each assumes the other handles it. +### Pitfall 3: Validation Too Strict Breaks Existing Callers of `discuss_agents` -**Why it happens:** Clean domain boundaries look clear on paper (artifacts vs requirements vs human) but real verification findings are cross-cutting. 
A missing API route is simultaneously an artifact issue (file does not exist), a requirement issue (REQ-AUTH-01 not satisfied), and potentially a human-verification issue (cannot test login flow). Domain partitioning means each specialist sees only their slice of the problem. +**What goes wrong:** +Adding runtime validation for `discuss_agents` in the auto-discuss workflow could break existing users who have configs or workflows that provide `discuss_agents` in an unexpected format. If the validator throws or exits on any unexpected value (instead of defaulting gracefully), users who previously worked fine now get hard errors. -**Consequences:** -- **Gap between domains:** No specialist checks key_links (wiring between artifacts). The current single-agent verifier checks three levels: exists, substantive, wired. If "wired" falls between Artifacts and Requirements domains, nobody checks it. -- **Redundant findings:** Same missing file appears in both Artifacts and Requirements sections with different descriptions, confusing the gap-closure planner. -- **VERIFICATION.md structural inconsistency:** The single-agent verifier produces a specific YAML frontmatter structure (`gaps:` with `truth`, `status`, `reason`, `artifacts`, `missing`). Domain-partitioned assembly must reconstruct this exact structure from three separate domain reports. +**Why it happens:** +Validation is usually added after a bug is discovered. The developer validates the exact case that caused the bug but over-constrains the input space. For example: validating that `discuss_agents` must be an array of strings also rejects a single string (which is a reasonable user shorthand), or rejects an array with empty strings (which might be valid as a "use default agent" signal), or rejects `undefined` (which was previously allowed as "use all agents"). -**Prevention:** -1. Pre-compute shared data BEFORE specialist dispatch. 
The orchestrator runs `gsd-tools.cjs verify artifacts` and `gsd-tools.cjs verify key-links` once, then distributes the JSON results to all specialists. This eliminates the "discovery" phase overlap. -2. Assign key_links (wiring) verification explicitly to the Artifacts specialist. Make domain ownership unambiguous in the specialist prompts. -3. Assembly logic must deduplicate on `artifact.path` -- if both Artifacts and Requirements report an issue for the same file, merge into one gap entry with evidence from both. -4. The assembly step is deterministic code (not LLM). It reads structured sections from each specialist and templates them into the VERIFICATION.md format. Never have an LLM "synthesize" verification output -- the format contract is too strict. +**How to avoid:** +1. Before writing validation, enumerate all values that currently work (test existing configs and workflow invocations). The validation must accept all of them. +2. Prefer defensive coercion over rejection: if `discuss_agents` is a string, coerce to `[discuss_agents]`. If it is `null` or `undefined`, coerce to the default value. Only reject values that are structurally impossible to interpret. +3. The error message on rejection must tell the user exactly what to provide, not just what was wrong: "discuss_agents must be an array of agent names, got: `true`" is better than "invalid discuss_agents". +4. Add a test that passes the old config format (no `discuss_agents` key) and asserts the workflow still runs normally. Backwards compatibility test first, then add the validation. -**Detection:** Test with a scenario where a missing file satisfies multiple domain concerns. Assert the merged VERIFICATION.md contains exactly one gap entry for that file with composite evidence, not duplicates. 
+**Warning signs:** +- Validation added without a test for the pre-existing "no key" case +- Validation uses `=== undefined` check on a key that could also be `null`, `0`, or `false` +- No graceful default — the code throws instead of falling back -**Confidence:** HIGH -- the verifier agent's VERIFICATION.md format is fully specified in `gsd-verifier.md` with YAML frontmatter schema. Domain partitioning assembly must reconstruct this exact schema. +**Phase to address:** Phase 16 — Validation Hardening. Scope: auto-discuss workflow only. Must not touch other callers of discuss_agents outside auto-discuss. --- -### Pitfall 4: Specialist Timeout or Failure Breaks Entire Panel +### Pitfall 4: Coordinating With PR #761 (resolve-model fix) — Merge Order Creates Conflicts -**What goes wrong:** One of three specialists times out (Claude Code has execution time limits), crashes, or produces malformed output. The panel either: (a) fails entirely and produces no output, breaking the workflow, or (b) waits indefinitely for the failed specialist, blocking the entire pipeline. +**What goes wrong:** +PR #761 and PR #762 both touch `resolve-model` logic. If both PRs are open simultaneously targeting main and one merges first, the other PR's diff now shows a conflict on the same lines. Git cannot auto-merge because both PRs modified the same function. The developer must rebase the second PR against the updated main — but if they rebase incorrectly, they either lose the first PR's fix or introduce a double-application of the same change. -**Why it happens:** Claude Code Task subagents can fail due to context limits, model errors, `classifyHandoffIfNeeded` bugs (documented in execute-phase.md), or simple timeouts. With a single agent, failure is straightforward -- the workflow catches it. With three parallel agents, partial failure is the hard case. 
+**Why it happens:** +When two contributors (or two PRs from the same contributor) independently identify the same bug and fix it, their fixes diverge at the implementation level even if they solve the same problem. Cherry-picking is tempting but dangerous: cherry-picking a fix onto a branch that already has the same logical fix (with different surrounding code) silently applies a double-fix or creates syntactically valid but semantically wrong code. -**Consequences:** -- If the panel requires all three specialists: one failure = total panel failure = workflow stops -- If the panel waits for all: one hanging specialist blocks everything -- If the panel proceeds with 2/3: output quality degrades but silently (user does not know one specialist did not contribute) +**How to avoid:** +1. Decide on one canonical fix before both PRs are open simultaneously. If PR #761 (resolve-model) is already merged or likely to merge first, base the fix in PR #762 on the post-merge state of main. +2. After PR #761 merges, immediately rebase feat/autopilot against the updated main: `git fetch origin && git rebase origin/main`. Resolve conflicts at the resolve-model fix site manually — verify the post-rebase code has exactly one copy of the fix, not zero and not two. +3. If PR #761 is closed (not merged), cherry-pick the relevant commit from the closed PR's branch into the new focused PR rather than implementing the fix independently again. +4. If PR #761 was merged: check `git log main --oneline -- path/to/resolve-model-file` to confirm the fix is in main before removing it from PR #762's scope. -**Prevention:** -1. **Graceful degradation rule:** If 2/3 specialists succeed, the panel produces output using available results with a warning header: `Note: {specialist_name} did not complete. Results from {N}/3 specialists.` -2. **Never wait indefinitely.** Set a timeout per specialist. If a specialist has not returned when others have, proceed after a reasonable delay. -3. 
**Fallback to single-agent mode:** If 2/3 specialists fail, abandon the panel and fall back to the single-agent version of the step. This is the safest degradation path because the single-agent mode is the existing, tested code path. -4. **For the Plan Checker Panel:** 2/3 is still valid for UNION blockers (any blocker from any specialist is still a blocker). For MAJORITY warnings, 2/3 means majority = 2 agrees, which still works. -5. **For the Verifier Panel:** If the Artifacts specialist fails, the pre-computed shared data (from gsd-tools.cjs) is still available. The orchestrator can inject it directly into the assembly. If the Requirements specialist fails, the orchestrator can do a simple requirements-to-artifacts cross-reference from the pre-computed data. -6. **For the Research Panel:** If one domain researcher fails, the inline synthesis simply notes the gap: "Stack research not available -- this area needs phase-specific research later." +**Warning signs:** +- Both PRs modify the same file in their diffs +- `git diff main...feat/autopilot` shows changes to resolve-model code even after PR #761 merged +- CI shows a merge conflict check failing on PR #762 -**Detection:** Test by mocking one specialist returning an error or empty output. Assert the panel still produces valid output (with degradation warning) that passes the output contract tests from Pitfall 1. - -**Confidence:** HIGH -- the `classifyHandoffIfNeeded` bug is already documented in `execute-phase.md` step 5. Partial failure handling is a well-established pattern in the codebase's wave execution model (where one plan failing in a wave does not necessarily stop other plans). +**Phase to address:** Phase 14 — PR Restructure (same phase as the split, since coordinate-with-761 is prerequisite to the split being correct). --- -### Pitfall 5: Context Window Bloat From Passing Full Context to All Three Specialists - -**What goes wrong:** Each specialist needs context to do its job. 
Naively passing the full context (ROADMAP, STATE, REQUIREMENTS, CONTEXT.md, RESEARCH.md, all PLAN.md files, codebase analysis docs) to all three specialists triples the effective context cost. Specialists hit context limits and produce degraded output (hallucinations, missed findings, truncated analysis). +### Pitfall 5: Removing `.planning/` Artifacts Breaks the Local Dev Workflow Mid-Milestone -**Why it happens:** The current single-agent architecture passes context via `` blocks -- the agent reads files independently using its fresh 200K context window. This works well for one agent. For three agents, the problem is not the file reading itself (each gets fresh context) but the API cost and latency of three parallel 200K-context conversations. +**What goes wrong:** +The reviewer requested removing committed `.planning/` artifacts (PLAN.md files, SUMMARY.md files, research files) from the PR branch. If these are removed with `git rm` and committed, they disappear from the branch permanently. If the team is mid-execution (using those PLAN.md files to track what to do next), removing them mid-milestone orphans the working state. The agent trying to resume from a checkpoint no longer has the PLAN.md to resume from. -More insidiously: if specialists are given files irrelevant to their domain, they waste context on parsing and may get confused by irrelevant information, producing lower-quality findings. +**Why it happens:** +The reviewer sees `.planning/` files as dev artifacts (like `node_modules` or compiled output) that should not be in the repository. The executor agent produced them as part of the workflow. The conflict: they should be listed in `.gitignore` (ignored by git) for the repo but were committed on the branch before `.gitignore` was updated.
-**Consequences:** -- 3x API cost per panel invocation (3 specialists each with full context) -- Slower panel execution (3 parallel model calls with large context) -- Lower specialist quality: irrelevant context = attention dilution = worse findings +**How to avoid:** +1. Before removing any `.planning/` file from git tracking, verify that the milestone is complete — all phases executed, no active PLAN.md files in-use. Check `.planning/STATE.md` to confirm status. +2. The removal order matters: (a) add `.planning/` patterns to `.gitignore` first, (b) then `git rm --cached .planning/phases/*/PLAN.md` to untrack without deleting the local files, (c) commit the `.gitignore` change and the `git rm --cached` in the same commit, (d) verify local dev still works by checking that the physical files still exist on disk. +3. Do not use `git rm` (without `--cached`) on PLAN.md or SUMMARY.md files that are currently in-use. The physical file must survive; only git's tracking of it should be removed. +4. After the cleanup commit, verify the workflow still functions: run `gsd-tools.cjs phases list` in the temp directory pattern used by tests to confirm `.planning/` artifact removal did not affect test fixtures (tests create their own temp dirs so this should be safe, but verify). -**Prevention:** -1. **Scope specialist context to their domain.** The Plan Checker Structural specialist needs PLAN.md files only (not RESEARCH.md, not STATE.md). The Semantic specialist needs PLAN.md + ROADMAP goal + REQUIREMENTS. The Compliance specialist needs PLAN.md + CONTEXT.md (user decisions). Each specialist reads only what it needs. -2. **Pre-compute shared data.** The Verifier Panel should run `gsd-tools.cjs verify artifacts` and `gsd-tools.cjs verify key-links` ONCE and distribute JSON results, not have each specialist independently read and grep the entire codebase. -3. **For the Research Panel:** Domain researchers already have scoped concerns (Stack, Architecture, Pitfalls). 
Each reads only the files relevant to their domain. The orchestrator should not pass all codebase analysis files to all three researchers. -4. **Measure context efficiency:** Add a diagnostic that logs total tokens consumed per specialist. If any specialist uses more than 60% of its context window, the scoping is too broad. +**Warning signs:** +- `git rm` without `--cached` on files that are still referenced by `.planning/STATE.md` +- `.gitignore` change is in a separate commit from the `git rm --cached` (leaves a window where CI includes artifacts) +- Physical `.planning/` files are deleted from disk during cleanup (check with `ls .planning/phases/`) -**Detection:** Log context window usage per specialist invocation during testing. Alert if any specialist exceeds 60% context utilization on test inputs. - -**Confidence:** HIGH -- Anthropic's own engineering blog recommends scoped context per sub-agent. The GSD architecture already follows this principle (orchestrator passes paths, not content). The risk is that panel implementation regresses this pattern by over-sharing. +**Phase to address:** Phase 17 — Artifact Cleanup. Should be a standalone PR — no code changes, only `.gitignore` additions and `git rm --cached`. Keeps the diff reviewable. --- -## Moderate Pitfalls +## Technical Debt Patterns ---- +Shortcuts that seem reasonable but create long-term problems. -### Pitfall 6: Research Panel Inline Synthesis Produces Inconsistent File Structure - -**What goes wrong:** The Research Panel has 3 domain researchers (Stack, Architecture, Pitfalls) with inline synthesis by the orchestrator (no separate synthesizer). The risk: the orchestrator must produce multiple research files (SUMMARY.md, STACK.md, FEATURES.md, ARCHITECTURE.md, PITFALLS.md) from three domain-specific outputs.
If each researcher produces different structural conventions (different heading levels, different frontmatter, different confidence level labels), the orchestrator's inline synthesis produces inconsistent files. - -**Why it happens:** The current `gsd-project-researcher.md` agent produces all five files with consistent internal formatting because one agent writes all files. When three specialists each write their domain, they may use different formatting conventions despite having the same output format specification in their prompts. LLM agents are not deterministic -- even identical prompts produce variation. - -**Prevention:** -1. Each domain researcher writes exactly ONE file in their domain (Stack researcher -> STACK.md, Architecture researcher -> ARCHITECTURE.md, Pitfalls researcher -> PITFALLS.md). -2. The orchestrator writes SUMMARY.md and FEATURES.md by reading the three domain files and synthesizing. These two files are cross-cutting (they reference findings from all domains). -3. Provide a strict template for each domain file with exact heading structure, table formats, and frontmatter fields. The template is in `get-shit-done/templates/research-project/`. -4. The orchestrator validates each file's structure before committing: correct frontmatter, expected headings present, confidence levels using correct vocabulary (HIGH/MEDIUM/LOW, not "high"/"medium"/"low"). - -**Detection:** Structure validation test: parse each research output file and assert expected headings, frontmatter keys, and confidence level vocabulary match the template specification. - -**Confidence:** MEDIUM -- the exact research output templates exist in the codebase but are guidance, not strict schemas. The risk is moderate because research files are consumed by humans (roadmap creation) not by regex-matching workflows, so slight format variation is more tolerable than in verification or plan-checking output. 
+| Shortcut | Immediate Benefit | Long-term Cost | When Acceptable | +|----------|-------------------|----------------|-----------------| +| Writing runtime flags to config.json | No extra parameter threading through call stack | Flag persists across sessions, surprising users | Never — session flags must never hit disk | +| Validating `discuss_agents !== undefined` only | Catches missing key | Misses `null`, `false`, `0`, empty array — all of which arrive from real configs | Never — validate all falsy paths | +| Removing `.planning/` via `git rm` (not `--cached`) | Cleaner local directory | Destroys workflow state mid-execution | Never if milestone is in-progress | +| Cherry-picking the resolve-model fix without rebasing original branch | Avoids rebase complexity | Both branches contain the same logical change, double-applied when both merge | Never — must rebase original branch | +| Documenting `model_overrides` inline in config.json template | No separate doc to maintain | Template becomes the spec; when code diverges, template is wrong and users are confused | Only if the feature is stable and unlikely to change | --- -### Pitfall 7: Config Migration Breaks Existing Installations - -**What goes wrong:** Three new config keys (`plan_check_panel`, `verifier_panel`, `research_panel`) are added to `config.json`. Existing installations have no these keys. If the code checks `config.plan_check_panel === true` but the key does not exist, it returns `undefined` which is falsy -- correct behavior (panels disabled by default). But if ANY code path checks `config.plan_check_panel !== false` (testing for explicit opt-out instead of explicit opt-in), missing keys evaluate to `true` and panels activate unexpectedly on existing installations. - -**Why it happens:** JavaScript truthiness gotchas. `undefined !== false` is `true`. This is a classic boolean config default problem that has bitten many CLI tools. +## Integration Gotchas -**Prevention:** -1. 
**Always check for explicit opt-in:** `config.plan_check_panel === true`, never `config.plan_check_panel !== false`. -2. **Normalize config at load time:** In `core.cjs` config loading, add default values for all panel keys: `{ plan_check_panel: false, verifier_panel: false, research_panel: false }`. Use `Object.assign(defaults, loadedConfig)` pattern. -3. **Do not update the template config.json** to include panel keys with `true` values. The template should either omit them (relying on defaults) or explicitly set them to `false`. -4. **Add a config schema version.** When new keys are added, bump the schema version. The health check (`/gsd:health`) can warn about missing keys and offer to add defaults. -5. **Test with a config.json that has NO panel keys** -- this is the upgrade path for every existing user. +Common mistakes when connecting the v1.3 fixes to the existing system. -**Detection:** Unit test in `tests/core.test.cjs` (or wherever config loading is tested): load a config.json without panel keys, assert all panel features are disabled. - -**Confidence:** HIGH -- the existing config template in `get-shit-done/templates/config.json` does not have panel keys. The `workflow` section has `research`, `plan_check`, and `verifier` as boolean flags already. The new panel keys must coexist with these existing keys without conflict (see Pitfall 8). 
+| Integration | Common Mistake | Correct Approach | +|-------------|----------------|------------------| +| Runtime flag + loadConfig() | Reading `auto_advance` from config inside a function that is called multiple times per session, picking up stale disk state | Load config once at process start, pass effective config as parameter; never re-read config.json for session-scoped flags | +| `discuss_agents` validation + auto-discuss workflow | Adding validation that calls `process.exit(1)` on invalid input, breaking the workflow silently in CI where exit code 1 is indistinguishable from a test failure | Return structured error JSON matching the existing `output()` / `error()` helper convention, let the orchestrator surface the error | +| PR #762 rebase after PR #761 merge | Rebasing feat/autopilot on main after PR #761 merged, then forgetting to force-push the rebased branch, PR still shows old base | After rebase, always `git push --force-with-lease` to update the PR's remote branch (safer than `--force`: aborts if remote has new commits you have not seen) | +| .planning/ git rm + tests | Using `git rm` in a test that uses `createTempProject()` — the temp dir does not have git initialized, so `git rm` fails | Tests use temp directories that are not git repos; the cleanup change is a one-time git operation on the actual repo, not something to test via `runGsdTools()` | +| config.json template + `model_overrides` | Adding `model_overrides` to the template config.json with example values, users copy the template and get non-default model overrides they did not intend | Either omit the key from the template (rely on code defaults) or add it commented out with a clear "uncomment to customize" note | --- -### Pitfall 8: Config Key Naming Collision With Existing Workflow Keys - -**What goes wrong:** The existing config already has `workflow.plan_check: true` (enables the plan checker step) and `workflow.verifier: true` (enables the verifier step). 
The new MoE keys are `plan_check_panel`, `verifier_panel`, `research_panel`. The risk: confusing interaction between `workflow.plan_check` (enable/disable the step entirely) and `plan_check_panel` (use panel mode vs single-agent mode when the step IS enabled). - -Consider: `workflow.plan_check: false, plan_check_panel: true`. Does this mean: skip plan checking entirely (first key wins) or use panel mode for plan checking (second key wins)? The correct answer is: `workflow.plan_check` gates whether the step runs at all; `plan_check_panel` selects the implementation when it does run. But if this is not documented and tested, bugs will emerge. - -**Why it happens:** Two levels of configuration (feature gate vs implementation selector) is inherently confusing. The naming convention does not make the relationship obvious. - -**Prevention:** -1. **Nest panel config under `workflow`:** `workflow.plan_check_panel`, `workflow.verifier_panel`, `workflow.research_panel`. This keeps all workflow toggles in one place and makes the hierarchy clear. -2. **Document the precedence rule explicitly:** `workflow.plan_check` must be `true` for `workflow.plan_check_panel` to have any effect. If the step is disabled, the panel key is ignored. -3. **Init command should enforce this:** `gsd-tools.cjs init plan-phase` already returns `plan_checker_enabled`. It should also return `plan_check_panel_enabled`, computed as `workflow.plan_check === true && workflow.plan_check_panel === true`. -4. **Add a truth table to the config template as a comment or to the user guide:** - -| `workflow.plan_check` | `workflow.plan_check_panel` | Behavior | -|---|---|---| -| `false` | any | Plan checking skipped entirely | -| `true` | `false` or missing | Single-agent plan checker (current behavior) | -| `true` | `true` | 3-specialist panel plan checker | - -**Detection:** Integration test: set `plan_check: false, plan_check_panel: true`, assert plan checking does NOT run. 
Set `plan_check: true, plan_check_panel: false`, assert single-agent mode runs. Set `plan_check: true, plan_check_panel: true`, assert panel mode runs. +## Performance Traps -**Confidence:** HIGH -- directly derived from examining `get-shit-done/templates/config.json` and the `plan-phase.md` workflow which already checks `plan_checker_enabled` from init JSON. +Not applicable at the scale of v1.3 (the changes are code fixes and PR operations, not performance-sensitive features). No new code paths that touch ROADMAP.md scanning or large file operations. --- -### Pitfall 9: Dedup Logic Fails on Semantically Similar But Textually Different Issues +## Security Mistakes -**What goes wrong:** The Plan Checker Panel's consensus logic requires deduplicating issues across three specialists. Two specialists might report: -- Specialist A: `"Task 2 missing element"` (dimension: task_completeness) -- Specialist B: `"Task 2 lacks verification step"` (dimension: task_completeness) - -These are the same issue, but string comparison misses the match. If dedup relies on exact description matching, duplicates survive. If dedup relies on `{plan, task, dimension}` tuple (as recommended in Pitfall 2), these correctly deduplicate. But edge cases exist: -- Phase-level issues (no task reference): `{plan: null, task: null, dimension: scope_sanity}` -- two specialists both flag scope concerns with different descriptions. Tuple match works but loses the distinct details. -- Multi-task issues: "Plans 02 and 03 have circular dependency" -- this has two plan references, not one. - -**Prevention:** -1. Require specialists to output issues in the existing structured YAML format (from `gsd-plan-checker.md`): `{plan, dimension, severity, description, task, fix_hint}`. Dedup on `{plan, task, dimension}`. -2. For phase-level issues (no task), dedup on `{plan: null, task: null, dimension}`. If two specialists flag the same dimension at the phase level, keep the more detailed description. -3. 
For multi-plan issues (circular dependencies), normalize to the first plan in the cycle as the canonical plan reference. -4. Do NOT attempt semantic similarity matching (no embeddings, no LLM-as-judge for dedup). The structured fields provide sufficient dedup keys. Semantic similarity adds complexity and nondeterminism to what should be a deterministic merge step. - -**Detection:** Unit test with edge cases: phase-level issues from multiple specialists, circular dependency issues, and identical-dimension-different-description pairs. - -**Confidence:** HIGH -- the issue format is already well-defined in `gsd-plan-checker.md`. Structured dedup on defined fields is deterministic and testable. +| Mistake | Risk | Prevention | +|---------|------|------------| +| Writing auto_advance flag to config.json exposes it to git history | If a future automation commits config.json, `auto_advance: true` leaks into repo history and could be parsed by tools that auto-configure CI behavior | Keep session flags out of config.json entirely; they have no business being on disk | +| Over-broad validation error messages that echo back user input | If `discuss_agents` accepts arbitrary strings and the error message includes the raw input, a crafted input could inject into log output | Sanitize or truncate the echoed value in error messages; max 100 chars, no newlines | --- -### Pitfall 10: Panel Mode Cannot Be Tested End-to-End Without Expensive LLM Calls - -**What goes wrong:** Panel logic involves spawning 3 LLM agents, collecting their outputs, and merging. Unit tests can mock the specialist outputs and test the merge logic. But end-to-end tests (verifying the full pipeline from input to final output) require actual LLM calls, which are expensive, slow, and nondeterministic. - -**Why it happens:** The core value of panels is that multiple LLM specialists produce diverse findings. Mocking them eliminates the very thing being tested (LLM diversity). 
But real LLM calls make CI unreliable (model output varies, tests flake). - -**Consequences:** -- Tests pass with mocked outputs but fail with real LLM calls due to unexpected formatting -- Output contract violations are only discovered in production (user runs panel, gets malformed output) -- CI becomes slow and expensive if real LLM calls are included +## UX Pitfalls -**Prevention:** -1. **Layer the testing strategy:** - - **Unit tests (CI):** Test merge/consensus/dedup logic with fixed specialist outputs. These are deterministic, fast, and catch logic bugs. This is where 90% of panel bugs will be caught. - - **Contract tests (CI):** Validate that merge output matches output contract patterns (regex from consuming workflows). Use fixed inputs, assert structural correctness. - - **Integration tests (manual/nightly):** Run full panel with real LLM calls. Compare structural output against single-agent output for same input. Flag structural differences (not content differences). -2. **Snapshot testing for output structure:** Capture the structural skeleton of panel output (headers, frontmatter keys, section order) and snapshot it. Content varies, structure must not. -3. **The merge step MUST be deterministic code, not LLM.** This makes the merge logic fully testable without LLM calls. Only specialist dispatch requires LLMs; everything after collection is pure code. - -**Detection:** CI test suite with contract tests that run on every PR. Nightly integration test that runs full panel and diffs structural output. - -**Confidence:** HIGH -- the existing test infrastructure uses `node:test` + `node:assert` with temp directory isolation. Merge logic can be tested as pure functions. The testing pattern is well-established in the codebase (433 tests, 94% coverage). 
+| Pitfall | User Impact | Better Approach | +|---------|-------------|-----------------| +| Validation error on `discuss_agents` with no migration path | Users who already configured discuss_agents in the old format get a hard error with no guidance | Add validation + a clear upgrade message: "discuss_agents now requires an array, found: X. Update your config to: discuss_agents: [X]" | +| Documenting `model_overrides` config key that is not yet used | Users add it to their config expecting it to work, nothing happens, they file bugs | Either implement it or explicitly mark it as `// reserved for future use, has no effect yet` in the template and docs | +| PR split produces 3 separate PRs with dependencies not communicated | Reviewer merges the wrong PR first, creating a broken state | Add a PR description note to each PR: "Merge order: this PR first, then X, then Y" or use GitHub draft status on dependent PRs | --- -## Minor Pitfalls - ---- +## "Looks Done But Isn't" Checklist -### Pitfall 11: Parallel Specialist Spawn Order Creates Non-Deterministic Merge Order +Things that appear complete but are missing critical pieces. -**What goes wrong:** Three specialists are spawned in parallel. The order they complete is nondeterministic (depends on model latency, context size, network conditions). If the merge logic processes results in completion order rather than a fixed order, the output varies between runs even with identical inputs. This makes debugging harder and snapshot tests fragile. - -**Prevention:** Always sort specialist results by specialist name/role before merging. The merge function takes an array sorted by `[structural, semantic, compliance]` (or `[artifacts, requirements, human]`), not by completion time. - -**Confidence:** HIGH -- trivial to implement, easy to miss. +- [ ] **PR Split:** Branch has been cherry-picked to new PRs — verify original feat/autopilot branch has been rebased to drop those commits, not just that the new PRs exist. 
+- [ ] **Runtime flag fix:** auto_advance is no longer written to config.json — verify by running the autopilot workflow and diffing `git diff .planning/config.json` before and after; it should show no changes. +- [ ] **discuss_agents validation:** Validation added — verify with a test that uses the pre-existing "no discuss_agents key" config and confirms the workflow still runs without error (backwards compatibility). +- [ ] **PR #761 coordination:** The resolve-model fix is not double-applied — verify `git log main...feat/autopilot -- [resolve-model file path]` shows no resolve-model commits after PR #761 merged. +- [ ] **Artifact cleanup:** `.planning/` files removed from git tracking — verify with `git ls-files .planning/` that phase artifacts no longer appear; physical files still exist on disk (`ls .planning/phases/`). +- [ ] **model_overrides documentation:** If `model_overrides` is documented in config, verify a test exercises loading config with `model_overrides` present and the system does not crash (even if the key is currently a no-op). --- -### Pitfall 12: Specialist Prompts Drift From Single-Agent Prompts Over Time - -**What goes wrong:** The single-agent versions (gsd-plan-checker.md, gsd-verifier.md, gsd-project-researcher.md) continue to be maintained and updated. The specialist panel prompts are derivatives. Over time, updates to the single agent are not propagated to the specialists, creating behavioral divergence. +## Recovery Strategies -**Prevention:** -1. Specialist prompts should `@include` or reference the base agent prompt and add only their domain-scoping delta. Do not copy-paste the full agent prompt into specialist prompts. -2. If full inclusion is too expensive (context), extract the shared verification dimensions/process steps into a shared reference file (`references/plan-check-dimensions.md`) that both the single-agent and specialist prompts reference. -3. 
Add a CI check: hash the base agent prompt sections and compare against the specialist prompts. If the base changes, flag the specialists for review. +When pitfalls occur despite prevention, how to recover. -**Confidence:** MEDIUM -- this is a maintenance concern, not an implementation bug. It will not cause problems at launch but will accumulate over months. +| Pitfall | Recovery Cost | Recovery Steps | +|---------|---------------|----------------| +| Commits orphaned on original branch after split | MEDIUM | `git rebase -i main` on feat/autopilot, drop the split commits; force-push with `--force-with-lease`; notify PR reviewer the branch was rebased | +| auto_advance written to config.json and already merged | LOW | Hotfix PR: add migration in config load that strips `auto_advance` key from config.json on load; removes it from disk next time gsd-tools runs | +| Validation breaks existing discuss_agents config | LOW | Hotfix PR: add coercion before validation (string-to-array, null-to-default); bump patch version; communicate in CHANGELOG.md | +| PR #762 has double-applied resolve-model fix after PR #761 merged | MEDIUM | Identify the conflicting commits with `git log`; rebase feat/autopilot onto post-merge main; resolve conflicts by keeping PR #761's version; re-request review with explanation | +| `.planning/` files deleted from disk (not just untracked) | HIGH | `git checkout HEAD -- .planning/phases/` to restore from the commit before the bad `git rm`; if already committed, `git revert` the cleanup commit; never use bare `git rm` on active PLAN.md files | +| model_overrides documented but non-functional — user reports broken behavior | LOW | Add a CHANGELOG entry noting the key is reserved; add a warning log in config.cjs when the key is present: "model_overrides is not yet implemented and has no effect" | --- -### Pitfall 13: Research Panel Domain Researchers Produce Overlapping Content +## Pitfall-to-Phase Mapping -**What goes wrong:** The Stack researcher 
covers "what technology to use." The Architecture researcher covers "how to structure the system." The Pitfalls researcher covers "what can go wrong." But technology choices (Stack) inform architecture decisions (Architecture), and both inform pitfalls (Pitfalls). Without careful domain boundaries, each researcher partially duplicates the others' work. The inline synthesis then has to reconcile three partially-overlapping narratives. +How roadmap phases should address these pitfalls. -**Prevention:** -1. Define hard exclusion rules in each specialist prompt: - - Stack: technology selection and rationale. Does NOT discuss architecture patterns or failure modes. - - Architecture: system structure, component boundaries, data flow. Does NOT discuss technology selection or common mistakes. - - Pitfalls: failure modes, anti-patterns, risks. Does NOT recommend technologies or define architecture. -2. Accept minor overlap as natural. The synthesis step (SUMMARY.md, FEATURES.md) is where cross-cutting concerns are reconciled. The domain files are allowed to reference each other ("see STACK.md for technology choice rationale") without duplicating. - -**Confidence:** MEDIUM -- overlap is inherent in domain decomposition. It is manageable with clear prompt boundaries but cannot be fully eliminated. 
+| Pitfall | Prevention Phase | Verification | +|---------|------------------|--------------| +| Orphaned commits after PR split | Phase 14 — PR Restructure | `git log main...feat/autopilot` shows no commits that appear in the split-off PRs | +| Runtime flag config mutation | Phase 15 — Auto-Advance Fix | `git diff .planning/config.json` clean after autopilot run; unit test asserts config.json not written | +| Validation breaks backwards compat | Phase 16 — Validation Hardening | Test with config missing `discuss_agents` key passes; test with old string format passes (or gets clear error + coercion) | +| PR #761 resolve-model double-apply | Phase 14 — PR Restructure | Diff of new focused resolve-model PR against post-761-merge main shows no duplicated logic | +| Artifact removal breaks dev workflow | Phase 17 — Artifact Cleanup | Physical `.planning/` files exist on disk; `git ls-files .planning/phases/` returns empty for PLAN.md and SUMMARY.md files | +| model_overrides stale docs | Phase 18 (if addressed) — Config Docs | Either removed from template or accompanied by "no-op" warning in config.cjs | --- -## Phase-Specific Warnings - -| Phase Topic | Likely Pitfall | Mitigation | -|---|---|---| -| Panel infrastructure (config, routing) | Config key collision with existing workflow keys (Pitfall 8) | Nest under `workflow.*`, test all 4 combinations of enable/panel flags | -| Plan Checker Panel consensus | Double-counting or dropping findings (Pitfall 2) | Structured dedup on `{plan, task, dimension}` tuple with severity escalation | -| Plan Checker Panel output | Output contract drift (Pitfall 1) | Orchestrator writes final headers from structured data, contract tests | -| Verifier Panel assembly | Domain boundary bleed (Pitfall 3) | Pre-compute shared data, assign key_links to Artifacts specialist explicitly | -| Verifier Panel output | VERIFICATION.md format deviation (Pitfall 1) | Deterministic code assembly, not LLM synthesis of final output | -| Research Panel 
synthesis | Overlapping domain content (Pitfall 13) | Hard exclusion rules in prompts, accept minor overlap, synthesize in SUMMARY.md | -| Specialist failure handling | One specialist timeout breaks panel (Pitfall 4) | Graceful degradation to 2/3, fallback to single-agent mode at 1/3 | -| Testing strategy | Cannot test panels end-to-end cheaply (Pitfall 10) | Layered testing: unit (merge logic) + contract (output structure) + integration (nightly) | -| Backwards compatibility | Config migration breaks existing users (Pitfall 7) | Explicit opt-in (`=== true`), normalize defaults at load time, test without panel keys | - ---- - -## Cross-Cutting Design Principles - -These principles address multiple pitfalls simultaneously: - -### Principle 1: Orchestrator Owns Output, Specialists Own Analysis - -Specialists produce structured findings (YAML issues, JSON verification results, markdown sections with defined headings). The orchestrator/merge-step assembles these into the final output using deterministic templates. Never let an LLM write the structural wrapper -- only the content within sections. - -**Addresses:** Pitfall 1, 3, 6 - -### Principle 2: Pre-Compute, Then Distribute - -Any data that multiple specialists need should be computed once by the orchestrator (using gsd-tools.cjs) and distributed as input, not computed independently by each specialist. This reduces cost, eliminates inconsistency, and makes the pre-computed data available for fallback if a specialist fails. - -**Addresses:** Pitfall 3, 4, 5 - -### Principle 3: Deterministic Merge, Nondeterministic Analysis - -The analysis step (what specialists find) is nondeterministic by nature -- that is the point of using multiple LLM agents. The merge step (how findings are combined) must be deterministic code. This separation makes the merge fully testable without LLM calls and ensures output contract compliance. 
- -**Addresses:** Pitfall 1, 2, 9, 10 - -### Principle 4: Graceful Degradation to Known-Good Path - -Every panel must have a fallback path to the existing single-agent mode. If panels are enabled but failing, the system should degrade to the working single-agent implementation rather than producing no output or broken output. This is the safety net that makes opt-in panels low-risk. +## Sources -**Addresses:** Pitfall 4, 7, 8 +- GSD codebase analysis: `.planning/codebase/CONCERNS.md` — config mutation patterns, error handling gaps — HIGH confidence +- GSD codebase analysis: `.planning/codebase/CONVENTIONS.md` — no module-level mutable state convention, config loaded on-demand — HIGH confidence +- GSD codebase analysis: `.planning/codebase/TESTING.md` — `createTempProject()` creates non-git dirs, confirms `git rm` cannot be tested via test helpers — HIGH confidence +- GSD codebase analysis: `.planning/PROJECT.md` — v1.3 requirements, PR #762 and #761 relationship — HIGH confidence +- `.planning/STATE.md` — PR #761 closed, PR #762 open with changes requested — HIGH confidence +- `git diff main...feat/autopilot --stat` — confirmed `.planning/` artifacts committed on branch, config.json modified — HIGH confidence +- Common git rebase/cherry-pick pitfalls: standard git documentation and community best practices — HIGH confidence (well-established, not stale) +- Git `--force-with-lease` safety over `--force`: git documentation — HIGH confidence --- - -## Sources - -- GSD codebase analysis: `agents/gsd-plan-checker.md`, `agents/gsd-verifier.md`, `agents/gsd-project-researcher.md` (output contract patterns) -- HIGH confidence -- GSD codebase analysis: `get-shit-done/workflows/plan-phase.md`, `get-shit-done/workflows/execute-phase.md`, `get-shit-done/workflows/verify-phase.md` (downstream regex matching) -- HIGH confidence -- GSD codebase analysis: `get-shit-done/templates/config.json` (existing config structure) -- HIGH confidence -- GSD codebase analysis:
`get-shit-done/bin/lib/verify.cjs`, `get-shit-done/bin/lib/frontmatter.cjs` (output parsing code) -- HIGH confidence -- [ACL 2025: Voting or Consensus? Decision-Making in Multi-Agent Debate](https://aclanthology.org/2025.findings-acl.606/) -- MEDIUM confidence (academic; applied indirectly to agent panels) -- [Maxim.ai: Multi-Agent System Reliability](https://www.getmaxim.ai/articles/multi-agent-system-reliability-failure-patterns-root-causes-and-production-validation-strategies/) -- MEDIUM confidence (industry patterns, verified against GSD architecture) -- [Azure Architecture Center: AI Agent Design Patterns](https://learn.microsoft.com/en-us/azure/architecture/ai-ml/guide/ai-agent-design-patterns) -- MEDIUM confidence (general patterns) -- [Google ADK: Parallel Agents](https://google.github.io/adk-docs/agents/workflow-agents/parallel-agents/) -- MEDIUM confidence (framework-specific but pattern applicable) -- [Anthropic: Effective Context Engineering for AI Agents](https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents) -- MEDIUM confidence (authoritative source on context scoping) -- [JetBrains Research: Context Management for LLM-Powered Agents](https://blog.jetbrains.com/research/2025/12/efficient-context-management/) -- LOW confidence (single source, applied indirectly) +*Pitfalls research for: PR Review Fixes — splitting, runtime flags, validation, overlapping PRs, artifact cleanup* +*Researched: 2026-02-28* diff --git a/.planning/research/STACK.md b/.planning/research/STACK.md index 04e9e09125..d470f25831 100644 --- a/.planning/research/STACK.md +++ b/.planning/research/STACK.md @@ -1,291 +1,251 @@ -# Technology Stack: MoE Panels for Agent Orchestration +# Stack Research -**Project:** get-shit-done v2.0 -- MoE Panels -**Researched:** 2026-02-26 -**Domain:** Multi-agent panel orchestration for AI coding assistant quality gates +**Domain:** Git branch and PR management for decomposing a monolithic PR into focused PRs 
+**Researched:** 2026-02-28 +**Confidence:** HIGH -## Constraint +## Context -No new npm dependencies allowed. All patterns must be implementable using: -- Markdown agent definitions (`.md` files in `agents/`) -- Claude Code's Task tool for spawning subagents -- Node.js built-ins for CLI tooling (`bin/lib/*.cjs`) -- Existing `.planning/` state management layer +This is a subsequent milestone (v1.3) addressing reviewer feedback on PR #762. The PR is monolithic: it bundles tests+CI, a resolve-model fix, the autopilot feature, and committed `.planning/` artifacts. The goal is to decompose it into focused PRs that reviewers can merge independently. -## Recommended Architecture: Scatter-Gather with LLM Synthesizer +The constraint from PROJECT.md applies: "Not our repo — We're contributing PRs, not merging directly." This means we push to `fork/` remotes and open PRs against `origin/main`. -**Confidence: HIGH** -- This pattern is the most proven and directly maps to GSD's existing infrastructure. +## Recommended Stack -The architecture replaces each single quality-gate agent with a **panel of 3 parallel specialists** plus a **synthesizer agent** that merges their outputs. This is the "fan-out/fan-in" or "scatter-gather" pattern documented by Microsoft Azure Architecture Center, AWS, Google ADK, and implemented in CrewAI, LangGraph, and AutoGen. 
+### Core Technologies -### Why 3 Specialists (Not 2, Not 5) +| Technology | Version | Purpose | Why Recommended | +|------------|---------|---------|-----------------| +| `git cherry-pick` | Built-in | Selectively apply specific commits to new branches | Best tool when source commits are already clean; no new branch history needed | +| `git checkout <branch> -- <file>` | Built-in | Bring specific files from a branch into the current branch without cherry-picking | Best tool when commits are mixed (one commit has multiple concerns); extract only the files you need | +| `git rm --cached -r <path>` | Built-in | Remove committed files from index/tracking without deleting them locally | Removes `.planning/` artifacts from branch without losing local files | +| `git rebase --onto` | Built-in | Transplant a range of commits onto a new base | Best tool when commits are sequential and cleanly separated | +| `git log --oneline <branch> ^<base>` | Built-in | Enumerate commits that need to be split | Planning step — understand what commits exist before operating | +| GitHub PR (gh CLI) | Current | Open PRs against upstream from fork branches | `gh pr create --repo org/repo` to target upstream, not fork | -| Panel Size | Tradeoff | Verdict | -|------------|----------|---------| -| 2 agents | Ties possible, limited diversity | Too few | -| **3 agents** | **Majority possible, diverse enough, manageable token cost** | **Use this** | -| 4 agents | Even number creates ties, diminishing returns | Avoid | -| 5+ agents | Error compounding grows, 60k+ token overhead per panel, coordination tax exceeds gains | Too many | +### Supporting Techniques -**Confidence: MEDIUM** -- The "Coordination Tax" research from Google DeepMind (2025) shows accuracy gains saturate or fluctuate beyond 4 agents. The "17x error trap" paper demonstrates that poorly coordinated multi-agent systems compound errors.
3 is the sweet spot for: (a) majority vote viability, (b) diverse perspectives, (c) affordable token cost (3 x 20k = 60k overhead), (d) matches Claude Code's 10-agent parallel cap. +| Technique | Purpose | When to Use | +|-----------|---------|-------------| +| Branch-from-base pattern | Create each sub-PR branch from `origin/main`, not from the monolithic branch | Always — prevents sub-PRs from carrying unrelated changes | +| `git diff --name-only <base> <branch>` | Identify what changed in the monolithic PR | Planning step before splitting | +| `git show --stat <commit>` | Understand what each commit touched | Planning step to decide which cherry-pick strategy to use | +| `git stash` | Preserve uncommitted local work during branch surgery | When you need to switch branches mid-operation | +| `.gitignore` entry | Prevent `.planning/` from being committed in future | Add `**/.planning/` to `.gitignore` if it isn't already | -### Panel Composition Strategy +### Development Tools -Each panel needs **specialist diversity, not specialist redundancy**. The 3 agents must examine the problem from genuinely different angles. If all 3 use the same approach, you get redundancy not robustness. +| Tool | Purpose | Notes | +|------|---------|-------| +| `gh` CLI | Create PRs against upstream from fork | Use `gh pr create --repo gsd-build/get-shit-done --head ethan-hurst:branch-name` | +| `git log --graph --oneline` | Verify branch topology before pushing | Sanity check that branch base is `origin/main`, not the monolithic branch | -**Pattern: Role-Based Specialization** +## Installation -| Specialist Role | Focus | What It Catches | -|----------------|-------|-----------------| -| **Domain Expert** | Does this achieve the functional goal? | Missing requirements, wrong behavior | -| **Quality Auditor** | Does this meet structural standards? | Anti-patterns, missing tests, scope issues | -| **Devil's Advocate** | What could go wrong?
| Edge cases, failure modes, hidden assumptions | - -**Confidence: MEDIUM** -- This role triad is derived from multi-agent debate literature (ACL 2025 findings, debate-based consensus patterns) and mirrors effective human review panels. The specific role names and scopes need phase-specific tuning. - -## Panel Types and Specialist Definitions - -### Panel 1: Plan Checker Panel (replaces single gsd-plan-checker) - -Currently one agent checks all 8 dimensions (requirement coverage, task completeness, dependencies, key links, scope, verification derivation, context compliance, Nyquist). Split into: - -| Specialist | Dimensions | Prompt Focus | -|-----------|------------|--------------| -| **Completeness Specialist** | Dims 1, 2, 6 (requirement coverage, task completeness, verification derivation) | "Do these plans cover everything the phase needs to deliver?" | -| **Structure Specialist** | Dims 3, 4, 5 (dependency correctness, key links, scope sanity) | "Are these plans structurally sound and properly connected?" | -| **Compliance Specialist** | Dims 7, 8 (context compliance, Nyquist) | "Do these plans honor user decisions and testing requirements?" | - -**Synthesizer:** Merges issue lists, deduplicates, assigns final severity, produces single ISSUES FOUND / VERIFICATION PASSED result. - -**Confidence: HIGH** -- This maps cleanly to the existing dimension structure in gsd-plan-checker.md. Each specialist gets a focused subset that reduces prompt complexity and improves depth of analysis. +```bash +# No installation required — git and gh are already present +# Verify gh is authenticated: +gh auth status +``` -### Panel 2: Verifier Panel (replaces single gsd-verifier) +## Strategies for This Specific Split -Currently one agent performs 10-step verification (observable truths, artifacts, key links, requirements, anti-patterns, human verification). 
Split into: +The fork has 3 commits on `fork/feat/autopilot` vs `origin/main`: -| Specialist | Steps | Prompt Focus | -|-----------|-------|--------------| -| **Truth Verifier** | Steps 2-3 (establish must-haves, verify observable truths) | "Are the phase goals actually achieved in the codebase?" | -| **Wiring Inspector** | Steps 4-5 (verify artifacts at 3 levels, verify key links) | "Are artifacts substantive and properly connected?" | -| **Quality Scanner** | Steps 6-7 (requirements coverage, anti-pattern scan) | "Are requirements satisfied and is the code clean?" | +1. `b0aa9fc` — feat: add autopilot mode + Agent Teams execution engine (9 files) +2. `8850ebf` — refactor: remove Agent Teams engine, simplify to subagents-only +3. `000163a` — refactor: remove dead execution section, consolidate to autopilot config -**Synthesizer:** Merges verification results, produces single VERIFICATION.md with unified truth table, artifact table, and gap list. +The target split is 3 focused PRs: -**Confidence: HIGH** -- This follows the existing step structure. Each specialist is self-contained and reads different parts of the codebase. +### PR A: Tests + CI (from `fork/feat/coverage-hardening`) -### Panel 3: Phase Researcher Panel (replaces single gsd-phase-researcher) +This already exists as a separate branch. No splitting needed — just open a PR from it. -Currently one agent researches all domains (stack, patterns, pitfalls, code examples). Split into: +```bash +# Branch already exists at fork/feat/coverage-hardening +# Verify it contains only test files and CI changes: +git diff --name-only origin/main fork/feat/coverage-hardening -| Specialist | Sections | Prompt Focus | -|-----------|----------|--------------| -| **Stack Researcher** | Standard Stack, Don't Hand-Roll, State of the Art | "What libraries and tools does this phase need?" | -| **Pattern Researcher** | Architecture Patterns, Code Examples | "How do experts structure this type of implementation?" 
| -| **Risk Researcher** | Common Pitfalls, Open Questions | "What commonly goes wrong in this domain?" | +# Open PR against upstream: +gh pr create --repo gsd-build/get-shit-done \ + --head ethan-hurst:feat/coverage-hardening \ + --title "test: add full test suite with CI pipeline (433 tests, 94% coverage)" +``` -**Synthesizer:** Merges research sections, resolves contradictions, produces single RESEARCH.md. +### PR B: resolve-model Fix (coordinates with PR #761) -**Confidence: MEDIUM** -- GSD already has a parallel research pattern (new-project spawns 4 researchers). This applies the same pattern at phase level. Note: the project-level research already uses 4 parallel agents -- this phase-level panel uses 3 focused specialists instead. +The fix already exists on `fork/fix/load-model-overrides-from-config`. This may conflict with PR #761 — check before submitting. -### Panel 4: Project Research Panel (already exists -- 4 parallel agents) +```bash +# Branch already exists at fork/fix/load-model-overrides-from-config +git diff --name-only origin/main fork/fix/load-model-overrides-from-config -The `/gsd:new-project` workflow already spawns 4 parallel researchers (STACK, FEATURES, ARCHITECTURE, PITFALLS) plus a synthesizer. This is already a panel pattern. **No change needed** except possibly adding a synthesizer improvement. +# Check for overlap with PR #761's files: +# If no conflict, open PR: +gh pr create --repo gsd-build/get-shit-done \ + --head ethan-hurst:fix/load-model-overrides-from-config \ + --title "fix: load model_overrides from config and use resolveModelInternal in CLI" +``` -**Confidence: HIGH** -- Already implemented and working. +### PR C: Autopilot Feature (clean, without artifacts) -## Consensus/Aggregation Strategy +The autopilot commits include `.planning/` artifacts committed to the branch. Create a clean branch that cherry-picks only the feature files. 
-### Use: LLM Synthesizer (Not Majority Vote) +```bash +# Step 1: Create clean branch from origin/main +git checkout -b feat/autopilot-clean origin/main -**Confidence: HIGH** -- Majority voting only works for discrete classification tasks (pass/fail, yes/no). Quality gates produce rich structured output (issue lists, verification reports, research findings). An LLM synthesizer is the correct aggregation strategy for complex, non-discrete outputs. +# Step 2: Cherry-pick the 3 autopilot commits +git cherry-pick b0aa9fc # autopilot feature +git cherry-pick 8850ebf # remove Agent Teams +git cherry-pick 000163a # remove dead execution section -| Aggregation Method | Works For | Does NOT Work For | -|-------------------|-----------|-------------------| -| Majority Vote | Binary pass/fail decisions | Structured issue lists, research findings | -| Weighted Vote | Ranked recommendations | Complex verification reports | -| **LLM Synthesizer** | **All panel outputs in this system** | Nothing (universal, but costs one extra agent) | +# Step 3: Remove any .planning/ artifacts that got pulled in +git rm --cached -r .planning/ 2>/dev/null || true +echo '.planning/' >> .gitignore # if not already ignored +git add .gitignore +git commit --amend --no-edit # or: git commit -m "chore: remove .planning artifacts" -### Synthesizer Pattern +# Step 4: Verify only intended files are present +git diff --name-only origin/main feat/autopilot-clean +# Step 5: Push and open PR +git push fork feat/autopilot-clean +gh pr create --repo gsd-build/get-shit-done \ + --head ethan-hurst:feat/autopilot-clean \ + --title "feat: add /gsd:autopilot for fully automated pipeline execution" ``` -Specialists (parallel) → Write to distinct output keys → Synthesizer reads all → Produces unified result -``` - -The synthesizer agent: -1. Reads all 3 specialist outputs -2. **Deduplicates** findings (same issue reported by 2+ specialists) -3. 
**Resolves conflicts** (specialist A says pass, specialist B says fail on same item) -4. **Elevates** items flagged by 2+ specialists (consensus = higher confidence) -5. **Produces** the final structured output in the format the orchestrator expects +**Alternative if cherry-pick has conflicts:** Use `git checkout <branch> -- <path>` to copy specific files from the source branch without bringing its commit history: -**Conflict resolution rule:** When specialists disagree on severity or status: -- 2 agree, 1 disagrees = go with majority -- All 3 disagree = synthesizer uses most conservative (highest severity) finding -- Specialist provides reasoning? Synthesizer evaluates reasoning quality, not just position +```bash +git checkout -b feat/autopilot-clean origin/main -**Confidence: HIGH** -- This mirrors the gsd-research-synthesizer pattern already in the codebase, which reads 4 researcher outputs and produces unified SUMMARY.md. The same pattern extends to all panels. +# Bring only the autopilot-related files from the monolithic branch +git checkout fork/feat/autopilot -- commands/gsd/autopilot.md +git checkout fork/feat/autopilot -- get-shit-done/workflows/auto-discuss.md +git checkout fork/feat/autopilot -- get-shit-done/workflows/autopilot.md +git checkout fork/feat/autopilot -- get-shit-done/workflows/execute-phase.md +git checkout fork/feat/autopilot -- get-shit-done/workflows/progress.md +git checkout fork/feat/autopilot -- get-shit-done/workflows/settings.md +git checkout fork/feat/autopilot -- get-shit-done/bin/lib/config.cjs +git checkout fork/feat/autopilot -- get-shit-done/templates/config.json -## Implementation Stack +# Do NOT bring: .planning/ files, tests/, .github/, package*.json +git commit -m "feat: add /gsd:autopilot for fully automated pipeline execution" +``` -### No New Dependencies Required +## Runtime Flag Pattern (for auto-advance fix) -Everything needed exists in the current stack: +The review flagged that `autopilot.md` mutates `config.json` to set `workflow.auto_advance true` and then
sets it back to `false` after the run. Because the flag is persisted to disk, an interrupted run never reaches the reset step and leaves `auto_advance` stuck at `true` in the user's config. -| Component | Implementation | Existing Precedent | -|-----------|---------------|-------------------| -| Parallel dispatch | Task tool (3 parallel calls) | new-project.md spawns 4 researchers | -| Specialist definitions | `agents/*.md` files | All 11 existing agents | -| Result collection | Task tool return values | new-project.md collects researcher outputs | -| Synthesizer | Task tool (1 call after parallel batch) | gsd-research-synthesizer.md | -| Panel config | `.planning/config.json` additions | Existing workflow toggles | -| CLI tooling | `bin/lib/*.cjs` modules | Existing verify, frontmatter modules | +**Pattern to fix this:** Pass `AUTO_ADVANCE` as an environment variable or shell argument instead of persisting to config. -### New Agent Files Needed +**Current (mutates config.json):** +```bash +# autopilot.md start +node gsd-tools.cjs config-set workflow.auto_advance true -``` -agents/ - # Existing (unchanged) - gsd-plan-checker.md → becomes "solo mode" fallback - gsd-verifier.md → becomes "solo mode" fallback - gsd-phase-researcher.md → becomes "solo mode" fallback - - # New: Panel specialist agents - panels/ - plan-checker/ - completeness-specialist.md - structure-specialist.md - compliance-specialist.md - synthesizer.md - verifier/ - truth-verifier.md - wiring-inspector.md - quality-scanner.md - synthesizer.md - phase-researcher/ - stack-researcher.md - pattern-researcher.md - risk-researcher.md - synthesizer.md -``` +# ... phases run ... -**Total new agent files:** 12 (9 specialists + 3 synthesizers) - -**Confidence: HIGH** -- Follows existing agent definition pattern exactly. No architectural change, just more .md files.
- -### Config Schema Extension - -```json -{ - "workflow": { - "research": true, - "plan_check": true, - "verifier": true, - "panels": { - "enabled": true, - "plan_checker": { - "enabled": true, - "specialists": 3 - }, - "verifier": { - "enabled": true, - "specialists": 3 - }, - "phase_researcher": { - "enabled": true, - "specialists": 3 - } - } - } -} +# autopilot.md end +node gsd-tools.cjs config-set workflow.auto_advance false ``` -When `panels.enabled` is false, fall back to single-agent mode (existing behavior). This provides a clean upgrade path and allows users to opt out of higher token costs. - -**Confidence: HIGH** -- Follows existing config pattern. The `workflow` object already has boolean toggles. - -### Orchestrator Changes - -Each workflow that spawns a quality-gate agent needs a panel-aware dispatch function: +**Fixed (runtime flag, no persistence):** +```bash +# Pass flag as environment variable +AUTO_ADVANCE=true node gsd-tools.cjs execute-phase ... -**Before (plan-phase.md, step 10):** -``` -Task(prompt=checker_prompt, subagent_type="gsd-plan-checker", ...) +# Or pass as CLI argument that execute-phase reads from args, not config +node gsd-tools.cjs execute-phase --auto-advance ... ``` -**After:** -``` -if (panels.plan_checker.enabled) { - // Spawn 3 specialists in parallel - Task(prompt=completeness_prompt, subagent_type="general-purpose", ...) - Task(prompt=structure_prompt, subagent_type="general-purpose", ...) - Task(prompt=compliance_prompt, subagent_type="general-purpose", ...) - - // After all 3 complete, spawn synthesizer - Task(prompt=synthesizer_prompt, subagent_type="general-purpose", ...) -} else { - // Fallback to single agent - Task(prompt=checker_prompt, subagent_type="gsd-plan-checker", ...) 
-} +**In `execute-phase.md`:** Read from env/arg first, fall back to config: +```bash +# Read auto_advance: env var takes priority over config +AUTO_CFG="${AUTO_ADVANCE:-$(node gsd-tools.cjs config-get workflow.auto_advance 2>/dev/null || echo false)}" ``` -**Confidence: HIGH** -- The branching pattern (config check -> conditional spawn) already exists in plan-phase.md for research and plan-check toggles. +This ensures `config.json` is never mutated during an autopilot run — it stays as the user left it. -## Token Cost Analysis +**Confidence: HIGH** — This is the standard Unix pattern: environment variables override config files for session-scoped behavior. No new dependencies, no new config keys. -| Gate | Solo Mode | Panel Mode (3 + synthesizer) | Increase | -|------|-----------|------------------------------|----------| -| Plan Checker | ~20k overhead + work | ~80k overhead + 3x work + synthesis | 4x | -| Verifier | ~20k overhead + work | ~80k overhead + 3x work + synthesis | 4x | -| Phase Researcher | ~20k overhead + work | ~80k overhead + 3x work + synthesis | 4x | +## Alternatives Considered -**Mitigation strategies:** -1. Panels are opt-in via config (default: off for budget profile, on for quality profile) -2. Each specialist gets a *narrower* prompt than the solo agent, so work per specialist is ~40% of solo -3. Net increase is roughly **2.5x per gate** (not 4x) because specialists do less work each -4. 
Model profile applies: use Sonnet for specialists, Opus only for synthesizer if quality profile +| Recommended | Alternative | When to Use Alternative | +|-------------|-------------|-------------------------| +| `git cherry-pick` per commit | `git rebase --onto` | Use rebase --onto when commits are sequential AND don't need file-level filtering | +| `git checkout -- ` | Interactive rebase + `git add -p` | Use interactive rebase when commits need to be re-split at hunk level (more complex) | +| New branch from `origin/main` | Amend the existing branch | Only amend existing branch when it's not yet published OR reviewer explicitly asks for force-push | +| Environment variable for runtime flag | New config key `autopilot.running` | Env var is session-scoped (no persistence risk), config key would require cleanup on crash | + +## What NOT to Use + +| Avoid | Why | Use Instead | +|-------|-----|-------------| +| `git filter-branch` | Deprecated, slow, dangerous for shared branches | `git rm --cached` for removing tracked files; BFG for history rewriting | +| `git push --force` to a branch with an open PR | Rewrites history reviewers may have fetched; confusing diffs in PR timeline | Push to a NEW branch, open a NEW PR | +| `git rebase -i` on published branches | Same issue — force-push required after | New branch + cherry-pick | +| Graphite CLI / git-multi-pr | External tooling, not available everywhere | Native git cherry-pick + gh CLI | +| Mutating `config.json` for runtime state | Leaves dirty state if process is interrupted; user's config is corrupted | Environment variables for session-scoped flags | +| `git add .` or `git add -A` when cleaning artifacts | Risk of accidentally re-adding files from adjacent directories | `git rm --cached -r .planning/` explicitly, then `git add` specific files | + +## Stack Patterns by Variant + +**If commits are clean and single-concern (each commit touches only one logical change):** +- Use `git cherry-pick ` per commit +- This 
is the simplest path — no file-level surgery needed + +**If commits are mixed-concern (one commit touches feature files AND test files AND artifacts):** +- Use `git checkout <branch> -- <path>` to bring only the files you want +- Build the new branch file-by-file, then commit once + +**If the PR has no clean commits (everything in one giant commit):** +- Use `git checkout <branch> -- <path>` for each file group +- Commit groups separately on the new branch +- This gives reviewers a meaningful commit history + +**If a sub-PR conflicts with another open PR (e.g., PR #761):** +- Wait for the other PR to merge first, then rebase your branch onto the updated base +- OR communicate with the maintainer to sequence the merges +- Do NOT attempt to manually merge the two PRs' changes together + +## Removing .planning/ Artifacts + +The monolithic PR has committed `.planning/` dev artifacts. To clean them: + +```bash +# On the branch that has the artifacts: +git rm --cached -r .planning/ +git commit -m "chore: remove .planning dev artifacts from branch" + +# Then add .planning/ to .gitignore to prevent recurrence: +echo '.planning/' >> .gitignore +git add .gitignore +git commit -m "chore: gitignore .planning artifacts" +``` -**Confidence: MEDIUM** -- Token estimates are approximations. Real costs depend on phase complexity, codebase size, and model choice. +This removes the files from tracking without deleting them locally — they stay in your working directory but won't appear in the PR diff.
-## Alternatives Considered +## Version Compatibility -| Alternative | Why Not | -|------------|---------| -| **Debate pattern** (agents argue in rounds) | Too expensive (multiple rounds), complex to implement, diminishing returns after round 1 for structured analysis | -| **Hierarchical supervisor** (one agent delegates to others) | Adds latency (supervisor must reason before dispatching), unnecessary when specialization is static | -| **Group chat** (agents in shared thread) | Accumulating context bloats rapidly, cross-talk introduces confusion for structured verification | -| **5+ specialists** | Error compounding, coordination tax, 100k+ token overhead | -| **2 specialists** | No tiebreaking, limited diversity | -| **External framework (CrewAI, LangGraph)** | Violates no-new-dependencies constraint, adds complexity, GSD already has Task tool | -| **Majority vote only** | Only works for binary decisions; quality gates produce structured reports | - -## Key Technical Decisions - -| Decision | Choice | Rationale | -|----------|--------|-----------| -| Panel size | 3 specialists + 1 synthesizer | Majority possible, affordable, below coordination tax threshold | -| Aggregation | LLM synthesizer agent | Rich structured outputs need synthesis, not counting | -| Specialist diversity | Role-based (domain, structure, risk) | Prevents redundant analysis | -| Config location | `.planning/config.json` | Follows existing pattern | -| Agent definitions | `agents/panels/*.md` | Follows existing agent pattern, scoped in subdirectory | -| Fallback | Single-agent mode when panels disabled | Backward compatible | -| File writes | Specialists return via Task output, synthesizer writes final file | Prevents file conflicts from parallel agents | +| Technique | Git Version | Notes | +|-----------|-------------|-------| +| `git cherry-pick` | Any modern git | Available in git 1.7+ | +| `git checkout -- ` | Any modern git | Long-standing feature | +| `git rm --cached -r` | Any modern 
git | Standard since git 1.0 | +| `gh pr create --repo` | gh 2.x+ | Targets upstream repo from fork | ## Sources -### Primary (HIGH confidence) -- [Microsoft Azure Architecture Center -- AI Agent Orchestration Patterns](https://learn.microsoft.com/en-us/azure/architecture/ai-ml/guide/ai-agent-design-patterns) -- Concurrent orchestration pattern, fan-out/fan-in, aggregation strategies (Updated 2026-02-12) -- [AWS Prescriptive Guidance -- Parallelization and scatter-gather patterns](https://docs.aws.amazon.com/prescriptive-guidance/latest/agentic-ai-patterns/parallelization-and-scatter-gather-patterns.html) -- Scatter-gather implementation details -- GSD codebase (`new-project.md`, `gsd-research-synthesizer.md`) -- Existing 4-agent parallel + synthesizer pattern - -### Secondary (MEDIUM confidence) -- [LangGraph Parallel Agent Pattern](https://dev.to/rosen_hristov/why-i-split-one-langgraph-agent-into-four-running-in-parallel-2c65) -- Send API, state reducers, result merging pattern -- [Parallelization -- Agentic Design Pattern Series](https://datalearningscience.com/p/3-parallelization-agentic-design) -- Dispatch, concurrent execution, aggregation steps -- [Claude Code Task Tool Patterns](https://amitkoth.com/claude-code-task-tool-vs-subagents/) -- Parallel dispatch via Task tool, 10-agent cap, 20k token overhead per agent -- [Claude Code Sub-Agents: Parallel vs Sequential Patterns](https://claudefa.st/blog/guide/agents/sub-agent-best-practices) -- Task spawning best practices - -### Tertiary (LOW confidence -- flag for validation) -- [17x Error Trap / Bag of Agents](https://towardsdatascience.com/why-your-multi-agent-system-is-failing-escaping-the-17x-error-trap-of-the-bag-of-agents/) -- Error compounding in multi-agent systems, coordination tax concept -- [Voting or Consensus? 
Decision-Making in Multi-Agent Debate (ACL 2025)](https://aclanthology.org/2025.findings-acl.606.pdf) -- 7 decision protocols comparison -- [CrewAI Parallel Patterns](https://github.com/apappascs/crewai-parallel-patterns) -- CrewAI-specific patterns, async_execution model -- [Multi-Agent AI Systems Explained: LangGraph vs CrewAI vs AutoGen (2026)](https://www.mayhemcode.com/2026/02/multi-agent-ai-systems-explained.html) -- Framework comparison +- [Git Official Docs: git-rm](https://git-scm.com/docs/git-rm) — `--cached` flag behavior, recursive removal (HIGH confidence) +- [Graphite: How to split a PR](https://graphite.com/guides/how-to-split-a-pull-request-into-multiple-prs) — Strategy overview: cherry-pick, checkout-file, branch-from-base (MEDIUM confidence) +- [GitHub Gist: Split large PR into two](https://gist.github.com/loilo/930f141d9acf89e9e734ffa042acd750) — `git rebase --onto` and `cherry-pick` concrete commands (HIGH confidence) +- GSD codebase — `fork/feat/coverage-hardening` and `fork/feat/autopilot` branch inspection (HIGH confidence — direct observation) +- Unix environment variable pattern — session-scoped config override via `$ENV_VAR` (HIGH confidence — standard Unix practice) + +--- +*Stack research for: git PR decomposition and runtime config patterns* +*Researched: 2026-02-28* From c84660f8c5755f86bd6da76ecdd75ebd77512c7d Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 12:14:47 +1000 Subject: [PATCH 08/16] docs: define milestone v1.3 requirements --- .planning/REQUIREMENTS.md | 72 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 .planning/REQUIREMENTS.md diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md new file mode 100644 index 0000000000..3c62cf0fe1 --- /dev/null +++ b/.planning/REQUIREMENTS.md @@ -0,0 +1,72 @@ +# Requirements: get-shit-done + +**Defined:** 2026-02-28 +**Core Value:** Reliable AI agent orchestration with quality gates that catch bad plans before 
execution burns context. + +## v1.3 Requirements + +Requirements for PR #762 review fixes. Each maps to roadmap phases. + +### PR Structure + +- [ ] **PRS-01**: PR #762 is split into 3 focused PRs: tests+CI (PR A), resolve-model fix (PR B), autopilot feature (PR C) +- [ ] **PRS-02**: `.planning/` artifacts are removed from all PR branches via `git rm --cached` +- [ ] **PRS-03**: `.gitignore` updated to prevent `.planning/` from being committed again +- [ ] **PRS-04**: PR #761 status confirmed and resolve-model fix coordinated (no duplicate changes) + +### Config Fixes + +- [ ] **CFG-01**: Auto-advance uses `--auto` runtime flag instead of mutating `config.json` via `config-set` +- [ ] **CFG-02**: `discuss_agents` validated at runtime in `auto-discuss.md` (odd, 3-9 range, absent key accepted) +- [ ] **CFG-03**: `model_overrides` added to `loadConfig` return object so `resolveModelInternal` can read it +- [ ] **CFG-04**: `model_overrides` config key documented in appropriate reference files + +### Coordination + +- [ ] **CRD-01**: PR A (tests+CI) submitted first with no dependencies on B or C +- [ ] **CRD-02**: PR B (resolve-model) submitted after confirming PR #761 status +- [ ] **CRD-03**: PR C (autopilot) submitted last, includes all code fixes from CFG category + +## Future Requirements + +### v2.0 — MoE Panels (deferred) + +- MoE panel infrastructure with 3 config keys +- Plan Checker Panel: 3 parallel specialists +- Verifier Panel: 3 domain specialists +- Research Panel: 3 domain researchers +- Workflow routing and output contract preservation + +## Out of Scope + +| Feature | Reason | +|---------|--------| +| Rewriting autopilot feature logic | Reviewer didn't request feature changes, only structural/quality fixes | +| Adding tests for autopilot workflows | Reviewer noted it as missing but it's a separate effort (v1.4 candidate) | +| Changing existing test assertions | Tests PR (A) should preserve existing test content | +| TypeScript migration | Different 
milestone entirely | + +## Traceability + +| Requirement | Phase | Status | +|-------------|-------|--------| +| PRS-01 | — | Pending | +| PRS-02 | — | Pending | +| PRS-03 | — | Pending | +| PRS-04 | — | Pending | +| CFG-01 | — | Pending | +| CFG-02 | — | Pending | +| CFG-03 | — | Pending | +| CFG-04 | — | Pending | +| CRD-01 | — | Pending | +| CRD-02 | — | Pending | +| CRD-03 | — | Pending | + +**Coverage:** +- v1.3 requirements: 11 total +- Mapped to phases: 0 +- Unmapped: 11 + +--- +*Requirements defined: 2026-02-28* +*Last updated: 2026-02-28 after initial definition* From d4006c39b1d12db55b060ea4425dfbd25dfdb033 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 12:23:33 +1000 Subject: [PATCH 09/16] docs: create milestone v1.3 roadmap (4 phases) --- .planning/REQUIREMENTS.md | 28 +++++------ .planning/ROADMAP.md | 101 ++++++++++++++++++++++++++++++++++++++ .planning/STATE.md | 71 +++++++++++++++++++++------ 3 files changed, 170 insertions(+), 30 deletions(-) create mode 100644 .planning/ROADMAP.md diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md index 3c62cf0fe1..69cdd86081 100644 --- a/.planning/REQUIREMENTS.md +++ b/.planning/REQUIREMENTS.md @@ -50,23 +50,23 @@ Requirements for PR #762 review fixes. Each maps to roadmap phases. 
| Requirement | Phase | Status | |-------------|-------|--------| -| PRS-01 | — | Pending | -| PRS-02 | — | Pending | -| PRS-03 | — | Pending | -| PRS-04 | — | Pending | -| CFG-01 | — | Pending | -| CFG-02 | — | Pending | -| CFG-03 | — | Pending | -| CFG-04 | — | Pending | -| CRD-01 | — | Pending | -| CRD-02 | — | Pending | -| CRD-03 | — | Pending | +| PRS-01 | Phase 14 | Pending | +| PRS-02 | Phase 14 | Pending | +| PRS-03 | Phase 14 | Pending | +| PRS-04 | Phase 14 | Pending | +| CFG-01 | Phase 15 | Pending | +| CFG-02 | Phase 16 | Pending | +| CFG-03 | Phase 17 | Pending | +| CFG-04 | Phase 17 | Pending | +| CRD-01 | Phase 14 | Pending | +| CRD-02 | Phase 14 | Pending | +| CRD-03 | Phase 17 | Pending | **Coverage:** - v1.3 requirements: 11 total -- Mapped to phases: 0 -- Unmapped: 11 +- Mapped to phases: 11 +- Unmapped: 0 --- *Requirements defined: 2026-02-28* -*Last updated: 2026-02-28 after initial definition* +*Last updated: 2026-02-28 — traceability filled after roadmap creation* diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md new file mode 100644 index 0000000000..325b1c96bc --- /dev/null +++ b/.planning/ROADMAP.md @@ -0,0 +1,101 @@ +# Roadmap: get-shit-done + +## Milestones + +- ✅ **v1.0 Test Infrastructure** - Phases 1-6 (shipped 2026-02-25) +- ✅ **v1.1 Coverage Hardening** - Phases 7-13 (shipped 2026-02-25) +- 🚧 **v1.3 PR Review Fixes** - Phases 14-17 (in progress) + +## Phases + +
+<details> +<summary>✅ v1.0 Test Infrastructure (Phases 1-6) - SHIPPED 2026-02-25</summary> + +Phases 1-6 completed. 245 new tests, 6,715 lines of test code, CI pipeline (3 OS x 3 Node matrix). See MILESTONES.md for full details. + +</details>
+ +
+<details> +<summary>✅ v1.1 Coverage Hardening (Phases 7-13) - SHIPPED 2026-02-25</summary> + +Phases 7-13 completed. 433 tests passing, 94.01% overall line coverage, all 11 modules above 70%, c8 coverage enforcement in CI. See MILESTONES.md for full details. + +</details>
+ +### 🚧 v1.3 PR Review Fixes (In Progress) + +**Milestone Goal:** Respond to PR reviewer feedback on PR #762 by splitting the monolithic PR into focused branches, removing committed dev artifacts, and applying three targeted code fixes before resubmission. + +#### Phase 14: PR Restructure +**Goal**: Three clean, independently reviewable branches exist — one for tests+CI, one for resolve-model, one for autopilot — with no .planning/ artifacts committed and PR #761 coordination confirmed +**Depends on**: Phase 13 +**Requirements**: PRS-01, PRS-02, PRS-03, PRS-04, CRD-01, CRD-02 +**Success Criteria** (what must be TRUE): + 1. PR A branch (`feat/coverage-hardening`) contains only tests and CI changes — no autopilot code, no .planning/ files + 2. PR B branch (`fix/resolve-model`) is scoped to resolve-model logic only — PR #761 status confirmed and no duplicate changes present + 3. PR C branch (`feat/autopilot-clean`) contains autopilot feature code with .planning/ artifacts removed from git index (files still exist locally) + 4. `.gitignore` includes `.planning/` so the artifacts cannot be re-committed on any future branch + 5. All three branches are verifiably clean: `git diff main...{branch}` shows only the expected files +**Plans**: TBD + +Plans: +- [ ] 14-01: Confirm PR #761 status and scope resolve-model changes +- [ ] 14-02: Create feat/coverage-hardening branch (cherry-pick tests+CI commits) +- [ ] 14-03: Create fix/resolve-model branch (cherry-pick or stage resolve-model changes) +- [ ] 14-04: Clean feat/autopilot branch (git rm --cached .planning/, update .gitignore, rebase to drop extracted commits) + +#### Phase 15: Auto-Advance Runtime Flag Fix +**Goal**: Autopilot workflow no longer mutates config.json to drive auto-advance behavior — the --auto flag propagates through the existing call chain instead +**Depends on**: Phase 14 +**Requirements**: CFG-01 +**Success Criteria** (what must be TRUE): + 1. 
`autopilot.md` contains no `config-set` calls that write `auto_advance` to config.json + 2. `discuss-phase.md` contains no `config-set` calls that write `auto_advance` to config.json + 3. Running autopilot does not modify config.json's `workflow.auto_advance` value + 4. The `--auto` flag drives auto-advance behavior via the existing argument propagation chain +**Plans**: TBD + +Plans: +- [ ] 15-01: Remove config-set mutations from autopilot.md and discuss-phase.md; verify --auto propagation + +#### Phase 16: Validation Hardening +**Goal**: auto-discuss.md validates discuss_agents before spawning agents — invalid config produces a clear error rather than a silent misbehavior +**Depends on**: Phase 15 +**Requirements**: CFG-02 +**Success Criteria** (what must be TRUE): + 1. `auto-discuss.md` validates that `discuss_agents` is an odd number in the 3-9 range before spawning agents + 2. If `discuss_agents` is missing or falsy, the workflow falls back to a default (no hard failure for the "key not set" case) + 3. An invalid value (even number, out-of-range) produces a clear error message with guidance on valid values + 4. Existing workflows that do not set `discuss_agents` are unaffected (backwards compatible) +**Plans**: TBD + +Plans: +- [ ] 16-01: Add discuss_agents validation guard to auto-discuss.md + +#### Phase 17: Module Fixes + Documentation +**Goal**: model_overrides is wired correctly through loadConfig and resolveModelInternal, config keys are documented, and PR C is ready for submission +**Depends on**: Phase 16 +**Requirements**: CFG-03, CFG-04, CRD-03 +**Success Criteria** (what must be TRUE): + 1. `loadConfig` returns `model_overrides` in its result object so `resolveModelInternal` can read it + 2. `cmdResolveModel` delegates to `resolveModelInternal` (eliminating duplicated resolution logic) + 3. New tests in `tests/commands.test.cjs` verify that `model_overrides` is honored by the `resolve-model` CLI command + 4. 
README documents the `autopilot.*` config keys (`discuss_agents`, `discuss_model`) with valid values + 5. PR C (`feat/autopilot-clean`) is submitted to upstream with all code fixes from phases 15-17 included +**Plans**: TBD + +Plans: +- [ ] 17-01: Wire model_overrides through loadConfig and cmdResolveModel; add tests +- [ ] 17-02: Document autopilot config keys in README; submit PR C + +## Progress + +**Execution Order:** +Phases execute in numeric order: 14 → 15 → 16 → 17 + +| Phase | Milestone | Plans Complete | Status | Completed | +|-------|-----------|----------------|--------|-----------| +| 14. PR Restructure | v1.3 | 0/4 | Not started | - | +| 15. Auto-Advance Fix | v1.3 | 0/1 | Not started | - | +| 16. Validation Hardening | v1.3 | 0/1 | Not started | - | +| 17. Module Fixes + Docs | v1.3 | 0/2 | Not started | - | diff --git a/.planning/STATE.md b/.planning/STATE.md index 8d03f1febe..61fb830f0f 100644 --- a/.planning/STATE.md +++ b/.planning/STATE.md @@ -1,32 +1,71 @@ # Project State -## Current Milestone: v1.3 — PR Review Fixes +## Project Reference -**Phase:** Not started (defining requirements) -**Plan:** — -**Status:** Defining requirements -**Last activity:** 2026-02-28 — Milestone v1.3 started +See: .planning/PROJECT.md (updated 2026-02-28) + +**Core value:** Reliable AI agent orchestration with quality gates that catch bad plans before execution burns context. 
+**Current focus:** Phase 14 — PR Restructure + +## Current Position + +Phase: 14 of 17 (PR Restructure) +Plan: 0 of 4 in current phase +Status: Ready to plan +Last activity: 2026-02-28 — v1.3 roadmap created; 11 requirements mapped to 4 phases + +Progress: [░░░░░░░░░░] 0% + +## Performance Metrics + +**Velocity:** +- Total plans completed: 0 (this milestone) +- Average duration: — +- Total execution time: — + +**By Phase:** + +| Phase | Plans | Total | Avg/Plan | +|-------|-------|-------|----------| +| — | — | — | — | + +*Updated after each plan completion* + +## Accumulated Context + +### Decisions + +Decisions are logged in PROJECT.md Key Decisions table. +Recent decisions affecting current work: + +- [v1.3 start]: Split PR #762 into three focused PRs (tests+CI, resolve-model, autopilot); phases follow that dependency chain +- [v1.3 start]: Use `--auto` runtime flag (not config-set) for auto-advance — flag dies with the subagent, no config corruption risk +- [v1.3 start]: Use `git rm --cached -r .planning/` to remove artifacts — NOT bare `git rm` (would delete files from disk) + +### Pending Todos + +None yet. 
+ +### Blockers/Concerns + +- [Phase 14]: PR #761 final state must be confirmed — merged vs closed determines whether resolve-model changes are already on main and how to scope the fix/resolve-model branch +- [Phase 17]: model_overrides fix touches core.cjs and commands.cjs — verify no conflict with PR #761 changes before submitting PR C ## Milestone History -### v1.0 — Test Infrastructure (Phases 1–9) +### v1.0 — Test Infrastructure (Phases 1-6) - 355 tests across all 11 modules - GitHub Actions CI with 3x3 OS/Node matrix - 4 regression tests (REG-01 through REG-04) -### v1.1 — Coverage Hardening (Phases 10–13) +### v1.1 — Coverage Hardening (Phases 7-13) - 433 tests, 94.01% overall line coverage - c8 coverage enforcement in CI - All modules above 70% threshold - VERIFICATION.md audit trails for every phase -## Context +## Session Continuity -- Codebase: 11 JavaScript modules in `bin/` directory -- Test framework: `node:test` + `node:assert` -- Config: `get-shit-done/templates/config.json` -- Agent definitions: `agents/` directory (markdown files) -- Workflow definitions: `get-shit-done/workflows/` directory -- Codebase map: `.planning/codebase/` (7 documents, analyzed 2026-02-25) -- PR #762: open, changes requested by @glittercowboy -- PR #761: overlapping resolve-model fix (closed) +Last session: 2026-02-28 +Stopped at: Roadmap created — ready to plan Phase 14 +Resume file: None From 346c3dd66e28dc53a22e0bc5d360fd9ba1c4fef6 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 12:34:02 +1000 Subject: [PATCH 10/16] docs(14): capture phase context --- .../phases/14-pr-restructure/14-CONTEXT.md | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 .planning/phases/14-pr-restructure/14-CONTEXT.md diff --git a/.planning/phases/14-pr-restructure/14-CONTEXT.md b/.planning/phases/14-pr-restructure/14-CONTEXT.md new file mode 100644 index 0000000000..39b3f0ee86 --- /dev/null +++ b/.planning/phases/14-pr-restructure/14-CONTEXT.md @@ -0,0 
+1,66 @@ +# Phase 14: PR Restructure - Context + +**Gathered:** 2026-02-28 +**Status:** Ready for planning + + +## Phase Boundary + +Split PR #762 into 2 focused PRs (PR B: resolve-model fix, PR C: autopilot feature), remove committed `.planning/` artifacts, coordinate with closed PR #761, and close #762 with a comment linking the replacements. PR A (tests+CI) is no longer needed since PR #763 already merged that content. + + + + +## Implementation Decisions + +### PR #762 Handling +- Close #762 and open 2 new PRs (not 3 — PR A absorbed by merged #763) +- Add closing comment to #762 with links to new PRs: "Split into #X, #Y per review feedback from @glittercowboy" +- Each new PR references #762 in its body: "Split from #762 per review feedback" + +### PR #761 Coordination +- PR #761 was closed without merging — the resolve-model fix only exists in our branch +- No conflict risk since #761 never landed on main +- Credit @ChuckMayo in PR B body: "Also identified by @ChuckMayo in #761" + +### Branch Strategy +- Rebase feat/autopilot onto main first (main now includes #763's test content) +- Fork both new branches from rebased feat/autopilot +- PR B: branch from feat/autopilot, remove all non-resolve-model files +- PR C: branch from feat/autopilot, remove `.planning/` artifacts, keep autopilot feature code +- PR B stays separate from PR C (reviewer explicitly asked for separation) + +### PR Descriptions +- Follow repo's PR template: What / Why / Testing / Checklist / Breaking Changes +- Add extra section mapping which @glittercowboy review findings each PR addresses (point-by-point) +- Both PRs include test plan details + +### Claude's Discretion +- Exact branch names for PR B and PR C +- Commit message wording for the rebase and cleanup +- How to structure the "review findings addressed" section in PR body +- Whether template.test.cjs goes into PR B or PR C + + + + +## Specific Ideas + +- PR #763 (merged Feb 27) already landed tests+CI, so PR A is eliminated +- Repo 
has CODEOWNERS and a PR template at `.github/pull_request_template.md` +- The PR template requires: OS testing checkboxes, CHANGELOG updates for user-facing changes, GSD style compliance, Windows path testing + + + + +## Deferred Ideas + +- Adding tests for the autopilot feature itself (reviewer noted as [MISSING]) — candidate for v1.4 +- CHANGELOG.md updates for the autopilot feature — include in PR C if user-facing + + + +--- + +*Phase: 14-pr-restructure* +*Context gathered: 2026-02-28* From 5613a026601b6bf2527087cf163938ed8ea569d0 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 12:43:42 +1000 Subject: [PATCH 11/16] =?UTF-8?q?docs(14):=20research=20phase=20=E2=80=94?= =?UTF-8?q?=20PR=20restructure=20git=20workflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../phases/14-pr-restructure/14-RESEARCH.md | 478 ++++++++++++++++++ 1 file changed, 478 insertions(+) create mode 100644 .planning/phases/14-pr-restructure/14-RESEARCH.md diff --git a/.planning/phases/14-pr-restructure/14-RESEARCH.md b/.planning/phases/14-pr-restructure/14-RESEARCH.md new file mode 100644 index 0000000000..4f82ba22b3 --- /dev/null +++ b/.planning/phases/14-pr-restructure/14-RESEARCH.md @@ -0,0 +1,478 @@ +# Phase 14: PR Restructure - Research + +**Researched:** 2026-02-28 +**Domain:** Git branch management, GitHub PR workflow, git rebase/cherry-pick +**Confidence:** HIGH + + +## User Constraints (from CONTEXT.md) + +### Locked Decisions + +**PR #762 Handling** +- Close #762 and open 2 new PRs (not 3 — PR A absorbed by merged #763) +- Add closing comment to #762 with links to new PRs: "Split into #X, #Y per review feedback from @glittercowboy" +- Each new PR references #762 in its body: "Split from #762 per review feedback" + +**PR #761 Coordination** +- PR #761 was closed without merging — the resolve-model fix only exists in our branch +- No conflict risk since #761 never landed on main +- Credit @ChuckMayo in PR B body: 
"Also identified by @ChuckMayo in #761" + +**Branch Strategy** +- Rebase feat/autopilot onto main first (main now includes #763's test content) +- Fork both new branches from rebased feat/autopilot +- PR B: branch from feat/autopilot, remove all non-resolve-model files +- PR C: branch from feat/autopilot, remove `.planning/` artifacts, keep autopilot feature code +- PR B stays separate from PR C (reviewer explicitly asked for separation) + +**PR Descriptions** +- Follow repo's PR template: What / Why / Testing / Checklist / Breaking Changes +- Add extra section mapping which @glittercowboy review findings each PR addresses (point-by-point) +- Both PRs include test plan details + +### Claude's Discretion +- Exact branch names for PR B and PR C +- Commit message wording for the rebase and cleanup +- How to structure the "review findings addressed" section in PR body +- Whether template.test.cjs goes into PR B or PR C + +### Deferred Ideas (OUT OF SCOPE) +- Adding tests for the autopilot feature itself (reviewer noted as [MISSING]) — candidate for v1.4 +- CHANGELOG.md updates for the autopilot feature — include in PR C if user-facing + + + +## Phase Requirements + +| ID | Description | Research Support | +|----|-------------|-----------------| +| PRS-01 | PR #762 split into focused PRs (updated in CONTEXT: 2 PRs, not 3 — PR A already merged) | PR #763 confirmed MERGED; PR #762 confirmed CLOSED Feb 28 | +| PRS-02 | `.planning/` artifacts removed from all PR branches via `git rm --cached` | `.planning/` tracked in git index; git rm --cached -r .planning/ is the correct approach | +| PRS-03 | `.gitignore` updated to prevent `.planning/` from being committed again | `.planning/` is ALREADY in both feat/autopilot and origin/main .gitignore — no action needed | +| PRS-04 | PR #761 status confirmed and resolve-model fix coordinated (no duplicate changes) | PR #761 CLOSED. 
Resolve-model fix already on origin/main via PR #739 — see critical finding below | +| CRD-01 | PR A (tests+CI) submitted first with no dependencies | DONE — PR #763 already merged Feb 25 | +| CRD-02 | PR B (resolve-model) submitted after confirming PR #761 status | See critical finding: fix already on main; PR B scope is moot for code change but confirmed for coordination | + + +## Summary + +Phase 14 is a git restructuring task: rebase `feat/autopilot` onto `origin/main`, create a clean PR C branch containing only autopilot feature code with `.planning/` artifacts removed from the git index, and close PR #762 with a closing comment. + +**Critical finding:** The CONTEXT.md assumptions are partially outdated. Research reveals that (1) `origin/main` is far ahead of local `main` — local main must be synced first; (2) the resolve-model fix is ALREADY on `origin/main` via PR #739 (merged independently), making PR B a coordination confirmation rather than a new PR submission; (3) `.planning/` is already in `.gitignore` on both branches, so PRS-03 requires only removing the tracked files from the git index, not updating `.gitignore`. + +**Primary recommendation:** Sync local main with `git fetch origin && git merge origin/main`, rebase `feat/autopilot` onto the updated main, resolve the known conflicts (execute-phase.md path style), run `git rm --cached -r .planning/`, then create and push the clean PR C branch. Close PR #762 with a comment. Skip PR B creation since the code change is already on main; satisfy CRD-02 by documenting the confirmation in the PR C body. + +## Critical Findings (Research Discoveries) + +### Finding 1: Resolve-Model Fix Already on upstream main (SCOPE CHANGE) + +The CONTEXT.md states "the resolve-model fix only exists in our branch." This is incorrect as of Feb 28. 
Research confirms: + +- PR #739 ("fix: load model_overrides from config and use resolveModelInternal in CLI") was MERGED to `origin/main` independently +- `origin/main`'s `get-shit-done/bin/lib/commands.cjs` already has `resolveModelInternal` delegation (no `'inherit'` bug) +- `origin/main`'s `get-shit-done/bin/lib/core.cjs` already has `model_overrides: parsed.model_overrides || null` in `loadConfig` +- PR #761 (ChuckMayo's fix) was CLOSED without merging + +**Impact on Phase 14:** +- PR B (fix/resolve-model) does NOT need to be created as a new code PR +- CRD-02 is satisfied by: confirm PR #761 closed, document that fix landed via PR #739, credit ChuckMayo in PR C body instead +- PR C becomes the only new PR to submit + +**Confidence:** HIGH — verified by fetching origin/main and inspecting `commands.cjs` and `core.cjs` source directly. + +### Finding 2: Local main is ~15 commits behind origin/main + +Local `main` is at commit `3fddd62` (pre-1.21.0). `origin/main` is at `19ac77e` and includes: +- 1.21.0, 1.21.1 releases +- PR #763 (tests+CI merged) +- PR #739 (resolve-model fix) +- PR #786 (`$HOME` path fix) +- Several other community PRs (#737, #741, #759, etc.) + +**Impact:** Rebasing feat/autopilot must target `origin/main`, not local `main`. First step is `git fetch origin && git merge origin/main` (or `git pull origin main`). + +### Finding 3: .gitignore Already Contains .planning/ + +`.planning/` is already excluded in `.gitignore` on BOTH `feat/autopilot` and `origin/main` (line: `# Internal planning documents` / `.planning/`). + +**Impact on PRS-03:** No `.gitignore` edit needed. The issue is that git tracks files once they are committed — `.gitignore` only prevents untracked files from being staged. The fix is purely `git rm --cached -r .planning/`. PRS-03 itself is already satisfied; the remaining index cleanup is the work tracked under PRS-02. 
+ +### Finding 4: feat/autopilot is 10 Commits Ahead of origin/main + +``` +87f08eb docs(14): capture phase context — .planning only +56b6930 docs: create milestone v1.3 roadmap — .planning only +5900bcc docs: define milestone v1.3 requirements — .planning only +3c2c317 docs: complete project research — .planning only +7411599 docs: start milestone v1.3 PR Review — .planning only +b296079 wip: v2.0 milestone setup paused — .planning only +ade3945 docs: start milestone v2.0 MoE Panels — .planning only +000163a refactor: remove dead execution section — SOURCE CODE (autopilot) +8850ebf refactor: remove Agent Teams engine — SOURCE CODE (autopilot) +b0aa9fc feat: add autopilot mode — SOURCE CODE (autopilot) +``` + +**Impact:** After rebase onto `origin/main`, the 7 `.planning/` commits remain but contain only artifacts that will be removed. The 3 source commits are the autopilot feature. PR C should contain only the net diff from these 3 source commits (after conflict resolution). + +## Standard Stack + +### Core (Git Operations) +| Tool | Version | Purpose | Why Standard | +|------|---------|---------|--------------| +| git | system | Branch creation, rebase, cherry-pick, rm --cached | Only tool for git index operations | +| gh CLI | system | PR creation, comment posting, PR view | GitHub API automation | + +### Supporting +| Tool | Version | Purpose | When to Use | +|------|---------|---------|-------------| +| `git rm --cached -r` | git builtin | Remove tracked files from index without deleting from disk | Required for PRS-02 | +| `git rebase origin/main` | git builtin | Replay autopilot commits on top of updated main | Required before PR C creation | +| `git checkout -b` | git builtin | Create new clean branch | For PR C branch | + +**Installation:** No additional installs needed — git and gh CLI already present. 
+ +## Architecture Patterns + +### Pattern 1: Rebase-then-Fork Branch Strategy (Locked Decision) + +**What:** Rebase `feat/autopilot` onto `origin/main`, then fork PR C from the rebased state after removing `.planning/` artifacts. + +**When to use:** When the source branch diverged before a large upstream update (1.21.0 → 1.21.1+ with multiple community PRs). + +**Steps:** +```bash +# 1. Sync local main +git checkout main +git pull origin main + +# 2. Rebase feat/autopilot (resolve conflicts during rebase) +git checkout feat/autopilot +git rebase origin/main + +# 3. Remove .planning/ from git index (files stay on disk) +git rm --cached -r .planning/ +git commit -m "chore: remove .planning/ artifacts from git index" + +# 4. Create PR C branch +git checkout -b feat/autopilot-clean # or chosen branch name + +# 5. Push and open PR +git push fork feat/autopilot-clean +gh pr create --repo gsd-build/get-shit-done --title "..." --body "..." +``` + +### Pattern 2: git rm --cached vs git rm + +**What:** `git rm --cached -r .planning/` removes files from the git index (staging area) without touching the working tree. Files remain on disk, only removed from git tracking. + +**Critical distinction:** +```bash +git rm -r .planning/ # WRONG: Deletes files from disk AND index +git rm --cached -r .planning/ # CORRECT: Removes from index only +``` + +**After running:** Files in `.planning/` become untracked. Since `.planning/` is in `.gitignore`, they will not appear in `git status` as untracked — they are effectively invisible to git going forward. + +### Pattern 3: Verifying Clean Branch Diff + +**What:** Use `git diff origin/main...{branch}` (three dots) to see only the changes the branch introduces relative to its merge base with `origin/main` (upstream-only changes are excluded). 
+ +```bash +# Verify PR C contains only expected files +git diff origin/main...feat/autopilot-clean --name-only +``` + +Expected output for PR C: +``` +commands/gsd/autopilot.md +get-shit-done/bin/lib/config.cjs +get-shit-done/templates/config.json +get-shit-done/workflows/auto-discuss.md +get-shit-done/workflows/autopilot.md +get-shit-done/workflows/execute-phase.md +get-shit-done/workflows/progress.md +get-shit-done/workflows/settings.md +``` + +No `.planning/` entries should appear. + +### Pattern 4: Closing PR with Comment (GitHub CLI) +```bash +# Add closing comment to PR #762 +gh pr comment 762 --repo gsd-build/get-shit-done \ + --body "Closing and splitting into focused PRs per @glittercowboy review feedback: +- PR #XXX: feat/autopilot-clean — autopilot feature code only" + +gh pr close 762 --repo gsd-build/get-shit-done +``` + +Note: PR #762 is already CLOSED (closed Feb 28). Only the closing comment needs to be added. + +## Don't Hand-Roll + +| Problem | Don't Build | Use Instead | Why | +|---------|-------------|-------------|-----| +| Removing .planning/ from git | Manual file operations | `git rm --cached -r .planning/` | Index-only removal is built into git | +| Creating PR body | Manual HTML/text | `gh pr create --body "$(cat <<'EOF'...)"` heredoc | gh CLI handles escaping, API auth | +| Verifying branch cleanliness | Script parsing git log | `git diff origin/main...branch --name-only` | Three-dot notation shows only branch-specific commits | + +## Known Rebase Conflicts + +Based on direct code inspection, rebasing `feat/autopilot` onto `origin/main` will produce the following conflicts: + +### Conflict 1: execute-phase.md (path style) + +**Location:** `get-shit-done/workflows/execute-phase.md` + +**Nature:** The autopilot commit (b0aa9fc) added lines using `~/.claude/get-shit-done/bin/gsd-tools.cjs` path style. 
`origin/main` uses `"$HOME/.claude/get-shit-done/bin/gsd-tools.cjs"` (PR #786 fix: "use $HOME instead of ~ for gsd-tools.cjs paths to prevent subagent MODULE_NOT_FOUND"). + +**Resolution:** Accept `$HOME` style (origin/main's version is correct — per PR #786, subagents sometimes rewrite `~` to relative paths). Verify autopilot-specific additions still exist after resolution. + +**Autopilot-specific lines to preserve in execute-phase.md:** +```bash +AUTO_CFG=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" config-get workflow.auto_advance 2>/dev/null || echo "false") +``` +This line already exists in origin/main's execute-phase.md — the autopilot commit's additions may be entirely absorbed. + +### Conflict 2: init.cjs (execution_engine removal) + +**Location:** `get-shit-done/bin/lib/init.cjs` + +**Nature:** Autopilot commit (8850ebf) removes `execution_engine` field from `cmdInitExecutePhase`. `origin/main` has a substantially refactored version of `init.cjs` (from the module split + toPosixPath changes). + +**Resolution:** Accept origin/main's version as base; verify `execution_engine` field is not present (8850ebf's intent). The toPosixPath removal in origin/main's version supersedes the autopilot change. + +### Conflict 3: config.cjs (autopilot section vs origin/main's config) + +**Location:** `get-shit-done/bin/lib/config.cjs` + +**Nature:** Autopilot commits add `autopilot: { discuss_agents, discuss_model }` section with validation. origin/main may or may not have this section. + +**Resolution:** Keep the autopilot config section additions — these are unique to PR C. + +### Non-Conflict: commands.cjs + +No conflict expected. None of the 3 autopilot commits touch `commands.cjs`. After rebase, `commands.cjs` will automatically use `origin/main`'s fixed version (no `'inherit'` bug). + +### Non-Conflict: .gitignore + +No conflict expected. None of the 3 autopilot commits touch `.gitignore`. 
After rebase, `.gitignore` will be `origin/main`'s version (which has `coverage/` and `.planning/`). + +## Common Pitfalls + +### Pitfall 1: Rebasing onto local main instead of origin/main + +**What goes wrong:** Local `main` is at `3fddd62` (pre-1.21.0). Rebasing onto it puts `feat/autopilot` on a base that's ~15 commits behind upstream, creating a PR with massive unintended diff (includes all 1.21.0, 1.21.1, test suite changes). + +**How to avoid:** Always `git fetch origin` first, then `git rebase origin/main`. + +**Warning signs:** `git diff origin/main...{branch} --name-only` shows hundreds of files. + +### Pitfall 2: Using git rm (without --cached) to remove .planning/ + +**What goes wrong:** Deletes `.planning/` files from disk permanently. All planning documents, context files, requirements, etc. are lost. + +**How to avoid:** Always use `git rm --cached -r .planning/`. Double-check with `ls .planning/` after the command to confirm files are still present. + +**Warning signs:** `git status` shows nothing (files gone from disk, not just index). + +### Pitfall 3: Force-pushing feat/autopilot (the source branch) + +**What goes wrong:** After creating PR C as a new branch, force-pushing to `feat/autopilot` would rewrite the branch history and potentially confuse the `fork` remote. + +**How to avoid:** Create a NEW branch (`feat/autopilot-clean` or chosen name) for PR C. Do not modify `feat/autopilot` in place. + +### Pitfall 4: Reopening PR #762 instead of commenting + closing + +**What goes wrong:** PR #762 is already CLOSED. Reopening it would require re-review of a large diff. The correct flow is add a closing comment explaining the split, then submit new focused PRs. + +**How to avoid:** Use `gh pr comment 762` to add the comment. PR is already closed — no `gh pr close` needed. 
+ +### Pitfall 5: Including .planning/ files in the rebase commit before git rm + +**What goes wrong:** The rebase only replays the existing commits, so any commit that added `.planning/` files still carries them afterwards; those files must still be removed AFTER the rebase with `git rm --cached`. + +**How to avoid:** Run `git rm --cached -r .planning/` AFTER the full rebase is complete, then commit the removal as a separate cleanup commit before pushing. + +### Pitfall 6: Assuming PR B is still needed as a new code PR + +**What goes wrong:** Creating PR B (`fix/resolve-model`) to fix the `'inherit'` bug when it's already fixed on `origin/main` (PR #739) creates a duplicate that will be rejected. + +**How to avoid:** Verify with `git show origin/main:get-shit-done/bin/lib/commands.cjs | grep resolveModelInternal` — returns the fixed version. No PR B needed for code. CRD-02 is satisfied by documenting the confirmation. + +## Code Examples + +### Verifying resolve-model fix on origin/main +```bash +# Confirm fix is present (should show resolveModelInternal, not 'inherit') +git show origin/main:get-shit-done/bin/lib/commands.cjs | sed -n '/function cmdResolveModel/,/^}/p' + +# Confirm model_overrides in loadConfig +git show origin/main:get-shit-done/bin/lib/core.cjs | grep "model_overrides" +``` + +### Full rebase workflow +```bash +# Step 1: Sync local main +git checkout main +git pull origin main + +# Step 2: Rebase (will have conflicts — resolve per "Known Rebase Conflicts" section) +git checkout feat/autopilot +git rebase origin/main +# ... resolve conflicts, git add, git rebase --continue ... 
+ +# Step 3: Verify rebase result — should show only 10 commits ahead +git log --oneline origin/main..feat/autopilot + +# Step 4: Remove .planning/ from git index +git rm --cached -r .planning/ +git commit -m "chore: remove .planning/ artifacts from git index" + +# Step 5: Verify .planning/ files still exist on disk +ls .planning/ + +# Step 6: Create PR C branch +git checkout -b feat/autopilot-clean # pick branch name + +# Step 7: Verify clean diff +git diff origin/main...feat/autopilot-clean --name-only +# Should show ONLY: commands/gsd/autopilot.md, get-shit-done/bin/lib/config.cjs, +# get-shit-done/templates/config.json, get-shit-done/workflows/*.md (autopilot files) +# Should NOT show: .planning/, tests/, commands.cjs, core.cjs, etc. + +# Step 8: Push and create PR +git push fork feat/autopilot-clean -u +gh pr create --repo gsd-build/get-shit-done \ + --title "feat: autopilot mode — full pipeline automation with synthetic multi-agent discuss" \ + --body "$(cat <<'EOF' +## What + +Adds `/gsd:autopilot` — one command to run the full GSD pipeline (discuss → plan → execute → verify) for all remaining phases automatically. + +## Why + +Split from #762 per review feedback from @glittercowboy. + +## Review Findings Addressed + +- **[SCOPE] `.planning/` directory committed** — Resolved: removed from git index via `git rm --cached -r .planning/`; `.gitignore` already excludes `.planning/`. +- **[CONCERN] Auto-advance forced ON** — Resolved in Phase 15 (separate PR): use `--auto` runtime flag instead of `config-set`. +- **[BREAKING] Removed `execution.engine`** — Removed intentionally: Agent Teams can't set per-teammate models, defeating profile-based differentiation. Subagents-only is correct. +- **[QUALITY] No tests for autopilot** — Acknowledged: autopilot tests are a v1.4 candidate. + +**Note on resolve-model fix:** The `cmdResolveModel` / `model_overrides` fix originally in this branch was independently merged to main via PR #739. 
Also identified by @ChuckMayo in PR #761 (closed). + +## Testing + +- [ ] Tested on macOS +- [ ] Tested on Windows +- [ ] Tested on Linux + +## Checklist + +- [ ] Follows GSD style (no enterprise patterns, no filler) +- [ ] Updates CHANGELOG.md for user-facing changes +- [ ] No unnecessary dependencies added +- [ ] Works on Windows (backslash paths tested) + +## Breaking Changes + +None +EOF +)" +``` + +### Adding closing comment to PR #762 +```bash +gh pr comment 762 --repo gsd-build/get-shit-done \ + --body "Closing. Split into focused PR per @glittercowboy review feedback: + +- PR #XXX: \`feat/autopilot-clean\` — autopilot feature only (.planning/ artifacts removed) + +The resolve-model fix from this branch landed independently as PR #739. Tests+CI are in merged PR #763." +``` + +## State of the Art + +| Old Approach | Current Approach | When Changed | Impact | +|--------------|------------------|--------------|--------| +| Monolithic PR (tests+CI+feature+fix) | 2-3 focused PRs per feature area | PR #762 feedback | Faster review, easier to bisect issues | +| `config-set workflow.auto_advance true` (mutates config) | `--auto` runtime flag (stateless) | Phase 15 fixes this | No config corruption on crash | +| `~/` path prefix in workflow scripts | `"$HOME/"` prefix | PR #786 merged to main | Prevents subagent MODULE_NOT_FOUND errors | + +## Open Questions + +1. **PR B scope — does it still need to be submitted?** + - What we know: Resolve-model fix is already on `origin/main` via PR #739 + - What's unclear: Whether REQUIREMENTS.md PRS-01 / CRD-02 require explicitly creating a PR B branch as a deliverable, or just confirming coordination + - Recommendation: No new PR B. Satisfy CRD-02 by documenting in PR C body that fix is already on main + credit @ChuckMayo + +2. 
**Rebase conflict severity in execute-phase.md** + - What we know: feat/autopilot uses `~` paths; origin/main uses `$HOME`; autopilot commits add `AUTO_CFG` line to execute-phase.md + - What's unclear: Whether the `AUTO_CFG` line added by autopilot already exists in origin/main's execute-phase.md + - Recommendation: During rebase, inspect `git show origin/main:get-shit-done/workflows/execute-phase.md | grep AUTO_CFG` to determine actual conflict scope + +3. **init.cjs conflict severity** + - What we know: Both origin/main and autopilot commits modified init.cjs + - What's unclear: Exact overlap — origin/main has `toPosixPath` changes; autopilot removed `execution_engine` field + - Recommendation: Accept origin/main's version during conflict, manually verify `execution_engine` is absent + +## Validation Architecture + +### Test Framework +| Property | Value | +|----------|-------| +| Framework | Node.js built-in test runner (`node --test`) | +| Config file | none — run directly via node | +| Quick run command | `npm test` | +| Full suite command | `npm test` | + +### Phase Requirements → Test Map + +This phase is a git restructuring task (no new source code). Validation is manual git inspection, not automated test runs. + +| Req ID | Behavior | Test Type | Automated Command | File Exists? 
| +|--------|----------|-----------|-------------------|-------------| +| PRS-01 | PR #762 closed, 2 new PRs opened | manual | `gh pr view 762 --repo gsd-build/get-shit-done --json state` | N/A | +| PRS-02 | `.planning/` not in any PR branch's diff | git | `git diff origin/main...{branch} --name-only \| grep '\.planning'` | N/A | +| PRS-03 | `.gitignore` has `.planning/` | git | `git show {branch}:.gitignore \| grep planning` | N/A (already present) | +| PRS-04 | PR #761 status confirmed, no duplicate | manual | `gh pr view 761 --repo gsd-build/get-shit-done --json state` | N/A | +| CRD-01 | PR A already merged (#763) | manual | `gh pr view 763 --repo gsd-build/get-shit-done --json state` | N/A | +| CRD-02 | PR B coordination confirmed | manual | Inspect PR #739 on origin/main | N/A | + +### Sampling Rate +- **Per task commit:** `npm test` (verify no regressions from rebase) +- **Per wave merge:** `git diff origin/main...{branch} --name-only` (verify clean diff) +- **Phase gate:** All 6 success criteria verified before `/gsd:verify-work` + +### Wave 0 Gaps + +None — this phase requires no new test files. Validation is git state inspection. 
+ +## Sources + +### Primary (HIGH confidence) +- Direct git inspection: `git log`, `git diff`, `git show` against live branches — branch structure, commit contents, file diffs +- `gh pr view 762/761/763` — PR states, review comments verified live +- `gh api repos/gsd-build/get-shit-done/contents/...` — upstream main file contents verified + +### Secondary (MEDIUM confidence) +- PR #739 commit message on upstream main ("fix: load model_overrides from config and use resolveModelInternal in CLI") — confirms resolve-model already landed +- PR #786 commit message ("fix: use $HOME instead of ~ for gsd-tools.cjs paths") — confirms path style change + +### Tertiary (LOW confidence) +- None — all findings are directly verified from git/GitHub + +## Metadata + +**Confidence breakdown:** +- Git operations (rebase, rm --cached, branch creation): HIGH — standard git features, well-understood +- Conflict prediction (execute-phase.md, init.cjs): HIGH — verified by direct diff inspection +- PR #762 state (CLOSED): HIGH — confirmed via gh CLI +- Resolve-model already on main: HIGH — verified by reading source on origin/main +- .gitignore already correct: HIGH — verified by git show + +**Research date:** 2026-02-28 +**Valid until:** 2026-03-07 (7 days — upstream main moves fast with community PRs) From 10f943457881311abd1516b55c190775ca2a7f1e Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 12:48:23 +1000 Subject: [PATCH 12/16] docs(14): create phase plan --- .planning/ROADMAP.md | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index 325b1c96bc..1071ee2c3c 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -27,22 +27,20 @@ Phases 7-13 completed. 
433 tests passing, 94.01% overall line coverage, all 11 m **Milestone Goal:** Respond to PR reviewer feedback on PR #762 by splitting the monolithic PR into focused branches, removing committed dev artifacts, and applying three targeted code fixes before resubmission. #### Phase 14: PR Restructure -**Goal**: Three clean, independently reviewable branches exist — one for tests+CI, one for resolve-model, one for autopilot — with no .planning/ artifacts committed and PR #761 coordination confirmed +**Goal**: Clean PR C branch (feat/autopilot-clean) submitted with only autopilot feature code, .planning/ artifacts removed from git index, PR #762 closed with split comment, and PR coordination confirmed (PR A=#763 merged, PR B=not needed since fix landed via #739) **Depends on**: Phase 13 **Requirements**: PRS-01, PRS-02, PRS-03, PRS-04, CRD-01, CRD-02 **Success Criteria** (what must be TRUE): - 1. PR A branch (`feat/coverage-hardening`) contains only tests and CI changes — no autopilot code, no .planning/ files - 2. PR B branch (`fix/resolve-model`) is scoped to resolve-model logic only — PR #761 status confirmed and no duplicate changes present - 3. PR C branch (`feat/autopilot-clean`) contains autopilot feature code with .planning/ artifacts removed from git index (files still exist locally) - 4. `.gitignore` includes `.planning/` so the artifacts cannot be re-committed on any future branch - 5. All three branches are verifiably clean: `git diff main...{branch}` shows only the expected files -**Plans**: TBD + 1. PR C branch (`feat/autopilot-clean`) contains autopilot feature code with .planning/ artifacts removed from git index (files still exist locally) + 2. `.gitignore` includes `.planning/` so the artifacts cannot be re-committed on any future branch + 3. PR C diff against origin/main shows only autopilot feature files + 4. PR #762 has closing comment explaining the split and linking to PR C, #763, #739 + 5. 
PR #761 status confirmed CLOSED, resolve-model fix documented as landed via #739 +**Plans**: 2 plans Plans: -- [ ] 14-01: Confirm PR #761 status and scope resolve-model changes -- [ ] 14-02: Create feat/coverage-hardening branch (cherry-pick tests+CI commits) -- [ ] 14-03: Create fix/resolve-model branch (cherry-pick or stage resolve-model changes) -- [ ] 14-04: Clean feat/autopilot branch (git rm --cached .planning/, update .gitignore, rebase to drop extracted commits) +- [ ] 14-01-PLAN.md — Sync local main, rebase feat/autopilot onto origin/main, remove .planning/ from git index +- [ ] 14-02-PLAN.md — Create PR C branch, open PR on GitHub, add closing comment to PR #762 #### Phase 15: Auto-Advance Runtime Flag Fix **Goal**: Autopilot workflow no longer mutates config.json to drive auto-advance behavior — the --auto flag propagates through the existing call chain instead @@ -95,7 +93,7 @@ Phases execute in numeric order: 14 → 15 → 16 → 17 | Phase | Milestone | Plans Complete | Status | Completed | |-------|-----------|----------------|--------|-----------| -| 14. PR Restructure | v1.3 | 0/4 | Not started | - | +| 14. PR Restructure | v1.3 | 0/2 | Not started | - | | 15. Auto-Advance Fix | v1.3 | 0/1 | Not started | - | | 16. Validation Hardening | v1.3 | 0/1 | Not started | - | | 17. 
Module Fixes + Docs | v1.3 | 0/2 | Not started | - | From 8f105564f45391b1b87a7e7bf88ea3d56b03c46f Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 12:53:02 +1000 Subject: [PATCH 13/16] chore: remove .planning/ artifacts from git index --- .planning/.continue-here.md | 70 -- .planning/PROJECT.md | 100 --- .planning/REQUIREMENTS.md | 72 --- .planning/ROADMAP.md | 99 --- .planning/STATE.md | 71 -- .../phases/14-pr-restructure/14-CONTEXT.md | 66 -- .../phases/14-pr-restructure/14-RESEARCH.md | 478 -------------- .planning/research/ARCHITECTURE.md | 609 ------------------ .planning/research/FEATURES.md | 285 -------- .planning/research/PITFALLS.md | 238 ------- .planning/research/STACK.md | 251 -------- 11 files changed, 2339 deletions(-) delete mode 100644 .planning/.continue-here.md delete mode 100644 .planning/PROJECT.md delete mode 100644 .planning/REQUIREMENTS.md delete mode 100644 .planning/ROADMAP.md delete mode 100644 .planning/STATE.md delete mode 100644 .planning/phases/14-pr-restructure/14-CONTEXT.md delete mode 100644 .planning/phases/14-pr-restructure/14-RESEARCH.md delete mode 100644 .planning/research/ARCHITECTURE.md delete mode 100644 .planning/research/FEATURES.md delete mode 100644 .planning/research/PITFALLS.md delete mode 100644 .planning/research/STACK.md diff --git a/.planning/.continue-here.md b/.planning/.continue-here.md deleted file mode 100644 index 4a716186fe..0000000000 --- a/.planning/.continue-here.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -phase: pre-phase (milestone v2.0 setup) -task: 6 -total_tasks: 8 -status: in_progress -last_updated: 2026-02-26T00:11:19.319Z ---- - - -Milestone v2.0 MoE Panels setup is in progress. Steps 1-5 complete. Step 6 (research) nearly done — all 4 research files written to disk, all 4 agents completed. Synthesis not yet run. Steps 7 (requirements) and 8 (roadmap) not started. - - - - -- Step 1: Update PROJECT.md — Done. 
Broadened scope from test infrastructure to GSD tool development, added v2.0 milestone section, moved v1.0/v1.1 to validated, set active requirements to MoE panel scope. -- Step 2: Create STATE.md — Done. Fresh STATE.md for v2.0 with milestone history context. -- Step 3: Commit milestone start — Done. Commit `ade3945 docs: start milestone v2.0 MoE Panels` -- Step 4: Run init + config — Done. `gsd-tools init new-milestone` and `config-set workflow.research true` -- Step 5: Research agents — All 4 completed: - - STACK.md (291 lines) — HIGH confidence. Scatter-gather with LLM synthesizer pattern. - - FEATURES.md (337 lines) — MEDIUM-HIGH confidence. Domain-partitioned assembly, not voting. - - ARCHITECTURE.md (638 lines) — HIGH confidence. Transparent substitution, inline specialist prompts, config routing. - - PITFALLS.md (352 lines) — HIGH confidence. 13 pitfalls catalogued, output contract drift is #1 risk. - - - - -- Step 6 (finish): Run gsd-research-synthesizer to merge 4 research files into SUMMARY.md, then commit research artifacts -- Step 7: Define requirements — Derive REQ-IDs from user spec across 5 categories (Panel Infrastructure, Plan Checker Panel, Verifier Panel, Research Panel, Testing). Scope with user via AskUserQuestion per category. Commit REQUIREMENTS.md. -- Step 8: Create roadmap — Spawn gsd-roadmapper with phase numbering starting at 14, all requirements + research. Present for approval, commit ROADMAP.md. - - - - -- Architecture researcher found: panel agents should use inline specialist prompts (no separate specialist files). This contradicts Stack researcher's suggestion of 12 separate files. Architecture finding is higher confidence (based on direct codebase analysis of auto-discuss.md pattern). -- All researchers agree: domain-partitioned assembly over voting for consensus mechanism. -- Config routing: 3 new keys (`workflow.plan_check_panel`, `workflow.verifier_panel`, `workflow.research_panel`), all default `false`. 
Must nest under existing `workflow.*` to avoid collision with `workflow.plan_check`/`workflow.verifier`. -- Plan checker panel should be built first (most well-defined output contract). -- Output contract drift is the #1 risk — orchestrators must own the output template, not specialists. -- Graceful degradation: 2/3 specialists succeed = usable output; 1/3 or 0/3 = fallback to single-agent. - - - -- None. All research agents completed successfully. - - - -This is the v2.0 milestone setup following the plan from a previous planning session. The plan has 8 execution steps. Key architectural insight: panel agents are transparent substitutes — they produce identical output to single agents, with config flags controlling dispatch. The existing auto-discuss.md workflow already demonstrates the parallel-spawn-and-synthesize pattern. - -Research disagreement resolved: Stack says 12 separate specialist files, Architecture says inline prompts in 3 panel agent files. Architecture recommendation wins (follows existing codebase conventions, specialists not independently useful). - -Key research files: -- `.planning/research/STACK.md` — orchestration patterns -- `.planning/research/FEATURES.md` — consensus mechanisms, panel designs -- `.planning/research/ARCHITECTURE.md` — integration with GSD codebase -- `.planning/research/PITFALLS.md` — failure modes, prevention strategies - - - -1. Run gsd-research-synthesizer to create .planning/research/SUMMARY.md from the 4 research files -2. Commit all research artifacts -3. Begin Step 7: Requirements definition — present 5 categories to user for scoping via AskUserQuestion: - - Panel Infrastructure (PANEL-XX) - - Plan Checker Panel (PCHK-XX) - - Verifier Panel (VRFY-XX) - - Research Panel (RSRCH-XX) - - Testing (TEST-XX) -4. Write and commit REQUIREMENTS.md -5. 
Step 8: Spawn gsd-roadmapper for ROADMAP.md (phases starting at 14) - diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md deleted file mode 100644 index 646e292f8b..0000000000 --- a/.planning/PROJECT.md +++ /dev/null @@ -1,100 +0,0 @@ -# get-shit-done - -## What This Is - -An open-source npm package that orchestrates AI coding agents for software development workflows. Development includes both the core tool (agent orchestration, workflow routing, config management) and its test infrastructure (433 tests, 94.01% coverage, CI pipeline). - -## Core Value - -Reliable AI agent orchestration with quality gates that catch bad plans before execution burns context. Every module has tests that catch regressions before they reach users. - -## Requirements - -### Validated - -- ✓ Node.js built-in `node:test` + `node:assert` test framework — v1.0 -- ✓ CLI integration test pattern via `execSync` with temp directory isolation — v1.0 -- ✓ Test helpers (`createTempProject`, `runGsdTools`, `cleanup`, `createTempGitProject`) — v1.0 -- ✓ Tests for all 11 modules: phase, state, commands, init, roadmap, core, frontmatter, verify, config, template, milestone — v1.0 -- ✓ 355 tests passing, 0 failures — v1.0 -- ✓ 4 regression tests (REG-01 through REG-04) — v1.0 -- ✓ GitHub Actions CI pipeline with 3x3 OS/Node matrix — v1.0 -- ✓ commands.cjs from 59% to 88.86% line coverage — v1.1 -- ✓ init.cjs from 42% to 98.59% line coverage — v1.1 -- ✓ state.cjs from 40% to 96.16% line coverage — v1.1 -- ✓ gsd-tools.cjs dispatcher from 76% to 94.35% line coverage — v1.1 -- ✓ roadmap.cjs from 71% to 99.32% line coverage — v1.1 -- ✓ c8 devDependency with `npm run test:coverage` script — v1.1 -- ✓ Coverage thresholds enforced in CI (fail if any module drops below 70%) — v1.1 -- ✓ VERIFICATION.md audit trail for each coverage phase — v1.1 - -### Active (v1.3 — PR Review Fixes) - -- Split PR #762 into focused PRs: tests+CI, resolve-model fix, autopilot feature -- Remove committed `.planning/` artifacts 
from PR branch -- Fix auto-advance config mutation — use runtime flag instead of config.json persistence -- Coordinate resolve-model fix with PR #761 to avoid merge conflicts -- Add runtime validation for `discuss_agents` in auto-discuss workflow -- Document `model_overrides` config or remove if premature - -*Full requirements in `.planning/REQUIREMENTS.md`* - -### Out of Scope - -- Changing user-facing commands or output formats -- Performance/benchmark testing — not needed at current scale -- TypeScript migration — different milestone entirely -- Performance tests for large ROADMAP.md files (PERF-01) — future candidate -- Windows-specific path separator tests (WIN-01) — future candidate -- Windows CRLF line ending handling tests (CRLF-01) — future candidate - -## Current Milestone: v1.3 — PR Review Fixes - -**Phase:** Defining requirements -**Starting phase:** 14 (continuing from v1.1's Phase 13) - -Addresses reviewer feedback on PR #762 (autopilot mode). Splits the monolithic PR into focused PRs, removes committed dev artifacts, fixes identified code issues (auto-advance mutation, validation gaps), and coordinates overlapping fixes with PR #761. - -## Current State - -Shipped v1.1 with 433 tests, 94.01% overall line coverage, all 11 modules above 70%. Coverage is enforced in CI on every PR. All requirements have VERIFICATION.md audit trails. 
- -- **Test count:** 433 -- **Overall coverage:** 94.01% line coverage -- **Lowest module:** commands.cjs at 88.86% (target: 75%) -- **CI matrix:** Ubuntu, macOS, Windows × Node 18, 20, 22 (9 jobs) -- **Coverage enforcement:** c8 v11 on Node 20+; plain `npm test` on Node 18 - -Known bugs documented and tested (tests assert current behavior, production code not modified): -- `getRoadmapPhaseInternal` goal regex format mismatch (`**Goal:**` vs `**Goal**:`) -- `verify.cjs:82` — `content.search()` returns -1 handled by guard -- `frontmatter.cjs` — comma splitting doesn't handle quoted values (REG-04 documents limitation) -- `commands.cjs` — all git errors treated as "nothing to commit" - -**Codebase map:** `.planning/codebase/` (7 documents, analyzed 2026-02-25) - -## Constraints - -- **No new dependencies** (except c8 as devDependency): Follow existing lightweight convention -- **Backwards compatible**: Panel config keys default `false` — existing behavior unchanged -- **Output contract preservation**: Panel output must be identical to single-agent output -- **Existing patterns**: Tests use `node:test` + `node:assert`, CLI integration via `execSync`, temp directories -- **Cross-platform**: Tests must work on macOS, Linux, and Windows (CI matrix) -- **Not our repo**: We're contributing PRs, not merging directly - -## Key Decisions - -| Decision | Rationale | Outcome | -|----------|-----------|---------| -| Use node:test (no Jest/Vitest) | Match existing convention, zero dependencies | ✓ Good — 433 tests, fast execution | -| Integration tests for cmd* functions | process.exit() in output/error prevents direct require() | ✓ Good — consistent pattern across all modules | -| Unit tests for pure functions | comparePhaseNum, extractFrontmatter etc. 
can be require()'d directly | ✓ Good — faster, more granular | -| One PR per module | Keeps reviews focused, allows parallel submission | ✓ Good — each phase is an independent PR | -| createTempGitProject helper | Git-dependent tests need isolated repos with config | ✓ Good — used by verify-summary and verify-commits | -| Concurrency groups in CI | Cancel stale runs on same branch using head_ref \|\| run_id | ✓ Good — prevents queue buildup | -| c8 for coverage (not nyc) | Works natively with node:test via V8 coverage | ✓ Good — 94.01% overall, clean per-file report | -| Node 18 skip for c8 v11 | c8 v11 declares engines Node 20+, Node 18 EOL April 2025 | ✓ Good — CI stable, plain npm test still runs on Node 18 | -| VERIFICATION.md per coverage phase | Audit trail for orphaned requirements found in milestone audit | ✓ Good — all requirements now have 3-source verification | - ---- -*Last updated: 2026-02-28 — v1.3 PR Review Fixes milestone started* diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md deleted file mode 100644 index 69cdd86081..0000000000 --- a/.planning/REQUIREMENTS.md +++ /dev/null @@ -1,72 +0,0 @@ -# Requirements: get-shit-done - -**Defined:** 2026-02-28 -**Core Value:** Reliable AI agent orchestration with quality gates that catch bad plans before execution burns context. - -## v1.3 Requirements - -Requirements for PR #762 review fixes. Each maps to roadmap phases. 
- -### PR Structure - -- [ ] **PRS-01**: PR #762 is split into 3 focused PRs: tests+CI (PR A), resolve-model fix (PR B), autopilot feature (PR C) -- [ ] **PRS-02**: `.planning/` artifacts are removed from all PR branches via `git rm --cached` -- [ ] **PRS-03**: `.gitignore` updated to prevent `.planning/` from being committed again -- [ ] **PRS-04**: PR #761 status confirmed and resolve-model fix coordinated (no duplicate changes) - -### Config Fixes - -- [ ] **CFG-01**: Auto-advance uses `--auto` runtime flag instead of mutating `config.json` via `config-set` -- [ ] **CFG-02**: `discuss_agents` validated at runtime in `auto-discuss.md` (odd, 3-9 range, absent key accepted) -- [ ] **CFG-03**: `model_overrides` added to `loadConfig` return object so `resolveModelInternal` can read it -- [ ] **CFG-04**: `model_overrides` config key documented in appropriate reference files - -### Coordination - -- [ ] **CRD-01**: PR A (tests+CI) submitted first with no dependencies on B or C -- [ ] **CRD-02**: PR B (resolve-model) submitted after confirming PR #761 status -- [ ] **CRD-03**: PR C (autopilot) submitted last, includes all code fixes from CFG category - -## Future Requirements - -### v2.0 — MoE Panels (deferred) - -- MoE panel infrastructure with 3 config keys -- Plan Checker Panel: 3 parallel specialists -- Verifier Panel: 3 domain specialists -- Research Panel: 3 domain researchers -- Workflow routing and output contract preservation - -## Out of Scope - -| Feature | Reason | -|---------|--------| -| Rewriting autopilot feature logic | Reviewer didn't request feature changes, only structural/quality fixes | -| Adding tests for autopilot workflows | Reviewer noted it as missing but it's a separate effort (v1.4 candidate) | -| Changing existing test assertions | Tests PR (A) should preserve existing test content | -| TypeScript migration | Different milestone entirely | - -## Traceability - -| Requirement | Phase | Status | -|-------------|-------|--------| -| PRS-01 | 
Phase 14 | Pending | -| PRS-02 | Phase 14 | Pending | -| PRS-03 | Phase 14 | Pending | -| PRS-04 | Phase 14 | Pending | -| CFG-01 | Phase 15 | Pending | -| CFG-02 | Phase 16 | Pending | -| CFG-03 | Phase 17 | Pending | -| CFG-04 | Phase 17 | Pending | -| CRD-01 | Phase 14 | Pending | -| CRD-02 | Phase 14 | Pending | -| CRD-03 | Phase 17 | Pending | - -**Coverage:** -- v1.3 requirements: 11 total -- Mapped to phases: 11 -- Unmapped: 0 - ---- -*Requirements defined: 2026-02-28* -*Last updated: 2026-02-28 — traceability filled after roadmap creation* diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md deleted file mode 100644 index 1071ee2c3c..0000000000 --- a/.planning/ROADMAP.md +++ /dev/null @@ -1,99 +0,0 @@ -# Roadmap: get-shit-done - -## Milestones - -- ✅ **v1.0 Test Infrastructure** - Phases 1-6 (shipped 2026-02-25) -- ✅ **v1.1 Coverage Hardening** - Phases 7-13 (shipped 2026-02-25) -- 🚧 **v1.3 PR Review Fixes** - Phases 14-17 (in progress) - -## Phases - -
<details> -<summary>✅ v1.0 Test Infrastructure (Phases 1-6) - SHIPPED 2026-02-25</summary> - -Phases 1-6 completed. 245 new tests, 6,715 lines of test code, CI pipeline (3 OS x 3 Node matrix). See MILESTONES.md for full details. - -</details>
 - -<details>
-<summary>✅ v1.1 Coverage Hardening (Phases 7-13) - SHIPPED 2026-02-25</summary> - -Phases 7-13 completed. 433 tests passing, 94.01% overall line coverage, all 11 modules above 70%, c8 coverage enforcement in CI. See MILESTONES.md for full details. - -</details>
- -### 🚧 v1.3 PR Review Fixes (In Progress) - -**Milestone Goal:** Respond to PR reviewer feedback on PR #762 by splitting the monolithic PR into focused branches, removing committed dev artifacts, and applying three targeted code fixes before resubmission. - -#### Phase 14: PR Restructure -**Goal**: Clean PR C branch (feat/autopilot-clean) submitted with only autopilot feature code, .planning/ artifacts removed from git index, PR #762 closed with split comment, and PR coordination confirmed (PR A=#763 merged, PR B=not needed since fix landed via #739) -**Depends on**: Phase 13 -**Requirements**: PRS-01, PRS-02, PRS-03, PRS-04, CRD-01, CRD-02 -**Success Criteria** (what must be TRUE): - 1. PR C branch (`feat/autopilot-clean`) contains autopilot feature code with .planning/ artifacts removed from git index (files still exist locally) - 2. `.gitignore` includes `.planning/` so the artifacts cannot be re-committed on any future branch - 3. PR C diff against origin/main shows only autopilot feature files - 4. PR #762 has closing comment explaining the split and linking to PR C, #763, #739 - 5. PR #761 status confirmed CLOSED, resolve-model fix documented as landed via #739 -**Plans**: 2 plans - -Plans: -- [ ] 14-01-PLAN.md — Sync local main, rebase feat/autopilot onto origin/main, remove .planning/ from git index -- [ ] 14-02-PLAN.md — Create PR C branch, open PR on GitHub, add closing comment to PR #762 - -#### Phase 15: Auto-Advance Runtime Flag Fix -**Goal**: Autopilot workflow no longer mutates config.json to drive auto-advance behavior — the --auto flag propagates through the existing call chain instead -**Depends on**: Phase 14 -**Requirements**: CFG-01 -**Success Criteria** (what must be TRUE): - 1. `autopilot.md` contains no `config-set` calls that write `auto_advance` to config.json - 2. `discuss-phase.md` contains no `config-set` calls that write `auto_advance` to config.json - 3. 
Running autopilot does not modify config.json's `workflow.auto_advance` value - 4. The `--auto` flag drives auto-advance behavior via the existing argument propagation chain -**Plans**: TBD - -Plans: -- [ ] 15-01: Remove config-set mutations from autopilot.md and discuss-phase.md; verify --auto propagation - -#### Phase 16: Validation Hardening -**Goal**: auto-discuss.md validates discuss_agents before spawning agents — invalid config produces a clear error rather than a silent misbehavior -**Depends on**: Phase 15 -**Requirements**: CFG-02 -**Success Criteria** (what must be TRUE): - 1. `auto-discuss.md` validates that `discuss_agents` is an odd number in the 3-9 range before spawning agents - 2. If `discuss_agents` is missing or falsy, the workflow falls back to a default (no hard failure for the "key not set" case) - 3. An invalid value (even number, out-of-range) produces a clear error message with guidance on valid values - 4. Existing workflows that do not set `discuss_agents` are unaffected (backwards compatible) -**Plans**: TBD - -Plans: -- [ ] 16-01: Add discuss_agents validation guard to auto-discuss.md - -#### Phase 17: Module Fixes + Documentation -**Goal**: model_overrides is wired correctly through loadConfig and resolveModelInternal, config keys are documented, and PR C is ready for submission -**Depends on**: Phase 16 -**Requirements**: CFG-03, CFG-04, CRD-03 -**Success Criteria** (what must be TRUE): - 1. `loadConfig` returns `model_overrides` in its result object so `resolveModelInternal` can read it - 2. `cmdResolveModel` delegates to `resolveModelInternal` (eliminating duplicated resolution logic) - 3. New tests in `tests/commands.test.cjs` verify that `model_overrides` is honored by the `resolve-model` CLI command - 4. README documents the `autopilot.*` config keys (`discuss_agents`, `discuss_model`) with valid values - 5. 
PR C (`feat/autopilot-clean`) is submitted to upstream with all code fixes from phases 15-17 included -**Plans**: TBD - -Plans: -- [ ] 17-01: Wire model_overrides through loadConfig and cmdResolveModel; add tests -- [ ] 17-02: Document autopilot config keys in README; submit PR C - -## Progress - -**Execution Order:** -Phases execute in numeric order: 14 → 15 → 16 → 17 - -| Phase | Milestone | Plans Complete | Status | Completed | -|-------|-----------|----------------|--------|-----------| -| 14. PR Restructure | v1.3 | 0/2 | Not started | - | -| 15. Auto-Advance Fix | v1.3 | 0/1 | Not started | - | -| 16. Validation Hardening | v1.3 | 0/1 | Not started | - | -| 17. Module Fixes + Docs | v1.3 | 0/2 | Not started | - | diff --git a/.planning/STATE.md b/.planning/STATE.md deleted file mode 100644 index 61fb830f0f..0000000000 --- a/.planning/STATE.md +++ /dev/null @@ -1,71 +0,0 @@ -# Project State - -## Project Reference - -See: .planning/PROJECT.md (updated 2026-02-28) - -**Core value:** Reliable AI agent orchestration with quality gates that catch bad plans before execution burns context. -**Current focus:** Phase 14 — PR Restructure - -## Current Position - -Phase: 14 of 17 (PR Restructure) -Plan: 0 of 4 in current phase -Status: Ready to plan -Last activity: 2026-02-28 — v1.3 roadmap created; 11 requirements mapped to 4 phases - -Progress: [░░░░░░░░░░] 0% - -## Performance Metrics - -**Velocity:** -- Total plans completed: 0 (this milestone) -- Average duration: — -- Total execution time: — - -**By Phase:** - -| Phase | Plans | Total | Avg/Plan | -|-------|-------|-------|----------| -| — | — | — | — | - -*Updated after each plan completion* - -## Accumulated Context - -### Decisions - -Decisions are logged in PROJECT.md Key Decisions table. 
-Recent decisions affecting current work: - -- [v1.3 start]: Split PR #762 into three focused PRs (tests+CI, resolve-model, autopilot); phases follow that dependency chain -- [v1.3 start]: Use `--auto` runtime flag (not config-set) for auto-advance — flag dies with the subagent, no config corruption risk -- [v1.3 start]: Use `git rm --cached -r .planning/` to remove artifacts — NOT bare `git rm` (would delete files from disk) - -### Pending Todos - -None yet. - -### Blockers/Concerns - -- [Phase 14]: PR #761 final state must be confirmed — merged vs closed determines whether resolve-model changes are already on main and how to scope the fix/resolve-model branch -- [Phase 17]: model_overrides fix touches core.cjs and commands.cjs — verify no conflict with PR #761 changes before submitting PR C - -## Milestone History - -### v1.0 — Test Infrastructure (Phases 1-6) -- 355 tests across all 11 modules -- GitHub Actions CI with 3x3 OS/Node matrix -- 4 regression tests (REG-01 through REG-04) - -### v1.1 — Coverage Hardening (Phases 7-13) -- 433 tests, 94.01% overall line coverage -- c8 coverage enforcement in CI -- All modules above 70% threshold -- VERIFICATION.md audit trails for every phase - -## Session Continuity - -Last session: 2026-02-28 -Stopped at: Roadmap created — ready to plan Phase 14 -Resume file: None diff --git a/.planning/phases/14-pr-restructure/14-CONTEXT.md b/.planning/phases/14-pr-restructure/14-CONTEXT.md deleted file mode 100644 index 39b3f0ee86..0000000000 --- a/.planning/phases/14-pr-restructure/14-CONTEXT.md +++ /dev/null @@ -1,66 +0,0 @@ -# Phase 14: PR Restructure - Context - -**Gathered:** 2026-02-28 -**Status:** Ready for planning - - -## Phase Boundary - -Split PR #762 into 2 focused PRs (PR B: resolve-model fix, PR C: autopilot feature), remove committed `.planning/` artifacts, coordinate with closed PR #761, and close #762 with a comment linking the replacements. 
PR A (tests+CI) is no longer needed since PR #763 already merged that content. - - - - -## Implementation Decisions - -### PR #762 Handling -- Close #762 and open 2 new PRs (not 3 — PR A absorbed by merged #763) -- Add closing comment to #762 with links to new PRs: "Split into #X, #Y per review feedback from @glittercowboy" -- Each new PR references #762 in its body: "Split from #762 per review feedback" - -### PR #761 Coordination -- PR #761 was closed without merging — the resolve-model fix only exists in our branch -- No conflict risk since #761 never landed on main -- Credit @ChuckMayo in PR B body: "Also identified by @ChuckMayo in #761" - -### Branch Strategy -- Rebase feat/autopilot onto main first (main now includes #763's test content) -- Fork both new branches from rebased feat/autopilot -- PR B: branch from feat/autopilot, remove all non-resolve-model files -- PR C: branch from feat/autopilot, remove `.planning/` artifacts, keep autopilot feature code -- PR B stays separate from PR C (reviewer explicitly asked for separation) - -### PR Descriptions -- Follow repo's PR template: What / Why / Testing / Checklist / Breaking Changes -- Add extra section mapping which @glittercowboy review findings each PR addresses (point-by-point) -- Both PRs include test plan details - -### Claude's Discretion -- Exact branch names for PR B and PR C -- Commit message wording for the rebase and cleanup -- How to structure the "review findings addressed" section in PR body -- Whether template.test.cjs goes into PR B or PR C - - - - -## Specific Ideas - -- PR #763 (merged Feb 27) already landed tests+CI, so PR A is eliminated -- Repo has CODEOWNERS and a PR template at `.github/pull_request_template.md` -- The PR template requires: OS testing checkboxes, CHANGELOG updates for user-facing changes, GSD style compliance, Windows path testing - - - - -## Deferred Ideas - -- Adding tests for the autopilot feature itself (reviewer noted as [MISSING]) — candidate for v1.4 -- 
CHANGELOG.md updates for the autopilot feature — include in PR C if user-facing - - - ---- - -*Phase: 14-pr-restructure* -*Context gathered: 2026-02-28* diff --git a/.planning/phases/14-pr-restructure/14-RESEARCH.md b/.planning/phases/14-pr-restructure/14-RESEARCH.md deleted file mode 100644 index 4f82ba22b3..0000000000 --- a/.planning/phases/14-pr-restructure/14-RESEARCH.md +++ /dev/null @@ -1,478 +0,0 @@ -# Phase 14: PR Restructure - Research - -**Researched:** 2026-02-28 -**Domain:** Git branch management, GitHub PR workflow, git rebase/cherry-pick -**Confidence:** HIGH - - -## User Constraints (from CONTEXT.md) - -### Locked Decisions - -**PR #762 Handling** -- Close #762 and open 2 new PRs (not 3 — PR A absorbed by merged #763) -- Add closing comment to #762 with links to new PRs: "Split into #X, #Y per review feedback from @glittercowboy" -- Each new PR references #762 in its body: "Split from #762 per review feedback" - -**PR #761 Coordination** -- PR #761 was closed without merging — the resolve-model fix only exists in our branch -- No conflict risk since #761 never landed on main -- Credit @ChuckMayo in PR B body: "Also identified by @ChuckMayo in #761" - -**Branch Strategy** -- Rebase feat/autopilot onto main first (main now includes #763's test content) -- Fork both new branches from rebased feat/autopilot -- PR B: branch from feat/autopilot, remove all non-resolve-model files -- PR C: branch from feat/autopilot, remove `.planning/` artifacts, keep autopilot feature code -- PR B stays separate from PR C (reviewer explicitly asked for separation) - -**PR Descriptions** -- Follow repo's PR template: What / Why / Testing / Checklist / Breaking Changes -- Add extra section mapping which @glittercowboy review findings each PR addresses (point-by-point) -- Both PRs include test plan details - -### Claude's Discretion -- Exact branch names for PR B and PR C -- Commit message wording for the rebase and cleanup -- How to structure the "review findings addressed" 
section in PR body -- Whether template.test.cjs goes into PR B or PR C - -### Deferred Ideas (OUT OF SCOPE) -- Adding tests for the autopilot feature itself (reviewer noted as [MISSING]) — candidate for v1.4 -- CHANGELOG.md updates for the autopilot feature — include in PR C if user-facing - - - -## Phase Requirements - -| ID | Description | Research Support | -|----|-------------|-----------------| -| PRS-01 | PR #762 split into focused PRs (updated in CONTEXT: 2 PRs, not 3 — PR A already merged) | PR #763 confirmed MERGED; PR #762 confirmed CLOSED Feb 28 | -| PRS-02 | `.planning/` artifacts removed from all PR branches via `git rm --cached` | `.planning/` tracked in git index; git rm --cached -r .planning/ is the correct approach | -| PRS-03 | `.gitignore` updated to prevent `.planning/` from being committed again | `.planning/` is ALREADY in both feat/autopilot and origin/main .gitignore — no action needed | -| PRS-04 | PR #761 status confirmed and resolve-model fix coordinated (no duplicate changes) | PR #761 CLOSED. Resolve-model fix already on origin/main via PR #739 — see critical finding below | -| CRD-01 | PR A (tests+CI) submitted first with no dependencies | DONE — PR #763 already merged Feb 25 | -| CRD-02 | PR B (resolve-model) submitted after confirming PR #761 status | See critical finding: fix already on main; PR B scope is moot for code change but confirmed for coordination | - - -## Summary - -Phase 14 is a git restructuring task: rebase `feat/autopilot` onto `origin/main`, create a clean PR C branch containing only autopilot feature code with `.planning/` artifacts removed from the git index, and close PR #762 with a closing comment. - -**Critical finding:** The CONTEXT.md assumptions are partially outdated. 
Research reveals that (1) `origin/main` is far ahead of local `main` — local main must be synced first; (2) the resolve-model fix is ALREADY on `origin/main` via PR #739 (merged independently), making PR B a coordination confirmation rather than a new PR submission; (3) `.planning/` is already in `.gitignore` on both branches, so PRS-03 requires only removing the tracked files from the git index, not updating `.gitignore`. - -**Primary recommendation:** Sync local main with `git fetch origin && git merge origin/main`, rebase `feat/autopilot` onto the updated main, resolve the known conflicts (execute-phase.md path style), run `git rm --cached -r .planning/`, then create and push the clean PR C branch. Close PR #762 with a comment. Skip PR B creation since the code change is already on main; satisfy CRD-02 by documenting the confirmation in the PR C body. - -## Critical Findings (Research Discoveries) - -### Finding 1: Resolve-Model Fix Already on upstream main (SCOPE CHANGE) - -The CONTEXT.md states "the resolve-model fix only exists in our branch." This is incorrect as of Feb 28. Research confirms: - -- PR #739 ("fix: load model_overrides from config and use resolveModelInternal in CLI") was MERGED to `origin/main` independently -- `origin/main`'s `get-shit-done/bin/lib/commands.cjs` already has `resolveModelInternal` delegation (no `'inherit'` bug) -- `origin/main`'s `get-shit-done/bin/lib/core.cjs` already has `model_overrides: parsed.model_overrides || null` in `loadConfig` -- PR #761 (ChuckMayo's fix) was CLOSED without merging - -**Impact on Phase 14:** -- PR B (fix/resolve-model) does NOT need to be created as a new code PR -- CRD-02 is satisfied by: confirm PR #761 closed, document that fix landed via PR #739, credit ChuckMayo in PR C body instead -- PR C becomes the only new PR to submit - -**Confidence:** HIGH — verified by fetching origin/main and inspecting `commands.cjs` and `core.cjs` source directly. 
- -### Finding 2: Local main is ~15 commits behind origin/main - -Local `main` is at commit `3fddd62` (pre-1.21.0). `origin/main` is at `19ac77e` and includes: -- 1.21.0, 1.21.1 releases -- PR #763 (tests+CI merged) -- PR #739 (resolve-model fix) -- PR #786 (`$HOME` path fix) -- Several other community PRs (#737, #741, #759, etc.) - -**Impact:** Rebasing feat/autopilot must target `origin/main`, not local `main`. First step is `git fetch origin && git merge origin/main` (or `git pull origin main`). - -### Finding 3: .gitignore Already Contains .planning/ - -`.planning/` is already excluded in `.gitignore` on BOTH `feat/autopilot` and `origin/main` (line: `# Internal planning documents` / `.planning/`). - -**Impact on PRS-03:** No `.gitignore` edit needed. The issue is that git tracks files once they are committed — `.gitignore` only prevents untracked files from being staged. The fix is purely `git rm --cached -r .planning/`. PRS-03 is partially satisfied; only the index cleanup remains. - -### Finding 4: feat/autopilot is 10 Commits Ahead of origin/main - -``` -87f08eb docs(14): capture phase context — .planning only -56b6930 docs: create milestone v1.3 roadmap — .planning only -5900bcc docs: define milestone v1.3 requirements — .planning only -3c2c317 docs: complete project research — .planning only -7411599 docs: start milestone v1.3 PR Review — .planning only -b296079 wip: v2.0 milestone setup paused — .planning only -ade3945 docs: start milestone v2.0 MoE Panels — .planning only -000163a refactor: remove dead execution section — SOURCE CODE (autopilot) -8850ebf refactor: remove Agent Teams engine — SOURCE CODE (autopilot) -b0aa9fc feat: add autopilot mode — SOURCE CODE (autopilot) -``` - -**Impact:** After rebase onto `origin/main`, the 7 `.planning/` commits remain but contain only artifacts that will be removed. The 3 source commits are the autopilot feature. PR C should contain only the net diff from these 3 source commits (after conflict resolution). 
- -## Standard Stack - -### Core (Git Operations) -| Tool | Version | Purpose | Why Standard | -|------|---------|---------|--------------| -| git | system | Branch creation, rebase, cherry-pick, rm --cached | Only tool for git index operations | -| gh CLI | system | PR creation, comment posting, PR view | GitHub API automation | - -### Supporting -| Tool | Version | Purpose | When to Use | -|------|---------|---------|-------------| -| `git rm --cached -r` | git builtin | Remove tracked files from index without deleting from disk | Required for PRS-02 | -| `git rebase origin/main` | git builtin | Replay autopilot commits on top of updated main | Required before PR C creation | -| `git checkout -b` | git builtin | Create new clean branch | For PR C branch | - -**Installation:** No additional installs needed — git and gh CLI already present. - -## Architecture Patterns - -### Pattern 1: Rebase-then-Fork Branch Strategy (Locked Decision) - -**What:** Rebase `feat/autopilot` onto `origin/main`, then fork PR C from the rebased state after removing `.planning/` artifacts. - -**When to use:** When the source branch diverged before a large upstream update (1.21.0 → 1.21.1+ with multiple community PRs). - -**Steps:** -```bash -# 1. Sync local main -git checkout main -git pull origin main - -# 2. Rebase feat/autopilot (resolve conflicts during rebase) -git checkout feat/autopilot -git rebase origin/main - -# 3. Remove .planning/ from git index (files stay on disk) -git rm --cached -r .planning/ -git commit -m "chore: remove .planning/ artifacts from git index" - -# 4. Create PR C branch -git checkout -b feat/autopilot-clean # or chosen branch name - -# 5. Push and open PR -git push fork feat/autopilot-clean -gh pr create --repo gsd-build/get-shit-done --title "..." --body "..." -``` - -### Pattern 2: git rm --cached vs git rm - -**What:** `git rm --cached -r .planning/` removes files from the git index (staging area) without touching the working tree. 
Files remain on disk, only removed from git tracking. - -**Critical distinction:** -```bash -git rm -r .planning/ # WRONG: Deletes files from disk AND index -git rm --cached -r .planning/ # CORRECT: Removes from index only -``` - -**After running:** Files in `.planning/` become untracked. Since `.planning/` is in `.gitignore`, they will not appear in `git status` as untracked — they are effectively invisible to git going forward. - -### Pattern 3: Verifying Clean Branch Diff - -**What:** Use `git diff origin/main...{branch}` (three dots) to see only the commits unique to the branch. - -```bash -# Verify PR C contains only expected files -git diff origin/main...feat/autopilot-clean --name-only -``` - -Expected output for PR C: -``` -commands/gsd/autopilot.md -get-shit-done/bin/lib/config.cjs -get-shit-done/templates/config.json -get-shit-done/workflows/auto-discuss.md -get-shit-done/workflows/autopilot.md -get-shit-done/workflows/execute-phase.md -get-shit-done/workflows/progress.md -get-shit-done/workflows/settings.md -``` - -No `.planning/` entries should appear. - -### Pattern 4: Closing PR with Comment (GitHub CLI) -```bash -# Add closing comment to PR #762 -gh pr comment 762 --repo gsd-build/get-shit-done \ - --body "Closing and splitting into focused PRs per @glittercowboy review feedback: -- PR #XXX: feat/autopilot-clean — autopilot feature code only" - -gh pr close 762 --repo gsd-build/get-shit-done -``` - -Note: PR #762 is already CLOSED (closed Feb 28). Only the closing comment needs to be added. 
- -## Don't Hand-Roll - -| Problem | Don't Build | Use Instead | Why | -|---------|-------------|-------------|-----| -| Removing .planning/ from git | Manual file operations | `git rm --cached -r .planning/` | Index-only removal is built into git | -| Creating PR body | Manual HTML/text | `gh pr create --body "$(cat <<'EOF'...)"` heredoc | gh CLI handles escaping, API auth | -| Verifying branch cleanliness | Script parsing git log | `git diff origin/main...branch --name-only` | Three-dot notation shows only branch-specific commits | - -## Known Rebase Conflicts - -Based on direct code inspection, rebasing `feat/autopilot` onto `origin/main` will produce the following conflicts: - -### Conflict 1: execute-phase.md (path style) - -**Location:** `get-shit-done/workflows/execute-phase.md` - -**Nature:** The autopilot commit (b0aa9fc) added lines using `~/.claude/get-shit-done/bin/gsd-tools.cjs` path style. `origin/main` uses `"$HOME/.claude/get-shit-done/bin/gsd-tools.cjs"` (PR #786 fix: "use $HOME instead of ~ for gsd-tools.cjs paths to prevent subagent MODULE_NOT_FOUND"). - -**Resolution:** Accept `$HOME` style (origin/main's version is correct — per PR #786, subagents sometimes rewrite `~` to relative paths). Verify autopilot-specific additions still exist after resolution. - -**Autopilot-specific lines to preserve in execute-phase.md:** -```bash -AUTO_CFG=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" config-get workflow.auto_advance 2>/dev/null || echo "false") -``` -This line already exists in origin/main's execute-phase.md — the autopilot commit's additions may be entirely absorbed. - -### Conflict 2: init.cjs (execution_engine removal) - -**Location:** `get-shit-done/bin/lib/init.cjs` - -**Nature:** Autopilot commit (8850ebf) removes `execution_engine` field from `cmdInitExecutePhase`. `origin/main` has a substantially refactored version of `init.cjs` (from the module split + toPosixPath changes). 
- -**Resolution:** Accept origin/main's version as base; verify `execution_engine` field is not present (8850ebf's intent). The toPosixPath removal in origin/main's version supersedes the autopilot change. - -### Conflict 3: config.cjs (autopilot section vs origin/main's config) - -**Location:** `get-shit-done/bin/lib/config.cjs` - -**Nature:** Autopilot commits add `autopilot: { discuss_agents, discuss_model }` section with validation. origin/main may or may not have this section. - -**Resolution:** Keep the autopilot config section additions — these are unique to PR C. - -### Non-Conflict: commands.cjs - -No conflict expected. None of the 3 autopilot commits touch `commands.cjs`. After rebase, `commands.cjs` will automatically use `origin/main`'s fixed version (no `'inherit'` bug). - -### Non-Conflict: .gitignore - -No conflict expected. None of the 3 autopilot commits touch `.gitignore`. After rebase, `.gitignore` will be `origin/main`'s version (which has `coverage/` and `.planning/`). - -## Common Pitfalls - -### Pitfall 1: Rebasing onto local main instead of origin/main - -**What goes wrong:** Local `main` is at `3fddd62` (pre-1.21.0). Rebasing onto it puts `feat/autopilot` on a base that's ~15 commits behind upstream, creating a PR with massive unintended diff (includes all 1.21.0, 1.21.1, test suite changes). - -**How to avoid:** Always `git fetch origin` first, then `git rebase origin/main`. - -**Warning signs:** `git diff origin/main...{branch} --name-only` shows hundreds of files. - -### Pitfall 2: Using git rm (without --cached) to remove .planning/ - -**What goes wrong:** Deletes `.planning/` files from disk permanently. All planning documents, context files, requirements, etc. are lost. - -**How to avoid:** Always use `git rm --cached -r .planning/`. Double-check with `ls .planning/` after the command to confirm files are still present. - -**Warning signs:** `git status` shows nothing (files gone from disk, not just index). 
- -### Pitfall 3: Force-pushing feat/autopilot (the source branch) - -**What goes wrong:** After creating PR C as a new branch, force-pushing to `feat/autopilot` would rewrite the branch history and potentially confuse the `fork` remote. - -**How to avoid:** Create a NEW branch (`feat/autopilot-clean` or chosen name) for PR C. Do not modify `feat/autopilot` in place. - -### Pitfall 4: Reopening PR #762 instead of commenting + closing - -**What goes wrong:** PR #762 is already CLOSED. Reopening it would require re-review of a large diff. The correct flow is to add a closing comment explaining the split, then submit new focused PRs. - -**How to avoid:** Use `gh pr comment 762` to add the comment. PR is already closed — no `gh pr close` needed. - -### Pitfall 5: Including .planning/ files in the rebase commit before git rm - -**What goes wrong:** If the rebased commits carry `.planning/` files along, those files must still be removed AFTER the rebase with `git rm --cached`. - -**How to avoid:** Run `git rm --cached -r .planning/` AFTER the full rebase is complete, then commit the removal as a separate cleanup commit before pushing. - -### Pitfall 6: Assuming PR B is still needed as a new code PR - -**What goes wrong:** Creating PR B (`fix/resolve-model`) to fix the `'inherit'` bug when it's already fixed on `origin/main` (PR #739) creates a duplicate that will be rejected. - -**How to avoid:** Verify with `git show origin/main:get-shit-done/bin/lib/commands.cjs | grep resolveModelInternal` — returns the fixed version. No PR B needed for code. CRD-02 is satisfied by documenting the confirmation.
- -## Code Examples - -### Verifying resolve-model fix on origin/main -```bash -# Confirm fix is present (should show resolveModelInternal, not 'inherit') -git show origin/main:get-shit-done/bin/lib/commands.cjs | sed -n '/function cmdResolveModel/,/^}/p' - -# Confirm model_overrides in loadConfig -git show origin/main:get-shit-done/bin/lib/core.cjs | grep "model_overrides" -``` - -### Full rebase workflow -```bash -# Step 1: Sync local main -git checkout main -git pull origin main - -# Step 2: Rebase (will have conflicts — resolve per "Known Rebase Conflicts" section) -git checkout feat/autopilot -git rebase origin/main -# ... resolve conflicts, git add, git rebase --continue ... - -# Step 3: Verify rebase result — should show only 10 commits ahead -git log --oneline origin/main..feat/autopilot - -# Step 4: Remove .planning/ from git index -git rm --cached -r .planning/ -git commit -m "chore: remove .planning/ artifacts from git index" - -# Step 5: Verify .planning/ files still exist on disk -ls .planning/ - -# Step 6: Create PR C branch -git checkout -b feat/autopilot-clean # pick branch name - -# Step 7: Verify clean diff -git diff origin/main...feat/autopilot-clean --name-only -# Should show ONLY: commands/gsd/autopilot.md, get-shit-done/bin/lib/config.cjs, -# get-shit-done/templates/config.json, get-shit-done/workflows/*.md (autopilot files) -# Should NOT show: .planning/, tests/, commands.cjs, core.cjs, etc. - -# Step 8: Push and create PR -git push fork feat/autopilot-clean -u -gh pr create --repo gsd-build/get-shit-done \ - --title "feat: autopilot mode — full pipeline automation with synthetic multi-agent discuss" \ - --body "$(cat <<'EOF' -## What - -Adds `/gsd:autopilot` — one command to run the full GSD pipeline (discuss → plan → execute → verify) for all remaining phases automatically. - -## Why - -Split from #762 per review feedback from @glittercowboy. 
- -## Review Findings Addressed - -- **[SCOPE] `.planning/` directory committed** — Resolved: removed from git index via `git rm --cached -r .planning/`; `.gitignore` already excludes `.planning/`. -- **[CONCERN] Auto-advance forced ON** — Resolved in Phase 15 (separate PR): use `--auto` runtime flag instead of `config-set`. -- **[BREAKING] Removed `execution.engine`** — Removed intentionally: Agent Teams can't set per-teammate models, defeating profile-based differentiation. Subagents-only is correct. -- **[QUALITY] No tests for autopilot** — Acknowledged: autopilot tests are a v1.4 candidate. - -**Note on resolve-model fix:** The `cmdResolveModel` / `model_overrides` fix originally in this branch was independently merged to main via PR #739. Also identified by @ChuckMayo in PR #761 (closed). - -## Testing - -- [ ] Tested on macOS -- [ ] Tested on Windows -- [ ] Tested on Linux - -## Checklist - -- [ ] Follows GSD style (no enterprise patterns, no filler) -- [ ] Updates CHANGELOG.md for user-facing changes -- [ ] No unnecessary dependencies added -- [ ] Works on Windows (backslash paths tested) - -## Breaking Changes - -None -EOF -)" -``` - -### Adding closing comment to PR #762 -```bash -gh pr comment 762 --repo gsd-build/get-shit-done \ - --body "Closing. Split into focused PR per @glittercowboy review feedback: - -- PR #XXX: \`feat/autopilot-clean\` — autopilot feature only (.planning/ artifacts removed) - -The resolve-model fix from this branch landed independently as PR #739. Tests+CI are in merged PR #763." 
-``` - -## State of the Art - -| Old Approach | Current Approach | When Changed | Impact | -|--------------|------------------|--------------|--------| -| Monolithic PR (tests+CI+feature+fix) | 2-3 focused PRs per feature area | PR #762 feedback | Faster review, easier to bisect issues | -| `config-set workflow.auto_advance true` (mutates config) | `--auto` runtime flag (stateless) | Phase 15 fixes this | No config corruption on crash | -| `~/` path prefix in workflow scripts | `"$HOME/"` prefix | PR #786 merged to main | Prevents subagent MODULE_NOT_FOUND errors | - -## Open Questions - -1. **PR B scope — does it still need to be submitted?** - - What we know: Resolve-model fix is already on `origin/main` via PR #739 - - What's unclear: Whether REQUIREMENTS.md PRS-01 / CRD-02 require explicitly creating a PR B branch as a deliverable, or just confirming coordination - - Recommendation: No new PR B. Satisfy CRD-02 by documenting in PR C body that fix is already on main + credit @ChuckMayo - -2. **Rebase conflict severity in execute-phase.md** - - What we know: feat/autopilot uses `~` paths; origin/main uses `$HOME`; autopilot commits add `AUTO_CFG` line to execute-phase.md - - What's unclear: Whether the `AUTO_CFG` line added by autopilot already exists in origin/main's execute-phase.md - - Recommendation: During rebase, inspect `git show origin/main:get-shit-done/workflows/execute-phase.md | grep AUTO_CFG` to determine actual conflict scope - -3. 
**init.cjs conflict severity** - - What we know: Both origin/main and autopilot commits modified init.cjs - - What's unclear: Exact overlap — origin/main has `toPosixPath` changes; autopilot removed `execution_engine` field - - Recommendation: Accept origin/main's version during conflict, manually verify `execution_engine` is absent - -## Validation Architecture - -### Test Framework -| Property | Value | -|----------|-------| -| Framework | Node.js built-in test runner (`node --test`) | -| Config file | none — run directly via node | -| Quick run command | `npm test` | -| Full suite command | `npm test` | - -### Phase Requirements → Test Map - -This phase is a git restructuring task (no new source code). Validation is manual git inspection, not automated test runs. - -| Req ID | Behavior | Test Type | Automated Command | File Exists? | -|--------|----------|-----------|-------------------|-------------| -| PRS-01 | PR #762 closed, 2 new PRs opened | manual | `gh pr view 762 --repo gsd-build/get-shit-done --json state` | N/A | -| PRS-02 | `.planning/` not in any PR branch's diff | git | `git diff origin/main...{branch} --name-only \| grep '\.planning'` | N/A | -| PRS-03 | `.gitignore` has `.planning/` | git | `git show {branch}:.gitignore \| grep planning` | N/A (already present) | -| PRS-04 | PR #761 status confirmed, no duplicate | manual | `gh pr view 761 --repo gsd-build/get-shit-done --json state` | N/A | -| CRD-01 | PR A already merged (#763) | manual | `gh pr view 763 --repo gsd-build/get-shit-done --json state` | N/A | -| CRD-02 | PR B coordination confirmed | manual | Inspect PR #739 on origin/main | N/A | - -### Sampling Rate -- **Per task commit:** `npm test` (verify no regressions from rebase) -- **Per wave merge:** `git diff origin/main...{branch} --name-only` (verify clean diff) -- **Phase gate:** All 6 success criteria verified before `/gsd:verify-work` - -### Wave 0 Gaps - -None — this phase requires no new test files. 
Validation is git state inspection. - -## Sources - -### Primary (HIGH confidence) -- Direct git inspection: `git log`, `git diff`, `git show` against live branches — branch structure, commit contents, file diffs -- `gh pr view 762/761/763` — PR states, review comments verified live -- `gh api repos/gsd-build/get-shit-done/contents/...` — upstream main file contents verified - -### Secondary (MEDIUM confidence) -- PR #739 commit message on upstream main ("fix: load model_overrides from config and use resolveModelInternal in CLI") — confirms resolve-model already landed -- PR #786 commit message ("fix: use $HOME instead of ~ for gsd-tools.cjs paths") — confirms path style change - -### Tertiary (LOW confidence) -- None — all findings are directly verified from git/GitHub - -## Metadata - -**Confidence breakdown:** -- Git operations (rebase, rm --cached, branch creation): HIGH — standard git features, well-understood -- Conflict prediction (execute-phase.md, init.cjs): HIGH — verified by direct diff inspection -- PR #762 state (CLOSED): HIGH — confirmed via gh CLI -- Resolve-model already on main: HIGH — verified by reading source on origin/main -- .gitignore already correct: HIGH — verified by git show - -**Research date:** 2026-02-28 -**Valid until:** 2026-03-07 (7 days — upstream main moves fast with community PRs) diff --git a/.planning/research/ARCHITECTURE.md b/.planning/research/ARCHITECTURE.md deleted file mode 100644 index dc68a802a9..0000000000 --- a/.planning/research/ARCHITECTURE.md +++ /dev/null @@ -1,609 +0,0 @@ -# Architecture Research: PR #762 Fix Integration - -**Domain:** CLI workflow engine — autopilot mode runtime state, config validation, model resolution -**Researched:** 2026-02-28 -**Confidence:** HIGH (direct codebase analysis of all affected files) - -## Context - -This research addresses three specific fixes from PR #762 reviewer feedback: - -1. 
**Auto-advance config mutation** — `autopilot.md` currently persists `workflow.auto_advance true` to `config.json`. Reviewer wants this to be a runtime flag, not persistent config. -2. **discuss_agents runtime validation** — `auto-discuss.md` reads `autopilot.discuss_agents` without validating the value before spawning N agents. If the value is invalid, the workflow silently uses a bad agent count. -3. **model_overrides undocumented** — `resolveModelInternal` in `core.cjs` checks `config.model_overrides` but `cmdResolveModel` in `commands.cjs` does not. The feature is partially implemented and undocumented. - ---- - -## Standard Architecture - -### System Overview - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ User Commands (/gsd:autopilot, /gsd:plan-phase, etc.) │ -├─────────────────────────────────────────────────────────────────┤ -│ Workflow Files (get-shit-done/workflows/*.md) │ -│ Orchestrators: autopilot.md, plan-phase.md, auto-discuss.md │ -│ Read config via: config-get, config-set, init commands │ -├─────────────────────────────────────────────────────────────────┤ -│ Node CLI Toolkit (get-shit-done/bin/lib/*.cjs) │ -│ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌──────────┐ │ -│ │ core.cjs │ │ config.cjs │ │ init.cjs │ │commands │ │ -│ │ loadConfig │ │config-set │ │ init cmds │ │ .cjs │ │ -│ │ resolveModel│ │config-get │ │ (INIT JSON)│ │resolve- │ │ -│ │ Internal │ │ │ │ │ │ model │ │ -│ └────────────┘ └────────────┘ └────────────┘ └──────────┘ │ -├─────────────────────────────────────────────────────────────────┤ -│ State Layer (.planning/) │ -│ config.json — persistent settings │ -│ ROADMAP.md, STATE.md — planning documents │ -└─────────────────────────────────────────────────────────────────┘ -``` - -### Component Responsibilities - -| Component | Responsibility | Key Functions | -|-----------|----------------|---------------| -| `core.cjs` | Config loading, model resolution, shared utilities | `loadConfig()`, 
`resolveModelInternal()` | -| `config.cjs` | Config CRUD (read/write/validate) | `cmdConfigSet()`, `cmdConfigGet()`, `cmdConfigEnsureSection()` | -| `commands.cjs` | Standalone utility commands | `cmdResolveModel()` (the broken one), `cmdCommit()`, etc. | -| `init.cjs` | Pre-computed INIT JSON for each workflow type | `cmdInitPlanPhase()`, `cmdInitExecutePhase()`, `cmdInitProgress()` | -| `auto-discuss.md` | Synthetic phase context via N-agent debate | Reads `autopilot.discuss_agents`, spawns agent panel | -| `autopilot.md` | Full pipeline orchestration across phases | Sets/clears `workflow.auto_advance` in config | -| `plan-phase.md` / `execute-phase.md` | Per-phase orchestration | Read `workflow.auto_advance` via `config-get` | - ---- - -## Fix 1: Auto-Advance Runtime Flag - -### Current Behavior (the problem) - -`autopilot.md` step `ensure_auto_advance` runs: - -```bash -node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-set workflow.auto_advance true -``` - -This persists the flag to `.planning/config.json`. Two consequences: - -1. If autopilot crashes or is interrupted, `workflow.auto_advance` stays `true` in config.json permanently. The next manual `plan-phase` or `execute-phase` invocation auto-advances without the user asking for it. -2. The `config-set` writes config.json, which gets committed to git (if `commit_docs: true`). The flag then appears in git history as a persistent config change, not an ephemeral session flag. 
- -`plan-phase.md` and `execute-phase.md` check auto-advance at runtime via: - -```bash -AUTO_CFG=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get workflow.auto_advance 2>/dev/null || echo "false") -``` - -`discuss-phase.md` also checks and conditionally sets it: - -```bash -# If --auto flag present AND AUTO_CFG is not true: persist to config -node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-set workflow.auto_advance true -``` - -### Where Auto-Advance is Currently Read - -These four workflows check `workflow.auto_advance`: - -- `plan-phase.md` — Step 14: spawns execute-phase if true -- `execute-phase.md` — checkpoint handler: skips pause if true; Step (transition): chains verify if true -- `discuss-phase.md` — final step: chains plan-phase if true -- `transition.md` — milestone boundary: clears it - -All four use the same `config-get workflow.auto_advance` pattern. - -### Option A: Environment Variable (RECOMMENDED) - -**Mechanism:** Pass `GSD_AUTO_ADVANCE=true` in the orchestrator context. Each subprocess inherits it. 
- -**How autopilot.md would set it:** -```bash -# In the bash environment of the autopilot orchestrator session -export GSD_AUTO_ADVANCE=true -``` - -**How plan-phase.md / execute-phase.md would read it:** -```bash -AUTO_CFG="${GSD_AUTO_ADVANCE:-false}" -``` - -**Pros:** -- Zero filesystem writes — no config.json mutation -- Automatic cleanup — env var dies when the terminal session ends -- No git pollution — nothing to commit -- No reset step needed at milestone boundary - -**Cons:** -- Environment variable must be exported by the orchestrator, not just set locally -- If Claude Code subagents don't inherit the parent environment (they typically do via `execSync`), this breaks -- Requires changing the check pattern in plan-phase.md, execute-phase.md, and discuss-phase.md - -**Confidence:** MEDIUM — Claude Code's Task() subagents do inherit environment variables from the parent process, but this behavior should be verified before relying on it. - -### Option B: Temp File / Session File - -**Mechanism:** Write a session marker file, e.g., `.planning/.autopilot-session`. Workflows check for file existence instead of a config key. 
- -**How autopilot.md would set it:** -```bash -touch .planning/.autopilot-session -``` - -**How plan-phase.md / execute-phase.md would check it:** -```bash -AUTO_CFG=$([[ -f .planning/.autopilot-session ]] && echo "true" || echo "false") -``` - -Or via gsd-tools: -```bash -AUTO_CFG=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs verify-path-exists .planning/.autopilot-session --raw) -``` - -**How autopilot.md clears it at milestone boundary:** -```bash -rm -f .planning/.autopilot-session -``` - -**Pros:** -- Survives context compaction (unlike env var) — if a subagent loses environment, the file persists -- Easy to check, create, delete — no JSON parsing -- No config.json mutation -- Gitignored naturally if `.planning/` is gitignored, or add `/.planning/.autopilot-session` to `.gitignore` - -**Cons:** -- File must be cleaned up on crash/interrupt — if autopilot crashes, the marker file persists until manually deleted -- Requires a new `verify-path-exists` call or shell `[[ -f ]]` check in each workflow -- Slightly less obvious than a config flag - -**Confidence:** HIGH — straightforward file-based flag, no new dependencies, consistent with how GSD uses the filesystem for state. - -### Option C: `--auto` Flag Passed Explicitly (No Persistence) - -**Mechanism:** autopilot.md passes `--auto` to every phase chain invocation. No config.json mutation. No env var. No file. - -**How autopilot.md would pass it:** -``` -Task( - prompt="... ARGUMENTS='${PHASE} --auto' ..." -) -``` - -**Plan-phase.md already checks `--auto`** in its auto-advance step (Step 14). It then passes `--auto` to execute-phase via ARGUMENTS. The entire chain already supports `--auto` flag propagation. 
- -**Pros:** -- No state at all — flag lives only in the Task() prompt -- No cleanup required — the flag dies with the subagent -- No config mutation -- Consistent with existing `--auto` flag support throughout the chain - -**Cons:** -- autopilot.md already passes `ARGUMENTS='${PHASE} --auto'` in `run_phase_chain` — but intermediate workflows (discuss-phase) may not propagate it further -- Harder to trace "why is auto-advance happening" when debugging — no visible state - -**Confidence:** HIGH — this is the cleanest approach architecturally, and the existing `--auto` flag infrastructure already exists throughout the chain. - -### Recommendation: Option C (Pass --auto Flag) - -The autopilot.md already passes `--auto` via `ARGUMENTS='${PHASE} --auto'` in the `run_phase_chain` step. The fix is: - -1. Remove the `config-set workflow.auto_advance true` from `autopilot.md` step `ensure_auto_advance` -2. Remove the `config-set workflow.auto_advance false` from `autopilot.md` step `milestone_complete` -3. Remove the `config-set workflow.auto_advance true` persistence from `discuss-phase.md` (direct `--auto` flag usage doesn't need to write config) -4. Keep the `--auto` flag check in plan-phase.md, execute-phase.md, discuss-phase.md as-is -5. 
Keep `workflow.auto_advance` in config.json as a **user-settable persistent preference** (not autopilot-managed) — this allows users who always want auto-advance to set it once - -**Where to look for edge cases:** -- `new-project.md` also sets `workflow.auto_advance true` — check if this is intentional (user chose autopilot in project wizard) or should also be removed -- `transition.md` resets `workflow.auto_advance` to `false` — this can stay (clears user's persistent preference at milestone boundary, which is appropriate) or be removed if auto-advance becomes fully driven by the `--auto` flag - -**Files affected (Option C):** - -| File | Change | Type | -|------|--------|-------| -| `get-shit-done/workflows/autopilot.md` | Remove two `config-set workflow.auto_advance` calls | Modify (workflow) | -| `get-shit-done/workflows/discuss-phase.md` | Remove conditional `config-set workflow.auto_advance true` persistence | Modify (workflow) | -| `get-shit-done/workflows/new-project.md` | Evaluate and possibly remove `config-set workflow.auto_advance true` | Modify (workflow) | - -No JavaScript module changes needed for Option C. - ---- - -## Fix 2: discuss_agents Runtime Validation - -### Current Behavior (the problem) - -`auto-discuss.md` reads `autopilot.discuss_agents` with a fallback: - -```bash -AGENT_COUNT=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get autopilot.discuss_agents 2>/dev/null || echo "5") -``` - -The `|| echo "5"` fallback handles the case where the key is missing. But it does NOT validate: -- Whether the value is a valid odd number (3, 5, 7, 9) -- Whether the value is within the allowed range -- Whether `config.autopilot` section exists but has an invalid type for `discuss_agents` - -`cmdConfigSet` in `config.cjs` validates at write time (odd, 3-9). But if someone manually edits `config.json` or if a future code path sets an invalid value, `auto-discuss.md` gets a bad AGENT_COUNT and spawns the wrong number of agents.
- -### Where Validation Should Live - -**Option A: In auto-discuss.md (workflow-side validation)** - -After reading AGENT_COUNT, add a shell check: - -```bash -# Validate AGENT_COUNT is one of: 3, 5, 7, 9 -case "$AGENT_COUNT" in - 3|5|7|9) ;; # valid - *) echo "Error: discuss_agents must be 3, 5, 7, or 9. Got: $AGENT_COUNT. Fix in .planning/config.json"; exit 1;; -esac -``` - -**Pros:** Catches bad values at execution time before spawning agents. No module changes. Fast fix. -**Cons:** Validation logic is in a markdown workflow file, not testable. - -**Option B: Add a dedicated `cmdConfigValidateAutopilot` to config.cjs** - -Create a new gsd-tools command `config validate autopilot` that reads and validates the autopilot config section, returning errors or the validated values. - -**Pros:** Testable via Node.js tests. Centralized. Could be called by multiple workflows. -**Cons:** More code, new command in gsd-tools.cjs router, more test surface. - -**Option C: Make `config-get` return a structured validation response** - -Modify `cmdConfigGet` to accept a `--validate` flag that checks the value against known constraints. - -**Pros:** Reusable validation pattern. -**Cons:** Changes existing API of `config-get` which is widely used. - -### Recommendation: Option A for this PR fix - -The reviewer feedback is about a specific validation gap, not a request to redesign config validation. Add an inline shell validation check in `auto-discuss.md` after the config-get. This is: -- A minimal fix that directly addresses the reported issue -- No module changes = no test changes = smaller PR scope -- Consistent with how other workflows handle bad config values (they error out with a message) - -**Additional fix needed:** Validate that `AGENT_COUNT` is actually a number, not a string. The `config-get --raw` returns the raw value, which could be `"null"` or `"undefined"` if the key is missing and the command fails. 
- -**Files affected:** - -| File | Change | Type | -|------|--------|-------| -| `get-shit-done/workflows/auto-discuss.md` | Add AGENT_COUNT validation after config-get | Modify (workflow) | - -No JavaScript module changes needed. - ---- - -## Fix 3: model_overrides Documentation / Alignment - -### The Divergence - -Two functions in the codebase handle model resolution, and they behave differently: - -**`resolveModelInternal` in `core.cjs` (lines 344-359):** -```javascript -function resolveModelInternal(cwd, agentType) { - const config = loadConfig(cwd); - - // Check per-agent override FIRST - const override = config.model_overrides?.[agentType]; - if (override) { - return override === 'opus' ? 'inherit' : override; - } - - // Fall back to profile lookup - const profile = config.model_profile || 'balanced'; - const agentModels = MODEL_PROFILES[agentType]; - ... -} -``` - -**`cmdResolveModel` in `commands.cjs` (lines 200-219):** -```javascript -function cmdResolveModel(cwd, agentType, raw) { - const config = loadConfig(cwd); - const profile = config.model_profile || 'balanced'; - - // NO model_overrides check — goes straight to profile - const agentModels = MODEL_PROFILES[agentType]; - ... -} -``` - -`resolveModelInternal` is called by `init.cjs` (in `cmdInitPlanPhase`, `cmdInitExecutePhase`, etc.) to populate INIT JSON with per-agent models. `cmdResolveModel` is the CLI-facing `resolve-model` command that workflows call directly. - -### Impact - -Workflows that call `resolve-model` CLI directly (bypassing init.cjs) will NOT honor `model_overrides`. Workflows that use INIT JSON (the majority) are intended to honor them because init.cjs uses `resolveModelInternal` — but, as "The Real Problem" below explains, that override check currently never fires either. - -The `model_overrides` feature is documented in `get-shit-done/references/model-profiles.md` and exists in the config template.
But `loadConfig` in `core.cjs` does NOT include `model_overrides` in its return object — and `resolveModelInternal` reads `config.model_overrides?.[agentType]` from that normalized return object, not from the raw parsed JSON. - -### The Real Problem - -`loadConfig` returns a normalized object with known keys. `model_overrides` is NOT one of those keys: - -```javascript -// loadConfig return object (lines 95-107) — model_overrides missing! -return { - model_profile: ..., - commit_docs: ..., - // ... - brave_search: ..., - // NO model_overrides here -}; -``` - -But `resolveModelInternal` calls `loadConfig` then accesses `config.model_overrides`. Since `loadConfig` drops unknown keys, `config.model_overrides` is always `undefined`. The override check silently no-ops for everyone. - -This is a bug: `model_overrides` is documented but never actually applied. - -### Fix Options - -**Option A: Add `model_overrides` to `loadConfig` return** - -```javascript -// In core.cjs loadConfig: -return { - // ...existing fields - model_overrides: get('model_overrides') ?? {}, -}; -``` - -This makes the feature work as documented. `resolveModelInternal` and `cmdResolveModel` both need to check `config.model_overrides` (cmdResolveModel still needs to be updated too). - -**Option B: Remove `model_overrides` from documentation and `resolveModelInternal`** - -If the feature is premature, remove the dead code path and the documentation. Simplifies the codebase. - -**Option C: Document it as "experimental / not yet wired"** - -Add a note to `model-profiles.md` that `model_overrides` is not yet active. Defer the fix. - -### Recommendation: Option A (Fix the wiring) - -The feature is already documented in user-facing references (`model-profiles.md`), implemented in `resolveModelInternal`, and mentioned in the config schema. The only gap is that `loadConfig` drops it and `cmdResolveModel` ignores it.
These are one-line fixes: - -**Change 1: `core.cjs` loadConfig return** -```javascript -model_overrides: parsed.model_overrides ?? {}, -``` - -**Change 2: `commands.cjs` cmdResolveModel** -```javascript -function cmdResolveModel(cwd, agentType, raw) { - if (!agentType) { - error('agent-type required'); - } - // Delegate to resolveModelInternal to ensure model_overrides are honored - const model = resolveModelInternal(cwd, agentType); - const config = loadConfig(cwd); - const profile = config.model_profile || 'balanced'; - const unknownAgent = !MODEL_PROFILES[agentType]; - const result = { model, profile, ...(unknownAgent ? { unknown_agent: true } : {}) }; - output(result, raw, model); -} -``` - -This makes `cmdResolveModel` use `resolveModelInternal`, eliminating the divergence. - -**Files affected:** - -| File | Change | Type | -|------|--------|-------| -| `get-shit-done/bin/lib/core.cjs` | Add `model_overrides` to `loadConfig` return | Modify (module, needs tests) | -| `get-shit-done/bin/lib/commands.cjs` | Refactor `cmdResolveModel` to delegate to `resolveModelInternal` | Modify (module, needs tests) | -| `tests/commands.test.cjs` | Add tests for `model_overrides` honored by `resolve-model` CLI | Modify (test) | -| `tests/core.test.cjs` or `tests/commands.test.cjs` | Add tests for `loadConfig` returning `model_overrides` | Modify (test) | - ---- - -## PR Split Architecture - -### Separation Logic - -The reviewer requested splitting PR #762. The three fixes have distinct dependencies: - -**Fix 1 (auto-advance):** Workflow-only changes. No module code. No test changes. Pure markdown edits. - -**Fix 2 (discuss_agents):** Workflow-only change. No module code. No test changes. Pure markdown edits. - -**Fix 3 (model_overrides):** Module code changes + test changes. Touches `core.cjs` and `commands.cjs`, which already have test files. 
- -Additionally, the original PR #762 includes: -- Tests and CI changes (from v1.1 work) -- `.planning/` artifacts that should be removed from the PR branch -- The resolve-model fix that overlaps with PR #761 (closed) - -### Recommended PR Split - -**PR A: Workflow Fixes (autopilot, auto-discuss)** - -Files: -``` -get-shit-done/workflows/autopilot.md # Remove config-set auto_advance calls -get-shit-done/workflows/discuss-phase.md # Remove conditional config-set persistence -get-shit-done/workflows/auto-discuss.md # Add AGENT_COUNT validation -``` - -No module changes. No test changes. No risk to CI. Can be reviewed in isolation. - -**PR B: resolve-model / model_overrides Fix** - -Files: -``` -get-shit-done/bin/lib/core.cjs # Add model_overrides to loadConfig -get-shit-done/bin/lib/commands.cjs # Refactor cmdResolveModel -tests/commands.test.cjs # New tests for model_overrides -``` - -This is the module fix. Needs to be coordinated with any remaining resolve-model changes from PR #761 context. - -**PR C: Tests + CI (from original PR #762)** - -This was the bulk of the original PR: test files and CI configuration. Should be reviewed independently of the autopilot feature code. Remove the `.planning/` artifact files before submitting. - -### Build Order - -``` -PR C (tests/CI) ─────────────────────────────────────> merge (no conflicts) -PR A (workflows) ────────────────────────────────────── merge (no conflicts with C) -PR B (modules) ─── depends on: no conflicts with A/C ─> merge last -``` - -PR A and PR C have no file overlap and can be submitted and merged in any order. PR B touches `core.cjs` and `commands.cjs` — verify no conflicts with PR #761 changes (PR #761 closed but the fix may have landed or been incorporated). 
- ---- - -## Recommended Project Structure (Unchanged) - -The existing structure handles these fixes without new directories: - -``` -get-shit-done/ -├── bin/lib/ -│ ├── core.cjs # model_overrides fix (loadConfig + resolveModelInternal) -│ └── commands.cjs # cmdResolveModel refactor -├── workflows/ -│ ├── autopilot.md # remove config-set calls -│ ├── discuss-phase.md # remove conditional config-set -│ └── auto-discuss.md # add AGENT_COUNT validation -└── tests/ - └── commands.test.cjs # new model_overrides tests -``` - -No new files needed. All three fixes are modifications to existing files. - ---- - -## Data Flow - -### Auto-Advance (After Fix) - -``` -User: /gsd:autopilot 3-7 - | - v -autopilot.md (orchestrator) - | - ├── No config-set (flag not persisted) - | - +--> Task(plan-phase.md, ARGUMENTS='3 --auto') - | - v - plan-phase.md reads --auto from ARGUMENTS - AUTO = true (flag only, no config read) - | - v - Execute → Verify → Transition - (each step receives --auto via ARGUMENTS propagation) -``` - -### Model Resolution (After Fix) - -``` -Workflow or agent calls: resolve-model gsd-executor - | - v -cmdResolveModel → resolveModelInternal(cwd, 'gsd-executor') - | - v -loadConfig(cwd) → returns { model_profile, model_overrides, ... 
} - | - ├── check config.model_overrides['gsd-executor'] - │ | - │ ├── found: return override value (sonnet/haiku/inherit) - │ └── not found: fall through to profile lookup - | - v -MODEL_PROFILES['gsd-executor'][profile] - | - v -return model string -``` - -### discuss_agents Validation (After Fix) - -``` -auto-discuss.md initialize step: - | - v -AGENT_COUNT=$(config-get autopilot.discuss_agents 2>/dev/null || echo "5") - | - v -[Validate: AGENT_COUNT must be 3, 5, 7, or 9] - | - ├── invalid: error, stop workflow, tell user to fix config.json - └── valid: continue to spawn_debate step -``` - ---- - -## Integration Points - -### Internal Boundaries - -| Boundary | Communication | Notes | -|----------|---------------|-------| -| `autopilot.md` → `plan-phase.md` | `--auto` flag in ARGUMENTS | After fix: no config.json writes | -| `plan-phase.md` → `execute-phase.md` | `--auto` flag propagated via ARGUMENTS | Already works | -| `auto-discuss.md` → config | `config-get autopilot.discuss_agents` | Needs validation guard after read | -| `cmdResolveModel` → `resolveModelInternal` | Direct call (after fix) | Eliminates divergence | -| `loadConfig` → callers | Returns normalized config object | `model_overrides` added to return | -| `init.cjs` INIT JSON → workflows | Pre-computed flags + models | Already uses `resolveModelInternal`, benefits from fix automatically | - -### External Services - -None. All three fixes are internal — filesystem, config, and in-process function calls only. - ---- - -## Anti-Patterns to Avoid - -### Anti-Pattern 1: Using config.json as Session State - -**What:** Writing `workflow.auto_advance true` to config.json during autopilot execution. -**Why bad:** Config.json is user-visible persistent settings. Session flags in config.json persist across invocations, survive crashes, and get committed to git history. -**Instead:** Use the `--auto` flag mechanism that already exists throughout the chain. 
- -### Anti-Pattern 2: Duplicating Resolution Logic - -**What:** Having `cmdResolveModel` reimplement resolution logic that `resolveModelInternal` already handles. -**Why bad:** Two code paths can diverge. `model_overrides` is an example of this happening — `resolveModelInternal` checks it, `cmdResolveModel` doesn't. -**Instead:** `cmdResolveModel` should delegate to `resolveModelInternal` rather than reimplementing the logic. - -### Anti-Pattern 3: Write-Time-Only Validation (No Check at Read Time) - -**What:** Validating `discuss_agents` only at write time (`cmdConfigSet`) but not at read time in the workflow. -**Why bad:** Users can edit config.json directly. Values can arrive invalid. The workflow silently uses a bad value. -**Instead:** Validate at the point of use. The workflow that reads `discuss_agents` should check the value is valid before acting on it. - -### Anti-Pattern 4: Silent Fallback Masking Config Errors - -**What:** `AGENT_COUNT=$(config-get ... 2>/dev/null || echo "5")` — the `2>/dev/null` and `|| echo "5"` hide errors. -**Why bad:** If `config-get` fails for a legitimate reason (corrupt config, wrong key type), the workflow silently proceeds with the fallback value. The user has no idea their config is broken. -**Instead:** Keep the fallback for the "key not set" case, but add explicit validation of the returned value. 
- ---- - -## Sources - -All findings from direct codebase analysis (HIGH confidence): - -- `/Users/annon/projects/get-shit-done/get-shit-done/bin/lib/core.cjs` — `loadConfig`, `resolveModelInternal` (lines 60-111, 344-359) -- `/Users/annon/projects/get-shit-done/get-shit-done/bin/lib/commands.cjs` — `cmdResolveModel` (lines 200-219) -- `/Users/annon/projects/get-shit-done/get-shit-done/bin/lib/config.cjs` — `cmdConfigSet` validation (lines 105-110) -- `/Users/annon/projects/get-shit-done/get-shit-done/bin/lib/init.cjs` — `discuss_agents` in INIT JSON (line 666) -- `/Users/annon/projects/get-shit-done/get-shit-done/workflows/autopilot.md` — config-set calls (lines 51, 233) -- `/Users/annon/projects/get-shit-done/get-shit-done/workflows/auto-discuss.md` — AGENT_COUNT read (lines 30-32) -- `/Users/annon/projects/get-shit-done/get-shit-done/workflows/plan-phase.md` — auto_advance check (lines 444-446) -- `/Users/annon/projects/get-shit-done/get-shit-done/workflows/execute-phase.md` — auto_advance check (lines 184, 408-410) -- `/Users/annon/projects/get-shit-done/get-shit-done/workflows/discuss-phase.md` — auto_advance set/check (lines 444-451) -- `/Users/annon/projects/get-shit-done/get-shit-done/references/model-profiles.md` — model_overrides documentation -- `/Users/annon/projects/get-shit-done/.planning/codebase/ARCHITECTURE.md` — system layer analysis - ---- - -*Architecture research for: PR #762 fix integration* -*Researched: 2026-02-28* diff --git a/.planning/research/FEATURES.md b/.planning/research/FEATURES.md deleted file mode 100644 index b159b45d1e..0000000000 --- a/.planning/research/FEATURES.md +++ /dev/null @@ -1,285 +0,0 @@ -# Feature Research: PR Review Fixes - -**Domain:** OSS contributor workflow — addressing reviewer feedback on autopilot mode PR -**Researched:** 2026-02-28 -**Confidence:** HIGH - -## Context - -This research addresses four discrete fix areas from reviewer feedback on PR #762 (autopilot mode). 
The PR was flagged for scope creep (5 distinct efforts bundled together), a config mutation bug, missing validation, and undocumented config. Research below maps each fix area to table stakes vs differentiators, with complexity and dependency notes. - ---- - -## Fix Area 1: Runtime Flags vs Config File Mutation - -### The Problem - -`autopilot.md` calls `config-set workflow.auto_advance true` to enable auto-advance for the duration of the autopilot run. This mutates `.planning/config.json` persistently. If autopilot is interrupted (crash, kill, user cancel), the `milestone_complete` cleanup step never fires, and `auto_advance` stays `true` in the user's config file permanently. The reviewer correctly identified this as a correctness bug. - -### Table Stakes (Must Fix) - -| Feature | Why Expected | Complexity | Notes | -|---------|--------------|------------|-------| -| Auto-advance enabled only for autopilot session | Users expect that running `/gsd:autopilot` does not permanently change their config | LOW | Industry standard: CLI flags are session-scoped; config files are persistent preferences. npm, git, cargo, kubectl all follow this pattern. 
| -| Cleanup idempotency | If autopilot stops for any reason (gap found, checkpoint, crash), config must not be left in a mutated state | LOW | Session flag eliminates the cleanup problem entirely — no cleanup needed if nothing was mutated | -| No regression for manual `auto_advance` config | Users who have `workflow.auto_advance: true` in their config file manually must see no behavior change | LOW | Session flag is additive — it passes `--auto` argument or reads an in-memory flag, not touching disk config | - -### Differentiators (Nice to Have) - -| Feature | Value Proposition | Complexity | Notes | -|---------|-------------------|------------|-------| -| `--no-auto` flag to disable auto-advance per-invocation | Allows users with persistent `auto_advance: true` to run a single manual phase | LOW | Inverse of the runtime flag pattern | -| Explicit autopilot mode banner showing active runtime overrides | User sees what config overrides are active for this run | LOW | Transparency over magic | - -### Anti-Features - -| Anti-Feature | Why Avoid | What to Do Instead | -|--------------|-----------|-------------------| -| Writing session state to config.json | Leaks ephemeral state into persistent user preferences; impossible to clean up on crash | Pass `--auto` as argument to the plan-phase subagent Task call instead of persisting to config | -| Using a `.lock` file as session marker | Adds complexity and still leaves cleanup problem | Argument-based activation has no cleanup problem | - -### Implementation Pattern - -The standard pattern across CLI tooling (npm, git, curl): command-line arguments override config values for that invocation only. Config files store user preferences; flags store session intent. - -For GSD: remove the `config-set workflow.auto_advance true` step from `autopilot.md`. Instead, the `run_phase_chain` step passes `ARGUMENTS='${PHASE} --auto'` to plan-phase. 
The plan-phase workflow already has an `auto_advance` check that reads config — extend it to also check for `--auto` in arguments. No config mutation, no cleanup needed. - -**Dependency:** Requires reading `--auto` flag in `plan-phase.md`'s auto_advance logic. Low touch — plan-phase already has this branching. - ---- - -## Fix Area 2: Input Validation for Config Values at Runtime - -### The Problem - -Two validation gaps identified in PR review: - -1. `config.cjs:cmdConfigSet` validates `discuss_agents` and `discuss_model` only when invoked via the `config-set` CLI command. Direct edits to `config.json` bypass this entirely. -2. `auto-discuss.md` reads `AGENT_COUNT` from config and uses it directly in agent spawning logic without validating the value it received. - -### Table Stakes (Must Fix) - -| Feature | Why Expected | Complexity | Notes | -|---------|--------------|------------|-------| -| Runtime validation of `discuss_agents` in auto-discuss | The workflow must not spawn 0, 2, 4, 6, 8, or 10+ agents just because config.json has a bad value | LOW | Standard defensive pattern: validate at consumption point, not only at write point | -| Fallback to default on invalid config value | Invalid `discuss_agents` (non-odd, out-of-range) should fall back to 5, not crash or spawn wrong count | LOW | Same pattern as the existing `AGENT_COUNT=$(... 
2>/dev/null \|\| echo "5")` fallback already in auto-discuss.md — extend it | -| Validation message when falling back | User or caller knows a fallback occurred, not silently swallowing bad config | LOW | Print a warning to stderr: "discuss_agents=4 is invalid (must be odd 3-9), using 5" | - -### Differentiators (Nice to Have) - -| Feature | Value Proposition | Complexity | Notes | -|---------|-------------------|------------|-------| -| `config-validate` CLI command | Validates all config values against schema on demand | MEDIUM | Useful but separable — not needed for this PR fix | -| JSON schema for `config.json` with validation on load | Catch all invalid values at config load time | MEDIUM | Adds robustness but is a larger change touching `loadConfig()` in core.cjs | -| Startup validation warning for unknown config keys | Warns user that `model_overrides` or other undocumented keys are present | LOW | Complements documentation fix (Fix Area 3) | - -### Anti-Features - -| Anti-Feature | Why Avoid | What to Do Instead | -|--------------|-----------|-------------------| -| Validating only at `config-set` write time | Direct JSON edits bypass CLI; any agent reading config cannot trust values it receives | Validate at consumption point in the workflow | -| Crashing on invalid config | Breaks autopilot entirely for a recoverable problem | Fall back to documented default with a warning | - -### Implementation Pattern - -The OWASP Input Validation Cheat Sheet (2025) recommends allowlist validation: define exactly what is allowed, reject everything else. For `discuss_agents`: - -```bash -# In auto-discuss.md initialize step -AGENT_COUNT=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs config-get autopilot.discuss_agents 2>/dev/null || echo "5") -# Validate: must be odd number 3-9 -case "$AGENT_COUNT" in - 3|5|7|9) ;; # valid - *) echo "Warning: discuss_agents=$AGENT_COUNT invalid (must be 3/5/7/9). Using 5." 
>&2; AGENT_COUNT=5 ;; -esac -``` - -**Dependency:** Self-contained change to `auto-discuss.md`. No changes to `config.cjs` required for this fix. The existing validation in `cmdConfigSet` is a separate concern (prevents bad values from being written via CLI) and is fine as-is for this milestone. - ---- - -## Fix Area 3: Config Documentation Best Practices - -### The Problem - -Two documentation gaps: - -1. `model_overrides` was added to `loadConfig()` in `core.cjs` with no validation, no documentation, and no tests. Reviewer flagged it as undocumented arbitrary JSON passthrough. -2. The `autopilot` config section (`discuss_agents`, `discuss_model`) is added to defaults in `config.cjs` but not documented anywhere users can discover it. - -### Table Stakes (Must Fix) - -| Feature | Why Expected | Complexity | Notes | -|---------|--------------|------------|-------| -| Every config key documented in one place | OSS users expect a single authoritative source for all config options | LOW | Industry convention: `README.md` config section, or `docs/config.md` — one file, all keys, types, defaults, description | -| `model_overrides` documented or removed from this PR | If it has no tests and no documentation, it should not be in the PR | LOW | Remove from `loadConfig()` if premature; if intentional, add docs + at least one test for it | -| New `autopilot.*` keys listed with types, defaults, valid values | Users configuring autopilot need to know the valid range for `discuss_agents` | LOW | Short addition to existing config docs section in README | - -### Differentiators (Nice to Have) - -| Feature | Value Proposition | Complexity | Notes | -|---------|-------------------|------------|-------| -| Inline comments in the generated `config.json` defaults | Users see documentation when they open their config file | LOW | JSON doesn't support comments natively, but the `cmdConfigEnsureSection` output could include a header comment block as a separate file or README | -| 
`config-docs` CLI command | Prints all config options with descriptions | MEDIUM | Nice for discoverability, separable from PR fix | -| CHANGELOG.md entry for new config keys | Maintains project-level history of config schema evolution | LOW | Keep A Changelog pattern: add to `[Unreleased]` section under `### Added` | - -### Anti-Features - -| Anti-Feature | Why Avoid | What to Do Instead | -|--------------|-----------|-------------------| -| Documenting config in PR description only | PR descriptions are not part of the codebase; users won't find them | Put docs in README or a dedicated config reference file | -| Adding config options without tests | Untestable config options become technical debt | Pair each new config key with at least one test in `config.test.cjs` | - -### Documentation Pattern (OSS Standard) - -The Keep a Changelog specification and GitHub's own OSS project guidance both establish: every new configuration option added in a release should appear in (a) the changelog under `Added`, (b) the README config reference, and (c) if it has validation rules, those rules are tested. - -For `model_overrides` specifically: the OSS principle is "don't ship what you can't support." If it has no validation, no tests, and no documentation, it should be either removed from this PR or explicitly scoped to a follow-up PR with a `TODO:` comment and a failing test marking it as incomplete. - -**Dependency:** Purely additive changes to README.md and optionally CHANGELOG.md. No source code changes required for basic documentation fix. For `model_overrides`: either a one-line revert in `core.cjs` or a test addition in the test suite. - ---- - -## Fix Area 4: PR Splitting Strategies - -### The Problem - -PR #762 bundles 5 distinct efforts into one 8,179-line addition PR: -1. Test suite overhaul (~6,370 lines across 13 files) -2. CI pipeline (~50 lines) -3. Autopilot feature (~620 lines: `autopilot.md`, `auto-discuss.md`, config changes) -4. 
Resolve-model fix (overlaps with PR #761) -5. `model_overrides` config loading (undocumented, untested) - -### Table Stakes (Must Fix) - -| Feature | Why Expected | Complexity | Notes | -|---------|--------------|------------|-------| -| Test suite + CI as standalone PR | Tests and CI can be reviewed and merged independently; reviewer can verify tests pass without the autopilot feature | LOW | No dependencies on autopilot code — pure test infrastructure | -| Resolve-model fix coordinated with PR #761 | Two PRs fixing the same function will conflict on merge | LOW | One approach: rebase #762's resolve-model change on top of #761 after #761 lands, then drop it from this PR | -| Autopilot feature as focused PR | The actual new feature with only its direct dependencies | LOW | Once tests+CI and resolve-model are extracted, the autopilot PR becomes ~670 lines | -| `.planning/` artifacts removed from branch | Development artifacts (STATE.md, PLAN.md, SUMMARY.md referencing contributor filesystem paths) do not belong in the repo | LOW | `git rm .planning/STATE.md .planning/quick/` from the PR branch | - -### Differentiators (Nice to Have) - -| Feature | Value Proposition | Complexity | Notes | -|---------|-------------------|------------|-------| -| Stacked PR approach for future large features | Autopilot + MoE panels will be large; establish a workflow now | MEDIUM | Tools: `git rebase --update-refs`, Graphite, or manual stacking | -| PR template enforcing size and scope checklist | Prevents future scope creep in submissions | LOW | Separable from this milestone's fixes | - -### Anti-Features - -| Anti-Feature | Why Avoid | What to Do Instead | -|--------------|-----------|-------------------| -| Squashing everything into one commit before splitting | Loses granular history, makes bisect harder | Use `git cherry-pick` or `git rebase -i` to move commits to new branches | -| Creating split PRs that target main directly without stacking | If PR A depends on PR B, merging 
order matters; targeting main with dependent PRs risks broken states | Stack PRs on each other with clear dependency labels in PR body | - -### Recommended Split Order - -Based on the dependency graph: - -``` -PR A: tests + CI (no dependencies) - └─> PR B: resolve-model fix (depends on: rebase after #761 lands) - └─> PR C: autopilot feature (depends on: config keys from core, auto-discuss from A) -``` - -**Rationale:** Tests+CI can land immediately — it's the least risky and validates the CI setup itself. Resolve-model must coordinate with #761 to avoid conflicts. Autopilot should be last because it depends on the config infrastructure and the resolve-model fix being in main. - -**Dependency on `model_overrides`:** Remove from PR C (autopilot) unless it has documentation and tests. If needed for autopilot, scope it explicitly with validation. - ---- - -## Feature Dependencies - -``` -Fix Area 1 (runtime flag for auto-advance) - └──modifies──> autopilot.md (remove config-set call) - └──modifies──> plan-phase.md (read --auto argument) - -Fix Area 2 (input validation for discuss_agents) - └──modifies──> auto-discuss.md (validate AGENT_COUNT at read time) - └──depends on──> Fix Area 1 (same PR: autopilot feature) - -Fix Area 3 (config documentation) - └──adds──> README.md (config reference section) - └──optionally modifies──> CHANGELOG.md (unreleased section) - └──optionally reverts──> core.cjs (remove model_overrides if premature) - -Fix Area 4 (PR splitting) - └──precedes──> all other fix areas (structure work, not code work) - └──depends on──> git branch manipulation (cherry-pick or rebase) -``` - -### Dependency Notes - -- **Fix Areas 1 and 2 share a PR (autopilot feature):** They both touch autopilot.md and auto-discuss.md, so they belong together in the same focused PR. -- **Fix Area 3 can land in any PR:** Documentation for `model_overrides` is independent of the runtime flag fix. If `model_overrides` is removed, Fix Area 3 is just a README addition. 
-- **Fix Area 4 must happen first:** The PR split is the prerequisite for all other fixes to be reviewable as separate PRs. -- **Resolve-model fix (PR #761 coordination):** This is not strictly part of v1.3 code changes but is a PR management task. It should be tracked separately. - ---- - -## MVP Definition - -### Do Now (v1.3 — this milestone) - -- [x] Split PR: extract tests+CI, resolve-model, autopilot into separate PRs — **no code required, git branch work** -- [x] Remove `.planning/` artifacts from autopilot PR branch — **git rm** -- [x] Fix auto-advance config mutation — **remove 1 line from autopilot.md, add --auto to phase chain call** -- [x] Add runtime validation for `discuss_agents` in auto-discuss.md — **~5 lines of shell validation** -- [x] Document `autopilot.*` config keys in README — **~10 lines of docs** -- [x] Decide: remove `model_overrides` from loadConfig() or add tests+docs - -### Defer to Later - -- [ ] `config-validate` CLI command — useful but separable from PR review fixes -- [ ] JSON schema validation on config load — larger refactor, different milestone -- [ ] Stacked PR tooling setup — process improvement, not a code fix -- [ ] PR template for scope checklist — governance, not code - ---- - -## Feature Prioritization Matrix - -| Feature | User Value | Implementation Cost | Priority | -|---------|------------|---------------------|----------| -| Remove config mutation (auto-advance) | HIGH — prevents silent config corruption | LOW — remove 1 line, add --auto arg | P1 | -| Runtime validation for discuss_agents | HIGH — prevents autopilot spawning wrong agent count | LOW — 5 lines of shell validation | P1 | -| PR split (tests+CI separate) | HIGH — unblocks reviewer approval | LOW — git branch work only | P1 | -| Remove .planning/ artifacts | HIGH — removes contributor filesystem path leakage | LOW — git rm | P1 | -| Document autopilot.* config keys | MEDIUM — discoverability for users | LOW — README addition | P2 | -| Decide on 
model_overrides | MEDIUM — cleanliness of codebase | LOW — revert 1 line OR add tests | P2 | -| config-validate command | LOW — convenience | MEDIUM — new CLI command | P3 | -| Stacked PR workflow docs | LOW — process hygiene | LOW | P3 | - ---- - -## Sources - -### Runtime Flags vs Config Mutation (HIGH confidence) -- npm config precedence model: [npm-config docs](https://docs.npmjs.com/cli/v6/using-npm/config/) — CLI flags override config files, not the reverse -- node-config library: [Environment Variables wiki](https://github.com/node-config/node-config/wiki/Environment-Variables) — env vars override config files; config files store persistent preferences -- GSD codebase: `autopilot.md` lines 47-52 (`ensure_auto_advance` step) — the mutation bug is directly observable - -### Input Validation (HIGH confidence) -- OWASP Input Validation Cheat Sheet: [owasp.org](https://cheatsheetseries.owasp.org/cheatsheets/Input_Validation_Cheat_Sheet.html) — allowlist validation pattern; validate at consumption point -- GitHub security blog: [Validate all the things](https://github.blog/security/application-security/validate-all-things-input-validation/) — validate inputs before use, not only at write time -- GSD codebase: `auto-discuss.md` lines 30-32 — `AGENT_COUNT` read with fallback but no validation of the value received - -### Config Documentation (HIGH confidence) -- Keep a Changelog: [keepachangelog.com](https://keepachangelog.com/en/1.0.0/) — every new option in `### Added` under `[Unreleased]` -- Changelog best practices: [getbeamer.com](https://www.getbeamer.com/blog/11-best-practices-for-changelogs) — document breaking changes, categorize by type, link to additional material -- GSD codebase: `config.cjs` — `autopilot` section added in defaults but no README section or CHANGELOG entry - -### PR Splitting (HIGH confidence) -- Graphite PR size guide: [graphite.com](https://graphite.com/guides/best-practices-managing-pr-size) — under 200 lines ideal, atomic PRs, no 
mixed change types -- Stacked pull requests: [michaelagreiler.com](https://www.michaelagreiler.com/stacked-pull-requests/) — stack dependent PRs on each other rather than targeting main directly -- PR splitting strategies: [awesomecodereviews.com](https://www.awesomecodereviews.com/best-practices/stacked-pull-requests/) — separate refactors, features, tests into distinct PRs -- GitHub community discussion: [github.com/orgs/community](https://github.com/orgs/community/discussions/181240) — separation of concerns is the primary split criterion -- Git stacking with --update-refs: [andrewlock.net](https://andrewlock.net/working-with-stacked-branches-in-git-is-easier-with-update-refs/) — native git support for stacked branches without third-party tools - ---- - -*Feature research for: PR review fixes on autopilot mode (get-shit-done v1.3)* -*Researched: 2026-02-28* diff --git a/.planning/research/PITFALLS.md b/.planning/research/PITFALLS.md deleted file mode 100644 index 2aeaf9dd25..0000000000 --- a/.planning/research/PITFALLS.md +++ /dev/null @@ -1,238 +0,0 @@ -# Pitfalls Research - -**Domain:** PR splitting, runtime config flags, validation hardening, overlapping PRs, artifact cleanup -**Researched:** 2026-02-28 -**Confidence:** HIGH (derived from codebase analysis + documented PR state + common git/Node.js patterns) - ---- - -## Critical Pitfalls - ---- - -### Pitfall 1: Splitting a PR Leaves Commits Orphaned on the Original Branch - -**What goes wrong:** -When splitting PR #762 (autopilot) into focused PRs (tests+CI, resolve-model fix, autopilot feature), commits that belong to split-off branches are often left behind on the original branch. The split looks clean in `git log` on the new branch but the original branch still contains the commits, and when it is eventually merged or rebased, the commits appear twice — once from the split PR and once from the original. 
GitHub may show them as "already merged" in the PR diff, but if force-push or rebase is involved, commits can reappear unexpectedly. - -**Why it happens:** -Developers create a new branch from the original and cherry-pick the relevant commits, believing the branch is now "clean." They do not rebase the original branch to remove the cherry-picked commits. Now both branches contain the same logical changes but as distinct commit objects (different SHAs). When both branches target main, git sees distinct commits and applies both changes, potentially duplicating lines or creating conflicts. - -**How to avoid:** -1. Start the split from the base commit (where feat/autopilot diverged from main), not from the tip of feat/autopilot. -2. Create each focused branch from main: `git checkout -b fix/resolve-model main`. -3. Cherry-pick only the commits belonging to that PR's scope into the new branch. -4. For the original branch (autopilot), interactively rebase to remove commits that were split into other PRs: `git rebase -i main` on feat/autopilot, dropping the lines for cherry-picked commits. -5. Verify the split is clean: `git diff main...fix/resolve-model` should show only the resolve-model fix. `git diff main...feat/autopilot` should show no resolve-model changes. - -**Warning signs:** -- `git log main...feat/autopilot` shows commits that have also appeared in a merged PR -- PR #762's diff includes the resolve-model fix after PR #761 has merged (should be gone) -- GitHub reports "0 changed files" on a PR after another PR was merged (commits were already on base) - -**Phase to address:** Phase 14 — PR Restructure (first phase of v1.3). Must happen before any other fix work because subsequent phases add commits on top of a correct branch structure. 
- ---- - -### Pitfall 2: Runtime Flag Leaks Into Subsequent Sessions via Config Mutation - -**What goes wrong:** -The auto-advance feature (autopilot advancing phases without user confirmation) was implemented by mutating `config.json` to set `auto_advance: true`. The reviewer identified this as a bug: config mutation persists across sessions. If the flag is written to disk, a subsequent unrelated session reads the config, finds `auto_advance: true`, and auto-advances without the user expecting it. The user's "one-time" option becomes a permanent state change. - -**Why it happens:** -The natural pattern in Node.js CLI tools is to persist options by writing to the config file. Developers reach for `loadConfig()` + `fs.writeFileSync(configPath, JSON.stringify(...))` because it is how every other setting works in this codebase. The distinction between session-scoped flags (should not persist) and user preferences (should persist) is easy to miss. - -**How to avoid:** -1. Use an in-memory runtime flag only: add a module-level variable in the relevant workflow or pass it as a parameter through the call chain. Never write session-scoped flags to `config.json`. -2. For the gsd-tools.cjs CLI, pass the flag as a command-line argument or environment variable (`GSD_AUTO_ADVANCE=1`). The process reads it once at startup and it dies with the process. -3. Add a code comment at the flag's declaration: `// Runtime-only: never persist to config.json`. This makes intent explicit for future contributors. -4. If the config loading code ever re-reads `config.json` during a session, the session-scoped value must be held separately and merged after load: `const runtimeFlags = { auto_advance: cliArgs.autoAdvance }; const effective = { ...config, ...runtimeFlags }`. 
- -**Warning signs:** -- `config.json` is modified during a run that the user did not invoke as a settings change -- Tests that run in sequence pass individually but fail together (previous test left config state) -- `git diff` on `.planning/config.json` appears in test runs that should not touch config - -**Phase to address:** Phase 15 — Auto-Advance Fix. This is an isolated code change in `state.cjs` or the autopilot workflow. Low risk of cross-phase interference if done as its own PR. - ---- - -### Pitfall 3: Validation Too Strict Breaks Existing Callers of `discuss_agents` - -**What goes wrong:** -Adding runtime validation for `discuss_agents` in the auto-discuss workflow could break existing users who have configs or workflows that provide `discuss_agents` in an unexpected format. If the validator throws or exits on any unexpected value (instead of defaulting gracefully), users who previously worked fine now get hard errors. - -**Why it happens:** -Validation is usually added after a bug is discovered. The developer validates the exact case that caused the bug but over-constrains the input space. For example: validating that `discuss_agents` must be an array of strings also rejects a single string (which is a reasonable user shorthand), or rejects an array with empty strings (which might be valid as a "use default agent" signal), or rejects `undefined` (which was previously allowed as "use all agents"). - -**How to avoid:** -1. Before writing validation, enumerate all values that currently work (test existing configs and workflow invocations). The validation must accept all of them. -2. Prefer defensive coercion over rejection: if `discuss_agents` is a string, coerce to `[discuss_agents]`. If it is `null` or `undefined`, coerce to the default value. Only reject values that are structurally impossible to interpret. -3. 
The error message on rejection must tell the user exactly what to provide, not just what was wrong: "discuss_agents must be an array of agent names, got: `true`" is better than "invalid discuss_agents". -4. Add a test that passes the old config format (no `discuss_agents` key) and asserts the workflow still runs normally. Backwards compatibility test first, then add the validation. - -**Warning signs:** -- Validation added without a test for the pre-existing "no key" case -- Validation uses `=== undefined` check on a key that could also be `null`, `0`, or `false` -- No graceful default — the code throws instead of falling back - -**Phase to address:** Phase 16 — Validation Hardening. Scope: auto-discuss workflow only. Must not touch other callers of discuss_agents outside auto-discuss. - ---- - -### Pitfall 4: Coordinating With PR #761 (resolve-model fix) — Merge Order Creates Conflicts - -**What goes wrong:** -PR #761 and PR #762 both touch `resolve-model` logic. If both PRs are open simultaneously targeting main and one merges first, the other PR's diff now shows a conflict on the same lines. Git cannot auto-merge because both PRs modified the same function. The developer must rebase the second PR against the updated main — but if they rebase incorrectly, they either lose the first PR's fix or introduce a double-application of the same change. - -**Why it happens:** -When two contributors (or two PRs from the same contributor) independently identify the same bug and fix it, their fixes diverge at the implementation level even if they solve the same problem. Cherry-picking is tempting but dangerous: cherry-picking a fix onto a branch that already has the same logical fix (with different surrounding code) silently applies a double-fix or creates syntactically valid but semantically wrong code. - -**How to avoid:** -1. Decide on one canonical fix before both PRs are open simultaneously. 
If PR #761 (resolve-model) is already merged or likely to merge first, base the fix in PR #762 on the post-merge state of main. -2. After PR #761 merges, immediately rebase feat/autopilot against the updated main: `git fetch origin && git rebase origin/main`. Resolve conflicts at the resolve-model fix site manually — verify the post-rebase code has exactly one copy of the fix, not zero and not two. -3. If PR #761 is closed (not merged), cherry-pick the relevant commit from the closed PR's branch into the new focused PR rather than implementing the fix independently again. -4. If PR #761 was merged: check `git log main --oneline -- path/to/resolve-model-file` to confirm the fix is in main before removing it from PR #762's scope. - -**Warning signs:** -- Both PRs modify the same file in their diffs -- `git diff main...feat/autopilot` shows changes to resolve-model code even after PR #761 merged -- CI shows a merge conflict check failing on PR #762 - -**Phase to address:** Phase 14 — PR Restructure (same phase as the split, since coordinate-with-761 is prerequisite to the split being correct). - ---- - -### Pitfall 5: Removing `.planning/` Artifacts Breaks the Local Dev Workflow Mid-Milestone - -**What goes wrong:** -The reviewer requested removing committed `.planning/` artifacts (PLAN.md files, SUMMARY.md files, research files) from the PR branch. If these are removed with `git rm` and committed, they disappear from the branch permanently. If the team is mid-execution (using those PLAN.md files to track what to do next), removing them mid-milestone orphans the working state. The agent trying to resume from a checkpoint no longer has the PLAN.md to resume from. - -**Why it happens:** -The reviewer sees `.planning/` files as dev artifacts (like `node_modules` or compiled output) that should not be in the repository. The executor agent produced them as part of the workflow. 
The conflict: they should be ignored via `.gitignore` for the repo but were committed on the branch before `.gitignore` was updated. - -**How to avoid:** -1. Before removing any `.planning/` file from git tracking, verify that the milestone is complete — all phases executed, no active PLAN.md files in-use. Check `.planning/STATE.md` to confirm status. -2. The removal order matters: (a) add `.planning/` patterns to `.gitignore` first, (b) then `git rm --cached .planning/phases/*/PLAN.md` to untrack without deleting the local files, (c) commit the `.gitignore` change and the `git rm --cached` in the same commit, (d) verify local dev still works by checking that the physical files still exist on disk. -3. Do not use `git rm` (without `--cached`) on PLAN.md or SUMMARY.md files that are currently in-use. The physical file must survive; only git's tracking of it should be removed. -4. After the cleanup commit, verify the workflow still functions: run `gsd-tools.cjs phases list` in the temp directory pattern used by tests to confirm `.planning/` artifact removal did not affect test fixtures (tests create their own temp dirs so this should be safe, but verify). - -**Warning signs:** -- `git rm` without `--cached` on files that are still referenced by `.planning/STATE.md` -- `.gitignore` change is in a separate commit from the `git rm --cached` (leaves a window where CI includes artifacts) -- Physical `.planning/` files are deleted from disk during cleanup (confirm with `ls .planning/phases/`) - -**Phase to address:** Phase 17 — Artifact Cleanup. Should be a standalone PR — no code changes, only `.gitignore` additions and `git rm --cached`. Keeps the diff reviewable. - ---- - -## Technical Debt Patterns - -Shortcuts that seem reasonable but create long-term problems. 
- -| Shortcut | Immediate Benefit | Long-term Cost | When Acceptable | -|----------|-------------------|----------------|-----------------| -| Writing runtime flags to config.json | No extra parameter threading through call stack | Flag persists across sessions, surprising users | Never — session flags must never hit disk | -| Validating `discuss_agents !== undefined` only | Catches missing key | Misses `null`, `false`, `0`, empty array — all of which arrive from real configs | Never — validate all falsy paths | -| Removing `.planning/` via `git rm` (not `--cached`) | Cleaner local directory | Destroys workflow state mid-execution | Never if milestone is in-progress | -| Cherry-picking the resolve-model fix without rebasing original branch | Avoids rebase complexity | Both branches contain the same logical change, double-applied when both merge | Never — must rebase original branch | -| Documenting `model_overrides` inline in config.json template | No separate doc to maintain | Template becomes the spec; when code diverges, template is wrong and users are confused | Only if the feature is stable and unlikely to change | - ---- - -## Integration Gotchas - -Common mistakes when connecting the v1.3 fixes to the existing system. 
- -| Integration | Common Mistake | Correct Approach | -|-------------|----------------|------------------| -| Runtime flag + loadConfig() | Reading `auto_advance` from config inside a function that is called multiple times per session, picking up stale disk state | Load config once at process start, pass effective config as parameter; never re-read config.json for session-scoped flags | -| `discuss_agents` validation + auto-discuss workflow | Adding validation that calls `process.exit(1)` on invalid input, breaking the workflow silently in CI where exit code 1 is indistinguishable from a test failure | Return structured error JSON matching the existing `output()` / `error()` helper convention, let the orchestrator surface the error | -| PR #762 rebase after PR #761 merge | Rebasing feat/autopilot on main after PR #761 merged, then forgetting to force-push the rebased branch, PR still shows old base | After rebase, always `git push --force-with-lease` to update the PR's remote branch (safer than `--force`: aborts if remote has new commits you have not seen) | -| .planning/ git rm + tests | Using `git rm` in a test that uses `createTempProject()` — the temp dir does not have git initialized, so `git rm` fails | Tests use temp directories that are not git repos; the cleanup change is a one-time git operation on the actual repo, not something to test via `runGsdTools()` | -| config.json template + `model_overrides` | Adding `model_overrides` to the template config.json with example values, users copy the template and get non-default model overrides they did not intend | Either omit the key from the template (rely on code defaults) or add it commented out with a clear "uncomment to customize" note | - ---- - -## Performance Traps - -Not applicable at the scale of v1.3 (the changes are code fixes and PR operations, not performance-sensitive features). No new code paths that touch ROADMAP.md scanning or large file operations. 
- ---- - -## Security Mistakes - -| Mistake | Risk | Prevention | -|---------|------|------------| -| Writing auto_advance flag to config.json exposes it to git history | If a future automation commits config.json, `auto_advance: true` leaks into repo history and could be parsed by tools that auto-configure CI behavior | Keep session flags out of config.json entirely; they have no business being on disk | -| Over-broad validation error messages that echo back user input | If `discuss_agents` accepts arbitrary strings and the error message includes the raw input, a crafted input could inject into log output | Sanitize or truncate the echoed value in error messages; max 100 chars, no newlines | - ---- - -## UX Pitfalls - -| Pitfall | User Impact | Better Approach | -|---------|-------------|-----------------| -| Validation error on `discuss_agents` with no migration path | Users who already configured discuss_agents in the old format get a hard error with no guidance | Add validation + a clear upgrade message: "discuss_agents now requires an array, found: X. Update your config to: discuss_agents: [X]" | -| Documenting `model_overrides` config key that is not yet used | Users add it to their config expecting it to work, nothing happens, they file bugs | Either implement it or explicitly mark it as `// reserved for future use, has no effect yet` in the template and docs | -| PR split produces 3 separate PRs with dependencies not communicated | Reviewer merges the wrong PR first, creating a broken state | Add a PR description note to each PR: "Merge order: this PR first, then X, then Y" or use GitHub draft status on dependent PRs | - ---- - -## "Looks Done But Isn't" Checklist - -Things that appear complete but are missing critical pieces. - -- [ ] **PR Split:** Branch has been cherry-picked to new PRs — verify original feat/autopilot branch has been rebased to drop those commits, not just that the new PRs exist. 
-- [ ] **Runtime flag fix:** auto_advance is no longer written to config.json — verify by running the autopilot workflow and diffing `git diff .planning/config.json` before and after; it should show no changes. -- [ ] **discuss_agents validation:** Validation added — verify with a test that uses the pre-existing "no discuss_agents key" config and confirms the workflow still runs without error (backwards compatibility). -- [ ] **PR #761 coordination:** The resolve-model fix is not double-applied — verify `git log main...feat/autopilot -- [resolve-model file path]` shows no resolve-model commits after PR #761 merged. -- [ ] **Artifact cleanup:** `.planning/` files removed from git tracking — verify with `git ls-files .planning/` that phase artifacts no longer appear; physical files still exist on disk (`ls .planning/phases/`). -- [ ] **model_overrides documentation:** If `model_overrides` is documented in config, verify a test exercises loading config with `model_overrides` present and the system does not crash (even if the key is currently a no-op). - ---- - -## Recovery Strategies - -When pitfalls occur despite prevention, how to recover. 
- -| Pitfall | Recovery Cost | Recovery Steps | -|---------|---------------|----------------| -| Commits orphaned on original branch after split | MEDIUM | `git rebase -i main` on feat/autopilot, drop the split commits; force-push with `--force-with-lease`; notify PR reviewer the branch was rebased | -| auto_advance written to config.json and already merged | LOW | Hotfix PR: add migration in config load that strips `auto_advance` key from config.json on load; removes it from disk next time gsd-tools runs | -| Validation breaks existing discuss_agents config | LOW | Hotfix PR: add coercion before validation (string-to-array, null-to-default); bump patch version; communicate in CHANGELOG.md | -| PR #762 has double-applied resolve-model fix after PR #761 merged | MEDIUM | Identify the conflicting commits with `git log`; rebase feat/autopilot onto post-merge main; resolve conflicts by keeping PR #761's version; re-request review with explanation | -| `.planning/` files deleted from disk (not just untracked) | HIGH | `git checkout HEAD -- .planning/phases/` to restore from the commit before the bad `git rm`; if already committed, `git revert` the cleanup commit; never use bare `git rm` on active PLAN.md files | -| model_overrides documented but non-functional — user reports broken behavior | LOW | Add a CHANGELOG entry noting the key is reserved; add a warning log in config.cjs when the key is present: "model_overrides is not yet implemented and has no effect" | - ---- - -## Pitfall-to-Phase Mapping - -How roadmap phases should address these pitfalls. 
- -| Pitfall | Prevention Phase | Verification | -|---------|------------------|--------------| -| Orphaned commits after PR split | Phase 14 — PR Restructure | `git log main...feat/autopilot` shows no commits that appear in the split-off PRs | -| Runtime flag config mutation | Phase 15 — Auto-Advance Fix | `git diff .planning/config.json` clean after autopilot run; unit test asserts config.json not written | -| Validation breaks backwards compat | Phase 16 — Validation Hardening | Test with config missing `discuss_agents` key passes; test with old string format passes (or gets clear error + coercion) | -| PR #761 resolve-model double-apply | Phase 14 — PR Restructure | Diff of new focused resolve-model PR against post-761-merge main shows no duplicated logic | -| Artifact removal breaks dev workflow | Phase 17 — Artifact Cleanup | Physical `.planning/` files exist on disk; `git ls-files .planning/phases/` returns empty for PLAN.md and SUMMARY.md files | -| model_overrides stale docs | Phase 18 (if addressed) — Config Docs | Either removed from template or accompanied by "no-op" warning in config.cjs | - ---- - -## Sources - -- GSD codebase analysis: `.planning/codebase/CONCERNS.md` — config mutation patterns, error handling gaps — HIGH confidence -- GSD codebase analysis: `.planning/codebase/CONVENTIONS.md` — no module-level mutable state convention, config loaded on-demand — HIGH confidence -- GSD codebase analysis: `.planning/codebase/TESTING.md` — `createTempProject()` creates non-git dirs, confirms `git rm` cannot be tested via test helpers — HIGH confidence -- GSD codebase analysis: `.planning/PROJECT.md` — v1.3 requirements, PR #762 and #761 relationship — HIGH confidence -- `.planning/STATE.md` — PR #761 closed, PR #762 open with changes requested — HIGH confidence -- `git diff main...feat/autopilot --stat` — confirmed `.planning/` artifacts committed on branch, config.json modified — HIGH confidence -- Common git rebase/cherry-pick pitfalls: standard git 
documentation and community best practices — HIGH confidence (well-established, not stale) -- Git `--force-with-lease` safety over `--force`: git documentation — HIGH confidence - ---- -*Pitfalls research for: PR Review Fixes — splitting, runtime flags, validation, overlapping PRs, artifact cleanup* -*Researched: 2026-02-28* diff --git a/.planning/research/STACK.md b/.planning/research/STACK.md deleted file mode 100644 index d470f25831..0000000000 --- a/.planning/research/STACK.md +++ /dev/null @@ -1,251 +0,0 @@ -# Stack Research - -**Domain:** Git branch and PR management for decomposing a monolithic PR into focused PRs -**Researched:** 2026-02-28 -**Confidence:** HIGH - -## Context - -This is a subsequent milestone (v1.3) addressing reviewer feedback on PR #762. The PR is monolithic: it bundles tests+CI, a resolve-model fix, the autopilot feature, and committed `.planning/` artifacts. The goal is to decompose it into focused PRs that reviewers can merge independently. - -The constraint from PROJECT.md applies: "Not our repo — We're contributing PRs, not merging directly." This means we push to `fork/` remotes and open PRs against `origin/main`. 
- -## Recommended Stack - -### Core Technologies - -| Technology | Version | Purpose | Why Recommended | -|------------|---------|---------|-----------------| -| `git cherry-pick` | Built-in | Selectively apply specific commits to new branches | Best tool when source commits are already clean; no new branch history needed | -| `git checkout <branch> -- <file>` | Built-in | Bring specific files from a branch into the current branch without cherry-picking | Best tool when commits are mixed (one commit has multiple concerns); extract only the files you need | -| `git rm --cached -r <path>` | Built-in | Remove committed files from index/tracking without deleting them locally | Removes `.planning/` artifacts from branch without losing local files | -| `git rebase --onto` | Built-in | Transplant a range of commits onto a new base | Best tool when commits are sequential and cleanly separated | -| `git log --oneline <branch> ^<base>` | Built-in | Enumerate commits that need to be split | Planning step — understand what commits exist before operating | -| GitHub PR (gh CLI) | Current | Open PRs against upstream from fork branches | `gh pr create --repo org/repo` to target upstream, not fork | - -### Supporting Techniques - -| Technique | Purpose | When to Use | -|-----------|---------|-------------| -| Branch-from-base pattern | Create each sub-PR branch from `origin/main`, not from the monolithic branch | Always — prevents sub-PRs from carrying unrelated changes | -| `git diff --name-only <base> <branch>` | Identify what changed in the monolithic PR | Planning step before splitting | -| `git show --stat <commit>` | Understand what each commit touched | Planning step to decide which cherry-pick strategy to use | -| `git stash` | Preserve uncommitted local work during branch surgery | When you need to switch branches mid-operation | -| `.gitignore` entry | Prevent `.planning/` from being committed in future | Add `**/.planning/` to `.gitignore` if it isn't already | - -### Development Tools - -| Tool | Purpose | Notes | 
-|------|---------|-------| -| `gh` CLI | Create PRs against upstream from fork | Use `gh pr create --repo gsd-build/get-shit-done --head ethan-hurst:branch-name` | -| `git log --graph --oneline` | Verify branch topology before pushing | Sanity check that branch base is `origin/main`, not the monolithic branch | - -## Installation - -```bash -# No installation required — git and gh are already present -# Verify gh is authenticated: -gh auth status -``` - -## Strategies for This Specific Split - -The fork has 3 commits on `fork/feat/autopilot` vs `origin/main`: - -1. `b0aa9fc` — feat: add autopilot mode + Agent Teams execution engine (9 files) -2. `8850ebf` — refactor: remove Agent Teams engine, simplify to subagents-only -3. `000163a` — refactor: remove dead execution section, consolidate to autopilot config - -The target split is 3 focused PRs: - -### PR A: Tests + CI (from `fork/feat/coverage-hardening`) - -This already exists as a separate branch. No splitting needed — just open a PR from it. - -```bash -# Branch already exists at fork/feat/coverage-hardening -# Verify it contains only test files and CI changes: -git diff --name-only origin/main fork/feat/coverage-hardening - -# Open PR against upstream: -gh pr create --repo gsd-build/get-shit-done \ - --head ethan-hurst:feat/coverage-hardening \ - --title "test: add full test suite with CI pipeline (433 tests, 94% coverage)" -``` - -### PR B: resolve-model Fix (coordinates with PR #761) - -The fix already exists on `fork/fix/load-model-overrides-from-config`. This may conflict with PR #761 — check before submitting. 
- -```bash -# Branch already exists at fork/fix/load-model-overrides-from-config -git diff --name-only origin/main fork/fix/load-model-overrides-from-config - -# Check for overlap with PR #761's files: -# If no conflict, open PR: -gh pr create --repo gsd-build/get-shit-done \ - --head ethan-hurst:fix/load-model-overrides-from-config \ - --title "fix: load model_overrides from config and use resolveModelInternal in CLI" -``` - -### PR C: Autopilot Feature (clean, without artifacts) - -The autopilot commits include `.planning/` artifacts committed to the branch. Create a clean branch that cherry-picks only the feature files. - -```bash -# Step 1: Create clean branch from origin/main -git checkout -b feat/autopilot-clean origin/main - -# Step 2: Cherry-pick the 3 autopilot commits -git cherry-pick b0aa9fc # autopilot feature -git cherry-pick 8850ebf # remove Agent Teams -git cherry-pick 000163a # remove dead execution section - -# Step 3: Remove any .planning/ artifacts that got pulled in -git rm --cached -r .planning/ 2>/dev/null || true -echo '.planning/' >> .gitignore # if not already ignored -git add .gitignore -git commit --amend --no-edit # or: git commit -m "chore: remove .planning artifacts" - -# Step 4: Verify only intended files are present -git diff --name-only origin/main feat/autopilot-clean - -# Step 5: Push and open PR -git push fork feat/autopilot-clean -gh pr create --repo gsd-build/get-shit-done \ - --head ethan-hurst:feat/autopilot-clean \ - --title "feat: add /gsd:autopilot for fully automated pipeline execution" -``` - -**Alternative if cherry-pick has conflicts:** Use `git checkout -- ` to bring specific files without commit history: - -```bash -git checkout -b feat/autopilot-clean origin/main - -# Bring only the autopilot-related files from the monolithic branch -git checkout fork/feat/autopilot -- commands/gsd/autopilot.md -git checkout fork/feat/autopilot -- get-shit-done/workflows/auto-discuss.md -git checkout fork/feat/autopilot -- 
get-shit-done/workflows/autopilot.md -git checkout fork/feat/autopilot -- get-shit-done/workflows/execute-phase.md -git checkout fork/feat/autopilot -- get-shit-done/workflows/progress.md -git checkout fork/feat/autopilot -- get-shit-done/workflows/settings.md -git checkout fork/feat/autopilot -- get-shit-done/bin/lib/config.cjs -git checkout fork/feat/autopilot -- get-shit-done/templates/config.json - -# Do NOT bring: .planning/ files, tests/, .github/, package*.json -git commit -m "feat: add /gsd:autopilot for fully automated pipeline execution" -``` - -## Runtime Flag Pattern (for auto-advance fix) - -The review flagged that `autopilot.md` mutates `config.json` to set `workflow.auto_advance true` and then sets it back to `false` after the run. This persists state to disk, which is a side effect if the run is interrupted. - -**Pattern to fix this:** Pass `AUTO_ADVANCE` as an environment variable or shell argument instead of persisting to config. - -**Current (mutates config.json):** -```bash -# autopilot.md start -node gsd-tools.cjs config-set workflow.auto_advance true - -# ... phases run ... - -# autopilot.md end -node gsd-tools.cjs config-set workflow.auto_advance false -``` - -**Fixed (runtime flag, no persistence):** -```bash -# Pass flag as environment variable -AUTO_ADVANCE=true node gsd-tools.cjs execute-phase ... - -# Or pass as CLI argument that execute-phase reads from args, not config -node gsd-tools.cjs execute-phase --auto-advance ... -``` - -**In `execute-phase.md`:** Read from env/arg first, fall back to config: -```bash -# Read auto_advance: env var takes priority over config -AUTO_CFG="${AUTO_ADVANCE:-$(node gsd-tools.cjs config-get workflow.auto_advance 2>/dev/null || echo false)}" -``` - -This ensures `config.json` is never mutated during an autopilot run — it stays as the user left it. - -**Confidence: HIGH** — This is the standard Unix pattern: environment variables override config files for session-scoped behavior. 
No new dependencies, no new config keys. - -## Alternatives Considered - -| Recommended | Alternative | When to Use Alternative | -|-------------|-------------|-------------------------| -| `git cherry-pick` per commit | `git rebase --onto` | Use rebase --onto when commits are sequential AND don't need file-level filtering | -| `git checkout -- ` | Interactive rebase + `git add -p` | Use interactive rebase when commits need to be re-split at hunk level (more complex) | -| New branch from `origin/main` | Amend the existing branch | Only amend existing branch when it's not yet published OR reviewer explicitly asks for force-push | -| Environment variable for runtime flag | New config key `autopilot.running` | Env var is session-scoped (no persistence risk), config key would require cleanup on crash | - -## What NOT to Use - -| Avoid | Why | Use Instead | -|-------|-----|-------------| -| `git filter-branch` | Deprecated, slow, dangerous for shared branches | `git rm --cached` for removing tracked files; BFG for history rewriting | -| `git push --force` to a branch with an open PR | Rewrites history reviewers may have fetched; confusing diffs in PR timeline | Push to a NEW branch, open a NEW PR | -| `git rebase -i` on published branches | Same issue — force-push required after | New branch + cherry-pick | -| Graphite CLI / git-multi-pr | External tooling, not available everywhere | Native git cherry-pick + gh CLI | -| Mutating `config.json` for runtime state | Leaves dirty state if process is interrupted; user's config is corrupted | Environment variables for session-scoped flags | -| `git add .` or `git add -A` when cleaning artifacts | Risk of accidentally re-adding files from adjacent directories | `git rm --cached -r .planning/` explicitly, then `git add` specific files | - -## Stack Patterns by Variant - -**If commits are clean and single-concern (each commit touches only one logical change):** -- Use `git cherry-pick ` per commit -- This is the simplest path — 
no file-level surgery needed - -**If commits are mixed-concern (one commit touches feature files AND test files AND artifacts):** -- Use `git checkout <branch> -- <file>` to bring only the files you want -- Build the new branch file-by-file, then commit once - -**If the PR has no clean commits (everything in one giant commit):** -- Use `git checkout <branch> -- <file>` for each file group -- Commit groups separately on the new branch -- This gives reviewers a meaningful commit history - -**If a sub-PR conflicts with another open PR (e.g., PR #761):** -- Wait for the other PR to merge first, then rebase your branch onto the updated base -- OR communicate with the maintainer to sequence the merges -- Do NOT attempt to manually merge the two PRs' changes together - -## Removing .planning/ Artifacts - -The monolithic PR has committed `.planning/` dev artifacts. To clean them: - -```bash -# On the branch that has the artifacts: -git rm --cached -r .planning/ -git commit -m "chore: remove .planning dev artifacts from branch" - -# Then add .planning/ to .gitignore to prevent recurrence: -echo '.planning/' >> .gitignore -git add .gitignore -git commit -m "chore: gitignore .planning artifacts" -``` - -This removes the files from tracking without deleting them locally — they stay in your working directory but won't appear in the PR diff. 
- -## Version Compatibility - -| Technique | Git Version | Notes | -|-----------|-------------|-------| -| `git cherry-pick` | Any modern git | Available in git 1.7+ | -| `git checkout -- ` | Any modern git | Long-standing feature | -| `git rm --cached -r` | Any modern git | Standard since git 1.0 | -| `gh pr create --repo` | gh 2.x+ | Targets upstream repo from fork | - -## Sources - -- [Git Official Docs: git-rm](https://git-scm.com/docs/git-rm) — `--cached` flag behavior, recursive removal (HIGH confidence) -- [Graphite: How to split a PR](https://graphite.com/guides/how-to-split-a-pull-request-into-multiple-prs) — Strategy overview: cherry-pick, checkout-file, branch-from-base (MEDIUM confidence) -- [GitHub Gist: Split large PR into two](https://gist.github.com/loilo/930f141d9acf89e9e734ffa042acd750) — `git rebase --onto` and `cherry-pick` concrete commands (HIGH confidence) -- GSD codebase — `fork/feat/coverage-hardening` and `fork/feat/autopilot` branch inspection (HIGH confidence — direct observation) -- Unix environment variable pattern — session-scoped config override via `$ENV_VAR` (HIGH confidence — standard Unix practice) - ---- -*Stack research for: git PR decomposition and runtime config patterns* -*Researched: 2026-02-28* From 3eda547f2f9f8374f15f8e0461cd8d8ab7cd12ff Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 13:33:36 +1000 Subject: [PATCH 14/16] docs: add autopilot command to help and README --- README.md | 1 + get-shit-done/workflows/help.md | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/README.md b/README.md index 91332b8cef..5ed4afb546 100644 --- a/README.md +++ b/README.md @@ -470,6 +470,7 @@ You're never locked in. The system adapts. 
| `/gsd:discuss-phase [N] [--auto]` | Capture implementation decisions before planning | | `/gsd:plan-phase [N] [--auto]` | Research + plan + verify for a phase | | `/gsd:execute-phase ` | Execute all plans in parallel waves, verify when complete | +| `/gsd:autopilot [N] [N-N]` | Full pipeline (discuss → plan → execute) for remaining phases | | `/gsd:verify-work [N]` | Manual user acceptance testing ¹ | | `/gsd:audit-milestone` | Verify milestone achieved its definition of done | | `/gsd:complete-milestone` | Archive milestone, tag release | diff --git a/get-shit-done/workflows/help.md b/get-shit-done/workflows/help.md index 2991aa18eb..6a8df34b80 100644 --- a/get-shit-done/workflows/help.md +++ b/get-shit-done/workflows/help.md @@ -113,6 +113,20 @@ Execute all plans in a phase. Usage: `/gsd:execute-phase 5` +### Automation + +**`/gsd:autopilot [phase] [start-end]`** +Run full pipeline for remaining phases automatically. + +- Chains discuss → plan → execute → verify for each phase +- Runs from current phase through end of milestone by default +- Pass a single phase number or range to limit scope +- Stops on verification failure or checkpoint requiring human input + +Usage: `/gsd:autopilot` (all remaining phases) +Usage: `/gsd:autopilot 5` (start from phase 5) +Usage: `/gsd:autopilot 3-7` (phases 3 through 7) + ### Quick Mode **`/gsd:quick`** @@ -440,6 +454,13 @@ Example config: /gsd:execute-phase 1 # Execute all plans in phase ``` +**Running phases on autopilot:** + +``` +/gsd:autopilot # All remaining phases +/gsd:autopilot 3-7 # Specific range +``` + **Resuming work after a break:** ``` From 9a0ba3251bce1a85c5d89308babb6d196bdecd90 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 17:02:30 +1000 Subject: [PATCH 15/16] fix: prevent false milestone-complete when unscaffolded phases remain Progress and phase-complete relied solely on disk directories to determine phase counts. 
When ROADMAP.md defined phases that had no directories yet, both code paths incorrectly reported the milestone as complete. Add getRoadmapPhaseNumbersInternal() to parse all phase numbers from ROADMAP.md. Use it in cmdInitProgress (new roadmap_phase_count field) and cmdPhaseComplete (ROADMAP fallback for next-phase detection). Fixes #689, #754, #757, #709 --- get-shit-done/bin/lib/core.cjs | 17 +++++++++++++++++ get-shit-done/bin/lib/init.cjs | 5 ++++- get-shit-done/bin/lib/phase.cjs | 20 +++++++++++++++++++- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/get-shit-done/bin/lib/core.cjs b/get-shit-done/bin/lib/core.cjs index 6ef6ccb2a1..cb7ce25f7b 100644 --- a/get-shit-done/bin/lib/core.cjs +++ b/get-shit-done/bin/lib/core.cjs @@ -351,6 +351,22 @@ function getRoadmapPhaseInternal(cwd, phaseNum) { } } +function getRoadmapPhaseNumbersInternal(cwd) { + const roadmapPath = path.join(cwd, '.planning', 'ROADMAP.md'); + try { + const content = fs.readFileSync(roadmapPath, 'utf-8'); + const pattern = /#{2,4}\s*Phase\s+(\d+[A-Z]?(?:\.\d+)*)\s*:/gi; + const numbers = []; + let m; + while ((m = pattern.exec(content)) !== null) { + numbers.push(m[1]); + } + return numbers.sort((a, b) => comparePhaseNum(a, b)); + } catch { + return []; + } +} + function resolveModelInternal(cwd, agentType) { const config = loadConfig(cwd); @@ -424,6 +440,7 @@ module.exports = { findPhaseInternal, getArchivedPhaseDirs, getRoadmapPhaseInternal, + getRoadmapPhaseNumbersInternal, resolveModelInternal, pathExistsInternal, generateSlugInternal, diff --git a/get-shit-done/bin/lib/init.cjs b/get-shit-done/bin/lib/init.cjs index c25ce026ff..7661227de7 100644 --- a/get-shit-done/bin/lib/init.cjs +++ b/get-shit-done/bin/lib/init.cjs @@ -5,7 +5,7 @@ const fs = require('fs'); const path = require('path'); const { execSync } = require('child_process'); -const { loadConfig, resolveModelInternal, findPhaseInternal, getRoadmapPhaseInternal, pathExistsInternal, generateSlugInternal, 
getMilestoneInfo, normalizePhaseName, toPosixPath, output, error } = require('./core.cjs'); +const { loadConfig, resolveModelInternal, findPhaseInternal, getRoadmapPhaseInternal, getRoadmapPhaseNumbersInternal, pathExistsInternal, generateSlugInternal, getMilestoneInfo, normalizePhaseName, toPosixPath, output, error } = require('./core.cjs'); function cmdInitExecutePhase(cwd, phase, raw) { if (!phase) { @@ -647,6 +647,8 @@ function cmdInitProgress(cwd, raw) { } } catch {} + const roadmapPhaseCount = getRoadmapPhaseNumbersInternal(cwd).length; + // Check for paused work let pausedAt = null; try { @@ -674,6 +676,7 @@ function cmdInitProgress(cwd, raw) { phase_count: phases.length, completed_count: phases.filter(p => p.status === 'complete').length, in_progress_count: phases.filter(p => p.status === 'in_progress').length, + roadmap_phase_count: roadmapPhaseCount, // Current state current_phase: currentPhase, diff --git a/get-shit-done/bin/lib/phase.cjs b/get-shit-done/bin/lib/phase.cjs index 4e4cbff609..d58b313509 100644 --- a/get-shit-done/bin/lib/phase.cjs +++ b/get-shit-done/bin/lib/phase.cjs @@ -4,7 +4,7 @@ const fs = require('fs'); const path = require('path'); -const { escapeRegex, normalizePhaseName, comparePhaseNum, findPhaseInternal, getArchivedPhaseDirs, generateSlugInternal, output, error } = require('./core.cjs'); +const { escapeRegex, normalizePhaseName, comparePhaseNum, findPhaseInternal, getArchivedPhaseDirs, getRoadmapPhaseNumbersInternal, generateSlugInternal, output, error } = require('./core.cjs'); const { extractFrontmatter } = require('./frontmatter.cjs'); const { writeStateMd } = require('./state.cjs'); @@ -806,6 +806,24 @@ function cmdPhaseComplete(cwd, phaseNum, raw) { } } catch {} + if (isLastPhase) { + const roadmapNumbers = getRoadmapPhaseNumbersInternal(cwd); + for (const rmNum of roadmapNumbers) { + if (comparePhaseNum(rmNum, phaseNum) > 0) { + nextPhaseNum = rmNum; + try { + const rmPath = path.join(cwd, '.planning', 'ROADMAP.md'); + 
const rmContent = fs.readFileSync(rmPath, 'utf-8'); + const escaped = escapeRegex(rmNum); + const nameMatch = rmContent.match(new RegExp(`#{2,4}\\s*Phase\\s+${escaped}:\\s*([^\\n]+)`, 'i')); + if (nameMatch) nextPhaseName = nameMatch[1].replace(/\(INSERTED\)/i, '').trim(); + } catch {} + isLastPhase = false; + break; + } + } + } + // Update STATE.md if (fs.existsSync(statePath)) { let stateContent = fs.readFileSync(statePath, 'utf-8'); From 699e60085b7881162b06f0f0bf8b0c5ac38ff181 Mon Sep 17 00:00:00 2001 From: Ethan Hurst Date: Sat, 28 Feb 2026 17:03:21 +1000 Subject: [PATCH 16/16] docs: add changelog entry for #689 milestone-complete fix --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b4fb94122d..c660a0d61d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] +### Fixed +- Progress and phase-complete incorrectly route to milestone-complete when ROADMAP defines phases that have no disk directories yet (#689, #754, #757, #709) + ## [1.21.1] - 2026-02-27 ### Added