From aaa061bf2cef583a0d113e2817e0c451cbc5ac4e Mon Sep 17 00:00:00 2001
From: Stephen Miller <Stephen@betterbox.pw>
Date: Fri, 6 Mar 2026 01:26:43 -0600
Subject: [PATCH 1/2] feat(reliability): add agent criticality classification
 module

Classify each Task() spawn point as critical (blocks pipeline) or
advisory (pipeline can continue without). Advisory agents include
comment classifier, plan-checker, and verifier. Critical agents
include planner, executor, and PR creator. Unknown spawn points
default to critical (fail-safe).

Provides wrapAdvisoryAgent() wrapper that catches errors for advisory
agents and logs warnings instead of halting the pipeline.

Refs #234 (partial)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/agent-criticality.cjs | 227 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 227 insertions(+)
 create mode 100644 lib/agent-criticality.cjs

diff --git a/lib/agent-criticality.cjs b/lib/agent-criticality.cjs
new file mode 100644
index 0000000..bc34e73
--- /dev/null
+++ b/lib/agent-criticality.cjs
@@ -0,0 +1,227 @@
+'use strict';
+
+/**
+ * lib/agent-criticality.cjs — Agent spawn point criticality classification
+ *
+ * Classifies each Task() spawn point in the MGW pipeline as either:
+ *   - critical: failure blocks the pipeline (retry/fallback before dead-letter)
+ *   - advisory: failure is non-fatal (log warning, continue pipeline)
+ *
+ * Advisory agents provide quality-of-life improvements (comment classification,
+ * plan checking, verification) but the pipeline can produce a valid PR without
+ * their output. Critical agents produce artifacts the pipeline cannot proceed
+ * without (plans, executed code, PRs).
+ *
+ * Integrates with:
+ *   - lib/retry.cjs          (generic retry infrastructure)
+ *   - lib/retry-policy.cjs   (policy-driven retry, from #232 — safe-imported)
+ *   - lib/errors.cjs         (MgwError base class)
+ *
+ * @module agent-criticality
+ */
+
+const { MgwError } = require('./errors.cjs');
+
+// ---------------------------------------------------------------------------
+// Safe import of retry-policy.cjs (dependency from #232, may not be merged)
+// ---------------------------------------------------------------------------
+
+let RetryPolicyEngine = null;
+try {
+  const retryPolicy = require('./retry-policy.cjs');
+  RetryPolicyEngine = retryPolicy.RetryPolicyEngine;
+} catch (_) {
+  // retry-policy.cjs could not be loaded — RetryPolicyEngine stays null.
+  // NOTE(review): nothing else in this file reads RetryPolicyEngine yet.
+}
+
+// ---------------------------------------------------------------------------
+// Criticality classification
+// ---------------------------------------------------------------------------
+
+/**
+ * The two criticality levels a spawn point can have (frozen object).
+ * @enum {string}
+ */
+const CRITICALITY = Object.freeze({
+  CRITICAL: 'critical',
+  ADVISORY: 'advisory',
+});
+
+/**
+ * Maps each Task() spawn point identifier to its criticality level.
+ *
+ * Spawn point identifiers match the agent's role in the pipeline,
+ * not its GSD agent type (since the same agent type can have different
+ * criticality depending on where it's used).
+ *
+ * Keys are bare role names, grouped below by the command file that spawns them:
+ *   comment-classifier                        — commands/run/triage.md
+ *   planner, plan-checker, executor, verifier — commands/run/execute.md (quick route)
+ *   milestone-*                               — commands/run/execute.md (milestone route)
+ *   pr-creator                                — commands/run/pr-create.md
+ */
+const CRITICALITY_MAP = Object.freeze({
+  // --- triage.md spawn points ---
+  'comment-classifier':       CRITICALITY.ADVISORY,
+
+  // --- execute.md quick-route spawn points ---
+  'planner':                  CRITICALITY.CRITICAL,
+  'plan-checker':             CRITICALITY.ADVISORY,
+  'executor':                 CRITICALITY.CRITICAL,
+  'verifier':                 CRITICALITY.ADVISORY,
+
+  // --- execute.md milestone-route spawn points ---
+  'milestone-planner':        CRITICALITY.CRITICAL,
+  'milestone-executor':       CRITICALITY.CRITICAL,
+  'milestone-verifier':       CRITICALITY.ADVISORY,
+
+  // --- pr-create.md spawn points ---
+  'pr-creator':               CRITICALITY.CRITICAL,
+});
+
+/**
+ * Check whether a spawn point is classified as advisory (non-blocking).
+ *
+ * @param {string} spawnPoint - Spawn point identifier from CRITICALITY_MAP
+ * @returns {boolean} True if advisory, false if critical or unknown
+ */
+function isAdvisory(spawnPoint) {
+  return CRITICALITY_MAP[spawnPoint] === CRITICALITY.ADVISORY;
+}
+
+/**
+ * Check whether a spawn point is classified as critical (pipeline-blocking).
+ *
+ * Unknown spawn points default to critical (fail-safe: don't silently
+ * swallow errors for unclassified agents).
+ *
+ * @param {string} spawnPoint - Spawn point identifier from CRITICALITY_MAP
+ * @returns {boolean} True if critical or unknown
+ */
+function isCritical(spawnPoint) {
+  const level = CRITICALITY_MAP[spawnPoint];
+  // Unknown spawn points are treated as critical (fail-safe)
+  return level !== CRITICALITY.ADVISORY;
+}
+
+// ---------------------------------------------------------------------------
+// Advisory agent wrapper
+// ---------------------------------------------------------------------------
+
+/**
+ * Error subclass for advisory agent failures that were gracefully degraded.
+ * These are logged but do not halt the pipeline.
+ */
+class AdvisoryAgentWarning extends MgwError {
+  /**
+   * @param {string} message
+   * @param {object} [opts]
+   * @param {string} [opts.spawnPoint] - The spawn point that failed
+   * @param {string} [opts.agentType]  - GSD agent type
+   * @param {Error}  [opts.cause]      - Original error
+   */
+  constructor(message, opts) {
+    const o = opts || {};
+    super(message, {
+      code: 'ADVISORY_AGENT_DEGRADED',
+      stage: o.stage,
+      issueNumber: o.issueNumber,
+      cause: o.cause,
+    });
+    this.name = 'AdvisoryAgentWarning';
+    this.spawnPoint = o.spawnPoint || null;
+    this.agentType = o.agentType || null;
+  }
+}
+
+/**
+ * Wrap an async function (typically a Task() agent spawn) with advisory
+ * graceful degradation. If the function throws and the spawn point is
+ * advisory, the error is caught, a warning is logged to stderr, and
+ * the specified fallback value is returned.
+ *
+ * For critical spawn points, the error is re-thrown unchanged.
+ *
+ * @param {() => Promise<*>} fn         - Async function to execute
+ * @param {string} spawnPoint           - Spawn point identifier from CRITICALITY_MAP
+ * @param {object} [context]            - Context for logging
+ * @param {number} [context.issueNumber] - Issue being processed
+ * @param {string} [context.agentType]   - GSD agent type
+ * @param {*}      [context.fallback]    - Value to return on advisory failure (default: null)
+ * @returns {Promise<*>} Result of fn() or fallback value
+ * @throws {Error} Re-throws if spawn point is critical
+ */
+async function wrapAdvisoryAgent(fn, spawnPoint, context) {
+  const ctx = context || {};
+  const fallback = ctx.hasOwnProperty('fallback') ? ctx.fallback : null;
+
+  try {
+    return await fn();
+  } catch (err) {
+    if (isAdvisory(spawnPoint)) {
+      // Advisory: log warning and return fallback
+      const warning = new AdvisoryAgentWarning(
+        `Advisory agent "${spawnPoint}" failed gracefully: ${err.message}`,
+        {
+          spawnPoint,
+          agentType: ctx.agentType || null,
+          issueNumber: ctx.issueNumber || null,
+          cause: err,
+        }
+      );
+
+      // Log to stderr (non-blocking, never throws)
+      try {
+        const ts = new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
+        process.stderr.write(
+          `[${ts}] MGW WARNING: ${warning.message}\n` +
+          `  spawn_point=${spawnPoint} agent_type=${ctx.agentType || 'unknown'} ` +
+          `issue=#${ctx.issueNumber || '?'}\n` +
+          `  original_error=${err.message}\n`
+        );
+      } catch (_) {
+        // Even logging failed — truly swallow
+      }
+
+      return fallback;
+    }
+
+    // Critical: re-throw (existing retry/dead-letter flow handles it)
+    throw err;
+  }
+}
+
+/**
+ * Get a human-readable summary of all spawn point classifications.
+ * Useful for diagnostics and documentation.
+ *
+ * @returns {Array<{spawnPoint: string, criticality: string}>}
+ */
+function getClassificationSummary() {
+  return Object.entries(CRITICALITY_MAP).map(([sp, crit]) => ({
+    spawnPoint: sp,
+    criticality: crit,
+  }));
+}
+
+// ---------------------------------------------------------------------------
+// Exports
+// ---------------------------------------------------------------------------
+
+module.exports = {
+  // Constants (both frozen)
+  CRITICALITY,
+  CRITICALITY_MAP,
+
+  // Query functions (lookups against CRITICALITY_MAP)
+  isAdvisory,
+  isCritical,
+  getClassificationSummary,
+
+  // Wrapper (degrades failures of advisory spawn points to a fallback value)
+  wrapAdvisoryAgent,
+
+  // Error class (constructed by wrapAdvisoryAgent on advisory failure)
+  AdvisoryAgentWarning,
+};

From c66ba94d1546caf955c731d73af329972a985ed8 Mon Sep 17 00:00:00 2001
From: Stephen Miller <Stephen@betterbox.pw>
Date: Fri, 6 Mar 2026 01:29:28 -0600
Subject: [PATCH 2/2] feat(reliability): add criticality annotations to all
 Task() spawn points

Annotate all 9 Task() spawn points in mgw:run pipeline with
<!-- mgw:criticality=critical|advisory --> comments that classify
each agent's impact on pipeline continuity.

Advisory agents (4): comment-classifier, plan-checker, verifier,
milestone-verifier. These provide quality-of-life improvements but
the pipeline can produce a valid PR without their output.

Critical agents (5): planner, executor, milestone-planner,
milestone-executor, pr-creator. These produce artifacts the pipeline
cannot proceed without.

The retry loop in execute.md now checks agent criticality before
deciding between retry/dead-letter (critical) or warn-and-continue
(advisory). Uses lib/agent-criticality.cjs for classification.

Refs #234 (partial)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 commands/run/execute.md   | 120 ++++++++++++++++++++++++++++++++------
 commands/run/pr-create.md |   5 ++
 commands/run/triage.md    |  14 +++++
 3 files changed, 121 insertions(+), 18 deletions(-)

diff --git a/commands/run/execute.md b/commands/run/execute.md
index 4aba0e0..1b207f4 100644
--- a/commands/run/execute.md
+++ b/commands/run/execute.md
@@ -77,6 +77,9 @@ ic.buildGSDPromptContext({
 " 2>/dev/null || echo "")
 ```
 
+<!-- mgw:criticality=critical  spawn_point=planner -->
+<!-- Critical: plan creation is required for pipeline to proceed.
+     On failure: retry via existing retry loop, then dead-letter. -->
 ```
 Task(
   prompt="
@@ -150,6 +153,21 @@ PLAN_CHECK=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs verify plan-structur
 Parse the JSON result. If structural issues found, include them in the plan-checker prompt below so it has concrete problems to evaluate rather than searching from scratch.
 
 6. **(If --full) Spawn plan-checker, handle revision loop (max 2 iterations):**
+
+<!-- mgw:criticality=advisory  spawn_point=plan-checker -->
+<!-- Advisory: plan checking is a quality gate, not a pipeline blocker.
+     If this agent fails, log a warning and proceed with the unchecked plan.
+     The plan was already verified structurally by gsd-tools verify plan-structure.
+
+     Graceful degradation pattern:
+     ```
+     PLAN_CHECK_RESULT=$(wrapAdvisoryAgent(() => Task(...), 'plan-checker', {
+       issueNumber: ISSUE_NUMBER,
+       fallback: '## VERIFICATION PASSED (plan-checker unavailable — structural check only)'
+     }))
+     # If fallback returned, skip the revision loop and proceed to execution
+     ```
+-->
 ```
 Task(
   prompt="
@@ -195,6 +213,11 @@ If issues found and iteration < 2: spawn planner revision, then re-check.
 If iteration >= 2: offer force proceed or abort.
 
 7. **Spawn executor (task agent):**
+
+<!-- mgw:criticality=critical  spawn_point=executor -->
+<!-- Critical: execution produces the code changes. Without it, there is
+     nothing to commit or PR. On failure: retry via existing retry loop,
+     then dead-letter. -->
 ```
 Task(
   prompt="
@@ -240,6 +263,23 @@ VERIFY_RESULT=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs verify-summary "$
 Parse JSON result. Use `passed` field for go/no-go. Checks summary existence, files created, and commits.
 
 9. **(If --full) Spawn verifier:**
+
+<!-- mgw:criticality=advisory  spawn_point=verifier -->
+<!-- Advisory: verification is quality assurance after execution is complete.
+     The code changes and commits already exist. If verification fails,
+     log a warning and proceed to PR creation with a note that verification
+     was skipped.
+
+     Graceful degradation pattern:
+     ```
+     VERIFY_RESULT=$(wrapAdvisoryAgent(() => Task(...), 'verifier', {
+       issueNumber: ISSUE_NUMBER,
+       fallback: null
+     }))
+     # If fallback returned (null), create a minimal VERIFICATION.md:
+     #   "## VERIFICATION SKIPPED\nVerifier agent unavailable. Manual review recommended."
+     ```
+-->
 ```
 Task(
   prompt="
@@ -294,8 +334,13 @@ If any step above fails (executor or verifier agent returns error, summary missi
 
 ```bash
 # On failure — classify and decide whether to retry
+# CRITICALITY-AWARE: check if the failing agent is advisory first.
+# Advisory agent failures are logged and skipped (pipeline continues).
+# Critical agent failures follow the existing retry/dead-letter flow.
+
 FAILURE_CLASS=$(node -e "
 const { classifyFailure, canRetry, incrementRetry, getBackoffMs } = require('./lib/retry.cjs');
+const { isAdvisory } = require('./lib/agent-criticality.cjs');
 const { loadActiveIssue } = require('./lib/state.cjs');
 const fs = require('fs'), path = require('path');
 
@@ -305,29 +350,43 @@ const file = files.find(f => f.startsWith('${ISSUE_NUMBER}-') && f.endsWith('.js
 const filePath = path.join(activeDir, file);
 let issueState = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
 
-// Classify the failure from the error context
-const error = { message: '${EXECUTION_ERROR_MESSAGE}' };
-const result = classifyFailure(error);
-console.error('Failure classified as: ' + result.class + ' — ' + result.reason);
-
-// Persist failure class to state
-issueState.last_failure_class = result.class;
-
-if (result.class === 'transient' && canRetry(issueState)) {
-  const backoff = getBackoffMs(issueState.retry_count || 0);
-  issueState = incrementRetry(issueState);
-  fs.writeFileSync(filePath, JSON.stringify(issueState, null, 2));
-  // Output: backoff ms so shell can sleep
-  console.log('retry:' + backoff + ':' + result.class);
+// Check if the failing agent is advisory
+const failingSpawnPoint = '${FAILING_SPAWN_POINT}';
+if (failingSpawnPoint && isAdvisory(failingSpawnPoint)) {
+  // Advisory agent failure — log warning and continue pipeline
+  console.error('MGW WARNING: Advisory agent \"' + failingSpawnPoint + '\" failed. Continuing pipeline.');
+  console.log('advisory_degraded:' + failingSpawnPoint);
 } else {
-  // Permanent failure or retries exhausted — dead-letter
-  issueState.dead_letter = true;
-  fs.writeFileSync(filePath, JSON.stringify(issueState, null, 2));
-  console.log('dead_letter:' + result.class);
+  // Critical agent failure — classify and apply retry/dead-letter logic
+  const error = { message: '${EXECUTION_ERROR_MESSAGE}' };
+  const result = classifyFailure(error);
+  console.error('Failure classified as: ' + result.class + ' — ' + result.reason);
+
+  // Persist failure class to state
+  issueState.last_failure_class = result.class;
+
+  if (result.class === 'transient' && canRetry(issueState)) {
+    const backoff = getBackoffMs(issueState.retry_count || 0);
+    issueState = incrementRetry(issueState);
+    fs.writeFileSync(filePath, JSON.stringify(issueState, null, 2));
+    // Output: backoff ms so shell can sleep
+    console.log('retry:' + backoff + ':' + result.class);
+  } else {
+    // Permanent failure or retries exhausted — dead-letter
+    issueState.dead_letter = true;
+    fs.writeFileSync(filePath, JSON.stringify(issueState, null, 2));
+    console.log('dead_letter:' + result.class);
+  }
 }
 ")
 
 case "$FAILURE_CLASS" in
+  advisory_degraded:*)
+    DEGRADED_AGENT=$(echo "$FAILURE_CLASS" | cut -d':' -f2)
+    echo "MGW: Advisory agent '${DEGRADED_AGENT}' failed — gracefully degraded, continuing pipeline."
+    # Do NOT set EXECUTION_SUCCEEDED=false — pipeline continues
+    # Skip to next step (advisory output is optional)
+    ;;
   retry:*)
     BACKOFF_MS=$(echo "$FAILURE_CLASS" | cut -d':' -f2)
     BACKOFF_SEC=$(( (BACKOFF_MS + 999) / 1000 ))
@@ -532,6 +591,9 @@ fi
    " 2>/dev/null || echo "")
    ```
 
+   <!-- mgw:criticality=critical  spawn_point=milestone-planner -->
+   <!-- Critical: phase planning is required — cannot execute without a plan.
+        On failure: retry via milestone retry loop, then dead-letter. -->
    ```
    Task(
      prompt="
@@ -595,6 +657,11 @@ fi
    EXEC_INIT=$(node ~/.claude/get-shit-done/bin/gsd-tools.cjs init execute-phase "${PHASE_NUMBER}")
    # Parse EXEC_INIT JSON for: executor_model, verifier_model, phase_dir, plans, incomplete_plans, plan_count
    ```
+
+   <!-- mgw:criticality=critical  spawn_point=milestone-executor -->
+   <!-- Critical: phase execution produces the code changes. Without it,
+        there is nothing to commit or PR. On failure: retry via milestone
+        retry loop, then dead-letter. -->
    ```
    Task(
      prompt="
@@ -640,6 +707,23 @@ fi
    ```
 
    **e. Spawn verifier agent (gsd:verify-phase):**
+
+   <!-- mgw:criticality=advisory  spawn_point=milestone-verifier -->
+   <!-- Advisory: phase verification is quality assurance after execution
+        completes. The code changes and commits already exist. If verification
+        fails, log a warning and proceed with a note that verification was
+        skipped for this phase.
+
+        Graceful degradation pattern:
+        ```
+        VERIFY_RESULT=$(wrapAdvisoryAgent(() => Task(...), 'milestone-verifier', {
+          issueNumber: ISSUE_NUMBER,
+          fallback: null
+        }))
+        # If fallback returned, create minimal VERIFICATION.md:
+        #   "## VERIFICATION SKIPPED\nVerifier agent unavailable for phase ${PHASE_NUMBER}."
+        ```
+   -->
    ```
    Task(
      prompt="
diff --git a/commands/run/pr-create.md b/commands/run/pr-create.md
index 6bff0c2..de4bc19 100644
--- a/commands/run/pr-create.md
+++ b/commands/run/pr-create.md
@@ -105,6 +105,11 @@ ic.assembleIssueContext(${ISSUE_NUMBER})
 
 Read issue state for context.
 
+<!-- mgw:criticality=critical  spawn_point=pr-creator -->
+<!-- Critical: PR creation is the pipeline's final output. Without it,
+     the entire pipeline run produces no deliverable. On failure: the
+     pipeline marks the issue as failed (no retry — PR creation errors
+     are typically permanent: branch conflicts, permissions, etc.). -->
 ```
 Task(
   prompt="
diff --git a/commands/run/triage.md b/commands/run/triage.md
index 70648e5..b42af9e 100644
--- a/commands/run/triage.md
+++ b/commands/run/triage.md
@@ -360,6 +360,20 @@ NEW_COMMENTS=$(gh issue view $ISSUE_NUMBER --json comments \
 ```
 
 2. **Spawn classification agent:**
+
+<!-- mgw:criticality=advisory  spawn_point=comment-classifier -->
+<!-- Advisory: comment classification failure does not block the pipeline.
+     If this agent fails, log a warning and treat all new comments as
+     informational (safe default — pipeline continues with stale data).
+
+     Graceful degradation pattern:
+     ```
+     CLASSIFICATION_RESULT=$(wrapAdvisoryAgent(() => Task(...), 'comment-classifier', {
+       issueNumber: ISSUE_NUMBER,
+       fallback: '{"classification":"informational","reasoning":"comment classifier unavailable","new_requirements":[],"blocking_reason":""}'
+     }))
+     ```
+-->
 ```
 Task(
   prompt="