8 changes: 6 additions & 2 deletions apps/backend/cli/qa_commands.py
@@ -101,8 +101,12 @@ def handle_qa_command(
print("\n✅ Build already approved by QA.")
else:
completed, total = count_subtasks(spec_dir)
print(f"\n❌ Build not complete ({completed}/{total} subtasks).")
print("Complete all subtasks before running QA validation.")
print(
f"\n❌ Build not ready for QA ({completed}/{total} subtasks completed)."
)
print(
"All subtasks must reach a terminal state (completed, failed, or stuck) before running QA."
)
return

if has_human_feedback:
55 changes: 55 additions & 0 deletions apps/backend/core/progress.py
@@ -115,6 +115,61 @@
    return total > 0 and completed == total


def is_build_ready_for_qa(spec_dir: Path) -> bool:
    """
    Check if the build is ready for QA validation.

    Unlike is_build_complete() which requires all subtasks to be "completed",
    this function considers the build ready when all subtasks have reached
    a terminal state: completed, failed, or stuck (exhausted retries in attempt_history.json).

    Args:
        spec_dir: Directory containing implementation_plan.json

    Returns:
        True if all subtasks are in a terminal state, False otherwise
    """
    plan_file = spec_dir / "implementation_plan.json"
    if not plan_file.exists():
        return False

    # Load stuck subtask IDs from attempt_history.json
    stuck_subtask_ids = set()
    attempt_history_file = spec_dir / "memory" / "attempt_history.json"
    if attempt_history_file.exists():
        try:
            with open(attempt_history_file, encoding="utf-8") as f:
                attempt_history = json.load(f)
            stuck_subtask_ids = {
                entry["subtask_id"]
                for entry in attempt_history.get("stuck_subtasks", [])
                if "subtask_id" in entry
            }
        except (OSError, json.JSONDecodeError, UnicodeDecodeError):

Check notice (Code scanning / CodeQL): Empty except

'except' clause does nothing but pass and there is no explanatory comment.

Copilot Autofix

In general, empty except blocks should either be removed (letting the exception propagate) or should perform some explicit handling such as logging, metrics, or substituting a safe default with a clear explanation. If ignoring an exception is truly desired, there should at least be a comment and usually a log statement to document that decision.

For this specific case in is_build_ready_for_qa, we want to keep the existing functional behavior: if attempt_history.json cannot be read or parsed, the function should behave as though there are no stuck subtasks (stuck_subtask_ids remains an empty set) and continue. The best fix is therefore:

  • In the except (OSError, json.JSONDecodeError, UnicodeDecodeError): block around loading attempt_history_file, add a call to the existing logger to record the failure, and optionally a brief doc comment that it is intentionally falling back to assuming no stuck subtasks.
  • Keep returning with the same logic downstream; do not re‑raise.

Only the block at lines 140–149 in apps/backend/core/progress.py needs to change. No new imports are required because logging and logger are already present at the top of the file.

Suggested changeset 1: apps/backend/core/progress.py

Autofix patch. Run the following command in your local git repository to apply this patch:
cat << 'EOF' | git apply
diff --git a/apps/backend/core/progress.py b/apps/backend/core/progress.py
--- a/apps/backend/core/progress.py
+++ b/apps/backend/core/progress.py
@@ -145,8 +145,13 @@
                 for entry in attempt_history.get("stuck_subtasks", [])
                 if "subtask_id" in entry
             }
-        except (OSError, json.JSONDecodeError, UnicodeDecodeError):
-            pass
+        except (OSError, json.JSONDecodeError, UnicodeDecodeError) as exc:
+            # Best-effort: if attempt history cannot be read, proceed assuming no stuck subtasks.
+            logger.warning(
+                "Failed to load attempt history from %s: %s",
+                attempt_history_file,
+                exc,
+            )
 
     try:
         with open(plan_file, encoding="utf-8") as f:
EOF
            pass
Comment on lines +136 to +149

🛠️ Refactor suggestion | 🟠 Major

Duplicated stuck-subtask loading logic — extract a shared helper.

The block that reads attempt_history.json and builds stuck_subtask_ids is duplicated almost verbatim at lines 479–493 in get_next_subtask(). Extracting a shared helper reduces maintenance burden and ensures both callers stay in sync.

Proposed helper
+def _load_stuck_subtask_ids(spec_dir: Path) -> set[str]:
+    """Load IDs of subtasks marked as stuck from attempt_history.json."""
+    stuck_subtask_ids: set[str] = set()
+    attempt_history_file = spec_dir / "memory" / "attempt_history.json"
+    if attempt_history_file.exists():
+        try:
+            with open(attempt_history_file, encoding="utf-8") as f:
+                attempt_history = json.load(f)
+            stuck_subtask_ids = {
+                entry["subtask_id"]
+                for entry in attempt_history.get("stuck_subtasks", [])
+                if "subtask_id" in entry
+            }
+        except (OSError, json.JSONDecodeError, UnicodeDecodeError):
+            pass
+    return stuck_subtask_ids

Then replace both call sites with stuck_subtask_ids = _load_stuck_subtask_ids(spec_dir).
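
For illustration, the call site at the top of is_build_ready_for_qa would then reduce to something like the sketch below (mirroring the code shown above; the same one-liner would replace the duplicated block in get_next_subtask):

    plan_file = spec_dir / "implementation_plan.json"
    if not plan_file.exists():
        return False

    # Stuck IDs now come from the shared helper instead of inline JSON loading
    stuck_subtask_ids = _load_stuck_subtask_ids(spec_dir)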

🤖 Prompt for AI Agents
In `@apps/backend/core/progress.py` around lines 136 - 149, The duplicated logic
that reads attempt_history.json and builds stuck_subtask_ids appears in two
places (the shown block and inside get_next_subtask); extract that logic into a
single helper function named _load_stuck_subtask_ids(spec_dir) that returns a
set of subtask IDs, move the try/except, file path construction (spec_dir /
"memory" / "attempt_history.json"), JSON loading, and the comprehension into
that helper, keep the same exception handling (OSError, json.JSONDecodeError,
UnicodeDecodeError) and return an empty set on error, then replace both call
sites with stuck_subtask_ids = _load_stuck_subtask_ids(spec_dir) so both callers
use the shared implementation.


    try:
        with open(plan_file, encoding="utf-8") as f:
            plan = json.load(f)

        total = 0
        terminal = 0

        for phase in plan.get("phases", []):
            for subtask in phase.get("subtasks", []):
                total += 1
                status = subtask.get("status", "pending")
                subtask_id = subtask.get("id")

                if status in ("completed", "failed") or subtask_id in stuck_subtask_ids:
                    terminal += 1

        return total > 0 and terminal == total

    except (OSError, json.JSONDecodeError, UnicodeDecodeError):
        return False


def get_progress_percentage(spec_dir: Path) -> float:
"""
Get the progress as a percentage.
Expand Down
2 changes: 2 additions & 0 deletions apps/backend/progress.py
@@ -14,6 +14,7 @@
    get_plan_summary,
    get_progress_percentage,
    is_build_complete,
    is_build_ready_for_qa,
    print_build_complete_banner,
    print_paused_banner,
    print_progress_summary,
@@ -29,6 +30,7 @@
    "get_plan_summary",
    "get_progress_percentage",
    "is_build_complete",
    "is_build_ready_for_qa",
    "print_build_complete_banner",
    "print_paused_banner",
    "print_progress_summary",
6 changes: 3 additions & 3 deletions apps/backend/qa/criteria.py
@@ -8,7 +8,7 @@
import json
from pathlib import Path

-from progress import is_build_complete
+from progress import is_build_ready_for_qa

# =============================================================================
# IMPLEMENTATION PLAN I/O
@@ -95,10 +95,10 @@ def should_run_qa(spec_dir: Path) -> bool:
    Determine if QA validation should run.

    QA should run when:
-    - All subtasks are completed
+    - All subtasks have reached a terminal state (completed, failed, or stuck)
    - QA has not yet approved
    """
-    if not is_build_complete(spec_dir):
+    if not is_build_ready_for_qa(spec_dir):
        return False

    if is_qa_approved(spec_dir):
31 changes: 18 additions & 13 deletions apps/backend/qa/loop.py
@@ -28,7 +28,7 @@
    get_phase_model_betas,
)
from phase_event import ExecutionPhase, emit_phase
-from progress import count_subtasks, is_build_complete
+from progress import count_subtasks, is_build_ready_for_qa
from security.constants import PROJECT_DIR_ENV_VAR
from task_logger import (
    LogPhase,
@@ -114,14 +114,23 @@ async def run_qa_validation_loop(
    # Initialize task logger for the validation phase
    task_logger = get_task_logger(spec_dir)

-    # Verify build is complete
-    if not is_build_complete(spec_dir):
-        debug_warning("qa_loop", "Build is not complete, cannot run QA")
-        print("\n❌ Build is not complete. Cannot run QA validation.")
-        completed, total = count_subtasks(spec_dir)
-        debug("qa_loop", "Build progress", completed=completed, total=total)
-        print(f" Progress: {completed}/{total} subtasks completed")
-        return False
+    # Check if there's pending human feedback that needs to be processed
+    fix_request_file = spec_dir / "QA_FIX_REQUEST.md"
+    has_human_feedback = fix_request_file.exists()
+
+    # Human feedback takes priority — if the user explicitly asked to proceed,
+    # skip the build completeness gate entirely
+    if not has_human_feedback:
+        # Verify build is ready for QA (all subtasks in terminal state)
+        if not is_build_ready_for_qa(spec_dir):
+            debug_warning(
+                "qa_loop", "Build is not ready for QA - subtasks still in progress"
+            )
+            print("\n❌ Build is not ready for QA validation.")
+            completed, total = count_subtasks(spec_dir)
+            debug("qa_loop", "Build progress", completed=completed, total=total)
+            print(f" Progress: {completed}/{total} subtasks completed")
+            return False
Comment on lines +117 to +133

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Find the count_subtasks function definition
rg -n 'def count_subtasks' --type=py -A 15

Repository: AndyMik90/Auto-Claude

Length of output: 1873


🏁 Script executed:

#!/bin/bash
# Also search for is_build_ready_for_qa to understand the terminal state logic
rg -n 'def is_build_ready_for_qa' --type=py -A 20

Repository: AndyMik90/Auto-Claude

Length of output: 1568


🏁 Script executed:

#!/bin/bash
# Check what files are in the qa directory
find apps/backend/qa -type f -name "*.py" | head -20

Repository: AndyMik90/Auto-Claude

Length of output: 250


🏁 Script executed:

#!/bin/bash
# Get full count_subtasks implementation
sed -n '33,65p' apps/backend/core/progress.py

Repository: AndyMik90/Auto-Claude

Length of output: 911


🏁 Script executed:

#!/bin/bash
# Get full is_build_ready_for_qa implementation
sed -n '118,170p' apps/backend/core/progress.py

Repository: AndyMik90/Auto-Claude

Length of output: 1926


🏁 Script executed:

#!/bin/bash
# Check count_subtasks_detailed full implementation
sed -n '66,117p' apps/backend/core/progress.py

Repository: AndyMik90/Auto-Claude

Length of output: 1370


🏁 Script executed:

#!/bin/bash
# Get the context around line 130 in loop.py
sed -n '115,135p' apps/backend/qa/loop.py

Repository: AndyMik90/Auto-Claude

Length of output: 1087


🏁 Script executed:

#!/bin/bash
# Check if count_subtasks_detailed is used anywhere in loop.py or related files
rg -n 'count_subtasks_detailed' --type=py apps/backend/qa/

Repository: AndyMik90/Auto-Claude

Length of output: 47


Update the progress message to reflect terminal state semantics.

When the QA readiness gate fails (line 130), the message shows count_subtasks() output which reports only completed subtasks. However, is_build_ready_for_qa() checks for terminal states (completed, failed, or stuck). This creates a mismatch: if 3 of 5 subtasks are completed, 1 failed, and 1 in-progress, the user sees "Progress: 3/5 subtasks completed" — which doesn't explain why QA is blocked.

Either update the message to show terminal/total count (e.g., "4/5 subtasks in terminal state") to align with the gate's semantics, or use count_subtasks_detailed() to show the breakdown of non-terminal subtasks.
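
A rough sketch of what the aligned failure branch could look like. count_subtasks_detailed() is referenced by this review but its return shape is not shown in the diff, so the dictionary keys below are placeholders (and the function would also need to be imported from progress):

    if not is_build_ready_for_qa(spec_dir):
        # Assumed shape: count_subtasks_detailed() exposes per-status counts
        counts = count_subtasks_detailed(spec_dir)
        total = counts.get("total", 0)
        terminal = (
            counts.get("completed", 0)
            + counts.get("failed", 0)
            + counts.get("stuck", 0)
        )
        debug_warning(
            "qa_loop", "Build is not ready for QA - subtasks still in progress"
        )
        print("\n❌ Build is not ready for QA validation.")
        print(f" Progress: {terminal}/{total} subtasks in terminal state")
        print(
            f" Breakdown: {counts.get('completed', 0)} completed, "
            f"{counts.get('failed', 0)} failed, {counts.get('stuck', 0)} stuck"
        )
        return False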

🤖 Prompt for AI Agents
In `@apps/backend/qa/loop.py` around lines 117 - 133, The QA readiness failure
message currently prints only completed subtasks using count_subtasks(), which
mismatches is_build_ready_for_qa()'s terminal-state logic; update the failure
branch in qa_loop to call count_subtasks_detailed(spec_dir) (or otherwise
compute terminal states) and print a progress line like "Progress:
{terminal}/{total} subtasks in terminal state" and optionally include a
breakdown (completed/failed/stuck) so the message aligns with
is_build_ready_for_qa() semantics; keep debug_warning("qa_loop", ...) and
debug("qa_loop", ...) as-is but use the detailed counts when formatting the
user-facing print.


    # Emit phase event at start of QA validation (before any early returns)
    emit_phase(ExecutionPhase.QA_REVIEW, "Starting QA validation")
@@ -136,10 +145,6 @@ async def run_qa_validation_loop(
        f"[Fast Mode] {'ENABLED' if fast_mode else 'disabled'} for QA validation",
    )

-    # Check if there's pending human feedback that needs to be processed
-    fix_request_file = spec_dir / "QA_FIX_REQUEST.md"
-    has_human_feedback = fix_request_file.exists()
-
    # Check if already approved - but if there's human feedback, we need to process it first
    if is_qa_approved(spec_dir) and not has_human_feedback:
        debug_success("qa_loop", "Build already approved by QA")
27 changes: 27 additions & 0 deletions apps/backend/services/recovery.py
@@ -514,6 +514,33 @@ def mark_subtask_stuck(self, subtask_id: str, reason: str) -> None:

        self._save_attempt_history(history)

        # Also update the subtask status in implementation_plan.json
        # so that other callers (like is_build_ready_for_qa) see accurate status
        try:
            plan_file = self.spec_dir / "implementation_plan.json"
            if plan_file.exists():
                with open(plan_file, encoding="utf-8") as f:
                    plan = json.load(f)

                updated = False
                for phase in plan.get("phases", []):
                    for subtask in phase.get("subtasks", []):
                        if subtask.get("id") == subtask_id:
                            subtask["status"] = "failed"
                            subtask["notes"] = f"Marked as stuck: {reason}"
Comment on lines +529 to +530

high

The introduction of a new notes field here is inconsistent with the Subtask data model defined in implementation_plan/subtask.py. The Subtask class does not have a notes field, which means this data will be lost if the implementation_plan.json is ever loaded and resaved using the ImplementationPlan model objects.

To maintain data model consistency, I recommend using the existing actual_output field, which seems suitable for this kind of information. The Subtask.fail() method already uses this field for failure reasons.

Please also update the new test test_mark_subtask_stuck_updates_plan in tests/test_recovery.py to assert on the actual_output field instead of notes.

Suggested change
-                            subtask["status"] = "failed"
-                            subtask["notes"] = f"Marked as stuck: {reason}"
+                            subtask["status"] = "failed"
+                            subtask["actual_output"] = f"Marked as stuck: {reason}"

                            updated = True
                            break
                    if updated:
                        break

                if updated:
                    with open(plan_file, "w", encoding="utf-8") as f:
                        json.dump(plan, f, indent=2)
        except (OSError, json.JSONDecodeError, UnicodeDecodeError) as e:
            logger.warning(
                f"Failed to update implementation_plan.json for stuck subtask {subtask_id}: {e}"
            )
Comment on lines +517 to +542

⚠️ Potential issue | 🟡 Minor

Overwrites existing notes field — consider preserving prior notes.

Line 530 unconditionally sets subtask["notes"], discarding any pre-existing value. If subtasks carry notes from earlier stages (e.g., planner annotations), that context is silently lost.

Proposed fix: prepend stuck reason to existing notes
-                            subtask["notes"] = f"Marked as stuck: {reason}"
+                            stuck_note = f"Marked as stuck: {reason}"
+                            existing = subtask.get("notes", "")
+                            subtask["notes"] = (
+                                f"{stuck_note}\n{existing}" if existing else stuck_note
+                            )
🤖 Prompt for AI Agents
In `@apps/backend/services/recovery.py` around lines 517 - 542, The current
implementation in the block that opens implementation_plan.json unconditionally
overwrites subtask["notes"] when marking a subtask failed (in the loop
referencing self.spec_dir, subtask_id and plan["phases"]/subtasks), which loses
prior notes; change it to preserve existing notes by prepending or appending the
"Marked as stuck: {reason}" text to the existing subtask.get("notes", "") (e.g.,
build a new_notes variable that combines the new marker and the prior notes with
a separator only if prior notes exist), then set subtask["notes"] = new_notes
before writing the updated plan back to disk, leaving the rest of the error
handling (logger.warning on exceptions) intact.


    def get_stuck_subtasks(self) -> list[dict]:
        """
        Get all subtasks marked as stuck.
20 changes: 10 additions & 10 deletions tests/agents/test_agent_flow.py
@@ -922,8 +922,8 @@ class TestQALoopStateTransitions:
    def test_qa_not_required_when_build_incomplete(self, test_env):
        """QA should not run when build is incomplete."""
        from qa_loop import save_implementation_plan
-        # Import the real is_build_complete to patch at the right level
-        from core.progress import is_build_complete as real_is_build_complete
+        # Import the real is_build_ready_for_qa to patch at the right level
+        from core.progress import is_build_ready_for_qa as real_is_build_ready_for_qa

        temp_dir, spec_dir, project_dir = test_env

Expand All @@ -943,16 +943,16 @@ def test_qa_not_required_when_build_incomplete(self, test_env):
}
save_implementation_plan(spec_dir, plan)

# Patch is_build_complete where it's used (qa.criteria) to use real implementation
# Patch is_build_ready_for_qa where it's used (qa.criteria) to use real implementation
# This is needed because test_qa_criteria.py module-level mocks may pollute
with patch('qa.criteria.is_build_complete', side_effect=real_is_build_complete):
with patch('qa.criteria.is_build_ready_for_qa', side_effect=real_is_build_ready_for_qa):
from qa.criteria import should_run_qa
assert should_run_qa(spec_dir) is False, "QA should not run with pending subtasks"

    def test_qa_required_when_build_complete(self, test_env):
        """QA should run when build is complete and not yet approved."""
        from qa_loop import save_implementation_plan
-        from core.progress import is_build_complete as real_is_build_complete
+        from core.progress import is_build_ready_for_qa as real_is_build_ready_for_qa

        temp_dir, spec_dir, project_dir = test_env

@@ -972,15 +972,15 @@ def test_qa_required_when_build_complete(self, test_env):
        }
        save_implementation_plan(spec_dir, plan)

-        # Patch is_build_complete where it's used (qa.criteria) to use real implementation
-        with patch('qa.criteria.is_build_complete', side_effect=real_is_build_complete):
+        # Patch is_build_ready_for_qa where it's used (qa.criteria) to use real implementation
+        with patch('qa.criteria.is_build_ready_for_qa', side_effect=real_is_build_ready_for_qa):
            from qa.criteria import should_run_qa
            assert should_run_qa(spec_dir) is True, "QA should run when build complete"

    def test_qa_not_required_when_already_approved(self, test_env):
        """QA should not run when build is already approved."""
        from qa_loop import save_implementation_plan
-        from core.progress import is_build_complete as real_is_build_complete
+        from core.progress import is_build_ready_for_qa as real_is_build_ready_for_qa

        temp_dir, spec_dir, project_dir = test_env

@@ -1003,8 +1003,8 @@ def test_qa_not_required_when_already_approved(self, test_env):
        }
        save_implementation_plan(spec_dir, plan)

-        # Patch is_build_complete where it's used (qa.criteria) to use real implementation
-        with patch('qa.criteria.is_build_complete', side_effect=real_is_build_complete):
+        # Patch is_build_ready_for_qa where it's used (qa.criteria) to use real implementation
+        with patch('qa.criteria.is_build_ready_for_qa', side_effect=real_is_build_ready_for_qa):
            from qa.criteria import should_run_qa
            assert should_run_qa(spec_dir) is False, "QA should not run when already approved"

2 changes: 1 addition & 1 deletion tests/test_cli_qa_commands.py
@@ -284,7 +284,7 @@ def test_incomplete_build_message(
        )

        captured = capsys.readouterr()
-        assert "Build not complete" in captured.out
+        assert "Build not ready for QA" in captured.out
        assert "1/2" in captured.out

    def test_processes_human_feedback(