8 changes: 6 additions & 2 deletions apps/backend/cli/qa_commands.py
@@ -101,8 +101,12 @@ def handle_qa_command(
         print("\n✅ Build already approved by QA.")
     else:
         completed, total = count_subtasks(spec_dir)
-        print(f"\n❌ Build not complete ({completed}/{total} subtasks).")
-        print("Complete all subtasks before running QA validation.")
+        print(
+            f"\n❌ Build not ready for QA ({completed}/{total} subtasks completed)."
+        )
+        print(
+            "All subtasks must reach a terminal state (completed, failed, or stuck) before running QA."
+        )
         return
 
     if has_human_feedback:
76 changes: 60 additions & 16 deletions apps/backend/core/progress.py
@@ -115,6 +115,65 @@ def is_build_complete(spec_dir: Path) -> bool:
     return total > 0 and completed == total
 
 
+def _load_stuck_subtask_ids(spec_dir: Path) -> set[str]:
+    """Load IDs of subtasks marked as stuck from attempt_history.json."""
+    stuck_subtask_ids: set[str] = set()
+    attempt_history_file = spec_dir / "memory" / "attempt_history.json"
+    if attempt_history_file.exists():
+        try:
+            with open(attempt_history_file, encoding="utf-8") as f:
+                attempt_history = json.load(f)
+            for entry in attempt_history.get("stuck_subtasks", []):
+                if "subtask_id" in entry:
+                    stuck_subtask_ids.add(entry["subtask_id"])
+        except (OSError, json.JSONDecodeError, UnicodeDecodeError):
+            # Corrupted attempt history is non-fatal; skip stuck-subtask filtering
+            pass
+    return stuck_subtask_ids
+
+
+def is_build_ready_for_qa(spec_dir: Path) -> bool:
+    """
+    Check if the build is ready for QA validation.
+
+    Unlike is_build_complete(), which requires all subtasks to be "completed",
+    this function considers the build ready when all subtasks have reached a
+    terminal state: completed, failed, or stuck (exhausted retries recorded in
+    attempt_history.json).
+
+    Args:
+        spec_dir: Directory containing implementation_plan.json
+
+    Returns:
+        True if all subtasks are in a terminal state, False otherwise
+    """
+    plan_file = spec_dir / "implementation_plan.json"
+    if not plan_file.exists():
+        return False
+
+    stuck_subtask_ids = _load_stuck_subtask_ids(spec_dir)
+
+    try:
+        with open(plan_file, encoding="utf-8") as f:
+            plan = json.load(f)
+
+        total = 0
+        terminal = 0
+
+        for phase in plan.get("phases", []):
+            for subtask in phase.get("subtasks", []):
+                total += 1
+                status = subtask.get("status", "pending")
+                subtask_id = subtask.get("id")
+
+                if status in ("completed", "failed") or subtask_id in stuck_subtask_ids:
+                    terminal += 1
+
+        return total > 0 and terminal == total
+
+    except (OSError, json.JSONDecodeError, UnicodeDecodeError):
+        return False
+
+
 def get_progress_percentage(spec_dir: Path) -> float:
     """
     Get the progress as a percentage.
@@ -420,22 +479,7 @@ def get_next_subtask(spec_dir: Path) -> dict | None:
     if not plan_file.exists():
         return None
 
-    # Load stuck subtasks from recovery manager's attempt history
-    stuck_subtask_ids = set()
-    attempt_history_file = spec_dir / "memory" / "attempt_history.json"
-    if attempt_history_file.exists():
-        try:
-            with open(attempt_history_file, encoding="utf-8") as f:
-                attempt_history = json.load(f)
-            # Collect IDs of subtasks marked as stuck
-            stuck_subtask_ids = {
-                entry["subtask_id"]
-                for entry in attempt_history.get("stuck_subtasks", [])
-                if "subtask_id" in entry
-            }
-        except (OSError, json.JSONDecodeError, UnicodeDecodeError):
-            # If we can't read the file, continue without stuck checking
-            pass
+    stuck_subtask_ids = _load_stuck_subtask_ids(spec_dir)
 
     try:
         with open(plan_file, encoding="utf-8") as f:
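To make the new readiness semantics concrete, here is a minimal, self-contained sketch of how `is_build_ready_for_qa()` treats the three terminal states, assuming the `implementation_plan.json` and `attempt_history.json` shapes shown above (the spec dir layout and the `in_progress` status value are illustrative, not taken from this diff):

```python
import json
from pathlib import Path
from tempfile import TemporaryDirectory

from core.progress import is_build_ready_for_qa  # import path assumed

with TemporaryDirectory() as tmp:
    spec_dir = Path(tmp)
    (spec_dir / "memory").mkdir()

    # One completed, one failed, one stuck subtask: every subtask is terminal,
    # so the build is ready for QA even though nothing more will "complete".
    plan = {
        "phases": [
            {
                "subtasks": [
                    {"id": "st-1", "status": "completed"},
                    {"id": "st-2", "status": "failed"},
                    {"id": "st-3", "status": "in_progress"},  # terminal only via the stuck list
                ]
            }
        ]
    }
    (spec_dir / "implementation_plan.json").write_text(json.dumps(plan), encoding="utf-8")

    history = {"stuck_subtasks": [{"subtask_id": "st-3", "reason": "exhausted retries"}]}
    (spec_dir / "memory" / "attempt_history.json").write_text(json.dumps(history), encoding="utf-8")

    assert is_build_ready_for_qa(spec_dir) is True
```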
2 changes: 2 additions & 0 deletions apps/backend/progress.py
@@ -14,6 +14,7 @@
     get_plan_summary,
     get_progress_percentage,
     is_build_complete,
+    is_build_ready_for_qa,
     print_build_complete_banner,
     print_paused_banner,
     print_progress_summary,
@@ -29,6 +30,7 @@
     "get_plan_summary",
     "get_progress_percentage",
     "is_build_complete",
+    "is_build_ready_for_qa",
     "print_build_complete_banner",
     "print_paused_banner",
     "print_progress_summary",
6 changes: 3 additions & 3 deletions apps/backend/qa/criteria.py
@@ -8,7 +8,7 @@
 import json
 from pathlib import Path
 
-from progress import is_build_complete
+from progress import is_build_ready_for_qa
 
 # =============================================================================
 # IMPLEMENTATION PLAN I/O
@@ -95,10 +95,10 @@ def should_run_qa(spec_dir: Path) -> bool:
     Determine if QA validation should run.
 
     QA should run when:
-    - All subtasks are completed
+    - All subtasks have reached a terminal state (completed, failed, or stuck)
     - QA has not yet approved
     """
-    if not is_build_complete(spec_dir):
+    if not is_build_ready_for_qa(spec_dir):
         return False
 
     if is_qa_approved(spec_dir):
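Taken together, the gate in `should_run_qa()` now reduces to two checks. A sketch of the full function body for clarity (the tail of the function is truncated in the diff above, so the final `return` is an assumption):

```python
def should_run_qa(spec_dir: Path) -> bool:
    # QA runs only once every subtask is terminal and QA has not yet signed off.
    if not is_build_ready_for_qa(spec_dir):
        return False
    if is_qa_approved(spec_dir):
        return False
    return True
```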
33 changes: 20 additions & 13 deletions apps/backend/qa/loop.py
@@ -28,7 +28,7 @@
     get_phase_model_betas,
 )
 from phase_event import ExecutionPhase, emit_phase
-from progress import count_subtasks, is_build_complete
+from progress import count_subtasks, is_build_ready_for_qa
🧹 Nitpick | 🔵 Trivial

Import of count_subtasks is retained but only used in the misleading progress message.

count_subtasks is still imported on line 31 alongside is_build_ready_for_qa. Once the progress message is fixed (see the comment on the progress message below), this import may become unused; keep that in mind when addressing the progress message.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@apps/backend/qa/loop.py` at line 31, the import brings in count_subtasks, but that function is only referenced in the misleading progress message. Either remove count_subtasks from the import (retaining is_build_ready_for_qa), or actually use it to compute and display a correct progress value in the message. If the message is fixed so the import becomes necessary, keep it; otherwise delete count_subtasks from the "from progress import count_subtasks, is_build_ready_for_qa" statement to avoid an unused import.

 from security.constants import PROJECT_DIR_ENV_VAR
 from task_logger import (
     LogPhase,
@@ -114,14 +114,25 @@ async def run_qa_validation_loop(
     # Initialize task logger for the validation phase
     task_logger = get_task_logger(spec_dir)
 
-    # Verify build is complete
-    if not is_build_complete(spec_dir):
-        debug_warning("qa_loop", "Build is not complete, cannot run QA")
-        print("\n❌ Build is not complete. Cannot run QA validation.")
-        completed, total = count_subtasks(spec_dir)
-        debug("qa_loop", "Build progress", completed=completed, total=total)
-        print(f"   Progress: {completed}/{total} subtasks completed")
-        return False
+    # Check if there's pending human feedback that needs to be processed
+    fix_request_file = spec_dir / "QA_FIX_REQUEST.md"
+    has_human_feedback = fix_request_file.exists()
+
+    # Human feedback takes priority — if the user explicitly asked to proceed,
+    # skip the build completeness gate entirely
+    if not has_human_feedback:
+        # Verify build is ready for QA (all subtasks in terminal state)
+        if not is_build_ready_for_qa(spec_dir):
+            debug_warning(
+                "qa_loop", "Build is not ready for QA - subtasks still in progress"
+            )
+            print("\n❌ Build is not ready for QA validation.")
+            completed, total = count_subtasks(spec_dir)
+            debug("qa_loop", "Build progress", completed=completed, total=total)
+            print(
+                f"   Progress: {completed}/{total} subtasks in terminal state (completed/failed/stuck)"
+            )
+            return False
Comment on lines 117 to 135
⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain (scripts executed):

#!/bin/bash
# Find the count_subtasks function definition
rg -n 'def count_subtasks' --type=py -A 15

# Also search for is_build_ready_for_qa to understand the terminal state logic
rg -n 'def is_build_ready_for_qa' --type=py -A 20

# Check what files are in the qa directory
find apps/backend/qa -type f -name "*.py" | head -20

# Get full count_subtasks implementation
sed -n '33,65p' apps/backend/core/progress.py

# Get full is_build_ready_for_qa implementation
sed -n '118,170p' apps/backend/core/progress.py

# Check count_subtasks_detailed full implementation
sed -n '66,117p' apps/backend/core/progress.py

# Get the context around line 130 in loop.py
sed -n '115,135p' apps/backend/qa/loop.py

# Check if count_subtasks_detailed is used anywhere in loop.py or related files
rg -n 'count_subtasks_detailed' --type=py apps/backend/qa/
Update the progress message to reflect terminal state semantics.

When the QA readiness gate fails (line 130), the message shows count_subtasks() output which reports only completed subtasks. However, is_build_ready_for_qa() checks for terminal states (completed, failed, or stuck). This creates a mismatch: if 3 of 5 subtasks are completed, 1 failed, and 1 in-progress, the user sees "Progress: 3/5 subtasks completed" — which doesn't explain why QA is blocked.

Either update the message to show terminal/total count (e.g., "4/5 subtasks in terminal state") to align with the gate's semantics, or use count_subtasks_detailed() to show the breakdown of non-terminal subtasks.

🤖 Prompt for AI Agents
In `@apps/backend/qa/loop.py` around lines 117-133, the QA readiness failure message currently prints only completed subtasks via count_subtasks(), which mismatches is_build_ready_for_qa()'s terminal-state logic. Update the failure branch to call count_subtasks_detailed(spec_dir) (or otherwise compute terminal states) and print a progress line like "Progress: {terminal}/{total} subtasks in terminal state", optionally with a breakdown (completed/failed/stuck), so the message aligns with is_build_ready_for_qa() semantics. Keep debug_warning("qa_loop", ...) and debug("qa_loop", ...) as-is, but use the detailed counts when formatting the user-facing print.


     # Emit phase event at start of QA validation (before any early returns)
     emit_phase(ExecutionPhase.QA_REVIEW, "Starting QA validation")
@@ -136,10 +147,6 @@
         f"[Fast Mode] {'ENABLED' if fast_mode else 'disabled'} for QA validation",
     )
 
-    # Check if there's pending human feedback that needs to be processed
-    fix_request_file = spec_dir / "QA_FIX_REQUEST.md"
-    has_human_feedback = fix_request_file.exists()
-
     # Check if already approved - but if there's human feedback, we need to process it first
     if is_qa_approved(spec_dir) and not has_human_feedback:
         debug_success("qa_loop", "Build already approved by QA")
32 changes: 32 additions & 0 deletions apps/backend/services/recovery.py
@@ -21,6 +21,8 @@
 from enum import Enum
 from pathlib import Path
 
+from core.file_utils import write_json_atomic
+
 # Recovery manager configuration
 ATTEMPT_WINDOW_SECONDS = 7200  # Only count attempts within last 2 hours
 MAX_ATTEMPT_HISTORY_PER_SUBTASK = 50  # Cap stored attempts per subtask
@@ -514,6 +516,36 @@ def mark_subtask_stuck(self, subtask_id: str, reason: str) -> None:
 
         self._save_attempt_history(history)
 
+        # Also update the subtask status in implementation_plan.json
+        # so that other callers (like is_build_ready_for_qa) see accurate status
+        try:
+            plan_file = self.spec_dir / "implementation_plan.json"
+            if plan_file.exists():
+                with open(plan_file, encoding="utf-8") as f:
+                    plan = json.load(f)
+
+                updated = False
+                for phase in plan.get("phases", []):
+                    for subtask in phase.get("subtasks", []):
+                        if subtask.get("id") == subtask_id:
+                            subtask["status"] = "failed"
+                            stuck_note = f"Marked as stuck: {reason}"
+                            existing = subtask.get("actual_output", "")
+                            subtask["actual_output"] = (
+                                f"{stuck_note}\n{existing}" if existing else stuck_note
+                            )
+                            updated = True
+                            break
+                    if updated:
+                        break
+
+                if updated:
+                    write_json_atomic(plan_file, plan, indent=2)
+        except (OSError, json.JSONDecodeError, UnicodeDecodeError) as e:
+            logger.warning(
+                f"Failed to update implementation_plan.json for stuck subtask {subtask_id}: {e}"
+            )
Comment on lines +519 to +547
🧹 Nitpick | 🔵 Trivial

Variable existing shadows outer scope on line 533.

Line 507 defines existing (the list of stuck entries matching subtask_id), and line 533 redefines existing (the actual_output string). While not a bug, since the outer existing is no longer needed, it hurts readability. Consider renaming the inner one, e.g. prior_output.

Proposed rename

-                            existing = subtask.get("actual_output", "")
+                            prior_output = subtask.get("actual_output", "")
                             subtask["actual_output"] = (
-                                f"{stuck_note}\n{existing}" if existing else stuck_note
+                                f"{stuck_note}\n{prior_output}" if prior_output else stuck_note
                             )
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@apps/backend/services/recovery.py` around lines 519-547, the inner variable named existing in the implementation_plan.json update block shadows an outer variable also named existing (the list of stuck entries). Rename the inner variable (e.g., to prior_output) and update its usage in the f-string and the subtask["actual_output"] assignment so the outer existing remains unshadowed and the intent is clearer. Locate the loop that iterates phase.get("subtasks", []) and change the inner existing to prior_output in every reference within that scope, including the conditional f"{stuck_note}\n{existing}" if existing else stuck_note.


     def get_stuck_subtasks(self) -> list[dict]:
         """
         Get all subtasks marked as stuck.
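The plan update leans on write_json_atomic() from core.file_utils, whose implementation is not part of this diff. Atomic JSON writes of this kind are conventionally done write-to-temp-then-rename, so readers of implementation_plan.json never observe a half-written file. A sketch under that assumption (not the actual helper):

```python
import json
import os
import tempfile
from pathlib import Path

def write_json_atomic(path: Path, data: dict, indent: int = 2) -> None:
    # Write to a temp file in the same directory, then atomically replace the
    # target; os.replace is atomic for same-volume paths on POSIX and Windows.
    fd, tmp_name = tempfile.mkstemp(dir=path.parent, suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=indent)
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_name, path)
    except BaseException:
        # Clean up the temp file if anything failed before the rename.
        try:
            os.unlink(tmp_name)
        except FileNotFoundError:
            pass
        raise
```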
20 changes: 10 additions & 10 deletions tests/agents/test_agent_flow.py
@@ -922,8 +922,8 @@ class TestQALoopStateTransitions:
     def test_qa_not_required_when_build_incomplete(self, test_env):
         """QA should not run when build is incomplete."""
         from qa_loop import save_implementation_plan
-        # Import the real is_build_complete to patch at the right level
-        from core.progress import is_build_complete as real_is_build_complete
+        # Import the real is_build_ready_for_qa to patch at the right level
+        from core.progress import is_build_ready_for_qa as real_is_build_ready_for_qa
 
         temp_dir, spec_dir, project_dir = test_env
 
@@ -943,16 +943,16 @@ def test_qa_not_required_when_build_incomplete(self, test_env):
         }
         save_implementation_plan(spec_dir, plan)
 
-        # Patch is_build_complete where it's used (qa.criteria) to use real implementation
+        # Patch is_build_ready_for_qa where it's used (qa.criteria) to use real implementation
         # This is needed because test_qa_criteria.py module-level mocks may pollute
-        with patch('qa.criteria.is_build_complete', side_effect=real_is_build_complete):
+        with patch('qa.criteria.is_build_ready_for_qa', side_effect=real_is_build_ready_for_qa):
             from qa.criteria import should_run_qa
             assert should_run_qa(spec_dir) is False, "QA should not run with pending subtasks"
 
     def test_qa_required_when_build_complete(self, test_env):
         """QA should run when build is complete and not yet approved."""
         from qa_loop import save_implementation_plan
-        from core.progress import is_build_complete as real_is_build_complete
+        from core.progress import is_build_ready_for_qa as real_is_build_ready_for_qa
 
         temp_dir, spec_dir, project_dir = test_env
 
@@ -972,15 +972,15 @@ def test_qa_required_when_build_complete(self, test_env):
         }
         save_implementation_plan(spec_dir, plan)
 
-        # Patch is_build_complete where it's used (qa.criteria) to use real implementation
-        with patch('qa.criteria.is_build_complete', side_effect=real_is_build_complete):
+        # Patch is_build_ready_for_qa where it's used (qa.criteria) to use real implementation
+        with patch('qa.criteria.is_build_ready_for_qa', side_effect=real_is_build_ready_for_qa):
             from qa.criteria import should_run_qa
             assert should_run_qa(spec_dir) is True, "QA should run when build complete"
 
     def test_qa_not_required_when_already_approved(self, test_env):
         """QA should not run when build is already approved."""
         from qa_loop import save_implementation_plan
-        from core.progress import is_build_complete as real_is_build_complete
+        from core.progress import is_build_ready_for_qa as real_is_build_ready_for_qa
 
         temp_dir, spec_dir, project_dir = test_env
 
@@ -1003,8 +1003,8 @@ def test_qa_required_when_build_complete(self, test_env):
         }
         save_implementation_plan(spec_dir, plan)
 
-        # Patch is_build_complete where it's used (qa.criteria) to use real implementation
-        with patch('qa.criteria.is_build_complete', side_effect=real_is_build_complete):
+        # Patch is_build_ready_for_qa where it's used (qa.criteria) to use real implementation
+        with patch('qa.criteria.is_build_ready_for_qa', side_effect=real_is_build_ready_for_qa):
             from qa.criteria import should_run_qa
             assert should_run_qa(spec_dir) is False, "QA should not run when already approved"
2 changes: 1 addition & 1 deletion tests/test_cli_qa_commands.py
@@ -284,7 +284,7 @@ def test_incomplete_build_message(
         )
 
         captured = capsys.readouterr()
-        assert "Build not complete" in captured.out
+        assert "Build not ready for QA" in captured.out
         assert "1/2" in captured.out
 
     def test_processes_human_feedback(