17 changes: 13 additions & 4 deletions apps/backend/runners/github/services/followup_reviewer.py
@@ -886,7 +886,8 @@ async def _attempt_extraction_call(
"""Attempt a short SDK call with minimal schema to recover review data.

This is the extraction recovery step when full structured output validation fails.
-Uses FollowupExtractionResponse (~6 flat fields) which has near-100% success rate.
+Uses FollowupExtractionResponse (small schema with ExtractedFindingSummary nesting)
+which has near-100% success rate.

Uses create_client() + process_sdk_stream() for proper OAuth handling,
matching the pattern in parallel_followup_reviewer.py.
@@ -900,7 +901,8 @@ async def _attempt_extraction_call(
extraction_prompt = (
"Extract the key review data from the following AI analysis output. "
"Return the verdict, reasoning, resolved finding IDs, unresolved finding IDs, "
"one-line summaries of any new findings, and counts of confirmed/dismissed findings.\n\n"
"structured summaries of any new findings (including severity, description, file path, and line number), "
"and counts of confirmed/dismissed findings.\n\n"
f"--- AI ANALYSIS OUTPUT ---\n{text[:8000]}\n--- END ---"
)

@@ -946,9 +948,16 @@ async def _attempt_extraction_call(

# Convert extraction to internal format with reconstructed findings
new_findings = []
-for i, summary in enumerate(extracted.new_finding_summaries):
+for i, summary_obj in enumerate(extracted.new_finding_summaries):
new_findings.append(
-create_finding_from_summary(summary, i, id_prefix="FR")
+create_finding_from_summary(
+summary=summary_obj.description,
+index=i,
+id_prefix="FR",
+severity_override=summary_obj.severity,
+file=summary_obj.file,
+line=summary_obj.line,
+)
)

# Build finding_resolutions from extraction data for _apply_ai_resolutions
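For orientation, the tiered recovery flow described in these docstrings can be sketched as follows. This is a simplified illustration, not the project's code: the real Tier 2 issues a fresh SDK call with the smaller schema, whereas this sketch just re-validates the same text.

from pydantic import BaseModel, ValidationError

def recover_review_data(
    raw_text: str,
    full_model: type[BaseModel],
    extraction_model: type[BaseModel],
) -> BaseModel | None:
    # Tier 1: validate against the full structured-output schema.
    try:
        return full_model.model_validate_json(raw_text)
    except ValidationError:
        pass
    # Tier 2: fall back to the minimal extraction schema.
    try:
        return extraction_model.model_validate_json(raw_text)
    except ValidationError:
        return None  # Tier 3 (raw-text parsing) would follow here.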
21 changes: 15 additions & 6 deletions apps/backend/runners/github/services/parallel_followup_reviewer.py
@@ -1129,7 +1129,8 @@ async def _attempt_extraction_call(
"""Attempt a short SDK call with a minimal schema to recover review data.

This is the Tier 2 recovery step when full structured output validation fails.
-Uses FollowupExtractionResponse (~6 flat fields) which has near-100% success rate.
+Uses FollowupExtractionResponse (small schema with ExtractedFindingSummary nesting)
+which has near-100% success rate.

Returns parsed result dict on success, None on failure.
"""
@@ -1146,7 +1147,8 @@ async def _attempt_extraction_call(
extraction_prompt = (
"Extract the key review data from the following AI analysis output. "
"Return the verdict, reasoning, resolved finding IDs, unresolved finding IDs, "
"one-line summaries of any new findings, and counts of confirmed/dismissed findings.\n\n"
"structured summaries of any new findings (including severity, description, file path, and line number), "
"and counts of confirmed/dismissed findings.\n\n"
f"--- AI ANALYSIS OUTPUT ---\n{text[:8000]}\n--- END ---"
)

@@ -1205,10 +1207,17 @@ async def _attempt_extraction_call(
findings = []
new_finding_ids = []

-# 1. Convert new_finding_summaries to minimal PRReviewFinding objects
-# Uses shared helper for "SEVERITY: description" parsing and ID generation
-for i, summary in enumerate(extracted.new_finding_summaries):
-finding = create_finding_from_summary(summary, i, id_prefix="FU")
+# 1. Convert new_finding_summaries to PRReviewFinding objects
+# ExtractedFindingSummary objects carry file/line from extraction
+for i, summary_obj in enumerate(extracted.new_finding_summaries):
+finding = create_finding_from_summary(
+summary=summary_obj.description,
+index=i,
+id_prefix="FU",
+severity_override=summary_obj.severity,
+file=summary_obj.file,
+line=summary_obj.line,
+)
new_finding_ids.append(finding.id)
findings.append(finding)

22 changes: 19 additions & 3 deletions apps/backend/runners/github/services/pydantic_models.py
@@ -533,10 +533,26 @@ class FindingValidationResponse(BaseModel):
# =============================================================================


class ExtractedFindingSummary(BaseModel):
"""Per-finding summary with file location for extraction recovery."""

severity: str = Field(description="Severity level: LOW, MEDIUM, HIGH, or CRITICAL")
description: str = Field(description="One-line description of the finding")
file: str = Field(
default="unknown", description="File path where the issue was found"
)
line: int = Field(default=0, description="Line number in the file (0 if unknown)")
Comment on lines +536 to +544 (Contributor):

medium

The severity field in ExtractedFindingSummary is defined as a str. For better type safety and to enforce the allowed values at the schema level, it would be better to use typing.Literal. This would be consistent with other models in this file that use Literal for fields with a fixed set of values. To handle potential case variations from the AI, a field_validator can be added to normalize the severity to uppercase before validation.
class ExtractedFindingSummary(BaseModel):
    """Per-finding summary with file location for extraction recovery."""

    @field_validator("severity", mode="before")
    @classmethod
    def _normalize_severity_case(cls, v: str) -> str:
        if isinstance(v, str):
            return v.upper()
        return v

    severity: Literal["LOW", "MEDIUM", "HIGH", "CRITICAL"] = Field(description="Severity level: LOW, MEDIUM, HIGH, or CRITICAL")
    description: str = Field(description="One-line description of the finding")
    file: str = Field(
        default="unknown", description="File path where the issue was found"
    )
    line: int = Field(default=0, description="Line number in the file (0 if unknown)")
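A minimal sketch of the behavior this suggestion would give, assuming the Literal + field_validator version of the class above (the example values are illustrative):

from pydantic import ValidationError

# Mixed-case output from the model is normalized before Literal validation.
s = ExtractedFindingSummary(severity="high", description="missing null check")
assert s.severity == "HIGH"

# Values outside the Literal set now fail loudly instead of slipping through:
try:
    ExtractedFindingSummary(severity="INFO", description="x")
except ValidationError:
    pass  # rejected at the schema level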


@field_validator("severity", mode="before")
@classmethod
def _normalize_severity(cls, v: str) -> str:
return _normalize_severity(v)


class FollowupExtractionResponse(BaseModel):
"""Minimal extraction schema for recovering data when full structured output fails.

-Deliberately kept small (~6 fields, no nesting) for near-100% validation success.
+Uses ExtractedFindingSummary for new findings to preserve file/line information.
Used as an intermediate recovery step before falling back to raw text parsing.
"""

Expand All @@ -552,9 +568,9 @@ class FollowupExtractionResponse(BaseModel):
default_factory=list,
description="IDs of previous findings that remain unresolved",
)
-new_finding_summaries: list[str] = Field(
+new_finding_summaries: list[ExtractedFindingSummary] = Field(
default_factory=list,
-description="One-line summary of each new finding (e.g. 'HIGH: cleanup deletes QA-rejected specs in batch_commands.py')",
+description="Structured summary of each new finding with file location",

This comment was marked as outdated.

This comment was marked as outdated.

)
confirmed_finding_count: int = Field(
0, description="Number of findings confirmed as valid"
16 changes: 14 additions & 2 deletions apps/backend/runners/github/services/recovery_utils.py
@@ -80,6 +80,9 @@ def create_finding_from_summary(
summary: str,
index: int,
id_prefix: str = "FR",
severity_override: str | None = None,
file: str = "unknown",
line: int = 0,
) -> PRReviewFinding:
"""Create a PRReviewFinding from an extraction summary string.

@@ -90,11 +93,20 @@
summary: Raw summary string, e.g. "HIGH: Missing null check in parser.py"
index: The index of the finding in the extraction list.
id_prefix: ID prefix for traceability. Default "FR" (Followup Recovery).
severity_override: If provided, use this severity instead of parsing from summary.
file: File path where the issue was found (default "unknown").
line: Line number in the file (default 0).

Returns:
A PRReviewFinding with parsed severity, generated ID, and description.
"""
severity, description = parse_severity_from_summary(summary)

# Use severity_override if provided
if severity_override is not None:
severity_map = {k.rstrip(":"): v for k, v in _EXTRACTION_SEVERITY_MAP}
severity = severity_map.get(severity_override.upper(), severity)

finding_id = generate_recovery_finding_id(index, description, prefix=id_prefix)

return PRReviewFinding(
Expand All @@ -103,6 +115,6 @@ def create_finding_from_summary(
category=ReviewCategory.QUALITY,
title=description[:80],
description=f"[Recovered via extraction] {description}",
file="unknown",
line=0,
file=file,
line=line,
)
114 changes: 108 additions & 6 deletions tests/test_structured_output_recovery.py
@@ -18,17 +18,22 @@
# services/ package at both apps/backend/services/ and runners/github/services/.
# To avoid collision, add the github services dir directly and import bare module names.
_backend_dir = Path(__file__).parent.parent / "apps" / "backend"
-_github_services_dir = _backend_dir / "runners" / "github" / "services"
+_github_runner_dir = _backend_dir / "runners" / "github"
+_github_services_dir = _github_runner_dir / "services"
if str(_backend_dir) not in sys.path:
sys.path.insert(0, str(_backend_dir))
+if str(_github_runner_dir) not in sys.path:
+sys.path.insert(0, str(_github_runner_dir))
if str(_github_services_dir) not in sys.path:
sys.path.insert(0, str(_github_services_dir))
Comment on lines +21 to 28 (Contributor):

🧹 Nitpick | 🔵 Trivial

Fragile sys.path manipulation — pre-existing but growing.

This file now inserts three directories into sys.path. While this pattern was already in place, adding _github_runner_dir increases coupling to the repo layout. This is a pre-existing concern and not introduced by this PR, so no action needed now.

🤖 Prompt for AI Agents
In tests/test_structured_output_recovery.py around lines 21 - 28, the current sys.path manipulation adds three repo directories (_backend_dir, _github_runner_dir, _github_services_dir), which tightly couples the tests to the repo layout. Keep only the minimal _backend_dir insertion (it already guards with a membership check), and load modules from the runner/services paths via importlib (importlib.util.spec_from_file_location or importlib.machinery.SourceFileLoader) at runtime, so the tests no longer mutate sys.path for those two directories.
from agents.tools_pkg.models import AGENT_CONFIGS
from pydantic_models import (
ExtractedFindingSummary,
FollowupExtractionResponse,
ParallelFollowupResponse,
)
from recovery_utils import create_finding_from_summary
from sdk_utils import RECOVERABLE_ERRORS


@@ -53,20 +58,38 @@ def test_minimal_valid_response(self):
assert resp.dismissed_finding_count == 0

def test_full_valid_response(self):
"""Accepts fully populated response."""
"""Accepts fully populated response with ExtractedFindingSummary objects."""
resp = FollowupExtractionResponse(
verdict="READY_TO_MERGE",
verdict_reasoning="All findings resolved",
resolved_finding_ids=["NCR-001", "NCR-002"],
unresolved_finding_ids=[],
new_finding_summaries=["HIGH: potential cleanup issue in batch_commands.py"],
new_finding_summaries=[
ExtractedFindingSummary(
severity="HIGH",
description="potential cleanup issue in batch_commands.py",
file="apps/backend/cli/batch_commands.py",
line=42,
)
],
confirmed_finding_count=1,
dismissed_finding_count=1,
)
assert len(resp.resolved_finding_ids) == 2
assert len(resp.new_finding_summaries) == 1
assert resp.new_finding_summaries[0].file == "apps/backend/cli/batch_commands.py"
assert resp.new_finding_summaries[0].line == 42
assert resp.confirmed_finding_count == 1

def test_finding_summary_defaults(self):
"""ExtractedFindingSummary defaults file='unknown' and line=0."""
summary = ExtractedFindingSummary(
severity="MEDIUM",
description="Some issue without location",
)
assert summary.file == "unknown"
assert summary.line == 0

def test_schema_is_small(self):
"""Schema should be significantly smaller than ParallelFollowupResponse."""
extraction_schema = json.dumps(
@@ -75,10 +98,11 @@ def test_schema_is_small(self):
followup_schema = json.dumps(
ParallelFollowupResponse.model_json_schema()
)
-# Extraction schema should be less than half the size of the full schema
-assert len(extraction_schema) < len(followup_schema) / 2, (
+# Actual ratio is ~50.7% after adding ExtractedFindingSummary nesting.
+# Threshold at 55% gives headroom while still guarding against schema bloat.
+assert len(extraction_schema) < len(followup_schema) * 0.55, (
f"Extraction schema ({len(extraction_schema)} chars) should be "
-f"less than half of full schema ({len(followup_schema)} chars)"
+f"less than 55% of full schema ({len(followup_schema)} chars)"
)

def test_all_verdict_values_accepted(self):
@@ -143,3 +167,81 @@ def test_extraction_agent_low_thinking(self):
"""Extraction agent should use low thinking (lightweight call)."""
config = AGENT_CONFIGS["pr_followup_extraction"]
assert config["thinking_default"] == "low"


# ============================================================================
# Test create_finding_from_summary with file/line params
# ============================================================================


class TestCreateFindingFromSummary:
"""Tests for create_finding_from_summary with file/line support."""

def test_backward_compatible_defaults(self):
"""Calling without file/line still produces file='unknown', line=0."""
finding = create_finding_from_summary("HIGH: some issue", 0)
assert finding.file == "unknown"
assert finding.line == 0
assert finding.severity.value == "high"

def test_file_and_line_passed_through(self):
"""File and line params are used in the resulting finding."""
finding = create_finding_from_summary(
summary="Missing null check",
index=0,
file="src/parser.py",
line=42,
)
assert finding.file == "src/parser.py"
assert finding.line == 42

def test_severity_override(self):
"""severity_override takes precedence over parsed severity."""
finding = create_finding_from_summary(
summary="HIGH: some issue",
index=0,
severity_override="CRITICAL",
)
assert finding.severity.value == "critical"

def test_severity_override_case_insensitive(self):
"""severity_override works regardless of case."""
finding = create_finding_from_summary(
summary="some issue",
index=0,
severity_override="high",
)
assert finding.severity.value == "high"

def test_severity_override_invalid_falls_back(self):
"""Invalid severity_override falls back to parsed severity."""
finding = create_finding_from_summary(
summary="LOW: minor issue",
index=0,
severity_override="UNKNOWN",
)
# Falls back to parsed "LOW" from summary
assert finding.severity.value == "low"

def test_id_prefix(self):
"""Custom id_prefix is used in the finding ID."""
finding = create_finding_from_summary(
summary="some issue", index=0, id_prefix="FU"
)
assert finding.id.startswith("FU-")

def test_all_params_together(self):
"""All new params work together correctly."""
finding = create_finding_from_summary(
summary="Regex issue in subtask title truncation",
index=3,
id_prefix="FU",
severity_override="MEDIUM",
file="apps/backend/agents/planner.py",
line=187,
)
assert finding.id.startswith("FU-")
assert finding.severity.value == "medium"
assert finding.file == "apps/backend/agents/planner.py"
assert finding.line == 187
assert "Regex issue" in finding.title