diff --git a/apps/backend/runners/github/services/followup_reviewer.py b/apps/backend/runners/github/services/followup_reviewer.py index 56726676e9..b9cb1b5dd9 100644 --- a/apps/backend/runners/github/services/followup_reviewer.py +++ b/apps/backend/runners/github/services/followup_reviewer.py @@ -886,7 +886,8 @@ async def _attempt_extraction_call( """Attempt a short SDK call with minimal schema to recover review data. This is the extraction recovery step when full structured output validation fails. - Uses FollowupExtractionResponse (~6 flat fields) which has near-100% success rate. + Uses FollowupExtractionResponse (small schema with ExtractedFindingSummary nesting) + which has near-100% success rate. Uses create_client() + process_sdk_stream() for proper OAuth handling, matching the pattern in parallel_followup_reviewer.py. @@ -900,7 +901,8 @@ async def _attempt_extraction_call( extraction_prompt = ( "Extract the key review data from the following AI analysis output. " "Return the verdict, reasoning, resolved finding IDs, unresolved finding IDs, " - "one-line summaries of any new findings, and counts of confirmed/dismissed findings.\n\n" + "structured summaries of any new findings (including severity, description, file path, and line number), " + "and counts of confirmed/dismissed findings.\n\n" f"--- AI ANALYSIS OUTPUT ---\n{text[:8000]}\n--- END ---" ) @@ -946,9 +948,16 @@ async def _attempt_extraction_call( # Convert extraction to internal format with reconstructed findings new_findings = [] - for i, summary in enumerate(extracted.new_finding_summaries): + for i, summary_obj in enumerate(extracted.new_finding_summaries): new_findings.append( - create_finding_from_summary(summary, i, id_prefix="FR") + create_finding_from_summary( + summary=summary_obj.description, + index=i, + id_prefix="FR", + severity_override=summary_obj.severity, + file=summary_obj.file, + line=summary_obj.line, + ) ) # Build finding_resolutions from extraction data for _apply_ai_resolutions diff --git a/apps/backend/runners/github/services/parallel_followup_reviewer.py b/apps/backend/runners/github/services/parallel_followup_reviewer.py index 14820916ba..74c9ece545 100644 --- a/apps/backend/runners/github/services/parallel_followup_reviewer.py +++ b/apps/backend/runners/github/services/parallel_followup_reviewer.py @@ -1129,7 +1129,8 @@ async def _attempt_extraction_call( """Attempt a short SDK call with a minimal schema to recover review data. This is the Tier 2 recovery step when full structured output validation fails. - Uses FollowupExtractionResponse (~6 flat fields) which has near-100% success rate. + Uses FollowupExtractionResponse (small schema with ExtractedFindingSummary nesting) + which has near-100% success rate. Returns parsed result dict on success, None on failure. """ @@ -1146,7 +1147,8 @@ async def _attempt_extraction_call( extraction_prompt = ( "Extract the key review data from the following AI analysis output. " "Return the verdict, reasoning, resolved finding IDs, unresolved finding IDs, " - "one-line summaries of any new findings, and counts of confirmed/dismissed findings.\n\n" + "structured summaries of any new findings (including severity, description, file path, and line number), " + "and counts of confirmed/dismissed findings.\n\n" f"--- AI ANALYSIS OUTPUT ---\n{text[:8000]}\n--- END ---" ) @@ -1205,10 +1207,17 @@ async def _attempt_extraction_call( findings = [] new_finding_ids = [] - # 1. Convert new_finding_summaries to minimal PRReviewFinding objects - # Uses shared helper for "SEVERITY: description" parsing and ID generation - for i, summary in enumerate(extracted.new_finding_summaries): - finding = create_finding_from_summary(summary, i, id_prefix="FU") + # 1. Convert new_finding_summaries to PRReviewFinding objects + # ExtractedFindingSummary objects carry file/line from extraction + for i, summary_obj in enumerate(extracted.new_finding_summaries): + finding = create_finding_from_summary( + summary=summary_obj.description, + index=i, + id_prefix="FU", + severity_override=summary_obj.severity, + file=summary_obj.file, + line=summary_obj.line, + ) new_finding_ids.append(finding.id) findings.append(finding) diff --git a/apps/backend/runners/github/services/pydantic_models.py b/apps/backend/runners/github/services/pydantic_models.py index b3f75f08bd..ad697d8c05 100644 --- a/apps/backend/runners/github/services/pydantic_models.py +++ b/apps/backend/runners/github/services/pydantic_models.py @@ -533,10 +533,26 @@ class FindingValidationResponse(BaseModel): # ============================================================================= +class ExtractedFindingSummary(BaseModel): + """Per-finding summary with file location for extraction recovery.""" + + severity: str = Field(description="Severity level: LOW, MEDIUM, HIGH, or CRITICAL") + description: str = Field(description="One-line description of the finding") + file: str = Field( + default="unknown", description="File path where the issue was found" + ) + line: int = Field(default=0, description="Line number in the file (0 if unknown)") + + @field_validator("severity", mode="before") + @classmethod + def _normalize_severity(cls, v: str) -> str: + return _normalize_severity(v) + + class FollowupExtractionResponse(BaseModel): """Minimal extraction schema for recovering data when full structured output fails. - Deliberately kept small (~6 fields, no nesting) for near-100% validation success. + Uses ExtractedFindingSummary for new findings to preserve file/line information. Used as an intermediate recovery step before falling back to raw text parsing. """ @@ -552,9 +568,9 @@ class FollowupExtractionResponse(BaseModel): default_factory=list, description="IDs of previous findings that remain unresolved", ) - new_finding_summaries: list[str] = Field( + new_finding_summaries: list[ExtractedFindingSummary] = Field( default_factory=list, - description="One-line summary of each new finding (e.g. 'HIGH: cleanup deletes QA-rejected specs in batch_commands.py')", + description="Structured summary of each new finding with file location", ) confirmed_finding_count: int = Field( 0, description="Number of findings confirmed as valid" diff --git a/apps/backend/runners/github/services/recovery_utils.py b/apps/backend/runners/github/services/recovery_utils.py index 643817cdeb..b560e3e7c1 100644 --- a/apps/backend/runners/github/services/recovery_utils.py +++ b/apps/backend/runners/github/services/recovery_utils.py @@ -80,6 +80,9 @@ def create_finding_from_summary( summary: str, index: int, id_prefix: str = "FR", + severity_override: str | None = None, + file: str = "unknown", + line: int = 0, ) -> PRReviewFinding: """Create a PRReviewFinding from an extraction summary string. @@ -90,11 +93,20 @@ def create_finding_from_summary( summary: Raw summary string, e.g. "HIGH: Missing null check in parser.py" index: The index of the finding in the extraction list. id_prefix: ID prefix for traceability. Default "FR" (Followup Recovery). + severity_override: If provided, use this severity instead of parsing from summary. + file: File path where the issue was found (default "unknown"). + line: Line number in the file (default 0). Returns: A PRReviewFinding with parsed severity, generated ID, and description. """ severity, description = parse_severity_from_summary(summary) + + # Use severity_override if provided + if severity_override is not None: + severity_map = {k.rstrip(":"): v for k, v in _EXTRACTION_SEVERITY_MAP} + severity = severity_map.get(severity_override.upper(), severity) + finding_id = generate_recovery_finding_id(index, description, prefix=id_prefix) return PRReviewFinding( @@ -103,6 +115,6 @@ def create_finding_from_summary( category=ReviewCategory.QUALITY, title=description[:80], description=f"[Recovered via extraction] {description}", - file="unknown", - line=0, + file=file, + line=line, ) diff --git a/tests/test_structured_output_recovery.py b/tests/test_structured_output_recovery.py index bc02e32c34..08970e640d 100644 --- a/tests/test_structured_output_recovery.py +++ b/tests/test_structured_output_recovery.py @@ -18,17 +18,22 @@ # services/ package at both apps/backend/services/ and runners/github/services/. # To avoid collision, add the github services dir directly and import bare module names. _backend_dir = Path(__file__).parent.parent / "apps" / "backend" -_github_services_dir = _backend_dir / "runners" / "github" / "services" +_github_runner_dir = _backend_dir / "runners" / "github" +_github_services_dir = _github_runner_dir / "services" if str(_backend_dir) not in sys.path: sys.path.insert(0, str(_backend_dir)) +if str(_github_runner_dir) not in sys.path: + sys.path.insert(0, str(_github_runner_dir)) if str(_github_services_dir) not in sys.path: sys.path.insert(0, str(_github_services_dir)) from agents.tools_pkg.models import AGENT_CONFIGS from pydantic_models import ( + ExtractedFindingSummary, FollowupExtractionResponse, ParallelFollowupResponse, ) +from recovery_utils import create_finding_from_summary from sdk_utils import RECOVERABLE_ERRORS @@ -53,20 +58,38 @@ def test_minimal_valid_response(self): assert resp.dismissed_finding_count == 0 def test_full_valid_response(self): - """Accepts fully populated response.""" + """Accepts fully populated response with ExtractedFindingSummary objects.""" resp = FollowupExtractionResponse( verdict="READY_TO_MERGE", verdict_reasoning="All findings resolved", resolved_finding_ids=["NCR-001", "NCR-002"], unresolved_finding_ids=[], - new_finding_summaries=["HIGH: potential cleanup issue in batch_commands.py"], + new_finding_summaries=[ + ExtractedFindingSummary( + severity="HIGH", + description="potential cleanup issue in batch_commands.py", + file="apps/backend/cli/batch_commands.py", + line=42, + ) + ], confirmed_finding_count=1, dismissed_finding_count=1, ) assert len(resp.resolved_finding_ids) == 2 assert len(resp.new_finding_summaries) == 1 + assert resp.new_finding_summaries[0].file == "apps/backend/cli/batch_commands.py" + assert resp.new_finding_summaries[0].line == 42 assert resp.confirmed_finding_count == 1 + def test_finding_summary_defaults(self): + """ExtractedFindingSummary defaults file='unknown' and line=0.""" + summary = ExtractedFindingSummary( + severity="MEDIUM", + description="Some issue without location", + ) + assert summary.file == "unknown" + assert summary.line == 0 + def test_schema_is_small(self): """Schema should be significantly smaller than ParallelFollowupResponse.""" extraction_schema = json.dumps( @@ -75,10 +98,11 @@ def test_schema_is_small(self): followup_schema = json.dumps( ParallelFollowupResponse.model_json_schema() ) - # Extraction schema should be less than half the size of the full schema - assert len(extraction_schema) < len(followup_schema) / 2, ( + # Actual ratio is ~50.7% after adding ExtractedFindingSummary nesting. + # Threshold at 55% gives headroom while still guarding against schema bloat. + assert len(extraction_schema) < len(followup_schema) * 0.55, ( f"Extraction schema ({len(extraction_schema)} chars) should be " - f"less than half of full schema ({len(followup_schema)} chars)" + f"less than 55% of full schema ({len(followup_schema)} chars)" ) def test_all_verdict_values_accepted(self): @@ -143,3 +167,81 @@ def test_extraction_agent_low_thinking(self): """Extraction agent should use low thinking (lightweight call).""" config = AGENT_CONFIGS["pr_followup_extraction"] assert config["thinking_default"] == "low" + + +# ============================================================================ +# Test create_finding_from_summary with file/line params +# ============================================================================ + + +class TestCreateFindingFromSummary: + """Tests for create_finding_from_summary with file/line support.""" + + def test_backward_compatible_defaults(self): + """Calling without file/line still produces file='unknown', line=0.""" + finding = create_finding_from_summary("HIGH: some issue", 0) + assert finding.file == "unknown" + assert finding.line == 0 + assert finding.severity.value == "high" + + def test_file_and_line_passed_through(self): + """File and line params are used in the resulting finding.""" + finding = create_finding_from_summary( + summary="Missing null check", + index=0, + file="src/parser.py", + line=42, + ) + assert finding.file == "src/parser.py" + assert finding.line == 42 + + def test_severity_override(self): + """severity_override takes precedence over parsed severity.""" + finding = create_finding_from_summary( + summary="HIGH: some issue", + index=0, + severity_override="CRITICAL", + ) + assert finding.severity.value == "critical" + + def test_severity_override_case_insensitive(self): + """severity_override works regardless of case.""" + finding = create_finding_from_summary( + summary="some issue", + index=0, + severity_override="high", + ) + assert finding.severity.value == "high" + + def test_severity_override_invalid_falls_back(self): + """Invalid severity_override falls back to parsed severity.""" + finding = create_finding_from_summary( + summary="LOW: minor issue", + index=0, + severity_override="UNKNOWN", + ) + # Falls back to parsed "LOW" from summary + assert finding.severity.value == "low" + + def test_id_prefix(self): + """Custom id_prefix is used in the finding ID.""" + finding = create_finding_from_summary( + summary="some issue", index=0, id_prefix="FU" + ) + assert finding.id.startswith("FU-") + + def test_all_params_together(self): + """All new params work together correctly.""" + finding = create_finding_from_summary( + summary="Regex issue in subtask title truncation", + index=3, + id_prefix="FU", + severity_override="MEDIUM", + file="apps/backend/agents/planner.py", + line=187, + ) + assert finding.id.startswith("FU-") + assert finding.severity.value == "medium" + assert finding.file == "apps/backend/agents/planner.py" + assert finding.line == 187 + assert "Regex issue" in finding.title