From 55baf7e8c0cb8406c251ea731e1dfc781ffc7197 Mon Sep 17 00:00:00 2001 From: Abdullah Khan Date: Mon, 15 Jun 2026 17:00:44 -0400 Subject: [PATCH 1/5] feat(scm-multi-platform-detection): Extending functionality with capped content reads --- .../github/multi_platform_detection.py | 254 +++++++------ .../github/test_multi_platform_detection.py | 334 ++++++++++++++---- 2 files changed, 411 insertions(+), 177 deletions(-) diff --git a/src/sentry/integrations/github/multi_platform_detection.py b/src/sentry/integrations/github/multi_platform_detection.py index 2981568f02b3..37257e478734 100644 --- a/src/sentry/integrations/github/multi_platform_detection.py +++ b/src/sentry/integrations/github/multi_platform_detection.py @@ -1,11 +1,16 @@ from __future__ import annotations +import re import time from collections import defaultdict from typing import TYPE_CHECKING, Any, TypedDict import sentry_sdk +from sentry.integrations.github.platform_detection import ( + _get_repo_file_content, + _parse_package_manifest, +) from sentry.integrations.github.platform_registry import ( _FRAMEWORKS_BY_PLATFORM, _NON_SELECTABLE_PLATFORMS, @@ -38,9 +43,6 @@ from sentry.integrations.github.platform_registry import ( _PackageManifest as _PackageManifest, ) -from sentry.integrations.github.platform_registry import ( - _rule_matches as _rule_matches, -) if TYPE_CHECKING: from sentry.integrations.github.client import GitHubBaseClient @@ -54,6 +56,11 @@ # languages_count / k_reads_needed metrics. MAX_LANGUAGES = 3 +# Maximum number of per-file REST reads performed. +# Sized at p99 of k_candidate from the measurement run (≈5 reads covers the +# vast majority of repos while keeping the per-detection API footprint small). +MAX_CONTENT_READS = 5 + # Sort key weight for confidence tier: high > medium > low. # Ensures a framework match (high) always ranks above a bare-language fallback # (medium) regardless of byte count. @@ -288,11 +295,12 @@ class MultiDetectionResult(TypedDict): """ platforms: list[DetectedPlatform] - k_candidate: int # how many content-reads would be needed to resolve content/package rules - needed_paths: set[str] # the actual filenames (measurement scaffolding) + k_candidate: int # distinct content/package paths the full rule set would need + k_reads_realized: ( + int # files actually fetched in the content pass (capped at MAX_CONTENT_READS) + ) tree_entry_count: int # total entries returned by GitHub is_truncated: bool # GitHub truncated the tree at 100k entries / 7MB - full_repo_size_bytes: int # sum of ALL blob sizes including vendored/build dirs def _collect_needed_paths( @@ -346,16 +354,36 @@ def _rule_parent_dirs( rule: DetectorRule, files_full_paths_by_basename: dict[str, set[str]], dirs_full_paths_by_basename: dict[str, set[str]], -) -> set[str] | None: - """Collect parent directories where this existence rule is satisfiable. + content_by_path: dict[str, str], + manifests_by_path: dict[str, _PackageManifest], +) -> set[str]: + """Collect parent directories where this rule is satisfiable. - Returns None when the rule needs content or package data (match_content / - match_package) — those rules cannot be evaluated in the existence pass. - Returns an empty set when the rule is an existence rule but no matching - file or directory is present in the tree. + Works for all rule types. Pass empty dicts for ``content_by_path`` and + ``manifests_by_path`` in the existence pass; content/package rules will + then return an empty set (don't fire) without any special-casing. """ - if "match_package" in rule or "match_content" in rule: - return None + if "match_package" in rule: + return { + _parent_dir(path) + for path, manifest in manifests_by_path.items() + if _package_in_manifest(rule["match_package"], manifest) + } + + if "match_content" in rule: + pattern = rule["match_content"] + path_filter = rule.get("path") + ext_filter = rule.get("match_ext") + result: set[str] = set() + for full_path, content in content_by_path.items(): + basename = full_path.rsplit("/", 1)[-1] + if path_filter and basename != path_filter: + continue + if ext_filter and not basename.endswith(ext_filter): + continue + if re.search(pattern, content, re.IGNORECASE): + result.add(_parent_dir(full_path)) + return result if "match_dir" in rule: dirname = rule["match_dir"] @@ -383,56 +411,25 @@ def _rule_parent_dirs( return {_parent_dir(p) for p in files_full_paths_by_basename.get(path, set())} -def _rule_existence_fires_in_scope( - rule: DetectorRule, - scope: str, - files_full_paths_by_basename: dict[str, set[str]], - dirs_full_paths_by_basename: dict[str, set[str]], -) -> bool: - """Return True if an existence rule fires within the given scope. - - A file or directory is "in scope" when its immediate parent directory - equals ``scope`` ('' = repo root). - Rules requiring content or package data always return False. - """ - if "match_package" in rule or "match_content" in rule: - return False - if "match_dir" in rule: - dirname = rule["match_dir"] - if dirname.startswith("."): - for bn, paths in dirs_full_paths_by_basename.items(): - if bn.endswith(dirname) and any(_parent_dir(p) == scope for p in paths): - return True - return False - return any(_parent_dir(p) == scope for p in dirs_full_paths_by_basename.get(dirname, set())) - if "match_ext" in rule: - ext = rule["match_ext"] - for bn, paths in files_full_paths_by_basename.items(): - if bn.endswith(ext) and any(_parent_dir(p) == scope for p in paths): - return True - return False - path = rule.get("path") - if path is None: - return False - return any(_parent_dir(p) == scope for p in files_full_paths_by_basename.get(path, set())) - - def _framework_matches_scoped( fw: FrameworkDef, files_full_paths_by_basename: dict[str, set[str]], dirs_full_paths_by_basename: dict[str, set[str]], + content_by_path: dict[str, str], + manifests_by_path: dict[str, _PackageManifest], ) -> bool: - """Co-location-aware existence-only framework matcher for the multi detector. - - For ``some``-only frameworks any single self-contained signal anywhere in - the tree is sufficient — no co-location constraint needed since a single - rule is already self-contained. - - For frameworks with ``every`` rules all rules must be satisfiable within - the same parent directory, preventing stray files in unrelated subtrees - from causing false positives (e.g. a deployment ``.csproj`` in - ``tools/deploy/`` combined with an unrelated ``appsettings.json`` in - ``backend/`` must not fire ``dotnet-aspnetcore``). + """Co-location-aware framework matcher for the multi detector. + + Pass empty dicts for ``content_by_path`` and ``manifests_by_path`` in the + existence pass; content/package rules will return empty scope sets and + therefore not fire, with no special-casing needed. + + For ``some``-only frameworks any single signal anywhere in the tree is + sufficient. For frameworks with ``every`` rules all conditions must be + satisfiable within the same parent directory, preventing stray files in + unrelated subtrees from causing false positives (e.g. a deployment + ``.csproj`` in ``tools/deploy/`` combined with an unrelated + ``appsettings.json`` in ``backend/`` must not fire ``dotnet-aspnetcore``). If ``some`` rules are also present they are additionally required to fire within at least one scope where all ``every`` rules are satisfied. """ @@ -442,23 +439,22 @@ def _framework_matches_scoped( if not every and not some: return False + def scopes_for(rule: DetectorRule) -> set[str]: + return _rule_parent_dirs( + rule, + files_full_paths_by_basename, + dirs_full_paths_by_basename, + content_by_path, + manifests_by_path, + ) + if not every: - # some-only: any single rule firing anywhere is sufficient. - # _rule_matches with empty file_contents and no manifest evaluates - # existence-only signals — identical to the flat single-detector pass. - files_basenames = set(files_full_paths_by_basename.keys()) - dirs_basenames = set(dirs_full_paths_by_basename.keys()) - return any(_rule_matches(rule, files_basenames, {}, None, dirs_basenames) for rule in some) + return any(scopes_for(rule) for rule in some) # Collect the intersection of parent-dir scopes across all every rules. scopes: set[str] | None = None for rule in every: - rule_scopes = _rule_parent_dirs( - rule, files_full_paths_by_basename, dirs_full_paths_by_basename - ) - if rule_scopes is None: - # Rule needs content/package data → cannot fire in existence pass. - return False + rule_scopes = scopes_for(rule) scopes = set(rule_scopes) if scopes is None else scopes & rule_scopes if not scopes: return False @@ -471,13 +467,7 @@ def _framework_matches_scoped( # every+some: at least one some rule must fire within a scope where all # every rules are already satisfied. - return any( - _rule_existence_fires_in_scope( - rule, scope, files_full_paths_by_basename, dirs_full_paths_by_basename - ) - for scope in scopes - for rule in some - ) + return any(scope in scopes_for(rule) for scope in scopes for rule in some) def detect_platforms_multi( @@ -487,15 +477,21 @@ def detect_platforms_multi( ) -> MultiDetectionResult: """Detect Sentry platforms for a GitHub repository. - Selects up to MAX_LANGUAGES base platforms by byte count, fetches the - full recursive git tree once, and evaluates existence rules (path / - match_dir / match_ext) across all paths — subdir-aware with no extra API - calls. Content/package rules (match_content / match_package) are not - evaluated here; the paths they would need are counted as k_reads_needed - without fetching them. - - The return value feeds the Mode A harness and (eventually) the live - detection endpoint. + Selects up to MAX_LANGUAGES base platforms by byte count, fetches the full + recursive git tree once, then runs two high-confidence passes: + + - Pass 1 (existence): evaluates path/match_dir/match_ext rules with no + extra API calls, co-location-aware so stray files in unrelated subtrees + don't produce false positives. + - Pass 2 (content reads): fetches up to MAX_CONTENT_READS files and + evaluates match_content/match_package rules within the same per-file + scope. + + Supersession runs after both high-confidence passes so that a framework + detected via content reads (e.g. react-native from package.json) can + supersede an existence match (e.g. apple-ios from *.xcodeproj) before the + bare-language fallback is emitted. Bare-language medium fallbacks fill in + any base platform not claimed by either high pass. """ start_time = time.monotonic() @@ -508,23 +504,20 @@ def detect_platforms_multi( results: list[DetectedPlatform] = [] seen_platforms: set[str] = set() - # Pass 1: framework (existence) matches across all base-platform buckets. - # Running this before the medium fallback pass ensures a framework match - # from any bucket (e.g. Swift firing apple-ios high) claims the platform - # id before the bare-language fallback for a differently-keyed bucket - # (e.g. Objective-C → base "apple-ios" medium) can occupy it first. - # Co-location enforcement: every-rules must all fire within the same parent - # directory so stray files in unrelated subtrees don't produce false hits. + # Pass 1: existence-only high matches (no API reads beyond the tree). + # Co-location enforcement ensures every-rules all fire within the same + # parent directory so stray files in unrelated subtrees don't match. for base_platform, lang_entries in active_platforms.items(): - # Use the dominant language as the label, but sum all bytes in the - # bucket so the sort reflects the platform's true weight (e.g. TS + - # JS combined, not just the larger of the two). language = max(lang_entries, key=lambda x: x[1])[0] byte_count = sum(b for _, b in lang_entries) for fw in _FRAMEWORKS_BY_PLATFORM.get(base_platform, []): if _framework_matches_scoped( - fw, index.files_full_paths_by_basename, index.dirs_full_paths_by_basename + fw, + index.files_full_paths_by_basename, + index.dirs_full_paths_by_basename, + {}, + {}, ): platform_id = fw["platform"] if platform_id not in seen_platforms: @@ -539,7 +532,60 @@ def detect_platforms_multi( ) ) - # Pass 2: bare-language fallbacks for base platforms not resolved above. + # Pass 2 (content reads): fetch up to MAX_CONTENT_READS files and evaluate + # match_content / match_package rules within their per-file scope. + needed_paths = _collect_needed_paths(active_platforms, index.files_full_paths_by_basename) + # Sort shallowest paths first so root manifests (package.json, Gemfile, go.mod, …) + # are always within the cap before subdirectory files from monorepo workspaces. + capped_paths = sorted(needed_paths, key=lambda p: (p.count("/"), p))[:MAX_CONTENT_READS] + + content_by_path: dict[str, str] = {} + for path in capped_paths: + content = _get_repo_file_content(client, repo, path, ref) + if content is not None: + content_by_path[path] = content + + manifests_by_path: dict[str, _PackageManifest] = {} + for path, content in content_by_path.items(): + basename = path.rsplit("/", 1)[-1] + manifest = _parse_package_manifest(content, basename) + if manifest is not None: + manifests_by_path[path] = manifest + + for base_platform, lang_entries in active_platforms.items(): + language = max(lang_entries, key=lambda x: x[1])[0] + byte_count = sum(b for _, b in lang_entries) + + for fw in _FRAMEWORKS_BY_PLATFORM.get(base_platform, []): + platform_id = fw["platform"] + if platform_id in seen_platforms: + continue + if _framework_matches_scoped( + fw, + index.files_full_paths_by_basename, + index.dirs_full_paths_by_basename, + content_by_path, + manifests_by_path, + ): + seen_platforms.add(platform_id) + results.append( + DetectedPlatform( + platform=platform_id, + language=language, + bytes=byte_count, + confidence="high", + priority=100 - fw["sort"], + ) + ) + + # Supersession runs after both high-confidence passes so a framework + # detected via content reads can supersede an existence match. + results = _apply_supersession(results) + + # Pass 3: bare-language medium fallbacks for base platforms not resolved + # by either high pass. Uses seen_platforms (not results) so supersession + # above doesn't accidentally re-open a slot for a base platform whose + # framework was just superseded by a higher-specificity content match. for base_platform, lang_entries in active_platforms.items(): if base_platform not in seen_platforms: language = max(lang_entries, key=lambda x: x[1])[0] @@ -555,14 +601,13 @@ def detect_platforms_multi( ) ) - results = _apply_supersession(results) results = [r for r in results if r["platform"] not in _NON_SELECTABLE_PLATFORMS] results.sort( key=lambda r: (_CONFIDENCE_ORDER[r["confidence"]], r["bytes"], r["priority"]), reverse=True, ) - needed_paths = _collect_needed_paths(active_platforms, index.files_full_paths_by_basename) + k_reads_realized = len(content_by_path) sentry_sdk.metrics.distribution( f"{_MULTI_METRICS_PREFIX}.duration", @@ -590,6 +635,10 @@ def detect_platforms_multi( f"{_MULTI_METRICS_PREFIX}.k_reads_needed", len(needed_paths), ) + sentry_sdk.metrics.distribution( + f"{_MULTI_METRICS_PREFIX}.k_reads_realized", + k_reads_realized, + ) sentry_sdk.metrics.count( f"{_MULTI_METRICS_PREFIX}.completed", 1, @@ -603,8 +652,7 @@ def detect_platforms_multi( return MultiDetectionResult( platforms=results, k_candidate=len(needed_paths), - needed_paths=needed_paths, + k_reads_realized=k_reads_realized, tree_entry_count=len(entries), is_truncated=is_truncated, - full_repo_size_bytes=index.full_repo_size_bytes, ) diff --git a/tests/sentry/integrations/github/test_multi_platform_detection.py b/tests/sentry/integrations/github/test_multi_platform_detection.py index 6185577a8247..3fd82df850c4 100644 --- a/tests/sentry/integrations/github/test_multi_platform_detection.py +++ b/tests/sentry/integrations/github/test_multi_platform_detection.py @@ -1,17 +1,23 @@ from __future__ import annotations +from base64 import b64encode from typing import Any +from unittest import mock from sentry.integrations.github.multi_platform_detection import ( _build_tree_index, + _collect_needed_paths, _framework_matches_scoped, - _rule_existence_fires_in_scope, _rule_parent_dirs, + detect_platforms_multi, ) from sentry.integrations.github.platform_registry import ( DetectorRule, FrameworkDef, + _PackageManifest, ) +from sentry.shared_integrations.exceptions import ApiError +from sentry.utils import json class TestBuildTreeIndex: @@ -82,21 +88,21 @@ def test_root_level_entries_indexed(self) -> None: class TestRuleParentDirs: def test_path_rule_returns_parent_dir(self) -> None: files = {"next.config.js": {"fe/next.config.js"}} - result = _rule_parent_dirs({"path": "next.config.js"}, files, {}) + result = _rule_parent_dirs({"path": "next.config.js"}, files, {}, {}, {}) assert result == {"fe"} def test_path_rule_at_root_returns_empty_string_scope(self) -> None: files = {"manage.py": {"manage.py"}} - result = _rule_parent_dirs({"path": "manage.py"}, files, {}) + result = _rule_parent_dirs({"path": "manage.py"}, files, {}, {}, {}) assert result == {""} def test_path_rule_multiple_occurrences_collects_all_parents(self) -> None: files = {"package.json": {"fe/package.json", "be/package.json"}} - result = _rule_parent_dirs({"path": "package.json"}, files, {}) + result = _rule_parent_dirs({"path": "package.json"}, files, {}, {}, {}) assert result == {"fe", "be"} def test_path_rule_absent_returns_empty_set(self) -> None: - result = _rule_parent_dirs({"path": "manage.py"}, {}, {}) + result = _rule_parent_dirs({"path": "manage.py"}, {}, {}, {}, {}) assert result == set() def test_match_ext_returns_union_of_parent_dirs(self) -> None: @@ -104,78 +110,66 @@ def test_match_ext_returns_union_of_parent_dirs(self) -> None: "myapp.csproj": {"apps/web/myapp.csproj"}, "lib.csproj": {"apps/lib/lib.csproj"}, } - result = _rule_parent_dirs({"match_ext": ".csproj"}, files, {}) + result = _rule_parent_dirs({"match_ext": ".csproj"}, files, {}, {}, {}) assert result == {"apps/web", "apps/lib"} def test_match_dir_returns_parent_dirs(self) -> None: dirs = {"Assets": {"Assets", "myproject/Assets"}} - result = _rule_parent_dirs({"match_dir": "Assets"}, {}, dirs) + result = _rule_parent_dirs({"match_dir": "Assets"}, {}, dirs, {}, {}) assert result == {"", "myproject"} def test_match_dir_dotted_name_uses_endswith(self) -> None: # .xcodeproj dirs are matched by endswith, not equality dirs = {"MyApp.xcodeproj": {"MyApp.xcodeproj"}} - result = _rule_parent_dirs({"match_dir": ".xcodeproj"}, {}, dirs) + result = _rule_parent_dirs({"match_dir": ".xcodeproj"}, {}, dirs, {}, {}) assert result == {""} - def test_match_content_returns_none(self) -> None: - # match_content rules are deferred to Tier 2 — existence pass cannot evaluate them - files = {"requirements.txt": {"requirements.txt"}} + def test_match_content_empty_in_existence_pass(self) -> None: + # empty content maps → content rule returns empty set (doesn't fire) rule: DetectorRule = {"path": "requirements.txt", "match_content": r"django"} - assert _rule_parent_dirs(rule, files, {}) is None - - def test_match_package_returns_none(self) -> None: - # match_package rules need a parsed manifest — deferred to Tier 2 - assert _rule_parent_dirs({"match_package": "next"}, {}, {}) is None - - -class TestRuleExistenceFiresInScope: - def test_path_rule_matching_scope(self) -> None: - files = {"appsettings.json": {"backend/appsettings.json"}} - assert ( - _rule_existence_fires_in_scope({"path": "appsettings.json"}, "backend", files, {}) - is True - ) - - def test_path_rule_wrong_scope(self) -> None: - files = {"appsettings.json": {"backend/appsettings.json"}} - assert ( - _rule_existence_fires_in_scope({"path": "appsettings.json"}, "frontend", files, {}) - is False - ) - - def test_path_rule_root_scope(self) -> None: - files = {"manage.py": {"manage.py"}} - assert _rule_existence_fires_in_scope({"path": "manage.py"}, "", files, {}) is True + assert _rule_parent_dirs(rule, {}, {}, {}, {}) == set() - def test_match_ext_in_scope(self) -> None: - files = {"myapp.csproj": {"apps/web/myapp.csproj"}} - assert ( - _rule_existence_fires_in_scope({"match_ext": ".csproj"}, "apps/web", files, {}) is True - ) + def test_match_package_empty_in_existence_pass(self) -> None: + # empty manifest map → package rule returns empty set (doesn't fire) + assert _rule_parent_dirs({"match_package": "next"}, {}, {}, {}, {}) == set() - def test_match_ext_outside_scope(self) -> None: - files = {"myapp.csproj": {"apps/web/myapp.csproj"}} - # "apps" is the grandparent, not the immediate parent - assert _rule_existence_fires_in_scope({"match_ext": ".csproj"}, "apps", files, {}) is False + def test_match_content_with_content_returns_parent_dir(self) -> None: + rule: DetectorRule = {"path": "requirements.txt", "match_content": r"(?i)\bdjango\b"} + content = {"requirements.txt": "Django==4.2\n"} + result = _rule_parent_dirs(rule, {}, {}, content, {}) + assert result == {""} - def test_match_dir_in_scope(self) -> None: - dirs = {"Assets": {"myproject/Assets"}} - assert ( - _rule_existence_fires_in_scope({"match_dir": "Assets"}, "myproject", {}, dirs) is True - ) + def test_match_content_no_match_returns_empty_set(self) -> None: + rule: DetectorRule = {"path": "requirements.txt", "match_content": r"(?i)\bdjango\b"} + content = {"requirements.txt": "flask==3.0\n"} + assert _rule_parent_dirs(rule, {}, {}, content, {}) == set() - def test_match_dir_outside_scope(self) -> None: - dirs = {"Assets": {"myproject/Assets"}} - assert _rule_existence_fires_in_scope({"match_dir": "Assets"}, "", {}, dirs) is False + def test_match_package_with_manifest_returns_parent_dir(self) -> None: + manifest = _PackageManifest(dependencies={"next", "react"}, dev_dependencies=set()) + manifests = {"fe/package.json": manifest} + result = _rule_parent_dirs({"match_package": "next"}, {}, {}, {}, manifests) + assert result == {"fe"} - def test_match_content_always_false(self) -> None: - files = {"requirements.txt": {"requirements.txt"}} - rule: DetectorRule = {"path": "requirements.txt", "match_content": r"django"} - assert _rule_existence_fires_in_scope(rule, "", files, {}) is False + def test_match_content_with_match_ext_filters_by_extension(self) -> None: + rule: DetectorRule = {"match_ext": ".csproj", "match_content": r"Microsoft\.Maui"} + content = { + "myapp.csproj": "... None: - assert _rule_existence_fires_in_scope({"match_package": "next"}, "", {}, {}) is False + def test_match_content_no_path_or_ext_filter_scans_all_files(self) -> None: + # A bare match_content rule (no path/match_ext) should match any fetched file + # whose content satisfies the pattern and collect all their parent dirs. + rule: DetectorRule = {"match_content": r"SECRET"} + content = { + "root_file.txt": "SECRET=abc", + "sub/nested.txt": "SECRET=xyz", + "other.txt": "nothing here", + } + result = _rule_parent_dirs(rule, {}, {}, content, {}) + assert result == {"", "sub"} class TestFrameworkMatchesScoped: @@ -186,7 +180,9 @@ def test_some_only_path_matches(self) -> None: "base_platform": "godot", "some": [{"path": "project.godot"}], } - assert _framework_matches_scoped(fw, {"project.godot": {"project.godot"}}, {}) is True + assert ( + _framework_matches_scoped(fw, {"project.godot": {"project.godot"}}, {}, {}, {}) is True + ) def test_some_only_path_absent(self) -> None: fw: FrameworkDef = { @@ -195,10 +191,10 @@ def test_some_only_path_absent(self) -> None: "base_platform": "godot", "some": [{"path": "project.godot"}], } - assert _framework_matches_scoped(fw, {}, {}) is False + assert _framework_matches_scoped(fw, {}, {}, {}, {}) is False - def test_some_only_match_package_false_without_manifest(self) -> None: - # match_package rules need a parsed manifest — always False in existence pass + def test_some_only_match_package_false_in_existence_pass(self) -> None: + # empty manifest maps → package rule doesn't fire in existence pass fw: FrameworkDef = { "platform": "javascript-nextjs", "sort": 1, @@ -206,7 +202,20 @@ def test_some_only_match_package_false_without_manifest(self) -> None: "some": [{"match_package": "next"}], "supersedes": ["javascript-react"], } - assert _framework_matches_scoped(fw, {"package.json": {"package.json"}}, {}) is False + assert ( + _framework_matches_scoped(fw, {"package.json": {"package.json"}}, {}, {}, {}) is False + ) + + def test_some_only_match_package_matches_with_manifest(self) -> None: + # populated manifest → package rule fires in content pass + fw: FrameworkDef = { + "platform": "javascript-nextjs", + "sort": 1, + "base_platform": "javascript", + "some": [{"match_package": "next"}], + } + manifest = _PackageManifest(dependencies={"next"}, dev_dependencies=set()) + assert _framework_matches_scoped(fw, {}, {}, {}, {"package.json": manifest}) is True def test_every_only_files_in_same_parent_scope(self) -> None: # dotnet-aspnetcore: .csproj + appsettings.json co-located → match @@ -220,7 +229,7 @@ def test_every_only_files_in_same_parent_scope(self) -> None: "myapp.csproj": {"apps/web/myapp.csproj"}, "appsettings.json": {"apps/web/appsettings.json"}, } - assert _framework_matches_scoped(fw, files, {}) is True + assert _framework_matches_scoped(fw, files, {}, {}, {}) is True def test_every_only_stray_files_in_different_scopes_no_match(self) -> None: # The blind spot fixed by co-location: deploy.csproj + unrelated appsettings.json @@ -234,7 +243,7 @@ def test_every_only_stray_files_in_different_scopes_no_match(self) -> None: "deploy.csproj": {"tools/deploy/deploy.csproj"}, "appsettings.json": {"backend/appsettings.json"}, } - assert _framework_matches_scoped(fw, files, {}) is False + assert _framework_matches_scoped(fw, files, {}, {}, {}) is False def test_every_match_dir_same_scope(self) -> None: # unity: Assets/ + ProjectSettings/ at the same level → match @@ -245,7 +254,7 @@ def test_every_match_dir_same_scope(self) -> None: "every": [{"match_dir": "Assets"}, {"match_dir": "ProjectSettings"}], } dirs = {"Assets": {"Assets"}, "ProjectSettings": {"ProjectSettings"}} - assert _framework_matches_scoped(fw, {}, dirs) is True + assert _framework_matches_scoped(fw, {}, dirs, {}, {}) is True def test_every_match_dir_different_scopes_no_match(self) -> None: # Assets at root, ProjectSettings inside backend/ — not a Unity project @@ -256,17 +265,30 @@ def test_every_match_dir_different_scopes_no_match(self) -> None: "every": [{"match_dir": "Assets"}, {"match_dir": "ProjectSettings"}], } dirs = {"Assets": {"Assets"}, "ProjectSettings": {"backend/ProjectSettings"}} - assert _framework_matches_scoped(fw, {}, dirs) is False + assert _framework_matches_scoped(fw, {}, dirs, {}, {}) is False + + def test_every_with_match_content_false_in_existence_pass(self) -> None: + # empty content maps → content rule doesn't fire; framework doesn't match yet + fw: FrameworkDef = { + "platform": "dotnet-maui", + "sort": 10, + "base_platform": "dotnet", + "every": [{"match_ext": ".csproj", "match_content": r"Microsoft\.Maui"}], + } + assert ( + _framework_matches_scoped(fw, {"myapp.csproj": {"myapp.csproj"}}, {}, {}, {}) is False + ) - def test_every_with_match_content_deferred_returns_false(self) -> None: - # dotnet-maui requires content inside the .csproj — deferred to Tier 2 + def test_every_with_match_content_matches_with_content(self) -> None: + # populated content → content rule fires; framework matches fw: FrameworkDef = { "platform": "dotnet-maui", "sort": 10, "base_platform": "dotnet", "every": [{"match_ext": ".csproj", "match_content": r"Microsoft\.Maui"}], } - assert _framework_matches_scoped(fw, {"myapp.csproj": {"myapp.csproj"}}, {}) is False + content = {"apps/myapp.csproj": ""} + assert _framework_matches_scoped(fw, {}, {}, content, {}) is True def test_every_and_some_both_fire_in_same_scope(self) -> None: # every: match_dir "app" at root; some: path "build.gradle" at root → match @@ -279,7 +301,7 @@ def test_every_and_some_both_fire_in_same_scope(self) -> None: } files = {"build.gradle": {"build.gradle"}} # parent "" dirs = {"app": {"app"}} # parent "" - assert _framework_matches_scoped(fw, files, dirs) is True + assert _framework_matches_scoped(fw, files, dirs, {}, {}) is True def test_every_and_some_some_only_outside_every_scope(self) -> None: # every scope is "" (app/ at root), some rule only fires in "other/" — no match @@ -292,8 +314,172 @@ def test_every_and_some_some_only_outside_every_scope(self) -> None: } files = {"build.gradle": {"other/build.gradle"}} # parent "other" dirs = {"app": {"app"}} # parent "" - assert _framework_matches_scoped(fw, files, dirs) is False + assert _framework_matches_scoped(fw, files, dirs, {}, {}) is False + + def test_every_and_some_some_is_content_rule_matches_in_scope(self) -> None: + # every: match_dir "src" at root; some: match_content in requirements.txt at root + fw: FrameworkDef = { + "platform": "hypothetical", + "sort": 10, + "base_platform": "python", + "every": [{"match_dir": "src"}], + "some": [{"path": "requirements.txt", "match_content": r"(?i)\bdjango\b"}], + } + dirs = {"src": {"src"}} + content = {"requirements.txt": "Django==4.2\n"} + assert _framework_matches_scoped(fw, {}, dirs, content, {}) is True def test_empty_every_and_some_returns_false(self) -> None: fw: FrameworkDef = {"platform": "empty", "sort": 10, "base_platform": "python"} - assert _framework_matches_scoped(fw, {}, {}) is False + assert _framework_matches_scoped(fw, {}, {}, {}, {}) is False + + +class TestCollectNeededPaths: + def test_includes_all_package_manifest_paths(self) -> None: + # Both root and subdir package.json should be included for match_package rules. + active = {"javascript": [("JavaScript", 100)]} + files = {"package.json": {"package.json", "fe/package.json"}} + result = _collect_needed_paths(active, files) + assert result == {"package.json", "fe/package.json"} + + def test_includes_match_content_target_path(self) -> None: + # python-django has a match_content rule targeting requirements.txt by path. + active = {"python": [("Python", 80000)]} + files = {"requirements.txt": {"requirements.txt"}} + result = _collect_needed_paths(active, files) + assert "requirements.txt" in result + + def test_includes_match_ext_content_files(self) -> None: + # dotnet-maui has match_ext=".csproj" + match_content -> all .csproj paths included. + active = {"dotnet": [("C#", 50000)]} + files = { + "myapp.csproj": {"apps/web/myapp.csproj"}, + "lib.csproj": {"lib/lib.csproj"}, + } + result = _collect_needed_paths(active, files) + assert "apps/web/myapp.csproj" in result + assert "lib/lib.csproj" in result + + def test_excludes_absent_manifest(self) -> None: + # No package.json in tree -> nothing to read for match_package rules. + active = {"javascript": [("JavaScript", 100)]} + files = {"index.js": {"index.js"}} + result = _collect_needed_paths(active, files) + assert "package.json" not in result + assert "index.js" not in result + + +def _make_tree_entry(path: str, entry_type: str = "blob") -> dict[str, Any]: + return {"path": path, "type": entry_type, "size": 100} + + +def _make_client( + languages: dict[str, int], + tree: list[dict[str, Any]], + contents: dict[str, str], + truncated: bool = False, +) -> mock.MagicMock: + """Return a fake GitHubBaseClient that serves a fixed tree and content map.""" + client = mock.MagicMock() + client.get_languages.return_value = languages + + def get_side_effect(path: str, params: dict | None = None) -> Any: + if "/git/trees/" in path: + return {"tree": tree, "truncated": truncated} + # contents endpoint: /repos/{owner/repo}/contents/{rel_path} + rel = path.split("/contents/", 1)[1] + if rel in contents: + return {"content": b64encode(contents[rel].encode()).decode()} + raise ApiError("Not Found", code=404) + + client.get.side_effect = get_side_effect + return client + + +class TestDetectPlatformsMulti: + def test_content_match_detected_high(self) -> None: + # requirements.txt only (no manage.py) -> python-django only fires via content read. + tree = [_make_tree_entry("requirements.txt")] + client = _make_client( + languages={"Python": 80000}, + tree=tree, + contents={"requirements.txt": "Django==4.2\n"}, + ) + result = detect_platforms_multi(client, "owner/repo") + platforms = {p["platform"]: p for p in result["platforms"]} + assert "python-django" in platforms + assert platforms["python-django"]["confidence"] == "high" + assert "python" in platforms + assert platforms["python"]["confidence"] == "medium" + + def test_package_match_detected_high(self) -> None: + # javascript-react has only a match_package rule — no existence trigger. + pkg = json.dumps({"dependencies": {"react": "18.0.0"}}) + tree = [_make_tree_entry("package.json")] + client = _make_client( + languages={"JavaScript": 60000}, + tree=tree, + contents={"package.json": pkg}, + ) + result = detect_platforms_multi(client, "owner/repo") + platforms = {p["platform"] for p in result["platforms"]} + assert "javascript-react" in platforms + + def test_content_read_cap_and_shallow_first(self) -> None: + # Root package.json (deps: next) + 6 deeper workspace package.json files. + # Cap is 5; root must be read so javascript-nextjs is detected. + # The alphabetically-last deep manifest (packages/f/package.json) must not be fetched. + deep_names = [f"packages/{c}/package.json" for c in "abcdef"] # 6 paths + tree = [_make_tree_entry("package.json")] + [_make_tree_entry(p) for p in deep_names] + root_pkg = json.dumps({"dependencies": {"next": "14.0.0"}}) + contents = {"package.json": root_pkg} + for p in deep_names: + contents[p] = json.dumps({"dependencies": {}}) + + client = _make_client( + languages={"JavaScript": 100000}, + tree=tree, + contents=contents, + ) + result = detect_platforms_multi(client, "owner/repo") + + fetched = [ + call.args[0].split("/contents/", 1)[1] + for call in client.get.call_args_list + if "/contents/" in call.args[0] + ] + assert "package.json" in fetched + assert "packages/f/package.json" not in fetched + + platforms = {p["platform"] for p in result["platforms"]} + assert "javascript-nextjs" in platforms + + def test_content_driven_supersession(self) -> None: + # package.json declares both react and react-native; react-native supersedes react. + pkg = json.dumps({"dependencies": {"react": "18.0.0", "react-native": "0.73.0"}}) + tree = [_make_tree_entry("package.json")] + client = _make_client( + languages={"JavaScript": 70000}, + tree=tree, + contents={"package.json": pkg}, + ) + result = detect_platforms_multi(client, "owner/repo") + platforms = {p["platform"] for p in result["platforms"]} + assert "react-native" in platforms + assert "javascript-react" not in platforms + + def test_no_content_reads_when_no_candidates(self) -> None: + # Only main.py in tree — no manifest file, no match_content target file present. + tree = [_make_tree_entry("main.py")] + client = _make_client( + languages={"Python": 40000}, + tree=tree, + contents={}, + ) + result = detect_platforms_multi(client, "owner/repo") + + platforms = {p["platform"] for p in result["platforms"]} + assert platforms == {"python"} + # No /contents/ call should have been issued + contents_calls = [c for c in client.get.call_args_list if "/contents/" in c.args[0]] + assert contents_calls == [] From 409d28da19a4c29000725ce46e219756627aa04e Mon Sep 17 00:00:00 2001 From: Abdullah Khan Date: Tue, 16 Jun 2026 13:42:06 -0400 Subject: [PATCH 2/5] feat(scm-multi-platform-detection): Cleaning up --- .../integrations/github/multi_platform_detection.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/sentry/integrations/github/multi_platform_detection.py b/src/sentry/integrations/github/multi_platform_detection.py index 37257e478734..610c494208ac 100644 --- a/src/sentry/integrations/github/multi_platform_detection.py +++ b/src/sentry/integrations/github/multi_platform_detection.py @@ -359,9 +359,7 @@ def _rule_parent_dirs( ) -> set[str]: """Collect parent directories where this rule is satisfiable. - Works for all rule types. Pass empty dicts for ``content_by_path`` and - ``manifests_by_path`` in the existence pass; content/package rules will - then return an empty set (don't fire) without any special-casing. + Works for all rule types i.e. existence-only, content, and package. """ if "match_package" in rule: return { @@ -420,10 +418,6 @@ def _framework_matches_scoped( ) -> bool: """Co-location-aware framework matcher for the multi detector. - Pass empty dicts for ``content_by_path`` and ``manifests_by_path`` in the - existence pass; content/package rules will return empty scope sets and - therefore not fire, with no special-casing needed. - For ``some``-only frameworks any single signal anywhere in the tree is sufficient. For frameworks with ``every`` rules all conditions must be satisfiable within the same parent directory, preventing stray files in From 99fa8d0653df1acc2d6b5da862bdc1192a4ff9fa Mon Sep 17 00:00:00 2001 From: Abdullah Khan Date: Tue, 16 Jun 2026 13:48:25 -0400 Subject: [PATCH 3/5] feat(scm-multi-platform-detection): Removing case insensitivity --- .../integrations/github/multi_platform_detection.py | 4 +++- .../github/test_multi_platform_detection.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/sentry/integrations/github/multi_platform_detection.py b/src/sentry/integrations/github/multi_platform_detection.py index 610c494208ac..bb33cbd544f8 100644 --- a/src/sentry/integrations/github/multi_platform_detection.py +++ b/src/sentry/integrations/github/multi_platform_detection.py @@ -379,7 +379,9 @@ def _rule_parent_dirs( continue if ext_filter and not basename.endswith(ext_filter): continue - if re.search(pattern, content, re.IGNORECASE): + # Match case-sensitively to mirror the registry's _rule_matches; + # patterns that want case-insensitivity embed an inline (?i) flag. + if re.search(pattern, content): result.add(_parent_dir(full_path)) return result diff --git a/tests/sentry/integrations/github/test_multi_platform_detection.py b/tests/sentry/integrations/github/test_multi_platform_detection.py index 3fd82df850c4..c098fe2fb0d3 100644 --- a/tests/sentry/integrations/github/test_multi_platform_detection.py +++ b/tests/sentry/integrations/github/test_multi_platform_detection.py @@ -159,6 +159,19 @@ def test_match_content_with_match_ext_filters_by_extension(self) -> None: result = _rule_parent_dirs(rule, {}, {}, content, {}) assert result == {""} + def test_match_content_is_case_sensitive(self) -> None: + # Mirror the registry's case-sensitive re.search: a case-sensitive pattern + # (no inline (?i)) must NOT match differently-cased content. + rule: DetectorRule = {"match_ext": ".csproj", "match_content": r"Microsoft\.Maui"} + content = {"myapp.csproj": "..."} # lowercase — must not fire + assert _rule_parent_dirs(rule, {}, {}, content, {}) == set() + + def test_match_content_honors_inline_ignorecase_flag(self) -> None: + # Patterns that want case-insensitivity embed (?i) themselves. + rule: DetectorRule = {"path": "requirements.txt", "match_content": r"(?i)\bdjango\b"} + content = {"requirements.txt": "DJANGO==4.2\n"} + assert _rule_parent_dirs(rule, {}, {}, content, {}) == {""} + def test_match_content_no_path_or_ext_filter_scans_all_files(self) -> None: # A bare match_content rule (no path/match_ext) should match any fetched file # whose content satisfies the pattern and collect all their parent dirs. From 9d6c7213df11efa46f2d863e0968c9e630a18967 Mon Sep 17 00:00:00 2001 From: Abdullah Khan Date: Wed, 17 Jun 2026 15:06:10 -0400 Subject: [PATCH 4/5] feat(scm-multi-platform-detection): Cleaning up --- .../integrations/github/multi_platform_detection.py | 11 ----------- .../github/test_multi_platform_detection.py | 7 +++---- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/src/sentry/integrations/github/multi_platform_detection.py b/src/sentry/integrations/github/multi_platform_detection.py index bb33cbd544f8..f93dc25da31f 100644 --- a/src/sentry/integrations/github/multi_platform_detection.py +++ b/src/sentry/integrations/github/multi_platform_detection.py @@ -223,7 +223,6 @@ class _TreeIndex: def __init__( self, - dirs: set[str], files_full_paths_by_basename: dict[str, set[str]], dirs_full_paths_by_basename: dict[str, set[str]], full_repo_size_bytes: int, @@ -231,18 +230,11 @@ def __init__( # basename → set of all non-ignored full file paths with that name. self.files_full_paths_by_basename = files_full_paths_by_basename # basename → set of all non-ignored full directory paths with that name. - # Kept alongside the fast-lookup basename set below. self.dirs_full_paths_by_basename = dirs_full_paths_by_basename - # Fast basename-only sets for O(1) existence checks. - self.dirs = dirs # Sum of ALL blobs including vendored/build dirs — the true tarball # weight. self.full_repo_size_bytes = full_repo_size_bytes - @property - def files(self) -> set[str]: - return set(self.files_full_paths_by_basename.keys()) - def _build_tree_index(entries: list[dict[str, Any]]) -> _TreeIndex: """Build a searchable index from raw git tree entries. @@ -252,7 +244,6 @@ def _build_tree_index(entries: list[dict[str, Any]]) -> _TreeIndex: ``node_modules/some-lib/package.json`` never contributes a false signal. ``full_repo_size_bytes`` is the sum of ``size`` across all blobs. """ - dirs: set[str] = set() files_full_paths_by_basename: dict[str, set[str]] = defaultdict(set) dirs_full_paths_by_basename: dict[str, set[str]] = defaultdict(set) full_repo_size_bytes = 0 @@ -273,11 +264,9 @@ def _build_tree_index(entries: list[dict[str, Any]]) -> _TreeIndex: if entry_type == "blob": files_full_paths_by_basename[basename].add(path) elif entry_type == "tree": - dirs.add(basename) dirs_full_paths_by_basename[basename].add(path) return _TreeIndex( - dirs=dirs, files_full_paths_by_basename=dict(files_full_paths_by_basename), dirs_full_paths_by_basename=dict(dirs_full_paths_by_basename), full_repo_size_bytes=full_repo_size_bytes, diff --git a/tests/sentry/integrations/github/test_multi_platform_detection.py b/tests/sentry/integrations/github/test_multi_platform_detection.py index c098fe2fb0d3..bb8e24e27f94 100644 --- a/tests/sentry/integrations/github/test_multi_platform_detection.py +++ b/tests/sentry/integrations/github/test_multi_platform_detection.py @@ -39,8 +39,6 @@ def test_dirs_indexed_by_basename_with_full_paths(self) -> None: ] index = _build_tree_index(entries) assert index.dirs_full_paths_by_basename["Assets"] == {"Assets", "myproject/Assets"} - # Fast basename set also populated - assert "Assets" in index.dirs def test_full_repo_size_bytes_includes_ignored_blobs(self) -> None: entries = [ @@ -67,13 +65,14 @@ def test_ignored_paths_excluded_from_dir_index(self) -> None: index = _build_tree_index(entries) assert len(index.dirs_full_paths_by_basename) == 0 - def test_files_property_returns_basenames(self) -> None: + def test_files_indexed_by_basename_across_subdirs(self) -> None: entries = [ {"path": "fe/next.config.js", "type": "blob", "size": 100}, {"path": "be/manage.py", "type": "blob", "size": 200}, ] index = _build_tree_index(entries) - assert index.files == {"next.config.js", "manage.py"} + assert index.files_full_paths_by_basename["next.config.js"] == {"fe/next.config.js"} + assert index.files_full_paths_by_basename["manage.py"] == {"be/manage.py"} def test_root_level_entries_indexed(self) -> None: entries: list[dict[str, Any]] = [ From 9fd3345bf694383c68e2da75a093aec126ebe9a4 Mon Sep 17 00:00:00 2001 From: Abdullah Khan <60121741+Abdkhan14@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:33:00 -0400 Subject: [PATCH 5/5] feat(scm-multi-platform-detection): Extending test coverage (#117861) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds unit and integration tests for the previously untested parts of `multi_platform_detection.py`, and cleans up minor quality issues in the existing test suite. New test coverage: `TestSelectActivePlatforms` — `MAX_LANGUAGES` cap, language grouping (TypeScript + JavaScript → one javascript slot), `IGNORED_LANGUAGES` skipped, byte-count descending ordering `TestPathIsIgnored` — segment-vs-substring correctness (e.g. build.gradle is not ignored even though build/ is an ignored directory) `TestGetTree` — defensive parsing of non-dict responses, missing tree key, truncated flag propagation TestDetectPlatformsMulti (extended) — existence-only Pass 1 high match with zero content reads, co-location false-positive prevention end-to-end, high confidence sorts before medium regardless of byte count Test quality cleanup: --- .../github/test_multi_platform_detection.py | 201 +++++++++++++++++- 1 file changed, 190 insertions(+), 11 deletions(-) diff --git a/tests/sentry/integrations/github/test_multi_platform_detection.py b/tests/sentry/integrations/github/test_multi_platform_detection.py index bb8e24e27f94..a90bbc416cde 100644 --- a/tests/sentry/integrations/github/test_multi_platform_detection.py +++ b/tests/sentry/integrations/github/test_multi_platform_detection.py @@ -1,17 +1,24 @@ from __future__ import annotations +import string from base64 import b64encode from typing import Any from unittest import mock from sentry.integrations.github.multi_platform_detection import ( + MAX_CONTENT_READS, + MAX_LANGUAGES, _build_tree_index, _collect_needed_paths, _framework_matches_scoped, + _get_tree, + _path_is_ignored, _rule_parent_dirs, + _select_active_platforms, detect_platforms_multi, ) from sentry.integrations.github.platform_registry import ( + GITHUB_LANGUAGE_TO_SENTRY_PLATFORM, DetectorRule, FrameworkDef, _PackageManifest, @@ -20,6 +27,25 @@ from sentry.utils import json +def _distinct_platform_languages(n: int) -> list[str]: + """Return n languages that each map to a different Sentry base platform. + + Iterates GITHUB_LANGUAGE_TO_SENTRY_PLATFORM in insertion order, picking + the first language seen for each new base platform, until n entries are + collected. Useful for building test inputs that exercise the MAX_LANGUAGES + cap without hardcoding specific language names. + """ + seen: set[str] = set() + result: list[str] = [] + for lang, bp in GITHUB_LANGUAGE_TO_SENTRY_PLATFORM.items(): + if bp not in seen: + seen.add(bp) + result.append(lang) + if len(result) == n: + break + return result + + class TestBuildTreeIndex: def test_files_indexed_by_basename_with_full_paths(self) -> None: entries = [ @@ -165,12 +191,6 @@ def test_match_content_is_case_sensitive(self) -> None: content = {"myapp.csproj": "..."} # lowercase — must not fire assert _rule_parent_dirs(rule, {}, {}, content, {}) == set() - def test_match_content_honors_inline_ignorecase_flag(self) -> None: - # Patterns that want case-insensitivity embed (?i) themselves. - rule: DetectorRule = {"path": "requirements.txt", "match_content": r"(?i)\bdjango\b"} - content = {"requirements.txt": "DJANGO==4.2\n"} - assert _rule_parent_dirs(rule, {}, {}, content, {}) == {""} - def test_match_content_no_path_or_ext_filter_scans_all_files(self) -> None: # A bare match_content rule (no path/match_ext) should match any fetched file # whose content satisfies the pattern and collect all their parent dirs. @@ -438,10 +458,10 @@ def test_package_match_detected_high(self) -> None: assert "javascript-react" in platforms def test_content_read_cap_and_shallow_first(self) -> None: - # Root package.json (deps: next) + 6 deeper workspace package.json files. - # Cap is 5; root must be read so javascript-nextjs is detected. - # The alphabetically-last deep manifest (packages/f/package.json) must not be fetched. - deep_names = [f"packages/{c}/package.json" for c in "abcdef"] # 6 paths + # Root package.json (deps: next) + MAX_CONTENT_READS deeper workspace package.json files. + # Root takes the first cap slot; the alphabetically-last deep manifest must not be fetched. + deep_letters = string.ascii_lowercase[:MAX_CONTENT_READS] + deep_names = [f"packages/{c}/package.json" for c in deep_letters] tree = [_make_tree_entry("package.json")] + [_make_tree_entry(p) for p in deep_names] root_pkg = json.dumps({"dependencies": {"next": "14.0.0"}}) contents = {"package.json": root_pkg} @@ -460,8 +480,9 @@ def test_content_read_cap_and_shallow_first(self) -> None: for call in client.get.call_args_list if "/contents/" in call.args[0] ] + last_deep = f"packages/{deep_letters[-1]}/package.json" assert "package.json" in fetched - assert "packages/f/package.json" not in fetched + assert last_deep not in fetched platforms = {p["platform"] for p in result["platforms"]} assert "javascript-nextjs" in platforms @@ -495,3 +516,161 @@ def test_no_content_reads_when_no_candidates(self) -> None: # No /contents/ call should have been issued contents_calls = [c for c in client.get.call_args_list if "/contents/" in c.args[0]] assert contents_calls == [] + + def test_existence_only_pass1_high_match_no_content_reads(self) -> None: + # manage.py is a pure path rule for python-django (no match_content required). + # Pass 1 should fire it as high-confidence with zero /contents/ calls. + tree = [_make_tree_entry("manage.py")] + client = _make_client( + languages={"Python": 80000}, + tree=tree, + contents={}, + ) + result = detect_platforms_multi(client, "owner/repo") + platforms = {p["platform"]: p for p in result["platforms"]} + assert "python-django" in platforms + assert platforms["python-django"]["confidence"] == "high" + contents_calls = [c for c in client.get.call_args_list if "/contents/" in c.args[0]] + assert contents_calls == [] + + def test_colocation_prevents_false_positive_end_to_end(self) -> None: + # dotnet-aspnetcore requires .csproj AND appsettings.json in the same directory. + # Placing them in separate subtrees must NOT produce a high match. + tree = [ + _make_tree_entry("tools/deploy/deploy.csproj"), + _make_tree_entry("backend/appsettings.json"), + ] + client = _make_client( + languages={"C#": 50000}, + tree=tree, + contents={}, + ) + result = detect_platforms_multi(client, "owner/repo") + platforms = {p["platform"] for p in result["platforms"]} + assert "dotnet-aspnetcore" not in platforms + + def test_confidence_ordering_high_before_medium(self) -> None: + # A high-confidence framework match must always rank above a medium + # bare-language fallback, even if the medium entry has more bytes. + # Use a Python repo with manage.py (pure existence → high) so no content + # reads are issued, then verify result ordering. + tree = [_make_tree_entry("manage.py")] + client = _make_client( + languages={"Python": 80000}, + tree=tree, + contents={}, + ) + result = detect_platforms_multi(client, "owner/repo") + # First entry must be the high-confidence framework, not the medium fallback. + assert result["platforms"][0]["confidence"] == "high" + assert result["platforms"][0]["platform"] == "python-django" + + +class TestSelectActivePlatforms: + def test_max_languages_cap_keeps_top_n(self) -> None: + # Feed MAX_LANGUAGES + 1 distinct base platforms; only the top MAX_LANGUAGES survive. + candidates = _distinct_platform_languages(MAX_LANGUAGES + 1) + languages = {lang: 100_000 - i * 10_000 for i, lang in enumerate(candidates)} + result = _select_active_platforms(languages) + assert len(result) == MAX_LANGUAGES + dropped_platform = GITHUB_LANGUAGE_TO_SENTRY_PLATFORM[candidates[-1]] + assert dropped_platform not in result + + def test_related_languages_group_into_single_bucket(self) -> None: + # TypeScript and JavaScript both map to "javascript"; they share one slot. + languages = { + "TypeScript": 70_000, + "JavaScript": 50_000, + } + result = _select_active_platforms(languages) + assert list(result.keys()) == ["javascript"] + # Both language entries should appear in the bucket. + bucket = result["javascript"] + lang_names = {lang for lang, _ in bucket} + assert lang_names == {"TypeScript", "JavaScript"} + + def test_grouping_does_not_consume_extra_cap_slot(self) -> None: + # TypeScript + JavaScript + Python + Ruby = 3 distinct base platforms, not 4. + # All three base platforms should be present despite 4 input languages. + languages = { + "TypeScript": 80_000, + "JavaScript": 70_000, + "Python": 60_000, + "Ruby": 50_000, + } + result = _select_active_platforms(languages) + assert "javascript" in result + assert "python" in result + assert "ruby" in result + assert len(result) == 3 + + def test_ignored_language_skipped(self) -> None: + # "Shell" is in IGNORED_LANGUAGES and must never appear. + languages = {"Shell": 999_999, "Python": 10_000} + result = _select_active_platforms(languages) + assert "python" in result + # Shell has no mapped base platform so it won't appear under any key. + for lang_entries in result.values(): + for lang, _ in lang_entries: + assert lang != "Shell" + + def test_byte_count_descending_ordering(self) -> None: + # The platform with the most bytes should appear first in iteration order. + languages = {"Ruby": 90_000, "Python": 120_000, "Go": 70_000} + result = _select_active_platforms(languages) + # dict preserves insertion order; first key is the top platform. + first_platform = next(iter(result)) + assert first_platform == "python" + + +class TestPathIsIgnored: + def test_node_modules_segment_ignored(self) -> None: + assert _path_is_ignored("node_modules/react/index.js") is True + + def test_nested_ignored_segment(self) -> None: + assert _path_is_ignored("a/b/vendor/c/util.py") is True + + def test_build_gradle_file_not_ignored(self) -> None: + # "build" is an ignored *directory* segment, but "build.gradle" is a filename, + # not the bare segment "build", so it must NOT be ignored. + assert _path_is_ignored("build.gradle") is False + + def test_clean_path_not_ignored(self) -> None: + assert _path_is_ignored("src/app/main.py") is False + + def test_root_level_file_not_ignored(self) -> None: + assert _path_is_ignored("manage.py") is False + + def test_dist_dir_ignored(self) -> None: + assert _path_is_ignored("dist/bundle.js") is True + + +class TestGetTree: + def test_normal_dict_response_returns_entries(self) -> None: + entries = [{"path": "manage.py", "type": "blob", "size": 100}] + client = mock.MagicMock() + client.get.return_value = {"tree": entries, "truncated": False} + result_entries, is_truncated = _get_tree(client, "owner/repo") + assert result_entries == entries + assert is_truncated is False + + def test_non_dict_response_returns_empty(self) -> None: + # GitHub occasionally returns a list or unexpected type on error. + client = mock.MagicMock() + client.get.return_value = [] + result_entries, is_truncated = _get_tree(client, "owner/repo") + assert result_entries == [] + assert is_truncated is False + + def test_missing_tree_key_returns_empty_entries(self) -> None: + client = mock.MagicMock() + client.get.return_value = {"truncated": False} + result_entries, is_truncated = _get_tree(client, "owner/repo") + assert result_entries == [] + assert is_truncated is False + + def test_truncated_flag_propagated(self) -> None: + client = mock.MagicMock() + client.get.return_value = {"tree": [], "truncated": True} + _, is_truncated = _get_tree(client, "owner/repo") + assert is_truncated is True