From 340f7fcab082704d09447bfe219b5523aa2e7a78 Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Thu, 18 Jun 2026 19:07:41 -0400 Subject: [PATCH 1/8] feat(seer): Shard night shift triage into per-chunk feature runs A night shift run dispatched all scored candidates to a single Seer feature run. Large candidate sets degrade a single triage agent (limited time and context), so split candidates into chunks of seer.night_shift.shard_size (default 5) and dispatch each chunk as its own feature run / SeerRun. Each shard is recorded as a new SeerNightShiftRunShard, making the run -> SeerRun relationship one-to-many. Result delivery resolves the run via a shard's SeerRun uuid, falling back to the legacy scalar seer_run FK for runs created before sharding. The legacy FK still points at the first shard during the transition; it is backfilled and dropped in follow-up PRs. Co-Authored-By: Claude Opus 4.8 --- migrations_lockfile.txt | 2 +- src/sentry/options/defaults.py | 6 ++ .../0019_add_night_shift_run_shard.py | 62 ++++++++++++ src/sentry/seer/models/night_shift.py | 20 ++++ src/sentry/seer/night_shift/delivery.py | 15 +-- src/sentry/tasks/seer/night_shift/cron.py | 99 +++++++++++-------- .../sentry/seer/night_shift/test_delivery.py | 39 +++++++- tests/sentry/tasks/seer/test_night_shift.py | 78 ++++++++++++++- 8 files changed, 273 insertions(+), 48 deletions(-) create mode 100644 src/sentry/seer/migrations/0019_add_night_shift_run_shard.py diff --git a/migrations_lockfile.txt b/migrations_lockfile.txt index 6afd74fd7992..9f3d5eddd92c 100644 --- a/migrations_lockfile.txt +++ b/migrations_lockfile.txt @@ -29,7 +29,7 @@ releases: 0004_cleanup_failed_safe_deletes replays: 0007_organizationmember_replay_access -seer: 0018_backfill_seer_agent_run_group_id +seer: 0019_add_night_shift_run_shard sentry: 1117_drop_organizationmapping_codecov_access_delete diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 6fe9fab392c6..05ce4a227c5d 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -1135,6 +1135,12 @@ default=10, flags=FLAG_AUTOMATOR_MODIFIABLE, ) +register( + "seer.night_shift.shard_size", + type=Int, + default=5, + flags=FLAG_AUTOMATOR_MODIFIABLE, +) # Per-org overrides for night shift run options. Keyed by stringified # organization id; each value is a partial set of run-option overrides (e.g. # {"max_candidates": 20}) that layer on top of the global defaults but below diff --git a/src/sentry/seer/migrations/0019_add_night_shift_run_shard.py b/src/sentry/seer/migrations/0019_add_night_shift_run_shard.py new file mode 100644 index 000000000000..a0bbbdf5eca3 --- /dev/null +++ b/src/sentry/seer/migrations/0019_add_night_shift_run_shard.py @@ -0,0 +1,62 @@ +# Generated by Django 5.2.14 on 2026-06-18 22:39 + +import django.db.models.deletion +import sentry.db.models.fields.bounded +import sentry.db.models.fields.foreignkey +from django.db import migrations, models + +from sentry.new_migrations.migrations import CheckedMigration + + +class Migration(CheckedMigration): + # This flag is used to mark that a migration shouldn't be automatically run in production. + # This should only be used for operations where it's safe to run the migration after your + # code has deployed. So this should not be used for most operations that alter the schema + # of a table. + # Here are some things that make sense to mark as post deployment: + # - Large data migrations. Typically we want these to be run manually so that they can be + # monitored and not block the deploy for a long period of time while they run. + # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to + # run this outside deployments so that we don't block them. Note that while adding an index + # is a schema change, it's completely safe to run the operation after the code has deployed. + # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment + + is_post_deployment = False + + dependencies = [ + ("seer", "0018_backfill_seer_agent_run_group_id"), + ] + + operations = [ + migrations.CreateModel( + name="SeerNightShiftRunShard", + fields=[ + ( + "id", + sentry.db.models.fields.bounded.BoundedBigAutoField( + primary_key=True, serialize=False + ), + ), + ("date_updated", models.DateTimeField(auto_now=True)), + ("date_added", models.DateTimeField(auto_now_add=True)), + ("extras", models.JSONField(db_default={}, default=dict)), + ( + "run", + sentry.db.models.fields.foreignkey.FlexibleForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="shards", + to="seer.seernightshiftrun", + ), + ), + ( + "seer_run", + sentry.db.models.fields.foreignkey.FlexibleForeignKey( + null=True, on_delete=django.db.models.deletion.SET_NULL, to="seer.seerrun" + ), + ), + ], + options={ + "db_table": "seer_nightshiftrunshard", + }, + ), + ] diff --git a/src/sentry/seer/models/night_shift.py b/src/sentry/seer/models/night_shift.py index bbf2babb5911..2503a697d311 100644 --- a/src/sentry/seer/models/night_shift.py +++ b/src/sentry/seer/models/night_shift.py @@ -63,3 +63,23 @@ class Meta: ] __repr__ = sane_repr("run_id", "kind", "group_id") + + +@cell_silo_model +class SeerNightShiftRunShard(DefaultFieldsModel): + """One triage shard of a night shift run, owning the SeerRun for its chunk + of candidates.""" + + __relocation_scope__ = RelocationScope.Excluded + + run = FlexibleForeignKey( + "seer.SeerNightShiftRun", on_delete=models.CASCADE, related_name="shards" + ) + seer_run = FlexibleForeignKey("seer.SeerRun", on_delete=models.SET_NULL, null=True) + extras = models.JSONField(db_default={}, default=dict) + + class Meta: + app_label = "seer" + db_table = "seer_nightshiftrunshard" + + __repr__ = sane_repr("run_id", "seer_run_id") diff --git a/src/sentry/seer/night_shift/delivery.py b/src/sentry/seer/night_shift/delivery.py index 9bb5235cb8fe..68221776ba84 100644 --- a/src/sentry/seer/night_shift/delivery.py +++ b/src/sentry/seer/night_shift/delivery.py @@ -7,6 +7,7 @@ from typing import Any import sentry_sdk +from django.db.models import Q from sentry.constants import SEER_AUTOMATED_RUN_STOPPING_POINT_DEFAULT, ObjectStatus from sentry.models.group import Group @@ -29,12 +30,14 @@ def deliver_night_shift_result( error: str | None, ) -> None: """Process a night_shift result from Seer.""" - try: - run = SeerNightShiftRun.objects.select_related("organization", "seer_run").get( - organization_id=organization_id, - seer_run__uuid=run_uuid, - ) - except SeerNightShiftRun.DoesNotExist: + run = ( + SeerNightShiftRun.objects.filter(organization_id=organization_id) + .filter(Q(shards__seer_run__uuid=run_uuid) | Q(seer_run__uuid=run_uuid)) + .select_related("organization") + .distinct() + .first() + ) + if run is None: logger.warning( "night_shift.delivery.missing_run", extra={"organization_id": organization_id, "run_uuid": run_uuid}, diff --git a/src/sentry/tasks/seer/night_shift/cron.py b/src/sentry/tasks/seer/night_shift/cron.py index d653dc8f0628..d558c3c69037 100644 --- a/src/sentry/tasks/seer/night_shift/cron.py +++ b/src/sentry/tasks/seer/night_shift/cron.py @@ -29,6 +29,7 @@ from sentry.seer.models.night_shift import ( SeerNightShiftRun, SeerNightShiftRunResult, + SeerNightShiftRunShard, ) from sentry.seer.models.project_repository import SeerProjectRepository from sentry.seer.models.run import SeerRun @@ -36,7 +37,11 @@ from sentry.seer.night_shift.models import NightShiftPayload, TriageCandidate, TriageTweaks from sentry.tasks.base import instrumented_task from sentry.tasks.seer.night_shift.models import TriageAction, TriageResult -from sentry.tasks.seer.night_shift.simple_triage import fixability_score_strategy, priority_label +from sentry.tasks.seer.night_shift.simple_triage import ( + ScoredCandidate, + fixability_score_strategy, + priority_label, +) from sentry.tasks.seer.night_shift.tweaks import ( DEFAULT_EXTRA_TRIAGE_INSTRUCTIONS, DEFAULT_INTELLIGENCE_LEVEL, @@ -478,24 +483,11 @@ def _get_eligible_projects( return eligible -def _dispatch_to_seer_feature( - run: SeerNightShiftRun, - organization: Organization, - eligible: Sequence[EligibleProject], +def _build_triage_payload( + candidates: Sequence[ScoredCandidate], resolved_options: SeerNightShiftRunOptions, - log_extra: dict[str, object], - start_time: float, -) -> None: - """Hand triage off to Seer's feature-run endpoint. Seer runs the triage agent - and pushes verdicts back via deliver_feature_result, which marks skips and - triggers autofix (using dry_run from run.extras["options"]).""" - eligible_projects = [ep.project for ep in eligible] - scored = fixability_score_strategy(eligible_projects, resolved_options["max_candidates"]) - if not scored: - logger.info("night_shift.no_candidates", extra=log_extra) - return - - payload = NightShiftPayload( +) -> NightShiftPayload: + return NightShiftPayload( candidates=[ TriageCandidate( group_id=c.group.id, @@ -506,7 +498,7 @@ def _dispatch_to_seer_feature( first_seen=c.group.first_seen.isoformat(), priority=priority_label(c.group.priority), ) - for c in scored + for c in candidates ], tweaks=TriageTweaks( intelligence_level=resolved_options["intelligence_level"], @@ -514,6 +506,26 @@ def _dispatch_to_seer_feature( extra_triage_instructions=resolved_options["extra_triage_instructions"], ), ) + + +def _dispatch_to_seer_feature( + run: SeerNightShiftRun, + organization: Organization, + eligible: Sequence[EligibleProject], + resolved_options: SeerNightShiftRunOptions, + log_extra: dict[str, object], + start_time: float, +) -> None: + """Shard the scored candidates into chunks of seer.night_shift.shard_size and + dispatch each chunk as its own Seer feature run, recorded as a + SeerNightShiftRunShard. Seer pushes verdicts back per shard via + deliver_feature_result.""" + eligible_projects = [ep.project for ep in eligible] + scored = fixability_score_strategy(eligible_projects, resolved_options["max_candidates"]) + if not scored: + logger.info("night_shift.no_candidates", extra=log_extra) + return + try: client = SeerAgentClient(organization) except SeerPermissionError: @@ -521,26 +533,35 @@ def _dispatch_to_seer_feature( _record_run_error(run, "Organization does not have Seer access") return - def _link_run(created: SeerRun) -> None: - # Link inside the dispatch transaction so the row exists before the outbox - # drains and Seer's result correlates back to this night shift run. - run.update(seer_run=created) + shards = list(chunked(scored, options.get("seer.night_shift.shard_size"))) + dispatched = 0 + for shard_index, chunk in enumerate(shards): + payload = _build_triage_payload(chunk, resolved_options) - try: - seer_run = client.start_feature_run( - feature_id="night_shift", - payload=payload.dict(), - flush=False, - on_run_created=_link_run, - ) - except Exception: + def _link_shard(created: SeerRun, is_first: bool = shard_index == 0) -> None: + SeerNightShiftRunShard.objects.create(run=run, seer_run=created) + if is_first: + run.update(seer_run=created) + + try: + client.start_feature_run( + feature_id="night_shift", + payload=payload.dict(), + flush=False, + on_run_created=_link_shard, + ) + except Exception: + logger.exception( + "night_shift.shard_dispatch_failed", + extra={**log_extra, "shard_index": shard_index, "num_shards": len(shards)}, + ) + continue + dispatched += 1 + + if dispatched == 0: sentry_sdk.metrics.count("night_shift.run_error", 1) - _fail_run( - run, - message="Night shift dispatch failed", - event="night_shift.dispatch_failed", - extra=log_extra, - ) + _record_run_error(run, "Night shift dispatch failed") + logger.error("night_shift.dispatch_failed", extra={**log_extra, "num_shards": len(shards)}) return sentry_sdk.metrics.distribution("night_shift.org_run_duration", time.monotonic() - start_time) @@ -548,10 +569,10 @@ def _link_run(created: SeerRun) -> None: "night_shift.feature_dispatched", extra={ **log_extra, - "seer_run_id": seer_run.id, - "seer_run_uuid": str(seer_run.uuid), "num_eligible_projects": len(eligible_projects), "num_candidates": len(scored), + "num_shards": len(shards), + "num_shards_dispatched": dispatched, }, ) diff --git a/tests/sentry/seer/night_shift/test_delivery.py b/tests/sentry/seer/night_shift/test_delivery.py index 9c8306a70419..50fb51cedd58 100644 --- a/tests/sentry/seer/night_shift/test_delivery.py +++ b/tests/sentry/seer/night_shift/test_delivery.py @@ -3,7 +3,11 @@ from sentry.models.organization import Organization from sentry.seer.autofix.utils import AutofixStoppingPoint -from sentry.seer.models.night_shift import SeerNightShiftRun, SeerNightShiftRunResult +from sentry.seer.models.night_shift import ( + SeerNightShiftRun, + SeerNightShiftRunResult, + SeerNightShiftRunShard, +) from sentry.seer.night_shift.delivery import deliver_night_shift_result from sentry.tasks.seer.night_shift.models import TriageAction from sentry.tasks.seer.night_shift.skip_cache import key as skip_cache_key @@ -27,6 +31,39 @@ def _create_night_shift_run( extras=extras, ) + def test_correlates_via_shard_seer_run(self) -> None: + """Sharded runs carry no scalar seer_run; delivery resolves the run from + the shard's SeerRun uuid and processes that shard's verdicts.""" + org = self.create_organization() + project = self.create_project(organization=org) + group = self.create_group(project=project) + shard_seer_run = self.create_seer_run(organization=org) + run = SeerNightShiftRun.objects.create( + organization=org, seer_run=None, extras={"options": {}} + ) + SeerNightShiftRunShard.objects.create(run=run, seer_run=shard_seer_run) + + result = { + "verdicts": [ + {"group_id": group.id, "action": TriageAction.AUTOFIX.value, "reason": "ok"} + ] + } + with patch( + "sentry.tasks.seer.night_shift.cron.trigger_autofix_agent", return_value=42 + ) as mock_trigger: + deliver_night_shift_result( + organization_id=org.id, + run_uuid=str(shard_seer_run.uuid), + status="completed", + result=result, + error=None, + ) + + mock_trigger.assert_called_once() + results = list(SeerNightShiftRunResult.objects.filter(run=run)) + assert len(results) == 1 + assert results[0].group_id == group.id + def test_missing_run_logs_warning(self) -> None: """When run_uuid doesn't match any SeerNightShiftRun, log and return.""" org = self.create_organization() diff --git a/tests/sentry/tasks/seer/test_night_shift.py b/tests/sentry/tasks/seer/test_night_shift.py index efde513463c2..d2cf2e6162ab 100644 --- a/tests/sentry/tasks/seer/test_night_shift.py +++ b/tests/sentry/tasks/seer/test_night_shift.py @@ -5,7 +5,11 @@ from sentry.models.group import Group from sentry.models.organization import OrganizationStatus from sentry.seer.autofix.constants import AutofixAutomationTuningSettings -from sentry.seer.models.night_shift import SeerNightShiftRun, SeerNightShiftRunResult +from sentry.seer.models.night_shift import ( + SeerNightShiftRun, + SeerNightShiftRunResult, + SeerNightShiftRunShard, +) from sentry.seer.models.run import SeerRun, SeerRunMirrorStatus, SeerRunType from sentry.seer.models.workflow import SeerWorkflowStrategy from sentry.tasks.seer.night_shift.cron import ( @@ -532,6 +536,78 @@ def test_dispatches_candidates_to_seer_feature(self) -> None: # Verdicts and autofix are Seer's responsibility now; no result rows here. assert not SeerNightShiftRunResult.objects.filter(run=run).exists() + def test_shards_candidates_across_feature_runs(self) -> None: + org = self.create_organization() + project = self.create_project(organization=org) + self._make_eligible(project) + + groups = [ + self._store_event_and_update_group( + project, f"fixable-{i}", seer_fixability_score=0.9, times_seen=5 + i + ) + for i in range(3) + ] + + with ( + self.options({"seer.night_shift.shard_size": 2}), + self.feature("organizations:gen-ai-features"), + ): + run_night_shift_for_org(org.id) + + run = SeerNightShiftRun.objects.get(organization=org) + # 3 candidates, shard size 2 -> 2 shards (2 + 1). + shards = list(SeerNightShiftRunShard.objects.filter(run=run).order_by("id")) + assert len(shards) == 2 + assert SeerRun.objects.filter(organization=org, type=SeerRunType.FEATURE_RUN).count() == 2 + + shard_sizes = [] + dispatched_group_ids: list[int] = [] + for shard in shards: + outbox = CellOutbox.objects.get( + category=OutboxCategory.SEER_RUN_CREATE, object_identifier=shard.seer_run_id + ) + assert outbox.payload is not None + candidates = outbox.payload["body"]["payload"]["candidates"] + shard_sizes.append(len(candidates)) + dispatched_group_ids.extend(c["group_id"] for c in candidates) + + assert sorted(shard_sizes) == [1, 2] + assert sorted(dispatched_group_ids) == sorted(g.id for g in groups) + + assert run.seer_run_id == shards[0].seer_run_id + assert run.extras.get("error_message") is None + + def test_partial_shard_failure_still_dispatches(self) -> None: + org = self.create_organization() + project = self.create_project(organization=org) + self._make_eligible(project) + for i in range(2): + self._store_event_and_update_group( + project, f"fixable-{i}", seer_fixability_score=0.9, times_seen=5 + i + ) + + real_create = SeerNightShiftRunShard.objects.create + calls: list[int] = [] + + def flaky_create(*args, **kwargs): + calls.append(1) + if len(calls) == 2: + raise RuntimeError("boom") + return real_create(*args, **kwargs) + + with ( + self.options({"seer.night_shift.shard_size": 1}), + self.feature("organizations:gen-ai-features"), + patch.object(SeerNightShiftRunShard.objects, "create", side_effect=flaky_create), + ): + run_night_shift_for_org(org.id) + + run = SeerNightShiftRun.objects.get(organization=org) + assert run.extras.get("error_message") is None + assert SeerNightShiftRunShard.objects.filter(run=run).count() == 1 + assert SeerRun.objects.filter(organization=org, type=SeerRunType.FEATURE_RUN).count() == 1 + assert run.seer_run_id is not None + def test_no_candidates_skips_dispatch(self) -> None: org = self.create_organization() project = self.create_project(organization=org) From 291a074310c84f3783a19a4e5486e60424b3509a Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Thu, 18 Jun 2026 19:15:08 -0400 Subject: [PATCH 2/8] docs(seer): Make SeerNightShiftRunShard docstring workflow-agnostic The shard model is generic to any night shift workflow, not just triage. Co-Authored-By: Claude Opus 4.8 --- src/sentry/seer/models/night_shift.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/sentry/seer/models/night_shift.py b/src/sentry/seer/models/night_shift.py index 2503a697d311..566fecfc6b50 100644 --- a/src/sentry/seer/models/night_shift.py +++ b/src/sentry/seer/models/night_shift.py @@ -67,8 +67,9 @@ class Meta: @cell_silo_model class SeerNightShiftRunShard(DefaultFieldsModel): - """One triage shard of a night shift run, owning the SeerRun for its chunk - of candidates.""" + """One shard of a night shift run, owning the SeerRun for a single + dispatched Seer feature run. A run fans out its work into one or more shards + dispatched as independent feature runs.""" __relocation_scope__ = RelocationScope.Excluded From 293b572ab3abe9cb2e8c4424c6470192fbc88b04 Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Thu, 18 Jun 2026 19:30:38 -0400 Subject: [PATCH 3/8] ref(seer): Make shard->SeerRun link a OneToOneField Each SeerRun is dispatched for exactly one shard, so model the link as one-to-one to enforce the invariant at the DB level. Keep SET_NULL: the SeerRun is a mirror the shard references, not its owner, and gets TTL-cleaned. Co-Authored-By: Claude Opus 4.8 --- .../seer/migrations/0019_add_night_shift_run_shard.py | 9 ++++++--- src/sentry/seer/models/night_shift.py | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/sentry/seer/migrations/0019_add_night_shift_run_shard.py b/src/sentry/seer/migrations/0019_add_night_shift_run_shard.py index a0bbbdf5eca3..0521b7abc560 100644 --- a/src/sentry/seer/migrations/0019_add_night_shift_run_shard.py +++ b/src/sentry/seer/migrations/0019_add_night_shift_run_shard.py @@ -1,4 +1,4 @@ -# Generated by Django 5.2.14 on 2026-06-18 22:39 +# Generated by Django 5.2.14 on 2026-06-18 23:19 import django.db.models.deletion import sentry.db.models.fields.bounded @@ -50,8 +50,11 @@ class Migration(CheckedMigration): ), ( "seer_run", - sentry.db.models.fields.foreignkey.FlexibleForeignKey( - null=True, on_delete=django.db.models.deletion.SET_NULL, to="seer.seerrun" + models.OneToOneField( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="night_shift_shard", + to="seer.seerrun", ), ), ], diff --git a/src/sentry/seer/models/night_shift.py b/src/sentry/seer/models/night_shift.py index 566fecfc6b50..8f7f581d0f37 100644 --- a/src/sentry/seer/models/night_shift.py +++ b/src/sentry/seer/models/night_shift.py @@ -76,7 +76,9 @@ class SeerNightShiftRunShard(DefaultFieldsModel): run = FlexibleForeignKey( "seer.SeerNightShiftRun", on_delete=models.CASCADE, related_name="shards" ) - seer_run = FlexibleForeignKey("seer.SeerRun", on_delete=models.SET_NULL, null=True) + seer_run = models.OneToOneField( + "seer.SeerRun", on_delete=models.SET_NULL, null=True, related_name="night_shift_shard" + ) extras = models.JSONField(db_default={}, default=dict) class Meta: From b430f63f0d73b5c4e9263e6d51aad07d414f2e36 Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Thu, 18 Jun 2026 20:07:45 -0400 Subject: [PATCH 4/8] fix(seer): Record night shift delivery errors per shard Sharded runs share one SeerNightShiftRun, so writing per-delivery error_message to the run let one shard's success clear another shard's error, and pinned the legacy seer_run FK to shard_index 0 even if that chunk failed to dispatch. Record delivery errors on the shard, and point the legacy FK at the first successfully dispatched shard. Addresses Cursor review. Co-Authored-By: Claude Opus 4.8 --- src/sentry/seer/night_shift/delivery.py | 24 ++-- src/sentry/tasks/seer/night_shift/cron.py | 18 +-- .../sentry/seer/night_shift/test_delivery.py | 113 +++++++++++------- tests/sentry/tasks/seer/test_night_shift.py | 56 ++++++++- 4 files changed, 154 insertions(+), 57 deletions(-) diff --git a/src/sentry/seer/night_shift/delivery.py b/src/sentry/seer/night_shift/delivery.py index 68221776ba84..d57542f97265 100644 --- a/src/sentry/seer/night_shift/delivery.py +++ b/src/sentry/seer/night_shift/delivery.py @@ -14,7 +14,11 @@ from sentry.models.organization import Organization from sentry.seer.agent.types import FeatureRunStatus from sentry.seer.autofix.utils import AutofixStoppingPoint, bulk_read_preferences_from_sentry_db -from sentry.seer.models.night_shift import SeerNightShiftRun, SeerNightShiftRunResult +from sentry.seer.models.night_shift import ( + SeerNightShiftRun, + SeerNightShiftRunResult, + SeerNightShiftRunShard, +) from sentry.seer.night_shift.models import TriageResponse from sentry.tasks.seer.night_shift.models import TriageAction, TriageResult from sentry.tasks.seer.night_shift.skip_cache import mark_skipped @@ -44,8 +48,14 @@ def deliver_night_shift_result( ) return + # Per-delivery error_message lives on the shard so a sibling shard's success + # can't clear it; the run is the fallback only for pre-shard rows. + error_target: SeerNightShiftRun | SeerNightShiftRunShard = ( + run.shards.filter(seer_run__uuid=run_uuid).first() or run + ) + if error: - run.update(extras={**(run.extras or {}), "error_message": error}) + error_target.update(extras={**(error_target.extras or {}), "error_message": error}) log_extra: dict[str, object] = { "organization_id": run.organization_id, @@ -73,13 +83,11 @@ def deliver_night_shift_result( options = (run.extras or {}).get("options") or {} dry_run = bool(options.get("dry_run", False)) - # A failed dispatch may have left a stale error_message even though Seer went - # on to process the run and is now delivering verdicts. Clear it so the run's - # state reflects the successful delivery. - if (run.extras or {}).get("error_message"): - extras = {**run.extras} + # Clear any stale error_message now that this delivery has succeeded. + if (error_target.extras or {}).get("error_message"): + extras = {**error_target.extras} del extras["error_message"] - run.update(extras=extras) + error_target.update(extras=extras) _process_verdicts( run=run, diff --git a/src/sentry/tasks/seer/night_shift/cron.py b/src/sentry/tasks/seer/night_shift/cron.py index d558c3c69037..1d1418761c46 100644 --- a/src/sentry/tasks/seer/night_shift/cron.py +++ b/src/sentry/tasks/seer/night_shift/cron.py @@ -533,18 +533,16 @@ def _dispatch_to_seer_feature( _record_run_error(run, "Organization does not have Seer access") return + def _link_shard(created: SeerRun) -> None: + SeerNightShiftRunShard.objects.create(run=run, seer_run=created) + shards = list(chunked(scored, options.get("seer.night_shift.shard_size"))) + first_seer_run: SeerRun | None = None dispatched = 0 for shard_index, chunk in enumerate(shards): payload = _build_triage_payload(chunk, resolved_options) - - def _link_shard(created: SeerRun, is_first: bool = shard_index == 0) -> None: - SeerNightShiftRunShard.objects.create(run=run, seer_run=created) - if is_first: - run.update(seer_run=created) - try: - client.start_feature_run( + seer_run = client.start_feature_run( feature_id="night_shift", payload=payload.dict(), flush=False, @@ -556,6 +554,8 @@ def _link_shard(created: SeerRun, is_first: bool = shard_index == 0) -> None: extra={**log_extra, "shard_index": shard_index, "num_shards": len(shards)}, ) continue + if first_seer_run is None: + first_seer_run = seer_run dispatched += 1 if dispatched == 0: @@ -564,6 +564,10 @@ def _link_shard(created: SeerRun, is_first: bool = shard_index == 0) -> None: logger.error("night_shift.dispatch_failed", extra={**log_extra, "num_shards": len(shards)}) return + # Point the legacy scalar FK at the first dispatched shard for the transition. + if first_seer_run is not None: + run.update(seer_run=first_seer_run) + sentry_sdk.metrics.distribution("night_shift.org_run_duration", time.monotonic() - start_time) logger.info( "night_shift.feature_dispatched", diff --git a/tests/sentry/seer/night_shift/test_delivery.py b/tests/sentry/seer/night_shift/test_delivery.py index 50fb51cedd58..50ce5714bc22 100644 --- a/tests/sentry/seer/night_shift/test_delivery.py +++ b/tests/sentry/seer/night_shift/test_delivery.py @@ -21,27 +21,29 @@ class TestDeliverNightShiftResult(TestCase): def _create_night_shift_run( self, organization: Organization | None = None, **extras_overrides: Any ) -> SeerNightShiftRun: - """Create a SeerNightShiftRun with associated SeerRun.""" + """Create a sharded SeerNightShiftRun: one shard owning a SeerRun and no + legacy scalar seer_run (the steady state after migration).""" org = organization or self.create_organization() - seer_run = self.create_seer_run(organization=org) extras = {"options": {}, **extras_overrides} - return SeerNightShiftRun.objects.create( - organization=org, - seer_run=seer_run, - extras=extras, + run = SeerNightShiftRun.objects.create(organization=org, extras=extras) + SeerNightShiftRunShard.objects.create( + run=run, seer_run=self.create_seer_run(organization=org) ) + return run + + def _run_uuid(self, run: SeerNightShiftRun) -> str: + return str(run.shards.get().seer_run.uuid) - def test_correlates_via_shard_seer_run(self) -> None: - """Sharded runs carry no scalar seer_run; delivery resolves the run from - the shard's SeerRun uuid and processes that shard's verdicts.""" + def test_correlates_via_legacy_seer_run_fallback(self) -> None: + """Pre-shard runs have only the scalar seer_run FK and no shard rows; + delivery still resolves them through the fallback branch.""" org = self.create_organization() project = self.create_project(organization=org) group = self.create_group(project=project) - shard_seer_run = self.create_seer_run(organization=org) + seer_run = self.create_seer_run(organization=org) run = SeerNightShiftRun.objects.create( - organization=org, seer_run=None, extras={"options": {}} + organization=org, seer_run=seer_run, extras={"options": {}} ) - SeerNightShiftRunShard.objects.create(run=run, seer_run=shard_seer_run) result = { "verdicts": [ @@ -53,13 +55,14 @@ def test_correlates_via_shard_seer_run(self) -> None: ) as mock_trigger: deliver_night_shift_result( organization_id=org.id, - run_uuid=str(shard_seer_run.uuid), + run_uuid=str(seer_run.uuid), status="completed", result=result, error=None, ) mock_trigger.assert_called_once() + assert not run.shards.exists() results = list(SeerNightShiftRunResult.objects.filter(run=run)) assert len(results) == 1 assert results[0].group_id == group.id @@ -81,14 +84,13 @@ def test_missing_run_logs_warning(self) -> None: assert "night_shift.delivery.missing_run" in mock_logger.warning.call_args.args[0] def test_error_status_records_error_and_returns(self) -> None: - """When status is 'error', record error message and return early.""" + """When status is 'error', record the error on the shard and return early.""" run = self._create_night_shift_run() - assert run.seer_run is not None with patch("sentry.seer.night_shift.delivery.logger") as mock_logger: deliver_night_shift_result( organization_id=run.organization_id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="error", result=None, error="Seer exploded", @@ -97,19 +99,55 @@ def test_error_status_records_error_and_returns(self) -> None: mock_logger.warning.assert_called() assert "night_shift.delivery.no_result" in mock_logger.warning.call_args.args[0] - run.refresh_from_db() - assert run.extras["error_message"] == "Seer exploded" + shard = run.shards.get() + assert shard.extras["error_message"] == "Seer exploded" assert not SeerNightShiftRunResult.objects.filter(run=run).exists() + def test_sibling_shard_success_keeps_other_shard_error(self) -> None: + """A successful shard delivery must not clear an error a sibling shard + recorded on the same run.""" + org = self.create_organization() + project = self.create_project(organization=org) + group = self.create_group(project=project) + run = SeerNightShiftRun.objects.create(organization=org, extras={"options": {}}) + failed_shard = SeerNightShiftRunShard.objects.create( + run=run, seer_run=self.create_seer_run(organization=org) + ) + ok_shard = SeerNightShiftRunShard.objects.create( + run=run, seer_run=self.create_seer_run(organization=org) + ) + + deliver_night_shift_result( + organization_id=org.id, + run_uuid=str(failed_shard.seer_run.uuid), + status="error", + result=None, + error="shard failed", + ) + with patch("sentry.tasks.seer.night_shift.cron.trigger_autofix_agent", return_value=1): + deliver_night_shift_result( + organization_id=org.id, + run_uuid=str(ok_shard.seer_run.uuid), + status="completed", + result={ + "verdicts": [ + {"group_id": group.id, "action": TriageAction.AUTOFIX.value, "reason": "ok"} + ] + }, + error=None, + ) + + failed_shard.refresh_from_db() + assert failed_shard.extras["error_message"] == "shard failed" + def test_invalid_result_logs_exception(self) -> None: """When result can't be parsed as TriageResponse, log and return.""" run = self._create_night_shift_run() - assert run.seer_run is not None with patch("sentry.seer.night_shift.delivery.logger") as mock_logger: deliver_night_shift_result( organization_id=run.organization_id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result={"invalid": "schema"}, error=None, @@ -133,11 +171,10 @@ def test_skip_verdict_marks_group_skipped(self) -> None: ] } - assert run.seer_run is not None with patch("sentry.tasks.seer.night_shift.cron.trigger_autofix_agent") as mock_trigger: deliver_night_shift_result( organization_id=org.id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result=result, error=None, @@ -171,13 +208,12 @@ def test_autofix_verdict_triggers_autofix(self) -> None: ] } - assert run.seer_run is not None with patch( "sentry.tasks.seer.night_shift.cron.trigger_autofix_agent", return_value=42 ) as mock_trigger: deliver_night_shift_result( organization_id=org.id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result=result, error=None, @@ -211,11 +247,10 @@ def test_root_cause_only_verdict_marks_group_skipped(self) -> None: ] } - assert run.seer_run is not None with patch("sentry.tasks.seer.night_shift.cron.trigger_autofix_agent") as mock_trigger: deliver_night_shift_result( organization_id=org.id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result=result, error=None, @@ -246,11 +281,10 @@ def test_dry_run_skips_autofix(self) -> None: ] } - assert run.seer_run is not None with patch("sentry.tasks.seer.night_shift.cron.trigger_autofix_agent") as mock_trigger: deliver_night_shift_result( organization_id=org.id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result=result, error=None, @@ -288,7 +322,6 @@ def trigger_side_effect(**kwargs: Any) -> int: raise RuntimeError("trigger failed") return 7 - assert run.seer_run is not None with ( patch( "sentry.tasks.seer.night_shift.cron.trigger_autofix_agent", @@ -298,7 +331,7 @@ def trigger_side_effect(**kwargs: Any) -> int: ): deliver_night_shift_result( organization_id=org.id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result=result, error=None, @@ -330,14 +363,13 @@ def test_unknown_group_ids_logged(self) -> None: ] } - assert run.seer_run is not None with ( patch("sentry.tasks.seer.night_shift.cron.trigger_autofix_agent") as mock_trigger, patch("sentry.seer.night_shift.delivery.logger") as mock_logger, ): deliver_night_shift_result( organization_id=org.id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result=result, error=None, @@ -364,13 +396,12 @@ def test_user_context_passed_to_autofix(self) -> None: ] } - assert run.seer_run is not None with patch( "sentry.tasks.seer.night_shift.cron.trigger_autofix_agent", return_value=1 ) as mock_trigger: deliver_night_shift_result( organization_id=org.id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result=result, error=None, @@ -383,7 +414,9 @@ def test_successful_delivery_clears_stale_error_message(self) -> None: org = self.create_organization() project = self.create_project(organization=org) group = self.create_group(project=project) - run = self._create_night_shift_run(organization=org, error_message="Night shift run failed") + run = self._create_night_shift_run(organization=org) + shard = run.shards.get() + shard.update(extras={"error_message": "Night shift run failed"}) result = { "verdicts": [ @@ -391,18 +424,17 @@ def test_successful_delivery_clears_stale_error_message(self) -> None: ] } - assert run.seer_run is not None with patch("sentry.tasks.seer.night_shift.cron.trigger_autofix_agent", return_value=1): deliver_night_shift_result( organization_id=org.id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result=result, error=None, ) - run.refresh_from_db() - assert "error_message" not in run.extras + shard.refresh_from_db() + assert "error_message" not in shard.extras def test_empty_reason_no_user_context(self) -> None: """Empty reason should result in no user_context.""" @@ -415,13 +447,12 @@ def test_empty_reason_no_user_context(self) -> None: "verdicts": [{"group_id": group.id, "action": TriageAction.AUTOFIX.value, "reason": ""}] } - assert run.seer_run is not None with patch( "sentry.tasks.seer.night_shift.cron.trigger_autofix_agent", return_value=1 ) as mock_trigger: deliver_night_shift_result( organization_id=org.id, - run_uuid=str(run.seer_run.uuid), + run_uuid=self._run_uuid(run), status="completed", result=result, error=None, diff --git a/tests/sentry/tasks/seer/test_night_shift.py b/tests/sentry/tasks/seer/test_night_shift.py index d2cf2e6162ab..711b1c08e4fc 100644 --- a/tests/sentry/tasks/seer/test_night_shift.py +++ b/tests/sentry/tasks/seer/test_night_shift.py @@ -19,7 +19,7 @@ schedule_night_shift, ) from sentry.tasks.seer.night_shift.models import TriageAction -from sentry.tasks.seer.night_shift.simple_triage import fixability_score_strategy +from sentry.tasks.seer.night_shift.simple_triage import ScoredCandidate, fixability_score_strategy from sentry.tasks.seer.night_shift.skip_cache import key as skip_cache_key from sentry.tasks.seer.night_shift.skip_cache import mark_skipped from sentry.testutils.cases import SnubaTestCase, TestCase @@ -496,6 +496,60 @@ def _store_event_and_update_group(self, project, fingerprint, **group_attrs): Group.objects.filter(id=event.group_id).update(**group_attrs) return Group.objects.get(id=event.group_id) + def _shard_group_ids(self, shard): + outbox = CellOutbox.objects.get( + category=OutboxCategory.SEER_RUN_CREATE, object_identifier=shard.seer_run_id + ) + assert outbox.payload is not None + return [c["group_id"] for c in outbox.payload["body"]["payload"]["candidates"]] + + def test_chunking_preserves_order_across_even_shards(self) -> None: + org = self.create_organization() + project = self.create_project(organization=org) + self._make_eligible(project) + groups = [self.create_group(project=project) for _ in range(4)] + scored = [ScoredCandidate(group=g, fixability=0.9) for g in groups] + + with ( + self.options({"seer.night_shift.shard_size": 2}), + self.feature("organizations:gen-ai-features"), + patch( + "sentry.tasks.seer.night_shift.cron.fixability_score_strategy", + return_value=scored, + ), + ): + run_night_shift_for_org(org.id) + + run = SeerNightShiftRun.objects.get(organization=org) + shards = list(SeerNightShiftRunShard.objects.filter(run=run).order_by("id")) + # 4 candidates @ size 2 -> two even shards, fixability order preserved. + assert [self._shard_group_ids(s) for s in shards] == [ + [groups[0].id, groups[1].id], + [groups[2].id, groups[3].id], + ] + + def test_chunking_single_shard_when_size_exceeds_count(self) -> None: + org = self.create_organization() + project = self.create_project(organization=org) + self._make_eligible(project) + groups = [self.create_group(project=project) for _ in range(3)] + scored = [ScoredCandidate(group=g, fixability=0.9) for g in groups] + + with ( + self.options({"seer.night_shift.shard_size": 10}), + self.feature("organizations:gen-ai-features"), + patch( + "sentry.tasks.seer.night_shift.cron.fixability_score_strategy", + return_value=scored, + ), + ): + run_night_shift_for_org(org.id) + + run = SeerNightShiftRun.objects.get(organization=org) + shards = list(SeerNightShiftRunShard.objects.filter(run=run)) + assert len(shards) == 1 + assert self._shard_group_ids(shards[0]) == [g.id for g in groups] + def test_dispatches_candidates_to_seer_feature(self) -> None: org = self.create_organization() project = self.create_project(organization=org) From b6b6e805988874a1e26f1a14143c848c5f561b19 Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Thu, 18 Jun 2026 20:18:22 -0400 Subject: [PATCH 5/8] ref(seer): Stop writing the legacy seer_run FK on sharded runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nothing outside the delivery fallback reads SeerNightShiftRun.seer_run, and sharded runs resolve via their shards. Leave the scalar FK null on new runs instead of pointing it at the first shard — this drops the first-shard tracking in dispatch and is a step toward removing the column once pre-shard rows are backfilled. The delivery read-fallback stays for those rows until then. Co-Authored-By: Claude Opus 4.8 --- src/sentry/tasks/seer/night_shift/cron.py | 9 +-------- tests/sentry/tasks/seer/test_night_shift.py | 13 +++++-------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/src/sentry/tasks/seer/night_shift/cron.py b/src/sentry/tasks/seer/night_shift/cron.py index 1d1418761c46..b55f2f9af16e 100644 --- a/src/sentry/tasks/seer/night_shift/cron.py +++ b/src/sentry/tasks/seer/night_shift/cron.py @@ -537,12 +537,11 @@ def _link_shard(created: SeerRun) -> None: SeerNightShiftRunShard.objects.create(run=run, seer_run=created) shards = list(chunked(scored, options.get("seer.night_shift.shard_size"))) - first_seer_run: SeerRun | None = None dispatched = 0 for shard_index, chunk in enumerate(shards): payload = _build_triage_payload(chunk, resolved_options) try: - seer_run = client.start_feature_run( + client.start_feature_run( feature_id="night_shift", payload=payload.dict(), flush=False, @@ -554,8 +553,6 @@ def _link_shard(created: SeerRun) -> None: extra={**log_extra, "shard_index": shard_index, "num_shards": len(shards)}, ) continue - if first_seer_run is None: - first_seer_run = seer_run dispatched += 1 if dispatched == 0: @@ -564,10 +561,6 @@ def _link_shard(created: SeerRun) -> None: logger.error("night_shift.dispatch_failed", extra={**log_extra, "num_shards": len(shards)}) return - # Point the legacy scalar FK at the first dispatched shard for the transition. - if first_seer_run is not None: - run.update(seer_run=first_seer_run) - sentry_sdk.metrics.distribution("night_shift.org_run_duration", time.monotonic() - start_time) logger.info( "night_shift.feature_dispatched", diff --git a/tests/sentry/tasks/seer/test_night_shift.py b/tests/sentry/tasks/seer/test_night_shift.py index 711b1c08e4fc..224390c81f51 100644 --- a/tests/sentry/tasks/seer/test_night_shift.py +++ b/tests/sentry/tasks/seer/test_night_shift.py @@ -569,11 +569,11 @@ def test_dispatches_candidates_to_seer_feature(self) -> None: mock_autofix.assert_not_called() run = SeerNightShiftRun.objects.get(organization=org) - assert run.seer_run is not None - assert run.seer_run.type == SeerRunType.FEATURE_RUN + shard = run.shards.get() + assert shard.seer_run.type == SeerRunType.FEATURE_RUN seer_run, body = _dispatched_feature_body(org) - assert seer_run.id == run.seer_run_id + assert seer_run.id == shard.seer_run_id assert body["feature_id"] == "night_shift" assert [c["group_id"] for c in body["payload"]["candidates"]] == [group.id] assert body["payload"]["candidates"][0]["priority"] == "high" @@ -584,8 +584,8 @@ def test_dispatches_candidates_to_seer_feature(self) -> None: assert outbox.payload is not None assert outbox.payload["viewer_context"] == {"organization_id": org.id} - assert run.seer_run.mirror_status == SeerRunMirrorStatus.PENDING - assert run.seer_run.seer_run_state_id is None + assert shard.seer_run.mirror_status == SeerRunMirrorStatus.PENDING + assert shard.seer_run.seer_run_state_id is None assert run.extras.get("error_message") is None # Verdicts and autofix are Seer's responsibility now; no result rows here. assert not SeerNightShiftRunResult.objects.filter(run=run).exists() @@ -627,8 +627,6 @@ def test_shards_candidates_across_feature_runs(self) -> None: assert sorted(shard_sizes) == [1, 2] assert sorted(dispatched_group_ids) == sorted(g.id for g in groups) - - assert run.seer_run_id == shards[0].seer_run_id assert run.extras.get("error_message") is None def test_partial_shard_failure_still_dispatches(self) -> None: @@ -660,7 +658,6 @@ def flaky_create(*args, **kwargs): assert run.extras.get("error_message") is None assert SeerNightShiftRunShard.objects.filter(run=run).count() == 1 assert SeerRun.objects.filter(organization=org, type=SeerRunType.FEATURE_RUN).count() == 1 - assert run.seer_run_id is not None def test_no_candidates_skips_dispatch(self) -> None: org = self.create_organization() From 62c25fa68a4542f24775e6ed90b1a2363dae70c3 Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Thu, 18 Jun 2026 20:27:38 -0400 Subject: [PATCH 6/8] fix(seer): Surface shard errors in the night shift run API Per-shard delivery errors record on SeerNightShiftRunShard.extras, but the run serializer read errorMessage only from the run, so a failed shard could read as a healthy run. Surface a shard error_message when the run itself has none. Addresses Cursor review. Co-Authored-By: Claude Opus 4.8 --- .../serializers/models/seer_night_shift_run.py | 15 ++++++++++++--- .../test_organization_seer_workflows.py | 18 +++++++++++++++++- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/sentry/api/serializers/models/seer_night_shift_run.py b/src/sentry/api/serializers/models/seer_night_shift_run.py index 1e40a30e3572..9bab275729cd 100644 --- a/src/sentry/api/serializers/models/seer_night_shift_run.py +++ b/src/sentry/api/serializers/models/seer_night_shift_run.py @@ -46,7 +46,7 @@ class SeerNightShiftRunSerializer(Serializer[SeerNightShiftRunResponse]): def get_attrs( self, item_list: Sequence[SeerNightShiftRun], user: Any, **kwargs: Any ) -> dict[SeerNightShiftRun, dict[str, Any]]: - prefetch_related_objects(item_list, "results") + prefetch_related_objects(item_list, "results", "shards") return {} def serialize( @@ -59,12 +59,21 @@ def serialize( all_results = list(obj.results.all()) triage_results = [r for r in all_results if r.kind == SeerWorkflowStrategy.AGENTIC_TRIAGE] extras = obj.extras or {} + # A dispatch failure records on the run; per-shard delivery failures record + # on the shard, so surface either so a failed shard doesn't read as healthy. + shard_error = next( + ( + s.extras["error_message"] + for s in obj.shards.all() + if (s.extras or {}).get("error_message") + ), + None, + ) return { "id": str(obj.id), "dateAdded": obj.date_added.isoformat(), "extras": extras, - # Legacy alias: error_message lives in extras now. - "errorMessage": extras.get("error_message"), + "errorMessage": extras.get("error_message") or shard_error, "results": [_serialize_result(r) for r in all_results], "issues": [_serialize_legacy_issue(r) for r in triage_results], # Match the pre-migration column behavior: always "agentic_triage" diff --git a/tests/sentry/seer/endpoints/test_organization_seer_workflows.py b/tests/sentry/seer/endpoints/test_organization_seer_workflows.py index 5f6207305e05..5aa56c21d0fb 100644 --- a/tests/sentry/seer/endpoints/test_organization_seer_workflows.py +++ b/tests/sentry/seer/endpoints/test_organization_seer_workflows.py @@ -1,4 +1,8 @@ -from sentry.seer.models.night_shift import SeerNightShiftRun, SeerNightShiftRunResult +from sentry.seer.models.night_shift import ( + SeerNightShiftRun, + SeerNightShiftRunResult, + SeerNightShiftRunShard, +) from sentry.testutils.cases import APITestCase @@ -50,6 +54,18 @@ def test_returns_runs_for_org_with_nested_results(self) -> None: assert legacy["groupId"] == str(group.id) assert legacy["action"] == "autofix_triggered" + def test_surfaces_shard_error_message(self) -> None: + # Per-shard delivery errors live on the shard; the run API must still + # surface them so a failed shard doesn't read as a healthy run. + run = SeerNightShiftRun.objects.create(organization=self.organization) + SeerNightShiftRunShard.objects.create(run=run) + SeerNightShiftRunShard.objects.create(run=run, extras={"error_message": "shard failed"}) + + with self.feature("organizations:seer-night-shift"): + response = self.get_success_response(self.organization.slug) + + assert response.data[0]["errorMessage"] == "shard failed" + def test_runs_ordered_by_date_added_desc(self) -> None: older = SeerNightShiftRun.objects.create(organization=self.organization) newer = SeerNightShiftRun.objects.create(organization=self.organization) From b5a6802623ca4a557510d3784a8dfeeca03989d1 Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Thu, 18 Jun 2026 20:41:00 -0400 Subject: [PATCH 7/8] test(seer): Narrow nullable shard.seer_run for mypy shard.seer_run is Optional (nullable OneToOne), so tests asserting on it tripped union-attr under CI mypy. Use the non-null SeerRun from the dispatch helper / captured locals instead. Co-Authored-By: Claude Opus 4.8 --- tests/sentry/seer/night_shift/test_delivery.py | 18 +++++++++--------- tests/sentry/tasks/seer/test_night_shift.py | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/sentry/seer/night_shift/test_delivery.py b/tests/sentry/seer/night_shift/test_delivery.py index 50ce5714bc22..83d588f7d68c 100644 --- a/tests/sentry/seer/night_shift/test_delivery.py +++ b/tests/sentry/seer/night_shift/test_delivery.py @@ -32,7 +32,9 @@ def _create_night_shift_run( return run def _run_uuid(self, run: SeerNightShiftRun) -> str: - return str(run.shards.get().seer_run.uuid) + seer_run = run.shards.get().seer_run + assert seer_run is not None + return str(seer_run.uuid) def test_correlates_via_legacy_seer_run_fallback(self) -> None: """Pre-shard runs have only the scalar seer_run FK and no shard rows; @@ -110,16 +112,14 @@ def test_sibling_shard_success_keeps_other_shard_error(self) -> None: project = self.create_project(organization=org) group = self.create_group(project=project) run = SeerNightShiftRun.objects.create(organization=org, extras={"options": {}}) - failed_shard = SeerNightShiftRunShard.objects.create( - run=run, seer_run=self.create_seer_run(organization=org) - ) - ok_shard = SeerNightShiftRunShard.objects.create( - run=run, seer_run=self.create_seer_run(organization=org) - ) + failed_seer_run = self.create_seer_run(organization=org) + ok_seer_run = self.create_seer_run(organization=org) + failed_shard = SeerNightShiftRunShard.objects.create(run=run, seer_run=failed_seer_run) + SeerNightShiftRunShard.objects.create(run=run, seer_run=ok_seer_run) deliver_night_shift_result( organization_id=org.id, - run_uuid=str(failed_shard.seer_run.uuid), + run_uuid=str(failed_seer_run.uuid), status="error", result=None, error="shard failed", @@ -127,7 +127,7 @@ def test_sibling_shard_success_keeps_other_shard_error(self) -> None: with patch("sentry.tasks.seer.night_shift.cron.trigger_autofix_agent", return_value=1): deliver_night_shift_result( organization_id=org.id, - run_uuid=str(ok_shard.seer_run.uuid), + run_uuid=str(ok_seer_run.uuid), status="completed", result={ "verdicts": [ diff --git a/tests/sentry/tasks/seer/test_night_shift.py b/tests/sentry/tasks/seer/test_night_shift.py index 224390c81f51..b73f4d1dbd44 100644 --- a/tests/sentry/tasks/seer/test_night_shift.py +++ b/tests/sentry/tasks/seer/test_night_shift.py @@ -570,10 +570,10 @@ def test_dispatches_candidates_to_seer_feature(self) -> None: run = SeerNightShiftRun.objects.get(organization=org) shard = run.shards.get() - assert shard.seer_run.type == SeerRunType.FEATURE_RUN seer_run, body = _dispatched_feature_body(org) assert seer_run.id == shard.seer_run_id + assert seer_run.type == SeerRunType.FEATURE_RUN assert body["feature_id"] == "night_shift" assert [c["group_id"] for c in body["payload"]["candidates"]] == [group.id] assert body["payload"]["candidates"][0]["priority"] == "high" @@ -584,8 +584,8 @@ def test_dispatches_candidates_to_seer_feature(self) -> None: assert outbox.payload is not None assert outbox.payload["viewer_context"] == {"organization_id": org.id} - assert shard.seer_run.mirror_status == SeerRunMirrorStatus.PENDING - assert shard.seer_run.seer_run_state_id is None + assert seer_run.mirror_status == SeerRunMirrorStatus.PENDING + assert seer_run.seer_run_state_id is None assert run.extras.get("error_message") is None # Verdicts and autofix are Seer's responsibility now; no result rows here. assert not SeerNightShiftRunResult.objects.filter(run=run).exists() From 5750c86ff193437ff2641c9ef1221582eeacdf1f Mon Sep 17 00:00:00 2001 From: Trevor Elkins Date: Thu, 18 Jun 2026 20:48:09 -0400 Subject: [PATCH 8/8] fix(seer): Record partial shard dispatch failures on the run When some shards failed to dispatch but at least one succeeded, the run was treated as fully successful and the API errorMessage stayed empty, hiding that candidates in the failed chunks were never triaged. Record a run-level error_message for partial failures (delivery only clears per-shard errors, so it persists) and emit a metric. Addresses Cursor review. Co-Authored-By: Claude Opus 4.8 --- src/sentry/tasks/seer/night_shift/cron.py | 9 +++++++++ tests/sentry/tasks/seer/test_night_shift.py | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/sentry/tasks/seer/night_shift/cron.py b/src/sentry/tasks/seer/night_shift/cron.py index b55f2f9af16e..1d95dc1b5762 100644 --- a/src/sentry/tasks/seer/night_shift/cron.py +++ b/src/sentry/tasks/seer/night_shift/cron.py @@ -561,6 +561,15 @@ def _link_shard(created: SeerRun) -> None: logger.error("night_shift.dispatch_failed", extra={**log_extra, "num_shards": len(shards)}) return + failed_shards = len(shards) - dispatched + if failed_shards: + sentry_sdk.metrics.count("night_shift.shard_dispatch_failure", failed_shards) + _record_run_error(run, f"Failed to dispatch {failed_shards} of {len(shards)} triage shards") + logger.warning( + "night_shift.partial_dispatch_failure", + extra={**log_extra, "num_shards": len(shards), "num_shards_dispatched": dispatched}, + ) + sentry_sdk.metrics.distribution("night_shift.org_run_duration", time.monotonic() - start_time) logger.info( "night_shift.feature_dispatched", diff --git a/tests/sentry/tasks/seer/test_night_shift.py b/tests/sentry/tasks/seer/test_night_shift.py index b73f4d1dbd44..79ed31d6419e 100644 --- a/tests/sentry/tasks/seer/test_night_shift.py +++ b/tests/sentry/tasks/seer/test_night_shift.py @@ -655,9 +655,10 @@ def flaky_create(*args, **kwargs): run_night_shift_for_org(org.id) run = SeerNightShiftRun.objects.get(organization=org) - assert run.extras.get("error_message") is None + # One shard dispatched; the failed one is recorded so it isn't invisible. assert SeerNightShiftRunShard.objects.filter(run=run).count() == 1 assert SeerRun.objects.filter(organization=org, type=SeerRunType.FEATURE_RUN).count() == 1 + assert run.extras["error_message"] == "Failed to dispatch 1 of 2 triage shards" def test_no_candidates_skips_dispatch(self) -> None: org = self.create_organization()