Skip to content
2 changes: 1 addition & 1 deletion migrations_lockfile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ releases: 0004_cleanup_failed_safe_deletes

replays: 0007_organizationmember_replay_access

seer: 0018_backfill_seer_agent_run_group_id
seer: 0019_add_night_shift_run_shard

sentry: 1117_drop_organizationmapping_codecov_access_delete

Expand Down
6 changes: 6 additions & 0 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,12 @@
default=10,
flags=FLAG_AUTOMATOR_MODIFIABLE,
)
# Maximum number of scored triage candidates placed in one night shift shard.
# The night shift cron chunks its candidates by this size and dispatches each
# chunk as its own Seer feature run. The value is not range-checked at
# registration, so it should be kept at 1 or greater.
register(
    "seer.night_shift.shard_size",
    type=Int,
    default=5,
    flags=FLAG_AUTOMATOR_MODIFIABLE,
)
# Per-org overrides for night shift run options. Keyed by stringified
# organization id; each value is a partial set of run-option overrides (e.g.
# {"max_candidates": 20}) that layer on top of the global defaults but below
Expand Down
65 changes: 65 additions & 0 deletions src/sentry/seer/migrations/0019_add_night_shift_run_shard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Generated by Django 5.2.14 on 2026-06-18 23:19

import django.db.models.deletion
import sentry.db.models.fields.bounded
import sentry.db.models.fields.foreignkey
from django.db import migrations, models

from sentry.new_migrations.migrations import CheckedMigration


class Migration(CheckedMigration):
    # This flag is used to mark that a migration shouldn't be automatically run in production.
    # This should only be used for operations where it's safe to run the migration after your
    # code has deployed. So this should not be used for most operations that alter the schema
    # of a table.
    # Here are some things that make sense to mark as post deployment:
    # - Large data migrations. Typically we want these to be run manually so that they can be
    #   monitored and not block the deploy for a long period of time while they run.
    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
    #   run this outside deployments so that we don't block them. Note that while adding an index
    #   is a schema change, it's completely safe to run the operation after the code has deployed.
    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

    is_post_deployment = False

    dependencies = [
        ("seer", "0018_backfill_seer_agent_run_group_id"),
    ]

    # Creates the `seer_nightshiftrunshard` table backing SeerNightShiftRunShard:
    # one row per shard of a night shift run.
    operations = [
        migrations.CreateModel(
            name="SeerNightShiftRunShard",
            fields=[
                (
                    "id",
                    sentry.db.models.fields.bounded.BoundedBigAutoField(
                        primary_key=True, serialize=False
                    ),
                ),
                ("date_updated", models.DateTimeField(auto_now=True)),
                ("date_added", models.DateTimeField(auto_now_add=True)),
                ("extras", models.JSONField(db_default={}, default=dict)),
                (
                    # Parent night shift run; shard rows are removed with it (CASCADE).
                    "run",
                    sentry.db.models.fields.foreignkey.FlexibleForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="shards",
                        to="seer.seernightshiftrun",
                    ),
                ),
                (
                    # Nullable one-to-one link to the SeerRun; set to NULL when that
                    # SeerRun is deleted (SET_NULL) so the shard row is kept.
                    "seer_run",
                    models.OneToOneField(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="night_shift_shard",
                        to="seer.seerrun",
                    ),
                ),
            ],
            options={
                "db_table": "seer_nightshiftrunshard",
            },
        ),
    ]
23 changes: 23 additions & 0 deletions src/sentry/seer/models/night_shift.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,26 @@ class Meta:
]

__repr__ = sane_repr("run_id", "kind", "group_id")


@cell_silo_model
class SeerNightShiftRunShard(DefaultFieldsModel):
    """One shard of a night shift run, owning the SeerRun for a single
    dispatched Seer feature run. A run fans out its work into one or more shards
    dispatched as independent feature runs.

    Shards are reachable from the parent run as ``run.shards`` and from the
    SeerRun as ``seer_run.night_shift_shard``.
    """

    __relocation_scope__ = RelocationScope.Excluded

    # Parent night shift run. Deleting the run deletes its shards (CASCADE).
    run = FlexibleForeignKey(
        "seer.SeerNightShiftRun", on_delete=models.CASCADE, related_name="shards"
    )
    # The SeerRun dispatched for this shard. One-to-one, so a SeerRun belongs to
    # at most one shard; nullable, and reset to NULL if the SeerRun is deleted.
    seer_run = models.OneToOneField(
        "seer.SeerRun", on_delete=models.SET_NULL, null=True, related_name="night_shift_shard"
    )
    # JSON payload for the shard; defaults to an empty object at both the
    # Python level (default=dict) and the database level (db_default={}).
    extras = models.JSONField(db_default={}, default=dict)

    class Meta:
        app_label = "seer"
        db_table = "seer_nightshiftrunshard"

    __repr__ = sane_repr("run_id", "seer_run_id")
15 changes: 9 additions & 6 deletions src/sentry/seer/night_shift/delivery.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Any

import sentry_sdk
from django.db.models import Q

from sentry.constants import SEER_AUTOMATED_RUN_STOPPING_POINT_DEFAULT, ObjectStatus
from sentry.models.group import Group
Expand All @@ -29,12 +30,14 @@ def deliver_night_shift_result(
error: str | None,
) -> None:
"""Process a night_shift result from Seer."""
try:
run = SeerNightShiftRun.objects.select_related("organization", "seer_run").get(
organization_id=organization_id,
seer_run__uuid=run_uuid,
)
except SeerNightShiftRun.DoesNotExist:
run = (
SeerNightShiftRun.objects.filter(organization_id=organization_id)
.filter(Q(shards__seer_run__uuid=run_uuid) | Q(seer_run__uuid=run_uuid))

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to be backwards compatible until I perform a data migration.

.select_related("organization")
.distinct()
.first()
)
if run is None:
Comment thread
cursor[bot] marked this conversation as resolved.
logger.warning(
"night_shift.delivery.missing_run",
extra={"organization_id": organization_id, "run_uuid": run_uuid},
Expand Down
99 changes: 60 additions & 39 deletions src/sentry/tasks/seer/night_shift/cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,19 @@
from sentry.seer.models.night_shift import (
SeerNightShiftRun,
SeerNightShiftRunResult,
SeerNightShiftRunShard,
)
from sentry.seer.models.project_repository import SeerProjectRepository
from sentry.seer.models.run import SeerRun
from sentry.seer.models.workflow import SeerWorkflowConfig, SeerWorkflowStrategy
from sentry.seer.night_shift.models import NightShiftPayload, TriageCandidate, TriageTweaks
from sentry.tasks.base import instrumented_task
from sentry.tasks.seer.night_shift.models import TriageAction, TriageResult
from sentry.tasks.seer.night_shift.simple_triage import fixability_score_strategy, priority_label
from sentry.tasks.seer.night_shift.simple_triage import (
ScoredCandidate,
fixability_score_strategy,
priority_label,
)
from sentry.tasks.seer.night_shift.tweaks import (
DEFAULT_EXTRA_TRIAGE_INSTRUCTIONS,
DEFAULT_INTELLIGENCE_LEVEL,
Expand Down Expand Up @@ -478,24 +483,11 @@ def _get_eligible_projects(
return eligible


def _dispatch_to_seer_feature(
run: SeerNightShiftRun,
organization: Organization,
eligible: Sequence[EligibleProject],
def _build_triage_payload(
candidates: Sequence[ScoredCandidate],
resolved_options: SeerNightShiftRunOptions,
log_extra: dict[str, object],
start_time: float,
) -> None:
"""Hand triage off to Seer's feature-run endpoint. Seer runs the triage agent
and pushes verdicts back via deliver_feature_result, which marks skips and
triggers autofix (using dry_run from run.extras["options"])."""
eligible_projects = [ep.project for ep in eligible]
scored = fixability_score_strategy(eligible_projects, resolved_options["max_candidates"])
if not scored:
logger.info("night_shift.no_candidates", extra=log_extra)
return

payload = NightShiftPayload(
) -> NightShiftPayload:
return NightShiftPayload(
candidates=[
TriageCandidate(
group_id=c.group.id,
Expand All @@ -506,52 +498,81 @@ def _dispatch_to_seer_feature(
first_seen=c.group.first_seen.isoformat(),
priority=priority_label(c.group.priority),
)
for c in scored
for c in candidates
],
tweaks=TriageTweaks(
intelligence_level=resolved_options["intelligence_level"],
reasoning_effort=resolved_options["reasoning_effort"],
extra_triage_instructions=resolved_options["extra_triage_instructions"],
),
)


def _dispatch_to_seer_feature(
    run: SeerNightShiftRun,
    organization: Organization,
    eligible: Sequence[EligibleProject],
    resolved_options: SeerNightShiftRunOptions,
    log_extra: dict[str, object],
    start_time: float,
) -> None:
    """Shard the scored candidates into chunks of seer.night_shift.shard_size and
    dispatch each chunk as its own Seer feature run, recorded as a
    SeerNightShiftRunShard. Seer pushes verdicts back per shard via
    deliver_feature_result.

    A shard whose dispatch raises is logged and skipped so the remaining shards
    are still attempted. The run is recorded as failed only when no shard could
    be dispatched at all.

    Args:
        run: The night shift run the shards belong to.
        organization: Organization the Seer client is created for.
        eligible: Eligible projects whose issues are scored for triage.
        resolved_options: Resolved run options; ``max_candidates`` bounds the
            number of scored candidates.
        log_extra: Base ``extra`` dict attached to every log record emitted here.
        start_time: ``time.monotonic()`` value taken when the org run started,
            used for the duration metric.
    """
    eligible_projects = [ep.project for ep in eligible]
    scored = fixability_score_strategy(eligible_projects, resolved_options["max_candidates"])
    if not scored:
        logger.info("night_shift.no_candidates", extra=log_extra)
        return

    try:
        client = SeerAgentClient(organization)
    except SeerPermissionError:
        logger.info("night_shift.no_seer_access", extra=log_extra)
        _record_run_error(run, "Organization does not have Seer access")
        return

    # The option is operator-modifiable and not range-checked at registration.
    # A size below 1 would not split the candidates at all (everything would be
    # sent as one oversized shard), so clamp to the smallest valid size.
    shard_size = options.get("seer.night_shift.shard_size")
    if shard_size < 1:
        logger.warning(
            "night_shift.invalid_shard_size",
            extra={**log_extra, "shard_size": shard_size},
        )
        shard_size = 1

    shards = list(chunked(scored, shard_size))

    dispatched = 0
    for shard_index, chunk in enumerate(shards):
        payload = _build_triage_payload(chunk, resolved_options)

        # `is_first` is bound as a default argument so each callback keeps the
        # value for its own shard rather than the loop variable's latest value.
        def _link_shard(created: SeerRun, is_first: bool = shard_index == 0) -> None:
            SeerNightShiftRunShard.objects.create(run=run, seer_run=created)
            if is_first:
                # Also set the run-level seer_run, which delivery still
                # accepts as a way to correlate a result to this run.
                run.update(seer_run=created)

        try:
            client.start_feature_run(
                feature_id="night_shift",
                payload=payload.dict(),
                flush=False,
                on_run_created=_link_shard,
            )
        except Exception:
            # Best effort per shard: record the failure and keep dispatching.
            logger.exception(
                "night_shift.shard_dispatch_failed",
                extra={**log_extra, "shard_index": shard_index, "num_shards": len(shards)},
            )
            continue
        dispatched += 1

    if dispatched == 0:
        sentry_sdk.metrics.count("night_shift.run_error", 1)
        _record_run_error(run, "Night shift dispatch failed")
        logger.error("night_shift.dispatch_failed", extra={**log_extra, "num_shards": len(shards)})
        return

    sentry_sdk.metrics.distribution("night_shift.org_run_duration", time.monotonic() - start_time)
    logger.info(
        "night_shift.feature_dispatched",
        extra={
            **log_extra,
            "num_eligible_projects": len(eligible_projects),
            "num_candidates": len(scored),
            "num_shards": len(shards),
            "num_shards_dispatched": dispatched,
        },
    )

Expand Down
39 changes: 38 additions & 1 deletion tests/sentry/seer/night_shift/test_delivery.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@

from sentry.models.organization import Organization
from sentry.seer.autofix.utils import AutofixStoppingPoint
from sentry.seer.models.night_shift import SeerNightShiftRun, SeerNightShiftRunResult
from sentry.seer.models.night_shift import (
SeerNightShiftRun,
SeerNightShiftRunResult,
SeerNightShiftRunShard,
)
from sentry.seer.night_shift.delivery import deliver_night_shift_result
from sentry.tasks.seer.night_shift.models import TriageAction
from sentry.tasks.seer.night_shift.skip_cache import key as skip_cache_key
Expand All @@ -27,6 +31,39 @@ def _create_night_shift_run(
extras=extras,
)

def test_correlates_via_shard_seer_run(self) -> None:
    """A result whose run uuid matches only a shard's SeerRun (the run's own
    seer_run is unset) is still attributed to the parent night shift run, and
    that shard's verdicts are processed."""
    organization = self.create_organization()
    group = self.create_group(project=self.create_project(organization=organization))
    shard_run = self.create_seer_run(organization=organization)
    night_shift_run = SeerNightShiftRun.objects.create(
        organization=organization, seer_run=None, extras={"options": {}}
    )
    SeerNightShiftRunShard.objects.create(run=night_shift_run, seer_run=shard_run)

    autofix_verdict = {
        "group_id": group.id,
        "action": TriageAction.AUTOFIX.value,
        "reason": "ok",
    }
    autofix_target = "sentry.tasks.seer.night_shift.cron.trigger_autofix_agent"
    with patch(autofix_target, return_value=42) as trigger_autofix:
        deliver_night_shift_result(
            organization_id=organization.id,
            run_uuid=str(shard_run.uuid),
            status="completed",
            result={"verdicts": [autofix_verdict]},
            error=None,
        )

    # Autofix fired exactly once, and the single verdict was stored on the parent run.
    trigger_autofix.assert_called_once()
    recorded = list(SeerNightShiftRunResult.objects.filter(run=night_shift_run))
    assert len(recorded) == 1
    assert recorded[0].group_id == group.id

def test_missing_run_logs_warning(self) -> None:
"""When run_uuid doesn't match any SeerNightShiftRun, log and return."""
org = self.create_organization()
Expand Down
Loading
Loading