Skip to content
2 changes: 1 addition & 1 deletion migrations_lockfile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ releases: 0004_cleanup_failed_safe_deletes

replays: 0007_organizationmember_replay_access

seer: 0018_backfill_seer_agent_run_group_id
seer: 0019_add_night_shift_run_shard

sentry: 1117_drop_organizationmapping_codecov_access_delete

Expand Down
15 changes: 12 additions & 3 deletions src/sentry/api/serializers/models/seer_night_shift_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class SeerNightShiftRunSerializer(Serializer[SeerNightShiftRunResponse]):
def get_attrs(
    self, item_list: Sequence[SeerNightShiftRun], user: Any, **kwargs: Any
) -> dict[SeerNightShiftRun, dict[str, Any]]:
    """Prefetch the relations that ``serialize`` reads for every run.

    ``serialize`` iterates ``obj.results.all()`` and ``obj.shards.all()``;
    prefetching both here avoids issuing those queries once per run.

    Returns an empty mapping: no per-item attributes are computed, the
    prefetch is the only effect.
    """
    # One call covers both relations; a separate "results"-only prefetch
    # beforehand would just be an extra pass over item_list.
    prefetch_related_objects(item_list, "results", "shards")
    return {}

def serialize(
Expand All @@ -59,12 +59,21 @@ def serialize(
all_results = list(obj.results.all())
triage_results = [r for r in all_results if r.kind == SeerWorkflowStrategy.AGENTIC_TRIAGE]
extras = obj.extras or {}
# A dispatch failure records on the run; per-shard delivery failures record
# on the shard, so surface either so a failed shard doesn't read as healthy.
shard_error = next(
(
s.extras["error_message"]
for s in obj.shards.all()
if (s.extras or {}).get("error_message")
),
None,
)
return {
"id": str(obj.id),
"dateAdded": obj.date_added.isoformat(),
"extras": extras,
# Legacy alias: error_message lives in extras now.
"errorMessage": extras.get("error_message"),
"errorMessage": extras.get("error_message") or shard_error,
"results": [_serialize_result(r) for r in all_results],
"issues": [_serialize_legacy_issue(r) for r in triage_results],
# Match the pre-migration column behavior: always "agentic_triage"
Expand Down
6 changes: 6 additions & 0 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,12 @@
default=10,
flags=FLAG_AUTOMATOR_MODIFIABLE,
)
# Number of scored candidates placed in each night shift shard. The night
# shift dispatcher chunks its candidates by this size and sends each chunk
# as its own Seer feature run. Expected to be a positive integer.
register(
    "seer.night_shift.shard_size",
    type=Int,
    default=5,
    flags=FLAG_AUTOMATOR_MODIFIABLE,
)
# Per-org overrides for night shift run options. Keyed by stringified
# organization id; each value is a partial set of run-option overrides (e.g.
# {"max_candidates": 20}) that layer on top of the global defaults but below
Expand Down
65 changes: 65 additions & 0 deletions src/sentry/seer/migrations/0019_add_night_shift_run_shard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Generated by Django 5.2.14 on 2026-06-18 23:19

import django.db.models.deletion
import sentry.db.models.fields.bounded
import sentry.db.models.fields.foreignkey
from django.db import migrations, models

from sentry.new_migrations.migrations import CheckedMigration


class Migration(CheckedMigration):
    # Creates the seer_nightshiftrunshard table backing the
    # SeerNightShiftRunShard model: one row per shard of a night shift run.

    # This flag is used to mark that a migration shouldn't be automatically run in production.
    # This should only be used for operations where it's safe to run the migration after your
    # code has deployed. So this should not be used for most operations that alter the schema
    # of a table.
    # Here are some things that make sense to mark as post deployment:
    # - Large data migrations. Typically we want these to be run manually so that they can be
    #   monitored and not block the deploy for a long period of time while they run.
    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
    #   run this outside deployments so that we don't block them. Note that while adding an index
    #   is a schema change, it's completely safe to run the operation after the code has deployed.
    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

    is_post_deployment = False

    dependencies = [
        ("seer", "0018_backfill_seer_agent_run_group_id"),
    ]

    operations = [
        migrations.CreateModel(
            name="SeerNightShiftRunShard",
            fields=[
                (
                    "id",
                    sentry.db.models.fields.bounded.BoundedBigAutoField(
                        primary_key=True, serialize=False
                    ),
                ),
                ("date_updated", models.DateTimeField(auto_now=True)),
                ("date_added", models.DateTimeField(auto_now_add=True)),
                # Free-form JSON; defaults to an empty object both in Python
                # (default=dict) and at the database level (db_default={}).
                ("extras", models.JSONField(db_default={}, default=dict)),
                (
                    # Parent night shift run; shards are deleted with it and
                    # are reachable from the run as `run.shards`.
                    "run",
                    sentry.db.models.fields.foreignkey.FlexibleForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="shards",
                        to="seer.seernightshiftrun",
                    ),
                ),
                (
                    # At most one shard per SeerRun; nullable so the shard
                    # row survives (with NULL here) if the SeerRun is deleted.
                    "seer_run",
                    models.OneToOneField(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="night_shift_shard",
                        to="seer.seerrun",
                    ),
                ),
            ],
            options={
                "db_table": "seer_nightshiftrunshard",
            },
        ),
    ]
23 changes: 23 additions & 0 deletions src/sentry/seer/models/night_shift.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,26 @@ class Meta:
]

__repr__ = sane_repr("run_id", "kind", "group_id")


@cell_silo_model
class SeerNightShiftRunShard(DefaultFieldsModel):
    """One shard of a night shift run.

    A run fans out its work into one or more shards, each dispatched as an
    independent Seer feature run. The shard owns the link to the SeerRun
    created for that dispatch.
    """

    __relocation_scope__ = RelocationScope.Excluded

    # Parent run; deleting the run deletes its shards. Reverse accessor is
    # `run.shards`.
    run = FlexibleForeignKey(
        "seer.SeerNightShiftRun", on_delete=models.CASCADE, related_name="shards"
    )
    # The SeerRun dispatched for this shard (at most one shard per SeerRun).
    # Nullable with SET_NULL so the shard row outlives a deleted SeerRun.
    seer_run = models.OneToOneField(
        "seer.SeerRun", on_delete=models.SET_NULL, null=True, related_name="night_shift_shard"
    )
    # Free-form JSON metadata for the shard; result delivery stores a
    # per-shard "error_message" here.
    extras = models.JSONField(db_default={}, default=dict)

    class Meta:
        app_label = "seer"
        db_table = "seer_nightshiftrunshard"

    __repr__ = sane_repr("run_id", "seer_run_id")
39 changes: 25 additions & 14 deletions src/sentry/seer/night_shift/delivery.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,18 @@
from typing import Any

import sentry_sdk
from django.db.models import Q

from sentry.constants import SEER_AUTOMATED_RUN_STOPPING_POINT_DEFAULT, ObjectStatus
from sentry.models.group import Group
from sentry.models.organization import Organization
from sentry.seer.agent.types import FeatureRunStatus
from sentry.seer.autofix.utils import AutofixStoppingPoint, bulk_read_preferences_from_sentry_db
from sentry.seer.models.night_shift import SeerNightShiftRun, SeerNightShiftRunResult
from sentry.seer.models.night_shift import (
SeerNightShiftRun,
SeerNightShiftRunResult,
SeerNightShiftRunShard,
)
from sentry.seer.night_shift.models import TriageResponse
from sentry.tasks.seer.night_shift.models import TriageAction, TriageResult
from sentry.tasks.seer.night_shift.skip_cache import mark_skipped
Expand All @@ -29,20 +34,28 @@ def deliver_night_shift_result(
error: str | None,
) -> None:
"""Process a night_shift result from Seer."""
try:
run = SeerNightShiftRun.objects.select_related("organization", "seer_run").get(
organization_id=organization_id,
seer_run__uuid=run_uuid,
)
except SeerNightShiftRun.DoesNotExist:
run = (
SeerNightShiftRun.objects.filter(organization_id=organization_id)
.filter(Q(shards__seer_run__uuid=run_uuid) | Q(seer_run__uuid=run_uuid))

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to be backwards compatible until I perform a data migration.

.select_related("organization")
.distinct()
.first()
)
if run is None:
Comment thread
cursor[bot] marked this conversation as resolved.
logger.warning(
"night_shift.delivery.missing_run",
extra={"organization_id": organization_id, "run_uuid": run_uuid},
)
return

# Per-delivery error_message lives on the shard so a sibling shard's success
# can't clear it; the run is the fallback only for pre-shard rows.
error_target: SeerNightShiftRun | SeerNightShiftRunShard = (
run.shards.filter(seer_run__uuid=run_uuid).first() or run
)

if error:
run.update(extras={**(run.extras or {}), "error_message": error})
error_target.update(extras={**(error_target.extras or {}), "error_message": error})
Comment thread
cursor[bot] marked this conversation as resolved.

log_extra: dict[str, object] = {
"organization_id": run.organization_id,
Expand Down Expand Up @@ -70,13 +83,11 @@ def deliver_night_shift_result(
options = (run.extras or {}).get("options") or {}
dry_run = bool(options.get("dry_run", False))

# A failed dispatch may have left a stale error_message even though Seer went
# on to process the run and is now delivering verdicts. Clear it so the run's
# state reflects the successful delivery.
if (run.extras or {}).get("error_message"):
extras = {**run.extras}
# Clear any stale error_message now that this delivery has succeeded.
if (error_target.extras or {}).get("error_message"):
extras = {**error_target.extras}
del extras["error_message"]
run.update(extras=extras)
error_target.update(extras=extras)

_process_verdicts(
run=run,
Expand Down
105 changes: 66 additions & 39 deletions src/sentry/tasks/seer/night_shift/cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,19 @@
from sentry.seer.models.night_shift import (
SeerNightShiftRun,
SeerNightShiftRunResult,
SeerNightShiftRunShard,
)
from sentry.seer.models.project_repository import SeerProjectRepository
from sentry.seer.models.run import SeerRun
from sentry.seer.models.workflow import SeerWorkflowConfig, SeerWorkflowStrategy
from sentry.seer.night_shift.models import NightShiftPayload, TriageCandidate, TriageTweaks
from sentry.tasks.base import instrumented_task
from sentry.tasks.seer.night_shift.models import TriageAction, TriageResult
from sentry.tasks.seer.night_shift.simple_triage import fixability_score_strategy, priority_label
from sentry.tasks.seer.night_shift.simple_triage import (
ScoredCandidate,
fixability_score_strategy,
priority_label,
)
from sentry.tasks.seer.night_shift.tweaks import (
DEFAULT_EXTRA_TRIAGE_INSTRUCTIONS,
DEFAULT_INTELLIGENCE_LEVEL,
Expand Down Expand Up @@ -478,24 +483,11 @@ def _get_eligible_projects(
return eligible


def _dispatch_to_seer_feature(
run: SeerNightShiftRun,
organization: Organization,
eligible: Sequence[EligibleProject],
def _build_triage_payload(
candidates: Sequence[ScoredCandidate],
resolved_options: SeerNightShiftRunOptions,
log_extra: dict[str, object],
start_time: float,
) -> None:
"""Hand triage off to Seer's feature-run endpoint. Seer runs the triage agent
and pushes verdicts back via deliver_feature_result, which marks skips and
triggers autofix (using dry_run from run.extras["options"])."""
eligible_projects = [ep.project for ep in eligible]
scored = fixability_score_strategy(eligible_projects, resolved_options["max_candidates"])
if not scored:
logger.info("night_shift.no_candidates", extra=log_extra)
return

payload = NightShiftPayload(
) -> NightShiftPayload:
return NightShiftPayload(
candidates=[
TriageCandidate(
group_id=c.group.id,
Expand All @@ -506,52 +498,87 @@ def _dispatch_to_seer_feature(
first_seen=c.group.first_seen.isoformat(),
priority=priority_label(c.group.priority),
)
for c in scored
for c in candidates
],
tweaks=TriageTweaks(
intelligence_level=resolved_options["intelligence_level"],
reasoning_effort=resolved_options["reasoning_effort"],
extra_triage_instructions=resolved_options["extra_triage_instructions"],
),
)


def _dispatch_to_seer_feature(
    run: SeerNightShiftRun,
    organization: Organization,
    eligible: Sequence[EligibleProject],
    resolved_options: SeerNightShiftRunOptions,
    log_extra: dict[str, object],
    start_time: float,
) -> None:
    """Shard the scored candidates and dispatch each shard as its own Seer feature run.

    Candidates are split into chunks of ``seer.night_shift.shard_size``; each
    chunk is sent through ``client.start_feature_run`` and recorded as a
    SeerNightShiftRunShard. Seer pushes verdicts back per shard via
    deliver_feature_result.

    A shard whose dispatch raises is logged and skipped so the remaining
    shards still go out. If no shard could be dispatched, a dispatch error is
    recorded on the run; if only some failed, the run records how many.

    Args:
        run: The night shift run the shards belong to.
        organization: Organization used to build the Seer client.
        eligible: Eligible projects whose issues are scored for triage.
        resolved_options: Run options; supplies ``max_candidates`` and the
            triage tweaks that go into each payload.
        log_extra: Base structured-logging context added to every log line.
        start_time: ``time.monotonic()`` value from the start of the org run,
            used for the duration metric.
    """
    eligible_projects = [ep.project for ep in eligible]
    scored = fixability_score_strategy(eligible_projects, resolved_options["max_candidates"])
    if not scored:
        logger.info("night_shift.no_candidates", extra=log_extra)
        return

    try:
        client = SeerAgentClient(organization)
    except SeerPermissionError:
        logger.info("night_shift.no_seer_access", extra=log_extra)
        _record_run_error(run, "Organization does not have Seer access")
        return

    def _link_shard(created: SeerRun) -> None:
        # Invoked by start_feature_run as its on_run_created callback, so the
        # shard row tying this SeerRun to the night shift run is written as
        # part of creating the run rather than after the call returns.
        SeerNightShiftRunShard.objects.create(run=run, seer_run=created)

    # The option is operator-modifiable and not validated at registration.
    # A size below 1 is not a meaningful chunk length (with 0, chunking would
    # presumably never split, putting every candidate into one oversized
    # shard), so clamp it to keep sharding effective.
    shard_size = max(1, options.get("seer.night_shift.shard_size"))
    shards = list(chunked(scored, shard_size))

    dispatched = 0
    for shard_index, chunk in enumerate(shards):
        payload = _build_triage_payload(chunk, resolved_options)
        try:
            client.start_feature_run(
                feature_id="night_shift",
                payload=payload.dict(),
                flush=False,
                on_run_created=_link_shard,
            )
        except Exception:
            # Best effort per shard: one failed dispatch must not prevent the
            # remaining shards from being sent.
            logger.exception(
                "night_shift.shard_dispatch_failed",
                extra={**log_extra, "shard_index": shard_index, "num_shards": len(shards)},
            )
            continue
        dispatched += 1

    if dispatched == 0:
        sentry_sdk.metrics.count("night_shift.run_error", 1)
        _record_run_error(run, "Night shift dispatch failed")
        logger.error("night_shift.dispatch_failed", extra={**log_extra, "num_shards": len(shards)})
        return

    failed_shards = len(shards) - dispatched
    if failed_shards:
        sentry_sdk.metrics.count("night_shift.shard_dispatch_failure", failed_shards)
        _record_run_error(run, f"Failed to dispatch {failed_shards} of {len(shards)} triage shards")
        logger.warning(
            "night_shift.partial_dispatch_failure",
            extra={**log_extra, "num_shards": len(shards), "num_shards_dispatched": dispatched},
        )

    sentry_sdk.metrics.distribution("night_shift.org_run_duration", time.monotonic() - start_time)
    logger.info(
        "night_shift.feature_dispatched",
        extra={
            **log_extra,
            "num_eligible_projects": len(eligible_projects),
            "num_candidates": len(scored),
            "num_shards": len(shards),
            "num_shards_dispatched": dispatched,
        },
    )

Expand Down
Loading
Loading