-
-
Notifications
You must be signed in to change notification settings - Fork 4.7k
feat(seer): Shard night shift triage into per-chunk feature runs #118080
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
340f7fc
291a074
293b572
b430f63
b6b6e80
62c25fa
b5a6802
5750c86
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| # Generated by Django 5.2.14 on 2026-06-18 23:19 | ||
|
|
||
| import django.db.models.deletion | ||
| import sentry.db.models.fields.bounded | ||
| import sentry.db.models.fields.foreignkey | ||
| from django.db import migrations, models | ||
|
|
||
| from sentry.new_migrations.migrations import CheckedMigration | ||
|
|
||
|
|
||
| class Migration(CheckedMigration): | ||
| # This flag is used to mark that a migration shouldn't be automatically run in production. | ||
| # This should only be used for operations where it's safe to run the migration after your | ||
| # code has deployed. So this should not be used for most operations that alter the schema | ||
| # of a table. | ||
| # Here are some things that make sense to mark as post deployment: | ||
| # - Large data migrations. Typically we want these to be run manually so that they can be | ||
| # monitored and not block the deploy for a long period of time while they run. | ||
| # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to | ||
| # run this outside deployments so that we don't block them. Note that while adding an index | ||
| # is a schema change, it's completely safe to run the operation after the code has deployed. | ||
| # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment | ||
|
|
||
| is_post_deployment = False | ||
|
|
||
| dependencies = [ | ||
| ("seer", "0018_backfill_seer_agent_run_group_id"), | ||
| ] | ||
|
|
||
| operations = [ | ||
| migrations.CreateModel( | ||
| name="SeerNightShiftRunShard", | ||
| fields=[ | ||
| ( | ||
| "id", | ||
| sentry.db.models.fields.bounded.BoundedBigAutoField( | ||
| primary_key=True, serialize=False | ||
| ), | ||
| ), | ||
| ("date_updated", models.DateTimeField(auto_now=True)), | ||
| ("date_added", models.DateTimeField(auto_now_add=True)), | ||
| ("extras", models.JSONField(db_default={}, default=dict)), | ||
| ( | ||
| "run", | ||
| sentry.db.models.fields.foreignkey.FlexibleForeignKey( | ||
| on_delete=django.db.models.deletion.CASCADE, | ||
| related_name="shards", | ||
| to="seer.seernightshiftrun", | ||
| ), | ||
| ), | ||
| ( | ||
| "seer_run", | ||
| models.OneToOneField( | ||
| null=True, | ||
| on_delete=django.db.models.deletion.SET_NULL, | ||
| related_name="night_shift_shard", | ||
| to="seer.seerrun", | ||
| ), | ||
| ), | ||
| ], | ||
| options={ | ||
| "db_table": "seer_nightshiftrunshard", | ||
| }, | ||
| ), | ||
| ] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,14 +29,19 @@ | |
| from sentry.seer.models.night_shift import ( | ||
| SeerNightShiftRun, | ||
| SeerNightShiftRunResult, | ||
| SeerNightShiftRunShard, | ||
| ) | ||
| from sentry.seer.models.project_repository import SeerProjectRepository | ||
| from sentry.seer.models.run import SeerRun | ||
| from sentry.seer.models.workflow import SeerWorkflowConfig, SeerWorkflowStrategy | ||
| from sentry.seer.night_shift.models import NightShiftPayload, TriageCandidate, TriageTweaks | ||
| from sentry.tasks.base import instrumented_task | ||
| from sentry.tasks.seer.night_shift.models import TriageAction, TriageResult | ||
| from sentry.tasks.seer.night_shift.simple_triage import fixability_score_strategy, priority_label | ||
| from sentry.tasks.seer.night_shift.simple_triage import ( | ||
| ScoredCandidate, | ||
| fixability_score_strategy, | ||
| priority_label, | ||
| ) | ||
| from sentry.tasks.seer.night_shift.tweaks import ( | ||
| DEFAULT_EXTRA_TRIAGE_INSTRUCTIONS, | ||
| DEFAULT_INTELLIGENCE_LEVEL, | ||
|
|
@@ -478,24 +483,11 @@ def _get_eligible_projects( | |
| return eligible | ||
|
|
||
|
|
||
| def _dispatch_to_seer_feature( | ||
| run: SeerNightShiftRun, | ||
| organization: Organization, | ||
| eligible: Sequence[EligibleProject], | ||
| def _build_triage_payload( | ||
| candidates: Sequence[ScoredCandidate], | ||
| resolved_options: SeerNightShiftRunOptions, | ||
| log_extra: dict[str, object], | ||
| start_time: float, | ||
| ) -> None: | ||
| """Hand triage off to Seer's feature-run endpoint. Seer runs the triage agent | ||
| and pushes verdicts back via deliver_feature_result, which marks skips and | ||
| triggers autofix (using dry_run from run.extras["options"]).""" | ||
| eligible_projects = [ep.project for ep in eligible] | ||
| scored = fixability_score_strategy(eligible_projects, resolved_options["max_candidates"]) | ||
| if not scored: | ||
| logger.info("night_shift.no_candidates", extra=log_extra) | ||
| return | ||
|
|
||
| payload = NightShiftPayload( | ||
| ) -> NightShiftPayload: | ||
| return NightShiftPayload( | ||
| candidates=[ | ||
| TriageCandidate( | ||
| group_id=c.group.id, | ||
|
|
@@ -506,52 +498,87 @@ def _dispatch_to_seer_feature( | |
| first_seen=c.group.first_seen.isoformat(), | ||
| priority=priority_label(c.group.priority), | ||
| ) | ||
| for c in scored | ||
| for c in candidates | ||
| ], | ||
| tweaks=TriageTweaks( | ||
| intelligence_level=resolved_options["intelligence_level"], | ||
| reasoning_effort=resolved_options["reasoning_effort"], | ||
| extra_triage_instructions=resolved_options["extra_triage_instructions"], | ||
| ), | ||
| ) | ||
|
|
||
|
|
||
| def _dispatch_to_seer_feature( | ||
| run: SeerNightShiftRun, | ||
| organization: Organization, | ||
| eligible: Sequence[EligibleProject], | ||
| resolved_options: SeerNightShiftRunOptions, | ||
| log_extra: dict[str, object], | ||
| start_time: float, | ||
| ) -> None: | ||
| """Shard the scored candidates into chunks of seer.night_shift.shard_size and | ||
| dispatch each chunk as its own Seer feature run, recorded as a | ||
| SeerNightShiftRunShard. Seer pushes verdicts back per shard via | ||
| deliver_feature_result.""" | ||
| eligible_projects = [ep.project for ep in eligible] | ||
| scored = fixability_score_strategy(eligible_projects, resolved_options["max_candidates"]) | ||
| if not scored: | ||
| logger.info("night_shift.no_candidates", extra=log_extra) | ||
| return | ||
|
|
||
| try: | ||
| client = SeerAgentClient(organization) | ||
| except SeerPermissionError: | ||
| logger.info("night_shift.no_seer_access", extra=log_extra) | ||
| _record_run_error(run, "Organization does not have Seer access") | ||
| return | ||
|
|
||
| def _link_run(created: SeerRun) -> None: | ||
| # Link inside the dispatch transaction so the row exists before the outbox | ||
| # drains and Seer's result correlates back to this night shift run. | ||
| run.update(seer_run=created) | ||
| def _link_shard(created: SeerRun) -> None: | ||
| SeerNightShiftRunShard.objects.create(run=run, seer_run=created) | ||
|
|
||
| try: | ||
| seer_run = client.start_feature_run( | ||
| feature_id="night_shift", | ||
| payload=payload.dict(), | ||
| flush=False, | ||
| on_run_created=_link_run, | ||
| ) | ||
| except Exception: | ||
| shards = list(chunked(scored, options.get("seer.night_shift.shard_size"))) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: The […] Suggested Fix: Add validation to the […] Prompt for AI Agent. Did we get this right? 👍 / 👎 to inform future reviews. |
||
| dispatched = 0 | ||
| for shard_index, chunk in enumerate(shards): | ||
| payload = _build_triage_payload(chunk, resolved_options) | ||
| try: | ||
| client.start_feature_run( | ||
| feature_id="night_shift", | ||
| payload=payload.dict(), | ||
| flush=False, | ||
| on_run_created=_link_shard, | ||
| ) | ||
| except Exception: | ||
| logger.exception( | ||
| "night_shift.shard_dispatch_failed", | ||
| extra={**log_extra, "shard_index": shard_index, "num_shards": len(shards)}, | ||
| ) | ||
| continue | ||
| dispatched += 1 | ||
|
|
||
| if dispatched == 0: | ||
| sentry_sdk.metrics.count("night_shift.run_error", 1) | ||
| _fail_run( | ||
| run, | ||
| message="Night shift dispatch failed", | ||
| event="night_shift.dispatch_failed", | ||
| extra=log_extra, | ||
| ) | ||
| _record_run_error(run, "Night shift dispatch failed") | ||
| logger.error("night_shift.dispatch_failed", extra={**log_extra, "num_shards": len(shards)}) | ||
|
cursor[bot] marked this conversation as resolved.
|
||
| return | ||
|
|
||
| failed_shards = len(shards) - dispatched | ||
| if failed_shards: | ||
| sentry_sdk.metrics.count("night_shift.shard_dispatch_failure", failed_shards) | ||
| _record_run_error(run, f"Failed to dispatch {failed_shards} of {len(shards)} triage shards") | ||
| logger.warning( | ||
| "night_shift.partial_dispatch_failure", | ||
| extra={**log_extra, "num_shards": len(shards), "num_shards_dispatched": dispatched}, | ||
| ) | ||
|
|
||
| sentry_sdk.metrics.distribution("night_shift.org_run_duration", time.monotonic() - start_time) | ||
| logger.info( | ||
| "night_shift.feature_dispatched", | ||
| extra={ | ||
| **log_extra, | ||
| "seer_run_id": seer_run.id, | ||
| "seer_run_uuid": str(seer_run.uuid), | ||
| "num_eligible_projects": len(eligible_projects), | ||
| "num_candidates": len(scored), | ||
| "num_shards": len(shards), | ||
| "num_shards_dispatched": dispatched, | ||
| }, | ||
| ) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is to be backwards compatible until I perform a data migration.