Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion migrations_lockfile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ replays: 0007_organizationmember_replay_access

seer: 0011_add_project_repository_fk_to_seer

sentry: 1091_delete_triggered_incidents_alertruletrigger
sentry: 1092_backfill_projectrepository

social_auth: 0003_social_auth_json_field

Expand Down
129 changes: 129 additions & 0 deletions src/sentry/migrations/1092_backfill_projectrepository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from django.db import migrations
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps

from sentry.new_migrations.migrations import CheckedMigration
from sentry.utils.query import RangeQuerySetWrapperWithProgressBar

# Mirror of ProjectRepositorySource values — we can't import the model
# in a migration because the code may change after the migration is written.
# NOTE(review): values 1 and 3 are absent — presumably other enum members
# not needed by this backfill; confirm against ProjectRepositorySource.
SOURCE_MANUAL = 0
SOURCE_AUTO_EVENT = 2
SOURCE_SEER_PREFERENCE = 4

# Lower number = higher priority. Used to pick the best source when
# a (project, repo) pair appears in multiple tables.
SOURCE_PRIORITY = {
    SOURCE_SEER_PREFERENCE: 0,
    SOURCE_MANUAL: 1,
    SOURCE_AUTO_EVENT: 2,
}


def backfill_project_repository(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
    """Backfill ProjectRepository from existing code mappings and Seer repo
    preferences, then point both source tables at the new rows.

    Pass 1 collects every unique (project_id, repository_id) pair and picks
    the highest-priority source for it. Pass 2 bulk-creates the missing
    ProjectRepository rows. Pass 3 backfills the ``project_repository_id``
    FK on RepositoryProjectPathConfig and SeerProjectRepository.
    """
    ProjectRepository = apps.get_model("sentry", "ProjectRepository")
    RepositoryProjectPathConfig = apps.get_model("sentry", "RepositoryProjectPathConfig")
    SeerProjectRepository = apps.get_model("seer", "SeerProjectRepository")

    # Pass 1: collect all unique (project_id, repository_id) pairs and pick
    # the best source for each.
    #
    # Priority: SEER_PREFERENCE (user explicitly picked repos for Seer)
    # > MANUAL (user-created code mapping) > AUTO_EVENT (auto-generated).
    pair_to_source: dict[tuple[int, int], int] = {}

    def _set_if_higher_priority(key: tuple[int, int], new_source: int) -> None:
        # Lower SOURCE_PRIORITY value wins; first sighting of a pair always wins.
        existing = pair_to_source.get(key)
        if existing is None or SOURCE_PRIORITY[new_source] < SOURCE_PRIORITY[existing]:
            pair_to_source[key] = new_source

    for row in RangeQuerySetWrapperWithProgressBar(
        RepositoryProjectPathConfig.objects.values_list(
            "id", "project_id", "repository_id", "automatically_generated"
        ),
        result_value_getter=lambda values: values[0],
    ):
        _id, project_id, repository_id, automatically_generated = row
        new_source = SOURCE_AUTO_EVENT if automatically_generated else SOURCE_MANUAL
        _set_if_higher_priority((project_id, repository_id), new_source)

    for row in RangeQuerySetWrapperWithProgressBar(
        SeerProjectRepository.objects.values_list("id", "project_id", "repository_id"),
        result_value_getter=lambda values: values[0],
    ):
        _id, project_id, repository_id = row
        _set_if_higher_priority((project_id, repository_id), SOURCE_SEER_PREFERENCE)

    # Pass 2: create any ProjectRepository rows that don't already exist
    # (dual-writes may have created some before this backfill ran).
    existing_pairs = set(ProjectRepository.objects.values_list("project_id", "repository_id"))

    batch: list[object] = []
    for (project_id, repository_id), source in pair_to_source.items():
        if (project_id, repository_id) in existing_pairs:
            continue
        batch.append(
            ProjectRepository(project_id=project_id, repository_id=repository_id, source=source)
        )
        if len(batch) >= 1000:
            # ignore_conflicts guards against rows dual-written mid-backfill.
            ProjectRepository.objects.bulk_create(batch, ignore_conflicts=True)
            batch = []
    if batch:
        ProjectRepository.objects.bulk_create(batch, ignore_conflicts=True)

    # Pass 3: backfill the FK on both source tables. Build a pair -> id map,
    # then iterate only rows still missing the FK so that a re-run of this
    # backfill skips rows that already have a value instead of issuing an
    # UPDATE for every known pair.
    pr_lookup: dict[tuple[int, int], int] = {}
    for pr_id, project_id, repository_id in ProjectRepository.objects.values_list(
        "id", "project_id", "repository_id"
    ):
        pr_lookup[(project_id, repository_id)] = pr_id

    for config in RangeQuerySetWrapperWithProgressBar(
        RepositoryProjectPathConfig.objects.filter(project_repository_id__isnull=True)
    ):
        pr_id = pr_lookup.get((config.project_id, config.repository_id))
        if pr_id is not None:
            RepositoryProjectPathConfig.objects.filter(id=config.id).update(
                project_repository_id=pr_id
            )

    for spr in RangeQuerySetWrapperWithProgressBar(
        SeerProjectRepository.objects.filter(project_repository_id__isnull=True)
    ):
        pr_id = pr_lookup.get((spr.project_id, spr.repository_id))
        if pr_id is not None:
            # Targeted UPDATE, consistent with the path-config loop above
            # (avoids a full-row instance save).
            SeerProjectRepository.objects.filter(id=spr.id).update(project_repository_id=pr_id)


class Migration(CheckedMigration):
    # This flag is used to mark that a migration shouldn't be automatically run in production.
    # This should only be used for operations where it's safe to run the migration after your
    # code has deployed. So this should not be used for most operations that alter the schema
    # of a table.
    # Here are some things that make sense to mark as post deployment:
    # - Large data migrations. Typically we want these to be run manually so that they can be
    #   monitored and not block the deploy for a long period of time while they run.
    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
    #   run this outside deployments so that we don't block them. Note that while adding an index
    #   is a schema change, it's completely safe to run the operation after the code has deployed.
    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

    is_post_deployment = True

    # Must follow the seer migration that added the project_repository FK
    # columns this backfill populates.
    dependencies = [
        ("sentry", "1091_delete_triggered_incidents_alertruletrigger"),
        ("seer", "0011_add_project_repository_fk_to_seer"),
    ]

    operations = [
        migrations.RunPython(
            backfill_project_repository,
            # Data-only backfill; nothing to undo on rollback.
            reverse_code=migrations.RunPython.noop,
            # Tables this data migration reads/writes.
            hints={
                "tables": [
                    "sentry_projectrepository",
                    "sentry_repositoryprojectpathconfig",
                    "seer_projectrepository",
                ]
            },
        ),
    ]
168 changes: 168 additions & 0 deletions tests/sentry/migrations/test_1092_backfill_projectrepository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
from sentry.integrations.models.organization_integration import OrganizationIntegration
from sentry.integrations.models.repository_project_path_config import RepositoryProjectPathConfig
from sentry.models.projectrepository import ProjectRepository, ProjectRepositorySource
from sentry.seer.models.project_repository import SeerProjectRepository
from sentry.silo.base import SiloMode
from sentry.testutils.cases import TestMigrations
from sentry.testutils.silo import assume_test_silo_mode


class BackfillProjectRepositoryTest(TestMigrations):
    """Exercises the 1092 backfill: source selection, priority resolution,
    dedup against dual-written rows, and FK backfill on both source tables."""

    migrate_from = "1091_delete_triggered_incidents_alertruletrigger"
    migrate_to = "1092_backfill_projectrepository"

    def _add_code_mapping(self, repo, root, automatically_generated):
        # Create a RepositoryProjectPathConfig with identical stack/source roots.
        RepositoryProjectPathConfig.objects.create(
            project=self.proj,
            repository=repo,
            organization_integration_id=self.oi.id,
            organization_id=self.org.id,
            integration_id=self.integration.id,
            stack_root=root,
            source_root=root,
            automatically_generated=automatically_generated,
        )

    def setup_before_migration(self, apps):
        self.org = self.create_organization(owner=self.create_user())
        self.integration = self.create_integration(
            organization=self.org, provider="github", external_id="gh-1"
        )
        with assume_test_silo_mode(SiloMode.CONTROL):
            self.oi = OrganizationIntegration.objects.get(
                organization_id=self.org.id, integration=self.integration
            )
        self.proj = self.create_project(organization=self.org)

        # One repo per test case, stored as self.repo_a ... self.repo_e.
        for suffix in ("a", "b", "c", "d", "e"):
            repo = self.create_repo(
                self.proj,
                name=f"org/repo-{suffix}",
                provider="integrations:github",
                integration_id=self.integration.id,
            )
            setattr(self, f"repo_{suffix}", repo)

        # Case 1: Auto-generated code mapping only → AUTO_EVENT
        self._add_code_mapping(self.repo_a, "src/", automatically_generated=True)

        # Case 2: Manual code mapping only → MANUAL
        self._add_code_mapping(self.repo_b, "lib/", automatically_generated=False)

        # Case 3: Seer preference only → SEER_PREFERENCE
        SeerProjectRepository.objects.create(
            project=self.proj, repository=self.repo_c, branch_name="main"
        )

        # Case 4: Both manual code mapping AND Seer preference for same
        # (project, repo) → SEER_PREFERENCE wins (higher priority).
        self._add_code_mapping(self.repo_d, "app/", automatically_generated=False)
        SeerProjectRepository.objects.create(
            project=self.proj, repository=self.repo_d, branch_name="develop"
        )

        # Case 5: Dual-write already created a ProjectRepository row.
        # The migration should not duplicate it, and should still backfill
        # the FK on the code mapping.
        self.existing_pr = ProjectRepository.objects.create(
            project=self.proj,
            repository=self.repo_e,
            source=ProjectRepositorySource.MANUAL,
        )
        self._add_code_mapping(self.repo_e, "pkg/", automatically_generated=True)

    def test(self) -> None:
        # Cases 1-4: each repo ends up with exactly the expected source.
        expected = {
            self.repo_a: ProjectRepositorySource.AUTO_EVENT,
            self.repo_b: ProjectRepositorySource.MANUAL,
            self.repo_c: ProjectRepositorySource.SEER_PREFERENCE,
            self.repo_d: ProjectRepositorySource.SEER_PREFERENCE,
        }
        for repo, source in expected.items():
            created = ProjectRepository.objects.get(project=self.proj, repository=repo)
            assert created.source == source

        # Case 5: pre-existing ProjectRepository from dual-write is preserved
        pr_e = ProjectRepository.objects.get(project=self.proj, repository=self.repo_e)
        assert pr_e.id == self.existing_pr.id

        # All RepositoryProjectPathConfig rows have project_repository_id set
        assert not RepositoryProjectPathConfig.objects.filter(
            project=self.proj, project_repository_id__isnull=True
        ).exists()

        # All SeerProjectRepository rows have project_repository_id set
        assert not SeerProjectRepository.objects.filter(
            project=self.proj, project_repository_id__isnull=True
        ).exists()

        # FK consistency: each row's project_repository points to the right pair
        for config in RepositoryProjectPathConfig.objects.filter(project=self.proj):
            linked = ProjectRepository.objects.get(id=config.project_repository_id)
            assert (linked.project_id, linked.repository_id) == (
                config.project_id,
                config.repository_id,
            )

        for spr in SeerProjectRepository.objects.filter(project=self.proj):
            linked = ProjectRepository.objects.get(id=spr.project_repository_id)
            assert (linked.project_id, linked.repository_id) == (
                spr.project_id,
                spr.repository_id,
            )
Loading