-
-
Notifications
You must be signed in to change notification settings - Fork 4.7k
chore(repositories): Backfill the new ProjectRepository table
#115351
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+298
−1
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
129 changes: 129 additions & 0 deletions
129
src/sentry/migrations/1092_backfill_projectrepository.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,129 @@ | ||
| from django.db import migrations | ||
| from django.db.backends.base.schema import BaseDatabaseSchemaEditor | ||
| from django.db.migrations.state import StateApps | ||
|
|
||
| from sentry.new_migrations.migrations import CheckedMigration | ||
| from sentry.utils.query import RangeQuerySetWrapperWithProgressBar | ||
|
|
||
# Integer values copied from ProjectRepositorySource. The model is deliberately
# not imported here: its code may change after this migration has been written.
SOURCE_MANUAL = 0
SOURCE_AUTO_EVENT = 2
SOURCE_SEER_PREFERENCE = 4

# Rank of each source, taken from its position in the tuple below
# (0 = most preferred). Consulted to choose a single source when the same
# (project, repo) pair shows up in more than one table.
SOURCE_PRIORITY = {
    source: rank
    for rank, source in enumerate(
        (SOURCE_SEER_PREFERENCE, SOURCE_MANUAL, SOURCE_AUTO_EVENT)
    )
}
|
|
||
|
|
||
def backfill_project_repository(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
    """Backfill ProjectRepository rows and the foreign keys that point at them.

    For every distinct (project_id, repository_id) pair found in
    RepositoryProjectPathConfig or SeerProjectRepository, a ProjectRepository
    row is created unless one already exists. Afterwards, every row in those
    two tables whose ``project_repository_id`` is still NULL is pointed at the
    matching ProjectRepository.

    Rows that already have ``project_repository_id`` set are never touched, so
    re-running the backfill only updates rows that still need a value.

    Args:
        apps: Historical app registry used to resolve the model classes.
        schema_editor: Supplied by ``RunPython``; not used here.
    """
    ProjectRepository = apps.get_model("sentry", "ProjectRepository")
    RepositoryProjectPathConfig = apps.get_model("sentry", "RepositoryProjectPathConfig")
    SeerProjectRepository = apps.get_model("seer", "SeerProjectRepository")

    # Step 1: Collect all unique (project_id, repository_id) pairs and pick
    # the best source for each.
    #
    # Priority: SEER_PREFERENCE (user explicitly picked repos for Seer)
    # > MANUAL (user-created code mapping) > AUTO_EVENT (auto-generated).
    pair_to_source: dict[tuple[int, int], int] = {}

    def _set_if_higher_priority(key: tuple[int, int], new_source: int) -> None:
        # A lower SOURCE_PRIORITY value wins; the first source seen for a pair
        # is kept until a strictly better one shows up.
        existing = pair_to_source.get(key)
        if existing is None or SOURCE_PRIORITY[new_source] < SOURCE_PRIORITY[existing]:
            pair_to_source[key] = new_source

    # values_list() yields tuples, so the wrapper is told to read the id from
    # position 0 of each tuple.
    for row in RangeQuerySetWrapperWithProgressBar(
        RepositoryProjectPathConfig.objects.values_list(
            "id", "project_id", "repository_id", "automatically_generated"
        ),
        result_value_getter=lambda values: values[0],
    ):
        _id, project_id, repository_id, automatically_generated = row
        new_source = SOURCE_AUTO_EVENT if automatically_generated else SOURCE_MANUAL
        _set_if_higher_priority((project_id, repository_id), new_source)

    for row in RangeQuerySetWrapperWithProgressBar(
        SeerProjectRepository.objects.values_list("id", "project_id", "repository_id"),
        result_value_getter=lambda values: values[0],
    ):
        _id, project_id, repository_id = row
        _set_if_higher_priority((project_id, repository_id), SOURCE_SEER_PREFERENCE)

    # Step 2: Create the missing ProjectRepository rows in batches of 1000.
    # Pairs that already have a row are skipped, so their existing source is
    # left as-is.
    existing_pairs = set(ProjectRepository.objects.values_list("project_id", "repository_id"))

    batch: list[object] = []
    for (project_id, repository_id), source in pair_to_source.items():
        if (project_id, repository_id) in existing_pairs:
            continue
        batch.append(
            ProjectRepository(project_id=project_id, repository_id=repository_id, source=source)
        )
        if len(batch) >= 1000:
            # NOTE(review): ignore_conflicts presumably covers rows inserted
            # after existing_pairs was read (e.g. by dual-write) -- confirm.
            ProjectRepository.objects.bulk_create(batch, ignore_conflicts=True)
            batch = []
    if batch:
        ProjectRepository.objects.bulk_create(batch, ignore_conflicts=True)

    # Step 3: Map every (project_id, repository_id) pair to its
    # ProjectRepository id, covering both pre-existing and newly created rows.
    pr_lookup: dict[tuple[int, int], int] = {
        (project_id, repository_id): pr_id
        for pr_id, project_id, repository_id in ProjectRepository.objects.values_list(
            "id", "project_id", "repository_id"
        )
    }

    # Step 4: Point rows that still lack project_repository_id at their
    # ProjectRepository. Both tables share the same column names, so one
    # helper serves both and issues the same targeted UPDATE for each row.
    def _backfill_fk(model) -> None:
        for row in RangeQuerySetWrapperWithProgressBar(
            model.objects.filter(project_repository_id__isnull=True)
        ):
            pr_id = pr_lookup.get((row.project_id, row.repository_id))
            if pr_id is not None:
                model.objects.filter(id=row.id).update(project_repository_id=pr_id)

    _backfill_fk(RepositoryProjectPathConfig)
    _backfill_fk(SeerProjectRepository)
|
|
||
|
|
||
class Migration(CheckedMigration):
    # is_post_deployment marks a migration that must not be run automatically
    # in production. Use it only when running the migration after the code has
    # deployed is safe, which rules out most operations that alter a table's
    # schema. Good candidates for post-deployment migrations:
    # - Large data migrations. These are usually run by hand so they can be
    #   monitored and do not hold up the deploy while they run.
    # - Adding indexes to large tables. This can take a long time, so it is
    #   better done outside a deploy. Adding an index is a schema change, but
    #   it is completely safe to run after the code has deployed.
    # After deploying, run these manually via:
    # https://develop.sentry.dev/database-migrations/#migration-deployment

    is_post_deployment = True

    dependencies = [
        ("sentry", "1091_delete_triggered_incidents_alertruletrigger"),
        ("seer", "0011_add_project_repository_fk_to_seer"),
    ]

    operations = [
        migrations.RunPython(
            backfill_project_repository,
            reverse_code=migrations.RunPython.noop,
            hints={
                "tables": [
                    "sentry_projectrepository",
                    "sentry_repositoryprojectpathconfig",
                    "seer_projectrepository",
                ]
            },
        ),
    ]
168 changes: 168 additions & 0 deletions
168
tests/sentry/migrations/test_1092_backfill_projectrepository.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,168 @@ | ||
| from sentry.integrations.models.organization_integration import OrganizationIntegration | ||
| from sentry.integrations.models.repository_project_path_config import RepositoryProjectPathConfig | ||
| from sentry.models.projectrepository import ProjectRepository, ProjectRepositorySource | ||
| from sentry.seer.models.project_repository import SeerProjectRepository | ||
| from sentry.silo.base import SiloMode | ||
| from sentry.testutils.cases import TestMigrations | ||
| from sentry.testutils.silo import assume_test_silo_mode | ||
|
|
||
|
|
||
class BackfillProjectRepositoryTest(TestMigrations):
    """Run the ProjectRepository backfill over five (project, repo) scenarios."""

    migrate_from = "1091_delete_triggered_incidents_alertruletrigger"
    migrate_to = "1092_backfill_projectrepository"

    def _make_repo(self, name):
        # Every repo in this test belongs to the same project and integration.
        return self.create_repo(
            self.proj,
            name=name,
            provider="integrations:github",
            integration_id=self.integration.id,
        )

    def _make_code_mapping(self, repo, root, *, automatically_generated):
        # Each scenario uses the same value for stack_root and source_root.
        return RepositoryProjectPathConfig.objects.create(
            project=self.proj,
            repository=repo,
            organization_integration_id=self.oi.id,
            organization_id=self.org.id,
            integration_id=self.integration.id,
            stack_root=root,
            source_root=root,
            automatically_generated=automatically_generated,
        )

    def setup_before_migration(self, apps):
        self.org = self.create_organization(owner=self.create_user())
        self.integration = self.create_integration(
            organization=self.org, provider="github", external_id="gh-1"
        )
        with assume_test_silo_mode(SiloMode.CONTROL):
            self.oi = OrganizationIntegration.objects.get(
                organization_id=self.org.id, integration=self.integration
            )
        self.proj = self.create_project(organization=self.org)

        self.repo_a = self._make_repo("org/repo-a")
        self.repo_b = self._make_repo("org/repo-b")
        self.repo_c = self._make_repo("org/repo-c")
        self.repo_d = self._make_repo("org/repo-d")
        self.repo_e = self._make_repo("org/repo-e")

        # Case 1: only an auto-generated code mapping -> AUTO_EVENT
        self._make_code_mapping(self.repo_a, "src/", automatically_generated=True)

        # Case 2: only a manual code mapping -> MANUAL
        self._make_code_mapping(self.repo_b, "lib/", automatically_generated=False)

        # Case 3: only a Seer preference -> SEER_PREFERENCE
        SeerProjectRepository.objects.create(
            project=self.proj, repository=self.repo_c, branch_name="main"
        )

        # Case 4: a manual code mapping AND a Seer preference for the same
        # (project, repo) -> SEER_PREFERENCE wins (higher priority).
        self._make_code_mapping(self.repo_d, "app/", automatically_generated=False)
        SeerProjectRepository.objects.create(
            project=self.proj, repository=self.repo_d, branch_name="develop"
        )

        # Case 5: dual-write has already created a ProjectRepository row.
        # The migration must not duplicate it and must still backfill the FK
        # on the code mapping.
        self.existing_pr = ProjectRepository.objects.create(
            project=self.proj,
            repository=self.repo_e,
            source=ProjectRepositorySource.MANUAL,
        )
        self._make_code_mapping(self.repo_e, "pkg/", automatically_generated=True)

    def test(self) -> None:
        def get_pr(repo):
            return ProjectRepository.objects.get(project=self.proj, repository=repo)

        # Cases 1-4, in order: the source each repo's row should end up with.
        expected_sources = (
            (self.repo_a, ProjectRepositorySource.AUTO_EVENT),
            (self.repo_b, ProjectRepositorySource.MANUAL),
            (self.repo_c, ProjectRepositorySource.SEER_PREFERENCE),
            (self.repo_d, ProjectRepositorySource.SEER_PREFERENCE),
        )
        for repo, expected_source in expected_sources:
            assert get_pr(repo).source == expected_source

        # Case 5: the pre-existing ProjectRepository from dual-write is preserved
        assert get_pr(self.repo_e).id == self.existing_pr.id

        # No RepositoryProjectPathConfig row is left without project_repository_id
        unlinked_configs = RepositoryProjectPathConfig.objects.filter(
            project=self.proj, project_repository_id__isnull=True
        )
        assert not unlinked_configs.exists()

        # No SeerProjectRepository row is left without project_repository_id
        unlinked_seer_rows = SeerProjectRepository.objects.filter(
            project=self.proj, project_repository_id__isnull=True
        )
        assert not unlinked_seer_rows.exists()

        # FK consistency: each row's project_repository points to the right pair
        linked_rows = [
            *RepositoryProjectPathConfig.objects.filter(project=self.proj),
            *SeerProjectRepository.objects.filter(project=self.proj),
        ]
        for row in linked_rows:
            pr = ProjectRepository.objects.get(id=row.project_repository_id)
            assert pr.project_id == row.project_id
            assert pr.repository_id == row.repository_id
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could the loop iterate `pr_lookup` instead? One UPDATE per pair rather than per row.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We make this query so that we only look at rows that don't already have a `project_repository_id`. If we switch to this method, then if we have to re-run the backfill it will attempt an update for every item in `pr_lookup`, rather than just the rows that need a value.