-
Notifications
You must be signed in to change notification settings - Fork 995
Fix: Eliminate redundant full table scans in messages and events collection #3444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
1b3e0d4
9824374
ac28a75
81161b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,7 +16,7 @@ | |
|
|
||
|
|
||
| platform_id = 1 | ||
|
|
||
| MESSAGE_BATCH_SIZE = 200 | ||
| @celery.task(base=AugurCoreRepoCollectionTask) | ||
| def collect_github_messages(repo_git: str, full_collection: bool) -> None: | ||
|
|
||
|
|
@@ -39,17 +39,30 @@ def collect_github_messages(repo_git: str, full_collection: bool) -> None: | |
| core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) | ||
|
|
||
|
|
||
| # Build mappings once before processing any messages | ||
| # create mapping from issue url to issue id of current issues | ||
| issue_url_to_id_map = {} | ||
| issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() | ||
| for issue in issues: | ||
| issue_url_to_id_map[issue.issue_url] = issue.issue_id | ||
|
|
||
| # create mapping from pr url to pr id of current pull requests | ||
| pr_issue_url_to_id_map = {} | ||
| prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() | ||
| for pr in prs: | ||
| pr_issue_url_to_id_map[pr.pr_issue_url] = pr.pull_request_id | ||
|
|
||
|
Comment on lines
+42
to
+54
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Above you rely on some functions for these mappings, Is this code just a duplicate of the code in those functions? If those functions are useful elsewhere and not tied in anywhere maybe they can be useful utility functions that can be imported here too? Overall id recommend splitting this PR so that the changes to events.py can be merged without being held up by the larger question of refactoring ( ref #3345) that this file brings up
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @PredictiveManish : I agree with @MoralCode that splitting the events.py from the other refactoring would make this more straightforward to merge and test.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Covered changes in
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sounds good, will focus on that PR first. This one is still on hold until we figure out how best to share the same function that was proposed in events.py
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, for events.py I have created new PR as suggested for split. |
||
| if is_repo_small(repo_id): | ||
| message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected) | ||
|
|
||
| if message_data: | ||
| process_messages(message_data, task_name, repo_id, logger, augur_db) | ||
| process_messages(message_data, task_name, repo_id, logger, augur_db, issue_url_to_id_map, pr_issue_url_to_id_map) | ||
|
|
||
| else: | ||
| logger.info(f"{owner}/{repo} has no messages") | ||
|
|
||
| else: | ||
| process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected) | ||
| process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected, issue_url_to_id_map, pr_issue_url_to_id_map) | ||
|
|
||
|
|
||
| def is_repo_small(repo_id): | ||
|
|
@@ -82,7 +95,7 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas | |
| return list(github_data_access.paginate_resource(url)) | ||
|
|
||
|
|
||
| def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db, since) -> None: | ||
| def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db, since, issue_url_to_id_map, pr_issue_url_to_id_map) -> None: | ||
|
|
||
| message_batch_size = get_batch_size("message") | ||
|
|
||
|
|
@@ -133,12 +146,12 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger | |
| all_data.clear() | ||
|
|
||
| if len(all_data) > 0: | ||
| process_messages(all_data, task_name, repo_id, logger, augur_db) | ||
| process_messages(all_data, task_name, repo_id, logger, augur_db, issue_url_to_id_map, pr_issue_url_to_id_map) | ||
|
|
||
| logger.info(f"{task_name}: Finished. Skipped {skipped_urls} comment URLs due to 404.") | ||
|
|
||
|
|
||
| def process_messages(messages, task_name, repo_id, logger, augur_db): | ||
| def process_messages(messages, task_name, repo_id, logger, augur_db, issue_url_to_id_map, pr_issue_url_to_id_map): | ||
|
|
||
| tool_version = "2.0" | ||
| data_source = "Github API" | ||
|
|
@@ -154,18 +167,6 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): | |
| if len(messages) == 0: | ||
| logger.info(f"{task_name}: No messages to process") | ||
|
|
||
| # create mapping from issue url to issue id of current issues | ||
| issue_url_to_id_map = {} | ||
| issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() | ||
| for issue in issues: | ||
| issue_url_to_id_map[issue.issue_url] = issue.issue_id | ||
|
|
||
| # create mapping from pr url to pr id of current pull requests | ||
| pr_issue_url_to_id_map = {} | ||
| prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() | ||
| for pr in prs: | ||
| pr_issue_url_to_id_map[pr.pr_issue_url] = pr.pull_request_id | ||
|
|
||
|
|
||
| message_len = len(messages) | ||
| for index, message in enumerate(messages): | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.