Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 6 additions & 11 deletions augur/tasks/github/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def collect(self, repo_git, key_auth, since):
events.clear()

if events:
self._process_events(events, repo_id)
self._process_events(events, repo_id, issue_url_to_id_map, pr_url_to_id_map)

def _collect_events(self, repo_git: str, key_auth, since):

Expand All @@ -146,7 +146,7 @@ def _collect_events(self, repo_git: str, key_auth, since):
if since and datetime.fromisoformat(event["created_at"].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) < since:
return

def _process_events(self, events, repo_id):
def _process_events(self, events, repo_id, issue_url_to_id_map, pr_url_to_id_map):

issue_events = []
pr_events = []
Expand All @@ -164,19 +164,16 @@ def _process_events(self, events, repo_id):
if not_mappable_events:
self._logger.warning(f"{self.repo_identifier} - {self.task_name}: Unable to map these github events to an issue or pr: {not_mappable_events}")

self._process_issue_events(issue_events, repo_id)
self._process_pr_events(pr_events, repo_id)
self._process_issue_events(issue_events, repo_id, issue_url_to_id_map)
self._process_pr_events(pr_events, repo_id, pr_url_to_id_map)

update_issue_closed_cntrbs_by_repo_id(repo_id)

def _process_issue_events(self, issue_events, repo_id):
def _process_issue_events(self, issue_events, repo_id, issue_url_to_id_map):

issue_event_dicts = []
contributors = []


issue_url_to_id_map = self._get_map_from_issue_url_to_id(repo_id)

for event in issue_events:

event, contributor = self._process_github_event_contributors(event)
Expand All @@ -203,13 +200,11 @@ def _process_issue_events(self, issue_events, repo_id):

self._insert_issue_events(issue_event_dicts)

def _process_pr_events(self, pr_events, repo_id):
def _process_pr_events(self, pr_events, repo_id, pr_url_to_id_map):

pr_event_dicts = []
contributors = []

pr_url_to_id_map = self._get_map_from_pr_url_to_id(repo_id)

for event in pr_events:

event, contributor = self._process_github_event_contributors(event)
Expand Down
37 changes: 19 additions & 18 deletions augur/tasks/github/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@


platform_id = 1

MESSAGE_BATCH_SIZE = 200
@celery.task(base=AugurCoreRepoCollectionTask)
def collect_github_messages(repo_git: str, full_collection: bool) -> None:

Expand All @@ -39,17 +39,30 @@ def collect_github_messages(repo_git: str, full_collection: bool) -> None:
core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc)


# Build mappings once before processing any messages
# create mapping from issue url to issue id of current issues
issue_url_to_id_map = {}
issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all()
for issue in issues:
issue_url_to_id_map[issue.issue_url] = issue.issue_id

# create mapping from pr url to pr id of current pull requests
pr_issue_url_to_id_map = {}
prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all()
for pr in prs:
pr_issue_url_to_id_map[pr.pr_issue_url] = pr.pull_request_id

Comment on lines +42 to +54
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Above you rely on some functions for these mappings. Is this code just a duplicate of the code in those functions? If those functions are useful elsewhere and not tied to anything, maybe they could become utility functions that can be imported here too?

Overall I'd recommend splitting this PR so that the changes to events.py can be merged without being held up by the larger question of refactoring (ref #3345) that this file brings up.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@PredictiveManish : I agree with @MoralCode that splitting the events.py from the other refactoring would make this more straightforward to merge and test.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Covered changes in events.py in #3479

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good — I will focus on that PR first. This one is still on hold until we figure out how best to share the same function that was proposed in events.py.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, for events.py I have created a new PR, as suggested, for the split.

if is_repo_small(repo_id):
message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected)

if message_data:
process_messages(message_data, task_name, repo_id, logger, augur_db)
process_messages(message_data, task_name, repo_id, logger, augur_db, issue_url_to_id_map, pr_issue_url_to_id_map)

else:
logger.info(f"{owner}/{repo} has no messages")

else:
process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected)
process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected, issue_url_to_id_map, pr_issue_url_to_id_map)


def is_repo_small(repo_id):
Expand Down Expand Up @@ -82,7 +95,7 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas
return list(github_data_access.paginate_resource(url))


def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db, since) -> None:
def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db, since, issue_url_to_id_map, pr_issue_url_to_id_map) -> None:

message_batch_size = get_batch_size("message")

Expand Down Expand Up @@ -133,12 +146,12 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger
all_data.clear()

if len(all_data) > 0:
process_messages(all_data, task_name, repo_id, logger, augur_db)
process_messages(all_data, task_name, repo_id, logger, augur_db, issue_url_to_id_map, pr_issue_url_to_id_map)

logger.info(f"{task_name}: Finished. Skipped {skipped_urls} comment URLs due to 404.")


def process_messages(messages, task_name, repo_id, logger, augur_db):
def process_messages(messages, task_name, repo_id, logger, augur_db, issue_url_to_id_map, pr_issue_url_to_id_map):

tool_version = "2.0"
data_source = "Github API"
Expand All @@ -154,18 +167,6 @@ def process_messages(messages, task_name, repo_id, logger, augur_db):
if len(messages) == 0:
logger.info(f"{task_name}: No messages to process")

# create mapping from issue url to issue id of current issues
issue_url_to_id_map = {}
issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all()
for issue in issues:
issue_url_to_id_map[issue.issue_url] = issue.issue_id

# create mapping from pr url to pr id of current pull requests
pr_issue_url_to_id_map = {}
prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all()
for pr in prs:
pr_issue_url_to_id_map[pr.pr_issue_url] = pr.pull_request_id


message_len = len(messages)
for index, message in enumerate(messages):
Expand Down