Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 59 additions & 15 deletions augur/tasks/github/facade_github/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import *
from augur.application.db.lib import bulk_insert_dicts
from augur.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor

from augur.tasks.github.util.util import sanity_check_email



Expand Down Expand Up @@ -59,7 +59,31 @@
# logger.debug("Failed local login lookup")
# else:
# login = contributors_with_matching_name[0].gh_login



# Here we attempt to detect and parse the user id from the email address directly
# this is helpful in case the contributor uses a github noreply address for commits.

# to do this we should first do some sanity checks on the email
email_check = sanity_check_email(email)
if email_check is not None and 'users.noreply.github.com' in email_check:

Check failure

Code scanning / CodeQL

Incomplete URL substring sanitization High

The string
users.noreply.github.com
may be at an arbitrary position in the sanitized URL.
# we have a noreply address

# the email check explicitly looks for presence of an @ sign
email_user = email_check.split("@")[0]
if '+' in email_user:
username_parts = email_user.split("+")
user_id, login = username_parts[0], username_parts[-1]
else:
user_id = email_user

if not user_id.isnumeric():
logger.warning(
f"Something went wrong parsing user id '{user_id}` from github noreply user {login} <{email_check}>. "
f"Falling back to regular lookup process"
)
login = None
user_id = None

# Try to get the login from the commit sha
if login == None or login == "":
Expand All @@ -72,19 +96,8 @@

if login == None or login == "":
logger.error("Failed to get login from supplemental data!")

unresolved = {
"email": email,
"name": name,
}
logger.debug(f"No more username resolution methods available. Inserting data into unresolved table: {unresolved}")

try:
unresolved_natural_keys = ['email']
bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys)
except Exception as e:
logger.error(
f"Could not create new unresolved email {email}. Error: {e}")
mark_unresolved(name, email, logger)

# move on to the next contributor
continue

Expand All @@ -96,6 +109,21 @@
logger.warning(f"User of {login} not found on github. Skipping...")
continue

# if we had a noreply address and there wasnt an issue parsing a numeric user ID,
# return now that we have the user record to validate that the profile we fetched for the username
# does indeed match the correct user
if email_check is not None and 'users.noreply.github.com' in email_check and user_id is not None:

Check failure

Code scanning / CodeQL

Incomplete URL substring sanitization High

The string
users.noreply.github.com
may be at an arbitrary position in the sanitized URL.
gh_user_src_id = user_data.get("id")

if gh_user_src_id != int(user_id):
logger.warning(
f"github noreply user src id {gh_user_src_id} doesn't match the ID from their github noreply email: {email_check}."
"Marking as unresolved and skipping to avoid inserting mismatched data."
)
mark_unresolved(name, email, logger)
continue


# Use the email found in the commit data if api data is NULL
emailFromCommitData = contributor['email_raw'] if 'email_raw' in contributor else contributor['email']

Expand Down Expand Up @@ -157,6 +185,22 @@
return


def mark_unresolved(name, email, logger):
unresolved = {
"email": email,
"name": name,
}
logger.debug(f"No more username resolution methods available. Inserting data into unresolved table: {unresolved}")

try:
unresolved_natural_keys = ['email']
bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys)
except Exception as e:
logger.error(
f"Could not create new unresolved email {email}. Error: {e}")



def link_commits_to_contributor(logger, facade_helper, contributorQueue):

# # iterate through all the commits with emails that appear in contributors and give them the relevant cntrb_id.
Expand Down
57 changes: 57 additions & 0 deletions augur/tasks/github/util/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,63 @@
from augur.application.db.lib import get_repo_by_repo_git
from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps


def sanity_check_email(possible_email:str) -> str:
"""Accept a string that might contain an email and attempt to validate it
This was built for performing some basic sanity checking before attempting to use
the information in an email address for processing.

It is built to be simple without new libraries for now
since the intent is more for basic sanity checks to aid in parsing a known format.
It is not (yet) fully RFC-compliant.

If it becomes heavily used, we should defer to a well-made library

Args:
possible_email (str): a string that might contain an email

Returns:
str: a string representing what should (mostly) be a valid email address. None is returned if no valid email could be found.
"""

try:
if possible_email is None:
return None

candidate_email = str(possible_email)


at_pos = candidate_email.find("@")
if at_pos == -1:
# email cannot possibly be valid without an @ sign
return None

preceeding_space = candidate_email.rfind(" ", 0)

Check failure on line 43 in augur/tasks/github/util/util.py

View workflow job for this annotation

GitHub Actions / misspell

[misspell] augur/tasks/github/util/util.py#L43

"preceeding" is a misspelling of "preceding"
Raw output
./augur/tasks/github/util/util.py:43:8: "preceeding" is a misspelling of "preceding"
following_space = candidate_email.find(" ", at_pos)
search_result = (preceeding_space != -1, following_space != -1)

Check failure on line 45 in augur/tasks/github/util/util.py

View workflow job for this annotation

GitHub Actions / misspell

[misspell] augur/tasks/github/util/util.py#L45

"preceeding" is a misspelling of "preceding"
Raw output
./augur/tasks/github/util/util.py:45:25: "preceeding" is a misspelling of "preceding"

if search_result == (True, True):
# spaces were on either side
candidate_email = candidate_email[preceeding_space, following_space + 1]

Check failure on line 49 in augur/tasks/github/util/util.py

View workflow job for this annotation

GitHub Actions / misspell

[misspell] augur/tasks/github/util/util.py#L49

"preceeding" is a misspelling of "preceding"
Raw output
./augur/tasks/github/util/util.py:49:46: "preceeding" is a misspelling of "preceding"
elif search_result == (False, True):
# space after
candidate_email = candidate_email[:following_space + 1]
elif search_result == (True, False):
# space after
candidate_email = candidate_email[preceeding_space:]

Check failure on line 55 in augur/tasks/github/util/util.py

View workflow job for this annotation

GitHub Actions / misspell

[misspell] augur/tasks/github/util/util.py#L55

"preceeding" is a misspelling of "preceding"
Raw output
./augur/tasks/github/util/util.py:55:46: "preceeding" is a misspelling of "preceding"
# otherwise, there were no spaces, so the string stays the same.

if not candidate_email.isascii():
# non-ascii is pretty uncommon, especially for narrow usecases like
# checking for an existing domain like the github noreply domain.
# if this is insufficient, a library may be a better option.

return None

return candidate_email
except Exception:
return None

def get_repo_src_id(owner, repo, logger):


Expand Down
Loading