Skip to content
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 51 additions & 65 deletions augur/tasks/github/facade_github/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
from augur.tasks.github.facade_github.core import *
from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_email, get_repo_by_repo_git, batch_insert_contributors, get_batch_size
from augur.application.db.lib import get_session, execute_session_query

from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import *
from augur.application.db.lib import bulk_insert_dicts
from augur.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor
Expand Down Expand Up @@ -196,54 +196,31 @@ def insert_facade_contributors(self, repo_git):
repo_id = repo.repo_id
facade_helper = FacadeHelper(logger)

with get_session() as session:
query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id)
collection_status = execute_session_query(query,'one')
last_collected_date = collection_status.facade_data_last_collected if not facade_helper.facade_contributor_full_recollect else None

# Get all of the commit data's emails and names from the commit table that do not appear
# in the contributors table or the contributors_aliases table.
# Find all commits not yet linked to a contributor. The correct signal for
# "needs resolution" is a NULL cmt_ght_author_id — not a date window or an
# email-table cross-check. The old email-join approach silently skipped
# commits whose emails were later linked to a GitHub account, and the
# last_collection_date cutoff (PR #3253) made that permanent. Commits
# already marked unresolvable are excluded via the unresolved_commit_emails
# table so we don't hammer the GitHub API on known dead-ends.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is fairly verbose - could we simplify it?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simplified the comment.


logger.info(
"Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id))
new_contrib_sql = s.sql.text("""
SELECT DISTINCT
commits.cmt_author_name AS NAME,
commits.cmt_commit_hash AS hash,
commits.cmt_author_raw_email AS email_raw,
'not_unresolved' as resolution_status
commits.cmt_author_raw_email AS email_raw
FROM
commits
WHERE
commits.repo_id = :repo_id
AND (:since_date is NULL OR commits.data_collection_date > :since_date)
AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email )
or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email)
AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ))
GROUP BY
commits.cmt_author_name,
commits.cmt_commit_hash,
commits.cmt_author_raw_email
UNION
SELECT DISTINCT
commits.cmt_author_name AS NAME,--commits.cmt_id AS id,
commits.cmt_commit_hash AS hash,
commits.cmt_author_raw_email AS email_raw,
'unresolved' as resolution_status
FROM
commits
WHERE
commits.repo_id = :repo_id
AND (:since_date is NULL OR commits.data_collection_date > :since_date)
AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email )
AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )
GROUP BY
commits.cmt_author_name,
commits.cmt_commit_hash,
commits.cmt_author_raw_email
ORDER BY
hash
""").bindparams(repo_id=repo_id,since_date=last_collected_date)
AND commits.cmt_ght_author_id IS NULL
AND commits.cmt_author_raw_email NOT IN (
SELECT email FROM unresolved_commit_emails
)
ORDER BY hash
""").bindparams(repo_id=repo_id)

#Execute statement with session.
result = execute_sql(new_contrib_sql)
Expand Down Expand Up @@ -278,37 +255,46 @@ def insert_facade_contributors(self, repo_git):

logger.debug("DEBUG: Got through the new_contribs")

# sql query used to find corresponding cntrb_id's of emails found in the contributor's table
# i.e., if a contributor already exists, we use it!
# Build the full email→contributor mapping by unioning all three email
# columns (cntrb_email, cntrb_canonical, alias_email) then deduplicating.
# Only target commits that are still unlinked (cmt_ght_author_id IS NULL)
# so we never re-process already-resolved records. Removing the
# last_collection_date guard means historical commits that slipped through
# on first pass are finally eligible for resolution.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this comment is very verbose and describes almost line by line what the below query does. Can you make the comment more high level so it complements, rather than duplicating, the understanding someone would get by reading the code?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simplified the comment.

resolve_email_to_cntrb_id_sql = s.sql.text("""
WITH email_to_contributor AS (
SELECT cntrb_email AS email, cntrb_id
FROM contributors
WHERE cntrb_email IS NOT NULL

UNION ALL

SELECT cntrb_canonical AS email, cntrb_id
FROM contributors
WHERE cntrb_canonical IS NOT NULL

UNION ALL

SELECT alias_email AS email, cntrb_id
FROM contributors_aliases
WHERE alias_email IS NOT NULL
),
deduplicated AS (
SELECT DISTINCT ON (email) email, cntrb_id
FROM email_to_contributor
ORDER BY email
)
SELECT DISTINCT
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the suggested query cali wrote in the issue doesnt have a distinct here. Can you elaborate on why this was added to the query?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed DISTINCT from the outer SELECT — the deduplicated CTE already ensures one cntrb_id per email via DISTINCT ON (email), so the outer DISTINCT was redundant.

cntrb_id,
contributors.cntrb_login AS login,
contributors.cntrb_canonical AS email,
commits.cmt_author_raw_email
FROM
contributors,
commits
WHERE
contributors.cntrb_canonical = commits.cmt_author_raw_email
AND (:since_date is NULL OR commits.data_collection_date > :since_date)
AND commits.repo_id = :repo_id
UNION
SELECT DISTINCT
contributors_aliases.cntrb_id,
contributors.cntrb_login as login,
contributors_aliases.alias_email AS email,
commits.cmt_author_raw_email
d.cntrb_id,
d.email
FROM
contributors,
contributors_aliases,
commits
commits c
INNER JOIN
deduplicated d ON c.cmt_author_raw_email = d.email
WHERE
contributors_aliases.alias_email = commits.cmt_author_raw_email
AND contributors.cntrb_id = contributors_aliases.cntrb_id
AND commits.repo_id = :repo_id
AND (:since_date is NULL OR commits.data_collection_date > :since_date)
""").bindparams(repo_id=repo_id,since_date=last_collected_date)
c.cmt_ght_author_id IS NULL
AND c.repo_id = :repo_id
""").bindparams(repo_id=repo_id)


result = execute_sql(resolve_email_to_cntrb_id_sql)
Expand Down