-
Notifications
You must be signed in to change notification settings - Fork 996
[facade] fix contributor resolution permanently skipping commits with null cmt_ght_author_id #3792
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,7 +7,7 @@ | |
| from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth | ||
| from augur.tasks.github.facade_github.core import * | ||
| from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_email, get_repo_by_repo_git, batch_insert_contributors, get_batch_size | ||
| from augur.application.db.lib import get_session, execute_session_query | ||
|
|
||
| from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * | ||
| from augur.application.db.lib import bulk_insert_dicts | ||
| from augur.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor | ||
|
|
@@ -196,54 +196,31 @@ def insert_facade_contributors(self, repo_git): | |
| repo_id = repo.repo_id | ||
| facade_helper = FacadeHelper(logger) | ||
|
|
||
| with get_session() as session: | ||
| query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) | ||
| collection_status = execute_session_query(query,'one') | ||
| last_collected_date = collection_status.facade_data_last_collected if not facade_helper.facade_contributor_full_recollect else None | ||
|
|
||
| # Get all of the commit data's emails and names from the commit table that do not appear | ||
| # in the contributors table or the contributors_aliases table. | ||
| # Find all commits not yet linked to a contributor. The correct signal for | ||
| # "needs resolution" is a NULL cmt_ght_author_id — not a date window or an | ||
| # email-table cross-check. The old email-join approach silently skipped | ||
| # commits whose emails were later linked to a GitHub account, and the | ||
| # last_collection_date cutoff (PR #3253) made that permanent. Commits | ||
| # already marked unresolvable are excluded via the unresolved_commit_emails | ||
| # table so we don't hammer the GitHub API on known dead-ends. | ||
|
|
||
| logger.info( | ||
| "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) | ||
| new_contrib_sql = s.sql.text(""" | ||
| SELECT DISTINCT | ||
| commits.cmt_author_name AS NAME, | ||
| commits.cmt_commit_hash AS hash, | ||
| commits.cmt_author_raw_email AS email_raw, | ||
| 'not_unresolved' as resolution_status | ||
| commits.cmt_author_raw_email AS email_raw | ||
| FROM | ||
| commits | ||
| WHERE | ||
| commits.repo_id = :repo_id | ||
| AND (:since_date is NULL OR commits.data_collection_date > :since_date) | ||
| AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) | ||
| or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) | ||
| AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) | ||
| GROUP BY | ||
| commits.cmt_author_name, | ||
| commits.cmt_commit_hash, | ||
| commits.cmt_author_raw_email | ||
| UNION | ||
| SELECT DISTINCT | ||
| commits.cmt_author_name AS NAME,--commits.cmt_id AS id, | ||
| commits.cmt_commit_hash AS hash, | ||
| commits.cmt_author_raw_email AS email_raw, | ||
| 'unresolved' as resolution_status | ||
| FROM | ||
| commits | ||
| WHERE | ||
| commits.repo_id = :repo_id | ||
| AND (:since_date is NULL OR commits.data_collection_date > :since_date) | ||
| AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) | ||
| AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) | ||
| GROUP BY | ||
| commits.cmt_author_name, | ||
| commits.cmt_commit_hash, | ||
| commits.cmt_author_raw_email | ||
| ORDER BY | ||
| hash | ||
| """).bindparams(repo_id=repo_id,since_date=last_collected_date) | ||
| AND commits.cmt_ght_author_id IS NULL | ||
| AND commits.cmt_author_raw_email NOT IN ( | ||
| SELECT email FROM unresolved_commit_emails | ||
| ) | ||
| ORDER BY hash | ||
| """).bindparams(repo_id=repo_id) | ||
|
|
||
| #Execute statement with session. | ||
| result = execute_sql(new_contrib_sql) | ||
|
|
@@ -278,37 +255,46 @@ def insert_facade_contributors(self, repo_git): | |
|
|
||
| logger.debug("DEBUG: Got through the new_contribs") | ||
|
|
||
| # sql query used to find corresponding cntrb_id's of emails found in the contributor's table | ||
| # i.e., if a contributor already exists, we use it! | ||
| # Build the full email→contributor mapping by unioning all three email | ||
| # columns (cntrb_email, cntrb_canonical, alias_email) then deduplicating. | ||
| # Only target commits that are still unlinked (cmt_ght_author_id IS NULL) | ||
| # so we never re-process already-resolved records. Removing the | ||
| # last_collection_date guard means historical commits that slipped through | ||
| # on first pass are finally eligible for resolution. | ||
|
||
| resolve_email_to_cntrb_id_sql = s.sql.text(""" | ||
| WITH email_to_contributor AS ( | ||
| SELECT cntrb_email AS email, cntrb_id | ||
| FROM contributors | ||
| WHERE cntrb_email IS NOT NULL | ||
|
|
||
| UNION ALL | ||
|
|
||
| SELECT cntrb_canonical AS email, cntrb_id | ||
| FROM contributors | ||
| WHERE cntrb_canonical IS NOT NULL | ||
|
|
||
| UNION ALL | ||
|
|
||
| SELECT alias_email AS email, cntrb_id | ||
| FROM contributors_aliases | ||
| WHERE alias_email IS NOT NULL | ||
| ), | ||
| deduplicated AS ( | ||
| SELECT DISTINCT ON (email) email, cntrb_id | ||
| FROM email_to_contributor | ||
| ORDER BY email | ||
| ) | ||
| SELECT DISTINCT | ||
|
||
| cntrb_id, | ||
| contributors.cntrb_login AS login, | ||
| contributors.cntrb_canonical AS email, | ||
| commits.cmt_author_raw_email | ||
| FROM | ||
| contributors, | ||
| commits | ||
| WHERE | ||
| contributors.cntrb_canonical = commits.cmt_author_raw_email | ||
| AND (:since_date is NULL OR commits.data_collection_date > :since_date) | ||
| AND commits.repo_id = :repo_id | ||
| UNION | ||
| SELECT DISTINCT | ||
| contributors_aliases.cntrb_id, | ||
| contributors.cntrb_login as login, | ||
| contributors_aliases.alias_email AS email, | ||
| commits.cmt_author_raw_email | ||
| d.cntrb_id, | ||
| d.email | ||
| FROM | ||
| contributors, | ||
| contributors_aliases, | ||
| commits | ||
| commits c | ||
| INNER JOIN | ||
| deduplicated d ON c.cmt_author_raw_email = d.email | ||
| WHERE | ||
| contributors_aliases.alias_email = commits.cmt_author_raw_email | ||
| AND contributors.cntrb_id = contributors_aliases.cntrb_id | ||
| AND commits.repo_id = :repo_id | ||
| AND (:since_date is NULL OR commits.data_collection_date > :since_date) | ||
| """).bindparams(repo_id=repo_id,since_date=last_collected_date) | ||
| c.cmt_ght_author_id IS NULL | ||
| AND c.repo_id = :repo_id | ||
| """).bindparams(repo_id=repo_id) | ||
|
|
||
|
|
||
| result = execute_sql(resolve_email_to_cntrb_id_sql) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This comment is fairly verbose - could we simplify it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Simplified the comment.