diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 73ac3ee150..0c6b615b01 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -7,7 +7,7 @@ from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.facade_github.core import * from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_email, get_repo_by_repo_git, batch_insert_contributors, get_batch_size -from augur.application.db.lib import get_session, execute_session_query + from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * from augur.application.db.lib import bulk_insert_dicts from augur.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor @@ -196,13 +196,8 @@ def insert_facade_contributors(self, repo_git): repo_id = repo.repo_id facade_helper = FacadeHelper(logger) - with get_session() as session: - query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) - collection_status = execute_session_query(query,'one') - last_collected_date = collection_status.facade_data_last_collected if not facade_helper.facade_contributor_full_recollect else None - - # Get all of the commit data's emails and names from the commit table that do not appear - # in the contributors table or the contributors_aliases table. + # Find commits not yet linked to a contributor (cmt_ght_author_id IS NULL), + # skipping emails already marked unresolvable. logger.info( "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) @@ -210,40 +205,17 @@ def insert_facade_contributors(self, repo_git): SELECT DISTINCT commits.cmt_author_name AS NAME, commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'not_unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) - or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - UNION - SELECT DISTINCT - commits.cmt_author_name AS NAME,--commits.cmt_id AS id, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'unresolved' as resolution_status + commits.cmt_author_raw_email AS email_raw FROM commits WHERE commits.repo_id = :repo_id - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - ORDER BY - hash - """).bindparams(repo_id=repo_id,since_date=last_collected_date) + AND commits.cmt_ght_author_id IS NULL + AND commits.cmt_author_raw_email NOT IN ( + SELECT email FROM augur_data.unresolved_commit_emails + ) + ORDER BY hash + """).bindparams(repo_id=repo_id) #Execute statement with session. result = execute_sql(new_contrib_sql) @@ -278,37 +250,41 @@ def insert_facade_contributors(self, repo_git): logger.debug("DEBUG: Got through the new_contribs") - # sql query used to find corresponding cntrb_id's of emails found in the contributor's table - # i.e., if a contributor already exists, we use it! + # Match unlinked commits to contributors via email, canonical email, or alias. resolve_email_to_cntrb_id_sql = s.sql.text(""" - SELECT DISTINCT - cntrb_id, - contributors.cntrb_login AS login, - contributors.cntrb_canonical AS email, - commits.cmt_author_raw_email - FROM - contributors, - commits - WHERE - contributors.cntrb_canonical = commits.cmt_author_raw_email - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - AND commits.repo_id = :repo_id - UNION - SELECT DISTINCT - contributors_aliases.cntrb_id, - contributors.cntrb_login as login, - contributors_aliases.alias_email AS email, - commits.cmt_author_raw_email + WITH email_to_contributor AS ( + SELECT cntrb_email AS email, cntrb_id + FROM contributors + WHERE cntrb_email IS NOT NULL + + UNION ALL + + SELECT cntrb_canonical AS email, cntrb_id + FROM contributors + WHERE cntrb_canonical IS NOT NULL + + UNION ALL + + SELECT alias_email AS email, cntrb_id + FROM contributors_aliases + WHERE alias_email IS NOT NULL + ), + deduplicated AS ( + SELECT DISTINCT ON (email) email, cntrb_id + FROM email_to_contributor + ORDER BY email + ) + SELECT + d.cntrb_id, + c.cmt_author_email AS email FROM - contributors, - contributors_aliases, - commits + commits c + INNER JOIN + deduplicated d ON c.cmt_author_email = d.email WHERE - contributors_aliases.alias_email = commits.cmt_author_raw_email - AND contributors.cntrb_id = contributors_aliases.cntrb_id - AND commits.repo_id = :repo_id - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - """).bindparams(repo_id=repo_id,since_date=last_collected_date) + c.cmt_ght_author_id IS NULL + AND c.repo_id = :repo_id + """).bindparams(repo_id=repo_id) result = execute_sql(resolve_email_to_cntrb_id_sql)