diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py index a2a8736c8a..2f143f899d 100644 --- a/augur/tasks/github/events.py +++ b/augur/tasks/github/events.py @@ -5,10 +5,12 @@ from abc import ABC, abstractmethod from datetime import datetime, timedelta, timezone +from celery.exceptions import Ignore + from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException, ResourceGoneException from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.util.util import get_owner_repo @@ -39,12 +41,17 @@ def collect_events(repo_git: str, full_collection: bool): key_auth = GithubRandomKeyAuth(logger) - if bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo): - collection_strategy = BulkGithubEventCollection(logger) - else: - collection_strategy = ThoroughGithubEventCollection(logger) + try: + if bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo): + collection_strategy = BulkGithubEventCollection(logger) + else: + collection_strategy = ThoroughGithubEventCollection(logger) + + collection_strategy.collect(repo_git, key_auth, core_data_last_collected) - collection_strategy.collect(repo_git, key_auth, core_data_last_collected) + except ResourceGoneException as e: + logger.warning(f"Issues are disabled for repo {repo_git}, skipping event collection: {e}") + raise Ignore() def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo): diff --git a/augur/tasks/github/issues.py b/augur/tasks/github/issues.py index 6b7b3dd8b7..19a4ad0b3f 100644 --- a/augur/tasks/github/issues.py +++ b/augur/tasks/github/issues.py @@ -2,13 +2,14 @@ import traceback from datetime import timedelta, timezone, datetime +from celery.exceptions import Ignore from sqlalchemy.exc import IntegrityError from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_data_access import GithubDataAccess +from augur.tasks.github.util.github_data_access import GithubDataAccess, ResourceGoneException from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts @@ -80,6 +81,10 @@ def collect_issues(repo_git: str, full_collection: bool) -> int: return total_issues + except ResourceGoneException as e: + logger.warning(f"Issues are disabled for repo {repo_git}, skipping issue collection: {e}") + raise Ignore() + except Exception as e: logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") return -1 diff --git a/augur/tasks/github/messages.py b/augur/tasks/github/messages.py index e8453a18df..f523efbeb7 100644 --- a/augur/tasks/github/messages.py +++ b/augur/tasks/github/messages.py @@ -1,10 +1,12 @@ import logging from datetime import timedelta, timezone +from celery.exceptions import Ignore + from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException, ResourceGoneException from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo @@ -25,7 +27,7 @@ def collect_github_messages(repo_git: str, full_collection: bool) -> None: with GithubTaskManifest(logger) as manifest: augur_db = manifest.augur_db - + repo_id = augur_db.session.query(Repo).filter( Repo.repo_git == repo_git).one().repo_id @@ -35,21 +37,25 @@ def collect_github_messages(repo_git: str, full_collection: bool) -> None: if full_collection: core_data_last_collected = None else: - # subtract 2 days to ensure all data is collected + # subtract 2 days to ensure all data is collected core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) - - if is_repo_small(repo_id): - message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected) - - if message_data: - process_messages(message_data, task_name, repo_id, logger, augur_db) + try: + if is_repo_small(repo_id): + message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected) + + if message_data: + process_messages(message_data, task_name, repo_id, logger, augur_db) + + else: + logger.info(f"{owner}/{repo} has no messages") else: - logger.info(f"{owner}/{repo} has no messages") + process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected) - else: - process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected) + except ResourceGoneException as e: + logger.warning(f"Issues are disabled for repo {repo_git}, skipping message collection: {e}") + raise Ignore() def is_repo_small(repo_id): diff --git a/tests/test_tasks/test_github_tasks/test_issues.py b/tests/test_tasks/test_github_tasks/test_issues.py new file mode 100644 index 0000000000..91a74b5290 --- /dev/null +++ b/tests/test_tasks/test_github_tasks/test_issues.py @@ -0,0 +1,173 @@ +import logging +import unittest +from unittest.mock import MagicMock, patch + +from celery.exceptions import Ignore + +from augur.tasks.github.util.github_data_access import ResourceGoneException + + +logger = logging.getLogger(__name__) + + +class TestCollectIssuesResourceGone(unittest.TestCase): + """Unit tests for collect_issues handling of ResourceGoneException. + + When a repository has GitHub Issues intentionally disabled the GitHub API + returns HTTP 410 Gone, which raises ResourceGoneException. The Celery task + must convert that into celery.exceptions.Ignore so that the enclosing + chord/chain is not aborted and the rest of the repo's data (commits, PRs, + contributors, releases) is still collected. + """ + + @patch("augur.tasks.github.issues.get_batch_size", return_value=1000) + @patch("augur.tasks.github.issues.GithubRandomKeyAuth") + @patch("augur.tasks.github.issues.GithubDataAccess") + @patch("augur.tasks.github.issues.get_repo_by_repo_git") + def test_raises_ignore_when_issues_disabled_on_page_count( + self, + mock_get_repo, + mock_gda_class, + mock_key_auth_class, + mock_get_batch_size, + ): + """ResourceGoneException during get_resource_page_count must become Ignore.""" + + mock_repo = MagicMock() + mock_repo.repo_id = 1 + mock_get_repo.return_value = mock_repo + + mock_gda = MagicMock() + mock_gda.get_resource_page_count.side_effect = ResourceGoneException( + "Issues are disabled for this repository" + ) + mock_gda_class.return_value = mock_gda + + from augur.tasks.github.issues import collect_issues + + with self.assertRaises(Ignore): + collect_issues.run("https://github.com/example/no-issues-repo", full_collection=True) + + @patch("augur.tasks.github.issues.get_batch_size", return_value=1000) + @patch("augur.tasks.github.issues.GithubRandomKeyAuth") + @patch("augur.tasks.github.issues.GithubDataAccess") + @patch("augur.tasks.github.issues.get_repo_by_repo_git") + def test_raises_ignore_when_issues_disabled_during_pagination( + self, + mock_get_repo, + mock_gda_class, + mock_key_auth_class, + mock_get_batch_size, + ): + """ResourceGoneException raised during pagination must also become Ignore.""" + + mock_repo = MagicMock() + mock_repo.repo_id = 1 + mock_get_repo.return_value = mock_repo + + mock_gda = MagicMock() + mock_gda.get_resource_page_count.return_value = 1 + mock_gda.paginate_resource.side_effect = ResourceGoneException( + "Issues are disabled for this repository" + ) + mock_gda_class.return_value = mock_gda + + from augur.tasks.github.issues import collect_issues + + with self.assertRaises(Ignore): + collect_issues.run("https://github.com/example/no-issues-repo", full_collection=True) + + @patch("augur.tasks.github.issues.get_batch_size", return_value=1000) + @patch("augur.tasks.github.issues.GithubRandomKeyAuth") + @patch("augur.tasks.github.issues.GithubDataAccess") + @patch("augur.tasks.github.issues.get_repo_by_repo_git") + def test_unrelated_exception_returns_minus_one( + self, + mock_get_repo, + mock_gda_class, + mock_key_auth_class, + mock_get_batch_size, + ): + """Unrelated exceptions must not be swallowed — existing -1 path stays intact.""" + + mock_repo = MagicMock() + mock_repo.repo_id = 1 + mock_get_repo.return_value = mock_repo + + mock_gda = MagicMock() + mock_gda.get_resource_page_count.side_effect = ConnectionError("Network unreachable") + mock_gda_class.return_value = mock_gda + + from augur.tasks.github.issues import collect_issues + + result = collect_issues.run("https://github.com/example/some-repo", full_collection=True) + self.assertEqual(result, -1) + + +class TestCollectEventsResourceGone(unittest.TestCase): + """Unit tests for collect_events handling of ResourceGoneException. + + The issues/events endpoint returns 410 when Issues are disabled on a repo. + collect_events must raise Ignore rather than failing, so that the secondary + collection group keeps running for the rest of the repo's data. + """ + + @patch("augur.tasks.github.events.GithubRandomKeyAuth") + @patch("augur.tasks.github.events.GithubDataAccess") + @patch("augur.tasks.github.events.get_owner_repo", return_value=("example", "no-issues-repo")) + def test_raises_ignore_when_issues_endpoint_gone( + self, + mock_get_owner_repo, + mock_gda_class, + mock_key_auth_class, + ): + """ResourceGoneException from bulk_events_collection_endpoint_contains_all_data + must be caught and converted to Ignore.""" + + mock_gda = MagicMock() + mock_gda.get_resource_page_count.side_effect = ResourceGoneException( + "Issues are disabled for this repository" + ) + mock_gda_class.return_value = mock_gda + + from augur.tasks.github.events import collect_events + + with self.assertRaises(Ignore): + collect_events.run("https://github.com/example/no-issues-repo", full_collection=True) + + +class TestCollectMessagesResourceGone(unittest.TestCase): + """Unit tests for collect_github_messages handling of ResourceGoneException. + + The issues/comments endpoint returns 410 when Issues are disabled. + collect_github_messages must raise Ignore rather than failing. + """ + + @patch("augur.tasks.github.messages.GithubTaskManifest") + def test_raises_ignore_when_issues_comments_gone(self, mock_manifest_class): + """ResourceGoneException raised during message retrieval must become Ignore.""" + + mock_manifest = MagicMock() + mock_manifest.__enter__ = MagicMock(return_value=mock_manifest) + mock_manifest.__exit__ = MagicMock(return_value=False) + + mock_repo_obj = MagicMock() + mock_repo_obj.repo_id = 1 + mock_manifest.augur_db.session.query.return_value.filter.return_value.one.return_value = mock_repo_obj + + mock_manifest_class.return_value = mock_manifest + + with patch("augur.tasks.github.messages.is_repo_small", return_value=True), \ + patch("augur.tasks.github.messages.fast_retrieve_all_pr_and_issue_messages", + side_effect=ResourceGoneException("Issues are disabled for this repository")): + + from augur.tasks.github.messages import collect_github_messages + + with self.assertRaises(Ignore): + collect_github_messages.run( + "https://github.com/example/no-issues-repo", full_collection=True + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_tasks/test_task_utilities/test_paginators/test_github_data_access.py b/tests/test_tasks/test_task_utilities/test_paginators/test_github_data_access.py new file mode 100644 index 0000000000..6a18e25d60 --- /dev/null +++ b/tests/test_tasks/test_task_utilities/test_paginators/test_github_data_access.py @@ -0,0 +1,41 @@ +import unittest + +from augur.tasks.github.util.github_data_access import ( + ResourceGoneException, + UrlNotFoundException, + GithubDataAccess, +) + + +class TestDecideRetryPolicy(unittest.TestCase): + """Unit tests for GithubDataAccess._decide_retry_policy. + + This function controls whether a failed request is retried. Introduced in + commit 04ef1ef, it must return False for permanent error conditions + (ResourceGoneException, UrlNotFoundException) so Celery does not waste + retry attempts on resources that are intentionally unavailable. + """ + + def test_resource_gone_exception_is_not_retried(self): + """HTTP 410 Gone (e.g. issues disabled) must not be retried.""" + result = GithubDataAccess._decide_retry_policy(ResourceGoneException()) + self.assertFalse(result) + + def test_url_not_found_exception_is_not_retried(self): + """HTTP 404 Not Found must not be retried.""" + result = GithubDataAccess._decide_retry_policy(UrlNotFoundException()) + self.assertFalse(result) + + def test_generic_exception_is_retried(self): + """Transient errors (network issues, timeouts) must be retried.""" + result = GithubDataAccess._decide_retry_policy(Exception("connection reset")) + self.assertTrue(result) + + def test_connection_error_is_retried(self): + """ConnectionError is a transient error and must be retried.""" + result = GithubDataAccess._decide_retry_policy(ConnectionError()) + self.assertTrue(result) + + +if __name__ == "__main__": + unittest.main()