Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions augur/tasks/github/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
from abc import ABC, abstractmethod
from datetime import datetime, timedelta, timezone

from celery.exceptions import Ignore

from augur.tasks.init.celery_app import celery_app as celery
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.application.db.data_parse import *
from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException
from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException, ResourceGoneException
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
from augur.tasks.github.util.github_task_session import GithubTaskManifest
from augur.tasks.github.util.util import get_owner_repo
Expand Down Expand Up @@ -39,12 +41,17 @@ def collect_events(repo_git: str, full_collection: bool):

key_auth = GithubRandomKeyAuth(logger)

if bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo):
collection_strategy = BulkGithubEventCollection(logger)
else:
collection_strategy = ThoroughGithubEventCollection(logger)
try:
if bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo):
collection_strategy = BulkGithubEventCollection(logger)
else:
collection_strategy = ThoroughGithubEventCollection(logger)

collection_strategy.collect(repo_git, key_auth, core_data_last_collected)

collection_strategy.collect(repo_git, key_auth, core_data_last_collected)
except ResourceGoneException as e:
logger.warning(f"Issues are disabled for repo {repo_git}, skipping event collection: {e}")
raise Ignore()

def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo):

Expand Down
7 changes: 6 additions & 1 deletion augur/tasks/github/issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
import traceback
from datetime import timedelta, timezone, datetime

from celery.exceptions import Ignore
from sqlalchemy.exc import IntegrityError


from augur.tasks.init.celery_app import celery_app as celery
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.application.db.data_parse import *
from augur.tasks.github.util.github_data_access import GithubDataAccess
from augur.tasks.github.util.github_data_access import GithubDataAccess, ResourceGoneException
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo
from augur.tasks.util.worker_util import remove_duplicate_dicts
Expand Down Expand Up @@ -80,6 +81,10 @@ def collect_issues(repo_git: str, full_collection: bool) -> int:

return total_issues

except ResourceGoneException as e:
logger.warning(f"Issues are disabled for repo {repo_git}, skipping issue collection: {e}")
raise Ignore()

except Exception as e:
logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}")
return -1
Expand Down
30 changes: 18 additions & 12 deletions augur/tasks/github/messages.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import logging
from datetime import timedelta, timezone

from celery.exceptions import Ignore

from augur.tasks.init.celery_app import celery_app as celery
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.application.db.data_parse import *
from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException
from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException, ResourceGoneException
from augur.tasks.github.util.github_task_session import GithubTaskManifest
from augur.tasks.util.worker_util import remove_duplicate_dicts
from augur.tasks.github.util.util import get_owner_repo
Expand All @@ -25,7 +27,7 @@ def collect_github_messages(repo_git: str, full_collection: bool) -> None:
with GithubTaskManifest(logger) as manifest:

augur_db = manifest.augur_db

repo_id = augur_db.session.query(Repo).filter(
Repo.repo_git == repo_git).one().repo_id

Expand All @@ -35,21 +37,25 @@ def collect_github_messages(repo_git: str, full_collection: bool) -> None:
if full_collection:
core_data_last_collected = None
else:
# subtract 2 days to ensure all data is collected
# subtract 2 days to ensure all data is collected
core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc)


if is_repo_small(repo_id):
message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected)

if message_data:
process_messages(message_data, task_name, repo_id, logger, augur_db)
try:
if is_repo_small(repo_id):
message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected)

if message_data:
process_messages(message_data, task_name, repo_id, logger, augur_db)

else:
logger.info(f"{owner}/{repo} has no messages")

else:
logger.info(f"{owner}/{repo} has no messages")
process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected)

else:
process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected)
except ResourceGoneException as e:
logger.warning(f"Issues are disabled for repo {repo_git}, skipping message collection: {e}")
raise Ignore()


def is_repo_small(repo_id):
Expand Down
173 changes: 173 additions & 0 deletions tests/test_tasks/test_github_tasks/test_issues.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import logging
import unittest
from unittest.mock import MagicMock, patch

from celery.exceptions import Ignore

from augur.tasks.github.util.github_data_access import ResourceGoneException


logger = logging.getLogger(__name__)


class TestCollectIssuesResourceGone(unittest.TestCase):
"""Unit tests for collect_issues handling of ResourceGoneException.

When a repository has GitHub Issues intentionally disabled the GitHub API
returns HTTP 410 Gone, which raises ResourceGoneException. The Celery task
must convert that into celery.exceptions.Ignore so that the enclosing
chord/chain is not aborted and the rest of the repo's data (commits, PRs,
contributors, releases) is still collected.
"""

@patch("augur.tasks.github.issues.get_batch_size", return_value=1000)
@patch("augur.tasks.github.issues.GithubRandomKeyAuth")
@patch("augur.tasks.github.issues.GithubDataAccess")
@patch("augur.tasks.github.issues.get_repo_by_repo_git")
def test_raises_ignore_when_issues_disabled_on_page_count(
self,
mock_get_repo,
mock_gda_class,
mock_key_auth_class,
mock_get_batch_size,
):
"""ResourceGoneException during get_resource_page_count must become Ignore."""

mock_repo = MagicMock()
mock_repo.repo_id = 1
mock_get_repo.return_value = mock_repo

mock_gda = MagicMock()
mock_gda.get_resource_page_count.side_effect = ResourceGoneException(
"Issues are disabled for this repository"
)
mock_gda_class.return_value = mock_gda

from augur.tasks.github.issues import collect_issues

with self.assertRaises(Ignore):
collect_issues.run("https://github.com/example/no-issues-repo", full_collection=True)

@patch("augur.tasks.github.issues.get_batch_size", return_value=1000)
@patch("augur.tasks.github.issues.GithubRandomKeyAuth")
@patch("augur.tasks.github.issues.GithubDataAccess")
@patch("augur.tasks.github.issues.get_repo_by_repo_git")
def test_raises_ignore_when_issues_disabled_during_pagination(
self,
mock_get_repo,
mock_gda_class,
mock_key_auth_class,
mock_get_batch_size,
):
"""ResourceGoneException raised during pagination must also become Ignore."""

mock_repo = MagicMock()
mock_repo.repo_id = 1
mock_get_repo.return_value = mock_repo

mock_gda = MagicMock()
mock_gda.get_resource_page_count.return_value = 1
mock_gda.paginate_resource.side_effect = ResourceGoneException(
"Issues are disabled for this repository"
)
mock_gda_class.return_value = mock_gda

from augur.tasks.github.issues import collect_issues

with self.assertRaises(Ignore):
collect_issues.run("https://github.com/example/no-issues-repo", full_collection=True)

@patch("augur.tasks.github.issues.get_batch_size", return_value=1000)
@patch("augur.tasks.github.issues.GithubRandomKeyAuth")
@patch("augur.tasks.github.issues.GithubDataAccess")
@patch("augur.tasks.github.issues.get_repo_by_repo_git")
def test_unrelated_exception_returns_minus_one(
self,
mock_get_repo,
mock_gda_class,
mock_key_auth_class,
mock_get_batch_size,
):
"""Unrelated exceptions must not be swallowed — existing -1 path stays intact."""

mock_repo = MagicMock()
mock_repo.repo_id = 1
mock_get_repo.return_value = mock_repo

mock_gda = MagicMock()
mock_gda.get_resource_page_count.side_effect = ConnectionError("Network unreachable")
mock_gda_class.return_value = mock_gda

from augur.tasks.github.issues import collect_issues

result = collect_issues.run("https://github.com/example/some-repo", full_collection=True)
self.assertEqual(result, -1)


class TestCollectEventsResourceGone(unittest.TestCase):
"""Unit tests for collect_events handling of ResourceGoneException.

The issues/events endpoint returns 410 when Issues are disabled on a repo.
collect_events must raise Ignore rather than failing, so that the secondary
collection group keeps running for the rest of the repo's data.
"""

@patch("augur.tasks.github.events.GithubRandomKeyAuth")
@patch("augur.tasks.github.events.GithubDataAccess")
@patch("augur.tasks.github.events.get_owner_repo", return_value=("example", "no-issues-repo"))
def test_raises_ignore_when_issues_endpoint_gone(
self,
mock_get_owner_repo,
mock_gda_class,
mock_key_auth_class,
):
"""ResourceGoneException from bulk_events_collection_endpoint_contains_all_data
must be caught and converted to Ignore."""

mock_gda = MagicMock()
mock_gda.get_resource_page_count.side_effect = ResourceGoneException(
"Issues are disabled for this repository"
)
mock_gda_class.return_value = mock_gda

from augur.tasks.github.events import collect_events

with self.assertRaises(Ignore):
collect_events.run("https://github.com/example/no-issues-repo", full_collection=True)


class TestCollectMessagesResourceGone(unittest.TestCase):
"""Unit tests for collect_github_messages handling of ResourceGoneException.

The issues/comments endpoint returns 410 when Issues are disabled.
collect_github_messages must raise Ignore rather than failing.
"""

@patch("augur.tasks.github.messages.GithubTaskManifest")
def test_raises_ignore_when_issues_comments_gone(self, mock_manifest_class):
"""ResourceGoneException raised during message retrieval must become Ignore."""

mock_manifest = MagicMock()
mock_manifest.__enter__ = MagicMock(return_value=mock_manifest)
mock_manifest.__exit__ = MagicMock(return_value=False)

mock_repo_obj = MagicMock()
mock_repo_obj.repo_id = 1
mock_manifest.augur_db.session.query.return_value.filter.return_value.one.return_value = mock_repo_obj

mock_manifest_class.return_value = mock_manifest

with patch("augur.tasks.github.messages.is_repo_small", return_value=True), \
patch("augur.tasks.github.messages.fast_retrieve_all_pr_and_issue_messages",
side_effect=ResourceGoneException("Issues are disabled for this repository")):

from augur.tasks.github.messages import collect_github_messages

with self.assertRaises(Ignore):
collect_github_messages.run(
"https://github.com/example/no-issues-repo", full_collection=True
)


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import unittest

from augur.tasks.github.util.github_data_access import (
ResourceGoneException,
UrlNotFoundException,
GithubDataAccess,
)


class TestDecideRetryPolicy(unittest.TestCase):
"""Unit tests for GithubDataAccess._decide_retry_policy.

This function controls whether a failed request is retried. Introduced in
commit 04ef1ef, it must return False for permanent error conditions
(ResourceGoneException, UrlNotFoundException) so Celery does not waste
retry attempts on resources that are intentionally unavailable.
"""

def test_resource_gone_exception_is_not_retried(self):
"""HTTP 410 Gone (e.g. issues disabled) must not be retried."""
result = GithubDataAccess._decide_retry_policy(ResourceGoneException())
self.assertFalse(result)

def test_url_not_found_exception_is_not_retried(self):
"""HTTP 404 Not Found must not be retried."""
result = GithubDataAccess._decide_retry_policy(UrlNotFoundException())
self.assertFalse(result)

def test_generic_exception_is_retried(self):
"""Transient errors (network issues, timeouts) must be retried."""
result = GithubDataAccess._decide_retry_policy(Exception("connection reset"))
self.assertTrue(result)

def test_connection_error_is_retried(self):
"""ConnectionError is a transient error and must be retried."""
result = GithubDataAccess._decide_retry_policy(ConnectionError())
self.assertTrue(result)


if __name__ == "__main__":
unittest.main()