From 8e9634f7a31ecb2b1470fa8067f06656b87199fb Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Wed, 26 Nov 2025 06:09:53 -0800 Subject: [PATCH 01/20] reuse taskworker alarm for webhooks --- src/sentry/options/defaults.py | 7 + src/sentry/sentry_apps/metrics.py | 1 + src/sentry/utils/sentry_apps/webhooks.py | 38 ++- .../utils/sentry_apps/test_webhook_timeout.py | 250 ++++++++++++++++++ 4 files changed, 291 insertions(+), 5 deletions(-) create mode 100644 tests/sentry/utils/sentry_apps/test_webhook_timeout.py diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 950e31fc7fbcbf..b3267960cbb1c7 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2393,6 +2393,13 @@ flags=FLAG_AUTOMATOR_MODIFIABLE, ) +# Hard timeout for webhook requests to prevent indefinite hangs +register( + "sentry-apps.webhook.hard-timeout.sec", + default=3.0, + flags=FLAG_AUTOMATOR_MODIFIABLE, +) + # Enables statistical detectors for a project register( "statistical_detectors.enable", diff --git a/src/sentry/sentry_apps/metrics.py b/src/sentry/sentry_apps/metrics.py index 95f465d4aed186..fce8a042340efa 100644 --- a/src/sentry/sentry_apps/metrics.py +++ b/src/sentry/sentry_apps/metrics.py @@ -70,6 +70,7 @@ class SentryAppWebhookHaltReason(StrEnum): MISSING_INSTALLATION = "missing_installation" RESTRICTED_IP = "restricted_ip" CONNECTION_RESET = "connection_reset" + HARD_TIMEOUT = "hard_timeout" class SentryAppExternalRequestFailureReason(StrEnum): diff --git a/src/sentry/utils/sentry_apps/webhooks.py b/src/sentry/utils/sentry_apps/webhooks.py index 97399851e0306a..01380246c60353 100644 --- a/src/sentry/utils/sentry_apps/webhooks.py +++ b/src/sentry/utils/sentry_apps/webhooks.py @@ -2,6 +2,7 @@ import logging from collections.abc import Callable, Mapping +from types import FrameType from typing import TYPE_CHECKING, Any, Concatenate, ParamSpec, TypeVar import sentry_sdk @@ -20,6 +21,7 @@ from sentry.sentry_apps.models.sentry_app import SentryApp, track_response_code from sentry.sentry_apps.utils.errors import SentryAppSentryError from sentry.shared_integrations.exceptions import ApiHostError, ApiTimeoutError, ClientError +from sentry.taskworker.workerchild import timeout_alarm from sentry.utils.sentry_apps import SentryAppWebhookRequestsBuffer if TYPE_CHECKING: @@ -36,6 +38,23 @@ T = TypeVar("T", bound=Mapping[str, Any]) +class WebhookTimeoutError(BaseException): + """This error represents a user set hard timeout for when a + webhook request should've completed within X seconds + """ + + pass + + +def _handle_webhook_timeout(signum: int, frame: FrameType | None) -> None: + """Handler for when a webhook request exceeds the hard timeout deadline. + - This is a workaround for safe_create_connection sockets hanging when the given url + cannot be reached or resolved. + - TODO: Add sentry app disabling logic here + """ + raise WebhookTimeoutError("Webhook request exceeded hard timeout deadline") + + def ignore_unpublished_app_errors( func: Callable[Concatenate[SentryApp | RpcSentryApp, P], R], ) -> Callable[Concatenate[SentryApp | RpcSentryApp, P], R | None]: @@ -98,13 +117,22 @@ def send_and_save_webhook_request( ) assert url is not None + + timeout_seconds = int(options.get("sentry-apps.webhook.hard-timeout.sec")) + try: - response = safe_urlopen( - url=url, - data=app_platform_event.body, - headers=app_platform_event.headers, - timeout=options.get("sentry-apps.webhook.timeout.sec"), + with timeout_alarm(timeout_seconds, _handle_webhook_timeout): + response = safe_urlopen( + url=url, + data=app_platform_event.body, + headers=app_platform_event.headers, + timeout=options.get("sentry-apps.webhook.timeout.sec"), + ) + except WebhookTimeoutError: + lifecycle.record_halt( + halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.HARD_TIMEOUT}" ) + raise except (Timeout, ConnectionError) as e: error_type = e.__class__.__name__.lower() lifecycle.add_extras( diff --git a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py new file mode 100644 index 00000000000000..a492ac9260e26d --- /dev/null +++ b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py @@ -0,0 +1,250 @@ +import signal +import time +from unittest.mock import Mock, patch + +import pytest +from requests import Response + +from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent +from sentry.sentry_apps.metrics import SentryAppWebhookHaltReason +from sentry.testutils.cases import TestCase +from sentry.testutils.silo import region_silo_test +from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError, send_and_save_webhook_request + + +@region_silo_test +class WebhookTimeoutTest(TestCase): + def setUp(self): + self.organization = self.create_organization() + self.sentry_app = self.create_sentry_app( + name="TestApp", + organization=self.organization, + webhook_url="https://example.com/webhook", + ) + self.install = self.create_sentry_app_installation( + organization=self.organization, slug=self.sentry_app.slug + ) + + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.options.get") + def test_webhook_completes_before_timeout(self, mock_options, mock_safe_urlopen): + """Test that webhook completes successfully when it finishes before timeout""" + + # Configure timeouts + def options_side_effect(key): + if key == "sentry-apps.webhook.hard-timeout.sec": + return 3.0 + elif key == "sentry-apps.webhook.timeout.sec": + return 1.0 + elif key == "sentry-apps.webhook.restricted-webhook-sending": + return [] + elif key == "sentry-apps.webhook-logging.enabled": + return {"installation_uuid": [], "sentry_app_slug": []} + return None + + mock_options.side_effect = options_side_effect + + # Mock successful response + mock_response = Mock(spec=Response) + mock_response.status_code = 200 + mock_response.headers = {} + mock_safe_urlopen.return_value = mock_response + + # Create event + app_platform_event = AppPlatformEvent( + resource="issue", action="created", install=self.install, data={"test": "data"} + ) + + # Should complete without raising WebhookTimeoutError + response = send_and_save_webhook_request(self.sentry_app, app_platform_event) + + assert response == mock_response + mock_safe_urlopen.assert_called_once() + + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.options.get") + def test_webhook_interrupted_by_hard_timeout(self, mock_options, mock_safe_urlopen): + """Test that webhook is interrupted when it exceeds hard timeout""" + + # Configure timeouts + def options_side_effect(key): + if key == "sentry-apps.webhook.hard-timeout.sec": + return 1.0 # 1 second hard timeout + elif key == "sentry-apps.webhook.timeout.sec": + return 10.0 + elif key == "sentry-apps.webhook.restricted-webhook-sending": + return [] + elif key == "sentry-apps.webhook-logging.enabled": + return {"installation_uuid": [], "sentry_app_slug": []} + return None + + mock_options.side_effect = options_side_effect + + # Make safe_urlopen sleep longer than the timeout + def slow_urlopen(*args, **kwargs): + time.sleep(2.0) # Sleep longer than 1 second timeout + mock_response = Mock(spec=Response) + mock_response.status_code = 200 + return mock_response + + mock_safe_urlopen.side_effect = slow_urlopen + + # Create event + app_platform_event = AppPlatformEvent( + resource="issue", action="created", install=self.install, data={"test": "data"} + ) + + # Should raise WebhookTimeoutError + with pytest.raises(WebhookTimeoutError, match="Webhook request exceeded hard timeout"): + send_and_save_webhook_request(self.sentry_app, app_platform_event) + + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.options.get") + def test_timeout_exception_propagates(self, mock_options, mock_safe_urlopen): + """Test that WebhookTimeoutError propagates correctly""" + + def options_side_effect(key): + if key == "sentry-apps.webhook.hard-timeout.sec": + return 1.0 + elif key == "sentry-apps.webhook.timeout.sec": + return 10.0 + elif key == "sentry-apps.webhook.restricted-webhook-sending": + return [] + return None + + mock_options.side_effect = options_side_effect + + # Make safe_urlopen sleep + def slow_urlopen(*args, **kwargs): + time.sleep(2.0) + return Mock(spec=Response) + + mock_safe_urlopen.side_effect = slow_urlopen + + app_platform_event = AppPlatformEvent( + resource="issue", action="created", install=self.install, data={"test": "data"} + ) + + # Verify exception type and inheritance + with pytest.raises(BaseException): # WebhookTimeoutError inherits from BaseException + send_and_save_webhook_request(self.sentry_app, app_platform_event) + + with pytest.raises(WebhookTimeoutError): + send_and_save_webhook_request(self.sentry_app, app_platform_event) + + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.options.get") + @patch("sentry.sentry_apps.metrics.SentryAppInteractionEvent") + def test_lifecycle_record_halt_called_on_timeout( + self, mock_interaction_event, mock_options, mock_safe_urlopen + ): + """Test that lifecycle.record_halt() is called when timeout occurs""" + + def options_side_effect(key): + if key == "sentry-apps.webhook.hard-timeout.sec": + return 1.0 + elif key == "sentry-apps.webhook.timeout.sec": + return 10.0 + elif key == "sentry-apps.webhook.restricted-webhook-sending": + return [] + return None + + mock_options.side_effect = options_side_effect + + # Setup mock lifecycle + mock_lifecycle = Mock() + mock_interaction_event.return_value.capture.return_value.__enter__.return_value = ( + mock_lifecycle + ) + + # Make safe_urlopen sleep + def slow_urlopen(*args, **kwargs): + time.sleep(2.0) + return Mock(spec=Response) + + mock_safe_urlopen.side_effect = slow_urlopen + + app_platform_event = AppPlatformEvent( + resource="issue", action="created", install=self.install, data={"test": "data"} + ) + + # Should raise WebhookTimeoutError + with pytest.raises(WebhookTimeoutError): + send_and_save_webhook_request(self.sentry_app, app_platform_event) + + # Verify lifecycle.record_halt() was called with correct halt reason + mock_lifecycle.record_halt.assert_called_once() + halt_reason = mock_lifecycle.record_halt.call_args[1]["halt_reason"] + assert SentryAppWebhookHaltReason.HARD_TIMEOUT in halt_reason + + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.options.get") + def test_timeout_alarm_restores_signal_handler(self, mock_options, mock_safe_urlopen): + """Test that signal handler is properly restored after timeout""" + + def options_side_effect(key): + if key == "sentry-apps.webhook.hard-timeout.sec": + return 1.0 + elif key == "sentry-apps.webhook.timeout.sec": + return 10.0 + elif key == "sentry-apps.webhook.restricted-webhook-sending": + return [] + elif key == "sentry-apps.webhook-logging.enabled": + return {"installation_uuid": [], "sentry_app_slug": []} + return None + + mock_options.side_effect = options_side_effect + + # Get original handler + original_handler = signal.signal(signal.SIGALRM, signal.SIG_DFL) + signal.signal(signal.SIGALRM, original_handler) + + # Mock quick response + mock_response = Mock(spec=Response) + mock_response.status_code = 200 + mock_response.headers = {} + mock_safe_urlopen.return_value = mock_response + + app_platform_event = AppPlatformEvent( + resource="issue", action="created", install=self.install, data={"test": "data"} + ) + + send_and_save_webhook_request(self.sentry_app, app_platform_event) + + # Verify signal handler was restored + current_handler = signal.signal(signal.SIGALRM, signal.SIG_DFL) + signal.signal(signal.SIGALRM, current_handler) + assert current_handler == original_handler + + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.options.get") + def test_alarm_cancelled_after_successful_webhook(self, mock_options, mock_safe_urlopen): + """Test that alarm is cancelled after webhook completes successfully""" + + def options_side_effect(key): + if key == "sentry-apps.webhook.hard-timeout.sec": + return 5.0 + elif key == "sentry-apps.webhook.timeout.sec": + return 1.0 + elif key == "sentry-apps.webhook.restricted-webhook-sending": + return [] + elif key == "sentry-apps.webhook-logging.enabled": + return {"installation_uuid": [], "sentry_app_slug": []} + return None + + mock_options.side_effect = options_side_effect + + mock_response = Mock(spec=Response) + mock_response.status_code = 200 + mock_response.headers = {} + mock_safe_urlopen.return_value = mock_response + + app_platform_event = AppPlatformEvent( + resource="issue", action="created", install=self.install, data={"test": "data"} + ) + + send_and_save_webhook_request(self.sentry_app, app_platform_event) + + # Verify no alarm is pending + remaining = signal.alarm(0) + assert remaining == 0 # No alarm was pending From 62644f79f9373c299d1256896f2d2128cb904e3a Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Thu, 18 Dec 2025 15:58:11 -0800 Subject: [PATCH 02/20] update tests for webhook alarm --- src/sentry/features/temporary.py | 2 + src/sentry/options/defaults.py | 2 +- src/sentry/utils/sentry_apps/webhooks.py | 26 ++- .../utils/sentry_apps/test_webhook_timeout.py | 187 +++++++----------- 4 files changed, 90 insertions(+), 127 deletions(-) diff --git a/src/sentry/features/temporary.py b/src/sentry/features/temporary.py index 0da81a26bb6836..b5a17ac2f18c65 100644 --- a/src/sentry/features/temporary.py +++ b/src/sentry/features/temporary.py @@ -642,6 +642,8 @@ def register_temporary_features(manager: FeatureManager) -> None: manager.add("organizations:sentry-app-manual-token-refresh", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False) # Enables Conduit demo endpoint and UI manager.add("organizations:conduit-demo", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True) + # Enable hard timeout alarm for webhooks + manager.add("organizations:sentry-app-webhook-hard-timeout", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False) # Enables organization access to the new notification platform manager.add("organizations:notification-platform.internal-testing", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 953c8f8dfc1c0c..464d2fe26097a8 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2426,7 +2426,7 @@ # Hard timeout for webhook requests to prevent indefinite hangs register( "sentry-apps.webhook.hard-timeout.sec", - default=3.0, + default=5.0, flags=FLAG_AUTOMATOR_MODIFIABLE, ) diff --git a/src/sentry/utils/sentry_apps/webhooks.py b/src/sentry/utils/sentry_apps/webhooks.py index 01380246c60353..c7627b45425353 100644 --- a/src/sentry/utils/sentry_apps/webhooks.py +++ b/src/sentry/utils/sentry_apps/webhooks.py @@ -10,9 +10,10 @@ from requests.exceptions import ChunkedEncodingError, ConnectionError, Timeout from rest_framework import status -from sentry import options +from sentry import features, options from sentry.exceptions import RestrictedIPAddress from sentry.http import safe_urlopen +from sentry.organizations.services.organization.service import organization_service from sentry.sentry_apps.metrics import ( SentryAppEventType, SentryAppWebhookFailureReason, @@ -50,7 +51,7 @@ def _handle_webhook_timeout(signum: int, frame: FrameType | None) -> None: """Handler for when a webhook request exceeds the hard timeout deadline. - This is a workaround for safe_create_connection sockets hanging when the given url cannot be reached or resolved. - - TODO: Add sentry app disabling logic here + - TODO(christinarlong): Add sentry app disabling logic here """ raise WebhookTimeoutError("Webhook request exceeded hard timeout deadline") @@ -117,17 +118,30 @@ def send_and_save_webhook_request( ) assert url is not None - - timeout_seconds = int(options.get("sentry-apps.webhook.hard-timeout.sec")) - try: - with timeout_alarm(timeout_seconds, _handle_webhook_timeout): + organization_context = organization_service.get_organization_by_id( + id=app_platform_event.install.organization_id, + ) + if organization_context is not None and features.has( + "organizations:sentry-app-webhook-hard-timeout", + organization_context.organization, + ): + timeout_seconds = int(options.get("sentry-apps.webhook.hard-timeout.sec")) + with timeout_alarm(timeout_seconds, _handle_webhook_timeout): + response = safe_urlopen( + url=url, + data=app_platform_event.body, + headers=app_platform_event.headers, + timeout=options.get("sentry-apps.webhook.timeout.sec"), + ) + else: response = safe_urlopen( url=url, data=app_platform_event.body, headers=app_platform_event.headers, timeout=options.get("sentry-apps.webhook.timeout.sec"), ) + except WebhookTimeoutError: lifecycle.record_halt( halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.HARD_TIMEOUT}" diff --git a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py index a492ac9260e26d..475fcf2882492b 100644 --- a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py +++ b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py @@ -7,7 +7,10 @@ from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent from sentry.sentry_apps.metrics import SentryAppWebhookHaltReason +from sentry.testutils.asserts import assert_halt_metric from sentry.testutils.cases import TestCase +from sentry.testutils.helpers.features import with_feature +from sentry.testutils.helpers.options import override_options from sentry.testutils.silo import region_silo_test from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError, send_and_save_webhook_request @@ -25,32 +28,23 @@ def setUp(self): organization=self.organization, slug=self.sentry_app.slug ) + @with_feature("organizations:sentry-app-webhook-hard-timeout") + @override_options( + { + "sentry-apps.webhook.hard-timeout.sec": 5.0, + "sentry-apps.webhook.timeout.sec": 1.0, + "sentry-apps.webhook.restricted-webhook-sending": [], + "sentry-apps.webhook-logging.enabled": {"installation_uuid": [], "sentry_app_slug": []}, + } + ) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.options.get") - def test_webhook_completes_before_timeout(self, mock_options, mock_safe_urlopen): - """Test that webhook completes successfully when it finishes before timeout""" - - # Configure timeouts - def options_side_effect(key): - if key == "sentry-apps.webhook.hard-timeout.sec": - return 3.0 - elif key == "sentry-apps.webhook.timeout.sec": - return 1.0 - elif key == "sentry-apps.webhook.restricted-webhook-sending": - return [] - elif key == "sentry-apps.webhook-logging.enabled": - return {"installation_uuid": [], "sentry_app_slug": []} - return None - - mock_options.side_effect = options_side_effect - + def test_webhook_completes_before_timeout(self, mock_safe_urlopen): # Mock successful response mock_response = Mock(spec=Response) mock_response.status_code = 200 mock_response.headers = {} mock_safe_urlopen.return_value = mock_response - # Create event app_platform_event = AppPlatformEvent( resource="issue", action="created", install=self.install, data={"test": "data"} ) @@ -61,28 +55,20 @@ def options_side_effect(key): assert response == mock_response mock_safe_urlopen.assert_called_once() + @with_feature("organizations:sentry-app-webhook-hard-timeout") + @override_options( + { + "sentry-apps.webhook.hard-timeout.sec": 1.0, + "sentry-apps.webhook.timeout.sec": 10.0, + "sentry-apps.webhook.restricted-webhook-sending": [], + "sentry-apps.webhook-logging.enabled": {"installation_uuid": [], "sentry_app_slug": []}, + } + ) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.options.get") - def test_webhook_interrupted_by_hard_timeout(self, mock_options, mock_safe_urlopen): - """Test that webhook is interrupted when it exceeds hard timeout""" - - # Configure timeouts - def options_side_effect(key): - if key == "sentry-apps.webhook.hard-timeout.sec": - return 1.0 # 1 second hard timeout - elif key == "sentry-apps.webhook.timeout.sec": - return 10.0 - elif key == "sentry-apps.webhook.restricted-webhook-sending": - return [] - elif key == "sentry-apps.webhook-logging.enabled": - return {"installation_uuid": [], "sentry_app_slug": []} - return None - - mock_options.side_effect = options_side_effect - + def test_webhook_interrupted_by_hard_timeout(self, mock_safe_urlopen): # Make safe_urlopen sleep longer than the timeout def slow_urlopen(*args, **kwargs): - time.sleep(2.0) # Sleep longer than 1 second timeout + time.sleep(6.0) # Sleep longer than 1 second timeout mock_response = Mock(spec=Response) mock_response.status_code = 200 return mock_response @@ -98,22 +84,16 @@ def slow_urlopen(*args, **kwargs): with pytest.raises(WebhookTimeoutError, match="Webhook request exceeded hard timeout"): send_and_save_webhook_request(self.sentry_app, app_platform_event) + @with_feature("organizations:sentry-app-webhook-hard-timeout") + @override_options( + { + "sentry-apps.webhook.hard-timeout.sec": 1.0, + "sentry-apps.webhook.timeout.sec": 10.0, + "sentry-apps.webhook.restricted-webhook-sending": [], + } + ) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.options.get") - def test_timeout_exception_propagates(self, mock_options, mock_safe_urlopen): - """Test that WebhookTimeoutError propagates correctly""" - - def options_side_effect(key): - if key == "sentry-apps.webhook.hard-timeout.sec": - return 1.0 - elif key == "sentry-apps.webhook.timeout.sec": - return 10.0 - elif key == "sentry-apps.webhook.restricted-webhook-sending": - return [] - return None - - mock_options.side_effect = options_side_effect - + def test_timeout_exception_propagates(self, mock_safe_urlopen): # Make safe_urlopen sleep def slow_urlopen(*args, **kwargs): time.sleep(2.0) @@ -125,38 +105,20 @@ def slow_urlopen(*args, **kwargs): resource="issue", action="created", install=self.install, data={"test": "data"} ) - # Verify exception type and inheritance - with pytest.raises(BaseException): # WebhookTimeoutError inherits from BaseException - send_and_save_webhook_request(self.sentry_app, app_platform_event) - with pytest.raises(WebhookTimeoutError): send_and_save_webhook_request(self.sentry_app, app_platform_event) + @with_feature("organizations:sentry-app-webhook-hard-timeout") + @override_options( + { + "sentry-apps.webhook.hard-timeout.sec": 1.0, + "sentry-apps.webhook.timeout.sec": 10.0, + "sentry-apps.webhook.restricted-webhook-sending": [], + } + ) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.options.get") - @patch("sentry.sentry_apps.metrics.SentryAppInteractionEvent") - def test_lifecycle_record_halt_called_on_timeout( - self, mock_interaction_event, mock_options, mock_safe_urlopen - ): - """Test that lifecycle.record_halt() is called when timeout occurs""" - - def options_side_effect(key): - if key == "sentry-apps.webhook.hard-timeout.sec": - return 1.0 - elif key == "sentry-apps.webhook.timeout.sec": - return 10.0 - elif key == "sentry-apps.webhook.restricted-webhook-sending": - return [] - return None - - mock_options.side_effect = options_side_effect - - # Setup mock lifecycle - mock_lifecycle = Mock() - mock_interaction_event.return_value.capture.return_value.__enter__.return_value = ( - mock_lifecycle - ) - + @patch("sentry.integrations.utils.metrics.EventLifecycle.record_event") + def test_lifecycle_record_halt_called_on_timeout(self, mock_record, mock_safe_urlopen): # Make safe_urlopen sleep def slow_urlopen(*args, **kwargs): time.sleep(2.0) @@ -168,33 +130,25 @@ def slow_urlopen(*args, **kwargs): resource="issue", action="created", install=self.install, data={"test": "data"} ) - # Should raise WebhookTimeoutError with pytest.raises(WebhookTimeoutError): send_and_save_webhook_request(self.sentry_app, app_platform_event) - # Verify lifecycle.record_halt() was called with correct halt reason - mock_lifecycle.record_halt.assert_called_once() - halt_reason = mock_lifecycle.record_halt.call_args[1]["halt_reason"] - assert SentryAppWebhookHaltReason.HARD_TIMEOUT in halt_reason + assert_halt_metric( + mock_record=mock_record, + error_msg=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.HARD_TIMEOUT}", + ) + @with_feature("organizations:sentry-app-webhook-hard-timeout") + @override_options( + { + "sentry-apps.webhook.hard-timeout.sec": 5.0, + "sentry-apps.webhook.timeout.sec": 10.0, + "sentry-apps.webhook.restricted-webhook-sending": [], + "sentry-apps.webhook-logging.enabled": {"installation_uuid": [], "sentry_app_slug": []}, + } + ) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.options.get") - def test_timeout_alarm_restores_signal_handler(self, mock_options, mock_safe_urlopen): - """Test that signal handler is properly restored after timeout""" - - def options_side_effect(key): - if key == "sentry-apps.webhook.hard-timeout.sec": - return 1.0 - elif key == "sentry-apps.webhook.timeout.sec": - return 10.0 - elif key == "sentry-apps.webhook.restricted-webhook-sending": - return [] - elif key == "sentry-apps.webhook-logging.enabled": - return {"installation_uuid": [], "sentry_app_slug": []} - return None - - mock_options.side_effect = options_side_effect - + def test_timeout_alarm_restores_signal_handler(self, mock_safe_urlopen): # Get original handler original_handler = signal.signal(signal.SIGALRM, signal.SIG_DFL) signal.signal(signal.SIGALRM, original_handler) @@ -216,24 +170,17 @@ def options_side_effect(key): signal.signal(signal.SIGALRM, current_handler) assert current_handler == original_handler + @with_feature("organizations:sentry-app-webhook-hard-timeout") + @override_options( + { + "sentry-apps.webhook.hard-timeout.sec": 5.0, + "sentry-apps.webhook.timeout.sec": 1.0, + "sentry-apps.webhook.restricted-webhook-sending": [], + "sentry-apps.webhook-logging.enabled": {"installation_uuid": [], "sentry_app_slug": []}, + } + ) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.options.get") - def test_alarm_cancelled_after_successful_webhook(self, mock_options, mock_safe_urlopen): - """Test that alarm is cancelled after webhook completes successfully""" - - def options_side_effect(key): - if key == "sentry-apps.webhook.hard-timeout.sec": - return 5.0 - elif key == "sentry-apps.webhook.timeout.sec": - return 1.0 - elif key == "sentry-apps.webhook.restricted-webhook-sending": - return [] - elif key == "sentry-apps.webhook-logging.enabled": - return {"installation_uuid": [], "sentry_app_slug": []} - return None - - mock_options.side_effect = options_side_effect - + def test_alarm_cancelled_after_successful_webhook(self, mock_safe_urlopen): mock_response = Mock(spec=Response) mock_response.status_code = 200 mock_response.headers = {} From 93fd2c1bb23424223398a1f1f45741d38849179b Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Thu, 19 Feb 2026 11:44:27 -0800 Subject: [PATCH 03/20] implement circuit breaker --- .../utils/sentry_apps/circuit_breaker.py | 147 +++++++++++ .../utils/sentry_apps/test_circuit_breaker.py | 235 ++++++++++++++++++ 2 files changed, 382 insertions(+) create mode 100644 src/sentry/utils/sentry_apps/circuit_breaker.py create mode 100644 tests/sentry/utils/sentry_apps/test_circuit_breaker.py diff --git a/src/sentry/utils/sentry_apps/circuit_breaker.py b/src/sentry/utils/sentry_apps/circuit_breaker.py new file mode 100644 index 00000000000000..7f3f675aad656b --- /dev/null +++ b/src/sentry/utils/sentry_apps/circuit_breaker.py @@ -0,0 +1,147 @@ +import logging +import time +from typing import NotRequired + +from sentry.ratelimits.sliding_windows import ( + GrantedQuota, + Quota, + RequestedQuota, +) +from sentry.utils import metrics +from sentry.utils.circuit_breaker2 import CircuitBreaker, CircuitBreakerConfig, CircuitBreakerState + +logger = logging.getLogger(__name__) + + +class RateBasedCircuitBreakerConfig(CircuitBreakerConfig): + # Minimum absolute error count in the window before rate check applies + error_floor: NotRequired[int] + # Error rate (0.0 to 1.0) threshold — trips when error_count/total_requests >= this + error_rate_threshold: NotRequired[float] + + +class RateBasedCircuitBreaker(CircuitBreaker): + """ + A circuit breaker that trips based on error rate (percentage) with an absolute error floor, + rather than flat error counts alone. + + Extends the parent CircuitBreaker by: + - Adding a parallel total_requests_quota to track all requests (not just errors) + - Adding record_success() to count successful requests + - Overriding _get_remaining_error_quota() to require BOTH error_floor AND error_rate_threshold + before allowing the parent's trip logic to proceed + - Emitting state transition metrics on OK->BROKEN, BROKEN->RECOVERY, RECOVERY->OK, RECOVERY->BROKEN + """ + + def __init__(self, key: str, config: RateBasedCircuitBreakerConfig) -> None: + super().__init__(key, config) + self.error_rate_threshold = config.get("error_rate_threshold", 0.5) + self.error_floor = config.get("error_floor", 500) + + # Parallel quota tracking total requests, same window/granularity as primary. + # Limit set extremely high — we never want this quota to "trip" on its own. + self.total_requests_quota = Quota( + self.window, + self.window_granularity, + 999_999_999, + f"{key}.circuit_breaker.total_requests", + ) + + def record_success(self) -> None: + """Record a successful request (increments total request count only).""" + now = int(time.time()) + state_before, _ = self._get_state_and_remaining_time() + + if state_before == CircuitBreakerState.BROKEN: + return + + self.limiter.use_quotas( + [RequestedQuota(self.key, 1, [self.total_requests_quota])], + [GrantedQuota(self.key, 1, [])], + now, + ) + + # Check for passive state transitions (RECOVERY->OK via TTL expiry) + state_after, _ = self._get_state_and_remaining_time() + if state_before != state_after: + self._emit_state_change_metric(state_before, state_after) + + def record_error(self) -> None: + """ + An error is also a request — count it in both quotas. + Emits state change metrics when the breaker trips. + """ + state_before, _ = self._get_state_and_remaining_time() + + # Count toward total requests first + if state_before != CircuitBreakerState.BROKEN: + now = int(time.time()) + self.limiter.use_quotas( + [RequestedQuota(self.key, 1, [self.total_requests_quota])], + [GrantedQuota(self.key, 1, [])], + now, + ) + + # Delegate to parent for error counting + trip logic + super().record_error() + + # Check for state transitions (OK->BROKEN, RECOVERY->BROKEN) + state_after, _ = self._get_state_and_remaining_time() + if state_before != state_after: + self._emit_state_change_metric(state_before, state_after) + + def _get_remaining_error_quota( + self, quota: Quota | None = None, window_end: int | None = None + ) -> int: + """ + Override: return non-zero (don't trip) unless BOTH conditions are met: + (a) error_count >= error_floor + (b) error_rate >= error_rate_threshold + """ + remaining = super()._get_remaining_error_quota(quota, window_end) + + if quota is None: + return remaining + + error_count = quota.limit - remaining + + # Condition (a): not enough errors to be confident + if error_count < self.error_floor: + return max(remaining, 1) # prevent parent from tripping + + # Condition (b): check error rate + now = int(time.time()) + window_end_time = window_end or now + _, total_grants = self.limiter.check_within_quotas( + [ + RequestedQuota( + self.key, + self.total_requests_quota.limit, + [self.total_requests_quota], + ) + ], + window_end_time, + ) + total_used = self.total_requests_quota.limit - total_grants[0].granted + if total_used == 0: + return max(remaining, 1) + + error_rate = error_count / total_used + if error_rate < self.error_rate_threshold: + return max(remaining, 1) # rate not high enough — don't trip + + return 0 # both conditions met — signal to parent to trip the breaker + + def _emit_state_change_metric( + self, from_state: CircuitBreakerState, to_state: CircuitBreakerState + ) -> None: + transition = f"{from_state.value}_to_{to_state.value}" + metrics.incr( + "sentry_app.webhook.circuit_breaker.state_change", + tags={"key": self.key, "transition": transition}, + sample_rate=1.0, + ) + logger.info( + "sentry_app.webhook.circuit_breaker.state_change", + extra={"key": self.key, "from_state": from_state.value, "to_state": to_state.value}, + ) diff --git a/tests/sentry/utils/sentry_apps/test_circuit_breaker.py b/tests/sentry/utils/sentry_apps/test_circuit_breaker.py new file mode 100644 index 00000000000000..3eca01c91f24b9 --- /dev/null +++ b/tests/sentry/utils/sentry_apps/test_circuit_breaker.py @@ -0,0 +1,235 @@ +import time +from typing import Any +from unittest import TestCase +from unittest.mock import patch + +from sentry.ratelimits.sliding_windows import ( + GrantedQuota, + Quota, + RequestedQuota, +) +from sentry.testutils.helpers.datetime import freeze_time +from sentry.utils.circuit_breaker2 import CircuitBreakerState +from sentry.utils.sentry_apps.circuit_breaker import ( + RateBasedCircuitBreaker, + RateBasedCircuitBreakerConfig, +) + +DEFAULT_RATE_CONFIG: RateBasedCircuitBreakerConfig = { + "error_limit": 10000, + "error_limit_window": 3600, # 1 hr + "broken_state_duration": 120, # 2 min + "error_floor": 100, + "error_rate_threshold": 0.5, +} + + +class MockRateBasedCircuitBreaker(RateBasedCircuitBreaker): + """Test helper with state manipulation methods, following MockCircuitBreaker pattern.""" + + def _set_breaker_state( + self, state: CircuitBreakerState, seconds_left: int | None = None + ) -> None: + now = int(time.time()) + + if state == CircuitBreakerState.OK: + self._delete_from_redis([self.broken_state_key, self.recovery_state_key]) + elif state == CircuitBreakerState.BROKEN: + broken_state_timeout = seconds_left or self.broken_state_duration + broken_state_end = now + broken_state_timeout + recovery_timeout = broken_state_timeout + self.recovery_duration + recovery_end = now + recovery_timeout + self._set_in_redis( + [ + (self.broken_state_key, broken_state_end, broken_state_timeout), + (self.recovery_state_key, recovery_end, recovery_timeout), + ] + ) + elif state == CircuitBreakerState.RECOVERY: + recovery_timeout = seconds_left or self.recovery_duration + recovery_end = now + recovery_timeout + self._delete_from_redis([self.broken_state_key]) + self._set_in_redis([(self.recovery_state_key, recovery_end, recovery_timeout)]) + + def _add_quota_usage( + self, + quota: Quota, + amount_used: int, + granule_or_window_end: int | None = None, + ) -> None: + now = int(time.time()) + window_end_time = granule_or_window_end or now + self.limiter.use_quotas( + [RequestedQuota(self.key, amount_used, [quota])], + [GrantedQuota(self.key, amount_used, [])], + window_end_time, + ) + + def _delete_from_redis(self, keys: list[str]) -> Any: + for key in keys: + self.redis_pipeline.delete(key) + return self.redis_pipeline.execute() + + +@freeze_time() +class RateBasedCircuitBreakerTest(TestCase): + def setUp(self) -> None: + self.config = DEFAULT_RATE_CONFIG + self.breaker = MockRateBasedCircuitBreaker("test_webhook_app", self.config) + # Clear all existing keys from redis + self.breaker.redis_pipeline.flushall() + self.breaker.redis_pipeline.execute() + + def test_low_error_count_below_floor_does_not_trip(self) -> None: + """Even at 100% error rate, if error count < floor, don't trip.""" + # Record 50 errors (below floor of 100), 0 successes = 100% error rate + for _ in range(50): + self.breaker.record_error() + + assert self.breaker.should_allow_request() is True + state, _ = self.breaker._get_state_and_remaining_time() + assert state == CircuitBreakerState.OK + + def test_high_error_count_low_rate_does_not_trip(self) -> None: + """If error count >= floor but rate < threshold, don't trip.""" + # Record 900 successes first, then 100 errors = 100/1000 = 10% error rate. + # Successes must be recorded first to ensure the rate stays below 50% + # throughout error recording. error_floor=100, threshold=50%. + for _ in range(900): + self.breaker.record_success() + for _ in range(100): + self.breaker.record_error() + + assert self.breaker.should_allow_request() is True + state, _ = self.breaker._get_state_and_remaining_time() + assert state == CircuitBreakerState.OK + + def test_both_conditions_met_trips_to_broken(self) -> None: + """When error_count >= floor AND error_rate >= threshold, trip to BROKEN.""" + # 200 errors + 100 successes = 200/300 = 66.7% error rate (> 50% threshold) + # 200 errors > 100 floor + for _ in range(100): + self.breaker.record_success() + for _ in range(200): + self.breaker.record_error() + + state, _ = self.breaker._get_state_and_remaining_time() + assert state == CircuitBreakerState.BROKEN + assert self.breaker.should_allow_request() is False + + def test_broken_to_recovery_to_ok_lifecycle(self) -> None: + """Test the full BROKEN -> RECOVERY -> OK lifecycle.""" + self.breaker._set_breaker_state(CircuitBreakerState.BROKEN) + assert self.breaker.should_allow_request() is False + + # Move to RECOVERY (simulate broken_state_duration expiry) + self.breaker._set_breaker_state(CircuitBreakerState.RECOVERY) + assert self.breaker.should_allow_request() is True + + # Move back to OK (simulate recovery_duration expiry) + self.breaker._set_breaker_state(CircuitBreakerState.OK) + assert self.breaker.should_allow_request() is True + + def test_recovery_to_broken_on_recovery_errors(self) -> None: + """During RECOVERY, exceeding recovery_error_limit re-trips to BROKEN.""" + self.breaker._set_breaker_state(CircuitBreakerState.RECOVERY) + state, _ = self.breaker._get_state_and_remaining_time() + assert state == CircuitBreakerState.RECOVERY + + # The recovery_error_limit is error_limit // 10 = 1000. + # Record enough errors to exceed recovery limit at high rate. + recovery_limit = self.breaker.recovery_error_limit + for _ in range(recovery_limit + 1): + self.breaker.record_error() + + state, _ = self.breaker._get_state_and_remaining_time() + assert state == CircuitBreakerState.BROKEN + + def test_record_success_increments_total_count(self) -> None: + """record_success() should increment total request count.""" + self.breaker.record_success() + + now = int(time.time()) + _, grants = self.breaker.limiter.check_within_quotas( + [ + RequestedQuota( + self.breaker.key, + self.breaker.total_requests_quota.limit, + [self.breaker.total_requests_quota], + ) + ], + now, + ) + total_used = self.breaker.total_requests_quota.limit - grants[0].granted + assert total_used == 1 + + def test_record_error_increments_both_counts(self) -> None: + """record_error() should increment both error and total request counts.""" + self.breaker.record_error() + + now = int(time.time()) + # Check total requests + _, total_grants = self.breaker.limiter.check_within_quotas( + [ + RequestedQuota( + self.breaker.key, + self.breaker.total_requests_quota.limit, + [self.breaker.total_requests_quota], + ) + ], + now, + ) + total_used = self.breaker.total_requests_quota.limit - total_grants[0].granted + assert total_used == 1 + + # Check error count + _, error_grants = self.breaker.limiter.check_within_quotas( + [ + RequestedQuota( + self.breaker.key, + self.breaker.primary_quota.limit, + [self.breaker.primary_quota], + ) + ], + now, + ) + errors_used = self.breaker.primary_quota.limit - error_grants[0].granted + assert errors_used == 1 + + @patch("sentry.utils.sentry_apps.circuit_breaker.metrics") + def test_state_change_metrics_emitted(self, mock_metrics: Any) -> None: + """State transition metrics should be emitted on OK->BROKEN.""" + # Trip the breaker: 200 errors out of 300 total = 66.7% rate, 200 > 100 floor + for _ in range(100): + self.breaker.record_success() + for _ in range(200): + self.breaker.record_error() + + mock_metrics.incr.assert_any_call( + "sentry_app.webhook.circuit_breaker.state_change", + tags={ + "key": "test_webhook_app", + "transition": "circuit_okay_to_circuit_broken", + }, + sample_rate=1.0, + ) + + def test_record_success_noop_when_broken(self) -> None: + """record_success() should return early when BROKEN.""" + self.breaker._set_breaker_state(CircuitBreakerState.BROKEN) + self.breaker.record_success() # Should not raise + + # Verify no total requests were counted + now = int(time.time()) + _, grants = self.breaker.limiter.check_within_quotas( + [ + RequestedQuota( + self.breaker.key, + self.breaker.total_requests_quota.limit, + [self.breaker.total_requests_quota], + ) + ], + now, + ) + total_used = self.breaker.total_requests_quota.limit - grants[0].granted + assert total_used == 0 From 21d422cc27360fda245dffeaa5cbba7f7cb42a3c Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Thu, 19 Feb 2026 12:01:36 -0800 Subject: [PATCH 04/20] update typing in tests --- .../utils/sentry_apps/test_webhook_timeout.py | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py index 475fcf2882492b..62bcfcbd427e2c 100644 --- a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py +++ b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py @@ -7,6 +7,7 @@ from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent from sentry.sentry_apps.metrics import SentryAppWebhookHaltReason +from sentry.sentry_apps.utils.webhooks import IssueActionType, SentryAppResourceType from sentry.testutils.asserts import assert_halt_metric from sentry.testutils.cases import TestCase from sentry.testutils.helpers.features import with_feature @@ -46,7 +47,10 @@ def test_webhook_completes_before_timeout(self, mock_safe_urlopen): mock_safe_urlopen.return_value = mock_response app_platform_event = AppPlatformEvent( - resource="issue", action="created", install=self.install, data={"test": "data"} + resource=SentryAppResourceType.ISSUE, + action=IssueActionType.CREATED, + install=self.install, + data={"test": "data"}, ) # Should complete without raising WebhookTimeoutError @@ -77,7 +81,10 @@ def slow_urlopen(*args, **kwargs): # Create event app_platform_event = AppPlatformEvent( - resource="issue", action="created", install=self.install, data={"test": "data"} + resource=SentryAppResourceType.ISSUE, + action=IssueActionType.CREATED, + install=self.install, + data={"test": "data"}, ) # Should raise WebhookTimeoutError @@ -102,7 +109,10 @@ def slow_urlopen(*args, **kwargs): mock_safe_urlopen.side_effect = slow_urlopen app_platform_event = AppPlatformEvent( - resource="issue", action="created", install=self.install, data={"test": "data"} + resource=SentryAppResourceType.ISSUE, + action=IssueActionType.CREATED, + install=self.install, + data={"test": "data"}, ) with pytest.raises(WebhookTimeoutError): @@ -127,7 +137,10 @@ def slow_urlopen(*args, **kwargs): mock_safe_urlopen.side_effect = slow_urlopen app_platform_event = AppPlatformEvent( - resource="issue", action="created", install=self.install, data={"test": "data"} + resource=SentryAppResourceType.ISSUE, + action=IssueActionType.CREATED, + install=self.install, + data={"test": "data"}, ) with pytest.raises(WebhookTimeoutError): @@ -160,7 +173,10 @@ def test_timeout_alarm_restores_signal_handler(self, mock_safe_urlopen): mock_safe_urlopen.return_value = mock_response app_platform_event = AppPlatformEvent( - resource="issue", action="created", install=self.install, data={"test": "data"} + resource=SentryAppResourceType.ISSUE, + action=IssueActionType.CREATED, + install=self.install, + data={"test": "data"}, ) send_and_save_webhook_request(self.sentry_app, app_platform_event) @@ -187,7 +203,10 @@ def test_alarm_cancelled_after_successful_webhook(self, mock_safe_urlopen): mock_safe_urlopen.return_value = mock_response app_platform_event = AppPlatformEvent( - resource="issue", action="created", install=self.install, data={"test": "data"} + resource=SentryAppResourceType.ISSUE, + action=IssueActionType.CREATED, + install=self.install, + data={"test": "data"}, ) send_and_save_webhook_request(self.sentry_app, app_platform_event) From cf8357b39d92fc7f4103100a65afafb24566d913 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Thu, 19 Feb 2026 13:32:15 -0800 Subject: [PATCH 05/20] move notifying to after the commit --- .../api/endpoints/installation_details.py | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/sentry/sentry_apps/api/endpoints/installation_details.py b/src/sentry/sentry_apps/api/endpoints/installation_details.py index f2ada20c049c78..ce488d9b68f928 100644 --- a/src/sentry/sentry_apps/api/endpoints/installation_details.py +++ b/src/sentry/sentry_apps/api/endpoints/installation_details.py @@ -45,19 +45,9 @@ def get(self, request: Request, installation) -> Response: def delete(self, request: Request, installation) -> Response: sentry_app_installation = SentryAppInstallation.objects.get(id=installation.id) - with transaction.atomic(using=router.db_for_write(SentryAppInstallation)): - try: - assert request.user.is_authenticated, ( - "User must be authenticated to delete installation" - ) - SentryAppInstallationNotifier( - sentry_app_installation=sentry_app_installation, - user=request.user, - action="deleted", - ).run() - # if the error is from a request exception, log the error and continue - except RequestException as exc: - sentry_sdk.capture_exception(exc) + db = router.db_for_write(SentryAppInstallation) + + with transaction.atomic(using=db): sentry_app_installation.update(status=SentryAppInstallationStatus.PENDING_DELETION) ScheduledDeletion.schedule(sentry_app_installation, days=0, actor=request.user) create_audit_entry( @@ -67,7 +57,22 @@ def delete(self, request: Request, installation) -> Response: event=audit_log.get_event_id("SENTRY_APP_UNINSTALL"), data={"sentry_app": sentry_app_installation.sentry_app.name}, ) - if request.user.is_authenticated: + + def notify_on_commit() -> None: + try: + assert request.user.is_authenticated, ( + "User must be authenticated to delete installation" + ) + SentryAppInstallationNotifier( + sentry_app_installation=sentry_app_installation, + user=request.user, + action="deleted", + ).run() + except RequestException as exc: + sentry_sdk.capture_exception(exc) + + transaction.on_commit(notify_on_commit, using=db) + analytics.record( SentryAppUninstalledEvent( user_id=request.user.id, From 1d1b9049055f82d0e4c1558b5ceb133c1bf48fd5 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Thu, 19 Feb 2026 13:45:14 -0800 Subject: [PATCH 06/20] add back the is_authenticated guard --- src/sentry/sentry_apps/api/endpoints/installation_details.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentry/sentry_apps/api/endpoints/installation_details.py b/src/sentry/sentry_apps/api/endpoints/installation_details.py index ce488d9b68f928..8e06393a72836a 100644 --- a/src/sentry/sentry_apps/api/endpoints/installation_details.py +++ b/src/sentry/sentry_apps/api/endpoints/installation_details.py @@ -72,7 +72,7 @@ def notify_on_commit() -> None: sentry_sdk.capture_exception(exc) transaction.on_commit(notify_on_commit, using=db) - + if request.user.is_authenticated: analytics.record( SentryAppUninstalledEvent( user_id=request.user.id, From 791699dd4313ad7f43fdd2e7c39f1eda4bc7edf0 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Thu, 19 Feb 2026 14:22:55 -0800 Subject: [PATCH 07/20] add ffs for circuit breaker --- src/sentry/features/temporary.py | 2 ++ src/sentry/options/defaults.py | 22 ++++++++++++++++++++++ src/sentry/sentry_apps/metrics.py | 1 + 3 files changed, 25 insertions(+) diff --git a/src/sentry/features/temporary.py b/src/sentry/features/temporary.py index f3cac610fe75aa..4c19807ba4504a 100644 --- a/src/sentry/features/temporary.py +++ b/src/sentry/features/temporary.py @@ -695,6 +695,8 @@ def register_temporary_features(manager: FeatureManager) -> None: manager.add("organizations:conduit-demo", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True) # Enable hard timeout alarm for webhooks manager.add("organizations:sentry-app-webhook-hard-timeout", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False) + # Enable circuit breaker for webhook endpoint failure detection + manager.add("organizations:sentry-app-webhook-circuit-breaker", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False) # Enables organization access to the new notification platform manager.add("organizations:notification-platform.internal-testing", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 05f4334a223097..fc273e904c6ce3 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2567,6 +2567,28 @@ flags=FLAG_AUTOMATOR_MODIFIABLE, ) +# Circuit breaker configuration for webhook endpoint failure detection. +# Keys match RateBasedCircuitBreakerConfig +register( + "sentry-apps.webhook.circuit-breaker.config", + type=Dict, + default={ + "error_limit": 5000, # 5,000 errors in 10 minutes + "error_limit_window": 600, # 10 minutes + "broken_state_duration": 300, # 5 minutes + "error_rate_threshold": 0.5, # 50% error rate + "error_floor": 500, # 500 errors before error rate check applies + }, + flags=FLAG_AUTOMATOR_MODIFIABLE, +) + +# When True, the circuit breaker tracks state and emits metrics but does not block requests. +register( + "sentry-apps.webhook.circuit-breaker.dry-run", + default=True, + flags=FLAG_AUTOMATOR_MODIFIABLE, +) + # Enables statistical detectors for a project register( "statistical_detectors.enable", diff --git a/src/sentry/sentry_apps/metrics.py b/src/sentry/sentry_apps/metrics.py index aad977fa71eb26..31a78f52932731 100644 --- a/src/sentry/sentry_apps/metrics.py +++ b/src/sentry/sentry_apps/metrics.py @@ -71,6 +71,7 @@ class SentryAppWebhookHaltReason(StrEnum): RESTRICTED_IP = "restricted_ip" CONNECTION_RESET = "connection_reset" HARD_TIMEOUT = "hard_timeout" + CIRCUIT_BROKEN = "circuit_broken" class SentryAppExternalRequestFailureReason(StrEnum): From e755d7f018a8af1d3e4f8e528863f11520ced7f6 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Thu, 19 Feb 2026 15:18:37 -0800 Subject: [PATCH 08/20] add circuit breaker to webhook sending --- .../utils/sentry_apps/circuit_breaker.py | 22 ++ src/sentry/utils/sentry_apps/webhooks.py | 235 ++++++++++-------- .../utils/sentry_apps/test_webhook_timeout.py | 1 + .../sentry/utils/sentry_apps/test_webhooks.py | 129 ++++++++++ 4 files changed, 288 insertions(+), 99 deletions(-) create mode 100644 tests/sentry/utils/sentry_apps/test_webhooks.py diff --git a/src/sentry/utils/sentry_apps/circuit_breaker.py b/src/sentry/utils/sentry_apps/circuit_breaker.py index 7f3f675aad656b..f391fe2db886c2 100644 --- a/src/sentry/utils/sentry_apps/circuit_breaker.py +++ b/src/sentry/utils/sentry_apps/circuit_breaker.py @@ -1,5 +1,7 @@ import logging import time +from collections.abc import Generator +from contextlib import contextmanager from typing import NotRequired from sentry.ratelimits.sliding_windows import ( @@ -145,3 +147,23 @@ def _emit_state_change_metric( "sentry_app.webhook.circuit_breaker.state_change", extra={"key": self.key, "from_state": from_state.value, "to_state": to_state.value}, ) + + +@contextmanager +def circuit_breaker_tracking( + breaker: RateBasedCircuitBreaker | None, +) -> Generator[None]: + """Track request outcome: record_error on Exception, record_success on normal exit. + + Handles the None case as a no-op so callers don't need nullcontext(). + """ + if breaker is None: + yield + return + try: + yield + except Exception: + breaker.record_error() + raise + else: + breaker.record_success() diff --git a/src/sentry/utils/sentry_apps/webhooks.py b/src/sentry/utils/sentry_apps/webhooks.py index c7627b45425353..705b8a4b9b0185 100644 --- a/src/sentry/utils/sentry_apps/webhooks.py +++ b/src/sentry/utils/sentry_apps/webhooks.py @@ -23,7 +23,12 @@ from sentry.sentry_apps.utils.errors import SentryAppSentryError from sentry.shared_integrations.exceptions import ApiHostError, ApiTimeoutError, ClientError from sentry.taskworker.workerchild import timeout_alarm +from sentry.utils import metrics from sentry.utils.sentry_apps import SentryAppWebhookRequestsBuffer +from sentry.utils.sentry_apps.circuit_breaker import ( + RateBasedCircuitBreaker, + circuit_breaker_tracking, +) if TYPE_CHECKING: from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent @@ -39,7 +44,7 @@ T = TypeVar("T", bound=Mapping[str, Any]) -class WebhookTimeoutError(BaseException): +class WebhookTimeoutError(Exception): """This error represents a user set hard timeout for when a webhook request should've completed within X seconds """ @@ -118,124 +123,156 @@ def send_and_save_webhook_request( ) assert url is not None - try: - organization_context = organization_service.get_organization_by_id( - id=app_platform_event.install.organization_id, + + # Fetch org context once — reused by both circuit breaker and hard timeout checks + organization_context = organization_service.get_organization_by_id( + id=app_platform_event.install.organization_id, + ) + + # Circuit breaker gate — keyed per app slug + circuit_breaker: RateBasedCircuitBreaker | None = None + if organization_context is not None and features.has( + "organizations:sentry-app-webhook-circuit-breaker", + organization_context.organization, + ): + circuit_breaker = RateBasedCircuitBreaker( + f"sentry-app.webhook.{sentry_app.slug}", + options.get("sentry-apps.webhook.circuit-breaker.config"), ) - if organization_context is not None and features.has( - "organizations:sentry-app-webhook-hard-timeout", - organization_context.organization, - ): - timeout_seconds = int(options.get("sentry-apps.webhook.hard-timeout.sec")) - with timeout_alarm(timeout_seconds, _handle_webhook_timeout): + if not circuit_breaker.should_allow_request(): + dry_run = options.get("sentry-apps.webhook.circuit-breaker.dry-run") + if dry_run: + metrics.incr( + "sentry_app.webhook.circuit_breaker.would_block", + tags={"slug": sentry_app.slug}, + ) + logger.warning( + "sentry_app.webhook.circuit_breaker.would_block", + extra={"slug": sentry_app.slug, "org_id": org_id}, + ) + else: + lifecycle.record_halt( + halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.CIRCUIT_BROKEN}" + ) + return Response() + + with circuit_breaker_tracking(circuit_breaker): + try: + if organization_context is not None and features.has( + "organizations:sentry-app-webhook-hard-timeout", + organization_context.organization, + ): + timeout_seconds = int(options.get("sentry-apps.webhook.hard-timeout.sec")) + with timeout_alarm(timeout_seconds, _handle_webhook_timeout): + response = safe_urlopen( + url=url, + data=app_platform_event.body, + headers=app_platform_event.headers, + timeout=options.get("sentry-apps.webhook.timeout.sec"), + ) + else: response = safe_urlopen( url=url, data=app_platform_event.body, headers=app_platform_event.headers, timeout=options.get("sentry-apps.webhook.timeout.sec"), ) - else: - response = safe_urlopen( + + except WebhookTimeoutError: + lifecycle.record_halt( + halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.HARD_TIMEOUT}" + ) + raise + except (Timeout, ConnectionError) as e: + error_type = e.__class__.__name__.lower() + lifecycle.add_extras( + { + "reason": "send_and_save_webhook_request.timeout", + "error_type": error_type, + "organization_id": org_id, + "integration_slug": sentry_app.slug, + "url": url, + }, + ) + track_response_code(error_type, slug, event) + buffer.add_request( + response_code=TIMEOUT_STATUS_CODE, + org_id=org_id, + event=event, url=url, - data=app_platform_event.body, headers=app_platform_event.headers, - timeout=options.get("sentry-apps.webhook.timeout.sec"), ) + lifecycle.record_halt(e) + # Re-raise the exception because some of these tasks might retry on the exception + raise + except ChunkedEncodingError: + lifecycle.record_halt( + halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.CONNECTION_RESET}" + ) + raise + except RestrictedIPAddress: + lifecycle.record_halt( + halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.RESTRICTED_IP}" + ) + raise - except WebhookTimeoutError: - lifecycle.record_halt( - halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.HARD_TIMEOUT}" - ) - raise - except (Timeout, ConnectionError) as e: - error_type = e.__class__.__name__.lower() - lifecycle.add_extras( - { - "reason": "send_and_save_webhook_request.timeout", - "error_type": error_type, - "organization_id": org_id, - "integration_slug": sentry_app.slug, - "url": url, - }, - ) - track_response_code(error_type, slug, event) + track_response_code(response.status_code, slug, event) buffer.add_request( - response_code=TIMEOUT_STATUS_CODE, + response_code=response.status_code, org_id=org_id, event=event, url=url, + error_id=response.headers.get("Sentry-Hook-Error"), + project_id=response.headers.get("Sentry-Hook-Project"), + response=response, headers=app_platform_event.headers, ) - lifecycle.record_halt(e) - # Re-raise the exception because some of these tasks might retry on the exception - raise - except ChunkedEncodingError: - lifecycle.record_halt( - halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.CONNECTION_RESET}" - ) - raise - except RestrictedIPAddress: - lifecycle.record_halt( - halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.RESTRICTED_IP}" - ) - raise - - track_response_code(response.status_code, slug, event) - buffer.add_request( - response_code=response.status_code, - org_id=org_id, - event=event, - url=url, - error_id=response.headers.get("Sentry-Hook-Error"), - project_id=response.headers.get("Sentry-Hook-Project"), - response=response, - headers=app_platform_event.headers, - ) - debug_logging_enabled = ( - app_platform_event.install.uuid - in options.get("sentry-apps.webhook-logging.enabled")["installation_uuid"] - or sentry_app.slug - in options.get("sentry-apps.webhook-logging.enabled")["sentry_app_slug"] - ) - if debug_logging_enabled: - webhook_event = event - logger.info( - "sentry_app_webhook_sent", - extra={ - "sentry_app_slug": sentry_app.slug, - "organization_id": org_id, - "installation_uuid": app_platform_event.install.uuid, - "resource": app_platform_event.resource, - "action": app_platform_event.action, - "webhook_event": webhook_event, - "url": url, - "response_code": response.status_code, - }, + debug_logging_enabled = ( + app_platform_event.install.uuid + in options.get("sentry-apps.webhook-logging.enabled")["installation_uuid"] + or sentry_app.slug + in options.get("sentry-apps.webhook-logging.enabled")["sentry_app_slug"] ) + if debug_logging_enabled: + webhook_event = event + logger.info( + "sentry_app_webhook_sent", + extra={ + "sentry_app_slug": sentry_app.slug, + "organization_id": org_id, + "installation_uuid": app_platform_event.install.uuid, + "resource": app_platform_event.resource, + "action": app_platform_event.action, + "webhook_event": webhook_event, + "url": url, + "response_code": response.status_code, + }, + ) - if response.status_code == status.HTTP_503_SERVICE_UNAVAILABLE: - lifecycle.record_halt( - halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}" - ) - raise ApiHostError.from_request(response.request) + if response.status_code == status.HTTP_503_SERVICE_UNAVAILABLE: + lifecycle.record_halt( + halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}" + ) + raise ApiHostError.from_request(response.request) - elif response.status_code == status.HTTP_504_GATEWAY_TIMEOUT: - lifecycle.record_halt( - halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}" - ) - raise ApiTimeoutError.from_request(response.request) + elif response.status_code == status.HTTP_504_GATEWAY_TIMEOUT: + lifecycle.record_halt( + halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}" + ) + raise ApiTimeoutError.from_request(response.request) - elif 400 <= response.status_code < 500: - lifecycle.record_halt( - halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.GOT_CLIENT_ERROR}_{response.status_code}", - sample_log_rate=0.05, - ) - raise ClientError(response.status_code, url, response=response) + elif 400 <= response.status_code < 500: + lifecycle.record_halt( + halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.GOT_CLIENT_ERROR}_{response.status_code}", + sample_log_rate=0.05, + ) + raise ClientError(response.status_code, url, response=response) - try: - response.raise_for_status() - except RequestException as e: - lifecycle.record_halt(e) - raise - return response + try: + response.raise_for_status() + except RequestException as e: + lifecycle.record_halt(e) + raise + + return response diff --git a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py index 62bcfcbd427e2c..145c1a7145f08d 100644 --- a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py +++ b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py @@ -24,6 +24,7 @@ def setUp(self): name="TestApp", organization=self.organization, webhook_url="https://example.com/webhook", + published=True, ) self.install = self.create_sentry_app_installation( organization=self.organization, slug=self.sentry_app.slug diff --git a/tests/sentry/utils/sentry_apps/test_webhooks.py b/tests/sentry/utils/sentry_apps/test_webhooks.py new file mode 100644 index 00000000000000..3e9dc734a131a7 --- /dev/null +++ b/tests/sentry/utils/sentry_apps/test_webhooks.py @@ -0,0 +1,129 @@ +from unittest.mock import Mock, patch + +import pytest +from requests import Response +from requests.exceptions import Timeout + +from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent +from sentry.sentry_apps.utils.webhooks import IssueActionType, SentryAppResourceType +from sentry.testutils.cases import TestCase +from sentry.testutils.helpers.features import with_feature +from sentry.testutils.helpers.options import override_options +from sentry.testutils.silo import region_silo_test +from sentry.utils.sentry_apps.webhooks import send_and_save_webhook_request + +CIRCUIT_BREAKER_OPTIONS = { + "sentry-apps.webhook.circuit-breaker.config": { + "error_limit": 10000, + "error_limit_window": 600, + "broken_state_duration": 300, + "error_rate_threshold": 0.5, + "error_floor": 5, # low floor for testing + }, + "sentry-apps.webhook.circuit-breaker.dry-run": False, + "sentry-apps.webhook.timeout.sec": 1.0, + "sentry-apps.webhook.restricted-webhook-sending": [], +} + + +@region_silo_test +class WebhookCircuitBreakerTest(TestCase): + def setUp(self): + self.organization = self.create_organization() + self.sentry_app = self.create_sentry_app( + name="TestApp", + organization=self.organization, + webhook_url="https://example.com/webhook", + published=True, + ) + self.install = self.create_sentry_app_installation( + organization=self.organization, slug=self.sentry_app.slug + ) + + def _make_event(self): + return AppPlatformEvent( + resource=SentryAppResourceType.ISSUE, + action=IssueActionType.CREATED, + install=self.install, + data={"test": "data"}, + ) + + @override_options(CIRCUIT_BREAKER_OPTIONS) + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + def test_no_circuit_breaker_without_feature_flag(self, mock_safe_urlopen): + """Without feature flag, no circuit breaker is instantiated.""" + mock_response = Mock(spec=Response) + mock_response.status_code = 200 + mock_response.headers = {} + mock_safe_urlopen.return_value = mock_response + + response = send_and_save_webhook_request(self.sentry_app, self._make_event()) + assert response is not None + assert response.status_code == 200 + + @with_feature("organizations:sentry-app-webhook-circuit-breaker") + @override_options( + {**CIRCUIT_BREAKER_OPTIONS, "sentry-apps.webhook.circuit-breaker.dry-run": True} + ) + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.RateBasedCircuitBreaker") + def test_dry_run_emits_metric_but_sends_webhook(self, MockBreaker, mock_safe_urlopen): + """In dry-run mode, a broken circuit emits would_block but still sends.""" + mock_breaker_instance = MockBreaker.return_value + mock_breaker_instance.should_allow_request.return_value = False + + mock_response = Mock(spec=Response) + mock_response.status_code = 200 + mock_response.headers = {} + mock_safe_urlopen.return_value = mock_response + + response = send_and_save_webhook_request(self.sentry_app, self._make_event()) + # In dry-run, webhook is still sent + assert response is not None + assert response.status_code == 200 + mock_safe_urlopen.assert_called_once() + + @with_feature("organizations:sentry-app-webhook-circuit-breaker") + @override_options(CIRCUIT_BREAKER_OPTIONS) + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.RateBasedCircuitBreaker") + def test_blocking_mode_returns_empty_response(self, MockBreaker, mock_safe_urlopen): + """With dry-run OFF, a broken circuit blocks the webhook.""" + mock_breaker_instance = MockBreaker.return_value + mock_breaker_instance.should_allow_request.return_value = False + + send_and_save_webhook_request(self.sentry_app, self._make_event()) + # Webhook is blocked — no HTTP call made + mock_safe_urlopen.assert_not_called() + + @with_feature("organizations:sentry-app-webhook-circuit-breaker") + @override_options(CIRCUIT_BREAKER_OPTIONS) + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.RateBasedCircuitBreaker") + def test_timeout_calls_record_error(self, MockBreaker, mock_safe_urlopen): + """Timeout exceptions should call record_error().""" + mock_breaker_instance = MockBreaker.return_value + mock_breaker_instance.should_allow_request.return_value = True + mock_safe_urlopen.side_effect = Timeout() + + with pytest.raises(Timeout): + send_and_save_webhook_request(self.sentry_app, self._make_event()) + + mock_breaker_instance.record_error.assert_called_once() + + @with_feature("organizations:sentry-app-webhook-circuit-breaker") + @override_options(CIRCUIT_BREAKER_OPTIONS) + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.RateBasedCircuitBreaker") + def test_success_calls_record_success(self, MockBreaker, mock_safe_urlopen): + """Successful responses should call record_success().""" + mock_breaker_instance = MockBreaker.return_value + mock_breaker_instance.should_allow_request.return_value = True + + mock_response = Mock(spec=Response) + mock_response.status_code = 200 + mock_response.headers = {} + mock_safe_urlopen.return_value = mock_response + + send_and_save_webhook_request(self.sentry_app, self._make_event()) + mock_breaker_instance.record_success.assert_called_once() From 0efb7827b2299e64924c2eddcfb7dcf79374222c Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Thu, 19 Feb 2026 16:52:49 -0800 Subject: [PATCH 09/20] change WebhookTimeoutError from BaseException to Exception MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With Exception, ignore_unpublished_app_errors swallows timeouts for unpublished (dev/test) apps, which is the correct behavior — the lifecycle halt metric still fires before the re-raise. Tests updated to use published=True so the exception propagates for assertion. --- src/sentry/utils/sentry_apps/webhooks.py | 2 +- tests/sentry/utils/sentry_apps/test_webhook_timeout.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sentry/utils/sentry_apps/webhooks.py b/src/sentry/utils/sentry_apps/webhooks.py index c7627b45425353..f50e7e923f474a 100644 --- a/src/sentry/utils/sentry_apps/webhooks.py +++ b/src/sentry/utils/sentry_apps/webhooks.py @@ -39,7 +39,7 @@ T = TypeVar("T", bound=Mapping[str, Any]) -class WebhookTimeoutError(BaseException): +class WebhookTimeoutError(Exception): """This error represents a user set hard timeout for when a webhook request should've completed within X seconds """ diff --git a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py index 62bcfcbd427e2c..145c1a7145f08d 100644 --- a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py +++ b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py @@ -24,6 +24,7 @@ def setUp(self): name="TestApp", organization=self.organization, webhook_url="https://example.com/webhook", + published=True, ) self.install = self.create_sentry_app_installation( organization=self.organization, slug=self.sentry_app.slug From c492fb5c3b46bc4f626e00c5f518cf8dc6cc7095 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Fri, 27 Mar 2026 10:07:06 -0700 Subject: [PATCH 10/20] adjust circuit breaker usage to be the updated one --- src/sentry/options/defaults.py | 7 +- .../utils/sentry_apps/circuit_breaker.py | 149 +---------- src/sentry/utils/sentry_apps/webhooks.py | 17 +- .../utils/sentry_apps/test_circuit_breaker.py | 235 ------------------ .../sentry/utils/sentry_apps/test_webhooks.py | 13 +- 5 files changed, 19 insertions(+), 402 deletions(-) delete mode 100644 tests/sentry/utils/sentry_apps/test_circuit_breaker.py diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 75ba5eabe6a551..df14af965a067e 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2676,16 +2676,15 @@ ) # Circuit breaker configuration for webhook endpoint failure detection. -# Keys match RateBasedCircuitBreakerConfig +# Keys match RateBasedTripStrategyConfig + CircuitBreakerConfig register( "sentry-apps.webhook.circuit-breaker.config", type=Dict, default={ - "error_limit": 5000, # 5,000 errors in 10 minutes "error_limit_window": 600, # 10 minutes "broken_state_duration": 300, # 5 minutes - "error_rate_threshold": 0.5, # 50% error rate - "error_floor": 500, # 500 errors before error rate check applies + "threshold": 0.5, # 50% error rate + "floor": 500, # 500 errors before error rate check applies }, flags=FLAG_AUTOMATOR_MODIFIABLE, ) diff --git a/src/sentry/utils/sentry_apps/circuit_breaker.py b/src/sentry/utils/sentry_apps/circuit_breaker.py index f391fe2db886c2..0d17971d94bff7 100644 --- a/src/sentry/utils/sentry_apps/circuit_breaker.py +++ b/src/sentry/utils/sentry_apps/circuit_breaker.py @@ -1,157 +1,12 @@ -import logging -import time from collections.abc import Generator from contextlib import contextmanager -from typing import NotRequired -from sentry.ratelimits.sliding_windows import ( - GrantedQuota, - Quota, - RequestedQuota, -) -from sentry.utils import metrics -from sentry.utils.circuit_breaker2 import CircuitBreaker, CircuitBreakerConfig, CircuitBreakerState - -logger = logging.getLogger(__name__) - - -class RateBasedCircuitBreakerConfig(CircuitBreakerConfig): - # Minimum absolute error count in the window before rate check applies - error_floor: NotRequired[int] - # Error rate (0.0 to 1.0) threshold — trips when error_count/total_requests >= this - error_rate_threshold: NotRequired[float] - - -class RateBasedCircuitBreaker(CircuitBreaker): - """ - A circuit breaker that trips based on error rate (percentage) with an absolute error floor, - rather than flat error counts alone. - - Extends the parent CircuitBreaker by: - - Adding a parallel total_requests_quota to track all requests (not just errors) - - Adding record_success() to count successful requests - - Overriding _get_remaining_error_quota() to require BOTH error_floor AND error_rate_threshold - before allowing the parent's trip logic to proceed - - Emitting state transition metrics on OK->BROKEN, BROKEN->RECOVERY, RECOVERY->OK, RECOVERY->BROKEN - """ - - def __init__(self, key: str, config: RateBasedCircuitBreakerConfig) -> None: - super().__init__(key, config) - self.error_rate_threshold = config.get("error_rate_threshold", 0.5) - self.error_floor = config.get("error_floor", 500) - - # Parallel quota tracking total requests, same window/granularity as primary. - # Limit set extremely high — we never want this quota to "trip" on its own. - self.total_requests_quota = Quota( - self.window, - self.window_granularity, - 999_999_999, - f"{key}.circuit_breaker.total_requests", - ) - - def record_success(self) -> None: - """Record a successful request (increments total request count only).""" - now = int(time.time()) - state_before, _ = self._get_state_and_remaining_time() - - if state_before == CircuitBreakerState.BROKEN: - return - - self.limiter.use_quotas( - [RequestedQuota(self.key, 1, [self.total_requests_quota])], - [GrantedQuota(self.key, 1, [])], - now, - ) - - # Check for passive state transitions (RECOVERY->OK via TTL expiry) - state_after, _ = self._get_state_and_remaining_time() - if state_before != state_after: - self._emit_state_change_metric(state_before, state_after) - - def record_error(self) -> None: - """ - An error is also a request — count it in both quotas. - Emits state change metrics when the breaker trips. - """ - state_before, _ = self._get_state_and_remaining_time() - - # Count toward total requests first - if state_before != CircuitBreakerState.BROKEN: - now = int(time.time()) - self.limiter.use_quotas( - [RequestedQuota(self.key, 1, [self.total_requests_quota])], - [GrantedQuota(self.key, 1, [])], - now, - ) - - # Delegate to parent for error counting + trip logic - super().record_error() - - # Check for state transitions (OK->BROKEN, RECOVERY->BROKEN) - state_after, _ = self._get_state_and_remaining_time() - if state_before != state_after: - self._emit_state_change_metric(state_before, state_after) - - def _get_remaining_error_quota( - self, quota: Quota | None = None, window_end: int | None = None - ) -> int: - """ - Override: return non-zero (don't trip) unless BOTH conditions are met: - (a) error_count >= error_floor - (b) error_rate >= error_rate_threshold - """ - remaining = super()._get_remaining_error_quota(quota, window_end) - - if quota is None: - return remaining - - error_count = quota.limit - remaining - - # Condition (a): not enough errors to be confident - if error_count < self.error_floor: - return max(remaining, 1) # prevent parent from tripping - - # Condition (b): check error rate - now = int(time.time()) - window_end_time = window_end or now - _, total_grants = self.limiter.check_within_quotas( - [ - RequestedQuota( - self.key, - self.total_requests_quota.limit, - [self.total_requests_quota], - ) - ], - window_end_time, - ) - total_used = self.total_requests_quota.limit - total_grants[0].granted - if total_used == 0: - return max(remaining, 1) - - error_rate = error_count / total_used - if error_rate < self.error_rate_threshold: - return max(remaining, 1) # rate not high enough — don't trip - - return 0 # both conditions met — signal to parent to trip the breaker - - def _emit_state_change_metric( - self, from_state: CircuitBreakerState, to_state: CircuitBreakerState - ) -> None: - transition = f"{from_state.value}_to_{to_state.value}" - metrics.incr( - "sentry_app.webhook.circuit_breaker.state_change", - tags={"key": self.key, "transition": transition}, - sample_rate=1.0, - ) - logger.info( - "sentry_app.webhook.circuit_breaker.state_change", - extra={"key": self.key, "from_state": from_state.value, "to_state": to_state.value}, - ) +from sentry.utils.circuit_breaker2 import CircuitBreaker @contextmanager def circuit_breaker_tracking( - breaker: RateBasedCircuitBreaker | None, + breaker: CircuitBreaker | None, ) -> Generator[None]: """Track request outcome: record_error on Exception, record_success on normal exit. diff --git a/src/sentry/utils/sentry_apps/webhooks.py b/src/sentry/utils/sentry_apps/webhooks.py index 822e72a26138bc..263fb18d32363b 100644 --- a/src/sentry/utils/sentry_apps/webhooks.py +++ b/src/sentry/utils/sentry_apps/webhooks.py @@ -24,11 +24,9 @@ from sentry.shared_integrations.exceptions import ApiHostError, ApiTimeoutError, ClientError from sentry.taskworker.timeout import timeout_alarm from sentry.utils import metrics +from sentry.utils.circuit_breaker2 import CircuitBreaker, RateBasedTripStrategy from sentry.utils.sentry_apps import SentryAppWebhookRequestsBuffer -from sentry.utils.sentry_apps.circuit_breaker import ( - RateBasedCircuitBreaker, - circuit_breaker_tracking, -) +from sentry.utils.sentry_apps.circuit_breaker import circuit_breaker_tracking if TYPE_CHECKING: from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent @@ -56,7 +54,6 @@ def _handle_webhook_timeout(signum: int, frame: FrameType | None) -> None: """Handler for when a webhook request exceeds the hard timeout deadline. - This is a workaround for safe_create_connection sockets hanging when the given url cannot be reached or resolved. - - TODO(christinarlong): Add sentry app disabling logic here """ raise WebhookTimeoutError("Webhook request exceeded hard timeout deadline") @@ -131,14 +128,16 @@ def send_and_save_webhook_request( ) # Circuit breaker gate — keyed per app slug - circuit_breaker: RateBasedCircuitBreaker | None = None + circuit_breaker: CircuitBreaker | None = None if organization_context is not None and features.has( "organizations:sentry-app-webhook-circuit-breaker", organization_context.organization, ): - circuit_breaker = RateBasedCircuitBreaker( - f"sentry-app.webhook.{sentry_app.slug}", - options.get("sentry-apps.webhook.circuit-breaker.config"), + config = options.get("sentry-apps.webhook.circuit-breaker.config") + circuit_breaker = CircuitBreaker( + key=f"sentry-app.webhook.{sentry_app.slug}", + config=config, + trip_strategy=RateBasedTripStrategy.from_config(config), ) if not circuit_breaker.should_allow_request(): dry_run = options.get("sentry-apps.webhook.circuit-breaker.dry-run") diff --git a/tests/sentry/utils/sentry_apps/test_circuit_breaker.py b/tests/sentry/utils/sentry_apps/test_circuit_breaker.py deleted file mode 100644 index 3eca01c91f24b9..00000000000000 --- a/tests/sentry/utils/sentry_apps/test_circuit_breaker.py +++ /dev/null @@ -1,235 +0,0 @@ -import time -from typing import Any -from unittest import TestCase -from unittest.mock import patch - -from sentry.ratelimits.sliding_windows import ( - GrantedQuota, - Quota, - RequestedQuota, -) -from sentry.testutils.helpers.datetime import freeze_time -from sentry.utils.circuit_breaker2 import CircuitBreakerState -from sentry.utils.sentry_apps.circuit_breaker import ( - RateBasedCircuitBreaker, - RateBasedCircuitBreakerConfig, -) - -DEFAULT_RATE_CONFIG: RateBasedCircuitBreakerConfig = { - "error_limit": 10000, - "error_limit_window": 3600, # 1 hr - "broken_state_duration": 120, # 2 min - "error_floor": 100, - "error_rate_threshold": 0.5, -} - - -class MockRateBasedCircuitBreaker(RateBasedCircuitBreaker): - """Test helper with state manipulation methods, following MockCircuitBreaker pattern.""" - - def _set_breaker_state( - self, state: CircuitBreakerState, seconds_left: int | None = None - ) -> None: - now = int(time.time()) - - if state == CircuitBreakerState.OK: - self._delete_from_redis([self.broken_state_key, self.recovery_state_key]) - elif state == CircuitBreakerState.BROKEN: - broken_state_timeout = seconds_left or self.broken_state_duration - broken_state_end = now + broken_state_timeout - recovery_timeout = broken_state_timeout + self.recovery_duration - recovery_end = now + recovery_timeout - self._set_in_redis( - [ - (self.broken_state_key, broken_state_end, broken_state_timeout), - (self.recovery_state_key, recovery_end, recovery_timeout), - ] - ) - elif state == CircuitBreakerState.RECOVERY: - recovery_timeout = seconds_left or self.recovery_duration - recovery_end = now + recovery_timeout - self._delete_from_redis([self.broken_state_key]) - self._set_in_redis([(self.recovery_state_key, recovery_end, recovery_timeout)]) - - def _add_quota_usage( - self, - quota: Quota, - amount_used: int, - granule_or_window_end: int | None = None, - ) -> None: - now = int(time.time()) - window_end_time = granule_or_window_end or now - self.limiter.use_quotas( - [RequestedQuota(self.key, amount_used, [quota])], - [GrantedQuota(self.key, amount_used, [])], - window_end_time, - ) - - def _delete_from_redis(self, keys: list[str]) -> Any: - for key in keys: - self.redis_pipeline.delete(key) - return self.redis_pipeline.execute() - - -@freeze_time() -class RateBasedCircuitBreakerTest(TestCase): - def setUp(self) -> None: - self.config = DEFAULT_RATE_CONFIG - self.breaker = MockRateBasedCircuitBreaker("test_webhook_app", self.config) - # Clear all existing keys from redis - self.breaker.redis_pipeline.flushall() - self.breaker.redis_pipeline.execute() - - def test_low_error_count_below_floor_does_not_trip(self) -> None: - """Even at 100% error rate, if error count < floor, don't trip.""" - # Record 50 errors (below floor of 100), 0 successes = 100% error rate - for _ in range(50): - self.breaker.record_error() - - assert self.breaker.should_allow_request() is True - state, _ = self.breaker._get_state_and_remaining_time() - assert state == CircuitBreakerState.OK - - def test_high_error_count_low_rate_does_not_trip(self) -> None: - """If error count >= floor but rate < threshold, don't trip.""" - # Record 900 successes first, then 100 errors = 100/1000 = 10% error rate. - # Successes must be recorded first to ensure the rate stays below 50% - # throughout error recording. error_floor=100, threshold=50%. - for _ in range(900): - self.breaker.record_success() - for _ in range(100): - self.breaker.record_error() - - assert self.breaker.should_allow_request() is True - state, _ = self.breaker._get_state_and_remaining_time() - assert state == CircuitBreakerState.OK - - def test_both_conditions_met_trips_to_broken(self) -> None: - """When error_count >= floor AND error_rate >= threshold, trip to BROKEN.""" - # 200 errors + 100 successes = 200/300 = 66.7% error rate (> 50% threshold) - # 200 errors > 100 floor - for _ in range(100): - self.breaker.record_success() - for _ in range(200): - self.breaker.record_error() - - state, _ = self.breaker._get_state_and_remaining_time() - assert state == CircuitBreakerState.BROKEN - assert self.breaker.should_allow_request() is False - - def test_broken_to_recovery_to_ok_lifecycle(self) -> None: - """Test the full BROKEN -> RECOVERY -> OK lifecycle.""" - self.breaker._set_breaker_state(CircuitBreakerState.BROKEN) - assert self.breaker.should_allow_request() is False - - # Move to RECOVERY (simulate broken_state_duration expiry) - self.breaker._set_breaker_state(CircuitBreakerState.RECOVERY) - assert self.breaker.should_allow_request() is True - - # Move back to OK (simulate recovery_duration expiry) - self.breaker._set_breaker_state(CircuitBreakerState.OK) - assert self.breaker.should_allow_request() is True - - def test_recovery_to_broken_on_recovery_errors(self) -> None: - """During RECOVERY, exceeding recovery_error_limit re-trips to BROKEN.""" - self.breaker._set_breaker_state(CircuitBreakerState.RECOVERY) - state, _ = self.breaker._get_state_and_remaining_time() - assert state == CircuitBreakerState.RECOVERY - - # The recovery_error_limit is error_limit // 10 = 1000. - # Record enough errors to exceed recovery limit at high rate. - recovery_limit = self.breaker.recovery_error_limit - for _ in range(recovery_limit + 1): - self.breaker.record_error() - - state, _ = self.breaker._get_state_and_remaining_time() - assert state == CircuitBreakerState.BROKEN - - def test_record_success_increments_total_count(self) -> None: - """record_success() should increment total request count.""" - self.breaker.record_success() - - now = int(time.time()) - _, grants = self.breaker.limiter.check_within_quotas( - [ - RequestedQuota( - self.breaker.key, - self.breaker.total_requests_quota.limit, - [self.breaker.total_requests_quota], - ) - ], - now, - ) - total_used = self.breaker.total_requests_quota.limit - grants[0].granted - assert total_used == 1 - - def test_record_error_increments_both_counts(self) -> None: - """record_error() should increment both error and total request counts.""" - self.breaker.record_error() - - now = int(time.time()) - # Check total requests - _, total_grants = self.breaker.limiter.check_within_quotas( - [ - RequestedQuota( - self.breaker.key, - self.breaker.total_requests_quota.limit, - [self.breaker.total_requests_quota], - ) - ], - now, - ) - total_used = self.breaker.total_requests_quota.limit - total_grants[0].granted - assert total_used == 1 - - # Check error count - _, error_grants = self.breaker.limiter.check_within_quotas( - [ - RequestedQuota( - self.breaker.key, - self.breaker.primary_quota.limit, - [self.breaker.primary_quota], - ) - ], - now, - ) - errors_used = self.breaker.primary_quota.limit - error_grants[0].granted - assert errors_used == 1 - - @patch("sentry.utils.sentry_apps.circuit_breaker.metrics") - def test_state_change_metrics_emitted(self, mock_metrics: Any) -> None: - """State transition metrics should be emitted on OK->BROKEN.""" - # Trip the breaker: 200 errors out of 300 total = 66.7% rate, 200 > 100 floor - for _ in range(100): - self.breaker.record_success() - for _ in range(200): - self.breaker.record_error() - - mock_metrics.incr.assert_any_call( - "sentry_app.webhook.circuit_breaker.state_change", - tags={ - "key": "test_webhook_app", - "transition": "circuit_okay_to_circuit_broken", - }, - sample_rate=1.0, - ) - - def test_record_success_noop_when_broken(self) -> None: - """record_success() should return early when BROKEN.""" - self.breaker._set_breaker_state(CircuitBreakerState.BROKEN) - self.breaker.record_success() # Should not raise - - # Verify no total requests were counted - now = int(time.time()) - _, grants = self.breaker.limiter.check_within_quotas( - [ - RequestedQuota( - self.breaker.key, - self.breaker.total_requests_quota.limit, - [self.breaker.total_requests_quota], - ) - ], - now, - ) - total_used = self.breaker.total_requests_quota.limit - grants[0].granted - assert total_used == 0 diff --git a/tests/sentry/utils/sentry_apps/test_webhooks.py b/tests/sentry/utils/sentry_apps/test_webhooks.py index 3e9dc734a131a7..a7b40833309d3d 100644 --- a/tests/sentry/utils/sentry_apps/test_webhooks.py +++ b/tests/sentry/utils/sentry_apps/test_webhooks.py @@ -14,11 +14,10 @@ CIRCUIT_BREAKER_OPTIONS = { "sentry-apps.webhook.circuit-breaker.config": { - "error_limit": 10000, "error_limit_window": 600, "broken_state_duration": 300, - "error_rate_threshold": 0.5, - "error_floor": 5, # low floor for testing + "threshold": 0.5, + "floor": 5, # low floor for testing }, "sentry-apps.webhook.circuit-breaker.dry-run": False, "sentry-apps.webhook.timeout.sec": 1.0, @@ -66,7 +65,7 @@ def test_no_circuit_breaker_without_feature_flag(self, mock_safe_urlopen): {**CIRCUIT_BREAKER_OPTIONS, "sentry-apps.webhook.circuit-breaker.dry-run": True} ) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.RateBasedCircuitBreaker") + @patch("sentry.utils.sentry_apps.webhooks.CircuitBreaker") def test_dry_run_emits_metric_but_sends_webhook(self, MockBreaker, mock_safe_urlopen): """In dry-run mode, a broken circuit emits would_block but still sends.""" mock_breaker_instance = MockBreaker.return_value @@ -86,7 +85,7 @@ def test_dry_run_emits_metric_but_sends_webhook(self, MockBreaker, mock_safe_url @with_feature("organizations:sentry-app-webhook-circuit-breaker") @override_options(CIRCUIT_BREAKER_OPTIONS) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.RateBasedCircuitBreaker") + @patch("sentry.utils.sentry_apps.webhooks.CircuitBreaker") def test_blocking_mode_returns_empty_response(self, MockBreaker, mock_safe_urlopen): """With dry-run OFF, a broken circuit blocks the webhook.""" mock_breaker_instance = MockBreaker.return_value @@ -99,7 +98,7 @@ def test_blocking_mode_returns_empty_response(self, MockBreaker, mock_safe_urlop @with_feature("organizations:sentry-app-webhook-circuit-breaker") @override_options(CIRCUIT_BREAKER_OPTIONS) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.RateBasedCircuitBreaker") + @patch("sentry.utils.sentry_apps.webhooks.CircuitBreaker") def test_timeout_calls_record_error(self, MockBreaker, mock_safe_urlopen): """Timeout exceptions should call record_error().""" mock_breaker_instance = MockBreaker.return_value @@ -114,7 +113,7 @@ def test_timeout_calls_record_error(self, MockBreaker, mock_safe_urlopen): @with_feature("organizations:sentry-app-webhook-circuit-breaker") @override_options(CIRCUIT_BREAKER_OPTIONS) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") - @patch("sentry.utils.sentry_apps.webhooks.RateBasedCircuitBreaker") + @patch("sentry.utils.sentry_apps.webhooks.CircuitBreaker") def test_success_calls_record_success(self, MockBreaker, mock_safe_urlopen): """Successful responses should call record_success().""" mock_breaker_instance = MockBreaker.return_value From 94f985d66ef38d5e48648e54edd0e21413ddd503 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Fri, 27 Mar 2026 10:09:40 -0700 Subject: [PATCH 11/20] duplicate enum --- src/sentry/sentry_apps/metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sentry/sentry_apps/metrics.py b/src/sentry/sentry_apps/metrics.py index 0886ee7f9b2662..72eb5fea395f7a 100644 --- a/src/sentry/sentry_apps/metrics.py +++ b/src/sentry/sentry_apps/metrics.py @@ -72,7 +72,6 @@ class SentryAppWebhookHaltReason(StrEnum): CONNECTION_RESET = "connection_reset" HARD_TIMEOUT = "hard_timeout" CIRCUIT_BROKEN = "circuit_broken" - HARD_TIMEOUT = "hard_timeout" class SentryAppExternalRequestFailureReason(StrEnum): From a01f42cbed1ffd64de441c8cf1d9d88a311910d2 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Fri, 27 Mar 2026 10:24:58 -0700 Subject: [PATCH 12/20] update region silo test to cell silo --- tests/sentry/utils/sentry_apps/test_webhooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sentry/utils/sentry_apps/test_webhooks.py b/tests/sentry/utils/sentry_apps/test_webhooks.py index a7b40833309d3d..ecfab1b04a786d 100644 --- a/tests/sentry/utils/sentry_apps/test_webhooks.py +++ b/tests/sentry/utils/sentry_apps/test_webhooks.py @@ -9,7 +9,7 @@ from sentry.testutils.cases import TestCase from sentry.testutils.helpers.features import with_feature from sentry.testutils.helpers.options import override_options -from sentry.testutils.silo import region_silo_test +from sentry.testutils.silo import cell_silo_test from sentry.utils.sentry_apps.webhooks import send_and_save_webhook_request CIRCUIT_BREAKER_OPTIONS = { @@ -25,7 +25,7 @@ } -@region_silo_test +@cell_silo_test class WebhookCircuitBreakerTest(TestCase): def setUp(self): self.organization = self.create_organization() From 8901b141b7085a24dd8612899df4c520528d04b3 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Fri, 27 Mar 2026 10:26:38 -0700 Subject: [PATCH 13/20] update region silo test to cell silo --- tests/sentry/utils/sentry_apps/test_webhook_timeout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py index 145c1a7145f08d..4f82955f1141a5 100644 --- a/tests/sentry/utils/sentry_apps/test_webhook_timeout.py +++ b/tests/sentry/utils/sentry_apps/test_webhook_timeout.py @@ -12,11 +12,11 @@ from sentry.testutils.cases import TestCase from sentry.testutils.helpers.features import with_feature from sentry.testutils.helpers.options import override_options -from sentry.testutils.silo import region_silo_test +from sentry.testutils.silo import cell_silo_test from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError, send_and_save_webhook_request -@region_silo_test +@cell_silo_test class WebhookTimeoutTest(TestCase): def setUp(self): self.organization = self.create_organization() From 07d4036eee9304514e7f03b6f390401a1dd7428f Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Fri, 27 Mar 2026 10:41:03 -0700 Subject: [PATCH 14/20] test name collision --- tests/sentry/utils/sentry_apps/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/sentry/utils/sentry_apps/__init__.py diff --git a/tests/sentry/utils/sentry_apps/__init__.py b/tests/sentry/utils/sentry_apps/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 From 1126a3dca89a7fad398aaa69ffe28311eb4865a0 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Mon, 30 Mar 2026 11:28:37 -0700 Subject: [PATCH 15/20] update codeowners and defautls --- .github/CODEOWNERS | 1 + src/sentry/options/defaults.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 13781d0ce9ff22..f67c0827bf155d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -447,6 +447,7 @@ tests/sentry/api/endpoints/test_organization_attribute_mappings.py @get /src/sentry/sentry_apps/ @getsentry/product-owners-settings-integrations @getsentry/ecosystem /tests/sentry/sentry_apps/ @getsentry/product-owners-settings-integrations @getsentry/ecosystem /src/sentry/utils/sentry_apps/ @getsentry/ecosystem +/tests/sentry/utils/sentry_apps/ @getsentry/ecosystem /src/sentry/middleware/integrations/ @getsentry/ecosystem /src/sentry/api/endpoints/project_rule*.py @getsentry/alerts-notifications /src/sentry/api/serializers/models/rule.py @getsentry/alerts-notifications diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index a9938943aecea2..52a8036bec5666 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2693,7 +2693,7 @@ # When True, the circuit breaker tracks state and emits metrics but does not block requests. register( "sentry-apps.webhook.circuit-breaker.dry-run", - default=True, + default=False, flags=FLAG_AUTOMATOR_MODIFIABLE, ) From 0c029372556e472ea58ab9fe9cf5b6a3f17bc59c Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Mon, 30 Mar 2026 13:17:13 -0700 Subject: [PATCH 16/20] break out webhook circuit breaker logic to helpers --- .../exceptions/__init__.py | 2 +- src/sentry/utils/sentry_apps/webhooks.py | 140 +++++++++++------- 2 files changed, 87 insertions(+), 55 deletions(-) diff --git a/src/sentry/shared_integrations/exceptions/__init__.py b/src/sentry/shared_integrations/exceptions/__init__.py index b48955b19ec380..f65164182b6599 100644 --- a/src/sentry/shared_integrations/exceptions/__init__.py +++ b/src/sentry/shared_integrations/exceptions/__init__.py @@ -224,6 +224,6 @@ def __init__(self, field_errors: Mapping[str, Any] | None = None) -> None: class ClientError(RequestException): """4xx Error Occurred""" - def __init__(self, status_code: str, url: str, response: Response | None = None) -> None: + def __init__(self, status_code: str | int, url: str, response: Response | None = None) -> None: http_error_msg = f"{status_code} Client Error: for url: {url}" super().__init__(http_error_msg, response=response) diff --git a/src/sentry/utils/sentry_apps/webhooks.py b/src/sentry/utils/sentry_apps/webhooks.py index 263fb18d32363b..11789164865cd9 100644 --- a/src/sentry/utils/sentry_apps/webhooks.py +++ b/src/sentry/utils/sentry_apps/webhooks.py @@ -4,6 +4,7 @@ from collections.abc import Callable, Mapping from types import FrameType from typing import TYPE_CHECKING, Any, Concatenate, ParamSpec, TypeVar +from urllib.parse import urlparse import sentry_sdk from requests import RequestException, Response @@ -13,6 +14,8 @@ from sentry import features, options from sentry.exceptions import RestrictedIPAddress from sentry.http import safe_urlopen +from sentry.integrations.utils.metrics import EventLifecycle +from sentry.organizations.services.organization.model import RpcUserOrganizationContext from sentry.organizations.services.organization.service import organization_service from sentry.sentry_apps.metrics import ( SentryAppEventType, @@ -75,6 +78,79 @@ def wrapper( return wrapper +def _create_circuit_breaker( + sentry_app: SentryApp | RpcSentryApp, + organization_context: RpcUserOrganizationContext | None, +) -> CircuitBreaker | None: + if organization_context is None or not features.has( + "organizations:sentry-app-webhook-circuit-breaker", + organization_context.organization, + ): + return None + config = options.get("sentry-apps.webhook.circuit-breaker.config") + return CircuitBreaker( + key=f"sentry-app.webhook.{sentry_app.slug}", + config=config, + trip_strategy=RateBasedTripStrategy.from_config(config), + ) + + +def _circuit_breaker_allows_request( + circuit_breaker: CircuitBreaker | None, + sentry_app: SentryApp | RpcSentryApp, + org_id: int, + lifecycle: EventLifecycle, +) -> bool: + if circuit_breaker is None or circuit_breaker.should_allow_request(): + return True + + dry_run = options.get("sentry-apps.webhook.circuit-breaker.dry-run") + if dry_run: + metrics.incr( + "sentry_app.webhook.circuit_breaker.would_block", + tags={"slug": sentry_app.slug}, + ) + logger.warning( + "sentry_app.webhook.circuit_breaker.would_block", + extra={"slug": sentry_app.slug, "org_id": org_id}, + ) + return True + + lifecycle.record_halt( + halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.CIRCUIT_BROKEN}" + ) + return False + + +def _send_webhook_request( + url: str, + app_platform_event: AppPlatformEvent[T], + organization_context: RpcUserOrganizationContext | None, +) -> Response: + if organization_context is not None and features.has( + "organizations:sentry-app-webhook-hard-timeout", + organization_context.organization, + ): + # We're using a signal based timeout here because we need to interrupt the blocking + # socket.connect() operation. See SENTRY-5HA6 for more context. Here we're hanging at + # the socket.connect() call and the timeout we set in safe_urlopen is not being respected. + timeout_seconds = options.get("sentry-apps.webhook.hard-timeout.sec") + with timeout_alarm(timeout_seconds, _handle_webhook_timeout): + return safe_urlopen( + url=url, + data=app_platform_event.body, + headers=app_platform_event.headers, + timeout=options.get("sentry-apps.webhook.timeout.sec"), + ) + + return safe_urlopen( + url=url, + data=app_platform_event.body, + headers=app_platform_event.headers, + timeout=options.get("sentry-apps.webhook.timeout.sec"), + ) + + @sentry_sdk.trace(name="send_and_save_webhook_request") @ignore_unpublished_app_errors def send_and_save_webhook_request( @@ -126,59 +202,12 @@ def send_and_save_webhook_request( include_projects=False, include_teams=False, ) - - # Circuit breaker gate — keyed per app slug - circuit_breaker: CircuitBreaker | None = None - if organization_context is not None and features.has( - "organizations:sentry-app-webhook-circuit-breaker", - organization_context.organization, - ): - config = options.get("sentry-apps.webhook.circuit-breaker.config") - circuit_breaker = CircuitBreaker( - key=f"sentry-app.webhook.{sentry_app.slug}", - config=config, - trip_strategy=RateBasedTripStrategy.from_config(config), - ) - if not circuit_breaker.should_allow_request(): - dry_run = options.get("sentry-apps.webhook.circuit-breaker.dry-run") - if dry_run: - metrics.incr( - "sentry_app.webhook.circuit_breaker.would_block", - tags={"slug": sentry_app.slug}, - ) - logger.warning( - "sentry_app.webhook.circuit_breaker.would_block", - extra={"slug": sentry_app.slug, "org_id": org_id}, - ) - else: - lifecycle.record_halt( - halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.CIRCUIT_BROKEN}" - ) - return Response() + circuit_breaker = _create_circuit_breaker(sentry_app, organization_context) + if not _circuit_breaker_allows_request(circuit_breaker, sentry_app, org_id, lifecycle): + return Response() with circuit_breaker_tracking(circuit_breaker): - if organization_context is not None and features.has( - "organizations:sentry-app-webhook-hard-timeout", - organization_context.organization, - ): - # We're using a signal based timeout here because we need to interrupt the blocking socket.connect() opeartion. - # See SENTRY-5HA6 for more context. Here we're hanging at the socket.connect() call and the timeout we set - # in safe_urlopen is not being respected. - timeout_seconds = options.get("sentry-apps.webhook.hard-timeout.sec") - with timeout_alarm(timeout_seconds, _handle_webhook_timeout): - response = safe_urlopen( - url=url, - data=app_platform_event.body, - headers=app_platform_event.headers, - timeout=options.get("sentry-apps.webhook.timeout.sec"), - ) - else: - response = safe_urlopen( - url=url, - data=app_platform_event.body, - headers=app_platform_event.headers, - timeout=options.get("sentry-apps.webhook.timeout.sec"), - ) + response = _send_webhook_request(url, app_platform_event, organization_context) except WebhookTimeoutError: lifecycle.record_halt( @@ -219,13 +248,14 @@ def send_and_save_webhook_request( raise track_response_code(response.status_code, slug, event) + project_id = int(p) if (p := response.headers.get("Sentry-Hook-Project")) else None buffer.add_request( response_code=response.status_code, org_id=org_id, event=event, url=url, error_id=response.headers.get("Sentry-Hook-Error"), - project_id=response.headers.get("Sentry-Hook-Project"), + project_id=project_id, response=response, headers=app_platform_event.headers, ) @@ -256,13 +286,15 @@ def send_and_save_webhook_request( lifecycle.record_halt( halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}" ) - raise ApiHostError.from_request(response.request) + raise ApiHostError(f"Unable to reach host: {urlparse(url).netloc}", url=url) elif response.status_code == status.HTTP_504_GATEWAY_TIMEOUT: lifecycle.record_halt( halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}" ) - raise ApiTimeoutError.from_request(response.request) + raise ApiTimeoutError( + f"Timed out attempting to reach host: {urlparse(url).netloc}", url=url + ) elif 400 <= response.status_code < 500: lifecycle.record_halt( From 000127dba88b00b3b4f0290442b928ef7a644287 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Mon, 30 Mar 2026 13:49:01 -0700 Subject: [PATCH 17/20] fix typing and metric key --- src/sentry/utils/sentry_apps/request_buffer.py | 2 +- src/sentry/utils/sentry_apps/webhooks.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sentry/utils/sentry_apps/request_buffer.py b/src/sentry/utils/sentry_apps/request_buffer.py index 848c11a9260fff..39d38d63691209 100644 --- a/src/sentry/utils/sentry_apps/request_buffer.py +++ b/src/sentry/utils/sentry_apps/request_buffer.py @@ -147,7 +147,7 @@ def add_request( event: str, url: str, error_id: str | None = None, - project_id: int | None = None, + project_id: str | None = None, response: Response | None = None, headers: Mapping[str, str] | None = None, ) -> None: diff --git a/src/sentry/utils/sentry_apps/webhooks.py b/src/sentry/utils/sentry_apps/webhooks.py index 11789164865cd9..54692a7aaf76d7 100644 --- a/src/sentry/utils/sentry_apps/webhooks.py +++ b/src/sentry/utils/sentry_apps/webhooks.py @@ -89,6 +89,7 @@ def _create_circuit_breaker( return None config = options.get("sentry-apps.webhook.circuit-breaker.config") return CircuitBreaker( + metrics_key="sentry-app.webhook", key=f"sentry-app.webhook.{sentry_app.slug}", config=config, trip_strategy=RateBasedTripStrategy.from_config(config), @@ -248,14 +249,13 @@ def send_and_save_webhook_request( raise track_response_code(response.status_code, slug, event) - project_id = int(p) if (p := response.headers.get("Sentry-Hook-Project")) else None buffer.add_request( response_code=response.status_code, org_id=org_id, event=event, url=url, error_id=response.headers.get("Sentry-Hook-Error"), - project_id=project_id, + project_id=response.headers.get("Sentry-Hook-Project"), response=response, headers=app_platform_event.headers, ) From 26eee2c5995d4dbcd1d718201aa03c61b5ee730f Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Mon, 30 Mar 2026 14:20:52 -0700 Subject: [PATCH 18/20] circuit breaker comment and test fixes --- src/sentry/options/defaults.py | 1 + .../utils/sentry_apps/circuit_breaker.py | 23 ++++++++++--- src/sentry/utils/sentry_apps/webhooks.py | 1 - .../sentry/utils/sentry_apps/test_webhooks.py | 32 +++++++++++++++++++ 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 52a8036bec5666..56ff49a1d5b7e4 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2686,6 +2686,7 @@ "broken_state_duration": 300, # 5 minutes "threshold": 0.5, # 50% error rate "floor": 500, # 500 errors before error rate check applies + "metrics_key": "sentry-app.webhook", # to avoid high cardinality slug tag }, flags=FLAG_AUTOMATOR_MODIFIABLE, ) diff --git a/src/sentry/utils/sentry_apps/circuit_breaker.py b/src/sentry/utils/sentry_apps/circuit_breaker.py index 0d17971d94bff7..a69f291dd229f6 100644 --- a/src/sentry/utils/sentry_apps/circuit_breaker.py +++ b/src/sentry/utils/sentry_apps/circuit_breaker.py @@ -1,14 +1,19 @@ +import logging from collections.abc import Generator from contextlib import contextmanager from sentry.utils.circuit_breaker2 import CircuitBreaker +logger = logging.getLogger("sentry.sentry_apps.circuit_breaker") + @contextmanager def circuit_breaker_tracking( breaker: CircuitBreaker | None, ) -> Generator[None]: - """Track request outcome: record_error on Exception, record_success on normal exit. + from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError + + """Track request outcome: record_error on WebhookTimeoutError, record_success on normal exit. Handles the None case as a no-op so callers don't need nullcontext(). """ @@ -17,8 +22,18 @@ def circuit_breaker_tracking( return try: yield - except Exception: - breaker.record_error() + + # Currently we only count WebhookTimeoutError as an error in the circuit breaker as those operations are the ones that are taking too long + # If an app returns a say 500, in a reasonable time that's okay + except WebhookTimeoutError: + # This is gross but we don't want to propogate a redis or circuit breaker error to the webhook code + try: + breaker.record_error() + except Exception: + logger.exception("sentry_apps.circuit_breaker.record_error.failure") raise else: - breaker.record_success() + try: + breaker.record_success() + except Exception: + logger.exception("sentry_apps.circuit_breaker.record_success.failure") diff --git a/src/sentry/utils/sentry_apps/webhooks.py b/src/sentry/utils/sentry_apps/webhooks.py index 54692a7aaf76d7..c35ec917f39606 100644 --- a/src/sentry/utils/sentry_apps/webhooks.py +++ b/src/sentry/utils/sentry_apps/webhooks.py @@ -89,7 +89,6 @@ def _create_circuit_breaker( return None config = options.get("sentry-apps.webhook.circuit-breaker.config") return CircuitBreaker( - metrics_key="sentry-app.webhook", key=f"sentry-app.webhook.{sentry_app.slug}", config=config, trip_strategy=RateBasedTripStrategy.from_config(config), diff --git a/tests/sentry/utils/sentry_apps/test_webhooks.py b/tests/sentry/utils/sentry_apps/test_webhooks.py index ecfab1b04a786d..de368b20cce22c 100644 --- a/tests/sentry/utils/sentry_apps/test_webhooks.py +++ b/tests/sentry/utils/sentry_apps/test_webhooks.py @@ -1,3 +1,4 @@ +from collections import namedtuple from unittest.mock import Mock, patch import pytest @@ -6,12 +7,24 @@ from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent from sentry.sentry_apps.utils.webhooks import IssueActionType, SentryAppResourceType +from sentry.shared_integrations.exceptions import ApiHostError from sentry.testutils.cases import TestCase from sentry.testutils.helpers.features import with_feature from sentry.testutils.helpers.options import override_options from sentry.testutils.silo import cell_silo_test +from sentry.utils.circuit_breaker2 import CircuitBreaker from sentry.utils.sentry_apps.webhooks import send_and_save_webhook_request + +def _raise_status_false() -> bool: + return False + + +_MockResponse = namedtuple( + "_MockResponse", + ["headers", "content", "text", "ok", "status_code", "raise_for_status", "request"], +) + CIRCUIT_BREAKER_OPTIONS = { "sentry-apps.webhook.circuit-breaker.config": { "error_limit_window": 600, @@ -126,3 +139,22 @@ def test_success_calls_record_success(self, MockBreaker, mock_safe_urlopen): send_and_save_webhook_request(self.sentry_app, self._make_event()) mock_breaker_instance.record_success.assert_called_once() + + @with_feature("organizations:sentry-app-webhook-circuit-breaker") + @override_options(CIRCUIT_BREAKER_OPTIONS) + @patch.object(CircuitBreaker, "record_success") + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + def test_http_error_response_records_success_and_raises( + self, mock_safe_urlopen, mock_record_success + ): + """When the circuit breaker allows a request but the response is an HTTP error, + the breaker records success (the connection completed) and the normal error + handling still raises the appropriate exception.""" + mock_safe_urlopen.return_value = _MockResponse( + {}, '{"error": "service unavailable"}', "", False, 503, _raise_status_false, None + ) + + with pytest.raises(ApiHostError): + send_and_save_webhook_request(self.sentry_app, self._make_event()) + + mock_record_success.assert_called_once() From 3688925c42f39b072bd5e7dd1430ccdc50694ef7 Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Mon, 30 Mar 2026 17:41:19 -0700 Subject: [PATCH 19/20] gosh this code is despairge --- .../utils/sentry_apps/circuit_breaker.py | 4 +-- .../utils/sentry_apps/request_buffer.py | 2 +- src/sentry/utils/sentry_apps/webhooks.py | 8 +++++- .../sentry_apps/tasks/test_sentry_apps.py | 2 +- .../sentry/utils/sentry_apps/test_webhooks.py | 27 ++++++++++++++++--- 5 files changed, 34 insertions(+), 9 deletions(-) diff --git a/src/sentry/utils/sentry_apps/circuit_breaker.py b/src/sentry/utils/sentry_apps/circuit_breaker.py index a69f291dd229f6..4d65e411fefe37 100644 --- a/src/sentry/utils/sentry_apps/circuit_breaker.py +++ b/src/sentry/utils/sentry_apps/circuit_breaker.py @@ -11,12 +11,12 @@ def circuit_breaker_tracking( breaker: CircuitBreaker | None, ) -> Generator[None]: - from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError - """Track request outcome: record_error on WebhookTimeoutError, record_success on normal exit. Handles the None case as a no-op so callers don't need nullcontext(). """ + from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError + if breaker is None: yield return diff --git a/src/sentry/utils/sentry_apps/request_buffer.py b/src/sentry/utils/sentry_apps/request_buffer.py index 39d38d63691209..848c11a9260fff 100644 --- a/src/sentry/utils/sentry_apps/request_buffer.py +++ b/src/sentry/utils/sentry_apps/request_buffer.py @@ -147,7 +147,7 @@ def add_request( event: str, url: str, error_id: str | None = None, - project_id: str | None = None, + project_id: int | None = None, response: Response | None = None, headers: Mapping[str, str] | None = None, ) -> None: diff --git a/src/sentry/utils/sentry_apps/webhooks.py b/src/sentry/utils/sentry_apps/webhooks.py index c35ec917f39606..7c5380f75f4206 100644 --- a/src/sentry/utils/sentry_apps/webhooks.py +++ b/src/sentry/utils/sentry_apps/webhooks.py @@ -248,13 +248,19 @@ def send_and_save_webhook_request( raise track_response_code(response.status_code, slug, event) + + project_id = ( + int(p_id) + if (p_id := response.headers.get("Sentry-Hook-Project")) and p_id.isdigit() + else None + ) buffer.add_request( response_code=response.status_code, org_id=org_id, event=event, url=url, error_id=response.headers.get("Sentry-Hook-Error"), - project_id=response.headers.get("Sentry-Hook-Project"), + project_id=project_id, response=response, headers=app_platform_event.headers, ) diff --git a/tests/sentry/sentry_apps/tasks/test_sentry_apps.py b/tests/sentry/sentry_apps/tasks/test_sentry_apps.py index e497c145d39d9d..e390eb415f8000 100644 --- a/tests/sentry/sentry_apps/tasks/test_sentry_apps.py +++ b/tests/sentry/sentry_apps/tasks/test_sentry_apps.py @@ -1713,7 +1713,7 @@ def test_saves_error_event_id_if_in_header(self, safe_urlopen: MagicMock) -> Non assert first_request["event_type"] == "issue.assigned" assert first_request["organization_id"] == self.install.organization_id assert first_request["error_id"] == "d5111da2c28645c5889d072017e3445d" - assert first_request["project_id"] == "1" + assert first_request["project_id"] == 1 @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen", return_value=MockResponseInstance) diff --git a/tests/sentry/utils/sentry_apps/test_webhooks.py b/tests/sentry/utils/sentry_apps/test_webhooks.py index de368b20cce22c..5e5cf54b47e7f2 100644 --- a/tests/sentry/utils/sentry_apps/test_webhooks.py +++ b/tests/sentry/utils/sentry_apps/test_webhooks.py @@ -13,7 +13,7 @@ from sentry.testutils.helpers.options import override_options from sentry.testutils.silo import cell_silo_test from sentry.utils.circuit_breaker2 import CircuitBreaker -from sentry.utils.sentry_apps.webhooks import send_and_save_webhook_request +from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError, send_and_save_webhook_request def _raise_status_false() -> bool: @@ -112,8 +112,26 @@ def test_blocking_mode_returns_empty_response(self, MockBreaker, mock_safe_urlop @override_options(CIRCUIT_BREAKER_OPTIONS) @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") @patch("sentry.utils.sentry_apps.webhooks.CircuitBreaker") - def test_timeout_calls_record_error(self, MockBreaker, mock_safe_urlopen): - """Timeout exceptions should call record_error().""" + def test_hard_timeout_calls_record_error(self, MockBreaker, mock_safe_urlopen): + """WebhookTimeoutError (hard timeout) should call record_error() on the circuit breaker.""" + mock_breaker_instance = MockBreaker.return_value + mock_breaker_instance.should_allow_request.return_value = True + mock_safe_urlopen.side_effect = WebhookTimeoutError() + + with pytest.raises(WebhookTimeoutError): + send_and_save_webhook_request(self.sentry_app, self._make_event()) + + mock_breaker_instance.record_error.assert_called_once() + mock_breaker_instance.record_success.assert_not_called() + + @with_feature("organizations:sentry-app-webhook-circuit-breaker") + @override_options(CIRCUIT_BREAKER_OPTIONS) + @patch("sentry.utils.sentry_apps.webhooks.safe_urlopen") + @patch("sentry.utils.sentry_apps.webhooks.CircuitBreaker") + def test_timeout_does_not_record_error(self, MockBreaker, mock_safe_urlopen): + """Regular Timeout exceptions are not recorded as circuit breaker errors — only + WebhookTimeoutError (hard timeout) is. A fast network timeout still counts as + a completed attempt from the breaker's perspective.""" mock_breaker_instance = MockBreaker.return_value mock_breaker_instance.should_allow_request.return_value = True mock_safe_urlopen.side_effect = Timeout() @@ -121,7 +139,8 @@ def test_timeout_calls_record_error(self, MockBreaker, mock_safe_urlopen): with pytest.raises(Timeout): send_and_save_webhook_request(self.sentry_app, self._make_event()) - mock_breaker_instance.record_error.assert_called_once() + mock_breaker_instance.record_error.assert_not_called() + mock_breaker_instance.record_success.assert_not_called() @with_feature("organizations:sentry-app-webhook-circuit-breaker") @override_options(CIRCUIT_BREAKER_OPTIONS) From c6818590977df7f6d2867b8665c6b3a9d61ddbdf Mon Sep 17 00:00:00 2001 From: Christinarlong Date: Tue, 31 Mar 2026 10:32:58 -0700 Subject: [PATCH 20/20] address pr nits --- src/sentry/options/defaults.py | 1 + src/sentry/utils/sentry_apps/circuit_breaker.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 56ff49a1d5b7e4..465997db2cf145 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2694,6 +2694,7 @@ # When True, the circuit breaker tracks state and emits metrics but does not block requests. register( "sentry-apps.webhook.circuit-breaker.dry-run", + type=Bool, default=False, flags=FLAG_AUTOMATOR_MODIFIABLE, ) diff --git a/src/sentry/utils/sentry_apps/circuit_breaker.py b/src/sentry/utils/sentry_apps/circuit_breaker.py index 4d65e411fefe37..34dbd8591a475a 100644 --- a/src/sentry/utils/sentry_apps/circuit_breaker.py +++ b/src/sentry/utils/sentry_apps/circuit_breaker.py @@ -26,7 +26,7 @@ def circuit_breaker_tracking( # Currently we only count WebhookTimeoutError as an error in the circuit breaker as those operations are the ones that are taking too long # If an app returns a say 500, in a reasonable time that's okay except WebhookTimeoutError: - # This is gross but we don't want to propogate a redis or circuit breaker error to the webhook code + # This is gross but we don't want to propagate a redis or circuit breaker error to the webhook code try: breaker.record_error() except Exception: