Skip to content

Commit bb1eb15

Browse files
feat(sentry apps): Add circuit breaker into webhook code (#111723)
1 parent 9b4d18d commit bb1eb15

11 files changed

Lines changed: 412 additions & 34 deletions

File tree

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,7 @@ tests/sentry/api/endpoints/test_organization_attribute_mappings.py @get
448448
/src/sentry/sentry_apps/ @getsentry/product-owners-settings-integrations @getsentry/ecosystem
449449
/tests/sentry/sentry_apps/ @getsentry/product-owners-settings-integrations @getsentry/ecosystem
450450
/src/sentry/utils/sentry_apps/ @getsentry/ecosystem
451+
/tests/sentry/utils/sentry_apps/ @getsentry/ecosystem
451452
/src/sentry/middleware/integrations/ @getsentry/ecosystem
452453
/src/sentry/api/endpoints/project_rule*.py @getsentry/alerts-notifications
453454
/src/sentry/api/serializers/models/rule.py @getsentry/alerts-notifications

src/sentry/features/temporary.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,8 @@ def register_temporary_features(manager: FeatureManager) -> None:
492492
manager.add("organizations:conduit-demo", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
493493
# Enable hard timeout alarm for webhooks
494494
manager.add("organizations:sentry-app-webhook-hard-timeout", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)
495+
# Enable circuit breaker for webhook endpoint failure detection
496+
manager.add("organizations:sentry-app-webhook-circuit-breaker", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)
495497

496498
# Enables organization access to the new notification platform
497499
manager.add("organizations:notification-platform.internal-testing", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)

src/sentry/options/defaults.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2597,6 +2597,29 @@
25972597
flags=FLAG_AUTOMATOR_MODIFIABLE,
25982598
)
25992599

2600+
# Circuit breaker configuration for webhook endpoint failure detection.
2601+
# Keys match RateBasedTripStrategyConfig + CircuitBreakerConfig
2602+
register(
2603+
"sentry-apps.webhook.circuit-breaker.config",
2604+
type=Dict,
2605+
default={
2606+
"error_limit_window": 600, # 10 minutes
2607+
"broken_state_duration": 300, # 5 minutes
2608+
"threshold": 0.5, # 50% error rate
2609+
"floor": 500, # 500 errors before error rate check applies
2610+
"metrics_key": "sentry-app.webhook", # to avoid high cardinality slug tag
2611+
},
2612+
flags=FLAG_AUTOMATOR_MODIFIABLE,
2613+
)
2614+
2615+
# When True, the circuit breaker tracks state and emits metrics but does not block requests.
2616+
register(
2617+
"sentry-apps.webhook.circuit-breaker.dry-run",
2618+
type=Bool,
2619+
default=False,
2620+
flags=FLAG_AUTOMATOR_MODIFIABLE,
2621+
)
2622+
26002623
# Enables statistical detectors for a project
26012624
register(
26022625
"statistical_detectors.enable",

src/sentry/sentry_apps/metrics.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ class SentryAppWebhookHaltReason(StrEnum):
7171
RESTRICTED_IP = "restricted_ip"
7272
CONNECTION_RESET = "connection_reset"
7373
HARD_TIMEOUT = "hard_timeout"
74+
CIRCUIT_BROKEN = "circuit_broken"
7475

7576

7677
class SentryAppExternalRequestFailureReason(StrEnum):

src/sentry/shared_integrations/exceptions/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,6 @@ def __init__(self, field_errors: Mapping[str, Any] | None = None) -> None:
224224
class ClientError(RequestException):
225225
"""4xx Error Occurred"""
226226

227-
def __init__(self, status_code: str, url: str, response: Response | None = None) -> None:
227+
def __init__(self, status_code: str | int, url: str, response: Response | None = None) -> None:
228228
http_error_msg = f"{status_code} Client Error: for url: {url}"
229229
super().__init__(http_error_msg, response=response)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import logging
2+
from collections.abc import Generator
3+
from contextlib import contextmanager
4+
5+
from sentry.utils.circuit_breaker2 import CircuitBreaker
6+
7+
logger = logging.getLogger("sentry.sentry_apps.circuit_breaker")
8+
9+
10+
@contextmanager
11+
def circuit_breaker_tracking(
12+
breaker: CircuitBreaker | None,
13+
) -> Generator[None]:
14+
"""Track request outcome: record_error on WebhookTimeoutError, record_success on normal exit.
15+
16+
Handles the None case as a no-op so callers don't need nullcontext().
17+
"""
18+
from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError
19+
20+
if breaker is None:
21+
yield
22+
return
23+
try:
24+
yield
25+
26+
# Currently we only count WebhookTimeoutError as an error in the circuit breaker as those operations are the ones that are taking too long
27+
# If an app returns a say 500, in a reasonable time that's okay
28+
except WebhookTimeoutError:
29+
# This is gross but we don't want to propagate a redis or circuit breaker error to the webhook code
30+
try:
31+
breaker.record_error()
32+
except Exception:
33+
logger.exception("sentry_apps.circuit_breaker.record_error.failure")
34+
raise
35+
else:
36+
try:
37+
breaker.record_success()
38+
except Exception:
39+
logger.exception("sentry_apps.circuit_breaker.record_success.failure")

src/sentry/utils/sentry_apps/webhooks.py

Lines changed: 97 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from collections.abc import Callable, Mapping
55
from types import FrameType
66
from typing import TYPE_CHECKING, Any, Concatenate, ParamSpec, TypeVar
7+
from urllib.parse import urlparse
78

89
import sentry_sdk
910
from requests import RequestException, Response
@@ -13,6 +14,8 @@
1314
from sentry import features, options
1415
from sentry.exceptions import RestrictedIPAddress
1516
from sentry.http import safe_urlopen
17+
from sentry.integrations.utils.metrics import EventLifecycle
18+
from sentry.organizations.services.organization.model import RpcUserOrganizationContext
1619
from sentry.organizations.services.organization.service import organization_service
1720
from sentry.sentry_apps.metrics import (
1821
SentryAppEventType,
@@ -23,7 +26,10 @@
2326
from sentry.sentry_apps.utils.errors import SentryAppSentryError
2427
from sentry.shared_integrations.exceptions import ApiHostError, ApiTimeoutError, ClientError
2528
from sentry.taskworker.timeout import timeout_alarm
29+
from sentry.utils import metrics
30+
from sentry.utils.circuit_breaker2 import CircuitBreaker, RateBasedTripStrategy
2631
from sentry.utils.sentry_apps import SentryAppWebhookRequestsBuffer
32+
from sentry.utils.sentry_apps.circuit_breaker import circuit_breaker_tracking
2733

2834
if TYPE_CHECKING:
2935
from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent
@@ -51,7 +57,6 @@ def _handle_webhook_timeout(signum: int, frame: FrameType | None) -> None:
5157
"""Handler for when a webhook request exceeds the hard timeout deadline.
5258
- This is a workaround for safe_create_connection sockets hanging when the given url
5359
cannot be reached or resolved.
54-
- TODO(christinarlong): Add sentry app disabling logic here
5560
"""
5661
raise WebhookTimeoutError("Webhook request exceeded hard timeout deadline")
5762

@@ -73,6 +78,79 @@ def wrapper(
7378
return wrapper
7479

7580

81+
def _create_circuit_breaker(
82+
sentry_app: SentryApp | RpcSentryApp,
83+
organization_context: RpcUserOrganizationContext | None,
84+
) -> CircuitBreaker | None:
85+
if organization_context is None or not features.has(
86+
"organizations:sentry-app-webhook-circuit-breaker",
87+
organization_context.organization,
88+
):
89+
return None
90+
config = options.get("sentry-apps.webhook.circuit-breaker.config")
91+
return CircuitBreaker(
92+
key=f"sentry-app.webhook.{sentry_app.slug}",
93+
config=config,
94+
trip_strategy=RateBasedTripStrategy.from_config(config),
95+
)
96+
97+
98+
def _circuit_breaker_allows_request(
99+
circuit_breaker: CircuitBreaker | None,
100+
sentry_app: SentryApp | RpcSentryApp,
101+
org_id: int,
102+
lifecycle: EventLifecycle,
103+
) -> bool:
104+
if circuit_breaker is None or circuit_breaker.should_allow_request():
105+
return True
106+
107+
dry_run = options.get("sentry-apps.webhook.circuit-breaker.dry-run")
108+
if dry_run:
109+
metrics.incr(
110+
"sentry_app.webhook.circuit_breaker.would_block",
111+
tags={"slug": sentry_app.slug},
112+
)
113+
logger.warning(
114+
"sentry_app.webhook.circuit_breaker.would_block",
115+
extra={"slug": sentry_app.slug, "org_id": org_id},
116+
)
117+
return True
118+
119+
lifecycle.record_halt(
120+
halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.CIRCUIT_BROKEN}"
121+
)
122+
return False
123+
124+
125+
def _send_webhook_request(
126+
url: str,
127+
app_platform_event: AppPlatformEvent[T],
128+
organization_context: RpcUserOrganizationContext | None,
129+
) -> Response:
130+
if organization_context is not None and features.has(
131+
"organizations:sentry-app-webhook-hard-timeout",
132+
organization_context.organization,
133+
):
134+
# We're using a signal based timeout here because we need to interrupt the blocking
135+
# socket.connect() operation. See SENTRY-5HA6 for more context. Here we're hanging at
136+
# the socket.connect() call and the timeout we set in safe_urlopen is not being respected.
137+
timeout_seconds = options.get("sentry-apps.webhook.hard-timeout.sec")
138+
with timeout_alarm(timeout_seconds, _handle_webhook_timeout):
139+
return safe_urlopen(
140+
url=url,
141+
data=app_platform_event.body,
142+
headers=app_platform_event.headers,
143+
timeout=options.get("sentry-apps.webhook.timeout.sec"),
144+
)
145+
146+
return safe_urlopen(
147+
url=url,
148+
data=app_platform_event.body,
149+
headers=app_platform_event.headers,
150+
timeout=options.get("sentry-apps.webhook.timeout.sec"),
151+
)
152+
153+
76154
@sentry_sdk.trace(name="send_and_save_webhook_request")
77155
@ignore_unpublished_app_errors
78156
def send_and_save_webhook_request(
@@ -124,28 +202,12 @@ def send_and_save_webhook_request(
124202
include_projects=False,
125203
include_teams=False,
126204
)
127-
if organization_context is not None and features.has(
128-
"organizations:sentry-app-webhook-hard-timeout",
129-
organization_context.organization,
130-
):
131-
# We're using a signal based timeout here because we need to interrupt the blocking socket.connect() opeartion.
132-
# See SENTRY-5HA6 for more context. Here we're hanging at the socket.connect() call and the timeout we set
133-
# in safe_urlopen is not being respected.
134-
timeout_seconds = options.get("sentry-apps.webhook.hard-timeout.sec")
135-
with timeout_alarm(timeout_seconds, _handle_webhook_timeout):
136-
response = safe_urlopen(
137-
url=url,
138-
data=app_platform_event.body,
139-
headers=app_platform_event.headers,
140-
timeout=options.get("sentry-apps.webhook.timeout.sec"),
141-
)
142-
else:
143-
response = safe_urlopen(
144-
url=url,
145-
data=app_platform_event.body,
146-
headers=app_platform_event.headers,
147-
timeout=options.get("sentry-apps.webhook.timeout.sec"),
148-
)
205+
circuit_breaker = _create_circuit_breaker(sentry_app, organization_context)
206+
if not _circuit_breaker_allows_request(circuit_breaker, sentry_app, org_id, lifecycle):
207+
return Response()
208+
209+
with circuit_breaker_tracking(circuit_breaker):
210+
response = _send_webhook_request(url, app_platform_event, organization_context)
149211

150212
except WebhookTimeoutError:
151213
lifecycle.record_halt(
@@ -186,13 +248,19 @@ def send_and_save_webhook_request(
186248
raise
187249

188250
track_response_code(response.status_code, slug, event)
251+
252+
project_id = (
253+
int(p_id)
254+
if (p_id := response.headers.get("Sentry-Hook-Project")) and p_id.isdigit()
255+
else None
256+
)
189257
buffer.add_request(
190258
response_code=response.status_code,
191259
org_id=org_id,
192260
event=event,
193261
url=url,
194262
error_id=response.headers.get("Sentry-Hook-Error"),
195-
project_id=response.headers.get("Sentry-Hook-Project"),
263+
project_id=project_id,
196264
response=response,
197265
headers=app_platform_event.headers,
198266
)
@@ -223,13 +291,15 @@ def send_and_save_webhook_request(
223291
lifecycle.record_halt(
224292
halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}"
225293
)
226-
raise ApiHostError.from_request(response.request)
294+
raise ApiHostError(f"Unable to reach host: {urlparse(url).netloc}", url=url)
227295

228296
elif response.status_code == status.HTTP_504_GATEWAY_TIMEOUT:
229297
lifecycle.record_halt(
230298
halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}"
231299
)
232-
raise ApiTimeoutError.from_request(response.request)
300+
raise ApiTimeoutError(
301+
f"Timed out attempting to reach host: {urlparse(url).netloc}", url=url
302+
)
233303

234304
elif 400 <= response.status_code < 500:
235305
lifecycle.record_halt(
@@ -243,4 +313,5 @@ def send_and_save_webhook_request(
243313
except RequestException as e:
244314
lifecycle.record_halt(e)
245315
raise
316+
246317
return response

tests/sentry/sentry_apps/tasks/test_sentry_apps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1713,7 +1713,7 @@ def test_saves_error_event_id_if_in_header(self, safe_urlopen: MagicMock) -> Non
17131713
assert first_request["event_type"] == "issue.assigned"
17141714
assert first_request["organization_id"] == self.install.organization_id
17151715
assert first_request["error_id"] == "d5111da2c28645c5889d072017e3445d"
1716-
assert first_request["project_id"] == "1"
1716+
assert first_request["project_id"] == 1
17171717

17181718

17191719
@patch("sentry.utils.sentry_apps.webhooks.safe_urlopen", return_value=MockResponseInstance)

tests/sentry/utils/sentry_apps/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)