Skip to content

Commit 27a8018

Browse files
authored
fire zenduty alert when check fails 5 times in 5 mins (#72)
* fire zenduty alert when check fails 5 times in 5 mins * fix resolution expression * rework the alert status logic a bit so resolution can occur if the check stops failing * linting
1 parent 5cd502f commit 27a8018

File tree

2 files changed

+74
-46
lines changed

2 files changed

+74
-46
lines changed

pyth_observer/dispatch.py

Lines changed: 73 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -74,61 +74,30 @@ async def run(self, states: List[State]):
7474
network=self.config["network"]["name"], publishers=self.publishers
7575
)
7676

77+
current_time = datetime.now()
7778
for check in failed_checks:
7879
for event_type in self.config["events"]:
7980
event: Event = globals()[event_type](check, context)
8081

8182
if event_type == "ZendutyEvent":
82-
# Add failed check to open alerts
83-
alert_identifier = (
84-
f"{check.__class__.__name__}-{check.state().symbol}"
85-
)
86-
state = check.state()
87-
if isinstance(state, PublisherState):
88-
alert_identifier += f"-{state.publisher_name}"
89-
try:
90-
failures = self.open_alerts[alert_identifier]["failures"] + 1
91-
except KeyError:
92-
failures = 1
93-
self.open_alerts[alert_identifier] = {
94-
"last_failure": datetime.now().isoformat(),
95-
"failures": failures,
96-
}
97-
# store the event to send it later if it fails multiple times
83+
alert_identifier = self.generate_alert_identifier(check)
84+
alert = self.open_alerts.get(alert_identifier)
85+
if alert is None:
86+
self.open_alerts[alert_identifier] = {
87+
"window_start": current_time.isoformat(),
88+
"failures": 1,
89+
"last_window_failures": None,
90+
"sent": False,
91+
}
92+
else:
93+
alert["failures"] += 1
9894
self.zenduty_events[alert_identifier] = event
99-
continue # do not immediately send a zenduty alert
95+
continue # Skip sending immediately for ZendutyEvent
10096

10197
sent_events.append(event.send())
10298

10399
await asyncio.gather(*sent_events)
104-
105-
# Check open alerts for zenduty
106-
if "ZendutyEvent" in self.config["events"]:
107-
108-
to_remove = []
109-
current_time = datetime.now()
110-
for identifier, info in self.open_alerts.items():
111-
# Resolve the alert if it last failed > 2 minutes ago
112-
if current_time - datetime.fromisoformat(
113-
info["last_failure"]
114-
) >= timedelta(minutes=2):
115-
logger.debug(f"Resolving Zenduty alert {identifier}")
116-
response = await send_zenduty_alert(
117-
alert_identifier=identifier, message=identifier, resolved=True
118-
)
119-
if response and 200 <= response.status < 300:
120-
to_remove.append(identifier)
121-
elif info["failures"] > 2:
122-
# Raise alert if the check has failed more than twice before self-resolving
123-
await self.zenduty_events[identifier].send()
124-
125-
for identifier in to_remove:
126-
del self.open_alerts[identifier]
127-
del self.zenduty_events[identifier]
128-
129-
# Write open alerts to file to ensure persistence
130-
with open(self.open_alerts_file, "w") as file:
131-
json.dump(self.open_alerts, file)
100+
await self.process_zenduty_events(current_time)
132101

133102
def check_price_feed(self, state: PriceFeedState) -> List[Check]:
134103
failed_checks: List[Check] = []
@@ -179,3 +148,62 @@ def load_config(self, check_name: str, symbol: str) -> Dict[str, Any]:
179148
config |= self.config["checks"][symbol][check_name]
180149

181150
return config
151+
152+
# Zenduty Functions
153+
def generate_alert_identifier(self, check):
154+
alert_identifier = f"{check.__class__.__name__}-{check.state().symbol}"
155+
state = check.state()
156+
if isinstance(state, PublisherState):
157+
alert_identifier += f"-{state.publisher_name}"
158+
return alert_identifier
159+
160+
def check_zd_alert_status(self, alert_identifier, current_time):
    """Roll an open alert's failure window over once it is 5 minutes old.

    When the window that started at ``window_start`` is at least five
    minutes old, the current failure count is moved into
    ``last_window_failures``, ``failures`` is reset to 0 and a new window
    starts at ``current_time``. Unknown identifiers are ignored.

    Args:
        alert_identifier: Key of the alert in ``self.open_alerts``.
        current_time: ``datetime`` to measure the window against; it is
            compared with the ISO-formatted ``window_start`` string.

    Returns:
        None. The alert dict is updated in place.
    """
    alert = self.open_alerts.get(alert_identifier)
    if alert is None:
        return

    window_start = alert.get("window_start")
    if window_start is None:
        # The entry has no window yet -- presumably one persisted with the
        # earlier {"last_failure", "failures"} layout. Start a fresh window
        # and fill in the remaining fields rather than raising KeyError.
        alert["window_start"] = current_time.isoformat()
        alert.setdefault("failures", 0)
        alert.setdefault("last_window_failures", None)
        alert.setdefault("sent", False)
        return

    window_age = current_time - datetime.fromisoformat(window_start)
    if window_age >= timedelta(minutes=5):
        # Close the window: remember its failure count and start counting
        # again from zero.
        alert["window_start"] = current_time.isoformat()
        alert["last_window_failures"] = alert["failures"]
        alert["failures"] = 0
170+
171+
async def process_zenduty_events(self, current_time):
    """Resolve or raise Zenduty alerts for every open alert, then persist.

    For each entry in ``self.open_alerts``:

    * the 5-minute failure window is rolled over if due
      (``check_zd_alert_status``);
    * an alert that was already sent and whose last completed window had
      fewer than 5 failures is resolved via ``send_zenduty_alert``; it is
      dropped from the open alerts only if the response status is 2xx;
    * otherwise, an alert with at least 5 failures in the current window is
      raised, and re-raised at most once every 5 minutes.

    Finally ``self.open_alerts`` is written to ``self.open_alerts_file`` as
    JSON.

    Args:
        current_time: ``datetime`` used for window and re-alert comparisons.
    """
    to_remove = []
    to_alert = []

    for identifier, info in self.open_alerts.items():
        self.check_zd_alert_status(identifier, current_time)

        # .get() keeps entries that lack the newer fields from raising
        # KeyError; they are simply treated as "not sent yet".
        last_window_failures = info.get("last_window_failures")

        # Resolve the alert if raised and failed < 5 times in the last 5m window
        if (
            info.get("sent")
            and last_window_failures is not None
            and last_window_failures < 5
        ):
            logger.debug(f"Resolving Zenduty alert {identifier}")
            response = await send_zenduty_alert(
                identifier, identifier, resolved=True
            )
            # Only forget the alert once Zenduty acknowledged the
            # resolution; otherwise it is retried on the next run.
            if response and 200 <= response.status < 300:
                to_remove.append(identifier)
            continue

        # Raise the alert once it has failed >= 5 times within the current
        # 5m window ...
        if info.get("failures", 0) < 5:
            continue

        # ... and re-alert at most once every 5 minutes.
        last_alert = info.get("last_alert")
        if last_alert and current_time - datetime.fromisoformat(
            last_alert
        ) <= timedelta(minutes=5):
            continue

        # Only open_alerts is written to disk below, so an open alert may
        # have no in-memory event to send; skip it instead of raising
        # KeyError.
        event = self.zenduty_events.get(identifier)
        if event is None:
            logger.warning(
                f"No Zenduty event stored for open alert {identifier}; skipping"
            )
            continue

        logger.debug(f"Raising Zenduty alert {identifier}")
        info["sent"] = True
        info["last_alert"] = current_time.isoformat()
        to_alert.append(event.send())

    await asyncio.gather(*to_alert)

    for identifier in to_remove:
        self.open_alerts.pop(identifier, None)
        self.zenduty_events.pop(identifier, None)

    # Write open alerts to file to ensure persistence
    with open(self.open_alerts_file, "w") as file:
        json.dump(self.open_alerts, file)

pyth_observer/zenduty.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ async def send_zenduty_alert(alert_identifier, message, resolved=False, summary=
2424
}
2525

2626
async with aiohttp.ClientSession() as session:
27-
max_retries = 30
27+
max_retries = 5
2828
retries = 0
2929
while retries < max_retries:
3030
async with session.post(url, json=data, headers=headers) as response:

0 commit comments

Comments
 (0)