@@ -74,61 +74,30 @@ async def run(self, states: List[State]):
74
74
network = self .config ["network" ]["name" ], publishers = self .publishers
75
75
)
76
76
77
+ current_time = datetime .now ()
77
78
for check in failed_checks :
78
79
for event_type in self .config ["events" ]:
79
80
event : Event = globals ()[event_type ](check , context )
80
81
81
82
if event_type == "ZendutyEvent" :
82
- # Add failed check to open alerts
83
- alert_identifier = (
84
- f"{ check .__class__ .__name__ } -{ check .state ().symbol } "
85
- )
86
- state = check .state ()
87
- if isinstance (state , PublisherState ):
88
- alert_identifier += f"-{ state .publisher_name } "
89
- try :
90
- failures = self .open_alerts [alert_identifier ]["failures" ] + 1
91
- except KeyError :
92
- failures = 1
93
- self .open_alerts [alert_identifier ] = {
94
- "last_failure" : datetime .now ().isoformat (),
95
- "failures" : failures ,
96
- }
97
- # store the event to send it later if it fails multiple times
83
+ alert_identifier = self .generate_alert_identifier (check )
84
+ alert = self .open_alerts .get (alert_identifier )
85
+ if alert is None :
86
+ self .open_alerts [alert_identifier ] = {
87
+ "window_start" : current_time .isoformat (),
88
+ "failures" : 1 ,
89
+ "last_window_failures" : None ,
90
+ "sent" : False ,
91
+ }
92
+ else :
93
+ alert ["failures" ] += 1
98
94
self .zenduty_events [alert_identifier ] = event
99
- continue # do not immediately send a zenduty alert
95
+ continue # Skip sending immediately for ZendutyEvent
100
96
101
97
sent_events .append (event .send ())
102
98
103
99
await asyncio .gather (* sent_events )
104
-
105
- # Check open alerts for zenduty
106
- if "ZendutyEvent" in self .config ["events" ]:
107
-
108
- to_remove = []
109
- current_time = datetime .now ()
110
- for identifier , info in self .open_alerts .items ():
111
- # Resolve the alert if it last failed > 2 minutes ago
112
- if current_time - datetime .fromisoformat (
113
- info ["last_failure" ]
114
- ) >= timedelta (minutes = 2 ):
115
- logger .debug (f"Resolving Zenduty alert { identifier } " )
116
- response = await send_zenduty_alert (
117
- alert_identifier = identifier , message = identifier , resolved = True
118
- )
119
- if response and 200 <= response .status < 300 :
120
- to_remove .append (identifier )
121
- elif info ["failures" ] > 2 :
122
- # Raise alert if the check has failed more than twice before self-resolving
123
- await self .zenduty_events [identifier ].send ()
124
-
125
- for identifier in to_remove :
126
- del self .open_alerts [identifier ]
127
- del self .zenduty_events [identifier ]
128
-
129
- # Write open alerts to file to ensure persistence
130
- with open (self .open_alerts_file , "w" ) as file :
131
- json .dump (self .open_alerts , file )
100
+ await self .process_zenduty_events (current_time )
132
101
133
102
def check_price_feed (self , state : PriceFeedState ) -> List [Check ]:
134
103
failed_checks : List [Check ] = []
@@ -179,3 +148,62 @@ def load_config(self, check_name: str, symbol: str) -> Dict[str, Any]:
179
148
config |= self .config ["checks" ][symbol ][check_name ]
180
149
181
150
return config
151
+
152
+ # Zenduty Functions
153
def generate_alert_identifier(self, check):
    """Build the key under which a failed check is tracked in the alert tables.

    The key has the form ``<CheckClassName>-<symbol>``; when the check's state
    is a ``PublisherState`` the publisher name is appended as a third
    ``-``-separated component.

    Args:
        check: A check object exposing ``state()``; the returned state must
            have a ``symbol`` attribute (and ``publisher_name`` when it is a
            ``PublisherState``).

    Returns:
        The alert identifier string.
    """
    identifier = "{}-{}".format(check.__class__.__name__, check.state().symbol)
    check_state = check.state()
    if not isinstance(check_state, PublisherState):
        return identifier
    # Publisher-scoped checks get one alert per publisher.
    return f"{identifier}-{check_state.publisher_name}"
159
+
160
def check_zd_alert_status(self, alert_identifier, current_time):
    """Roll the failure-counting window of an open alert once it has expired.

    Looks up ``alert_identifier`` in ``self.open_alerts``. If the entry exists
    and at least 5 minutes have passed since its ``window_start``, a new
    window is started at ``current_time``: the failures counted so far are
    moved to ``last_window_failures`` and ``failures`` restarts at 0.
    Unknown identifiers and still-running windows are left untouched.

    Args:
        alert_identifier: Key of the alert in ``self.open_alerts``.
        current_time: ``datetime`` to compare against the stored ISO-format
            ``window_start``.
    """
    entry = self.open_alerts.get(alert_identifier)
    if entry is None:
        return

    window_age = current_time - datetime.fromisoformat(entry["window_start"])
    if window_age < timedelta(minutes=5):
        return

    # Keyword arguments are evaluated before the update is applied, so
    # last_window_failures receives the count from the window being closed.
    entry.update(
        window_start=current_time.isoformat(),
        last_window_failures=entry["failures"],
        failures=0,
    )
170
+
171
async def process_zenduty_events(self, current_time):
    """Raise, re-raise and resolve Zenduty alerts from the open-alert table.

    For every entry in ``self.open_alerts`` the failure window is first
    rolled via ``check_zd_alert_status``; then exactly one of the following
    applies:

    * Resolve: the alert was already sent, and both the last completed
      window and the current window hold fewer than 5 failures. The entry is
      dropped only when the resolve request returns a 2xx status.
    * Raise / re-raise: the current window holds at least 5 failures and no
      alert was sent for this identifier within the last 5 minutes.

    All raised events are sent concurrently, resolved entries are removed
    from ``self.open_alerts`` and ``self.zenduty_events``, and the table is
    written to ``self.open_alerts_file`` as JSON.

    Args:
        current_time: ``datetime`` of the current run; it is compared with
            the ISO-format timestamps stored in the table.
    """
    to_remove = []
    to_alert = []

    for identifier, info in self.open_alerts.items():
        self.check_zd_alert_status(identifier, current_time)
        # Resolve only when the alert is quiet in BOTH windows.
        # last_window_failures can be stale from a quiet window that ended
        # before the alert was raised; without the current-window check a
        # freshly raised alert would be resolved on the very next pass.
        if (
            info["sent"]
            and info["last_window_failures"] is not None
            and info["last_window_failures"] < 5
            and info["failures"] < 5
        ):
            logger.debug(f"Resolving Zenduty alert {identifier}")
            response = await send_zenduty_alert(
                identifier, identifier, resolved=True
            )
            if response and 200 <= response.status < 300:
                to_remove.append(identifier)
        # Raise the alert once the current window reaches 5 failures, and
        # re-alert at most every 5 minutes while it stays failing.
        elif info["failures"] >= 5 and (
            not info.get("last_alert")
            or current_time - datetime.fromisoformat(info["last_alert"])
            > timedelta(minutes=5)
        ):
            # open_alerts is persisted to disk but zenduty_events is not, so
            # an entry may have no event to send (e.g. after a restart).
            # Skip it without marking it as sent so it is retried once the
            # check fails again and an event is stored.
            event = self.zenduty_events.get(identifier)
            if event is None:
                logger.debug(
                    f"No pending Zenduty event for {identifier}; skipping alert"
                )
                continue
            logger.debug(f"Raising Zenduty alert {identifier}")
            info["sent"] = True
            info["last_alert"] = current_time.isoformat()
            to_alert.append(event.send())

    await asyncio.gather(*to_alert)

    for identifier in to_remove:
        self.open_alerts.pop(identifier, None)
        self.zenduty_events.pop(identifier, None)

    # Persist the table so failure counts and sent flags are kept on disk.
    with open(self.open_alerts_file, "w") as file:
        json.dump(self.open_alerts, file)
0 commit comments