Skip to content

Commit 0cc1b4d

Browse files
fix(api): reduce watchdog sensitivity
1 parent 190bae5 commit 0cc1b4d

File tree

3 files changed

+8
-6
lines changed

3 files changed

+8
-6
lines changed

api/api/settings.py

-1
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,6 @@
230230

231231
# Watchdog
232232
WATCHDOG_SECONDARIES = os.environ.get("DESECSTACK_WATCHDOG_SECONDARIES", "").split()
233-
WATCHDOG_WINDOW_SEC = 600
234233

235234
# PCH
236235
PCH_API = os.environ.get("DESECSTACK_API_PCH_API", "")

api/cronhook/crontab

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
*/5 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py chores >> /var/log/cron.log 2>&1
2-
*/5 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py check-secondaries >> /var/log/cron.log 2>&1
2+
*/15 * * * * /usr/local/bin/python3 -u /usr/src/app/manage.py check-secondaries >> /var/log/cron.log 2>&1
33
7 11 * * * /usr/local/bin/python3 -u /usr/src/app/manage.py scavenge-unused >> /var/log/cron.log 2>&1

api/desecapi/management/commands/check-secondaries.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,13 @@ def add_arguments(self, parser):
4747
parser.add_argument(
4848
"--delay",
4949
type=int,
50-
default=120,
50+
default=300,
5151
help="Delay SOA checks to allow pending AXFRs to finish.",
5252
)
5353
parser.add_argument(
5454
"--window",
5555
type=int,
56-
default=settings.WATCHDOG_WINDOW_SEC,
56+
default=1200, # Should be sum of crontab interval and delay option (see above)
5757
help="Check domains that were published no longer than this many seconds ago.",
5858
)
5959

@@ -129,10 +129,13 @@ def handle(self, *args, **options):
129129
self.report(outdated_secondaries, output, timeouts)
130130

131131
def report(self, outdated_secondaries, output, timeouts):
132-
if not outdated_secondaries and not timeouts:
132+
# Do not report when timeouts occur, unless there's also replication out-of-sync somewhere.
133+
# Helps catch long-term unreachability, where subject will show timeouts for any emails.
134+
# Individual node downtimes should be tracked by external monitoring.
135+
if not outdated_secondaries:
133136
return
134137

135-
subject = f'{timeouts and "CRITICAL ALERT" or "ALERT"} {len(outdated_secondaries)} secondaries out of sync'
138+
subject = f"{len(timeouts)} timeouts, {len(outdated_secondaries)} secondaries out of sync"
136139
message = ""
137140

138141
if timeouts:

0 commit comments

Comments
 (0)