From aebe65c9317753a6ed203a006cd847dee593cfa5 Mon Sep 17 00:00:00 2001
From: FaithOnuh
Date: Sun, 29 Mar 2026 01:44:44 +0000
Subject: [PATCH] feat(keeper): add health-check sidecar for high availability (#110)

- Add keeper/health-check-sidecar.sh: polls /health, restarts after
  FAILURE_THRESHOLD consecutive failures to prevent flapping
- Update docker-compose.yml: add keeper-sidecar service using
  curlimages/curl with configurable env vars
- Add keeper/README_SIDECAR.md: deployment guide covering Docker Compose,
  standalone, and systemd usage
---
 docker-compose.yml             |  20 +++++++
 keeper/README_SIDECAR.md       | 101 +++++++++++++++++++++++++++++++++
 keeper/health-check-sidecar.sh |  48 ++++++++++++++++
 3 files changed, 169 insertions(+)
 create mode 100644 keeper/README_SIDECAR.md
 create mode 100755 keeper/health-check-sidecar.sh

diff --git a/docker-compose.yml b/docker-compose.yml
index 0ef2cb6..e8ef66f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -23,3 +23,23 @@ services:
       timeout: 10s
       retries: 3
       start_period: 15s
+
+  # ── Health-check sidecar ────────────────────────────────────────────────────
+  # Polls the keeper's /health endpoint and runs RESTART_CMD after
+  # FAILURE_THRESHOLD consecutive failures. It runs in its own container, so
+  # a plain `kill` here cannot reach the keeper's PID 1 (no shared PID namespace).
+  keeper-sidecar:
+    image: curlimages/curl:8.7.1  # tiny image (~12 MB) with curl pre-installed
+    depends_on:
+      - keeper
+    volumes:
+      - ./keeper/health-check-sidecar.sh:/sidecar/health-check-sidecar.sh:ro
+    environment:
+      HEALTH_URL: "http://keeper:3000/health"
+      POLL_INTERVAL_S: "${SIDECAR_POLL_INTERVAL_S:-15}"
+      FAILURE_THRESHOLD: "${SIDECAR_FAILURE_THRESHOLD:-3}"
+      # NOTE(review): placeholder — this only POSTs to /health; it does not look
+      # able to restart the keeper. In a real deployment replace with: supervisorctl restart keeper
+      RESTART_CMD: "wget -qO- --post-data='' http://keeper:3000/health || true"
+    entrypoint: ["sh", "/sidecar/health-check-sidecar.sh"]
+    restart: unless-stopped
diff --git a/keeper/README_SIDECAR.md b/keeper/README_SIDECAR.md
new file mode 100644
index 0000000..cfdb9f2
--- /dev/null
+++ b/keeper/README_SIDECAR.md
@@ -0,0 +1,101 @@
+# Health Check Sidecar
+
+A lightweight shell sidecar that polls the keeper's `/health` endpoint and triggers a restart after a configurable number of consecutive failures. This prevents flapping (single transient errors don't cause restarts) while ensuring the keeper recovers from genuine outages.
+
+## How It Works
+
+```
+[sidecar] --poll every POLL_INTERVAL_S--> [keeper /health]
+  ↓ HTTP 200 → reset failure counter, log INFO
+  ↓ non-200 / timeout → increment counter, log WARN
+  ↓ counter >= FAILURE_THRESHOLD → execute RESTART_CMD, log CRITICAL
+```
+
+## Files
+
+| File | Purpose |
+|---|---|
+| `keeper/health-check-sidecar.sh` | The sidecar script |
+| `docker-compose.yml` | Updated to include the `keeper-sidecar` service |
+
+## Environment Variables
+
+| Variable | Default | Description |
+|---|---|---|
+| `HEALTH_URL` | `http://localhost:3000/health` | Full URL of the health endpoint to poll |
+| `POLL_INTERVAL_S` | `15` | Seconds between each poll |
+| `FAILURE_THRESHOLD` | `3` | Consecutive failures before a restart is triggered |
+| `RESTART_CMD` | `kill -SIGTERM 1` | Shell command executed to restart the keeper |
+
+In `docker-compose.yml` the sidecar-specific variables are also exposed at the Compose level:
+
+| Compose variable | Maps to | Default |
+|---|---|---|
+| `SIDECAR_POLL_INTERVAL_S` | `POLL_INTERVAL_S` | `15` |
+| `SIDECAR_FAILURE_THRESHOLD` | `FAILURE_THRESHOLD` | `3` |
+
+## Deployment
+
+### Docker Compose (recommended)
+
+```bash
+# Optional overrides in your shell or a .env file at the repo root
+export SIDECAR_POLL_INTERVAL_S=20
+export SIDECAR_FAILURE_THRESHOLD=5
+
+docker compose up -d
+```
+
+The sidecar runs as a separate container (`keeper-sidecar`) that shares the Docker network with `keeper`. When the threshold is reached it runs the configured `RESTART_CMD`. The `RESTART_CMD` shipped in `docker-compose.yml` is a placeholder (an HTTP request to `/health`), not a real restart; override it with a command that actually restarts the keeper.
+
+### Standalone (non-Docker)
+
+```bash
+chmod +x keeper/health-check-sidecar.sh
+
+# Run against a local keeper process; restart via supervisorctl
+HEALTH_URL=http://localhost:3000/health \
+POLL_INTERVAL_S=15 \
+FAILURE_THRESHOLD=3 \
+RESTART_CMD="supervisorctl restart keeper" \
+  ./keeper/health-check-sidecar.sh
+```
+
+### Systemd unit (optional)
+
+```ini
+[Unit]
+Description=SoroTask Keeper Health-Check Sidecar
+After=network.target
+
+[Service]
+EnvironmentFile=/etc/sorotask/sidecar.env
+ExecStart=/opt/sorotask/keeper/health-check-sidecar.sh
+Restart=always
+RestartSec=5
+
+[Install]
+WantedBy=multi-user.target
+```
+
+## Log Format
+
+All output goes to stdout in a structured, grep-friendly format:
+
+```
+[2026-03-29T01:42:00Z] [INFO] Health check passed (HTTP 200)
+[2026-03-29T01:42:15Z] [WARN] Health check failed (HTTP 503, curl exit 22) — consecutive failures: 1/3
+[2026-03-29T01:42:30Z] [WARN] Health check failed (HTTP 000, curl exit 28) — consecutive failures: 2/3
+[2026-03-29T01:42:45Z] [CRITICAL] Failure threshold reached. Triggering restart: supervisorctl restart keeper
+```
+
+`curl exit 22` = HTTP status >= 400 (the script runs curl with `--fail`); `curl exit 28` = timeout; `HTTP 000` = no response received.
+
+## Customising the Restart Command
+
+| Environment | `RESTART_CMD` |
+|---|---|
+| Docker (signal PID 1) | `kill -SIGTERM 1` *(default)* |
+| Supervisord | `supervisorctl restart keeper` |
+| Systemd | `systemctl restart sorotask-keeper` |
+| Kubernetes | Not needed — use liveness probes instead |
diff --git a/keeper/health-check-sidecar.sh b/keeper/health-check-sidecar.sh
new file mode 100755
index 0000000..f478c0a
--- /dev/null
+++ b/keeper/health-check-sidecar.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env sh
+# health-check-sidecar.sh
+#
+# Polls the keeper's /health endpoint and runs RESTART_CMD after a
+# configurable number of consecutive failures. Designed to run as a sidecar
+# container alongside the keeper. Runs until killed; logs to stdout.
+#
+# Environment variables (all optional):
+#   HEALTH_URL         Full URL to poll                  (default: http://localhost:3000/health)
+#   POLL_INTERVAL_S    Seconds between polls             (default: 15)
+#   FAILURE_THRESHOLD  Consecutive failures before restart (default: 3)
+#   RESTART_CMD        Shell command, run via eval       (default: kill -SIGTERM 1)
+
+HEALTH_URL="${HEALTH_URL:-http://localhost:3000/health}"
+POLL_INTERVAL_S="${POLL_INTERVAL_S:-15}"
+FAILURE_THRESHOLD="${FAILURE_THRESHOLD:-3}"
+RESTART_CMD="${RESTART_CMD:-kill -SIGTERM 1}"
+
+consecutive_failures=0  # failed polls since the last success or restart attempt
+
+log() {  # usage: log LEVEL MESSAGE... -> "[UTC timestamp] [LEVEL] MESSAGE" on stdout
+  level="$1"; shift
+  printf '[%s] [%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$level" "$*"
+}
+
+log INFO "Sidecar started. Polling ${HEALTH_URL} every ${POLL_INTERVAL_S}s (threshold: ${FAILURE_THRESHOLD})"
+
+while true; do
+  # -s: silent; -f: exit 22 on HTTP >= 400; --max-time: cap on the whole request
+  http_code=$(curl -sf --max-time 5 -o /dev/null -w "%{http_code}" "$HEALTH_URL" 2>/dev/null)
+  exit_code=$?  # curl's exit status (the assignment passes it through)
+
+  if [ "$exit_code" -eq 0 ] && [ "$http_code" -eq 200 ]; then
+    consecutive_failures=0
+    log INFO "Health check passed (HTTP ${http_code})"
+  else
+    consecutive_failures=$((consecutive_failures + 1))
+    log WARN "Health check failed (HTTP ${http_code}, curl exit ${exit_code}) — consecutive failures: ${consecutive_failures}/${FAILURE_THRESHOLD}"
+
+    if [ "$consecutive_failures" -ge "$FAILURE_THRESHOLD" ]; then
+      log CRITICAL "Failure threshold reached. Triggering restart: ${RESTART_CMD}"
+      eval "$RESTART_CMD" || log ERROR "Restart command failed (exit $?)"
+      consecutive_failures=0  # require a fresh run of failures before the next attempt
+    fi
+  fi
+
+  sleep "$POLL_INTERVAL_S"
+done