From 73dbd4ad68cf36877f05784675ac2afe7972e206 Mon Sep 17 00:00:00 2001 From: Osvaldo Andrade Date: Fri, 26 Jun 2026 05:20:39 -0300 Subject: [PATCH] Add Kubernetes resilience suite --- .github/workflows/resilience-nightly.yml | 55 +++ Makefile | 5 +- docs/helm-resilience.md | 6 + docs/resilience-acceptance-matrix.md | 64 +++ scripts/k8s_resilience_suite.sh | 588 +++++++++++++++++++++++ 5 files changed, 717 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/resilience-nightly.yml create mode 100644 docs/resilience-acceptance-matrix.md create mode 100755 scripts/k8s_resilience_suite.sh diff --git a/.github/workflows/resilience-nightly.yml b/.github/workflows/resilience-nightly.yml new file mode 100644 index 0000000..b2cb934 --- /dev/null +++ b/.github/workflows/resilience-nightly.yml @@ -0,0 +1,55 @@ +name: resilience-nightly + +on: + schedule: + - cron: "17 3 * * *" + workflow_dispatch: + inputs: + mode: + description: "dry-run or live" + required: false + default: dry-run + namespace: + description: "Kubernetes namespace for live mode" + required: false + default: cefas-resilience + release: + description: "Helm release name" + required: false + default: cefas-resilience + kube_context: + description: "kubectl context for live mode" + required: false + default: "" + +permissions: + contents: read + +concurrency: + group: resilience-nightly-${{ github.ref }} + cancel-in-progress: false + +jobs: + k8s-resilience: + name: kubernetes resilience suite + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.mode || 'dry-run' }} + NAMESPACE: ${{ github.event_name == 'workflow_dispatch' && inputs.namespace || 'cefas-resilience' }} + RELEASE: ${{ github.event_name == 'workflow_dispatch' && inputs.release || 'cefas-resilience' }} + KUBE_CONTEXT: ${{ github.event_name == 'workflow_dispatch' && inputs.kube_context || '' }} + ARTIFACT_DIR: ${{ runner.temp }}/cefas-resilience/${{ github.run_id }} + steps: + - uses: actions/checkout@v6 + - uses: azure/setup-helm@v4 + - uses: azure/setup-kubectl@v4 + - name: Run resilience suite + run: scripts/k8s_resilience_suite.sh + - name: Upload resilience artifacts + if: always() + uses: actions/upload-artifact@v7 + with: + name: resilience-suite + path: ${{ runner.temp }}/cefas-resilience + if-no-files-found: warn diff --git a/Makefile b/Makefile index 0e61f7a..e73526f 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ BIN_DIR := ./bin VERSION := $(shell git describe --tags --always --dirty 2>/dev/null || echo dev) LDFLAGS := -ldflags "-s -w -X main.Version=$(VERSION)" -.PHONY: help build server cli install clean fmt lint vet test cover mut sec bench helm-test tools ci +.PHONY: help build server cli install clean fmt lint vet test cover mut sec bench helm-test k8s-resilience tools ci help: ## List available targets. @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " %-12s %s\n", $$1, $$2}' $(MAKEFILE_LIST) @@ -76,4 +76,7 @@ bench: ## Run benchmarks across all packages. helm-test: ## Render-test Helm resilience profiles. scripts/test_helm_resilience.sh +k8s-resilience: ## Run the Kubernetes resilience suite in dry-run mode by default. + scripts/k8s_resilience_suite.sh + ci: vet lint test cover sec ## Full quality gate (mirror of CI workflow). diff --git a/docs/helm-resilience.md b/docs/helm-resilience.md index 2a9a17a..10fe0ea 100644 --- a/docs/helm-resilience.md +++ b/docs/helm-resilience.md @@ -96,3 +96,9 @@ Run the chart smoke tests: ```sh scripts/test_helm_resilience.sh ``` + +Run the Kubernetes resilience acceptance suite in CI-safe dry-run mode: + +```sh +MODE=dry-run scripts/k8s_resilience_suite.sh +``` diff --git a/docs/resilience-acceptance-matrix.md b/docs/resilience-acceptance-matrix.md new file mode 100644 index 0000000..cbbb366 --- /dev/null +++ b/docs/resilience-acceptance-matrix.md @@ -0,0 +1,64 @@ +# Kubernetes Resilience Acceptance Matrix + +This is the RF=3 acceptance matrix for Kubernetes and Talos-style failure +testing. The suite is implemented by `scripts/k8s_resilience_suite.sh`. + +Run the CI-safe render check: + +```sh +MODE=dry-run scripts/k8s_resilience_suite.sh +``` + +Run against a real cluster: + +```sh +MODE=live \ +KUBE_CONTEXT=hack \ +NAMESPACE=cefas-resilience \ +RELEASE=cefas-resilience \ +KEEP_CLUSTER=1 \ +scripts/k8s_resilience_suite.sh +``` + +Destructive node and provider faults require an explicit opt-in: + +```sh +MODE=live \ +KUBE_CONTEXT=hack \ +ALLOW_DESTRUCTIVE=1 \ +NODE_SHUTDOWN_COMMAND='./ops/talos-shutdown-one-node "$TARGET_NODE"' \ +NODE_RESTORE_COMMAND='./ops/talos-power-on-one-node "$TARGET_NODE"' \ +scripts/k8s_resilience_suite.sh +``` + +The hook commands run with `TARGET_POD`, `TARGET_NODE`, `NAMESPACE`, +`RELEASE`, `APP`, `SELECTOR`, and `KUBE_CONTEXT` exported. + +| Scenario | Fault | Expected Service Behavior | Recovery | Stop Condition | +| --- | --- | --- | --- | --- | +| `healthy-baseline` | None | All database Pods are ready; `cefas-manager doctor` is healthy or degraded-but-serving. | Not applicable. | Doctor fails or reports unsafe. | +| `pod-kill` | Delete one CefasDB Pod and keep PVCs. | StatefulSet recreates the Pod and the RF=3 cluster remains serving. | Wait for rollout and startup probe completion. | Ready quorum is not restored or doctor reports unsafe. | +| `node-drain` | Cordon one node hosting a database Pod and drain CefasDB Pods from it. | One failure domain can be unavailable while the remaining majority stays serving. | Uncordon the node and let Kubernetes reschedule. | Quorum-ready Pod count is lost or doctor reports unsafe. | +| `node-shutdown` | Provider/Talos hook shuts down one node. | One physical host loss remains serving or explicitly degraded-but-serving. | Provider/Talos hook powers the node back on. | Doctor reports unsafe for a one-node loss. | +| `orphan-process` | Provider/Talos hook leaves or simulates a stale process with an old raft identity. | The stale process is fenced by the Kubernetes lease and the active cluster remains serving. | Provider/Talos hook removes the stale process. | The stale process can serve or CPU-spins without a valid lease. | +| `disk-pressure` | Provider/Talos hook applies disk pressure on one database node. | Manager reports pressure and the database stays serving or degraded-but-serving. | Provider/Talos hook removes pressure and confirms PVC health. | Pods spin indefinitely or doctor reports unsafe for a one-node fault. | +| `network-partition` | Provider/Talos hook partitions one database node from the cluster. | The majority side remains serving and the isolated node is not accepted as healthy. | Provider/Talos hook removes the partition and raft catches up. | Split brain is observed or manager cannot identify unsafe state. | +| `two-node-failure` | Two failure domains unavailable at the same time. | RF=3 fails safely with clear quorum or unsafe reporting. | Restore nodes/hooks and wait for raft catch-up. | Pods CPU-spin without functional quorum or health reporting is ambiguous. | + +## Artifacts + +Each run writes: + +- `summary.md`: scenario table and statuses +- `matrix.json`: machine-readable acceptance matrix +- `rendered.yaml`: Helm render used for the run +- `helm-lint.txt`: chart lint output +- `scenarios//doctor.json`: manager health report when available +- `scenarios//*-pods.txt`, `*-events.txt`, `*-nodes.txt`, and logs for failures + +## Acceptance Rules + +One failure domain loss must keep the cluster serving or explicitly +degraded-but-serving. Two failure domain loss is not required to serve with +RF=3, but it must fail safely: the manager must report quorum loss or unsafe +state, and Pods must not remain CPU-bound while nonfunctional. diff --git a/scripts/k8s_resilience_suite.sh b/scripts/k8s_resilience_suite.sh new file mode 100755 index 0000000..6ed36cb --- /dev/null +++ b/scripts/k8s_resilience_suite.sh @@ -0,0 +1,588 @@ +#!/usr/bin/env bash +set -euo pipefail + +MODE="${MODE:-dry-run}" +CHART="${CHART:-dist/helm/cefas}" +RELEASE="${RELEASE:-cefas-resilience}" +NAMESPACE="${NAMESPACE:-cefas-resilience}" +APP="${APP:-${RELEASE}-cefas}" +SELECTOR="${SELECTOR:-app.kubernetes.io/name=cefas,app.kubernetes.io/instance=${RELEASE}}" +ARTIFACT_DIR="${ARTIFACT_DIR:-/tmp/cefas-resilience/$(date -u +%Y%m%dT%H%M%SZ)}" +SCENARIOS="${SCENARIOS:-healthy-baseline,pod-kill,node-drain,node-shutdown,orphan-process,disk-pressure,network-partition,two-node-failure}" +KUBE_CONTEXT="${KUBE_CONTEXT:-}" +REPLICAS="${REPLICAS:-3}" +REPLICATION_FACTOR="${REPLICATION_FACTOR:-3}" +SHARDS="${SHARDS:-24}" +DISABLE_CPU_LIMITS="${DISABLE_CPU_LIMITS:-true}" +ALLOW_DESTRUCTIVE="${ALLOW_DESTRUCTIVE:-0}" +KEEP_CLUSTER="${KEEP_CLUSTER:-0}" +ROLLOUT_TIMEOUT="${ROLLOUT_TIMEOUT:-5m}" +READY_TIMEOUT_SECONDS="${READY_TIMEOUT_SECONDS:-300}" +DRAIN_TIMEOUT="${DRAIN_TIMEOUT:-120s}" +DOCTOR_TIMEOUT="${DOCTOR_TIMEOUT:-45s}" +QUORUM_READY="${QUORUM_READY:-2}" +POST_INJECTION_SETTLE_SECONDS="${POST_INJECTION_SETTLE_SECONDS:-10}" +HELM_EXTRA_ARGS="${HELM_EXTRA_ARGS:-}" + +RUN_STATUS=0 +NAMESPACE_CREATED=0 +CORDONED_NODES_FILE="" +SUMMARY_FILE="" + +log() { + printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*" >&2 +} + +die() { + echo "error: $*" >&2 + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || die "missing required command: $1" +} + +kubectl_cmd() { + if [ -n "$KUBE_CONTEXT" ]; then + kubectl --context "$KUBE_CONTEXT" "$@" + else + kubectl "$@" + fi +} + +helm_render_args() { + printf '%s\n' \ + --namespace "$NAMESPACE" \ + --set resilience.enabled=true \ + --set resilience.replicas="$REPLICAS" \ + --set resilience.replicationFactor="$REPLICATION_FACTOR" \ + --set cluster.replicationFactor="$REPLICATION_FACTOR" \ + --set cluster.shards="$SHARDS" \ + --set manager.enabled=true \ + --set resourcePolicy.disableCPULimits="$DISABLE_CPU_LIMITS" +} + +summary_escape() { + printf '%s' "$1" | tr '\n' ' ' | sed 's/|/\\|/g' +} + +json_escape() { + printf '%s' "$1" | sed 's/\\/\\\\/g; s/"/\\"/g' +} + +init_artifacts() { + mkdir -p "$ARTIFACT_DIR/scenarios" + CORDONED_NODES_FILE="$ARTIFACT_DIR/cordoned-nodes.txt" + SUMMARY_FILE="$ARTIFACT_DIR/summary.md" + : >"$CORDONED_NODES_FILE" + cat >"$SUMMARY_FILE" <"$ARTIFACT_DIR/matrix.json" <<'JSON' +[ + { + "scenario": "healthy-baseline", + "failure": "none", + "expectedService": "all database Pods ready; cefas-manager doctor reports healthy or degraded-but-serving", + "recovery": "not applicable", + "stopCondition": "doctor command fails or reports unsafe" + }, + { + "scenario": "pod-kill", + "failure": "delete one cefasdb Pod without deleting PVCs", + "expectedService": "replacement Pod starts and RF=3 remains serving during and after recovery", + "recovery": "StatefulSet recreates the Pod; startup probe allows raft replay", + "stopCondition": "ready quorum is not restored or doctor reports unsafe" + }, + { + "scenario": "node-drain", + "failure": "cordon one Kubernetes node and drain CefasDB Pods from it", + "expectedService": "one failure domain can be unavailable while reads and writes remain serving", + "recovery": "uncordon the node and let Kubernetes reschedule", + "stopCondition": "quorum-ready Pod count is lost or doctor reports unsafe" + }, + { + "scenario": "node-shutdown", + "failure": "provider/Talos hook shuts down one node", + "expectedService": "one physical host loss remains serving or explicitly degraded-but-serving", + "recovery": "provider/Talos hook powers the node back on", + "stopCondition": "doctor reports unsafe after one-node loss" + }, + { + "scenario": "orphan-process", + "failure": "provider/Talos hook leaves or simulates a stale process with an old raft identity", + "expectedService": "stale identity is fenced by the Kubernetes lease; active cluster remains serving", + "recovery": "provider/Talos hook removes the stale process", + "stopCondition": "stale process can serve or CPU-spins without a valid lease" + }, + { + "scenario": "disk-pressure", + "failure": "provider/Talos hook applies disk pressure on one database node", + "expectedService": "manager reports the pressure and the database stays serving or degraded-but-serving", + "recovery": "provider/Talos hook removes pressure and confirms PVC health", + "stopCondition": "Pods spin indefinitely or doctor reports unsafe for a one-node fault" + }, + { + "scenario": "network-partition", + "failure": "provider/Talos hook partitions one database node from the cluster", + "expectedService": "majority side remains serving and the isolated node is not accepted as healthy", + "recovery": "provider/Talos hook removes the partition and raft catches up", + "stopCondition": "split brain is observed or manager cannot identify unsafe state" + }, + { + "scenario": "two-node-failure", + "failure": "two failure domains unavailable at the same time", + "expectedService": "RF=3 fails safely with clear quorum or unsafe reporting", + "recovery": "restore nodes/hooks and wait for raft catch-up", + "stopCondition": "Pods CPU-spin without functional quorum or health reporting is ambiguous" + } +] +JSON +} + +append_result() { + local scenario="$1" + local status="$2" + local expected="$3" + local notes="$4" + local dir="scenarios/$scenario" + local status_label="$status" + + case "$status" in + pass) status_label="PASS" ;; + fail) status_label="FAIL" ;; + skipped) status_label="SKIPPED" ;; + planned) status_label="PLANNED" ;; + esac + + printf '| `%s` | %s | %s | `%s` | %s |\n' \ + "$(summary_escape "$scenario")" \ + "$status_label" \ + "$(summary_escape "$expected")" \ + "$dir" \ + "$(summary_escape "$notes")" >>"$SUMMARY_FILE" + + mkdir -p "$ARTIFACT_DIR/$dir" + cat >"$ARTIFACT_DIR/$dir/result.json" <"$output" 2>&1 +} + +render_chart() { + require_cmd helm + local rendered="$ARTIFACT_DIR/rendered.yaml" + local lint_out="$ARTIFACT_DIR/helm-lint.txt" + log "rendering Helm chart into $rendered" + helm lint "$CHART" >"$lint_out" 2>&1 + # shellcheck disable=SC2086 + helm template "$RELEASE" "$CHART" $(helm_render_args) $HELM_EXTRA_ARGS >"$rendered" + grep -q 'kind: StatefulSet' "$rendered" || die "rendered chart is missing StatefulSet" + grep -q 'kind: PodDisruptionBudget' "$rendered" || die "rendered chart is missing PodDisruptionBudget" + grep -q 'replicationFactor: 3' "$rendered" || die "rendered chart is missing RF=3 config" + grep -q 'peers:' "$rendered" || die "rendered chart is missing multi-node peers" + grep -q 'app.kubernetes.io/name: cefas-manager' "$rendered" || die "rendered chart is missing manager deployment" +} + +install_chart() { + require_cmd kubectl + require_cmd helm + kubectl_cmd cluster-info >"$ARTIFACT_DIR/kubectl-cluster-info.txt" 2>&1 + + if ! kubectl_cmd get namespace "$NAMESPACE" >/dev/null 2>&1; then + NAMESPACE_CREATED=1 + fi + + log "installing $RELEASE in namespace $NAMESPACE" + # shellcheck disable=SC2086 + helm upgrade --install "$RELEASE" "$CHART" \ + --namespace "$NAMESPACE" \ + --create-namespace \ + --set resilience.enabled=true \ + --set resilience.replicas="$REPLICAS" \ + --set resilience.replicationFactor="$REPLICATION_FACTOR" \ + --set cluster.replicationFactor="$REPLICATION_FACTOR" \ + --set cluster.shards="$SHARDS" \ + --set manager.enabled=true \ + --set resourcePolicy.disableCPULimits="$DISABLE_CPU_LIMITS" \ + $HELM_EXTRA_ARGS \ + >"$ARTIFACT_DIR/helm-upgrade.txt" 2>&1 +} + +wait_full_rollout() { + log "waiting for StatefulSet and manager rollout" + kubectl_cmd -n "$NAMESPACE" rollout status "statefulset/$APP" --timeout="$ROLLOUT_TIMEOUT" \ + >"$ARTIFACT_DIR/rollout-statefulset.txt" 2>&1 + kubectl_cmd -n "$NAMESPACE" rollout status "deploy/${APP}-manager" --timeout="$ROLLOUT_TIMEOUT" \ + >"$ARTIFACT_DIR/rollout-manager.txt" 2>&1 +} + +ready_pods_count() { + local out + out="$(kubectl_cmd -n "$NAMESPACE" get pods -l "$SELECTOR" -o jsonpath='{range .items[*]}{.status.containerStatuses[0].ready}{"\n"}{end}' 2>/dev/null || true)" + printf '%s\n' "$out" | awk '$1 == "true" { n++ } END { print n + 0 }' +} + +wait_ready_count() { + local min_ready="$1" + local deadline + deadline=$(( $(date +%s) + READY_TIMEOUT_SECONDS )) + while [ "$(ready_pods_count)" -lt "$min_ready" ]; do + if [ "$(date +%s)" -ge "$deadline" ]; then + return 1 + fi + sleep 2 + done + return 0 +} + +collect_snapshot() { + local scenario="$1" + local phase="$2" + local dir="$ARTIFACT_DIR/scenarios/$scenario" + mkdir -p "$dir" + capture "$dir/${phase}-pods.txt" kubectl_cmd -n "$NAMESPACE" get pods -l "$SELECTOR" -o wide || true + capture "$dir/${phase}-manager-pods.txt" kubectl_cmd -n "$NAMESPACE" get pods -l "app.kubernetes.io/name=cefas-manager,app.kubernetes.io/instance=$RELEASE" -o wide || true + capture "$dir/${phase}-events.txt" kubectl_cmd -n "$NAMESPACE" get events --sort-by=.lastTimestamp || true + capture "$dir/${phase}-endpoints.txt" kubectl_cmd -n "$NAMESPACE" get endpoints "${APP}" "${APP}-headless" -o yaml || true + capture "$dir/${phase}-pvc.txt" kubectl_cmd -n "$NAMESPACE" get pvc -l "$SELECTOR" -o wide || true + capture "$dir/${phase}-nodes.txt" kubectl_cmd get nodes -o wide || true +} + +collect_logs() { + local scenario="$1" + local dir="$ARTIFACT_DIR/scenarios/$scenario" + mkdir -p "$dir" + capture "$dir/cefas-logs.txt" kubectl_cmd -n "$NAMESPACE" logs "statefulset/$APP" --all-containers --tail=500 || true + capture "$dir/manager-logs.txt" kubectl_cmd -n "$NAMESPACE" logs "deploy/${APP}-manager" --all-containers --tail=500 || true +} + +doctor_command() { + kubectl_cmd -n "$NAMESPACE" exec "deploy/${APP}-manager" -c cefas-manager -- \ + /usr/local/bin/cefas-manager \ + --endpoint="${APP}:9090" \ + --http-endpoint="http://${APP}:8080" \ + --namespace="$NAMESPACE" \ + --selector="$SELECTOR" \ + --timeout="$DOCTOR_TIMEOUT" \ + --insecure=true \ + doctor +} + +run_doctor() { + local scenario="$1" + local dir="$ARTIFACT_DIR/scenarios/$scenario" + mkdir -p "$dir" + doctor_command >"$dir/doctor.json" 2>"$dir/doctor.err" +} + +doctor_is_unsafe() { + local file="$1" + grep -q '"classification"[[:space:]]*:[[:space:]]*"unsafe"' "$file" +} + +doctor_is_serving() { + local scenario="$1" + local dir="$ARTIFACT_DIR/scenarios/$scenario" + if ! run_doctor "$scenario"; then + return 1 + fi + if doctor_is_unsafe "$dir/doctor.json"; then + return 2 + fi + return 0 +} + +doctor_reports_unsafe() { + local scenario="$1" + local dir="$ARTIFACT_DIR/scenarios/$scenario" + if ! run_doctor "$scenario"; then + return 1 + fi + doctor_is_unsafe "$dir/doctor.json" +} + +first_db_pod() { + kubectl_cmd -n "$NAMESPACE" get pods -l "$SELECTOR" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true +} + +node_for_pod() { + local pod="$1" + kubectl_cmd -n "$NAMESPACE" get pod "$pod" -o jsonpath='{.spec.nodeName}' 2>/dev/null || true +} + +record_cordoned_node() { + local node="$1" + if ! grep -qx "$node" "$CORDONED_NODES_FILE" 2>/dev/null; then + printf '%s\n' "$node" >>"$CORDONED_NODES_FILE" + fi +} + +cleanup_cordoned_nodes() { + if [ -z "$CORDONED_NODES_FILE" ] || [ ! -f "$CORDONED_NODES_FILE" ]; then + return + fi + while IFS= read -r node; do + [ -n "$node" ] || continue + log "uncordon node $node" + kubectl_cmd uncordon "$node" >/dev/null 2>&1 || true + done <"$CORDONED_NODES_FILE" +} + +cleanup_release() { + if [ "$MODE" != "live" ]; then + return + fi + cleanup_cordoned_nodes + if [ "$KEEP_CLUSTER" = "1" ]; then + log "KEEP_CLUSTER=1; leaving release and PVCs in place" + return + fi + log "cleaning Helm release and PVCs" + helm uninstall "$RELEASE" -n "$NAMESPACE" >"$ARTIFACT_DIR/helm-uninstall.txt" 2>&1 || true + kubectl_cmd -n "$NAMESPACE" delete pvc -l "$SELECTOR" --ignore-not-found \ + >"$ARTIFACT_DIR/pvc-delete.txt" 2>&1 || true + if [ "$NAMESPACE_CREATED" = "1" ]; then + kubectl_cmd delete namespace "$NAMESPACE" --ignore-not-found \ + >"$ARTIFACT_DIR/namespace-delete.txt" 2>&1 || true + fi +} + +on_exit() { + local status="$1" + if [ "$MODE" = "live" ]; then + collect_snapshot "suite" "final" || true + cleanup_release || true + fi + exit "$status" +} + +run_healthy_baseline() { + local scenario="healthy-baseline" + local expected="all Pods ready; manager doctor is not unsafe" + collect_snapshot "$scenario" "before" + if wait_ready_count "$REPLICAS" && doctor_is_serving "$scenario"; then + append_result "$scenario" pass "$expected" "baseline serving" + else + collect_logs "$scenario" + append_result "$scenario" fail "$expected" "baseline failed; see doctor and logs" + fi + collect_snapshot "$scenario" "after" +} + +run_pod_kill() { + local scenario="pod-kill" + local expected="one database Pod kill recovers to full ready state without unsafe doctor" + local pod + collect_snapshot "$scenario" "before" + pod="$(first_db_pod)" + if [ -z "$pod" ]; then + append_result "$scenario" fail "$expected" "no database Pod found" + return + fi + log "deleting database Pod $pod" + if ! capture "$ARTIFACT_DIR/scenarios/$scenario/delete-pod.txt" kubectl_cmd -n "$NAMESPACE" delete pod "$pod" --wait=false; then + append_result "$scenario" fail "$expected" "kubectl delete pod failed" + return + fi + if wait_full_rollout && wait_ready_count "$REPLICAS" && doctor_is_serving "$scenario"; then + append_result "$scenario" pass "$expected" "deleted $pod and recovered" + else + collect_logs "$scenario" + append_result "$scenario" fail "$expected" "pod kill did not recover cleanly" + fi + collect_snapshot "$scenario" "after" +} + +run_node_drain() { + local scenario="node-drain" + local expected="one node drained; quorum remains ready and doctor is not unsafe" + local pod node + collect_snapshot "$scenario" "before" + if [ "$ALLOW_DESTRUCTIVE" != "1" ]; then + append_result "$scenario" skipped "$expected" "set ALLOW_DESTRUCTIVE=1 to cordon and drain a real node" + return + fi + pod="$(first_db_pod)" + node="$(node_for_pod "$pod")" + if [ -z "$node" ]; then + append_result "$scenario" fail "$expected" "could not resolve node for pod $pod" + return + fi + log "cordon and drain node $node" + if ! capture "$ARTIFACT_DIR/scenarios/$scenario/cordon.txt" kubectl_cmd cordon "$node"; then + append_result "$scenario" fail "$expected" "kubectl cordon failed" + return + fi + record_cordoned_node "$node" + if ! capture "$ARTIFACT_DIR/scenarios/$scenario/drain.txt" kubectl_cmd drain "$node" --pod-selector="$SELECTOR" --ignore-daemonsets --delete-emptydir-data --force --timeout="$DRAIN_TIMEOUT"; then + collect_logs "$scenario" + append_result "$scenario" fail "$expected" "kubectl drain failed; see drain artifact" + return + fi + if wait_ready_count "$QUORUM_READY" && doctor_is_serving "$scenario"; then + append_result "$scenario" pass "$expected" "drained $node and kept quorum ready" + else + collect_logs "$scenario" + append_result "$scenario" fail "$expected" "cluster did not stay serving after draining $node" + fi + collect_snapshot "$scenario" "after" +} + +run_hooked_serving_scenario() { + local scenario="$1" + local inject_command="$2" + local recover_command="$3" + local expected="$4" + local pod node + collect_snapshot "$scenario" "before" + if [ "$ALLOW_DESTRUCTIVE" != "1" ]; then + append_result "$scenario" skipped "$expected" "set ALLOW_DESTRUCTIVE=1 and the scenario hook to inject this failure" + return + fi + if [ -z "$inject_command" ]; then + append_result "$scenario" skipped "$expected" "scenario hook is not configured" + return + fi + pod="$(first_db_pod)" + node="$(node_for_pod "$pod")" + export TARGET_POD="$pod" TARGET_NODE="$node" NAMESPACE RELEASE APP SELECTOR KUBE_CONTEXT + log "running $scenario injection hook" + if ! sh -c "$inject_command" >"$ARTIFACT_DIR/scenarios/$scenario/inject-hook.txt" 2>&1; then + append_result "$scenario" fail "$expected" "injection hook failed" + return + fi + sleep "$POST_INJECTION_SETTLE_SECONDS" + if wait_ready_count "$QUORUM_READY" && doctor_is_serving "$scenario"; then + append_result "$scenario" pass "$expected" "failure injected and cluster stayed serving" + else + collect_logs "$scenario" + append_result "$scenario" fail "$expected" "cluster did not satisfy serving expectation" + fi + if [ -n "$recover_command" ]; then + log "running $scenario recovery hook" + sh -c "$recover_command" >"$ARTIFACT_DIR/scenarios/$scenario/recover-hook.txt" 2>&1 || true + fi + collect_snapshot "$scenario" "after" +} + +run_two_node_failure() { + local scenario="two-node-failure" + local expected="two-node loss fails safely with explicit unsafe/quorum reporting" + local inject_command="${TWO_NODE_FAILURE_COMMAND:-}" + local recover_command="${TWO_NODE_RECOVER_COMMAND:-}" + collect_snapshot "$scenario" "before" + if [ "$ALLOW_DESTRUCTIVE" != "1" ]; then + append_result "$scenario" skipped "$expected" "set ALLOW_DESTRUCTIVE=1 and TWO_NODE_FAILURE_COMMAND to inject this fault" + return + fi + if [ -z "$inject_command" ]; then + append_result "$scenario" skipped "$expected" "TWO_NODE_FAILURE_COMMAND is not configured" + return + fi + log "running two-node failure injection hook" + if ! sh -c "$inject_command" >"$ARTIFACT_DIR/scenarios/$scenario/inject-hook.txt" 2>&1; then + append_result "$scenario" fail "$expected" "two-node injection hook failed" + return + fi + sleep "$POST_INJECTION_SETTLE_SECONDS" + if doctor_reports_unsafe "$scenario"; then + append_result "$scenario" pass "$expected" "doctor reported unsafe/quorum loss as expected" + else + collect_logs "$scenario" + append_result "$scenario" fail "$expected" "doctor did not report explicit unsafe state" + fi + if [ -n "$recover_command" ]; then + log "running two-node recovery hook" + sh -c "$recover_command" >"$ARTIFACT_DIR/scenarios/$scenario/recover-hook.txt" 2>&1 || true + fi + collect_snapshot "$scenario" "after" +} + +run_dry_scenario() { + local scenario="$1" + local expected="$2" + mkdir -p "$ARTIFACT_DIR/scenarios/$scenario" + append_result "$scenario" planned "$expected" "dry-run mode rendered Helm and wrote the acceptance matrix; no cluster mutation" +} + +run_scenario() { + local scenario="$1" + case "$MODE:$scenario" in + dry-run:healthy-baseline) run_dry_scenario "$scenario" "RF=3 chart renders StatefulSet, peers, PDB, and manager" ;; + dry-run:pod-kill) run_dry_scenario "$scenario" "pod kill is part of the live acceptance suite" ;; + dry-run:node-drain) run_dry_scenario "$scenario" "node drain is guarded by ALLOW_DESTRUCTIVE=1" ;; + dry-run:node-shutdown) run_dry_scenario "$scenario" "node shutdown uses NODE_SHUTDOWN_COMMAND and NODE_RESTORE_COMMAND hooks" ;; + dry-run:orphan-process) run_dry_scenario "$scenario" "orphan process uses ORPHAN_PROCESS_COMMAND and ORPHAN_RECOVER_COMMAND hooks" ;; + dry-run:disk-pressure) run_dry_scenario "$scenario" "disk pressure uses DISK_PRESSURE_COMMAND and DISK_RECOVER_COMMAND hooks" ;; + dry-run:network-partition) run_dry_scenario "$scenario" "network partition uses NETWORK_PARTITION_COMMAND and NETWORK_RECOVER_COMMAND hooks" ;; + dry-run:two-node-failure) run_dry_scenario "$scenario" "two-node failure uses TWO_NODE_FAILURE_COMMAND and TWO_NODE_RECOVER_COMMAND hooks" ;; + live:healthy-baseline) run_healthy_baseline ;; + live:pod-kill) run_pod_kill ;; + live:node-drain) run_node_drain ;; + live:node-shutdown) run_hooked_serving_scenario "$scenario" "${NODE_SHUTDOWN_COMMAND:-}" "${NODE_RESTORE_COMMAND:-}" "one physical node shutdown remains serving or degraded-but-serving" ;; + live:orphan-process) run_hooked_serving_scenario "$scenario" "${ORPHAN_PROCESS_COMMAND:-}" "${ORPHAN_RECOVER_COMMAND:-}" "stale raft identity is fenced and active cluster remains serving" ;; + live:disk-pressure) run_hooked_serving_scenario "$scenario" "${DISK_PRESSURE_COMMAND:-}" "${DISK_RECOVER_COMMAND:-}" "single-node disk pressure is reported without losing service" ;; + live:network-partition) run_hooked_serving_scenario "$scenario" "${NETWORK_PARTITION_COMMAND:-}" "${NETWORK_RECOVER_COMMAND:-}" "majority side remains serving and isolated node is not healthy" ;; + live:two-node-failure) run_two_node_failure ;; + *) append_result "$scenario" fail "known scenario" "unknown scenario or mode" ;; + esac +} + +main() { + case "$MODE" in + dry-run|live) ;; + *) die "MODE must be dry-run or live" ;; + esac + + init_artifacts + write_matrix_json + render_chart + + if [ "$MODE" = "live" ]; then + trap 'on_exit $?' EXIT + install_chart + wait_full_rollout + wait_ready_count "$REPLICAS" || die "database Pods did not become ready" + fi + + for scenario in $(printf '%s' "$SCENARIOS" | tr ',' ' '); do + [ -n "$scenario" ] || continue + log "scenario: $scenario" + run_scenario "$scenario" + done + + log "summary: $SUMMARY_FILE" + return "$RUN_STATUS" +} + +main "$@"