# Environment Health Check - workflow file as captured from run #729.

# Health check for the prod and staging environments. Runs on a schedule and
# can also be started by hand for one environment or both.
name: Environment Health Check

on:
  schedule:
    # Run every hour
    - cron: "0 * * * *"
  workflow_dispatch:
    inputs:
      # Which matrix leg(s) a manual run should execute; scheduled runs
      # carry no inputs and always check every environment.
      environment:
        description: "Environment to check"
        required: false
        default: "both"
        type: choice
        options:
          - both
          - prod
          - staging
jobs:
  # One matrix leg per environment. Each leg names the repository secrets
  # that hold that environment's SSH key, VM address and login user; the
  # steps look them up with secrets[matrix.<key>].
  health-check:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    # The steps of this job only use SSH and runner-local files (step
    # summary / step outputs), so GITHUB_TOKEN is granted no scopes.
    permissions: {}
    strategy:
      # Keep checking the other environment even when one leg fails.
      fail-fast: false
      matrix:
        include:
          - env: prod
            ssh_key_secret: PROD_SSH_PRIVATE_KEY
            vm_ip_secret: PROD_AZURE_VM_IP
            vm_user_secret: PROD_AZURE_VM_USER
          - env: staging
            ssh_key_secret: STAGING_SSH_PRIVATE_KEY
            vm_ip_secret: STAGING_AZURE_VM_IP
            vm_user_secret: STAGING_AZURE_VM_USER
    steps:
# Decides whether this matrix leg does any work. Sets the step output
# `run` to 'true' or 'false'; every later step is gated on it.
#   - non-manual triggers (e.g. the schedule): always run
#   - manual trigger: run when the chosen environment is 'both' or equals
#     this leg's environment
- name: Check if should run (${{ matrix.env }})
  id: should_run
  env:
    # Context values are handed over as environment variables instead of
    # being expanded into the script text, so their content can never be
    # interpreted as shell code.
    EVENT_NAME: ${{ github.event_name }}
    SELECTED_ENV: ${{ github.event.inputs.environment }}
    MATRIX_ENV: ${{ matrix.env }}
  run: |
    if [[ "$EVENT_NAME" != "workflow_dispatch" \
          || "$SELECTED_ENV" == "both" \
          || "$SELECTED_ENV" == "$MATRIX_ENV" ]]; then
      echo "run=true" >> "$GITHUB_OUTPUT"
    else
      echo "run=false" >> "$GITHUB_OUTPUT"
      echo "Skipping $MATRIX_ENV - user selected $SELECTED_ENV"
    fi
# Installs this environment's private key as ~/.ssh/id_rsa and records the
# VM's host keys in known_hosts so the later ssh calls run unattended.
- name: Setup SSH
  if: steps.should_run.outputs.run == 'true'
  env:
    # Secrets are passed through the environment rather than expanded into
    # the script, so quotes, `$` or backticks inside a secret cannot break
    # or alter the commands below.
    SSH_PRIVATE_KEY: ${{ secrets[matrix.ssh_key_secret] }}
    VM_IP: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh
    # printf keeps the key's line breaks and adds the final newline that
    # the previous `echo` produced.
    printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
    chmod 600 ~/.ssh/id_rsa
    # -H stores the host name hashed; scan errors are intentionally silent.
    ssh-keyscan -H "$VM_IP" >> ~/.ssh/known_hosts 2>/dev/null
# Reads the PM2 process list from the VM, writes a redacted copy to the step
# summary and exposes the status of dw_worker / dw_gateway.
# Outputs: worker_status, gateway_status ("online", another PM2 status, or
# "unknown"). Problems are reported as ::error:: annotations; this step
# itself does not fail the job.
- name: Check PM2 Processes (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: pm2
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_IP: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    echo "## [$ENV_NAME] PM2 Process Status" >> "$GITHUB_STEP_SUMMARY"

    # An unreachable host or a failing pm2 degrades to an empty list.
    PM2_STATUS=$(ssh -o ConnectTimeout=10 "${VM_USER}@${VM_IP}" \
      "source ~/.nvm/nvm.sh && pm2 jlist" 2>/dev/null || echo "[]")

    # Only show safe fields (name, status, uptime, restarts) - NOT environment variables
    {
      echo '```json'
      echo "$PM2_STATUS" \
        | jq '[.[] | {name: .name, status: .pm2_env.status, uptime: .pm2_env.pm_uptime, restarts: .pm2_env.restart_time}]' 2>/dev/null \
        || echo "[]"
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"

    # Prints one status word for the PM2 app named $1:
    #   - "unknown" when no such app is listed, the status is missing, or
    #     the list is not valid JSON (jq alone would print nothing and
    #     still exit 0 for a missing app, so an `|| echo` fallback never
    #     fired for that case)
    #   - "online" only when every instance with that name is online
    #   - otherwise the status of the first instance that is not online
    # The result is always a single line, so it is safe for $GITHUB_OUTPUT.
    pm2_status() {
      local status
      status=$(jq -r --arg name "$1" '
        [.[] | select(.name == $name) | .pm2_env.status]
        | if length == 0 then "unknown"
          elif all(. == "online") then "online"
          else (map(select(. != "online")) | first // "unknown")
          end
      ' <<< "$PM2_STATUS" 2>/dev/null | head -n 1) || true
      echo "${status:-unknown}"
    }

    WORKER_STATUS=$(pm2_status dw_worker)
    GATEWAY_STATUS=$(pm2_status dw_gateway)
    echo "worker_status=$WORKER_STATUS" >> "$GITHUB_OUTPUT"
    echo "gateway_status=$GATEWAY_STATUS" >> "$GITHUB_OUTPUT"

    if [[ "$WORKER_STATUS" != "online" ]]; then
      echo "::error::[$ENV_NAME] dw_worker is not online (status: $WORKER_STATUS)"
    fi
    if [[ "$GATEWAY_STATUS" != "online" ]]; then
      echo "::error::[$ENV_NAME] dw_gateway is not online (status: $GATEWAY_STATUS)"
    fi
# Probes the two local /health endpoints on the VM (ports 9001 and 9100)
# over SSH and records the HTTP status codes.
# Outputs: worker_health, gateway_health (three-digit code, "000" when the
# request or the SSH connection failed). The "Fail if Critical Issues" step
# compares these against '200'.
- name: Check Health Endpoints (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: endpoints
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_IP: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    echo "## [$ENV_NAME] Health Endpoints" >> "$GITHUB_STEP_SUMMARY"

    # Prints the HTTP status of http://localhost:$1/health on the VM.
    # When the request fails curl already prints "000" AND exits non-zero;
    # the remote `|| true` keeps that exit code from triggering a second
    # fallback, which previously produced "000000". The local default only
    # covers the case where SSH itself returned nothing.
    # --max-time bounds a server that accepts the connection but never
    # answers, so one hung endpoint cannot eat the job's time budget.
    http_status() {
      local code
      code=$(ssh -o ConnectTimeout=10 "${VM_USER}@${VM_IP}" \
        "curl -s -o /dev/null -w '%{http_code}' --connect-timeout 5 --max-time 10 http://localhost:$1/health || true" \
        2>/dev/null) || code=""
      echo "${code:-000}"
    }

    WORKER_HEALTH=$(http_status 9001)
    GATEWAY_HEALTH=$(http_status 9100)

    {
      echo "| Service | Port | HTTP Status |"
      echo "|---------|------|-------------|"
      echo "| rust_service | 9001 | $WORKER_HEALTH |"
      echo "| gateway | 9100 | $GATEWAY_HEALTH |"
    } >> "$GITHUB_STEP_SUMMARY"

    echo "worker_health=$WORKER_HEALTH" >> "$GITHUB_OUTPUT"
    echo "gateway_health=$GATEWAY_HEALTH" >> "$GITHUB_OUTPUT"

    if [[ "$WORKER_HEALTH" != "200" ]]; then
      echo "::error::[$ENV_NAME] rust_service health check failed (HTTP $WORKER_HEALTH)"
    fi
    if [[ "$GATEWAY_HEALTH" != "200" ]]; then
      echo "::error::[$ENV_NAME] gateway health check failed (HTTP $GATEWAY_HEALTH)"
    fi
# Runs a MongoDB ping on the VM using the MONGODB_URI from the service's
# .env file and reports the result.
# Output: mongo_healthy = true | false | skipped | unknown
#   true / false - a mongosh/mongo client was found and the ping succeeded / failed
#   skipped      - no client installed, or MONGODB_URI is not configured
#   unknown      - the SSH session itself failed
- name: Check MongoDB Connection (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: mongodb
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_IP: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    echo "## [$ENV_NAME] MongoDB Status" >> "$GITHUB_STEP_SUMMARY"

    # The quoted heredoc is sent unexpanded and executed by the remote
    # shell. A failed SSH session is turned into the SSH_FAILED marker so
    # it is reported below instead of aborting the step.
    MONGO_CHECK=$(ssh -o ConnectTimeout=10 "${VM_USER}@${VM_IP}" << 'EOF' || echo "SSH_FAILED"
    ENV_FILE="/home/azureuser/server/DoWhiz/DoWhiz_service/.env"

    # The pipeline's exit status is cut's, so a `|| echo unknown` after it
    # never fires; default the empty value instead.
    STORAGE_BACKEND=$(grep -E "^STORAGE_BACKEND=" "$ENV_FILE" 2>/dev/null | cut -d= -f2)
    echo "Storage Backend: ${STORAGE_BACKEND:-unknown}"

    if ! grep -qE "^MONGODB_URI=" "$ENV_FILE" 2>/dev/null; then
      echo "MongoDB URI: not configured in $ENV_FILE"
      echo "MONGO_NOT_CONFIGURED"
      exit 0
    fi
    echo "MongoDB URI: configured"
    MONGODB_URI=$(grep -E "^MONGODB_URI=" "$ENV_FILE" | cut -d= -f2- | tr -d '"')

    # Prefer mongosh, fall back to the legacy mongo shell.
    MONGO_CLIENT=""
    for candidate in mongosh mongo; do
      if command -v "$candidate" >/dev/null 2>&1; then
        MONGO_CLIENT="$candidate"
        break
      fi
    done

    if [ -z "$MONGO_CLIENT" ]; then
      echo "MongoDB Connection: skipped (no mongosh/mongo client)"
      echo "MONGO_SKIPPED"
    # stdin is redirected so the client cannot swallow the rest of this
    # script, which the remote shell is reading from stdin.
    elif "$MONGO_CLIENT" "$MONGODB_URI" --eval "db.runCommand({ping:1})" --quiet < /dev/null 2>/dev/null | grep -q "ok"; then
      echo "MongoDB Connection: OK"
      echo "MONGO_HEALTHY"
    else
      echo "MongoDB Connection: FAILED"
      echo "MONGO_UNHEALTHY"
    fi
    EOF
    )

    {
      echo '```'
      echo "$MONGO_CHECK"
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"

    if [[ "$MONGO_CHECK" == *"SSH_FAILED"* ]]; then
      echo "::error::[$ENV_NAME] MongoDB check could not run (SSH session failed)"
      echo "mongo_healthy=unknown" >> "$GITHUB_OUTPUT"
    elif [[ "$MONGO_CHECK" == *"MONGO_UNHEALTHY"* ]]; then
      echo "::error::[$ENV_NAME] MongoDB connection failed"
      echo "mongo_healthy=false" >> "$GITHUB_OUTPUT"
    elif [[ "$MONGO_CHECK" == *"MONGO_HEALTHY"* ]]; then
      echo "mongo_healthy=true" >> "$GITHUB_OUTPUT"
    else
      echo "mongo_healthy=skipped" >> "$GITHUB_OUTPUT"
    fi
# Summarises the Azure Container Instances groups in the resource group
# named by RUN_TASK_AZURE_ACI_RESOURCE_GROUP in the service's .env file.
# The listing runs on the VM (using its az CLI and jq) and prints marker
# lines that the local part of this step turns into annotations.
# Output: aci_status = critical | orphaned | warning | high_count |
#                      skipped | unknown | healthy
#   critical   - more than 5 container groups in state Failed
#   orphaned   - at least one Running group started more than 1 hour ago
#   warning    - 1 to 5 groups in state Failed
#   high_count - more than 50 groups in total
#   skipped    - az CLI missing or resource group not configured
#   unknown    - SSH or `az container list` failed
- name: Check Azure ACI Containers (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: aci_containers
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_IP: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    echo "## [$ENV_NAME] Azure ACI Container Status" >> "$GITHUB_STEP_SUMMARY"

    # The quoted heredoc is sent unexpanded and executed by the remote
    # shell. A failed SSH session becomes the SSH_FAILED marker instead of
    # aborting the step.
    ACI_CHECK=$(ssh -o ConnectTimeout=30 "${VM_USER}@${VM_IP}" << 'EOF' || echo "SSH_FAILED"
    # Check if az CLI is available
    if ! command -v az &>/dev/null; then
      echo "Azure CLI not installed"
      echo "AZ_CLI_MISSING"
      exit 0
    fi

    # Get resource group from env
    ENV_FILE="/home/azureuser/server/DoWhiz/DoWhiz_service/.env"
    RESOURCE_GROUP=$(grep -E "^RUN_TASK_AZURE_ACI_RESOURCE_GROUP=" "$ENV_FILE" 2>/dev/null | cut -d= -f2 | tr -d '"')
    if [[ -z "$RESOURCE_GROUP" ]]; then
      echo "RUN_TASK_AZURE_ACI_RESOURCE_GROUP not configured"
      echo "ACI_NOT_CONFIGURED"
      exit 0
    fi
    echo "Resource Group: $RESOURCE_GROUP"
    echo ""

    # List all container groups. A failing `az` call is reported as such;
    # treating it as an empty list would make a broken login look healthy.
    if ! CONTAINERS=$(az container list --resource-group "$RESOURCE_GROUP" --output json 2>/dev/null < /dev/null); then
      echo "az container list failed for resource group $RESOURCE_GROUP"
      echo "ACI_LIST_FAILED"
      exit 0
    fi
    [[ -n "$CONTAINERS" ]] || CONTAINERS="[]"

    # Number of container groups whose provisioningState equals $1.
    count_state() {
      echo "$CONTAINERS" | jq --arg state "$1" '[.[] | select(.provisioningState == $state)] | length'
    }

    TOTAL=$(echo "$CONTAINERS" | jq 'length')
    RUNNING=$(count_state Running)
    FAILED=$(count_state Failed)
    SUCCEEDED=$(count_state Succeeded)
    PENDING=$(count_state Pending)
    CREATING=$(count_state Creating)
    DELETING=$(count_state Deleting)

    echo "| Status | Count |"
    echo "|--------|-------|"
    echo "| Total containers | $TOTAL |"
    echo "| Running | $RUNNING |"
    echo "| Succeeded (completed) | $SUCCEEDED |"
    echo "| Failed | $FAILED |"
    echo "| Pending | $PENDING |"
    echo "| Creating | $CREATING |"
    echo "| Deleting | $DELETING |"

    # Show all unique provisioningState values for debugging
    echo ""
    echo "### All Container States"
    echo "$CONTAINERS" | jq -r '[.[] | .provisioningState] | group_by(.) | map({state: .[0], count: length}) | .[] | "- \(.state): \(.count)"'

    # Check for orphaned containers (older than 1 hour)
    # NOTE(review): "Running" may be an instance-view state rather than a
    # provisioningState, and the list call may not return instanceView at
    # all - confirm this filter can match against real `az` output.
    echo ""
    echo "### Containers older than 1 hour"
    ONE_HOUR_AGO=$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-1H +%Y-%m-%dT%H:%M:%SZ)
    # In jq, null sorts before every string, so a missing startTime would
    # compare as "older than the cutoff"; such entries are excluded.
    OLD_CONTAINERS=$(echo "$CONTAINERS" | jq --arg cutoff "$ONE_HOUR_AGO" '[.[] | select(.provisioningState == "Running") | {name: .name, state: .provisioningState, started: .containers[0].instanceView.currentState.startTime} | select(.started != null and .started < $cutoff)]')
    OLD_COUNT=$(echo "$OLD_CONTAINERS" | jq 'length')
    if [[ "$OLD_COUNT" -gt 0 ]]; then
      echo "$OLD_CONTAINERS" | jq -r '.[] | "- \(.name) (started: \(.started))"'
      echo "ORPHANED_CONTAINERS:$OLD_COUNT"
    else
      echo "No orphaned containers found"
    fi

    # List failed containers
    if [[ "$FAILED" -gt 0 ]]; then
      echo ""
      echo "### Failed Containers"
      echo "$CONTAINERS" | jq -r '.[] | select(.provisioningState == "Failed") | "- \(.name): \(.containers[0].instanceView.currentState.detailStatus // "unknown error")"' | head -10
      echo "FAILED_CONTAINERS:$FAILED"
    fi

    # Alert thresholds
    if [[ "$FAILED" -gt 5 ]]; then
      echo "ACI_CRITICAL_FAILURES"
    elif [[ "$FAILED" -gt 0 ]]; then
      echo "ACI_HAS_FAILURES"
    fi
    if [[ "$TOTAL" -gt 50 ]]; then
      echo "ACI_HIGH_COUNT"
    fi
    echo "ACI_CHECK_COMPLETE"
    EOF
    )

    {
      echo '```'
      echo "$ACI_CHECK"
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"

    # Markers are checked from most to least severe; the first match wins.
    if [[ "$ACI_CHECK" == *"SSH_FAILED"* || "$ACI_CHECK" == *"ACI_LIST_FAILED"* ]]; then
      echo "::warning::[$ENV_NAME] ACI container status could not be determined"
      echo "aci_status=unknown" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"ACI_CRITICAL_FAILURES"* ]]; then
      FAILED_COUNT=$(echo "$ACI_CHECK" | grep -m1 "^FAILED_CONTAINERS:" | cut -d: -f2)
      echo "::error::[$ENV_NAME] $FAILED_COUNT ACI containers in Failed state"
      echo "aci_status=critical" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"ORPHANED_CONTAINERS"* ]]; then
      ORPHAN_COUNT=$(echo "$ACI_CHECK" | grep -m1 "^ORPHANED_CONTAINERS:" | cut -d: -f2)
      echo "::warning::[$ENV_NAME] $ORPHAN_COUNT orphaned ACI containers (running > 1 hour)"
      echo "aci_status=orphaned" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"ACI_HAS_FAILURES"* ]]; then
      echo "::warning::[$ENV_NAME] Some ACI containers in Failed state"
      echo "aci_status=warning" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"ACI_HIGH_COUNT"* ]]; then
      echo "::warning::[$ENV_NAME] High number of ACI containers"
      echo "aci_status=high_count" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"AZ_CLI_MISSING"* || "$ACI_CHECK" == *"ACI_NOT_CONFIGURED"* ]]; then
      echo "aci_status=skipped" >> "$GITHUB_OUTPUT"
    else
      echo "aci_status=healthy" >> "$GITHUB_OUTPUT"
    fi
# Reports `df -h` for / and /home/azureuser/server on the VM and warns when
# any listed filesystem is 90% full or more.
# Output: disk_ok = true | false | unknown
#   unknown - the SSH call or df returned nothing, so usage is not known
#             (this case used to be reported as disk_ok=true)
- name: Check Disk Space (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: disk
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_IP: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    echo "## [$ENV_NAME] Disk Space" >> "$GITHUB_STEP_SUMMARY"

    # `tail -n +2` drops df's header row. Any failure leaves the variable
    # empty, which is handled explicitly below.
    DISK_CHECK=$(ssh -o ConnectTimeout=10 "${VM_USER}@${VM_IP}" \
      "df -h / /home/azureuser/server 2>/dev/null | tail -n +2" 2>/dev/null || true)

    {
      echo '```'
      echo "${DISK_CHECK:-unknown}"
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"

    if [[ -z "$DISK_CHECK" ]]; then
      echo "::warning::[$ENV_NAME] Disk usage could not be determined"
      echo "disk_ok=unknown" >> "$GITHUB_OUTPUT"
    # Field 5 of each df row is the Use% column; match 90%-100%.
    elif echo "$DISK_CHECK" | awk '{print $5}' | grep -qE '^(9[0-9]|100)%$'; then
      echo "::warning::[$ENV_NAME] Disk usage above 90%"
      echo "disk_ok=false" >> "$GITHUB_OUTPUT"
    else
      echo "disk_ok=true" >> "$GITHUB_OUTPUT"
    fi
# Appends a one-table recap of every check's step output to the step
# summary. `always()` makes it run even when an earlier step failed, as
# long as this leg was selected to run.
- name: Summary (${{ matrix.env }})
  if: always() && steps.should_run.outputs.run == 'true'
  run: |
    # A single unquoted heredoc replaces the per-line echo calls; the
    # expressions are filled in before the shell sees the script.
    cat >> "$GITHUB_STEP_SUMMARY" <<EOF
    ## [${{ matrix.env }}] Health Check Summary

    | Check | Status |
    |-------|--------|
    | Worker PM2 | ${{ steps.pm2.outputs.worker_status }} |
    | Gateway PM2 | ${{ steps.pm2.outputs.gateway_status }} |
    | Worker HTTP | ${{ steps.endpoints.outputs.worker_health }} |
    | Gateway HTTP | ${{ steps.endpoints.outputs.gateway_health }} |
    | MongoDB | ${{ steps.mongodb.outputs.mongo_healthy }} |
    | Azure ACI | ${{ steps.aci_containers.outputs.aci_status }} |
    | Disk Space | ${{ steps.disk.outputs.disk_ok }} |
    EOF
# Turns critical findings into a failed job. Only the prod leg can fail, and
# only when a health endpoint did not return 200, the MongoDB ping failed,
# or the ACI check reported `critical`. PM2 and disk results are not part
# of this condition.
- name: Fail if Critical Issues (${{ matrix.env }})
  if: >-
    steps.should_run.outputs.run == 'true' &&
    matrix.env == 'prod' &&
    (
    steps.endpoints.outputs.worker_health != '200' ||
    steps.endpoints.outputs.gateway_health != '200' ||
    steps.mongodb.outputs.mongo_healthy == 'false' ||
    steps.aci_containers.outputs.aci_status == 'critical'
    )
  run: |
    # Print the values that triggered the failure, then fail the step.
    cat <<EOF
    Critical health check failures detected on ${{ matrix.env }}!
    Worker Health: ${{ steps.endpoints.outputs.worker_health }}
    Gateway Health: ${{ steps.endpoints.outputs.gateway_health }}
    MongoDB: ${{ steps.mongodb.outputs.mongo_healthy }}
    Azure ACI: ${{ steps.aci_containers.outputs.aci_status }}
    EOF
    exit 1