Environment Health Check #729
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hourly (and on-demand) health check of the prod and staging VMs.
#
# Each matrix entry carries the *names* of the repository secrets that
# hold that environment's SSH private key, VM address and VM user; the
# steps resolve them with secrets[matrix.<key>].
name: Environment Health Check

on:
  schedule:
    # Run every hour
    - cron: '0 * * * *'
  workflow_dispatch:
    inputs:
      environment:
        description: 'Environment to check'
        required: false
        default: 'both'
        type: choice
        options:
          - both
          - prod
          - staging

# No step checks out the repository or calls the GitHub API, so the
# automatically provided GITHUB_TOKEN is granted no permissions.
permissions: {}

jobs:
  health-check:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    strategy:
      # A failing environment must not cancel the other one's check.
      fail-fast: false
      matrix:
        include:
          - env: prod
            ssh_key_secret: PROD_SSH_PRIVATE_KEY
            vm_ip_secret: PROD_AZURE_VM_IP
            vm_user_secret: PROD_AZURE_VM_USER
          - env: staging
            ssh_key_secret: STAGING_SSH_PRIVATE_KEY
            vm_ip_secret: STAGING_AZURE_VM_IP
            vm_user_secret: STAGING_AZURE_VM_USER
    steps:
# Gate for the rest of the job. Sets the step output `run`:
#   - any trigger other than workflow_dispatch -> run=true
#   - manual run with "both" or this matrix environment -> run=true
#   - manual run for the other environment -> run=false
- name: Check if should run (${{ matrix.env }})
  id: should_run
  env:
    # Expression values reach the script through the environment rather
    # than being pasted into the shell source, so unexpected characters
    # in them cannot alter the script.
    EVENT_NAME: ${{ github.event_name }}
    SELECTED_ENV: ${{ github.event.inputs.environment }}
    MATRIX_ENV: ${{ matrix.env }}
  run: |
    if [[ "$EVENT_NAME" != "workflow_dispatch" ]] \
      || [[ "$SELECTED_ENV" == "both" ]] \
      || [[ "$SELECTED_ENV" == "$MATRIX_ENV" ]]; then
      echo "run=true" >> "$GITHUB_OUTPUT"
    else
      echo "run=false" >> "$GITHUB_OUTPUT"
      echo "Skipping $MATRIX_ENV - user selected $SELECTED_ENV"
    fi
# Writes this environment's private key to ~/.ssh/id_rsa and appends
# the VM's scanned host key to ~/.ssh/known_hosts for the later ssh calls.
- name: Setup SSH
  if: steps.should_run.outputs.run == 'true'
  env:
    # Secrets are passed as environment variables instead of being
    # interpolated into the script text.
    SSH_PRIVATE_KEY: ${{ secrets[matrix.ssh_key_secret] }}
    VM_HOST: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    # Create the directory and key file owner-only from the start, so
    # the key is never on disk with a wider mode before the chmod.
    umask 077
    mkdir -p ~/.ssh
    printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
    chmod 600 ~/.ssh/id_rsa
    # NOTE(review): the host key is re-scanned and trusted on every run;
    # pinning a known host key would detect a swapped host - confirm
    # whether that is wanted here.
    ssh-keyscan -H "$VM_HOST" >> ~/.ssh/known_hosts 2>/dev/null
# Reads `pm2 jlist` from the VM, publishes a redacted process table to
# the step summary, and exposes the status of dw_worker and dw_gateway
# as the step outputs worker_status / gateway_status.
# A process that is not online only produces an ::error:: annotation;
# this step itself does not exit non-zero for it.
- name: Check PM2 Processes (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: pm2
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_HOST: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    # Prints a single-line status for the PM2 process named $1:
    #   - "unknown" when the process is absent or the JSON is unusable
    #   - the status itself when all instances agree (e.g. "online")
    #   - a comma-joined list when instances differ (never "online")
    # Keeping it on one line matters: a multi-line value would corrupt
    # the key=value format of $GITHUB_OUTPUT.
    pm2_status() {
      jq -r --arg name "$1" '
        [.[] | select(.name == $name) | (.pm2_env.status // "unknown")]
        | if length == 0 then "unknown" else (unique | join(",")) end
      ' <<<"$PM2_STATUS" 2>/dev/null || echo "unknown"
    }

    # Fall back to an empty list when ssh or pm2 fails. The fallback
    # replaces the value instead of being appended to partial output.
    PM2_STATUS=$(ssh -o ConnectTimeout=10 "$VM_USER@$VM_HOST" \
      "source ~/.nvm/nvm.sh && pm2 jlist" 2>/dev/null) || PM2_STATUS="[]"

    {
      echo "## [$ENV_NAME] PM2 Process Status"
      # Only show safe fields (name, status, uptime, restarts) - NOT environment variables
      echo '```json'
      jq '[.[] | {name: .name, status: .pm2_env.status, uptime: .pm2_env.pm_uptime, restarts: .pm2_env.restart_time}]' \
        <<<"$PM2_STATUS" 2>/dev/null || echo "[]"
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"

    WORKER_STATUS=$(pm2_status dw_worker)
    GATEWAY_STATUS=$(pm2_status dw_gateway)
    echo "worker_status=$WORKER_STATUS" >> "$GITHUB_OUTPUT"
    echo "gateway_status=$GATEWAY_STATUS" >> "$GITHUB_OUTPUT"

    if [[ "$WORKER_STATUS" != "online" ]]; then
      echo "::error::[$ENV_NAME] dw_worker is not online (status: $WORKER_STATUS)"
    fi
    if [[ "$GATEWAY_STATUS" != "online" ]]; then
      echo "::error::[$ENV_NAME] dw_gateway is not online (status: $GATEWAY_STATUS)"
    fi
# Requests /health on ports 9001 and 9100 from inside the VM (over ssh)
# and exposes the HTTP status codes as the step outputs worker_health /
# gateway_health. A non-200 code only produces an ::error:: annotation
# here; the "Fail if Critical Issues" step decides whether the job fails.
- name: Check Health Endpoints (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: endpoints
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_HOST: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    # Prints the three-digit HTTP status of http://localhost:$1/health
    # as seen from the VM, or "000" when no status could be obtained.
    #
    # curl itself prints 000 and exits non-zero when it cannot connect,
    # so the fallback must replace the captured value: appending a
    # second "000" inside the substitution would report "000000".
    # --max-time bounds a service that accepts the connection but never
    # answers, which --connect-timeout alone does not cover.
    probe() {
      local code
      code=$(ssh -o ConnectTimeout=10 "$VM_USER@$VM_HOST" \
        "curl -s -o /dev/null -w '%{http_code}' --connect-timeout 5 --max-time 10 http://localhost:$1/health" \
        2>/dev/null) || true
      [[ "$code" =~ ^[0-9]{3}$ ]] || code="000"
      echo "$code"
    }

    WORKER_HEALTH=$(probe 9001)
    GATEWAY_HEALTH=$(probe 9100)

    {
      echo "## [$ENV_NAME] Health Endpoints"
      echo "| Service | Port | HTTP Status |"
      echo "|---------|------|-------------|"
      echo "| rust_service | 9001 | $WORKER_HEALTH |"
      echo "| gateway | 9100 | $GATEWAY_HEALTH |"
    } >> "$GITHUB_STEP_SUMMARY"

    echo "worker_health=$WORKER_HEALTH" >> "$GITHUB_OUTPUT"
    echo "gateway_health=$GATEWAY_HEALTH" >> "$GITHUB_OUTPUT"

    if [[ "$WORKER_HEALTH" != "200" ]]; then
      echo "::error::[$ENV_NAME] rust_service health check failed (HTTP $WORKER_HEALTH)"
    fi
    if [[ "$GATEWAY_HEALTH" != "200" ]]; then
      echo "::error::[$ENV_NAME] gateway health check failed (HTTP $GATEWAY_HEALTH)"
    fi
# Runs a script on the VM that reads the service's .env file and, when
# MONGODB_URI is set and a mongosh/mongo client exists, pings MongoDB.
# The script reports its result with one marker line (MONGO_HEALTHY,
# MONGO_UNHEALTHY, MONGO_SKIPPED or MONGO_NOT_CONFIGURED), which is
# mapped to the step output mongo_healthy = true | false | skipped.
- name: Check MongoDB Connection (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: mongodb
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_HOST: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    echo "## [$ENV_NAME] MongoDB Status" >> "$GITHUB_STEP_SUMMARY"
    # The heredoc delimiter is quoted, so nothing below is expanded on
    # the runner; the text is executed as-is by the remote shell.
    MONGO_CHECK=$(ssh -o ConnectTimeout=10 "$VM_USER@$VM_HOST" << 'EOF'
    ENV_FILE="/home/azureuser/server/DoWhiz/DoWhiz_service/.env"

    # The pipeline's exit status is cut's, so a missing key shows up as
    # an empty value, not as a failure; default it explicitly.
    STORAGE_BACKEND=$(grep -E "^STORAGE_BACKEND=" "$ENV_FILE" 2>/dev/null | cut -d= -f2)
    echo "Storage Backend: ${STORAGE_BACKEND:-unknown}"

    if ! grep -qE "^MONGODB_URI=" "$ENV_FILE" 2>/dev/null; then
      echo "MongoDB URI: not configured in $ENV_FILE"
      echo "MONGO_NOT_CONFIGURED"
      exit 0
    fi
    echo "MongoDB URI: configured"
    # -f2- keeps everything after the first '=', so '=' inside the URI
    # (query parameters) survives; surrounding double quotes are removed.
    MONGODB_URI=$(grep -E "^MONGODB_URI=" "$ENV_FILE" | cut -d= -f2- | tr -d '"')

    # Prefer mongosh, fall back to the legacy mongo shell.
    MONGO_CLIENT=""
    for candidate in mongosh mongo; do
      if command -v "$candidate" &>/dev/null; then
        MONGO_CLIENT="$candidate"
        break
      fi
    done

    if [[ -z "$MONGO_CLIENT" ]]; then
      echo "MongoDB Connection: skipped (no mongosh/mongo client)"
      echo "MONGO_SKIPPED"
    elif "$MONGO_CLIENT" "$MONGODB_URI" --eval "db.runCommand({ping:1})" --quiet 2>/dev/null | grep -q "ok"; then
      echo "MongoDB Connection: OK"
      echo "MONGO_HEALTHY"
    else
      echo "MongoDB Connection: FAILED"
      echo "MONGO_UNHEALTHY"
    fi
    EOF
    )

    {
      echo '```'
      echo "$MONGO_CHECK"
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"

    if [[ "$MONGO_CHECK" == *"MONGO_UNHEALTHY"* ]]; then
      echo "::error::[$ENV_NAME] MongoDB connection failed"
      echo "mongo_healthy=false" >> "$GITHUB_OUTPUT"
    elif [[ "$MONGO_CHECK" == *"MONGO_HEALTHY"* ]]; then
      echo "mongo_healthy=true" >> "$GITHUB_OUTPUT"
    else
      # MONGO_SKIPPED, MONGO_NOT_CONFIGURED, or no marker at all.
      echo "mongo_healthy=skipped" >> "$GITHUB_OUTPUT"
    fi
# Runs a script on the VM that lists the Azure container groups of the
# resource group named in the service's .env file and prints counts per
# provisioningState plus marker lines. The markers are mapped to the
# step output aci_status, in this priority order:
#   critical   - more than 5 groups in Failed state (::error::)
#   orphaned   - Running groups started more than 1 hour ago (::warning::)
#   warning    - 1 to 5 groups in Failed state (::warning::)
#   high_count - more than 50 groups in total (::warning::)
#   unknown    - `az container list` itself failed (::warning::)
#   skipped    - az CLI missing or resource group not configured
#   healthy    - none of the above
- name: Check Azure ACI Containers (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: aci_containers
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_HOST: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    echo "## [$ENV_NAME] Azure ACI Container Status" >> "$GITHUB_STEP_SUMMARY"
    # The heredoc delimiter is quoted, so nothing below is expanded on
    # the runner; the text is executed as-is by the remote shell.
    ACI_CHECK=$(ssh -o ConnectTimeout=30 "$VM_USER@$VM_HOST" << 'EOF'
    # Check if az CLI is available
    if ! command -v az &>/dev/null; then
      echo "Azure CLI not installed"
      echo "AZ_CLI_MISSING"
      exit 0
    fi

    # Get resource group from env
    ENV_FILE="/home/azureuser/server/DoWhiz/DoWhiz_service/.env"
    RESOURCE_GROUP=$(grep -E "^RUN_TASK_AZURE_ACI_RESOURCE_GROUP=" "$ENV_FILE" 2>/dev/null | cut -d= -f2 | tr -d '"')
    if [[ -z "$RESOURCE_GROUP" ]]; then
      echo "RUN_TASK_AZURE_ACI_RESOURCE_GROUP not configured"
      echo "ACI_NOT_CONFIGURED"
      exit 0
    fi
    echo "Resource Group: $RESOURCE_GROUP"
    echo ""

    # List all container groups. A failing az call is reported as such
    # instead of being treated as "zero containers", which would read
    # as a healthy result.
    if ! CONTAINERS=$(az container list --resource-group "$RESOURCE_GROUP" --output json 2>/dev/null); then
      echo "az container list failed"
      echo "ACI_LIST_FAILED"
      exit 0
    fi

    # Prints how many container groups have provisioningState == $1.
    count_state() {
      jq --arg state "$1" '[.[] | select(.provisioningState == $state)] | length' <<<"$CONTAINERS"
    }

    TOTAL=$(jq 'length' <<<"$CONTAINERS")
    RUNNING=$(count_state Running)
    FAILED=$(count_state Failed)
    SUCCEEDED=$(count_state Succeeded)
    PENDING=$(count_state Pending)
    CREATING=$(count_state Creating)
    DELETING=$(count_state Deleting)

    echo "| Status | Count |"
    echo "|--------|-------|"
    echo "| Total containers | $TOTAL |"
    echo "| Running | $RUNNING |"
    echo "| Succeeded (completed) | $SUCCEEDED |"
    echo "| Failed | $FAILED |"
    echo "| Pending | $PENDING |"
    echo "| Creating | $CREATING |"
    echo "| Deleting | $DELETING |"

    # Show all unique provisioningState values for debugging
    echo ""
    echo "### All Container States"
    jq -r '[.[] | .provisioningState] | group_by(.) | map({state: .[0], count: length}) | .[] | "- \(.state): \(.count)"' <<<"$CONTAINERS"

    # Check for orphaned containers (older than 1 hour)
    echo ""
    echo "### Containers older than 1 hour"
    # GNU date syntax first, BSD date syntax as the fallback.
    ONE_HOUR_AGO=$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-1H +%Y-%m-%dT%H:%M:%SZ)
    # Timestamps are compared as strings. A missing startTime is
    # excluded explicitly: in jq null sorts before every string, so it
    # would otherwise always count as "older than the cutoff".
    # NOTE(review): this relies on `az container list` reporting
    # provisioningState "Running" and a populated instanceView -
    # confirm against real output.
    OLD_CONTAINERS=$(jq --arg cutoff "$ONE_HOUR_AGO" '
      [ .[]
        | select(.provisioningState == "Running")
        | .containers[0].instanceView.currentState.startTime as $started
        | select($started != null and $started < $cutoff)
        | {name: .name, state: .provisioningState, started: $started}
      ]' <<<"$CONTAINERS")
    OLD_COUNT=$(jq 'length' <<<"$OLD_CONTAINERS")
    if [[ "$OLD_COUNT" -gt 0 ]]; then
      jq -r '.[] | "- \(.name) (started: \(.started))"' <<<"$OLD_CONTAINERS"
      echo "ORPHANED_CONTAINERS:$OLD_COUNT"
    else
      echo "No orphaned containers found"
    fi

    # List failed containers (at most 10 lines)
    if [[ "$FAILED" -gt 0 ]]; then
      echo ""
      echo "### Failed Containers"
      jq -r '.[] | select(.provisioningState == "Failed") | "- \(.name): \(.containers[0].instanceView.currentState.detailStatus // "unknown error")"' <<<"$CONTAINERS" | head -10
      echo "FAILED_CONTAINERS:$FAILED"
    fi

    # Alert thresholds
    if [[ "$FAILED" -gt 5 ]]; then
      echo "ACI_CRITICAL_FAILURES"
    elif [[ "$FAILED" -gt 0 ]]; then
      echo "ACI_HAS_FAILURES"
    fi
    if [[ "$TOTAL" -gt 50 ]]; then
      echo "ACI_HIGH_COUNT"
    fi
    echo "ACI_CHECK_COMPLETE"
    EOF
    )

    {
      echo '```'
      echo "$ACI_CHECK"
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"

    # Only the first matching marker decides the status.
    if [[ "$ACI_CHECK" == *"ACI_CRITICAL_FAILURES"* ]]; then
      FAILED_COUNT=$(echo "$ACI_CHECK" | grep "FAILED_CONTAINERS:" | cut -d: -f2)
      echo "::error::[$ENV_NAME] $FAILED_COUNT ACI containers in Failed state"
      echo "aci_status=critical" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"ORPHANED_CONTAINERS"* ]]; then
      ORPHAN_COUNT=$(echo "$ACI_CHECK" | grep "ORPHANED_CONTAINERS:" | cut -d: -f2)
      echo "::warning::[$ENV_NAME] $ORPHAN_COUNT orphaned ACI containers (running > 1 hour)"
      echo "aci_status=orphaned" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"ACI_HAS_FAILURES"* ]]; then
      echo "::warning::[$ENV_NAME] Some ACI containers in Failed state"
      echo "aci_status=warning" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"ACI_HIGH_COUNT"* ]]; then
      echo "::warning::[$ENV_NAME] High number of ACI containers"
      echo "aci_status=high_count" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"ACI_LIST_FAILED"* ]]; then
      echo "::warning::[$ENV_NAME] Could not list ACI containers"
      echo "aci_status=unknown" >> "$GITHUB_OUTPUT"
    elif [[ "$ACI_CHECK" == *"AZ_CLI_MISSING"* ]] || [[ "$ACI_CHECK" == *"ACI_NOT_CONFIGURED"* ]]; then
      echo "aci_status=skipped" >> "$GITHUB_OUTPUT"
    else
      echo "aci_status=healthy" >> "$GITHUB_OUTPUT"
    fi
# Reads `df` for / and /home/azureuser/server from the VM and sets the
# step output disk_ok:
#   false   - some listed filesystem is at 90% use or more (::warning::)
#   unknown - the df output could not be obtained (::warning::)
#   true    - otherwise
- name: Check Disk Space (${{ matrix.env }})
  if: steps.should_run.outputs.run == 'true'
  id: disk
  env:
    ENV_NAME: ${{ matrix.env }}
    VM_USER: ${{ secrets[matrix.vm_user_secret] }}
    VM_HOST: ${{ secrets[matrix.vm_ip_secret] }}
  run: |
    # -P keeps each filesystem on one line, so the use% value is always
    # the 5th column even when the device name is long.
    DISK_CHECK=$(ssh -o ConnectTimeout=10 "$VM_USER@$VM_HOST" \
      "df -hP / /home/azureuser/server 2>/dev/null | tail -n +2" 2>/dev/null) || DISK_CHECK=""
    if [[ -z "$DISK_CHECK" ]]; then
      DISK_CHECK="unknown"
    fi

    {
      echo "## [$ENV_NAME] Disk Space"
      echo '```'
      echo "$DISK_CHECK"
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"

    if [[ "$DISK_CHECK" == "unknown" ]]; then
      # A check that could not run must not be reported as "ok".
      echo "::warning::[$ENV_NAME] Disk usage could not be determined"
      echo "disk_ok=unknown" >> "$GITHUB_OUTPUT"
    elif awk '{print $5}' <<<"$DISK_CHECK" | grep -qE "^(9[0-9]|100)%"; then
      echo "::warning::[$ENV_NAME] Disk usage above 90%"
      echo "disk_ok=false" >> "$GITHUB_OUTPUT"
    else
      echo "disk_ok=true" >> "$GITHUB_OUTPUT"
    fi
# Appends a one-table overview of every check's output to the step
# summary. `always()` makes it run even after an earlier step failed.
- name: Summary (${{ matrix.env }})
  if: always() && steps.should_run.outputs.run == 'true'
  env:
    # Step outputs are passed through the environment instead of being
    # pasted into the shell source; some of them originate from data
    # returned by the VM.
    ENV_NAME: ${{ matrix.env }}
    WORKER_PM2: ${{ steps.pm2.outputs.worker_status }}
    GATEWAY_PM2: ${{ steps.pm2.outputs.gateway_status }}
    WORKER_HTTP: ${{ steps.endpoints.outputs.worker_health }}
    GATEWAY_HTTP: ${{ steps.endpoints.outputs.gateway_health }}
    MONGO: ${{ steps.mongodb.outputs.mongo_healthy }}
    ACI: ${{ steps.aci_containers.outputs.aci_status }}
    DISK: ${{ steps.disk.outputs.disk_ok }}
  run: |
    # A check that was skipped or failed before setting its output has
    # an empty value; show "n/a" rather than a blank cell.
    {
      echo "## [$ENV_NAME] Health Check Summary"
      echo ""
      echo "| Check | Status |"
      echo "|-------|--------|"
      echo "| Worker PM2 | ${WORKER_PM2:-n/a} |"
      echo "| Gateway PM2 | ${GATEWAY_PM2:-n/a} |"
      echo "| Worker HTTP | ${WORKER_HTTP:-n/a} |"
      echo "| Gateway HTTP | ${GATEWAY_HTTP:-n/a} |"
      echo "| MongoDB | ${MONGO:-n/a} |"
      echo "| Azure ACI | ${ACI:-n/a} |"
      echo "| Disk Space | ${DISK:-n/a} |"
    } >> "$GITHUB_STEP_SUMMARY"
# Turns critical findings into a failed job. The condition only holds
# for the prod matrix entry, and only when a health endpoint did not
# return 200, MongoDB was reported unhealthy, or ACI was rated critical.
- name: Fail if Critical Issues (${{ matrix.env }})
  if: |
    steps.should_run.outputs.run == 'true' &&
    matrix.env == 'prod' && (
      steps.endpoints.outputs.worker_health != '200' ||
      steps.endpoints.outputs.gateway_health != '200' ||
      steps.mongodb.outputs.mongo_healthy == 'false' ||
      steps.aci_containers.outputs.aci_status == 'critical'
    )
  env:
    ENV_NAME: ${{ matrix.env }}
    WORKER_HTTP: ${{ steps.endpoints.outputs.worker_health }}
    GATEWAY_HTTP: ${{ steps.endpoints.outputs.gateway_health }}
    MONGO: ${{ steps.mongodb.outputs.mongo_healthy }}
    ACI: ${{ steps.aci_containers.outputs.aci_status }}
  run: |
    # Log the values that triggered the failure, then fail the job.
    printf '%s\n' \
      "Critical health check failures detected on $ENV_NAME!" \
      "Worker Health: $WORKER_HTTP" \
      "Gateway Health: $GATEWAY_HTTP" \
      "MongoDB: $MONGO" \
      "Azure ACI: $ACI"
    exit 1