diff --git a/ansible/inventory-password.yml b/ansible/inventory-password.yml index e674aaa..636a47e 100644 --- a/ansible/inventory-password.yml +++ b/ansible/inventory-password.yml @@ -9,7 +9,7 @@ all: hosts: vcl1: - ansible_host: 152.7.176.221 + ansible_host: 152.7.176.240 ansible_user: sraval ansible_connection: ssh ansible_ssh_common_args: '-o StrictHostKeyChecking=no' @@ -21,7 +21,7 @@ all: ansible_ssh_common_args: '-o StrictHostKeyChecking=no' vcl3: - ansible_host: 152.7.178.104 + ansible_host: 152.7.176.221 ansible_user: sraval ansible_connection: ssh ansible_ssh_common_args: '-o StrictHostKeyChecking=no' diff --git a/ansible/inventory.yml b/ansible/inventory.yml index 0b08853..1863b27 100644 --- a/ansible/inventory.yml +++ b/ansible/inventory.yml @@ -4,7 +4,7 @@ all: hosts: vcl1: - ansible_host: 152.7.176.221 + ansible_host: 152.7.176.240 ansible_user: sraval ansible_ssh_private_key_file: ~/.ssh/id_ed25519 @@ -14,7 +14,7 @@ all: ansible_ssh_private_key_file: ~/.ssh/id_ed25519 vcl3: - ansible_host: 152.7.178.104 + ansible_host: 152.7.176.221 ansible_user: sraval ansible_ssh_private_key_file: ~/.ssh/id_ed25519 diff --git a/ansible/setup-replication.yml b/ansible/setup-replication.yml index 5a8eb13..29474bf 100644 --- a/ansible/setup-replication.yml +++ b/ansible/setup-replication.yml @@ -20,12 +20,15 @@ content: | #!/bin/bash # Sync database from VCL2 to VCL3 + # This stores backups on VCL3 for failover use + # The monitor script will restore when failover is triggered VCL3_HOST="{{ hostvars['vcl3']['ansible_host'] }}" VCL3_USER="{{ ansible_user }}" BACKUP_DIR="/tmp/db-backup" TIMESTAMP=$(date +%Y%m%d_%H%M%S) BACKUP_FILE="coffee_db_${TIMESTAMP}.sql" + DB_NAME="coffee_dev" log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" @@ -33,44 +36,45 @@ log "Starting database replication to VCL3..." - # Create backup directory + # Create backup directory locally (pg_dump output is written here before being copied to VCL3) mkdir -p $BACKUP_DIR # Dump database from VCL2 log "Creating database dump..." 
- if sudo docker exec coffee_db pg_dump -U postgres coffee_dev > "${BACKUP_DIR}/${BACKUP_FILE}"; then + if sudo docker exec coffee_db pg_dump -U postgres ${DB_NAME} > "${BACKUP_DIR}/${BACKUP_FILE}"; then log "Database dump created: ${BACKUP_FILE}" else log "ERROR: Failed to create database dump" exit 1 fi + # Ensure backup directory exists on VCL3 (exit status is not checked here; a failure surfaces in the scp step below) + log "Ensuring backup directory exists on VCL3..." + ssh -o StrictHostKeyChecking=no ${VCL3_USER}@${VCL3_HOST} "mkdir -p ${BACKUP_DIR}" + # Copy to VCL3 log "Copying database dump to VCL3..." if scp -o StrictHostKeyChecking=no "${BACKUP_DIR}/${BACKUP_FILE}" ${VCL3_USER}@${VCL3_HOST}:${BACKUP_DIR}/; then - log "Database dump copied to VCL3" + log "Database dump copied to VCL3: ${BACKUP_DIR}/${BACKUP_FILE}" else log "ERROR: Failed to copy database dump to VCL3" exit 1 fi - # Restore on VCL3 - log "Restoring database on VCL3..." - if ssh -o StrictHostKeyChecking=no ${VCL3_USER}@${VCL3_HOST} \ - "sudo docker exec -i coffee_db psql -U postgres -d coffee_db < ${BACKUP_DIR}/${BACKUP_FILE}"; then - log "Database restored successfully on VCL3" - else - log "WARNING: Database restore on VCL3 had issues (may be expected if DB doesn't exist yet)" - fi + # Note: We don't restore on VCL3 here because: + # 1. VCL3 is in standby mode (no running containers) + # 2. The monitor script will restore from backup during failover + log "Backup stored on VCL3 for failover use" - # Cleanup old backups (keep last 5) + # Cleanup old backups (keep last 5 on both servers) log "Cleaning up old backups..." 
cd $BACKUP_DIR - ls -t coffee_db_*.sql | tail -n +6 | xargs -r rm + ls -t coffee_db_*.sql 2>/dev/null | tail -n +6 | xargs -r rm ssh -o StrictHostKeyChecking=no ${VCL3_USER}@${VCL3_HOST} \ "cd ${BACKUP_DIR} && ls -t coffee_db_*.sql 2>/dev/null | tail -n +6 | xargs -r rm" || true log "Database replication completed successfully" + log "Backup available on VCL3: ${BACKUP_DIR}/${BACKUP_FILE}" - name: Setup SSH key for passwordless replication shell: | diff --git a/ansible/setup-vcl3-monitor.yml b/ansible/setup-vcl3-monitor.yml index 545a0dd..ac90393 100644 --- a/ansible/setup-vcl3-monitor.yml +++ b/ansible/setup-vcl3-monitor.yml @@ -20,12 +20,16 @@ content: | #!/bin/bash # Monitor VCL2 health and activate VCL3 on failure + # Includes database restore from replicated backup and reverse sync on failback VCL2_HOST="{{ hostvars['vcl2']['ansible_host'] }}" + VCL2_USER="{{ ansible_user }}" VCL2_PORT="3000" CHECK_INTERVAL=10 FAIL_THRESHOLD=3 PROJECT_DIR="/home/{{ ansible_user }}/devops-project/coffee_project" + BACKUP_DIR="/tmp/db-backup" + DB_NAME="coffee_dev" fail_count=0 vcl3_active=false @@ -42,37 +46,162 @@ fi } + restore_database() { + log "Looking for database backup to restore..." + + # Find the latest backup file (newest by modification time via ls -t; empty when none exist) + LATEST_BACKUP=$(ls -t ${BACKUP_DIR}/coffee_db_*.sql 2>/dev/null | head -1) + + if [ -n "$LATEST_BACKUP" ] && [ -f "$LATEST_BACKUP" ]; then + log "Found backup: $LATEST_BACKUP" + + # Wait for database to be ready (up to 30 checks, 2s apart; falls through even if it never becomes ready) + log "Waiting for database container to be ready..." + for i in {1..30}; do + if sudo docker exec coffee_db pg_isready -U postgres > /dev/null 2>&1; then + log "Database is ready" + break + fi + sleep 2 + done + + # Drop and recreate database to ensure clean state + log "Preparing database for restore..." 
+ sudo docker exec coffee_db psql -U postgres -c "DROP DATABASE IF EXISTS ${DB_NAME};" 2>/dev/null || true + sudo docker exec coffee_db psql -U postgres -c "CREATE DATABASE ${DB_NAME};" 2>/dev/null || true + + # Restore the backup; ON_ERROR_STOP=1 makes psql exit non-zero on the first SQL error so a failed restore is detected + log "Restoring database from backup..." + if cat "$LATEST_BACKUP" | sudo docker exec -i coffee_db psql -v ON_ERROR_STOP=1 -U postgres -d ${DB_NAME}; then + log "Database restored successfully from backup!" + return 0 + else + log "WARNING: Database restore had issues, app will use migrations" + return 1 + fi + else + log "WARNING: No database backup found in ${BACKUP_DIR}" + log "VCL3 will start with fresh database (migrations/seeds)" + return 1 + fi + } + activate_vcl3() { - log "Activating VCL3..." + log "=========================================" + log "FAILOVER: Activating VCL3..." + log "=========================================" cd $PROJECT_DIR - sudo docker-compose up -d --build - vcl3_active=true - log "VCL3 is now active" + + # Start database container first + log "Starting database container..." + sudo docker compose up -d db + sleep 10 + + # Try to restore from replicated backup + restore_database + + # Start the app container + log "Starting application container..." + sudo docker compose up -d app + + # Wait for app to be ready + log "Waiting for app to be healthy..." + for i in {1..12}; do + if curl -sf http://localhost:3000/coffees > /dev/null 2>&1; then + vcl3_active=true + log "=========================================" + log "VCL3 is now ACTIVE and serving traffic!" + log "=========================================" + return 0 + fi + log "Waiting for app... ($i/12)" + sleep 5 + done + + log "ERROR: App started but not responding on port 3000" + return 1 + } + + sync_database_to_vcl2() { + log "Syncing VCL3 database back to VCL2..." + + TIMESTAMP=$(date +%Y%m%d_%H%M%S) + SYNC_FILE="${BACKUP_DIR}/failback_${TIMESTAMP}.sql" + + # Dump current VCL3 database + log "Creating database dump from VCL3..." 
+ if sudo docker exec coffee_db pg_dump -U postgres ${DB_NAME} > "$SYNC_FILE"; then + log "Database dump created: $SYNC_FILE" + else + log "ERROR: Failed to create database dump" + return 1 + fi + + # Copy to VCL2 + log "Copying database to VCL2..." + if scp -o StrictHostKeyChecking=no "$SYNC_FILE" ${VCL2_USER}@${VCL2_HOST}:${BACKUP_DIR}/; then + log "Database dump copied to VCL2" + else + log "ERROR: Failed to copy database to VCL2" + return 1 + fi + + # Restore on VCL2 (ON_ERROR_STOP=1 makes a failed restore exit non-zero). NOTE(review): DROP DATABASE fails while other sessions are connected, and the VCL2 app is already up at this point - confirm, or use WITH (FORCE) on PostgreSQL 13+ + log "Restoring database on VCL2..." + if ssh -o StrictHostKeyChecking=no ${VCL2_USER}@${VCL2_HOST} \ + "sudo docker exec coffee_db psql -U postgres -c 'DROP DATABASE IF EXISTS ${DB_NAME};' && \ + sudo docker exec coffee_db psql -U postgres -c 'CREATE DATABASE ${DB_NAME};' && \ + cat ${BACKUP_DIR}/failback_${TIMESTAMP}.sql | sudo docker exec -i coffee_db psql -v ON_ERROR_STOP=1 -U postgres -d ${DB_NAME}"; then + log "Database synced to VCL2 successfully!" + return 0 + else + log "ERROR: Failed to restore database on VCL2" + return 1 + fi } deactivate_vcl3() { - log "Deactivating VCL3..." + log "=========================================" + log "FAILBACK: VCL2 is back, deactivating VCL3..." + log "=========================================" + + # First sync database back to VCL2 (best effort: a failed sync is logged and VCL3 is still stopped) + sync_database_to_vcl2 || log "WARNING: Failback sync to VCL2 failed; data written on VCL3 during failover may be missing on VCL2" + + # Now stop VCL3 containers cd $PROJECT_DIR - sudo docker-compose down + sudo docker compose down vcl3_active=false - log "VCL3 is now standby" + + log "=========================================" + log "VCL3 deactivated. VCL2 is PRIMARY again." + log "=========================================" } + # Ensure backup directory exists + mkdir -p $BACKUP_DIR + log "Starting VCL2 health monitor..." 
+ log "Monitoring: http://${VCL2_HOST}:${VCL2_PORT}/coffees" + log "Check interval: ${CHECK_INTERVAL}s, Failure threshold: ${FAIL_THRESHOLD}" while true; do if check_vcl2; then + if [ $fail_count -gt 0 ]; then + log "VCL2 health check passed (recovered from $fail_count failures)" + fi fail_count=0 + if [ "$vcl3_active" = true ]; then - log "VCL2 is healthy again, deactivating VCL3" + log "VCL2 is healthy again, initiating failback..." deactivate_vcl3 fi else fail_count=$((fail_count + 1)) - log "VCL2 health check failed ($fail_count/$FAIL_THRESHOLD)" + log "VCL2 health check FAILED ($fail_count/$FAIL_THRESHOLD)" if [ $fail_count -ge $FAIL_THRESHOLD ] && [ "$vcl3_active" = false ]; then - log "VCL2 is down! Triggering failover to VCL3" + log "VCL2 failure threshold reached! Triggering failover..." activate_vcl3 fi fi diff --git a/run_project_frnd.md b/run_project_frnd.md new file mode 100644 index 0000000..b163ea8 --- /dev/null +++ b/run_project_frnd.md @@ -0,0 +1,113 @@ +# Project Setup Guide for Friends + +Here is the complete guide to run the project setup from scratch. It covers everything from the local setup to running the automation on the VCL machine. + +Before following the steps below, first run `local-bootstrap.sh` from the `scripts` folder on your local machine. + +### **Step 1: Local Setup (On Your Laptop)** + +Before doing anything on the servers, you need to update the configuration files with your specific VCL details. + +1. 
**Open `ansible/inventory.yml`** and update the **IP addresses** and **ansible_user** for all 3 machines: + ```yaml + all: + hosts: + vcl1: + ansible_host: 152.7.176.221 # <--- CHANGE THIS IP + ansible_user: your_unity_id # <--- CHANGE THIS USER + ansible_ssh_private_key_file: ~/.ssh/id_ed25519 + + vcl2: + ansible_host: 152.7.177.180 # <--- CHANGE THIS IP + ansible_user: your_unity_id # <--- CHANGE THIS USER + ansible_ssh_private_key_file: ~/.ssh/id_ed25519 + + vcl3: + ansible_host: 152.7.178.104 # <--- CHANGE THIS IP + ansible_user: your_unity_id # <--- CHANGE THIS USER + ansible_ssh_private_key_file: ~/.ssh/id_ed25519 + ``` + +2. **Open `ansible/inventory-password.yml`** and do the exact same updates: + ```yaml + all: + hosts: + vcl1: + ansible_host: 152.7.176.221 # <--- CHANGE THIS IP + ansible_user: your_unity_id # <--- CHANGE THIS USER + # ... keep the rest as is ... + + vcl2: + ansible_host: 152.7.177.180 # <--- CHANGE THIS IP + ansible_user: your_unity_id # <--- CHANGE THIS USER + + vcl3: + ansible_host: 152.7.178.104 # <--- CHANGE THIS IP + ansible_user: your_unity_id # <--- CHANGE THIS USER + ``` + +3. **Save and Push Changes to GitHub:** + ```bash + git add ansible/inventory.yml ansible/inventory-password.yml + git commit -m "Update VCL IPs and User" + git push + ``` + +--- + +### **Step 2: Server Setup (On VCL 2)** + +We use VCL 2 as the "Control Node" to run Ansible. + +1. **SSH into VCL 2:** + *(Replace `your_unity_id` and the IP with your VCL 2 info)* + ```bash + ssh your_unity_id@152.7.177.180 + ``` + +2. **Clone the Repository:** + ```bash + git clone https://github.ncsu.edu/vpatel29/devops-project.git + cd devops-project + ``` + +3. **Setup Git Credentials (Important!):** + This saves your password so the automation scripts don't get stuck asking for it. 
+ ```bash + # Enable credential caching for 1 hour + git config --global credential.helper cache + git config --global credential.helper 'cache --timeout=3600' + + # Run a pull to trigger the login prompt and save it + git pull + # (Enter your Username and Personal Access Token here) + + # Run pull again to verify it works WITHOUT asking for password + git pull + ``` + +4. **Install Ansible:** + ```bash + sudo apt-get update + sudo apt-get install -y ansible sshpass + ``` + +5. **Run the Setup Script:** + This script will handle SSH keys, firewalls, Docker, and the app deployment automatically. + ```bash + cd ansible + bash SETUP.sh + ``` + * **Note:** When it asks for the "VCL Password", enter the password you use to SSH into the VCL machines. + +--- + +### **Step 3: Verify It Works** + +Once the script finishes successfully: + +1. **Check the Website (Load Balancer):** + ```bash + curl -v http://152.7.176.221 + ``` + *(Replace with your VCL 1 IP. You should see the HTML for the Coffee App)* diff --git a/scripts/local-bootstrap.sh b/scripts/local-bootstrap.sh index 202352f..6199902 100644 --- a/scripts/local-bootstrap.sh +++ b/scripts/local-bootstrap.sh @@ -5,9 +5,9 @@ # Configuration - UPDATE THESE IF NEEDED USER="sraval" -VCL1_IP="152.7.176.221" +VCL1_IP="152.7.176.240" VCL2_IP="152.7.177.180" -VCL3_IP="152.7.178.104" +VCL3_IP="152.7.176.221" echo "==========================================" echo " VCL CONNECTIVITY BOOTSTRAP" diff --git a/status_report_failover_fix.md b/status_report_failover_fix.md new file mode 100644 index 0000000..37bd131 --- /dev/null +++ b/status_report_failover_fix.md @@ -0,0 +1,156 @@ +# Failover Database Fix - Status Report + +## Problem Statement +When VCL2 failed and VCL3 activated during failover, VCL3 was serving **seed data** instead of the replicated production data from VCL2. + +## Root Cause Analysis +1. The monitor script started containers with `docker compose up -d --build` +2. 
This triggered database migrations and seeds instead of restoring from backup +3. The replication script was storing backups on VCL3 but the failover didn't use them + +## Changes Made + +### 1. Fixed Monitor Script (`ansible/setup-vcl3-monitor.yml`) + +**Before:** +```bash +activate_vcl3() { + sudo docker compose up -d --build # Started with migrations/seeds +} +``` + +**After:** +```bash +activate_vcl3() { + # 1. Start database container first + sudo docker compose up -d db + + # 2. Wait for DB to be ready + docker exec coffee_db pg_isready -U postgres + + # 3. Find and restore latest backup + LATEST_BACKUP=$(ls -t /tmp/db-backup/coffee_db_*.sql | head -1) + cat $LATEST_BACKUP | docker exec -i coffee_db psql -U postgres -d coffee_dev + + # 4. Then start app container + sudo docker compose up -d app +} +``` + +### 2. Added Reverse Replication for Failback + +When VCL2 comes back online, the monitor now: +1. **Syncs VCL3 database back to VCL2** (preserves data created during failover) +2. Then deactivates VCL3 + +```bash +deactivate_vcl3() { + # Sync database BACK to VCL2 before stopping + sync_database_to_vcl2 + + # Then stop VCL3 + docker compose down +} +``` + +### 3. 
Fixed Replication Script (`ansible/setup-replication.yml`) + +**Before:** Tried to restore to VCL3's running database (which doesn't exist in standby) + +**After:** Just stores backup on VCL3 for failover use: +- Creates `/tmp/db-backup/coffee_db_YYYYMMDD_HHMMSS.sql` on VCL3 +- Keeps last 5 backups +- Monitor script finds and restores latest during failover + +## Failover Flow (Updated) + +``` +Normal Operation (VCL2 Active): +┌─────────────────────────────────────────────────────────────┐ +│ VCL2 (Primary) VCL3 (Cold Standby) │ +│ ├── App running ├── Containers stopped │ +│ ├── DB running ├── Backup files stored │ +│ └── Serving traffic └── Monitor watching VCL2 │ +│ │ +│ Every 30 min: VCL2 ────backup────> VCL3:/tmp/db-backup/ │ +└─────────────────────────────────────────────────────────────┘ + +Failover (VCL2 Down): +┌─────────────────────────────────────────────────────────────┐ +│ 1. Monitor detects VCL2 is down (3 failed checks) │ +│ 2. Start DB container on VCL3 │ +│ 3. Restore latest backup: coffee_db_*.sql │ +│ 4. Start App container on VCL3 │ +│ 5. VCL3 now serving with PRODUCTION DATA │ +└─────────────────────────────────────────────────────────────┘ + +Failback (VCL2 Returns): +┌─────────────────────────────────────────────────────────────┐ +│ 1. Monitor detects VCL2 is healthy again │ +│ 2. Sync VCL3 database BACK to VCL2 (preserve new data) │ +│ 3. Stop VCL3 containers │ +│ 4. VCL2 now primary again with all data │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Deployment Instructions + +After updating the Ansible files, redeploy: + +```bash +# On your local machine +cd ansible + +# Redeploy replication setup (updates sync script on VCL2) +ansible-playbook -i inventory.yml setup-replication.yml + +# Redeploy monitor (updates failover script on VCL3) +ansible-playbook -i inventory.yml setup-vcl3-monitor.yml +``` + +## Testing the Fix + +1. 
**Ensure backup exists on VCL3:** + ```bash + # On VCL3 + ls -la /tmp/db-backup/ + ``` + +2. **Manually trigger replication (on VCL2):** + ```bash + /home/sraval/scripts/sync-db-to-vcl3.sh + ``` + +3. **Test failover:** + ```bash + # On VCL2 - stop the app + cd ~/devops-project/coffee_project + sudo docker compose down + ``` + +4. **Verify VCL3 has correct data:** + ```bash + # On VCL3 - check if orders/data matches VCL2 + curl http://localhost:3000/coffees + ``` + +5. **Test failback:** + ```bash + # On VCL2 - restart the app + sudo docker compose up -d + ``` + +## Key Files Changed + +| File | Purpose | +|------|---------| +| `ansible/setup-vcl3-monitor.yml` | Monitor script with DB restore on failover | +| `ansible/setup-replication.yml` | Replication script (just stores backup) | + +## What This Fixes + +| Before | After | +|--------|-------| +| VCL3 starts with seed data | VCL3 restores from latest backup | +| Data created during failover is lost | Data synced back to VCL2 on failback | +| Replication tries to restore to stopped DB | Backup stored for failover use |