Merge pull request #195 from mijinummi/feat/188-automated-backup-recovery

Mkalbani · web-flow · commit dc03292606ef · 2026-02-24T11:26:29.000+01:00
Feat/188 automated backup recovery
diff --git a/disaster-recovery.md b/disaster-recovery.md
@@ -0,0 +1,86 @@
+# Disaster Recovery Runbook
+
+## Purpose
+This runbook provides step-by-step procedures for restoring database services in the event of data loss, corruption, or infrastructure failure. It ensures business continuity and compliance with recovery objectives.
+
+---
+
+## Recovery Objectives
+- **RPO (Recovery Point Objective):** ≤ 1 hour (hourly backups + WAL logs).
+- **RTO (Recovery Time Objective):** ≤ 2 hours for full restoration.
+- **Retention:** 30 days of point-in-time recovery (PITR); weekly backups are kept for 6 months.
+
+---
+
+## Recovery Scenarios
+1. **Accidental Data Deletion**
+   - Restore the most recent backup taken before the deletion.
+   - Replay WAL logs up to a point just before the deletion occurred.
+2. **Database Corruption**
+   - Provision new DB instance.
+   - Restore last verified backup.
+   - Apply WAL logs.
+3. **Regional Outage**
+   - Switch to cross-region backup.
+   - Provision DB in secondary region.
+   - Restore backup + WAL logs.
+4. **Security Breach**
+   - Isolate compromised DB.
+   - Restore clean backup.
+   - Rotate credentials and keys.
+
+---
+
+## Recovery Steps
+1. **Identify Incident**
+   - Monitor alerts (backup failures, DB errors).
+   - Confirm scope of outage.
+2. **Provision New Database**
+   - Launch new DB instance in primary or secondary region.
+   - Configure networking and security groups.
+3. **Restore Backup**
+   - Retrieve latest encrypted backup from storage.
+   - Decrypt using KMS key.
+   - Import backup into new DB.
+4. **Apply WAL Logs (PITR)**
+   - Replay logs up to desired timestamp.
+   - Validate consistency.
+5. **Verify Restoration**
+   - Run automated integrity tests.
+   - Validate application connectivity.
+6. **Switch Traffic**
+   - Update connection strings.
+   - Point services to restored DB.
+7. **Post-Recovery Actions**
+   - Document incident.
+   - Notify stakeholders.
+   - Schedule follow-up review.
+
+---
+
+## Monitoring & Alerts
+- **Backup Failures:** Alert via Slack/email.
+- **Restore Failures:** Escalate to DBA team.
+- **Retention Policy:** Automatically delete expired backups and log each deletion.
+
+---
+
+## Testing Schedule
+- **Monthly Restore Drill:** Restore backup into staging DB.
+- **Quarterly Failover Drill:** Simulate regional outage, restore cross-region backup.
+- **Annual Full Audit:** Verify that PITR works across the full 30-day retention window.
+
+---
+
+## Roles & Responsibilities
+- **DBA Team:** Execute recovery steps.
+- **DevOps Team:** Provision infrastructure.
+- **Security Team:** Handle breach scenarios.
+- **Management:** Approve failover decisions.
+
+---
+
+## References
+- Backup Service (`src/backup/backup.service.ts`)
+- Monitoring Dashboard
+- Cloud Storage Policies
diff --git a/src/backup/backup.service.ts b/src/backup/backup.service.ts
@@ -0,0 +1,121 @@
+import { Injectable, Logger } from '@nestjs/common';
+import { Cron } from '@nestjs/schedule';
+import { execFile } from 'child_process';
+import * as crypto from 'crypto';
+import * as fs from 'fs';
+import { pipeline } from 'stream';
+import { promisify } from 'util';
+
+const execFileAsync = promisify(execFile);
+const pipelineAsync = promisify(pipeline);
+
+// Encrypted file layout: [16-byte salt][12-byte IV][ciphertext][16-byte GCM auth tag].
+const CIPHER_ALGORITHM = 'aes-256-gcm';
+const SALT_BYTES = 16;
+const IV_BYTES = 12;
+const KEY_BYTES = 32;
+
+/**
+ * Schedules PostgreSQL dumps and stores them encrypted.
+ *
+ * Each run dumps the `mydb` database with `pg_dump`, encrypts the dump with a
+ * key derived from the `BACKUP_KEY` environment variable and deletes the
+ * plaintext dump. Failures are logged, not thrown, so a failed run never
+ * rejects out of a cron handler.
+ */
+@Injectable()
+export class BackupService {
+  private readonly logger = new Logger(BackupService.name);
+
+  /** Cron entry point, scheduled with the expression `0 * * * *`. */
+  @Cron('0 * * * *') // hourly
+  async hourlyBackup(): Promise<void> {
+    await this.runBackup('hourly');
+  }
+
+  /** Cron entry point, scheduled with the expression `0 0 * * *`. */
+  @Cron('0 0 * * *') // daily
+  async dailyBackup(): Promise<void> {
+    await this.runBackup('daily');
+  }
+
+  /**
+   * Dumps the database, encrypts the dump and removes the plaintext copy.
+   *
+   * The returned promise settles only once the whole run has finished, so
+   * awaiting it really waits for the backup.
+   *
+   * @param type Label embedded in the backup file name.
+   */
+  private async runBackup(type: 'hourly' | 'daily'): Promise<void> {
+    const passphrase = process.env.BACKUP_KEY;
+    if (!passphrase) {
+      this.logger.error('Backup failed: BACKUP_KEY is not set');
+      // trigger alert
+      return;
+    }
+
+    const timestamp = new Date().toISOString();
+    const file = `backup-${type}-${timestamp}.sql`;
+    const encryptedFile = `${file}.enc`;
+
+    try {
+      // execFile hands the arguments straight to pg_dump; no shell parses them.
+      await execFileAsync('pg_dump', ['--file', file, 'mydb']);
+      await this.encryptFile(file, encryptedFile, passphrase);
+
+      this.logger.log(`Backup ${file} completed and encrypted`);
+      // upload to cloud storage here
+    } catch (err: unknown) {
+      const reason = err instanceof Error ? err.message : String(err);
+      this.logger.error(`Backup failed: ${reason}`);
+      // trigger alert
+      await this.removeIfPresent(encryptedFile); // never keep a truncated archive
+    } finally {
+      await this.removeIfPresent(file); // the plaintext dump must not stay on disk
+    }
+  }
+
+  /**
+   * Streams `source` through AES-256-GCM into `target`.
+   *
+   * A fresh random salt and IV are written ahead of the ciphertext and the
+   * authentication tag is appended after it.
+   */
+  private async encryptFile(source: string, target: string, passphrase: string): Promise<void> {
+    const salt = crypto.randomBytes(SALT_BYTES);
+    const iv = crypto.randomBytes(IV_BYTES);
+    const key = await this.deriveKey(passphrase, salt);
+    const cipher = crypto.createCipheriv(CIPHER_ALGORITHM, key, iv);
+
+    const output = fs.createWriteStream(target, { mode: 0o600 });
+    output.write(Buffer.concat([salt, iv]));
+    // Streaming avoids loading the whole dump into memory.
+    await pipelineAsync(fs.createReadStream(source), cipher, output);
+    // The tag is only available after the cipher has been finalised.
+    await fs.promises.appendFile(target, cipher.getAuthTag());
+  }
+
+  /** Derives the 32-byte AES key from the passphrase and salt using scrypt. */
+  private deriveKey(passphrase: string, salt: Buffer): Promise<Buffer> {
+    return new Promise((resolve, reject) => {
+      crypto.scrypt(passphrase, salt, KEY_BYTES, (err, key) => {
+        if (err) {
+          reject(err);
+        } else {
+          resolve(key);
+        }
+      });
+    });
+  }
+
+  /** Deletes `path`; a missing file is fine and other failures are only logged. */
+  private async removeIfPresent(path: string): Promise<void> {
+    try {
+      await fs.promises.rm(path, { force: true });
+    } catch (err: unknown) {
+      const reason = err instanceof Error ? err.message : String(err);
+      this.logger.warn(`Could not remove ${path}: ${reason}`);
+    }
+  }
+}
diff --git a/test/backup/backup.service.spec.ts b/test/backup/backup.service.spec.ts
@@ -0,0 +1,25 @@
+import { BackupService } from '../../src/backup/backup.service';
+
+/**
+ * Checks that each scheduled entry point delegates to the private
+ * `runBackup` with its own label; `runBackup` itself is stubbed out.
+ */
+describe('BackupService', () => {
+  let service: BackupService;
+
+  beforeEach(() => {
+    service = new BackupService();
+  });
+
+  // `service` is read lazily inside each trigger because beforeEach replaces it.
+  const scheduledJobs: Array<[string, () => Promise<void>]> = [
+    ['hourly', () => service.hourlyBackup()],
+    ['daily', () => service.dailyBackup()],
+  ];
+
+  it.each(scheduledJobs)('should run %s backup', async (label, trigger) => {
+    const runBackupSpy = jest.spyOn(service as any, 'runBackup').mockResolvedValue(true);
+    await trigger();
+    expect(runBackupSpy).toHaveBeenCalledWith(label);
+  });
+});