diff --git a/apps/frontend/src/app/components/ui/ImageWithFallback.tsx b/apps/frontend/src/app/components/ui/ImageWithFallback.tsx index 0e26139..34a3a40 100644 --- a/apps/frontend/src/app/components/ui/ImageWithFallback.tsx +++ b/apps/frontend/src/app/components/ui/ImageWithFallback.tsx @@ -1,3 +1,4 @@ +/* eslint-disable @next/next/no-img-element */ import React, { useState } from 'react' const ERROR_IMG_SRC = diff --git a/apps/frontend/src/app/dashboard/page.tsx b/apps/frontend/src/app/dashboard/page.tsx index ef7b64e..896036e 100644 --- a/apps/frontend/src/app/dashboard/page.tsx +++ b/apps/frontend/src/app/dashboard/page.tsx @@ -11,6 +11,7 @@ import { CheckCircle2, AlertCircle, } from 'lucide-react'; +import { useState } from 'react'; const stats = [ { @@ -98,6 +99,12 @@ const transactions = [ ]; export default function OverviewPage() { + // Fixed: Use stable widths instead of Math.random + const [assetWidths] = useState(() => assets.map((_, index) => { + const widths = [85, 72, 64]; + return widths[index % widths.length]; + })); + return (
{/* Header */} @@ -211,7 +218,7 @@ export default function OverviewPage() {
@@ -371,4 +378,4 @@ export default function OverviewPage() { ); -} +} \ No newline at end of file diff --git a/apps/frontend/src/app/dashboard/subscriptions/page.tsx b/apps/frontend/src/app/dashboard/subscriptions/page.tsx index 6c658f7..8d01739 100644 --- a/apps/frontend/src/app/dashboard/subscriptions/page.tsx +++ b/apps/frontend/src/app/dashboard/subscriptions/page.tsx @@ -1,7 +1,7 @@ 'use client'; -import { motion } from 'motion/react'; -import { Plus, CheckCircle2, AlertCircle } from 'lucide-react'; +import { motion } from "motion/react"; +import { Plus, CheckCircle2, AlertCircle } from "lucide-react"; const subscriptions = [ { @@ -92,7 +92,7 @@ export default function SubscriptionsPage() { Interval Status Next Billing - + {subscriptions.map((sub, index) => ( @@ -135,4 +135,4 @@ export default function SubscriptionsPage() { ); -} +} \ No newline at end of file diff --git a/apps/frontend/src/app/dashboard/treasury/page.tsx b/apps/frontend/src/app/dashboard/treasury/page.tsx index 65934a3..c3b0d7c 100644 --- a/apps/frontend/src/app/dashboard/treasury/page.tsx +++ b/apps/frontend/src/app/dashboard/treasury/page.tsx @@ -1,7 +1,7 @@ 'use client'; -import { motion } from 'motion/react'; -import { Coins, TrendingUp, Eye, CheckCircle2 } from 'lucide-react'; +import { motion } from "motion/react"; +import { Coins, TrendingUp, Eye, CheckCircle2 } from "lucide-react"; const mirrorAssets = [ { @@ -221,4 +221,4 @@ export default function TreasuryPage() { ); -} +} \ No newline at end of file diff --git a/apps/frontend/src/app/dashboard/webhooks/page.tsx b/apps/frontend/src/app/dashboard/webhooks/page.tsx index b42135c..cacff2f 100644 --- a/apps/frontend/src/app/dashboard/webhooks/page.tsx +++ b/apps/frontend/src/app/dashboard/webhooks/page.tsx @@ -1,7 +1,7 @@ 'use client'; -import { motion } from 'motion/react'; -import { Plus, AlertCircle, Activity } from 'lucide-react'; +import { motion } from "motion/react"; +import { Plus, AlertCircle, Activity } from "lucide-react"; const webhooks = [ { @@ -88,7 +88,7 @@ export default function WebhooksPage() { Status Last Triggered Proof Hash - + {webhooks.map((webhook, index) => ( @@ -143,4 +143,4 @@ export default function WebhooksPage() { ); -} +} \ No newline at end of file diff --git a/docs/disaster-recovery.md b/docs/disaster-recovery.md new file mode 100644 index 0000000..bc41083 --- /dev/null +++ b/docs/disaster-recovery.md @@ -0,0 +1,242 @@ +# Disaster recovery — Stellar Pay payment gateway + +This document defines backup and restore procedures, operational runbooks for common failures, database point-in-time recovery (PITR), and cross-region replication patterns for the payment gateway. It is written for operators and incident commanders. Replace placeholder values (regions, ARNs, contact lists) with your production configuration. + +## 1. Objectives and scope + +| Objective | Typical target (set per environment) | +|-----------|--------------------------------------| +| **RPO** (Recovery Point Objective) | Minutes for transactional data if PITR is enabled; align with business and regulatory requirements. | +| **RTO** (Recovery Time Objective) | Time to restore service in a secondary region or from backup; document per tier (API vs database). | + +**In scope:** PostgreSQL (application state), Redis (ephemeral/session/rate-limit state), API and worker compute, secrets, webhook delivery state, treasury and Stellar-related configuration. + +**Out of scope (by design):** The Stellar public ledger is the canonical source for on-chain settlement. Recovery focuses on **your** databases, caches, and application tier. On-chain history is verified via Horizon and Soroban RPC. + +## 2. Backup procedures + +### 2.1 PostgreSQL (primary system of record) + +Use one or more of the following, depending on hosting: + +| Method | When to use | Notes | +|--------|-------------|--------| +| **Automated snapshots** | Managed Postgres (RDS, Cloud SQL, Azure Database, etc.) | Enable retention policy; encrypt at rest; tag snapshots by environment. | +| **Logical dumps** (`pg_dump`) | Portable exports, smaller environments, schema migrations | Schedule off-peak; store encrypted objects in object storage with versioning. | +| **Physical base backup + WAL** | Self-managed Postgres, strict RPO | Use `pg_basebackup` plus continuous archiving (WAL) to object storage for PITR (see section 4). | + +**Minimum practice:** + +1. **Daily** full or incremental snapshot (managed) or logical dump to immutable storage. +2. **Retain** backups per compliance (e.g. 7–35 days online, longer in cold storage). +3. **Test restores** at least quarterly to a non-production cluster. +4. **Protect** backup credentials; separate from production DB credentials. + +**Connection string:** Application uses `DATABASE_URL` (see `apps/api/.env.example`). Document the actual endpoint and database name in your internal runbook inventory (not in this repo). + +### 2.2 Redis + +Redis often holds **non-authoritative** data (sessions, throttles, caches). Treat as rebuildable unless you store payment-critical queues only in Redis. + +| Method | Notes | +|--------|--------| +| **RDB snapshots / AOF** | Enable per Redis vendor guidance; replicate to a standby for faster failover. | +| **Cross-AZ replica** | Reduces blast radius of AZ failure. | + +If Redis loss is acceptable at RPO=0 for cache only, document **cold start**: empty cache, users may need to re-authenticate; replay idempotent webhooks from your DB if applicable. + +### 2.3 Secrets and configuration + +| Asset | Backup approach | +|-------|-----------------| +| JWT signing keys (`JWT_SECRET`) | Store in KMS/Secrets Manager; rotation procedure in section 5. | +| Treasury and Stellar config | Version-controlled infra-as-code where safe; never commit private keys. | +| Webhook signing secrets | Same as above; rotate if compromise suspected. | + +### 2.4 Application artifacts + +- **Container images:** Immutable tags in a registry; reproducible builds from CI. +- **Infrastructure:** Terraform/Pulumi/Kubernetes manifests in Git with reviewed changes. + +## 3. Restore procedures + +### 3.1 Restore PostgreSQL from snapshot (managed) + +1. Create a **new** DB instance from snapshot or PITR restore (section 4) in the target subnet/security group. +2. Run **migrations** if the restored instance is behind the expected schema (use your migration tool against the new endpoint). +3. Update **`DATABASE_URL`** (or equivalent secret) for the API and workers. +4. **Scale down** old writers if promoting a new primary to avoid split-brain (only one writer). +5. Run **smoke tests**: health checks, read/write probe, payment intent creation in staging first. + +### 3.2 Restore from logical dump + +1. Provision empty Postgres with correct version and extensions. +2. `pg_restore` or `psql` import from encrypted dump. +3. Verify row counts, checksums, or application-level reconciliation reports. +4. Point application to new URL; migrate traffic (blue/green or DNS). + +### 3.3 Restore Redis + +1. Restore from RDB/AOF if you rely on persistence; otherwise **empty** Redis and let the application repopulate cache. +2. Invalidate any stale rate-limit keys if IP or user identity semantics changed during failover. + +### 3.4 Restore API tier + +1. Deploy the same **image tag** that passed last successful production deploy (or current known-good). +2. Confirm **environment variables** and secrets match the restored database and Redis endpoints. +3. Gradually shift traffic behind load balancer; watch error rates and latency (see monitoring stack under `monitoring/` if deployed). + +## 4. Database point-in-time recovery (PITR) + +PITR lets you recover to a **specific timestamp** before a bad migration, accidental delete, or corruption discovery. + +### 4.1 Managed PostgreSQL (recommended pattern) + +Examples (concepts apply across clouds): + +- **AWS RDS / Aurora PostgreSQL:** Enable **automated backups** and set **backup retention**; use **Restore to point in time** or **latest restorable time** from the console/CLI. Create a **new** instance from that restore; validate; then cut over `DATABASE_URL`. +- **Google Cloud SQL:** Enable **point-in-time recovery** with transaction logs; restore to an instance at a chosen timestamp. +- **Azure Database for PostgreSQL:** Use **point-in-time restore** to a new server. + +**Operational steps:** + +1. Record the **target recovery time (UTC)** agreed with stakeholders (before the incident). +2. Restore to a **new** instance name to avoid overwriting the current primary until validated. +3. Run **application and data validation** (queries, reconciliation with Stellar for recent intents if applicable). +4. If valid, **promote** by updating connection strings and deprovisioning the bad instance only after retention policy allows. + +### 4.2 Self-hosted PostgreSQL + +1. **Continuous archiving:** Archive WAL segments to durable storage (S3-compatible bucket, etc.). +2. **Base backup:** Periodic `pg_basebackup` (or equivalent) stored alongside WAL path. +3. **Recovery:** Use `recovery_target_time` in `recovery.signal` / `postgresql.auto.conf` (version-dependent) to recover to a timestamp; start Postgres and verify. + +Document your exact `postgresql.conf` and recovery steps in internal infrastructure docs; keep this file as the **gateway-level** procedure reference. + +### 4.3 Retention and compliance + +- Align backup retention with **PCI**, **SOC2**, and local regulations if you process cardholder or sensitive data. +- Encrypt backups **in transit and at rest**. + +## 5. Cross-region replication and failover + +Cross-region setup reduces risk of a full regional outage affecting both API and database. + +### 5.1 Patterns + +| Pattern | Description | +|---------|-------------| +| **Read replica in secondary region** | Managed Postgres cross-region read replica; **promote** replica to standalone primary on regional failure (manual or automated per vendor). | +| **Active-passive stack** | Secondary region has warm or cold API tier; DNS or global load balancer points to primary; failover updates DNS/weights to secondary after DB promotion. | +| **Multi-region writes** | Complex; usually avoided for payment state unless you have strong conflict resolution; prefer single primary per shard. | + +### 5.2 Configuration checklist (secondary region) + +1. **Networking:** VPC peering or private connectivity between regions if required; security groups allow only gateway subnets. +2. **Database:** Create cross-region **read replica** from primary; monitor replication lag (alert if lag exceeds SLO). +3. **Secrets:** Replicate or reference same KMS/Secrets Manager **multi-region keys** where supported, or maintain secondary secrets with rotation procedures. +4. **Application:** Same container image; different `DATABASE_URL` (after promotion), `REDIS_URL`, and possibly Stellar Horizon URL if using region-specific endpoints (usually global). +5. **DNS / traffic:** Health checks on primary; runbook to lower TTL before planned failover; update global load balancer or DNS to secondary. + +### 5.3 Failover order (high level) + +1. **Confirm** primary region is unrecoverable or meets declared disaster criteria. +2. **Stop** writes to old primary if still partially reachable (avoid split-brain). +3. **Promote** cross-region replica to **writable** primary (vendor-specific steps). +4. **Point** application secrets to new writer endpoint. +5. **Raise** API/worker capacity in secondary region. +6. **Validate** end-to-end payment flow and webhook delivery. +7. **Communicate** status to customers per your incident comms plan. + +## 6. Runbooks — common failure scenarios + +Use these as checklists during incidents. Assign roles: **IC** (Incident Commander), **Ops**, **Comms**. + +### 6.1 API unavailable (5xx or timeout) + +| Step | Action | +|------|--------| +| 1 | Check load balancer health, recent deploys, and container restarts. | +| 2 | Verify **database** and **Redis** connectivity from API pods. | +| 3 | Scale replicas horizontally if CPU/memory bound. | +| 4 | Roll back to last known-good image if a bad deploy is suspected. | +| 5 | If region-wide, initiate **section 5.3** failover after DB promotion path is clear. | + +### 6.2 Primary database unavailable + +| Step | Action | +|------|--------| +| 1 | Confirm outage with cloud provider status and DB metrics (connections, storage, CPU). | +| 2 | If **Multi-AZ** failover is automatic, wait for completion and verify `DATABASE_URL` still points to the writer endpoint. | +| 3 | If unrecoverable, restore from **latest snapshot** or **PITR** (section 3–4) to a new instance; update secrets; validate. | +| 4 | If **cross-region replica** exists, evaluate **promote** (section 5.3). | + +### 6.3 Suspected data corruption or bad migration + +| Step | Action | +|------|--------| +| 1 | **Freeze** destructive migrations and optional write traffic (maintenance page) if needed. | +| 2 | Identify **last known good time** from monitoring and application logs. | +| 3 | Execute **PITR** to a new instance (section 4); validate data; cut over. | +| 4 | Root-cause analysis; add migration safeguards (expand/contract patterns, backups before DDL). | + +### 6.4 Redis unavailable + +| Step | Action | +|------|--------| +| 1 | Fail over to replica or restart nodes per Redis operator runbook. | +| 2 | If data loss is acceptable, **empty** Redis and restore service; expect cache miss and possible auth/session effects. | +| 3 | Monitor **rate limits** and **session** behavior; communicate if users must sign in again. | + +### 6.5 Stellar network or Horizon degraded + +| Step | Action | +|------|--------| +| 1 | Confirm status via public Stellar/Horizon channels. | +| 2 | Switch to **backup Horizon/RPC endpoints** if configured (env vars such as `STELLAR_HORIZON_URL`). | +| 3 | **Queue** or **retry** submissions with idempotency keys; do not double-submit without reconciliation. | +| 4 | Comms: delayed settlement vs platform hard down. | + +### 6.6 Treasury or signing key compromise + +| Step | Action | +|------|--------| +| 1 | **Rotate** keys in HSM/KMS; revoke old keys per Stellar key rotation procedures. | +| 2 | Update **`TREASURY_WALLET_ADDRESS`** and signing paths only after new keys are funded and tested on testnet first. | +| 3 | Audit **recent on-chain** activity for unauthorized transactions. | +| 4 | Law enforcement / legal per policy if customer funds at risk. | + +### 6.7 Webhook delivery backlog or duplicate delivery + +| Step | Action | +|------|--------| +| 1 | Inspect **outbox** or job queue in Postgres (if implemented); scale workers. | +| 2 | Use **idempotency keys** so replays are safe. | +| 3 | Alert merchants if prolonged delay per SLA. | + +## 7. Testing and simulation + +### 7.1 Schedule + +- **Quarterly:** Tabletop walkthrough of one runbook (sections 6.x). +- **Annually:** Full restore from backup to a staging environment, or PITR drill to a disposable instance. +- **After major changes:** Re-validate backup coverage when database or region topology changes. + +### 7.2 Simulated disaster recovery (automated checklist) + +The repository includes a **dry-run** script that walks through a simulated DR timeline and optional local health checks: + +- **Windows:** `powershell -File scripts/disaster-recovery-drill.ps1` (or `pwsh` if installed) +- **Linux/macOS:** `bash scripts/disaster-recovery-drill.sh` + +These scripts do **not** delete data or failover production systems; they print phases and verification prompts for operators. + +## 8. Contacts and references + +- **Internal:** Replace with on-call rotation, escalation matrix, and vendor support numbers. +- **External:** Stellar network status and Horizon endpoints as published by the Stellar Development Foundation. +- **Related repo paths:** `apps/api/.env.example` (environment variables), `monitoring/` (observability during and after recovery). + +--- + +*Review this document at least annually and after any production topology change.* diff --git a/lint_output.txt b/lint_output.txt new file mode 100644 index 0000000..5b5daaf Binary files /dev/null and b/lint_output.txt differ diff --git a/scripts/disaster-recovery-drill.ps1 b/scripts/disaster-recovery-drill.ps1 new file mode 100644 index 0000000..0944747 --- /dev/null +++ b/scripts/disaster-recovery-drill.ps1 @@ -0,0 +1,54 @@ +# Simulated disaster recovery drill (dry run - no production changes) +# Usage: powershell -File scripts/disaster-recovery-drill.ps1 [-ApiBaseUrl http://localhost:3000] + +param( + [string]$ApiBaseUrl = 'http://localhost:3000' +) + +$ErrorActionPreference = 'Stop' + +function Write-Phase { + param([string]$Name) + Write-Host '' + Write-Host "=== $Name ===" -ForegroundColor Cyan +} + +Write-Host 'Stellar Pay - simulated disaster recovery drill (dry run)' -ForegroundColor Green +Write-Host "Time (UTC): $([datetime]::UtcNow.ToString('o'))" + +Write-Phase 'Phase 0 - Preconditions' +Write-Host '- Confirm incident commander and comms owner are assigned.' +Write-Host '- Open docs/disaster-recovery.md and the relevant runbook section.' + +Write-Phase 'Phase 1 - Assess' +Write-Host '- Verify scope: region, database, Redis, or application-only.' +Write-Host '- Capture current error rates and last successful deploy from monitoring.' + +Write-Phase 'Phase 2 - Stabilize (if applicable)' +Write-Host '- Optional: enable maintenance mode or reduce traffic per policy.' +Write-Host '- Ensure no destructive migrations run until recovery path is chosen.' + +Write-Phase 'Phase 3 - Database recovery path (tabletop)' +Write-Host '- Choose: snapshot restore vs PITR vs cross-region replica promotion.' +Write-Host '- Document target recovery timestamp (UTC) if using PITR.' + +Write-Phase 'Phase 4 - Validate target environment' +Write-Host '- After restore or promotion: run migrations if needed, then smoke tests.' + +Write-Phase 'Phase 5 - Optional API health check' +try { + $healthUrl = "$ApiBaseUrl/health" + Write-Host "GET $healthUrl" + $response = Invoke-WebRequest -Uri $healthUrl -UseBasicParsing -TimeoutSec 10 + Write-Host "HTTP $($response.StatusCode) - health endpoint reachable." -ForegroundColor Green +} catch { + Write-Host 'Health check skipped or failed (API may not be running locally).' -ForegroundColor Yellow + Write-Host $_.Exception.Message +} + +Write-Phase 'Phase 6 - Post-incident' +Write-Host '- Record timeline, root cause, and action items.' +Write-Host '- Update this drill date in your internal DR calendar.' + +Write-Host '' +Write-Host 'Drill complete (simulation only - no infrastructure was modified).' -ForegroundColor Green diff --git a/scripts/disaster-recovery-drill.sh b/scripts/disaster-recovery-drill.sh new file mode 100644 index 0000000..207e78f --- /dev/null +++ b/scripts/disaster-recovery-drill.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Simulated disaster recovery drill (dry run — no production changes) +# Usage: bash scripts/disaster-recovery-drill.sh [API_BASE_URL] +# Example: bash scripts/disaster-recovery-drill.sh http://localhost:3000 + +set -euo pipefail + +API_BASE_URL="${1:-http://localhost:3000}" + +phase() { + echo "" + echo "=== $1 ===" +} + +echo "Stellar Pay — simulated disaster recovery drill (dry run)" +echo "Time (UTC): $(date -u +"%Y-%m-%dT%H:%M:%SZ")" + +phase "Phase 0 — Preconditions" +echo "- Confirm incident commander and comms owner are assigned." +echo "- Open docs/disaster-recovery.md and the relevant runbook section." + +phase "Phase 1 — Assess" +echo "- Verify scope: region, database, Redis, or application-only." +echo "- Capture current error rates and last successful deploy from monitoring." + +phase "Phase 2 — Stabilize (if applicable)" +echo "- Optional: enable maintenance mode or reduce traffic per policy." +echo "- Ensure no destructive migrations run until recovery path is chosen." + +phase "Phase 3 — Database recovery path (tabletop)" +echo "- Choose: snapshot restore vs PITR vs cross-region replica promotion." +echo "- Document target recovery timestamp (UTC) if using PITR." + +phase "Phase 4 — Validate target environment" +echo "- After restore or promotion: run migrations if needed, then smoke tests." + +phase "Phase 5 — Optional API health check" +HEALTH_URL="${API_BASE_URL%/}/health" +echo "GET ${HEALTH_URL}" +if command -v curl >/dev/null 2>&1; then + if curl -sfS --max-time 10 "${HEALTH_URL}" >/dev/null; then + echo "HTTP OK — health endpoint reachable." + else + echo "Health check failed or API not running (expected in many dev setups)." >&2 + fi +else + echo "curl not found; skipping HTTP check." +fi + +phase "Phase 6 — Post-incident" +echo "- Record timeline, root cause, and action items." +echo "- Update this drill date in your internal DR calendar." + +echo "" +echo "Drill complete (simulation only — no infrastructure was modified)."