Skip to content

Commit 2b259b9

Browse files
authored
Merge pull request #562 from Godbrand0/feat/elk-stack-monitoring
feat: ELK Stack monitoring & observability (#57)
2 parents b192383 + 6f9f10f commit 2b259b9

18 files changed

Lines changed: 1092 additions & 118 deletions

File tree

backend/.env.example

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,3 +176,26 @@ BUILD_TIME=
# Local cache controls.
ENABLE_CACHING=true
CACHE_TTL=3600

# -----------------------------------------------------------------------------
# Monitoring / Observability (ELK stack + tracing + metrics)
# -----------------------------------------------------------------------------
# Elasticsearch log shipping. Set ELASTICSEARCH_ENABLED=true to ship logs to ES.
# Bring up the full monitoring stack with:
#   docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up
ELASTICSEARCH_ENABLED=false
ELASTICSEARCH_URL=http://localhost:9200
ELASTICSEARCH_USERNAME=
ELASTICSEARCH_PASSWORD=

# OpenTelemetry distributed tracing. Set TRACING_ENABLED=true to export spans.
# Default collector is Jaeger's OTLP/HTTP receiver at localhost:4318.
TRACING_ENABLED=false
OTLP_ENDPOINT=http://localhost:4318/v1/traces

# Optional bearer token restricting access to GET /metrics (Prometheus scraping).
# Leave empty to allow unauthenticated scraping (suitable for private networks).
METRICS_TOKEN=

# Grafana admin credentials (consumed by docker-compose.monitoring.yml).
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=payd_grafana
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
version: '3.8' # NOTE(review): top-level `version` is obsolete and ignored by Compose v2; kept for older tooling.

# Monitoring stack for PayD backend
# Start with: docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up
#
# Services:
#   elasticsearch — log & metric storage (ELK)
#   logstash      — log aggregation pipeline
#   kibana        — log/metric dashboards   (http://localhost:5601)
#   jaeger        — distributed tracing UI  (http://localhost:16686)
#   prometheus    — metrics scraper         (http://localhost:9090)
#   grafana       — metrics dashboards      (http://localhost:3000)

services:
  # ── Elasticsearch ────────────────────────────────────────────────────────────
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4
    container_name: payd_elasticsearch
    environment:
      - discovery.type=single-node
      # Dev-only setup: security/TLS disabled — do not expose port 9200 publicly.
      - xpack.security.enabled=false
      - xpack.security.http.ssl.enabled=false
      - ES_JAVA_OPTS=-Xms512m -Xmx512m
      - bootstrap.memory_lock=true
    ulimits:
      # Unlimited memlock pairs with bootstrap.memory_lock=true above.
      memlock:
        soft: -1
        hard: -1
    ports:
      - '9200:9200'
    volumes:
      - elasticsearch_data:/usr/share/elasticsearch/data
    healthcheck:
      test: ['CMD-SHELL', 'curl -sf http://localhost:9200/_cluster/health || exit 1']
      interval: 20s
      timeout: 10s
      retries: 10
    networks:
      - payd_network
      - monitoring_network

  # ── Logstash ─────────────────────────────────────────────────────────────────
  logstash:
    image: docker.elastic.co/logstash/logstash:8.13.4
    container_name: payd_logstash
    volumes:
      - ./logstash/pipeline:/usr/share/logstash/pipeline:ro
      - ./logstash/config/logstash.yml:/usr/share/logstash/config/logstash.yml:ro
      - ./logs:/app/logs:ro
    ports:
      - '5044:5044' # Beats input
      - '5000:5000' # TCP input (structured JSON logs)
      - '9600:9600' # Logstash API
    environment:
      - LS_JAVA_OPTS=-Xms256m -Xmx256m
    depends_on:
      elasticsearch:
        condition: service_healthy
    networks:
      # FIX: also join payd_network so the backend container can reach the
      # TCP log input as logstash:5000 (previously only reachable via the
      # published host ports).
      - payd_network
      - monitoring_network

  # ── Kibana ───────────────────────────────────────────────────────────────────
  kibana:
    image: docker.elastic.co/kibana/kibana:8.13.4
    container_name: payd_kibana
    ports:
      - '5601:5601'
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
      # NOTE(review): with xpack security disabled on ES this password is
      # unused — confirm before relying on it in a secured setup.
      - KIBANA_SYSTEM_PASSWORD=changeme
    volumes:
      - ./kibana/kibana.yml:/usr/share/kibana/config/kibana.yml:ro
      - kibana_data:/usr/share/kibana/data
    depends_on:
      elasticsearch:
        condition: service_healthy
    healthcheck:
      test: ['CMD-SHELL', 'curl -sf http://localhost:5601/api/status | grep -q "available" || exit 1']
      interval: 30s
      timeout: 10s
      retries: 10
    networks:
      - monitoring_network

  # ── Jaeger (distributed tracing) ─────────────────────────────────────────────
  jaeger:
    image: jaegertracing/all-in-one:1.57
    container_name: payd_jaeger
    ports:
      - '16686:16686' # Jaeger UI
      - '4317:4317' # OTLP gRPC
      - '4318:4318' # OTLP HTTP ← backend sends traces here
      - '14268:14268' # Jaeger HTTP collector
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      - SPAN_STORAGE_TYPE=memory
      - METRICS_STORAGE_TYPE=prometheus
      # FIX: required by METRICS_STORAGE_TYPE=prometheus. Without it Jaeger
      # defaults to http://localhost:9090, which does not resolve to the
      # Prometheus container from inside the Jaeger container.
      - PROMETHEUS_SERVER_URL=http://prometheus:9090
    healthcheck:
      # 14269 is the admin/health port of the all-in-one binary.
      test: ['CMD-SHELL', 'wget -qO- http://localhost:14269/ || exit 1']
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - payd_network
      - monitoring_network

  # ── Prometheus ───────────────────────────────────────────────────────────────
  prometheus:
    image: prom/prometheus:v2.52.0
    container_name: payd_prometheus
    ports:
      - '9090:9090'
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=15d'
      - '--web.enable-lifecycle'
    healthcheck:
      test: ['CMD', 'wget', '-qO-', 'http://localhost:9090/-/healthy']
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - payd_network
      - monitoring_network

  # ── Grafana ──────────────────────────────────────────────────────────────────
  grafana:
    image: grafana/grafana:10.4.3
    container_name: payd_grafana
    ports:
      - '3000:3000'
    environment:
      # Credentials come from backend/.env.example (GRAFANA_ADMIN_*).
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-payd_grafana}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    depends_on:
      - prometheus
    healthcheck:
      test: ['CMD-SHELL', 'wget -qO- http://localhost:3000/api/health || exit 1']
      interval: 10s
      timeout: 5s
      retries: 10
    networks:
      - monitoring_network

volumes:
  elasticsearch_data:
  kibana_data:
  prometheus_data:
  grafana_data:

networks:
  payd_network:
    external: true # shared with docker-compose.yml
  monitoring_network:
    driver: bridge
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
{
  "title": "PayD Backend Overview",
  "uid": "payd-overview",
  "schemaVersion": 39,
  "version": 1,
  "refresh": "30s",
  "time": { "from": "now-1h", "to": "now" },
  "tags": ["payd", "backend"],
  "panels": [
    {
      "id": 1,
      "title": "HTTP Request Rate (req/s)",
      "type": "timeseries",
      "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(http_requests_total[1m])) by (method, route)",
          "legendFormat": "{{method}} {{route}}"
        }
      ]
    },
    {
      "id": 2,
      "title": "HTTP Error Rate (4xx/5xx req/s)",
      "type": "timeseries",
      "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(http_request_errors_total[1m])) by (status_code)",
          "legendFormat": "HTTP {{status_code}}"
        }
      ]
    },
    {
      "id": 3,
      "title": "P99 Request Latency (s)",
      "type": "timeseries",
      "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))",
          "legendFormat": "p99 {{route}}"
        }
      ]
    },
    {
      "id": 4,
      "title": "P50 / P95 Latency",
      "type": "timeseries",
      "gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p50"
        },
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p95"
        }
      ]
    },
    {
      "id": 5,
      "title": "Active Connections",
      "type": "stat",
      "gridPos": { "x": 0, "y": 16, "w": 4, "h": 4 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "active_connections",
          "legendFormat": "Active"
        }
      ]
    },
    {
      "id": 6,
      "title": "Total Errors",
      "type": "stat",
      "gridPos": { "x": 4, "y": 16, "w": 4, "h": 4 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(increase(errors_total[1h]))",
          "legendFormat": "Errors (1h)"
        }
      ]
    },
    {
      "id": 7,
      "title": "Node.js Heap Used (MB)",
      "type": "timeseries",
      "gridPos": { "x": 8, "y": 16, "w": 8, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "nodejs_heap_size_used_bytes / 1024 / 1024",
          "legendFormat": "Heap used"
        },
        {
          "datasource": "Prometheus",
          "expr": "nodejs_heap_size_total_bytes / 1024 / 1024",
          "legendFormat": "Heap total"
        }
      ]
    },
    {
      "id": 8,
      "title": "DB Query Duration P99 (s)",
      "type": "timeseries",
      "gridPos": { "x": 16, "y": 16, "w": 8, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.99, sum(rate(db_query_duration_seconds_bucket[5m])) by (le, operation, table))",
          "legendFormat": "p99 {{operation}}/{{table}}"
        }
      ]
    },
    {
      "id": 9,
      "title": "Payment Operations / min",
      "type": "timeseries",
      "gridPos": { "x": 0, "y": 24, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(payment_operations_total[1m])) by (status, type)",
          "legendFormat": "{{type}} — {{status}}"
        }
      ]
    },
    {
      "id": 10,
      "title": "Auth Attempts / min",
      "type": "timeseries",
      "gridPos": { "x": 12, "y": 24, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(auth_attempts_total[1m])) by (method, status)",
          "legendFormat": "{{method}} — {{status}}"
        }
      ]
    }
  ]
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
# Grafana dashboard provisioning (mounted read-only into the grafana
# container by docker-compose.monitoring.yml at /etc/grafana/provisioning).
apiVersion: 1

providers:
  # Loads every dashboard JSON found under `path`, re-scanning every 30s.
  - name: PayD Dashboards
    orgId: 1
    type: file
    disableDeletion: false
    editable: true
    updateIntervalSeconds: 30
    options:
      path: /var/lib/grafana/dashboards
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
# Grafana datasource provisioning (mounted read-only into the grafana
# container by docker-compose.monitoring.yml at /etc/grafana/provisioning).
apiVersion: 1

datasources:
  # Prometheus — default datasource used by the dashboard panels.
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
    jsonData:
      timeInterval: 15s

  # Elasticsearch — log exploration over the payd-logs-* indices.
  - name: Elasticsearch
    type: elasticsearch
    access: proxy
    url: http://elasticsearch:9200
    database: payd-logs-*
    isDefault: false
    editable: false
    jsonData:
      esVersion: "8.0.0"
      timeField: "@timestamp"
      logLevelField: level
      logMessageField: message

  # Jaeger — trace lookup from Grafana's Explore view.
  - name: Jaeger
    type: jaeger
    access: proxy
    url: http://jaeger:16686
    isDefault: false
    editable: false

0 commit comments

Comments
 (0)