Skip to content

Commit 2b259b9

Browse files
authored
Merge pull request #562 from Godbrand0/feat/elk-stack-monitoring
feat: ELK Stack monitoring & observability (#57)
2 parents b192383 + 6f9f10f commit 2b259b9

18 files changed

Lines changed: 1092 additions & 118 deletions

File tree

backend/.env.example

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,3 +176,26 @@ BUILD_TIME=
# Local cache controls.
ENABLE_CACHING=true
CACHE_TTL=3600

# -----------------------------------------------------------------------------
# Monitoring / Observability (ELK stack + tracing + metrics)
# -----------------------------------------------------------------------------
# Elasticsearch log shipping. Set ELASTICSEARCH_ENABLED=true to ship logs to ES.
# Bring up the full monitoring stack with:
#   docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up
ELASTICSEARCH_ENABLED=false
ELASTICSEARCH_URL=http://localhost:9200
ELASTICSEARCH_USERNAME=
ELASTICSEARCH_PASSWORD=

# OpenTelemetry distributed tracing. Set TRACING_ENABLED=true to export spans.
# Default collector is Jaeger's OTLP/HTTP receiver at localhost:4318.
TRACING_ENABLED=false
OTLP_ENDPOINT=http://localhost:4318/v1/traces

# Optional bearer token restricting access to GET /metrics (Prometheus scraping).
# Leave empty to allow unauthenticated scraping (suitable for private networks).
METRICS_TOKEN=

# Grafana admin credentials (consumed by docker-compose.monitoring.yml).
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=payd_grafana
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
version: '3.8' # NOTE(review): top-level `version` is obsolete and ignored by Compose v2; kept for older tooling.

# Monitoring stack for PayD backend
# Start with: docker compose -f docker-compose.yml -f docker-compose.monitoring.yml up
#
# Services:
#   elasticsearch — log & metric storage (ELK)
#   logstash      — log aggregation pipeline
#   kibana        — log/metric dashboards   (http://localhost:5601)
#   jaeger        — distributed tracing UI  (http://localhost:16686)
#   prometheus    — metrics scraper         (http://localhost:9090)
#   grafana       — metrics dashboards      (http://localhost:3000)

services:
  # ── Elasticsearch ────────────────────────────────────────────────────────────
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4
    container_name: payd_elasticsearch
    environment:
      - discovery.type=single-node
      # Dev-only setup: security/TLS disabled — do not expose port 9200 publicly.
      - xpack.security.enabled=false
      - xpack.security.http.ssl.enabled=false
      - ES_JAVA_OPTS=-Xms512m -Xmx512m
      - bootstrap.memory_lock=true
    ulimits:
      # Unlimited memlock pairs with bootstrap.memory_lock=true above.
      memlock:
        soft: -1
        hard: -1
    ports:
      - '9200:9200'
    volumes:
      - elasticsearch_data:/usr/share/elasticsearch/data
    healthcheck:
      test: ['CMD-SHELL', 'curl -sf http://localhost:9200/_cluster/health || exit 1']
      interval: 20s
      timeout: 10s
      retries: 10
    networks:
      - payd_network
      - monitoring_network

  # ── Logstash ─────────────────────────────────────────────────────────────────
  logstash:
    image: docker.elastic.co/logstash/logstash:8.13.4
    container_name: payd_logstash
    volumes:
      - ./logstash/pipeline:/usr/share/logstash/pipeline:ro
      - ./logstash/config/logstash.yml:/usr/share/logstash/config/logstash.yml:ro
      - ./logs:/app/logs:ro
    ports:
      - '5044:5044' # Beats input
      - '5000:5000' # TCP input (structured JSON logs)
      - '9600:9600' # Logstash API
    environment:
      - LS_JAVA_OPTS=-Xms256m -Xmx256m
    depends_on:
      elasticsearch:
        condition: service_healthy
    networks:
      # FIX: also join payd_network so the backend container can reach the
      # TCP log input as logstash:5000 (previously only reachable via the
      # published host ports).
      - payd_network
      - monitoring_network

  # ── Kibana ───────────────────────────────────────────────────────────────────
  kibana:
    image: docker.elastic.co/kibana/kibana:8.13.4
    container_name: payd_kibana
    ports:
      - '5601:5601'
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
      # NOTE(review): with xpack security disabled on ES this password is
      # unused — confirm before relying on it in a secured setup.
      - KIBANA_SYSTEM_PASSWORD=changeme
    volumes:
      - ./kibana/kibana.yml:/usr/share/kibana/config/kibana.yml:ro
      - kibana_data:/usr/share/kibana/data
    depends_on:
      elasticsearch:
        condition: service_healthy
    healthcheck:
      test: ['CMD-SHELL', 'curl -sf http://localhost:5601/api/status | grep -q "available" || exit 1']
      interval: 30s
      timeout: 10s
      retries: 10
    networks:
      - monitoring_network

  # ── Jaeger (distributed tracing) ─────────────────────────────────────────────
  jaeger:
    image: jaegertracing/all-in-one:1.57
    container_name: payd_jaeger
    ports:
      - '16686:16686' # Jaeger UI
      - '4317:4317' # OTLP gRPC
      - '4318:4318' # OTLP HTTP ← backend sends traces here
      - '14268:14268' # Jaeger HTTP collector
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      - SPAN_STORAGE_TYPE=memory
      - METRICS_STORAGE_TYPE=prometheus
      # FIX: required by METRICS_STORAGE_TYPE=prometheus. Without it Jaeger
      # defaults to http://localhost:9090, which does not resolve to the
      # Prometheus container from inside the Jaeger container.
      - PROMETHEUS_SERVER_URL=http://prometheus:9090
    healthcheck:
      # 14269 is the admin/health port of the all-in-one binary.
      test: ['CMD-SHELL', 'wget -qO- http://localhost:14269/ || exit 1']
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - payd_network
      - monitoring_network

  # ── Prometheus ───────────────────────────────────────────────────────────────
  prometheus:
    image: prom/prometheus:v2.52.0
    container_name: payd_prometheus
    ports:
      - '9090:9090'
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=15d'
      - '--web.enable-lifecycle'
    healthcheck:
      test: ['CMD', 'wget', '-qO-', 'http://localhost:9090/-/healthy']
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - payd_network
      - monitoring_network

  # ── Grafana ──────────────────────────────────────────────────────────────────
  grafana:
    image: grafana/grafana:10.4.3
    container_name: payd_grafana
    ports:
      - '3000:3000'
    environment:
      # Credentials come from backend/.env.example (GRAFANA_ADMIN_*).
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-payd_grafana}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    depends_on:
      - prometheus
    healthcheck:
      test: ['CMD-SHELL', 'wget -qO- http://localhost:3000/api/health || exit 1']
      interval: 10s
      timeout: 5s
      retries: 10
    networks:
      - monitoring_network

volumes:
  elasticsearch_data:
  kibana_data:
  prometheus_data:
  grafana_data:

networks:
  payd_network:
    external: true # shared with docker-compose.yml
  monitoring_network:
    driver: bridge
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
{
  "title": "PayD Backend Overview",
  "uid": "payd-overview",
  "schemaVersion": 39,
  "version": 1,
  "refresh": "30s",
  "time": { "from": "now-1h", "to": "now" },
  "tags": ["payd", "backend"],
  "panels": [
    {
      "id": 1,
      "title": "HTTP Request Rate (req/s)",
      "type": "timeseries",
      "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(http_requests_total[1m])) by (method, route)",
          "legendFormat": "{{method}} {{route}}"
        }
      ]
    },
    {
      "id": 2,
      "title": "HTTP Error Rate (4xx/5xx req/s)",
      "type": "timeseries",
      "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(http_request_errors_total[1m])) by (status_code)",
          "legendFormat": "HTTP {{status_code}}"
        }
      ]
    },
    {
      "id": 3,
      "title": "P99 Request Latency (s)",
      "type": "timeseries",
      "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))",
          "legendFormat": "p99 {{route}}"
        }
      ]
    },
    {
      "id": 4,
      "title": "P50 / P95 Latency",
      "type": "timeseries",
      "gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p50"
        },
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p95"
        }
      ]
    },
    {
      "id": 5,
      "title": "Active Connections",
      "type": "stat",
      "gridPos": { "x": 0, "y": 16, "w": 4, "h": 4 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "active_connections",
          "legendFormat": "Active"
        }
      ]
    },
    {
      "id": 6,
      "title": "Total Errors",
      "type": "stat",
      "gridPos": { "x": 4, "y": 16, "w": 4, "h": 4 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(increase(errors_total[1h]))",
          "legendFormat": "Errors (1h)"
        }
      ]
    },
    {
      "id": 7,
      "title": "Node.js Heap Used (MB)",
      "type": "timeseries",
      "gridPos": { "x": 8, "y": 16, "w": 8, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "nodejs_heap_size_used_bytes / 1024 / 1024",
          "legendFormat": "Heap used"
        },
        {
          "datasource": "Prometheus",
          "expr": "nodejs_heap_size_total_bytes / 1024 / 1024",
          "legendFormat": "Heap total"
        }
      ]
    },
    {
      "id": 8,
      "title": "DB Query Duration P99 (s)",
      "type": "timeseries",
      "gridPos": { "x": 16, "y": 16, "w": 8, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "histogram_quantile(0.99, sum(rate(db_query_duration_seconds_bucket[5m])) by (le, operation, table))",
          "legendFormat": "p99 {{operation}}/{{table}}"
        }
      ]
    },
    {
      "id": 9,
      "title": "Payment Operations / min",
      "type": "timeseries",
      "gridPos": { "x": 0, "y": 24, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(payment_operations_total[1m])) by (status, type)",
          "legendFormat": "{{type}} — {{status}}"
        }
      ]
    },
    {
      "id": 10,
      "title": "Auth Attempts / min",
      "type": "timeseries",
      "gridPos": { "x": 12, "y": 24, "w": 12, "h": 8 },
      "targets": [
        {
          "datasource": "Prometheus",
          "expr": "sum(rate(auth_attempts_total[1m])) by (method, status)",
          "legendFormat": "{{method}} — {{status}}"
        }
      ]
    }
  ]
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
# Grafana dashboard provisioning (mounted read-only into the grafana
# container by docker-compose.monitoring.yml at /etc/grafana/provisioning).
apiVersion: 1

providers:
  # Loads every dashboard JSON found under `path`, re-scanning every 30s.
  - name: PayD Dashboards
    orgId: 1
    type: file
    disableDeletion: false
    editable: true
    updateIntervalSeconds: 30
    options:
      path: /var/lib/grafana/dashboards
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
# Grafana datasource provisioning (mounted read-only into the grafana
# container by docker-compose.monitoring.yml at /etc/grafana/provisioning).
apiVersion: 1

datasources:
  # Prometheus — default datasource used by the dashboard panels.
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
    jsonData:
      timeInterval: 15s

  # Elasticsearch — log exploration over the payd-logs-* indices.
  - name: Elasticsearch
    type: elasticsearch
    access: proxy
    url: http://elasticsearch:9200
    database: payd-logs-*
    isDefault: false
    editable: false
    jsonData:
      esVersion: "8.0.0"
      timeField: "@timestamp"
      logLevelField: level
      logMessageField: message

  # Jaeger — trace lookup from Grafana's Explore view.
  - name: Jaeger
    type: jaeger
    access: proxy
    url: http://jaeger:16686
    isDefault: false
    editable: false

0 commit comments

Comments
 (0)