diff --git a/.gitignore b/.gitignore index 70c688df..c404c722 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .vs/ .vscode/ .idea/ +data/ diff --git a/README.md b/README.md index f1c50d97..eb094bb9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # dockprom -A monitoring solution for Docker hosts and containers with [Prometheus](https://prometheus.io/), [Grafana](http://grafana.org/), [cAdvisor](https://github.com/google/cadvisor), -[NodeExporter](https://github.com/prometheus/node_exporter) and alerting with [AlertManager](https://github.com/prometheus/alertmanager). +A monitoring solution for Docker hosts and containers with [Prometheus](https://prometheus.io/), [Grafana](http://grafana.org/), [Loki](https://github.com/grafana/loki), [Tempo](https://github.com/grafana/tempo), [cAdvisor](https://github.com/google/cadvisor), [Alloy](https://github.com/grafana/alloy), [NodeExporter](https://github.com/prometheus/node_exporter) and alerting with [AlertManager](https://github.com/prometheus/alertmanager). ## Install @@ -304,10 +303,10 @@ Please replace the `user:password` part with your user and password set in the i [In Grafana versions >= 5.1 the id of the grafana user has been changed](http://docs.grafana.org/installation/docker/#migration-from-a-previous-version-of-the-docker-container-to-5-1-or-later). Unfortunately this means that files created prior to 5.1 won’t have the correct permissions for later versions. -| Version | User | User ID | -|:-------:|:-------:|:-------:| +| Version | User | User ID | +| :-----: | :-----: | :-----: | | < 5.1 | grafana | 104 | -| \>= 5.1 | grafana | 472 | +| \>= 5.1 | grafana | 472 | There are two possible solutions to this problem. 
diff --git a/alloy/config.alloy b/alloy/config.alloy new file mode 100644 index 00000000..40d8b49d --- /dev/null +++ b/alloy/config.alloy @@ -0,0 +1,48 @@ +/////////////////////////////////////////////////////////////////////////////// +// Configuration file +// Reference: https://github.com/grafana/loki/blob/main/examples/getting-started/alloy-local-config.yaml +// Reference: https://github.com/grafana/intro-to-mltp/blob/main/alloy/config.alloy + +discovery.docker "flog_scrape" { + host = "unix:///var/run/docker.sock" + refresh_interval = "5s" +} + +discovery.relabel "flog_scrape" { + targets = [] + + rule { + source_labels = ["__meta_docker_container_name"] + regex = "/(.*)" + target_label = "container" + } + + rule { + source_labels = ["__meta_docker_container_label_com_docker_compose_service"] + target_label = "service" + } + + rule { + source_labels = ["__meta_docker_container_state_status"] + target_label = "status" + } +} + +loki.source.docker "flog_scrape" { + host = "unix:///var/run/docker.sock" + targets = discovery.docker.flog_scrape.targets + forward_to = [loki.write.to_loki.receiver] + relabel_rules = discovery.relabel.flog_scrape.rules + refresh_interval = "5s" +} + + +loki.write "to_loki" { + endpoint { + url = string.format( + "http://%s/loki/api/v1/push", + coalesce(sys.env("LOKI_HOST"), "localhost:3100"), + ) + } + external_labels = {} +} \ No newline at end of file diff --git a/docker-compose.exporters.yml b/docker-compose.exporters.yml index 89d58d9d..9dacf7dd 100644 --- a/docker-compose.exporters.yml +++ b/docker-compose.exporters.yml @@ -1,5 +1,4 @@ services: - nodeexporter: image: prom/node-exporter:v1.8.2 container_name: nodeexporter @@ -15,7 +14,7 @@ services: restart: unless-stopped network_mode: host labels: - org.label-schema.group: "monitoring" + org.label-schema.group: 'monitoring' cadvisor: image: gcr.io/cadvisor/cadvisor:v0.51.0 @@ -32,6 +31,19 @@ services: restart: unless-stopped network_mode: host labels: - org.label-schema.group: 
"monitoring" - + org.label-schema.group: 'monitoring' + alloy: + image: grafana/alloy:v1.7.5 + container_name: alloy + volumes: + - './alloy/config.alloy:/etc/alloy/config.alloy' + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + command: ['run', '--server.http.listen-addr=0.0.0.0:12345', '--stability.level=public-preview', '/etc/alloy/config.alloy'] + restart: unless-stopped + environment: + - LOKI_HOST=loki:3100 + network_mode: host + labels: + org.label-schema.group: 'monitoring' diff --git a/docker-compose.loki.yaml b/docker-compose.loki.yaml new file mode 100644 index 00000000..fcf9bf1c --- /dev/null +++ b/docker-compose.loki.yaml @@ -0,0 +1,96 @@ +networks: + monitor-net: + driver: bridge + +services: + otel-collector: + image: otel/opentelemetry-collector:0.123.0 + container_name: otel-collector + restart: unless-stopped + command: ['--config=/etc/otel-collector.yaml'] + volumes: + - ./otel/otel-collector.yaml:/etc/otel-collector.yaml + ports: + - '1888:1888' # pprof extension + - '8888:8888' # Prometheus metrics exposed by the collector + - '8889:8889' # Prometheus exporter metrics + - '13133:13133' # health_check extension + - '4317:4317' # OTLP gRPC receiver + - '4318:4318' # OTLP HTTP receiver + - '55679:55679' # zpages extension + depends_on: + - tempo + - loki + networks: + - monitor-net + labels: + org.label-schema.group: 'monitoring' + + alloy: + image: grafana/alloy:v1.7.5 + container_name: alloy + volumes: + - './alloy/config.alloy:/etc/alloy/config.alloy' + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + command: ['run', '--server.http.listen-addr=0.0.0.0:12345', '--stability.level=public-preview', '/etc/alloy/config.alloy'] + restart: unless-stopped + expose: + - 12345 + environment: + - LOKI_HOST=loki:3100 + depends_on: + - otel-collector + - loki + networks: + - monitor-net + labels: + org.label-schema.group: 'monitoring' + 
+ tempo: + image: grafana/tempo:2.7.2 + container_name: tempo + restart: unless-stopped + command: ['-config.file=/etc/tempo.yaml'] + volumes: + - ./tempo/tempo.yaml:/etc/tempo.yaml + - ./data/tempo:/var/tempo + expose: + - 14268 + - 3200 + - 4317 + - 4318 + - 9411 + ports: + - '14268' # jaeger ingest + - '3200' # tempo + - '4317' # otlp grpc + - '4318' # otlp http + - '9411' # zipkin + networks: + - monitor-net + labels: + org.label-schema.group: 'monitoring' + + loki: + image: grafana/loki:3.4 + container_name: loki + restart: unless-stopped + user: root + command: -config.file=/etc/loki/loki.yaml -config.expand-env=true + expose: + - 3100 + ports: + - '3100:3100' # loki needs to be exposed so it receives logs + environment: + - JAEGER_AGENT_HOST=tempo + - JAEGER_ENDPOINT=http://tempo:14268/api/traces # send traces to Tempo + - JAEGER_SAMPLER_TYPE=const + - JAEGER_SAMPLER_PARAM=1 + volumes: + - ./loki/loki.yaml:/etc/loki/loki.yaml + - ./data/loki:/tmp/loki + networks: + - monitor-net + labels: + org.label-schema.group: 'monitoring' diff --git a/docker-compose.yml b/docker-compose.yml index 504f49b8..647338f5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,11 +3,10 @@ networks: driver: bridge volumes: - prometheus_data: {} - grafana_data: {} + prometheus_data: {} + grafana_data: {} services: - prometheus: image: prom/prometheus:v3.1.0 container_name: prometheus @@ -27,7 +26,7 @@ services: networks: - monitor-net labels: - org.label-schema.group: "monitoring" + org.label-schema.group: 'monitoring' alertmanager: image: prom/alertmanager:v0.28.0 @@ -43,7 +42,7 @@ services: networks: - monitor-net labels: - org.label-schema.group: "monitoring" + org.label-schema.group: 'monitoring' nodeexporter: image: prom/node-exporter:v1.8.2 @@ -63,7 +62,7 @@ services: networks: - monitor-net labels: - org.label-schema.group: "monitoring" + org.label-schema.group: 'monitoring' cadvisor: image: gcr.io/cadvisor/cadvisor:v0.51.0 @@ -83,7 +82,7 @@ services: networks: - 
monitor-net labels: - org.label-schema.group: "monitoring" + org.label-schema.group: 'monitoring' grafana: image: grafana/grafana:11.5.1 @@ -102,7 +101,7 @@ services: networks: - monitor-net labels: - org.label-schema.group: "monitoring" + org.label-schema.group: 'monitoring' pushgateway: image: prom/pushgateway:v1.11.0 @@ -113,17 +112,17 @@ services: networks: - monitor-net labels: - org.label-schema.group: "monitoring" + org.label-schema.group: 'monitoring' caddy: image: caddy:2.9.1 container_name: caddy ports: - - "3000:3000" - - "8080:8080" - - "9090:9090" - - "9093:9093" - - "9091:9091" + - '3000:3000' + - '8080:8080' + - '9090:9090' + - '9093:9093' + - '9091:9091' volumes: - ./caddy:/etc/caddy environment: @@ -134,4 +133,4 @@ services: networks: - monitor-net labels: - org.label-schema.group: "monitoring" + org.label-schema.group: 'monitoring' diff --git a/grafana/provisioning/datasources/datasource.yml b/grafana/provisioning/datasources/datasource.yml index bb37f13d..3d3d364e 100644 --- a/grafana/provisioning/datasources/datasource.yml +++ b/grafana/provisioning/datasources/datasource.yml @@ -8,4 +8,68 @@ datasources: url: http://prometheus:9090 basicAuth: false isDefault: true - editable: true \ No newline at end of file + editable: true + + - name: Tempo + type: tempo + access: proxy + orgId: 1 + url: http://tempo:3200 + basicAuth: false + isDefault: false + version: 1 + editable: true + apiVersion: 1 + uid: tempo + jsonData: + tracesToLogsV2: + # Field with an internal link pointing to a logs data source in Grafana. + # datasourceUid value must match the uid value of the logs data source. 
+ datasourceUid: 'loki' + spanStartTimeShift: '1h' + spanEndTimeShift: '-1h' + tags: ['job', 'instance', 'pod', 'namespace', 'app'] + filterByTraceID: false + filterBySpanID: false + customQuery: true + query: 'method="${__span.tags.method}"' + tracesToMetrics: + datasourceUid: 'prom' + spanStartTimeShift: '1h' + spanEndTimeShift: '-1h' + tags: [{ key: 'service.name', value: 'service' }, { key: 'job' }] + queries: + - name: 'Sample query' + query: 'sum(rate(traces_spanmetrics_latency_bucket{$$__tags}[5m]))' + serviceMap: + datasourceUid: 'prometheus' + nodeGraph: + enabled: true + search: + hide: false + lokiSearch: + datasourceUid: 'loki' + traceQuery: + timeShiftEnabled: true + spanStartTimeShift: '1h' + spanEndTimeShift: '-1h' + spanBar: + type: 'Tag' + tag: 'http.path' + + - name: Loki + type: loki + access: proxy + orgId: 1 + url: http://loki:3100 + basicAuth: false + isDefault: false + version: 1 + editable: false + apiVersion: 1 + jsonData: + derivedFields: + - datasourceUid: tempo + matcherRegex: '"traceId":"([A-Za-z0-9]+)"' + name: TraceID + url: $${__value.raw} diff --git a/loki/loki.yaml b/loki/loki.yaml new file mode 100644 index 00000000..8d5cd76b --- /dev/null +++ b/loki/loki.yaml @@ -0,0 +1,98 @@ +# Loki Configuration File +# Reference: https://grafana.com/docs/loki/latest/configure/examples/configuration-examples/ +# Reference: https://github.com/grafana/loki/blob/main/production/docker/config/loki.yaml + +# Disable authentication for development/testing purposes +auth_enabled: false + +# Server configuration +server: + # Listen on all network interfaces + http_listen_address: 0.0.0.0 + # Default Loki HTTP port + http_listen_port: 3100 + +# Common configuration shared across components +common: + ring: + # Ring configuration for distributed deployments + instance_addr: 127.0.0.1 + kvstore: + # Use in-memory storage for development + store: inmemory + # Number of replicas for data + replication_factor: 1 + # Base directory for Loki data + 
path_prefix: /tmp/loki + +# Schema configuration for time-series data +schema_config: + configs: + - from: 2020-05-15 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +# Storage configuration +storage_config: + tsdb_shipper: + # Directory for active index files + active_index_directory: /tmp/loki/index + # Directory for index cache + cache_location: /tmp/loki/index_cache + filesystem: + # Directory for chunk storage + directory: /tmp/loki/chunks + +# Limits configuration for resource management +limits_config: + # Data retention period (30 days) + retention_period: 720h + # Enable rejection of old samples + reject_old_samples: true + # Maximum age of samples to accept + reject_old_samples_max_age: 720h + # Rate limiting configuration + # 20MB burst, 10MB/s rate + ingestion_rate_mb: 10 + ingestion_burst_size_mb: 20 + # Enable structured metadata + allow_structured_metadata: true + # Split queries by time interval + split_queries_by_interval: 15m + # Enable volume tracking + volume_enabled: true + # OTLP configuration for resource attributes + otlp_config: + resource_attributes: + ignore_defaults: true + attributes_config: + - action: index_label + regex: service.group + +# Table manager configuration for data retention +table_manager: + # Enable automatic deletion of old data + retention_deletes_enabled: true + # Retention period for tables (14 days) + retention_period: 336h + +# Ruler configuration for alerting +ruler: + # Enable ruler API + enable_api: true + storage: + # Use local storage for rules + type: local + local: + # Directory for rule storage + directory: /tmp/loki/rules + # Path for Prometheus rules + rule_path: /loki/prom-rules + ring: + kvstore: + # Use in-memory storage for ring + store: inmemory diff --git a/otel/otel-collector.yml b/otel/otel-collector.yml new file mode 100644 index 00000000..0163b7d9 --- /dev/null +++ b/otel/otel-collector.yml @@ -0,0 +1,74 @@ +# OTel Configuration File +# Reference: 
https://github.com/grafana/intro-to-mltp/blob/main/otel/otel.yml +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + include_metadata: true + http: + endpoint: 0.0.0.0:4318 + include_metadata: true + +processors: + batch: + timeout: 5s + send_batch_size: 100 + + memory_limiter: + check_interval: 1s + limit_mib: 1000 + +exporters: + # Exporter for sending trace data to Tempo. + otlp/traces: + # Send to the running Tempo service. + endpoint: tempo:4317 + # TLS is not enabled for the instance. + tls: + insecure: true + sending_queue: + enabled: true + num_consumers: 10 + queue_size: 5000 + + otlphttp: + endpoint: http://loki:3100/otlp + +service: + pipelines: + # Define the trace pipeline. + traces: + # Receive from the `otlp` receiver. + receivers: [otlp] + # Use the `batch` processor to process received trace spans. + processors: [batch, memory_limiter] + # Comment out other `processor` definitions and uncomment the line below to use tail sampling. + #processors: [tail_sampling, batch] + # Comment out other `processor` definitions and uncomment the line below to generate service graph metrics + # from within the OpenTelemetry Collector. + #processors: [servicegraph, batch] + # Export to the `otlp/traces` exporter. + exporters: [otlp/traces] + # Comment out other `exporters` definitions and uncomment the line below to generate span metrics + # from within the OpenTelemetry Collector as well as exporting traces to Tempo. + #exporters: [otlp/grafana, spanmetrics] + # use the prometheus directory to scrape metrics from prometheus + # # # Define the metrics pipeline. + # metrics: + # # Receive metrics from the `prometheus` receiver. + # receivers: [otlp, prometheus] + # # Comment out other `receivers` definitions and uncomment the line below to import spanmetrics as well + # # as prometheus metrics. + # #receivers: [otlp, prometheus, spanmetrics] + # # Use the `batch` processor to process received metrics. 
+ # processors: [batch, memory_limiter] + # # Export to the `otlp/metrics` exporter. + # exporters: [otlp/metrics] + logs: + receivers: [otlp] + processors: [batch, memory_limiter] + exporters: [otlphttp] + telemetry: + logs: + level: 'info' diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml index 79069633..5e5b7988 100644 --- a/prometheus/prometheus.yml +++ b/prometheus/prometheus.yml @@ -1,15 +1,15 @@ global: - scrape_interval: 15s + scrape_interval: 15s evaluation_interval: 15s # Attach these labels to any time series or alerts when communicating with # external systems (federation, remote storage, Alertmanager). external_labels: - monitor: 'docker-host-alpha' + monitor: 'docker-host-alpha' # Load and evaluate rules in this file every 'evaluation_interval' seconds. rule_files: - - "alert.rules" + - 'alert.rules' # A scrape configuration containing exactly one endpoint to scrape. scrape_configs: @@ -34,14 +34,12 @@ scrape_configs: static_configs: - targets: ['pushgateway:9091'] - alerting: alertmanagers: - - scheme: http - static_configs: - - targets: - - 'alertmanager:9093' - + - scheme: http + static_configs: + - targets: + - 'alertmanager:9093' # - job_name: 'nginx' # scrape_interval: 10s # static_configs: diff --git a/screens/Grafana_Loki.png b/screens/Grafana_Loki.png new file mode 100644 index 00000000..24922f3f Binary files /dev/null and b/screens/Grafana_Loki.png differ diff --git a/screens/Grafana_Loki_Tempo.png b/screens/Grafana_Loki_Tempo.png new file mode 100644 index 00000000..2e7a638b Binary files /dev/null and b/screens/Grafana_Loki_Tempo.png differ diff --git a/tempo/tempo.yaml b/tempo/tempo.yaml new file mode 100644 index 00000000..bfe06bd6 --- /dev/null +++ b/tempo/tempo.yaml @@ -0,0 +1,79 @@ +# For more information on this configuration, see the complete reference guide at +# https://grafana.com/docs/tempo/latest/configuration/ +# https://github.com/grafana/intro-to-mltp/blob/main/tempo/tempo.yaml +# 
https://github.com/grafana/tempo/blob/main/example/docker-compose/shared/tempo.yaml +# Enables result streaming from Tempo (to Grafana) via HTTP. +stream_over_http_enabled: true + +# Configure the server block. +server: + # Listen for all incoming requests on port 3200. + http_listen_port: 3200 + +query_frontend: + search: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + metadata_slo: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + trace_by_id: + duration_slo: 5s + +# The distributor receives incoming trace span data for the system. +distributor: + receivers: # This configuration will listen on all ports and protocols that tempo is capable of. + jaeger: # The receivers all come from the OpenTelemetry collector. More configuration information can + protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver + thrift_http: # + grpc: # For a production deployment you should only enable the receivers you need! + thrift_binary: # Note that from Tempo 2.7.0, if an endpoint is not specified, it will listen only on localhost. + thrift_compact: + otlp: + protocols: + http: + endpoint: '0.0.0.0:4318' # Listen to OTLP HTTP on port 4318, on all interfaces. + grpc: + endpoint: '0.0.0.0:4317' # This example repository only utilises the OTLP gRPC receiver on port 4317, on all interfaces. + zipkin: # Receive trace data in any supported Zipkin format. + +# The ingester receives data from the distributor and processes it into indices and blocks. +ingester: + trace_idle_period: 10s # The length of time after a trace has not received spans to consider it complete and flush it. + max_block_bytes: 1_000_000 # Cut the head block when it hits this size or + max_block_duration: 5m # this much time passes + +# The compactor block configures the compactor responsible for compacting TSDB blocks. +compactor: + compaction: + compaction_window: 1h # Blocks in this time window will be compacted together. 
+ max_block_bytes: 100_000_000 # Maximum size of a compacted block. + block_retention: 1h # How long to keep blocks. Default is 14 days, this demo system is short-lived. + compacted_block_retention: 10m # How long to keep compacted blocks stored elsewhere. + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: docker-compose + storage: + path: /var/tempo/generator/wal + remote_write: + - url: http://prometheus:9090/api/v1/write + send_exemplars: true + traces_storage: + path: /var/tempo/generator/traces + +storage: + trace: + backend: local # backend configuration to use + wal: + path: /var/tempo/wal # where to store the wal locally + local: + path: /var/tempo/blocks + +overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator + generate_native_histograms: both