Commit 47bcfc1

Merge pull request #50 from coder/ssncferreira/prometheus_native_histograms

feat: enable native histograms in prometheus

2 parents f60876c + f472c18

File tree

6 files changed: +147 −6 lines

README.gotmpl

Lines changed: 67 additions & 0 deletions
@@ -215,6 +215,73 @@ grafana:

### Prometheus

To access Prometheus, run:

```bash
kubectl -n coder-observability port-forward svc/prometheus 9090:80
```

Then open http://localhost:9090/graph in your browser.
#### Native Histograms

Native histograms are an **experimental** Prometheus feature that removes the need to predefine bucket boundaries, instead providing higher-resolution, adaptive buckets (see the [Prometheus docs](https://prometheus.io/docs/specs/native_histograms/) for details).

Unlike classic histograms, which are sent in plain text, **native histograms require the protobuf protocol**.
Because the Prometheus Helm chart is configured with remote write, enabling them takes more than running Prometheus with native histogram support: the Grafana Agent must also scrape and remote-write using protobuf.
Native histograms are **disabled by default**; when you enable them globally, the Helm chart updates the Grafana Agent configuration accordingly.
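Concretely, enabling the flag toggles two settings in the Grafana Agent (river) configuration that the chart renders — a sketch of the result, reduced to just the affected arguments:

```river
// Sketch: the settings the chart flips when nativeHistograms is true
prometheus.scrape "pods" {
  // scrape using protobuf negotiation so native histograms can be ingested
  enable_protobuf_negotiation = true
}

prometheus.remote_write "default" {
  endpoint {
    // forward native histograms when remote writing to Prometheus
    send_native_histograms = true
  }
}
```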
To enable native histograms, define this in your `values.yaml`:

```yaml
global:
  telemetry:
    metrics:
      nativeHistograms: true

prometheus:
  server:
    extraFlags:
      - web.enable-lifecycle
      - enable-feature=remote-write-receiver
      - enable-feature=native-histograms
```
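You can then apply the updated values with a Helm upgrade. The release name and chart reference below are assumptions — substitute the ones you used at install time:

```shell
# Hypothetical release name and chart reference -- adjust to your install
helm upgrade coder-observability coder/observability \
  --namespace coder-observability \
  --values values.yaml
```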
After updating the values, you may need to restart the Grafana Agent so it picks up the new configuration:

```bash
kubectl -n coder-observability rollout restart daemonset/grafana-agent
```
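To confirm the restart completed, a standard rollout check can be used:

```shell
# Blocks until all grafana-agent pods have been replaced and are ready
kubectl -n coder-observability rollout status daemonset/grafana-agent
```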
⚠️ **Important**: Classic and native histograms cannot be aggregated together.
If you switch from classic to native histograms, dashboards may need to account for the transition. See the [Prometheus migration guidelines](https://prometheus.io/docs/specs/native_histograms/#migration-considerations) for details.
<details>
<summary>Validate Prometheus Native Histograms</summary>

1) Check Prometheus flags:

Open http://localhost:9090/flags and confirm that `--enable-feature` includes `native-histograms`.

2) Inspect histogram metrics:

* Classic histograms expose metrics with the suffixes `_bucket`, `_sum`, and `_count`.
* Native histograms are exposed directly under the metric name.
* Example: query `coderd_workspace_creation_duration_seconds` in http://localhost:9090/graph.
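The difference also shows up in query shape. A p95 over the metric above, for example (a sketch — with classic histograms the quantile is computed from the `_bucket` series aggregated by `le`, while a native histogram feeds `histogram_quantile` directly):

```promql
# Classic histogram: aggregate the bucket series, keeping the `le` label
histogram_quantile(0.95, sum by (le) (rate(coderd_workspace_creation_duration_seconds_bucket[5m])))

# Native histogram: the bare metric name carries the full distribution
histogram_quantile(0.95, sum(rate(coderd_workspace_creation_duration_seconds[5m])))
```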
3) Check the Grafana Agent (if remote write is enabled):

To confirm, run:

```bash
kubectl -n coder-observability port-forward svc/grafana-agent 3030:80
```

Then open http://localhost:3030 and verify:
* scrape configurations defined in `prometheus.scrape.cadvisor` should have `enable_protobuf_negotiation: true`
* remote write configurations defined in `prometheus.remote_write.default` should have `send_native_histograms: true`

</details>
## Subcharts

{{ template "chart.requirementsTable" . }}

README.md

Lines changed: 70 additions & 2 deletions
@@ -215,6 +215,73 @@ grafana:

*(Adds the same "Prometheus" / "Native Histograms" section as README.gotmpl above.)*
## Subcharts

| Repository | Name | Version |
```diff
@@ -261,8 +328,9 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main
 | global.externalZone | string | `"svc.cluster.local"` | |
 | global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","sslrootcert":null,"username":"coder","volumeMounts":[],"volumes":[]}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts |
 | global.postgres.alerts | object | `{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}}` | alerts for postgres |
-| global.telemetry | object | `{"metrics":{"scrape_interval":"15s","scrape_timeout":"12s"},"profiling":{"delta_profiling_duration":"30s","scrape_interval":"60s","scrape_timeout":"70s"}}` | control telemetry collection |
-| global.telemetry.metrics | object | `{"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection |
+| global.telemetry | object | `{"metrics":{"nativeHistograms":false,"scrape_interval":"15s","scrape_timeout":"12s"},"profiling":{"delta_profiling_duration":"30s","scrape_interval":"60s","scrape_timeout":"70s"}}` | control telemetry collection |
+| global.telemetry.metrics | object | `{"nativeHistograms":false,"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection |
+| global.telemetry.metrics.nativeHistograms | bool | `false` | enable Prometheus native histograms or default to classic histograms |
 | global.telemetry.metrics.scrape_interval | string | `"15s"` | how often the collector will scrape discovered pods |
 | global.telemetry.metrics.scrape_timeout | string | `"12s"` | how long a request will be allowed to wait before being canceled |
 | global.telemetry.profiling.delta_profiling_duration | string | `"30s"` | duration of each pprof profiling capture, must be less than scrape_interval |
```

coder-observability/Chart.lock

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,7 +1,7 @@
 dependencies:
 - name: pyroscope
   repository: https://grafana.github.io/helm-charts
-  version: 1.14.1
+  version: 1.14.2
 - name: grafana
   repository: https://grafana.github.io/helm-charts
   version: 7.3.12
@@ -14,5 +14,5 @@ dependencies:
 - name: grafana-agent
   repository: https://grafana.github.io/helm-charts
   version: 0.37.0
-digest: sha256:5a5f27f74bbf34848da9c1bab508d3b33fda19789016c2eda9608dcd6373921d
-generated: "2025-08-04T13:28:59.433447595-05:00"
+digest: sha256:38b7d46261c4d39a103fbf61eac9da26a997024221ab81078ea5b34fc2b83c68
+generated: "2025-08-27T14:16:57.521541846Z"
```

coder-observability/templates/_collector-config.tpl

Lines changed: 4 additions & 0 deletions
```diff
@@ -230,6 +230,7 @@ prometheus.scrape "pods" {
   scrape_interval = "{{ .Values.global.telemetry.metrics.scrape_interval }}"
   scrape_timeout = "{{ .Values.global.telemetry.metrics.scrape_timeout }}"
+  enable_protobuf_negotiation = {{ .Values.global.telemetry.metrics.nativeHistograms | default false }}
 }

 // These are metric_relabel_configs while discovery.relabel are relabel_configs.
@@ -301,6 +302,7 @@ prometheus.scrape "cadvisor" {
   bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
   scrape_interval = "{{ .Values.global.telemetry.metrics.scrape_interval }}"
   scrape_timeout = "{{ .Values.global.telemetry.metrics.scrape_timeout }}"
+  enable_protobuf_negotiation = {{ .Values.global.telemetry.metrics.nativeHistograms | default false }}
 }

 prometheus.relabel "cadvisor" {
@@ -346,6 +348,7 @@ prometheus.relabel "cadvisor" {

 prometheus.remote_write "default" {
   endpoint {
+    send_native_histograms = {{ .Values.global.telemetry.metrics.nativeHistograms | default false }}
     url ="http://{{ include "prometheus.server.fullname" .Subcharts.prometheus }}.{{ .Release.Namespace }}.{{ .Values.global.zone }}/api/v1/write"

     // drop instance label which unnecessarily adds new series when pods are restarted, since pod IPs are dynamically assigned
@@ -396,6 +399,7 @@ prometheus.scrape "coder_metrics" {

   forward_to = [prometheus.remote_write.default.receiver]
   scrape_interval = "{{ .scrapeInterval }}"
+  enable_protobuf_negotiation = {{ .Values.global.telemetry.metrics.nativeHistograms | default false }}
 }
 {{- end }}
 {{- end }}
```

coder-observability/values.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -113,6 +113,8 @@ global:
       scrape_interval: 15s
       # global.telemetry.metrics.scrape_timeout -- how long a request will be allowed to wait before being canceled
       scrape_timeout: 12s
+      # global.telemetry.metrics.nativeHistograms -- enable Prometheus native histograms or default to classic histograms
+      nativeHistograms: false
     profiling:
       # global.telemetry.profiling.scrape_interval -- how often the collector will scrape pprof endpoints
       scrape_interval: 60s
```
