Skip to content

Commit

Permalink
Add the ability to scrape the Kubelet's resource metrics (#786)
Browse files Browse the repository at this point in the history
* Add the ability to scrape the Kubelet's resource metrics

Signed-off-by: Pete Wall <[email protected]>

* Update job label to be consistent

Signed-off-by: Pete Wall <[email protected]>

* Update test

Signed-off-by: Pete Wall <[email protected]>

---------

Signed-off-by: Pete Wall <[email protected]>
  • Loading branch information
petewall authored Oct 10, 2024
1 parent 739af8a commit be9a3d6
Show file tree
Hide file tree
Showing 117 changed files with 4,024 additions and 36 deletions.
13 changes: 13 additions & 0 deletions charts/feature-cluster-metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,19 @@ action on the metric list, including filtering based on label or other actions.
| kubelet.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
| kubelet.scrapeInterval | string | `60s` | How frequently to scrape Kubelet metrics. |

### Kubelet Resources

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| kubeletResource.enabled | bool | `true` | Scrape resource metrics from kubelet. |
| kubeletResource.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for Kubelet Resources entities. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) |
| kubeletResource.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for Kubelet Resources metrics. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) |
| kubeletResource.maxCacheSize | string | `100000` | Sets the max_cache_size for the Kubelet Resources prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
| kubeletResource.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
| kubeletResource.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
| kubeletResource.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of resource metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
| kubeletResource.scrapeInterval | string | `60s` | How frequently to scrape Kubelet Resource metrics. |

### Node Exporter - Deployment settings

| Key | Type | Default | Description |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
# The minimal set of resource metrics from the Kubelet required for Kubernetes Monitoring
- node_cpu_usage_seconds_total
- node_memory_working_set_bytes
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{{/*
  feature.clusterMetrics.kubeletResource.allowList

  Renders the metric allow list for Kubelet resource metrics as YAML list text:
    - the contents of default-allow-lists/kubelet_resource.yaml, when
      .Values.kubeletResource.metricsTuning.useDefaultAllowList is truthy;
    - followed by .Values.kubeletResource.metricsTuning.includeMetrics
      (rendered with toYaml), when that list is non-empty.

  NOTE(review): no whitespace-trim markers are used in this template, so the
  rendered text still contains newlines even when neither source contributes
  any entries. Callers should parse the result (e.g. with fromYamlArray)
  rather than test the raw string for emptiness.
*/}}
{{ define "feature.clusterMetrics.kubeletResource.allowList" }}
{{ if .Values.kubeletResource.metricsTuning.useDefaultAllowList }}
{{ "default-allow-lists/kubelet_resource.yaml" | .Files.Get }}
{{ end }}
{{ if .Values.kubeletResource.metricsTuning.includeMetrics }}
{{ .Values.kubeletResource.metricsTuning.includeMetrics | toYaml }}
{{ end }}
{{ end }}

{{- /*
  feature.clusterMetrics.kubeletResource.alloy

  Renders the Alloy configuration that scrapes the Kubelet's resource metrics.
  Nothing is emitted unless .Values.kubeletResource.enabled is truthy.

  Output:
    - a kubernetes.resources "scrape" component with clustering enabled and the
      job label "integrations/kubernetes/resources";
    - keep_metrics: "up" plus every entry of the allow list, only when the
      allow list actually contains entries;
    - drop_metrics: kubeletResource.metricsTuning.excludeMetrics joined with "|",
      only when that list is non-empty;
    - scrape_interval / max_cache_size: the kubeletResource value, falling back
      to the matching .Values.global value;
    - when kubeletResource.extraMetricProcessingRules is set, metrics are routed
      through a prometheus.relabel "kubelet_resources" component that carries
      those rules before being forwarded to argument.metrics_destinations.value;
      otherwise the scrape component forwards there directly.
*/}}
{{- define "feature.clusterMetrics.kubeletResource.alloy" }}
{{- if .Values.kubeletResource.enabled }}
{{- /*
  Parse the allow list into a real list before testing it. The raw include
  output always contains newlines, so testing the string itself is always true
  and would render keep_metrics = "up|" (keep only "up") for an empty list.
*/}}
{{- $metricAllowList := include "feature.clusterMetrics.kubeletResource.allowList" . | fromYamlArray }}
{{- $metricDenyList := .Values.kubeletResource.metricsTuning.excludeMetrics }}

kubernetes.resources "scrape" {
  clustering = true
  job_label = "integrations/kubernetes/resources"
{{- if $metricAllowList }}
  keep_metrics = "up|{{ $metricAllowList | join "|" }}"
{{- end }}
{{- if $metricDenyList }}
  drop_metrics = {{ $metricDenyList | join "|" | quote }}
{{- end }}
  scrape_interval = {{ .Values.kubeletResource.scrapeInterval | default .Values.global.scrapeInterval | quote }}
  max_cache_size = {{ .Values.kubeletResource.maxCacheSize | default .Values.global.maxCacheSize | int }}
{{- if .Values.kubeletResource.extraMetricProcessingRules }}
  forward_to = [prometheus.relabel.kubelet_resources.receiver]
}

prometheus.relabel "kubelet_resources" {
  max_cache_size = {{ .Values.kubeletResource.maxCacheSize | default .Values.global.maxCacheSize | int }}

{{ .Values.kubeletResource.extraMetricProcessingRules | indent 2 }}

{{- end }}
  forward_to = argument.metrics_destinations.value
}
{{- end }}
{{- end }}
3 changes: 2 additions & 1 deletion charts/feature-cluster-metrics/templates/_module.alloy.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ declare "cluster_metrics" {
comment = "Must be a list of metric destinations where collected metrics should be forwarded to"
}

{{- if or .Values.cadvisor.enabled .Values.kubelet.enabled (or .Values.apiServer.enabled (and .Values.controlPlane.enabled (not (eq .Values.apiServer.enabled false)))) }}
{{- if or .Values.cadvisor.enabled .Values.kubelet.enabled .Values.kubeletResource.enabled (or .Values.apiServer.enabled (and .Values.controlPlane.enabled (not (eq .Values.apiServer.enabled false)))) }}
import.git "kubernetes" {
repository = "https://github.com/grafana/alloy-modules.git"
revision = "main"
Expand All @@ -13,6 +13,7 @@ declare "cluster_metrics" {
}
{{- end }}
{{- include "feature.clusterMetrics.kubelet.alloy" . | indent 2 }}
{{- include "feature.clusterMetrics.kubeletResource.alloy" . | indent 2 }}
{{- include "feature.clusterMetrics.cadvisor.alloy" . | indent 2 }}
{{- include "feature.clusterMetrics.apiServer.alloy" . | indent 2 }}
{{- include "feature.clusterMetrics.kubeControllerManager.alloy" . | indent 2 }}
Expand Down
9 changes: 9 additions & 0 deletions charts/feature-cluster-metrics/tests/default_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ tests:
forward_to = argument.metrics_destinations.value
}
kubernetes.resources "scrape" {
clustering = true
job_label = "integrations/kubernetes/resources"
keep_metrics = "up|node_cpu_usage_seconds_total|node_memory_working_set_bytes"
scrape_interval = "60s"
max_cache_size = 100000
forward_to = argument.metrics_destinations.value
}
kubernetes.cadvisor "scrape" {
clustering = true
keep_metrics = "up|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|machine_memory_bytes"
Expand Down
34 changes: 34 additions & 0 deletions charts/feature-cluster-metrics/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,40 @@
}
}
},
"kubeletResource": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"extraDiscoveryRules": {
"type": "string"
},
"extraMetricProcessingRules": {
"type": "string"
},
"maxCacheSize": {
"type": "null"
},
"metricsTuning": {
"type": "object",
"properties": {
"excludeMetrics": {
"type": "array"
},
"includeMetrics": {
"type": "array"
},
"useDefaultAllowList": {
"type": "boolean"
}
}
},
"scrapeInterval": {
"type": "string"
}
}
},
"nameOverride": {
"type": "string"
},
Expand Down
45 changes: 45 additions & 0 deletions charts/feature-cluster-metrics/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,51 @@ kubelet:
# @section -- Kubelet
scrapeInterval: ""

# Kubelet Resource metrics report resource usage (such as CPU and memory) for each node.
kubeletResource:
# -- Scrape resource metrics from kubelet.
# @section -- Kubelet Resources
enabled: true

# -- Rule blocks to be added to the discovery.relabel component for Kubelet Resources entities.
# These relabeling rules are applied pre-scrape against the targets from service discovery.
# Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped.
# ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block))
# @section -- Kubelet Resources
extraDiscoveryRules: ""

# -- Rule blocks to be added to the prometheus.relabel component for Kubelet Resources metrics.
# These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present.
# ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block))
# @section -- Kubelet Resources
extraMetricProcessingRules: ""

# Adjustments to the scraped metrics to filter the amount of data sent to storage.
metricsTuning:
# -- Filter the list of resource metrics from the Kubelet to the minimal set required for Kubernetes Monitoring.
# See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists)
# @section -- Kubelet Resources
useDefaultAllowList: true
# -- Metrics to keep. Can use regular expressions.
# @section -- Kubelet Resources
includeMetrics: []
# -- Metrics to drop. Can use regular expressions.
# @section -- Kubelet Resources
excludeMetrics: []

# -- Sets the max_cache_size for the Kubelet Resources prometheus.relabel component.
# This should be at least 2x-5x your largest scrape target or samples appended rate.
# ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments))
# Overrides global.maxCacheSize
# @default -- `100000`
# @section -- Kubelet Resources
maxCacheSize:

# -- How frequently to scrape Kubelet Resource metrics.
# @default -- `60s`
# @section -- Kubelet Resources
scrapeInterval: ""

# cAdvisor metrics gather information about containers on each node.
cadvisor:
# -- Scrape metrics from cAdvisor.
Expand Down
14 changes: 14 additions & 0 deletions charts/k8s-monitoring-v1/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,20 @@ The Prometheus and Loki services may be hosted on the same cluster, or remotely
| metrics.kubelet.nodeAddressFormat | string | `"direct"` | How to access the node services, either direct (use node IP, requires nodes/metrics) or via proxy (requires nodes/proxy) |
| metrics.kubelet.scrapeInterval | string | 60s | How frequently to scrape metrics from the Kubelet. Overrides metrics.scrapeInterval |

### Metrics Job: Kubelet Resources

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| metrics.kubeletResource.enabled | bool | `true` | Scrape resource metrics from the Kubelet |
| metrics.kubeletResource.extraMetricRelabelingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for Kubelet Resources. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) |
| metrics.kubeletResource.extraRelabelingRules | string | `""` | Rule blocks to be added to the discovery.relabel component for Kubelet Resources. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) |
| metrics.kubeletResource.maxCacheSize | string | `nil` | Sets the max_cache_size for the Kubelet Resources prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides metrics.maxCacheSize |
| metrics.kubeletResource.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
| metrics.kubeletResource.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
| metrics.kubeletResource.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
| metrics.kubeletResource.nodeAddressFormat | string | `"direct"` | How to access the node services, either direct (use node IP, requires nodes/metrics) or via proxy (requires nodes/proxy) |
| metrics.kubeletResource.scrapeInterval | string | 60s | How frequently to scrape resource metrics from the Kubelet. Overrides metrics.scrapeInterval |

### Metrics Job: Kubernetes Monitoring Telemetry

| Key | Type | Default | Description |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
# The minimal set of resource metrics from the Kubelet required for Kubernetes Monitoring
- node_cpu_usage_seconds_total
- node_memory_working_set_bytes
1 change: 1 addition & 0 deletions charts/k8s-monitoring-v1/docs/HelmTests.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ These queries are added by default and are used if their respective metric sourc
| | `up` | `metrics.enabled: true` |
| Grafana Alloy | `alloy_build_info{cluster="<clusterName>"}` | `metrics.enabled: true`<br>`metrics.alloy.enabled: true` |
| Kubelet | `kubernetes_build_info{cluster="<clusterName>"}` | `metrics.enabled: true`<br>`metrics.kubelet.enabled: true` |
| Kubelet Resource | `node_cpu_usage_seconds_total{cluster="<clusterName>"}` | `metrics.enabled: true`<br>`metrics.kubeletResource.enabled: true` |
| cAdvisor | `machine_memory_bytes{cluster="<clusterName>"}` | `metrics.enabled: true`<br>`metrics.cadvisor.enabled: true` |
| kube-state-metrics | `kube_node_info{cluster="<clusterName>"}` | `metrics.enabled: true`<br>`metrics.kube-state-metrics.enabled: true` |
| Node Exporter | `node_exporter_build_info{cluster="<clusterName>"}` | `metrics.enabled: true`<br>`metrics.node-exporter.enabled: true` |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,40 @@ prometheus.relabel "kubelet" {
forward_to = [prometheus.relabel.metrics_service.receiver]
}

// Kubelet Resource
discovery.relabel "kubelet_resource" {
targets = discovery.kubernetes.nodes.targets
rule {
replacement = "/metrics/resource"
target_label = "__metrics_path__"
}
}

prometheus.scrape "kubelet_resource" {
job_name = "integrations/kubernetes/resources"
targets = discovery.relabel.kubelet_resource.output
scheme = "https"
scrape_interval = "60s"
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
tls_config {
insecure_skip_verify = true
}
clustering {
enabled = true
}
forward_to = [prometheus.relabel.kubelet_resource.receiver]
}

prometheus.relabel "kubelet_resource" {
max_cache_size = 100000
rule {
source_labels = ["__name__"]
regex = "up|node_cpu_usage_seconds_total|node_memory_working_set_bytes"
action = "keep"
}
forward_to = [prometheus.relabel.metrics_service.receiver]
}

// cAdvisor
discovery.relabel "cadvisor" {
targets = discovery.kubernetes.nodes.targets
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,40 @@ data:
forward_to = [prometheus.relabel.metrics_service.receiver]
}

// Kubelet Resource
discovery.relabel "kubelet_resource" {
targets = discovery.kubernetes.nodes.targets
rule {
replacement = "/metrics/resource"
target_label = "__metrics_path__"
}
}

prometheus.scrape "kubelet_resource" {
job_name = "integrations/kubernetes/resources"
targets = discovery.relabel.kubelet_resource.output
scheme = "https"
scrape_interval = "60s"
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
tls_config {
insecure_skip_verify = true
}
clustering {
enabled = true
}
forward_to = [prometheus.relabel.kubelet_resource.receiver]
}

prometheus.relabel "kubelet_resource" {
max_cache_size = 100000
rule {
source_labels = ["__name__"]
regex = "up|node_cpu_usage_seconds_total|node_memory_working_set_bytes"
action = "keep"
}
forward_to = [prometheus.relabel.metrics_service.receiver]
}

// cAdvisor
discovery.relabel "cadvisor" {
targets = discovery.kubernetes.nodes.targets
Expand Down Expand Up @@ -964,7 +998,7 @@ data:
k8s-monitoring-build-info-metric.prom: |
# HELP grafana_kubernetes_monitoring_build_info A metric to report the version of the Kubernetes Monitoring Helm chart as well as a summary of enabled features
# TYPE grafana_kubernetes_monitoring_build_info gauge
grafana_kubernetes_monitoring_build_info{version="1.5.6", namespace="default", metrics="enabled,alloy,autoDiscover,kube-state-metrics,node-exporter,kubelet,cadvisor,cost", logs="enabled,events,pod_logs", traces="disabled", deployments="kube-state-metrics,prometheus-node-exporter,prometheus-operator-crds,opencost"} 1
grafana_kubernetes_monitoring_build_info{version="1.5.6", namespace="default", metrics="enabled,alloy,autoDiscover,kube-state-metrics,node-exporter,kubelet,kubeletResource,cadvisor,cost", logs="enabled,events,pod_logs", traces="disabled", deployments="kube-state-metrics,prometheus-node-exporter,prometheus-operator-crds,opencost"} 1
---
# Source: k8s-monitoring/templates/alloy-events-config.yaml
apiVersion: v1
Expand Down Expand Up @@ -68074,6 +68108,40 @@ data:
forward_to = [prometheus.relabel.metrics_service.receiver]
}

// Kubelet Resource
discovery.relabel "kubelet_resource" {
targets = discovery.kubernetes.nodes.targets
rule {
replacement = "/metrics/resource"
target_label = "__metrics_path__"
}
}

prometheus.scrape "kubelet_resource" {
job_name = "integrations/kubernetes/resources"
targets = discovery.relabel.kubelet_resource.output
scheme = "https"
scrape_interval = "60s"
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
tls_config {
insecure_skip_verify = true
}
clustering {
enabled = true
}
forward_to = [prometheus.relabel.kubelet_resource.receiver]
}

prometheus.relabel "kubelet_resource" {
max_cache_size = 100000
rule {
source_labels = ["__name__"]
regex = "up|node_cpu_usage_seconds_total|node_memory_working_set_bytes"
action = "keep"
}
forward_to = [prometheus.relabel.metrics_service.receiver]
}

// cAdvisor
discovery.relabel "cadvisor" {
targets = discovery.kubernetes.nodes.targets
Expand Down Expand Up @@ -68711,6 +68779,10 @@ data:
"query": "kubernetes_build_info{cluster=\"alloy-autoscaling-and-storage-test\"}",
"type": "promql"
},
{
"query": "node_cpu_usage_seconds_total{cluster=\"alloy-autoscaling-and-storage-test\"}",
"type": "promql"
},
{
"query": "machine_memory_bytes{cluster=\"alloy-autoscaling-and-storage-test\"}",
"type": "promql"
Expand Down
Loading

0 comments on commit be9a3d6

Please sign in to comment.