Skip to content

Latest commit

 

History

History
83 lines (73 loc) · 12.3 KB

pod-metrics.md

File metadata and controls

83 lines (73 loc) · 12.3 KB

Pod Metrics

Metric name Metric type Labels/tags Status
kube_pod_info Gauge pod=<pod-name>
namespace=<pod-namespace>
host_ip=<host-ip>
pod_ip=<pod-ip>
node=<node-name>
created_by_kind=<created_by_kind>
created_by_name=<created_by_name>
uid=<pod-uid>
priority_class=<priority_class>
host_network=<host_network>
STABLE
kube_pod_start_time Gauge pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_completion_time Gauge pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_owner Gauge pod=<pod-name>
namespace=<pod-namespace>
owner_kind=<owner kind>
owner_name=<owner name>
owner_is_controller=<whether owner is controller>
uid=<pod-uid>
STABLE
kube_pod_labels Gauge pod=<pod-name>
namespace=<pod-namespace>
label_POD_LABEL=<POD_LABEL>
uid=<pod-uid>
STABLE
kube_pod_status_phase Gauge pod=<pod-name>
namespace=<pod-namespace>
phase=<Pending|Running|Succeeded|Failed|Unknown>
uid=<pod-uid>
STABLE
kube_pod_status_ready Gauge pod=<pod-name>
namespace=<pod-namespace>
condition=<true|false|unknown>
uid=<pod-uid>
STABLE
kube_pod_status_scheduled Gauge pod=<pod-name>
namespace=<pod-namespace>
condition=<true|false|unknown>
uid=<pod-uid>
STABLE
kube_pod_container_info Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
image=<image-name>
image_id=<image-id>
container_id=<containerid>
uid=<pod-uid>
STABLE
kube_pod_container_status_waiting Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_container_status_waiting_reason Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
reason=<ContainerCreating|CrashLoopBackOff|ErrImagePull|ImagePullBackOff|CreateContainerConfigError|InvalidImageName|CreateContainerError>
uid=<pod-uid>
STABLE
kube_pod_container_status_running Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_container_state_started Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_container_status_terminated Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_container_status_terminated_reason Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
reason=<OOMKilled|Error|Completed|ContainerCannotRun|DeadlineExceeded|Evicted>
uid=<pod-uid>
STABLE
kube_pod_container_status_last_terminated_reason Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
reason=<OOMKilled|Error|Completed|ContainerCannotRun|DeadlineExceeded|Evicted>
uid=<pod-uid>
STABLE
kube_pod_container_status_ready Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_container_status_restarts_total Counter container=<container-name>
namespace=<pod-namespace>
pod=<pod-name>
uid=<pod-uid>
STABLE
kube_pod_container_resource_requests Gauge resource=<resource-name>
unit=<resource-unit>
container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
node=<node-name>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_container_resource_limits Gauge resource=<resource-name>
unit=<resource-unit>
container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
node=<node-name>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_overhead_cpu_cores Gauge pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_overhead_memory_bytes Gauge pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_runtimeclass_name_info Gauge pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_created Gauge pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_deletion_timestamp Gauge pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_restart_policy Gauge pod=<pod-name>
namespace=<pod-namespace>
type=<Always|Never|OnFailure>
uid=<pod-uid>
STABLE
kube_pod_init_container_info Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
image=<image-name>
image_id=<image-id>
container_id=<containerid>
uid=<pod-uid>
STABLE
kube_pod_init_container_status_waiting Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_init_container_status_waiting_reason Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
reason=<ContainerCreating|CrashLoopBackOff|ErrImagePull|ImagePullBackOff|CreateContainerConfigError|CreateContainerError|InvalidImageName>
uid=<pod-uid>
STABLE
kube_pod_init_container_status_running Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_init_container_status_terminated Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_init_container_status_terminated_reason Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
reason=<OOMKilled|Error|Completed|ContainerCannotRun|DeadlineExceeded|Evicted>
uid=<pod-uid>
STABLE
kube_pod_init_container_status_last_terminated_reason Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
reason=<OOMKilled|Error|Completed|ContainerCannotRun|DeadlineExceeded|Evicted>
uid=<pod-uid>
STABLE
kube_pod_init_container_status_ready Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_init_container_status_restarts_total Counter container=<container-name>
namespace=<pod-namespace>
pod=<pod-name>
uid=<pod-uid>
STABLE
kube_pod_init_container_resource_limits Gauge resource=<resource-name>
unit=<resource-unit>
container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_init_container_resource_limits_cpu_cores Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_init_container_resource_limits_memory_bytes Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_init_container_resource_limits_storage_bytes Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_init_container_resource_limits_ephemeral_storage_bytes Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_init_container_resource_requests Gauge resource=<resource-name>
unit=<resource-unit>
container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_init_container_resource_requests_cpu_cores Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_init_container_resource_requests_memory_bytes Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_init_container_resource_requests_storage_bytes Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_init_container_resource_requests_ephemeral_storage_bytes Gauge container=<container-name>
pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_spec_volumes_persistentvolumeclaims_info Gauge pod=<pod-name>
namespace=<pod-namespace>
volume=<volume-name>
persistentvolumeclaim=<persistentvolumeclaim-claimname>
uid=<pod-uid>
STABLE
kube_pod_spec_volumes_persistentvolumeclaims_readonly Gauge pod=<pod-name>
namespace=<pod-namespace>
volume=<volume-name>
persistentvolumeclaim=<persistentvolumeclaim-claimname>
uid=<pod-uid>
STABLE
kube_pod_status_reason Gauge pod=<pod-name>
namespace=<pod-namespace>
reason=<NodeLost|Evicted|UnexpectedAdmissionError>
uid=<pod-uid>
EXPERIMENTAL
kube_pod_status_scheduled_time Gauge pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE
kube_pod_status_unschedulable Gauge pod=<pod-name>
namespace=<pod-namespace>
uid=<pod-uid>
STABLE

Useful metrics queries

How to retrieve non-standard Pod state

It is not straightforward to get the Pod state in certain cases, such as "Terminating" and "Unknown", because these states are not stored in a field of Pod.Status.

So to mimic the logic used by the kubectl command line, you will need to compose multiple metrics.

For example:

  • To get the list of pods that are in the Unknown state, you can run the following PromQL query: sum(kube_pod_status_phase{phase="Unknown"}) by (namespace, pod) or (count(kube_pod_deletion_timestamp) by (namespace, pod) * sum(kube_pod_status_reason{reason="NodeLost"}) by(namespace, pod))

  • For Pods in Terminating state: count(kube_pod_deletion_timestamp) by (namespace, pod) * count(kube_pod_status_reason{reason="NodeLost"} == 0) by (namespace, pod)

Here is an example of a Prometheus rule that can be used to alert on a Pod that has been in the Terminating state for more than 5m.

groups:
- name: Pod state
  rules:
  - alert: PodsBlockInTerminatingState
    expr: count(kube_pod_deletion_timestamp) by (namespace, pod) * count(kube_pod_status_reason{reason="NodeLost"} == 0) by (namespace, pod) > 0
    for: 5m
    labels:
      severity: page
    annotations:
      summary: Pod {{ $labels.namespace }}/{{ $labels.pod }} blocked in Terminating state.