diff --git a/gcp/modules/monitoring/infra/dashboards.tf b/gcp/modules/monitoring/infra/dashboards.tf index a5070b30..f7e5f8a0 100644 --- a/gcp/modules/monitoring/infra/dashboards.tf +++ b/gcp/modules/monitoring/infra/dashboards.tf @@ -72,3 +72,11 @@ resource "google_monitoring_dashboard" "rekor_v1" { rekor_url = var.rekor_url }) } + +# Consolidated CPU/memory dashboard for all Sigstore GKE workloads. +# See https://github.com/sigstore/public-good-instance/issues/1122 +resource "google_monitoring_dashboard" "workloads" { + project = var.project_id + + dashboard_json = file("${path.module}/workloads.json") +} diff --git a/gcp/modules/monitoring/infra/workloads.json b/gcp/modules/monitoring/infra/workloads.json new file mode 100644 index 00000000..3c99fd54 --- /dev/null +++ b/gcp/modules/monitoring/infra/workloads.json @@ -0,0 +1,525 @@ +{ + "displayName": "Workloads CPU & Memory", + "mosaicLayout": { + "columns": 12, + "tiles": [ + { + "xPos": 0, + "yPos": 0, + "width": 12, + "height": 4, + "widget": { + "title": "Overview", + "text": { + "content": "Consolidated CPU and memory view for all Sigstore GKE workloads (Fulcio, Rekor, CTLog, Trillian, Dex, prober, monitoring, etc.). Charts are grouped by `namespace_name` / `container_name`. 
Use this dashboard as the first stop when investigating high resource usage while on call.\n\nMetric source: GKE system metrics (`kubernetes.io/container/*`, `kubernetes.io/node/*`, `kubernetes.io/pod/*`).", + "format": "MARKDOWN", + "style": { + "fontSize": "FS_LARGE", + "padding": "P_EXTRA_SMALL" + } + } + } + }, + { + "xPos": 0, + "yPos": 4, + "width": 6, + "height": 4, + "widget": { + "title": "CPU usage (cores) by container", + "xyChart": { + "yAxis": { + "label": "cores", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_RATE", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "1" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 4, + "width": 6, + "height": 4, + "widget": { + "title": "Memory used (bytes) by container", + "xyChart": { + "yAxis": { + "label": "bytes", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\" metric.label.\"memory_type\"=\"non-evictable\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "By" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 8, + "width": 6, + "height": 4, + "widget": { + "title": "CPU limit utilization (% of container limit)", + 
"xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "thresholds": [ + { + "value": 0.8 + }, + { + "value": 0.95 + } + ], + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/cpu/limit_utilization\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MAX", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 8, + "width": 6, + "height": 4, + "widget": { + "title": "Memory limit utilization (% of container limit)", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "thresholds": [ + { + "value": 0.8 + }, + { + "value": 0.95 + } + ], + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/memory/limit_utilization\" resource.type=\"k8s_container\" metric.label.\"memory_type\"=\"non-evictable\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MAX", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 12, + "width": 6, + "height": 4, + "widget": { + "title": "CPU request utilization (% of container request)", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": 
"${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/cpu/request_utilization\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MAX", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 12, + "width": 6, + "height": 4, + "widget": { + "title": "Memory request utilization (% of container request)", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/memory/request_utilization\" resource.type=\"k8s_container\" metric.label.\"memory_type\"=\"non-evictable\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MAX", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 16, + "width": 6, + "height": 4, + "widget": { + "title": "Container restarts (delta, 5m)", + "xyChart": { + "yAxis": { + "label": "restarts", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "STACKED_BAR", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/restart_count\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "300s", + "perSeriesAligner": "ALIGN_DELTA", + "crossSeriesReducer": "REDUCE_SUM", + 
"groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + } + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 16, + "width": 6, + "height": 4, + "widget": { + "title": "Node CPU allocatable utilization", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "thresholds": [ + { + "value": 0.9 + } + ], + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.node_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/node/cpu/allocatable_utilization\" resource.type=\"k8s_node\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MEAN", + "groupByFields": [ + "resource.label.node_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 20, + "width": 6, + "height": 4, + "widget": { + "title": "Node memory allocatable utilization", + "xyChart": { + "yAxis": { + "label": "utilization", + "scale": "LINEAR" + }, + "thresholds": [ + { + "value": 0.9 + } + ], + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.node_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/node/memory/allocatable_utilization\" resource.type=\"k8s_node\" metric.label.\"memory_type\"=\"non-evictable\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_MEAN", + "groupByFields": [ + "resource.label.node_name" + ] + } + }, + "unitOverride": "10^2.%" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 20, + "width": 6, + "height": 4, + "widget": { + "title": "Ephemeral storage used (bytes) by container", + "xyChart": { + "yAxis": { + "label": "bytes", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": 
"${resource.labels.namespace_name}/${resource.labels.container_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/ephemeral_storage/used_bytes\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name", + "resource.label.container_name" + ] + } + }, + "unitOverride": "By" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 24, + "width": 6, + "height": 4, + "widget": { + "title": "Pod network received (bytes/s) by namespace", + "xyChart": { + "yAxis": { + "label": "bytes/s", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_RATE", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name" + ] + } + }, + "unitOverride": "By/s" + } + } + ] + } + } + }, + { + "xPos": 6, + "yPos": 24, + "width": 6, + "height": 4, + "widget": { + "title": "Pod network sent (bytes/s) by namespace", + "xyChart": { + "yAxis": { + "label": "bytes/s", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "LINE", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/pod/network/sent_bytes_count\" resource.type=\"k8s_pod\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_RATE", + "crossSeriesReducer": "REDUCE_SUM", + "groupByFields": [ + "resource.label.namespace_name" + ] + } + }, + "unitOverride": "By/s" + } + } + ] + } + } + }, + { + "xPos": 0, + "yPos": 28, + "width": 12, + "height": 4, + 
"widget": { + "title": "Running containers per namespace", + "xyChart": { + "yAxis": { + "label": "containers", + "scale": "LINEAR" + }, + "dataSets": [ + { + "plotType": "STACKED_AREA", + "targetAxis": "Y1", + "legendTemplate": "${resource.labels.namespace_name}", + "timeSeriesQuery": { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/uptime\" resource.type=\"k8s_container\"", + "aggregation": { + "alignmentPeriod": "60s", + "perSeriesAligner": "ALIGN_MEAN", + "crossSeriesReducer": "REDUCE_COUNT", + "groupByFields": [ + "resource.label.namespace_name" + ] + } + } + } + } + ] + } + } + } + ] + } +}