diff --git a/.github/workflows/alerts-test.yml b/.github/workflows/alerts-test.yml new file mode 100644 index 00000000..cde58f85 --- /dev/null +++ b/.github/workflows/alerts-test.yml @@ -0,0 +1,54 @@ +name: Alerts-test-pgskipper-operator +on: + workflow_run: + workflows: ["Build Artifacts"] + types: + - completed + pull_request: + branches: + - all + +env: + max_attempts: 30 + delay: 10 + +permissions: + contents: read + +jobs: + Run-Alerts-Test: + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Check out repository code + uses: actions/checkout@v4 + + - name: Check yq version + run: yq --version + + - name: Install Helm + run: | + curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + + - name: Render rules file from helm chart + run: | + helm template pgskipper-montemplates ./charts/patroni-services/. --set metricCollector.install=true --set metricCollector.metricsProfile=dev --set metricCollector.prometheusMonitoring=true --set siteManager.install=true > ./tests/alerts-tests/rules.yaml + sed -n '/prometheus-rule.yml/,/---/p' -i ./tests/alerts-tests/rules.yaml + sed '0,/spec:/d' -i ./tests/alerts-tests/rules.yaml + + - name: Check that all necessary tests exists + run: | + chmod +x ./tests/alerts-tests/tests-checker.sh + cd ./tests/alerts-tests/ + ./tests-checker.sh + continue-on-error: true + + - name: Install vmalert-tool + run: | + wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v1.122.4/vmutils-linux-amd64-v1.122.4-enterprise.tar.gz + tar -xvf vmutils-linux-amd64-v1.122.4-enterprise.tar.gz + chmod +x vmalert-tool-prod + + - name: Run test + run: | + ./vmalert-tool-prod unittest --files ./tests/alerts-tests/test.yaml \ No newline at end of file diff --git a/charts/patroni-services/Chart.yaml b/charts/patroni-services/Chart.yaml index a22c5e1c..2f919574 100644 --- a/charts/patroni-services/Chart.yaml +++ b/charts/patroni-services/Chart.yaml @@ -12,3 +12,11 @@ version: 0.1.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. appVersion: 1.16.0 + +dependencies: + # Prometheus alert rules +- name: metricCollector + condition: metricCollector.prometheusMonitoring + version: ~0 + repository: "file://charts/metricCollector" + \ No newline at end of file diff --git a/charts/patroni-services/charts/metricCollector/Chart.yaml b/charts/patroni-services/charts/metricCollector/Chart.yaml new file mode 100644 index 00000000..deb696c1 --- /dev/null +++ b/charts/patroni-services/charts/metricCollector/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: metricCollector +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. 
This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/charts/patroni-services/charts/metricCollector/templates/_helpers.tpl b/charts/patroni-services/charts/metricCollector/templates/_helpers.tpl new file mode 100644 index 00000000..30f09219 --- /dev/null +++ b/charts/patroni-services/charts/metricCollector/templates/_helpers.tpl @@ -0,0 +1,483 @@ +{{- define "defaultAlerts" -}} + {{ .Release.Namespace }}-{{ .Release.Name }}: + rules: + PostgreSQL metrics are absent: + annotations: + description: 'PostgreSQL metrics are absent on {{ .Release.Namespace }}.' + summary: PostgreSQL metrics are absent + expr: absent(ma_pg_patroni_cluster_status{namespace="{{ .Release.Namespace }}"}) == 1 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + PostgreSQL is Down: + annotations: + description: 'PostgreSQL is Down on {{ .Release.Namespace }}.' + summary: PostgreSQL is Down + expr: ma_pg_patroni_cluster_status{namespace="{{ .Release.Namespace }}"} == 10 or ma_pg_patroni_cluster_status{namespace="{{ .Release.Namespace }}"} < 0 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: critical + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + PostgreSQL is Degraded: + annotations: + description: 'PostgreSQL is Degraded on {{ .Release.Namespace }}.' + summary: PostgreSQL is Degraded + expr: ma_pg_patroni_cluster_status{namespace="{{ .Release.Namespace }}"} == 6 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + {{- if .Values.prometheusRules.backupAlertsInstall }} + Space for Postgres backup is less than acceptable critical threshold: + annotations: + description: 'Backup space is less than {{ default 5 .Values.prometheusRules.backupAlertThreshold }} percent free on {{ .Release.Namespace }}' + summary: Space for Postgres backup is less than acceptable critical threshold + expr: ma_storage_free_space{namespace="{{ default "common" .Release.Namespace }}", service_name='postgres-backup-daemon'} / ma_storage_total_space{namespace="{{ .Release.Namespace }}", service_name='postgres-backup-daemon'} < {{ default 5 .Values.prometheusRules.backupAlertThreshold }}*0.01 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: critical + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Space for Postgres backup is less than acceptable warning threshold: + annotations: + description: 'Backup space is less than {{ default 20 .Values.prometheusRules.backupWarningThreshold }} percent free on {{ .Release.Namespace }}' + summary: Space for Postgres backup is less than acceptable warning threshold + expr: ma_storage_free_space{namespace="{{ .Release.Namespace }}", service_name='postgres-backup-daemon'} / ma_storage_total_space{namespace="{{ .Release.Namespace }}", service_name='postgres-backup-daemon'} < {{ default 20 .Values.prometheusRules.backupWarningThreshold }}*0.01 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Last Postgres backup is failed: + annotations: + 
description: 'Last backup is failed on {{ .Release.Namespace }}' + summary: Targets are down + expr: ma_storage_last_failed{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"} > 0 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Last Successful Postgres backup is too old: + annotations: + description: 'Last Successful backup is too old on {{ .Release.Namespace }}' + summary: Last Successful Postgres backup is too old + expr: time() - (ma_storage_lastSuccessful_metrics_end_backup_timestamp{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"}/1000) > {{ default 86400 .Values.prometheusRules.maxLastBackupAge }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + PostgreSQL backup agent has problem: + annotations: + description: 'PostgreSQL backup agent has problem on {{ .Release.Namespace }}' + summary: PostgreSQL backup agent has problem + expr: ma_status{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"} == 6 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Unable to collect metrics from PostgreSQL backup agent: + annotations: + description: 'Unable to collect metrics from PostgreSQL backup agent on {{ .Release.Namespace }}' + summary: Unable to collect metrics from PostgreSQL backup agent + expr: ma_status{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"} < 0 or absent(ma_status{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"}) == 1 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + {{- end }} + Patroni first Node is not running: + annotations: + description: 'Patroni status First Node is not running on {{ .Release.Namespace }}' + summary: Patroni first Node is not running + expr: ma_pg_patroni_patroni_status{namespace="{{ .Release.Namespace }}", pg_node="node1"} == 0 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Patroni fourth Node is not running: + annotations: + description: 'Patroni status Fourth Node is not running on {{ .Release.Namespace }}' + summary: Patroni fourth Node is not running + expr: ma_pg_patroni_patroni_status{namespace="{{ .Release.Namespace }}", pg_node="node4"} == 0 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Patroni second Node is not running: + annotations: + description: 'Patroni status Second Node is not running on {{ .Release.Namespace }}' + summary: Patroni second Node is not running + expr: ma_pg_patroni_patroni_status{namespace="{{ .Release.Namespace }}", pg_node="node2"} == 0 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Patroni third Node is not running: + annotations: + description: 'Patroni status Third Node is not running on {{ .Release.Namespace }}' + summary: Patroni third Node is not running + expr: ma_pg_patroni_patroni_status{namespace="{{ 
.Release.Namespace }}", pg_node="node3"} == 0 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Postgres First Node Disk is almost full: + annotations: + description: 'Disk space Postgres First Node Disk is almost full on {{ .Release.Namespace }}' + summary: Postgres First Node Disk is almost full + expr: ma_pg_patroni_metrics_df_pcent{namespace="{{ .Release.Namespace }}", pg_node="node1"} > 90 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Postgres Fourth Node Disk is almost full: + annotations: + description: 'Disk space Postgres Fourth Node Disk is almost full on {{ .Release.Namespace }}' + summary: Postgres Fourth Node Disk is almost full + expr: ma_pg_patroni_metrics_df_pcent{namespace="{{ .Release.Namespace }}", pg_node="node4"} > 90 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Postgre Second Node Disk is almost full: + annotations: + description: 'Disk space Postgre Second Node Disk is almost full on {{ .Release.Namespace }}' + summary: Postgre Second Node Disk is almost full + expr: ma_pg_patroni_metrics_df_pcent{namespace="{{ .Release.Namespace }}", pg_node="node2"} > 90 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Postgres Third Node Disk is almost full: + annotations: + description: 'Disk space Postgres Third Node Disk is almost full on {{ .Release.Namespace }}' + summary: Postgres Third Node Disk is almost full + expr: ma_pg_patroni_metrics_df_pcent{namespace="{{ .Release.Namespace }}", pg_node="node3"} > 90 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Locks on First Node more then acceptable threshold: + annotations: + description: 'Locks on First Node more then {{ default 500 .Values.prometheusRules.locksThreshold }} on {{ .Release.Namespace }}' + summary: Locks on First Node more then acceptable threshold + expr: ma_pg_metrics_locks{namespace="{{ .Release.Namespace }}", pg_node="node1"} > {{ default 500 .Values.prometheusRules.locksThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Locks on Fourth Node more then acceptable threshold: + annotations: + description: 'Locks on Fourth Node more then {{ default 500 .Values.prometheusRules.locksThreshold }} on {{ .Release.Namespace }}' + summary: Locks on Fourth Node more then acceptable threshold + expr: ma_pg_metrics_locks{namespace="{{ .Release.Namespace }}", pg_node="node4"} > {{ default 500 .Values.prometheusRules.locksThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Locks on Second Node more then acceptable threshold: + annotations: + description: 'Locks on Second Node more then {{ default 500 .Values.prometheusRules.locksThreshold }} on {{ .Release.Namespace }}' + summary: Locks on Second Node more then acceptable threshold + expr: ma_pg_metrics_locks{namespace="{{ .Release.Namespace }}", pg_node="node2"} > {{ 
default 500 .Values.prometheusRules.locksThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Locks on Third Node more then acceptable threshold: + annotations: + description: 'Locks on Third Node more then {{ default 500 .Values.prometheusRules.locksThreshold }} on {{ .Release.Namespace }}' + summary: Locks on Third Node more then acceptable threshold + expr: ma_pg_metrics_locks{namespace="{{ .Release.Namespace }}", pg_node="node3"} > {{ default 500 .Values.prometheusRules.locksThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Memory on Postgres First Node is more than 95% busy: + annotations: + description: 'Memory Postgres First Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: Memory on Postgres First Node is more than 95 percent busy + expr: container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node1"} / container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node1"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node1"} > -1 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Memory on Postgres Fourth Node is more than 95% busy: + annotations: + description: 'Memory Postgres Fourth Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: Memory on Postgres Fourth Node is more than 95 percent busy + expr: container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node4"} / container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node4"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node4"} > -1 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Memory on Postgres Second Node is more than 95% busy: + annotations: + description: 'Memory Postgres Second Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: Memory on Postgres Second Node is more than 95 percent busy + expr: container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node2"} / container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node2"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node2"} > -1 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Memory on Postgres Third Node is more than 95% busy: + annotations: + description: 'Memory Postgres Third Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: Memory on Postgres Third Node is more than 95 percent busy + expr: container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node3"} / container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node3"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node3"} > -1 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} 
+ labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + There are long running queries on First Node: + annotations: + description: 'There are long running queries First Node. Execution time is more than {{ default 3600 .Values.prometheusRules.queryMaxTimeThreshold }} second(s) on {{ .Release.Namespace }}' + summary: There are long running queries on First Node + expr: ma_pg_metrics_query_max_time{namespace="{{ .Release.Namespace }}", pg_node="node1"} > {{ default 3600 .Values.prometheusRules.queryMaxTimeThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + There are long running queries on Fourth Node: + annotations: + description: 'There are long running queries Fourth Node. Execution time is more than {{ default 3600 .Values.prometheusRules.queryMaxTimeThreshold }} second(s) on {{ .Release.Namespace }}' + summary: There are long running queries on Fourth Node + expr: ma_pg_metrics_query_max_time{namespace="{{ .Release.Namespace }}", pg_node="node4"} > {{ default 3600 .Values.prometheusRules.queryMaxTimeThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + There are long running queries on Second Node: + annotations: + description: 'There are long running queries Second Node. Execution time is more than {{ default 3600 .Values.prometheusRules.queryMaxTimeThreshold }} second(s) on {{ .Release.Namespace }}' + summary: There are long running queries on Second Node + expr: ma_pg_metrics_query_max_time{namespace="{{ .Release.Namespace }}", pg_node="node2"} > {{ default 3600 .Values.prometheusRules.queryMaxTimeThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + There are long running queries on Third Node: + annotations: + description: 'There are long running queries Third Node. Execution time is more than {{ default 3600 .Values.prometheusRules.queryMaxTimeThreshold }} second(s) on {{ .Release.Namespace }}' + summary: There are long running queries on Third Node + expr: ma_pg_metrics_query_max_time{namespace="{{ .Release.Namespace }}", pg_node="node3"} > {{ default 3600 .Values.prometheusRules.queryMaxTimeThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + PostgreSql Large Object Size High: + annotations: + summary: PostgreSQL Large Object Size High + description: 'Large object total size has exceeded the warning threshold in namespace {{ .Release.Namespace }}.' 
+ expr: ma_pg_large_object_total_size_bytes{namespace="{{ .Release.Namespace }}"} > {{ default "104857600" .Values.prometheusRules.largeObjectSizeThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + CPU on Postgres First Node is more than 95% busy: + annotations: + description: 'CPU Postgres First Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: CPU on Postgres First Node is more than 95 percent busy + expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", container=~".*-node1"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_container=~".*-node1"}) > 0.95 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + CPU on Postgres Second Node is more than 95% busy: + annotations: + description: 'CPU Postgres Second Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: CPU on Postgres Second Node is more than 95 percent busy + expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", container=~".*-node2"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_container=~".*-node2"}) > 0.95 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + CPU on Postgres Third Node is more than 95% busy: + annotations: + description: 'CPU Postgres Third Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: CPU on Postgres Third Node is more than 95 percent busy + expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", container=~".*-node3"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_container=~".*-node3"}) > 0.95 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + CPU on Postgres Fourth Node is more than 95% busy: + annotations: + description: 'CPU Postgres Fourth Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: CPU on Postgres Fourth Node is more than 95 percent busy + expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", container=~".*-node4"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_container=~".*-node4"}) > 0.95 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Patroni Replica Is Lagging: + annotations: + description: Patroni Replica Is Lagging + summary: >- + "Patroni Replica \{\{ \$labels.hostname \}\} Is Lagging in \{\{ \$labels.namespace \}\} namespace" + expr: ma_pg_patroni_replication_lag{namespace="{{ .Release.Namespace }}"} > {{ default 33554432 .Values.prometheusRules.replicationLagValue }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + PostgreSQL Replica Is Lagging: + annotations: + description: PostgreSQL Replica Is Lagging + summary: >- + "PostgreSQL Replica 
\{\{ \$labels.hostname \}\} Is Lagging in \{\{ \$labels.namespace \}\} namespace" + expr: ma_pg_patroni_replication_state_sent_replay_lag{namespace="{{ .Release.Namespace }}"} > {{ default 33554432 .Values.prometheusRules.replicationLagValue }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Patroni Standby Leader Is Not Connected: + annotations: + description: Patroni Standby Leader Is Not Connected + summary: >- + "Patroni Standby Leader Is Not Connected" + expr: ma_pg_patroni_replication_state_sm_replication_state{namespace="{{ .Release.Namespace }}"} == 0 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Current overall connections exceed max_connection percentage: + annotations: + description: 'Current overall connections are above the max_connection percentage threshold on {{ .Release.Namespace }}.' + summary: Current overall connections exceed max_connection percentage + expr: (ma_pg_metrics_current_connections{namespace="{{ .Release.Namespace }}"}/ma_pg_metrics_postgres_max_connections{namespace="{{ .Release.Namespace }}"} * 100) > {{ default "90" .Values.prometheusRules.maxConnectionExceedPercentageThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: critical + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Current overall connections reached warning max_connection percentage: + annotations: + description: 'Current overall connections reached warning of the max_connection percentage threshold on {{ .Release.Namespace }}.' + summary: Current overall connections reached warning max_connection percentage + expr: (ma_pg_metrics_current_connections{namespace="{{ .Release.Namespace }}"}/ma_pg_metrics_postgres_max_connections{namespace="{{ .Release.Namespace }}"} * 100) > {{ default "80" .Values.prometheusRules.maxConnectionReachedPercentageThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Current connections exceed max_connection: + annotations: + description: 'Current connections are above the max_connection threshold on {{ .Release.Namespace }}.' + summary: Current connections exceed max_connection + expr: ma_pg_metrics_current_connections{namespace="{{ .Release.Namespace }}"} >= ma_pg_metrics_postgres_max_connections{namespace="{{ .Release.Namespace }}"} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: critical + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + DB Connection exceeding more than specified limit: + annotations: + description: 'DB Connections exceeding more than specified limit on {{ .Release.Namespace }}.' 
+ summary: DB Connection exceeding more than specified limit + expr: ma_pg_connection_by_database{namespace="{{ .Release.Namespace }}"} >= {{ default 250 .Values.databaseConnectionLimits }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Wait event warning threshold for SubtransBuffer or SubtransSLRU: + annotations: + description: 'Wait event SubtransBuffer or SubtransSLRU hit warning threshold {{ default 5 .Values.prometheusRules.warnWaitEventTreshold }} on {{ .Release.Namespace }}' + summary: Wait event warning threshold for SubtransBuffer or SubtransSLRU + expr: wait_event_metric{namespace="{{ .Release.Namespace }}", wait_event="SubtransBuffer"} > {{ default 5 .Values.prometheusRules.warnWaitEventTreshold }} or wait_event_metric{namespace="{{ .Release.Namespace }}", wait_event="SubtransSLRU"} > {{ default 5 .Values.prometheusRules.warnWaitEventTreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Wait event critical threshold for SubtransBuffer or SubtransSLRU: + annotations: + description: 'Wait event SubtransBuffer or SubtransSLRU hit critical threshold {{ default 20 .Values.prometheusRules.critWaitEventTreshold }} on {{ .Release.Namespace }}' + summary: Wait event critical threshold for SubtransBuffer or SubtransSLRU + expr: wait_event_metric{namespace="{{ .Release.Namespace }}", wait_event="SubtransBuffer"} > {{ default 20 .Values.prometheusRules.critWaitEventTreshold }} or wait_event_metric{namespace="{{ .Release.Namespace }}", wait_event="SubtransSLRU"} > {{ default 20 .Values.prometheusRules.critWaitEventTreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: critical + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Pct used warning threshold: + annotations: + description: 'Pct used hit warning threshold {{ default 50 .Values.prometheusRules.warnpctUsedThreshold }} on {{ .Release.Namespace }}' + summary: Pct used warning threshold + expr: pct_used_metric{namespace="{{ .Release.Namespace }}"} > {{ default 50 .Values.prometheusRules.warnpctUsedThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + {{if .Values.prometheusRules.siteManagerAlertsInstall }} + Standby cluster is not streaming: + annotations: + description: 'Standby cluster is not streaming from {{ .Release.Namespace }}' + summary: Standby cluster is not streaming + expr: ma_pg_standby_leader_count{namespace="{{ .Release.Namespace }}"} < 1 + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Standby cluster has big lag in bytes: + annotations: + description: 'Standby cluster has big lag in bytes for {{ .Release.Namespace }}' + summary: Standby cluster has big lag in bytes + expr: ma_pg_standby_replication_lag_in_bytes{namespace="{{ .Release.Namespace }}"} > {{ default 1073741824 .Values.prometheusRules.standbyLagInBytesThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + Standby cluster has big lag in milliseconds: + annotations: + description: 'Standby cluster has big lag in 
milliseconds for {{ .Release.Namespace }}' + summary: Standby cluster has big lag in milliseconds + expr: ma_pg_standby_replication_lag_in_ms{namespace="{{ .Release.Namespace }}"} > {{ default 600000 .Values.prometheusRules.standbyLagInMsThreshold }} + for: {{ default "3m" .Values.prometheusRules.alertDelay }} + labels: + severity: warning + namespace: {{ .Release.Namespace }} + service: {{ .Release.Name }} + {{- end }} + {{- end }} + + diff --git a/charts/patroni-services/charts/metricCollector/templates/prometheus-rule.yml b/charts/patroni-services/charts/metricCollector/templates/prometheus-rule.yml new file mode 100644 index 00000000..60d1e2c5 --- /dev/null +++ b/charts/patroni-services/charts/metricCollector/templates/prometheus-rule.yml @@ -0,0 +1,59 @@ +{{- if and (eq .Values.alertsPackVersion "v2") .Values.prometheusMonitoring }} +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: prometheusrules +spec: + groups: + +{{- $defaultConfig := fromYaml (include "defaultAlerts" . ) -}} +{{- $overrideConfig := .Values.alerts -}} +{{- $finalConfig := merge $overrideConfig $defaultConfig -}} +{{- $alertGroups := .Values.ruleGroups -}} + +{{- range $defaultGroupName, $defaultGroup := $finalConfig }} +{{- $found := true }} +{{- if $alertGroups }} +{{- $found = false }} +{{- range $alertGroups }} + {{- if eq $defaultGroupName . }} + {{- $found = true }} + {{- end }} +{{- end }} +{{- else }} + {{- $found = true }} +{{- end }} + +{{- if $found }} + - name: {{ $defaultGroupName }} + {{- if $defaultGroup.labels }} + labels: + {{- range $defaultLabelName, $defaultLabelValue := $defaultGroup.labels }} + {{ $defaultLabelName }}: {{ $defaultLabelValue }} + {{- end }} + {{- end }} + {{- if $defaultGroup.interval }} + interval: {{ $defaultGroup.interval }} + {{- end }} + {{- if $defaultGroup.concurrency }} + concurrency: {{ $defaultGroup.concurrency }} + {{- end }} + rules: +{{- range $defaultRuleName, $defaultRule := $defaultGroup.rules }} + - alert: {{ $defaultRuleName }} + expr: {{ $defaultRule.expr }} + {{- if $defaultRule.for }} + for: {{ $defaultRule.for }} + {{- end }} + labels: +{{- range $defaultLabelName, $defaultLabelValue := $defaultRule.labels }} + {{ $defaultLabelName }}: {{ $defaultLabelValue }} +{{- end }} + annotations: +{{- range $defaultAnnotationName, $defaultAnnotationValue := $defaultRule.annotations }} + {{ $defaultAnnotationName }}: {{ printf $defaultAnnotationValue | trimAll "\n" | toJson | replace "\\u0026" "&" | replace "\\u003e" ">" | nindent 14 }} +{{- end }} +{{- end }} +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/patroni-services/charts/metricCollector/values.yaml b/charts/patroni-services/charts/metricCollector/values.yaml new file mode 100644 index 00000000..e69de29b diff --git a/charts/patroni-services/templates/monitoring-templates/prometheus-rule.yml b/charts/patroni-services/templates/monitoring-templates/prometheus-rule.yml index 1d0e4821..8e7674de 100644 --- a/charts/patroni-services/templates/monitoring-templates/prometheus-rule.yml +++ b/charts/patroni-services/templates/monitoring-templates/prometheus-rule.yml @@ -1,4 +1,4 @@ -{{- if and (eq (include "monitoring.install" .) "true") (.Values.metricCollector.prometheusMonitoring) }} +{{- if and (eq (include "monitoring.install" .)
"true") (.Values.metricCollector.prometheusMonitoring) (ne (.Values.metricCollector.alertsPackVersion) "v2") }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -19,7 +19,7 @@ spec: expr: absent(ma_pg_patroni_cluster_status{namespace="{{ .Release.Namespace }}"}) == 1 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: PostgreSQL is Down @@ -29,7 +29,7 @@ spec: expr: ma_pg_patroni_cluster_status{namespace="{{ .Release.Namespace }}"} == 10 or ma_pg_patroni_cluster_status{namespace="{{ .Release.Namespace }}"} < 0 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: disaster + severity: critical namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: PostgreSQL is Degraded @@ -39,24 +39,24 @@ spec: expr: ma_pg_patroni_cluster_status{namespace="{{ .Release.Namespace }}"} == 6 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} {{- if .Values.backupDaemon.install }} - - alert: Space for Postgres backup is less than acceptable threshold + - alert: Space for Postgres backup is less than acceptable critical threshold annotations: - description: 'Backup space is less than {{ default 5 .Values.metricCollector.prometheusRules.backupAlertThreshold }} % free on {{ .Release.Namespace }}' - summary: Space for Postgres backup is less than acceptable threshold + description: 'Backup space is less than {{ default 5 .Values.metricCollector.prometheusRules.backupAlertThreshold }} percent free on {{ .Release.Namespace }}' + summary: Space for Postgres backup is less than acceptable critical threshold expr: ma_storage_free_space{namespace="{{ default "common" .Release.Namespace }}", service_name='postgres-backup-daemon'} / ma_storage_total_space{namespace="{{ .Release.Namespace }}", service_name='postgres-backup-daemon'} < {{ default 5 .Values.metricCollector.prometheusRules.backupAlertThreshold }}*0.01 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: critical namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - - alert: Space for Postgres backup is less than acceptable threshold + - alert: Space for Postgres backup is less than acceptable warning threshold annotations: - description: 'Backup space is less than {{ default 20 .Values.metricCollector.prometheusRules.backupWarningThreshold }} % free on {{ .Release.Namespace }}' - summary: Space for Postgres backup is less than acceptable threshold + description: 'Backup space is less than {{ default 20 .Values.metricCollector.prometheusRules.backupWarningThreshold }} percent free on {{ .Release.Namespace }}' + summary: Space for Postgres backup is less than acceptable warning threshold expr: ma_storage_free_space{namespace="{{ .Release.Namespace }}", service_name='postgres-backup-daemon'} / ma_storage_total_space{namespace="{{ .Release.Namespace }}", service_name='postgres-backup-daemon'} < {{ default 20 .Values.metricCollector.prometheusRules.backupWarningThreshold }}*0.01 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: @@ -70,7 +70,7 @@ spec: expr: ma_storage_last_failed{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"} > 0 for: {{ default "3m" 
.Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Last Successful Postgres backup is too old @@ -80,7 +80,7 @@ spec: expr: time() - (ma_storage_lastSuccessful_metrics_end_backup_timestamp{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"}/1000) > {{ default 86400 .Values.metricCollector.prometheusRules.maxLastBackupAge }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: PostgreSQL backup agent has problem @@ -90,7 +90,7 @@ spec: expr: ma_status{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"} == 6 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Unable to collect metrics from PostgreSQL backup agent @@ -100,7 +100,7 @@ spec: expr: ma_status{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"} < 0 or absent(ma_status{namespace="{{ .Release.Namespace }}", service_name="postgres-backup-daemon"}) == 1 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} {{- end }} @@ -151,7 +151,7 @@ spec: expr: ma_pg_patroni_metrics_df_pcent{namespace="{{ .Release.Namespace }}", pg_node="node1"} > 90 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Postgres Fourth Node Disk is almost full @@ -161,7 +161,7 @@ spec: expr: ma_pg_patroni_metrics_df_pcent{namespace="{{ .Release.Namespace }}", pg_node="node4"} > 90 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Postgre Second Node Disk is almost full @@ -171,7 +171,7 @@ spec: expr: ma_pg_patroni_metrics_df_pcent{namespace="{{ .Release.Namespace }}", pg_node="node2"} > 90 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Postgres Third Node Disk is almost full @@ -181,7 +181,7 @@ spec: expr: ma_pg_patroni_metrics_df_pcent{namespace="{{ .Release.Namespace }}", pg_node="node3"} > 90 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Locks on First Node more then acceptable threshold @@ -191,7 +191,7 @@ spec: expr: ma_pg_metrics_locks{namespace="{{ .Release.Namespace }}", pg_node="node1"} > {{ default 500 .Values.metricCollector.prometheusRules.locksThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Locks on Fourth Node more then acceptable threshold @@ -201,7 +201,7 @@ spec: expr: ma_pg_metrics_locks{namespace="{{ .Release.Namespace }}", pg_node="node4"} > {{ default 500 .Values.metricCollector.prometheusRules.locksThreshold 
}} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Locks on Second Node more then acceptable threshold @@ -211,7 +211,7 @@ spec: expr: ma_pg_metrics_locks{namespace="{{ .Release.Namespace }}", pg_node="node2"} > {{ default 500 .Values.metricCollector.prometheusRules.locksThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Locks on Third Node more then acceptable threshold @@ -221,47 +221,47 @@ spec: expr: ma_pg_metrics_locks{namespace="{{ .Release.Namespace }}", pg_node="node3"} > {{ default 500 .Values.metricCollector.prometheusRules.locksThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Memory on Postgres First Node is more than 95% busy annotations: - description: 'Memory Postgres First Node is more than 95% busy on {{ .Release.Namespace }}' - summary: Memory on Postgres First Node is more than 95% busy + description: 'Memory Postgres First Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: Memory on Postgres First Node is more than 95 percent busy expr: container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node1"} / container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node1"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node1"} > -1 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Memory on Postgres Fourth Node is more than 95% busy annotations: - description: 'Memory Postgres Fourth Node is more than 95% busy on {{ .Release.Namespace }}' - summary: Memory on Postgres Fourth Node is more than 95% busy + description: 'Memory Postgres Fourth Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: Memory on Postgres Fourth Node is more than 95 percent busy expr: container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node4"} / container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node4"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node4"} > -1 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Memory on Postgres Second Node is more than 95% busy annotations: - description: 'Memory Postgres Second Node is more than 95% busy on {{ .Release.Namespace }}' - summary: Memory on Postgres Second Node is more than 95% busy + description: 'Memory Postgres Second Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: Memory on Postgres Second Node is more than 95 percent busy expr: container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node2"} / container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node2"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace 
}}", container=~".*-node2"} > -1 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Memory on Postgres Third Node is more than 95% busy annotations: - description: 'Memory Postgres Third Node is more than 95% busy on {{ .Release.Namespace }}' - summary: Memory on Postgres Third Node is more than 95% busy + description: 'Memory Postgres Third Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: Memory on Postgres Third Node is more than 95 percent busy expr: container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node3"} / container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node3"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="{{ .Release.Namespace }}", container=~".*-node3"} > -1 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: There are long running queries on First Node @@ -271,7 +271,7 @@ spec: expr: ma_pg_metrics_query_max_time{namespace="{{ .Release.Namespace }}", pg_node="node1"} > {{ default 3600 .Values.metricCollector.prometheusRules.queryMaxTimeThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: There are long running queries on Fourth Node @@ -281,7 +281,7 @@ spec: expr: ma_pg_metrics_query_max_time{namespace="{{ .Release.Namespace }}", pg_node="node4"} > {{ default 3600 .Values.metricCollector.prometheusRules.queryMaxTimeThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: There are long running queries on Second Node @@ -291,67 +291,67 @@ spec: expr: ma_pg_metrics_query_max_time{namespace="{{ .Release.Namespace }}", pg_node="node2"} > {{ default 3600 .Values.metricCollector.prometheusRules.queryMaxTimeThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - - alert: PostgreSql Large Object Size High + - alert: There are long running queries on Third Node annotations: - summary: PostgreSQL Large Object Size High - description: 'Large object total size has exceeded the warning threshold in namespace {{ .Release.Namespace }}.' - expr: ma_pg_large_object_total_size_bytes{namespace="{{ .Release.Namespace }}"} > {{ default "104857600" .Values.metricCollector.prometheusRules.largeObjectSizeThreshold }} + description: 'There are long running queries Third Node. 
Execution time is more than {{ default 3600 .Values.metricCollector.prometheusRules.queryMaxTimeThreshold }} second(s) on {{ .Release.Namespace }}' + summary: There are long running queries on Third Node + expr: ma_pg_metrics_query_max_time{namespace="{{ .Release.Namespace }}", pg_node="node3"} > {{ default 3600 .Values.metricCollector.prometheusRules.queryMaxTimeThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - - alert: There are long running queries on Third Node + - alert: PostgreSql Large Object Size High annotations: - description: 'There are long running queries Third Node. Execution time is more than {{ default 3600 .Values.metricCollector.prometheusRules.queryMaxTimeThreshold }} second(s) on {{ .Release.Namespace }}' - summary: There are long running queries on Third Node - expr: ma_pg_metrics_query_max_time{namespace="{{ .Release.Namespace }}", pg_node="node3"} > {{ default 3600 .Values.metricCollector.prometheusRules.queryMaxTimeThreshold }} + summary: PostgreSQL Large Object Size High + description: 'Large object total size has exceeded the warning threshold in namespace {{ .Release.Namespace }}.' + expr: ma_pg_large_object_total_size_bytes{namespace="{{ .Release.Namespace }}"} > {{ default "104857600" .Values.metricCollector.prometheusRules.largeObjectSizeThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: CPU on Postgres First Node is more than 95% busy annotations: - description: 'CPU Postgres First Node is more than 95% busy on {{ .Release.Namespace }}' - summary: CPU on Postgres First Node is more than 95% busy + description: 'CPU Postgres First Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: CPU on Postgres First Node is more than 95 percent busy expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", container=~".*-node1"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_container=~".*-node1"}) > 0.95 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: CPU on Postgres Second Node is more than 95% busy annotations: - description: 'CPU Postgres Second Node is more than 95% busy on {{ .Release.Namespace }}' - summary: CPU on Postgres Second Node is more than 95% busy + description: 'CPU Postgres Second Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: CPU on Postgres Second Node is more than 95 percent busy expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", container=~".*-node2"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_container=~".*-node2"}) > 0.95 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: CPU on Postgres Third Node is more than 95% busy annotations: - description: 'CPU Postgres Third Node is more than 95% busy on {{ .Release.Namespace }}' - summary: CPU on Postgres Third Node is more than 95% busy + description: 'CPU 
Postgres Third Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: CPU on Postgres Third Node is more than 95 percent busy expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", container=~".*-node3"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_container=~".*-node3"}) > 0.95 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: CPU on Postgres Fourth Node is more than 95% busy annotations: - description: 'CPU Postgres Fourth Node is more than 95% busy on {{ .Release.Namespace }}' - summary: CPU on Postgres Fourth Node is more than 95% busy + description: 'CPU Postgres Fourth Node is more than 95 percent busy on {{ .Release.Namespace }}' + summary: CPU on Postgres Fourth Node is more than 95 percent busy expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", container=~".*-node4"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_container=~".*-node4"}) > 0.95 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Patroni Replica Is Lagging @@ -362,7 +362,7 @@ spec: expr: ma_pg_patroni_replication_lag{namespace="{{ .Release.Namespace }}"} > {{ default 33554432 .Values.metricCollector.prometheusRules.replicationLagValue }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: PostgreSQL Replica Is Lagging @@ -373,7 +373,7 @@ spec: expr: ma_pg_patroni_replication_state_sent_replay_lag{namespace="{{ .Release.Namespace }}"} > {{ default 33554432 .Values.metricCollector.prometheusRules.replicationLagValue }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Patroni Standby Leader Is Not Connected @@ -384,7 +384,7 @@ spec: expr: ma_pg_patroni_replication_state_sm_replication_state{namespace="{{ .Release.Namespace }}"} == 0 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Current overall connections exceed max_connection percentage @@ -394,17 +394,17 @@ spec: expr: (ma_pg_metrics_current_connections{namespace="{{ .Release.Namespace }}"}/ma_pg_metrics_postgres_max_connections{namespace="{{ .Release.Namespace }}"} * 100) > {{ default "90" .Values.metricCollector.prometheusRules.maxConnectionExceedPercentageThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: critical namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - - alert: Current overall connections reached average max_connection percentage + - alert: Current overall connections reached warning max_connection percentage annotations: - description: 'Current overall connections reached average of the max_connection percentage threshold on {{ .Release.Namespace }}.' 
- summary: Current overall connections reached average max_connection percentage + description: 'Current overall connections reached warning of the max_connection percentage threshold on {{ .Release.Namespace }}.' + summary: Current overall connections reached warning max_connection percentage expr: (ma_pg_metrics_current_connections{namespace="{{ .Release.Namespace }}"}/ma_pg_metrics_postgres_max_connections{namespace="{{ .Release.Namespace }}"} * 100) > {{ default "80" .Values.metricCollector.prometheusRules.maxConnectionReachedPercentageThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Current connections exceed max_connection @@ -414,7 +414,7 @@ spec: expr: ma_pg_metrics_current_connections{namespace="{{ .Release.Namespace }}"} >= ma_pg_metrics_postgres_max_connections{namespace="{{ .Release.Namespace }}"} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: critical namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: DB Connection exceeding more than specified limit @@ -424,42 +424,9 @@ spec: expr: ma_pg_connection_by_database{namespace="{{ .Release.Namespace }}"} >= {{ default 250 .Values.metricCollector.databaseConnectionLimits }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high - namespace: {{ .Release.Namespace }} - service: {{ .Release.Name }} -{{- if .Values.postgresExporter.install }} - - alert: PostgreSQL the percentage of transaction ID space used out - annotations: - description: The percentage of transaction ID space used out - summary: >- - "TXID of \{\{ \$labels.hostname \}\} is used out in \{\{ \$labels.namespace \}\} namespace" - expr: pg_txid_wraparound_percent_towards_wraparound{namespace="{{ .Release.Namespace }}"} > 75 - for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} - labels: - severity: high - namespace: {{ .Release.Namespace }} - service: {{ .Release.Name }} - - alert: PostgreSQL replication slot wal size is too high - annotations: - description: The replication slot wal size is more than allowed threshold in MB - summary: The replication slot wal size is more than allowed threshold in MB - expr: pg_replicaiton_slots_monitoring_retained_wal{namespace="{{ .Release.Namespace }}"} / 1024 / 1024 > {{ default "1024" .Values.postgresExporter.prometheusRules.maxReplicationSlotWalSizeThreshold }} - for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} - labels: - severity: high - namespace: {{ .Release.Namespace }} - service: {{ .Release.Name }} - - alert: PostgreSQL replication slot is not active for a long time - annotations: - description: The replication slot is not active for a long time - summary: The replication slot is not active for a long time - expr: pg_replicaiton_slots_monitoring_retained_wal{namespace="{{ .Release.Namespace }}", active="false"} > -1 - for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} - labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} -{{- end }} - alert: Wait event warning threshold for SubtransBuffer or SubtransSLRU annotations: description: 'Wait event SubtransBuffer or SubtransSLRU hit warning threshold {{ default 5 .Values.metricCollector.prometheusRules.warnWaitEventTreshold }} on {{ .Release.Namespace }}' @@ -467,7 
+434,7 @@ spec: expr: wait_event_metric{namespace="{{ .Release.Namespace }}", wait_event="SubtransBuffer"} > {{ default 5 .Values.metricCollector.prometheusRules.warnWaitEventTreshold }} or wait_event_metric{namespace="{{ .Release.Namespace }}", wait_event="SubtransSLRU"} > {{ default 5 .Values.metricCollector.prometheusRules.warnWaitEventTreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Wait event critical threshold for SubtransBuffer or SubtransSLRU @@ -477,7 +444,7 @@ spec: expr: wait_event_metric{namespace="{{ .Release.Namespace }}", wait_event="SubtransBuffer"} > {{ default 20 .Values.metricCollector.prometheusRules.critWaitEventTreshold }} or wait_event_metric{namespace="{{ .Release.Namespace }}", wait_event="SubtransSLRU"} > {{ default 20 .Values.metricCollector.prometheusRules.critWaitEventTreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: critical namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Pct used warning threshold @@ -487,7 +454,7 @@ spec: expr: pct_used_metric{namespace="{{ .Release.Namespace }}"} > {{ default 50 .Values.metricCollector.prometheusRules.warnpctUsedThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: average + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} {{if .Values.siteManager.install }} @@ -498,7 +465,7 @@ spec: expr: ma_pg_standby_leader_count{namespace="{{ .Release.Namespace }}"} < 1 for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Standby cluster has big lag in bytes @@ -508,7 +475,7 @@ spec: expr: ma_pg_standby_replication_lag_in_bytes{namespace="{{ .Release.Namespace }}"} > {{ default 1073741824 .Values.metricCollector.prometheusRules.standbyLagInBytesThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: Standby cluster has big lag in milliseconds @@ -518,7 +485,7 @@ spec: expr: ma_pg_standby_replication_lag_in_ms{namespace="{{ .Release.Namespace }}"} > {{ default 600000 .Values.metricCollector.prometheusRules.standbyLagInMsThreshold }} for: {{ default "3m" .Values.metricCollector.prometheusRules.alertDelay }} labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} {{- end }} diff --git a/charts/patroni-services/values.yaml b/charts/patroni-services/values.yaml index 9822d16c..fcb4a0df 100644 --- a/charts/patroni-services/values.yaml +++ b/charts/patroni-services/values.yaml @@ -132,7 +132,15 @@ metricCollector: # # The name of the InfluxDB database to store the metrics. This should start with the cloud_ prefix. 
# database: test" # Configuration for prometheus alerts + + # This variable switches from the old-style alerts described in monitoring/templates/prometheus-rule, which allow only a small number of overrides, to the new approach where alerts are deployed as the Helm subchart charts/metricCollector + # and can be fully overridden/added/modified in the subchart's values.yaml. + # Any value other than v2 keeps the old-style alerts. + + alertsPackVersion: "v1" prometheusRules: + backupAlertsInstall: true + siteManagerAlertsInstall: true # Threshold for backup storage size for alerting backupAlertThreshold: 5 # Threshold for backup storage size for warning @@ -166,7 +174,7 @@ metricCollector: # Profile for the metrics collection. The possible values are prod and dev. The default value is prod. # For dev profile, additional performance metrics (queries stat, tables stat) will be collected. metricsProfile: prod - prometheusMonitoring: false + prometheusMonitoring: true applyGrafanaDashboard: false # Field for priority of the pod # priorityClassName: "high-priority" diff --git a/docs/public/installation.md b/docs/public/installation.md index 97aa5a94..7246c152 100644 --- a/docs/public/installation.md +++ b/docs/public/installation.md @@ -437,6 +437,7 @@ This sections describes all possible deploy parameters for PostgreSQL Metric Col | metricCollector.scrapeTimeout | int | no | 20 | Specifies timeout in seconds to wait metric be gathered. | | metricCollector.telegrafPluginTimeout | int | no | 60 | Specifies timeout in seconds to execute Telegraf's plugins. | | metricCollector.userPassword | yaml | no | p@ssWOrD1 | Specifies the password for metric collector user. | +| metricCollector.alertsPackVersion | string | no | v1 | Defines the alerts installation method. Any value other than "v2", or an unset parameter, installs the default alerts from the template; if set to "v2", the alerts are installed from the Helm subchart (charts/patroni-services/charts/metricCollector) and can be overridden in the subchart's values file. | metricCollector.ocExecTimeout | int | no | 10 | Specifies timeout in seconds to execute `exec` commands. | | metricCollector.devMetricsInterval | int | no | 10 | Specifies interval in minutes to execute Telegraf's plugins for additional metrics. | | metricCollector.devMetricsTimeout | int | no | 10 | Timeout in minutes to execute command for additional metrics.
| diff --git a/tests/alerts-tests/rules.yaml b/tests/alerts-tests/rules.yaml new file mode 100644 index 00000000..ee7b857e --- /dev/null +++ b/tests/alerts-tests/rules.yaml @@ -0,0 +1,998 @@ +--- +# Source: patroni-services/templates/service_account.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: postgres-sa + labels: + name: postgres-sa + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" +--- +# Source: patroni-services/templates/secrets/influx-db-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + labels: + app: monitoring-collector + name: monitoring-collector + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" + name: influx-db-admin-credentials +data: + + password: "" + username: "" + +type: Opaque +--- +# Source: patroni-services/templates/secrets/monitoring-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + labels: + app: monitoring-collector + name: monitoring-collector + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" + name: monitoring-credentials +data: + username: bW9uaXRvcmluZy11c2Vy + password: cEBzc1dPckQx +type: Opaque +--- +# Source: patroni-services/templates/tests/tests-config.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: supplementary-tests-config + labels: + app: patroni-tests +data: + dd_images: "" +--- +# Source: patroni-services/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + creationTimestamp: null + name: postgres-operator + labels: + name: postgres-operator + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" +rules: +- apiGroups: + - "" + resources: + - pods + - services + - persistentvolumeclaims + - configmaps + - secrets + - serviceaccounts + verbs: + - create + - get + - list + - patch + - update + - watch + - delete +- apiGroups: [""] + resources: ["pods/exec"] + verbs: + - get + - list + - patch + - update + - watch + - delete + - create +- apiGroups: [""] + resources: ["pods", "pods/log"] + verbs: ["get", "list"] +- apiGroups: + - apps + resources: + - deployments + - deployments/scale + - replicasets + - statefulsets + - statefulsets/scale + + verbs: + - create + - get + - list + - patch + - update + - watch + - delete +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - get + - list + - patch + - update + - watch + - delete +- apiGroups: + - netcracker.com + resources: + - '*' + verbs: + - create + - get + - list + - patch + - update + - watch + - delete +--- +# Source: patroni-services/templates/role_binding.yaml +kind: RoleBinding +apiVersion: 
rbac.authorization.k8s.io/v1 +metadata: + name: postgres-operator + labels: + name: postgres-operator + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" +subjects: +- kind: ServiceAccount + name: postgres-sa +roleRef: + kind: Role + name: postgres-operator + apiGroup: rbac.authorization.k8s.io +--- +# Source: patroni-services/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres-operator +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + name: postgres-operator + + template: + metadata: + labels: + name: postgres-operator + + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" + + spec: + serviceAccountName: postgres-sa + + + containers: + - name: patroni-services + image: ghcr.io/netcracker/pgskipper-operator:main + imagePullPolicy: Always + + resources: + limits: + cpu: 50m + memory: 50Mi + requests: + cpu: 50m + memory: 50Mi + securityContext: + + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + + env: + - name: WATCH_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: OPERATOR_NAME + value: "patroni-services" + - name: RESOURCE_NAME + value: patroni-services + - name: PATRONI_CLUSTER_NAME + value: patroni + - name: WAIT_TIMEOUT + value: "10" + - name: PG_RECONCILE_RETRIES + value: "3" + - name: HOST_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: PG_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-credentials + key: password + - name: PG_ADMIN_USER + valueFrom: + secretKeyRef: + name: postgres-credentials + key: username + - name: PG_REPLICATOR_PASSWORD + valueFrom: + secretKeyRef: + name: replicator-credentials + key: password + - name: GLOBAL_SECURITY_CONTEXT + value: "true" + - name: CLOUD_PUBLIC_HOST + value: k8s.default + + - name: INTERNAL_TLS_ENABLED + value: "false" + livenessProbe: + httpGet: + path: /healthz + port: 8081 + scheme: HTTP + initialDelaySeconds: 10 + timeoutSeconds: 5 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + scheme: HTTP + initialDelaySeconds: 10 + timeoutSeconds: 5 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + volumes: + + tolerations: + securityContext: + + runAsNonRoot: true + seccompProfile: + type: "RuntimeDefault" +--- +# Source: patroni-services/templates/cr.yaml +apiVersion: netcracker.com/v1 +kind: PatroniServices +metadata: + name: patroni-services + labels: + name: patroni-services + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" +spec: + installationTimestamp: "1762865184" + + serviceAccountName: postgres-sa + 
+ + tls: + enabled: false + certificateSecretName: pg-cert-services + + patroni: + clusterName: patroni + + backupDaemon: + + podLabels: + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" + image: ghcr.io/netcracker/pgskipper-backup-daemon:main + compressionLevel: 5 + walArchiving: false + granularEviction: '3600' + encryption: false + retainArchiveSettings: false + backupTimeout: 300 + allowPrefix: false + useEvictionPolicyFirst: "false" + evictionBinaryPolicy: 7d/delete + archiveEvictionPolicy: "7d" + jobFlag: '1' + connectTimeout: '5' + backupSchedule: 0 0/7 * * * + evictionPolicy: 7d/delete + pgHost: pg-patroni.default + securityContext: + + runAsNonRoot: true + seccompProfile: + type: "RuntimeDefault" + resources: + limits: + cpu: 450m + memory: 768Mi + requests: + cpu: 100m + memory: 256Mi + storage: + type: provisioned + size: 1Gi + + + + + + + + sslMode: "prefer" + + + + + + + + vaultRegistration: + dockerImage: banzaicloud/vault-env:1.5.0 + enabled: false + path: default + dbEngine: + enabled: false + name: k8s.default_default_postgresql + maxOpenConnections: 5 + maxIdleConnections: 5 + maxConnectionLifetime: 5s + + + + + metricCollector: + + podLabels: + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" + image: ghcr.io/netcracker/pgskipper-monitoring-agent:main + + resources: + limits: + cpu: 300m + memory: 256Mi + requests: + cpu: 150m + memory: 170Mi + + + + + sslMode: "prefer" + + + metricsProfile: dev + devMetricsTimeout: 10 + devMetricsInterval: 10 + collectionInterval: 60 + telegrafPluginTimeout: 60 + ocExecTimeout: 10 + + securityContext: + + runAsNonRoot: true + seccompProfile: + type: "RuntimeDefault" + + + tracing: + enabled: false + host: jaeger-collector.tracing.svc:4317 + + + + + integrationTests: + podLabels: + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "postgres-operator" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" + image: ghcr.io/netcracker/pgskipper-operator-tests:main + + resources: + limits: + cpu: 300m + memory: 512Mi + requests: + cpu: 150m + memory: 256Mi + + runTestScenarios: basic + pgNodeQty: 1 +--- +# Source: patroni-services/templates/monitoring-templates/postgres-tls-status-metric.yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + name: postgres-tls-status-static-metric + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "monitoring" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" + name: postgres-tls-status-static-metric +spec: + groups: + - name: postgres-tls-status-static-metric-group + rules: + - expr: '0' + labels: + application: patroni-services + namespace: default + service: postgres + record: service:tls_status:info 
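The rendered PrometheusRule below is produced from the chart templates, and its `for:` durations and numeric thresholds come from the `metricCollector.prometheusRules` values referenced in the template diff above. A minimal override sketch, assuming the value paths visible in that diff (the override numbers themselves are illustrative, not recommendations):

```yaml
# Sketch only: parent-chart values overriding the thresholds referenced in the
# prometheus-rule template diff above. The value paths are taken from that diff;
# the numbers are illustrative assumptions.
metricCollector:
  databaseConnectionLimits: 300                   # default 250 (per-database connection alert)
  prometheusRules:
    alertDelay: "5m"                              # default "3m"; used as the rules' `for:` duration
    maxConnectionReachedPercentageThreshold: 85   # default 80 (connection warning alert)
    warnWaitEventTreshold: 10                     # default 5 (SubtransBuffer/SubtransSLRU warning)
    critWaitEventTreshold: 40                     # default 20 (SubtransBuffer/SubtransSLRU critical)
    warnpctUsedThreshold: 60                      # default 50 (Pct used warning alert)
```

Note that the committed rules.yaml was rendered with the defaults, so changing these values implies re-rendering rules.yaml and keeping the expectations in test.yaml in sync.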
+--- +# Source: patroni-services/templates/monitoring-templates/prometheus-rule.yml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + name: prometheus-postgres-service-rules + + app.kubernetes.io/instance: pgskipper-montemplate + app.kubernetes.io/name: patroni-services + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/component: "monitoring" + app.kubernetes.io/part-of: "postgres-operator" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/technology: "go" + prometheus: postgres-service-metric-collector + role: alert-rules + name: prometheus-postgres-service-rules +spec: + groups: + - name: default-pgskipper-montemplate + rules: + - alert: PostgreSQL metrics are absent + annotations: + description: 'PostgreSQL metrics are absent on default.' + summary: PostgreSQL metrics are absent + expr: absent(ma_pg_patroni_cluster_status{namespace="default"}) == 1 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: PostgreSQL is Down + annotations: + description: 'PostgreSQL is Down on default.' + summary: PostgreSQL is Down + expr: ma_pg_patroni_cluster_status{namespace="default"} == 10 or ma_pg_patroni_cluster_status{namespace="default"} < 0 + for: 3m + labels: + severity: critical + namespace: default + service: pgskipper-montemplate + - alert: PostgreSQL is Degraded + annotations: + description: 'PostgreSQL is Degraded on default.' + summary: PostgreSQL is Degraded + expr: ma_pg_patroni_cluster_status{namespace="default"} == 6 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Space for Postgres backup is less than acceptable critical threshold + annotations: + description: 'Backup space is less than 5 % free on default' + summary: Space for Postgres backup is less than acceptable critical threshold + expr: ma_storage_free_space{namespace="default", service_name='postgres-backup-daemon'} / ma_storage_total_space{namespace="default", service_name='postgres-backup-daemon'} < 5*0.01 + for: 3m + labels: + severity: critical + namespace: default + service: pgskipper-montemplate + - alert: Space for Postgres backup is less than acceptable warning threshold + annotations: + description: 'Backup space is less than 20 % free on default' + summary: Space for Postgres backup is less than acceptable warning threshold + expr: ma_storage_free_space{namespace="default", service_name='postgres-backup-daemon'} / ma_storage_total_space{namespace="default", service_name='postgres-backup-daemon'} < 20*0.01 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Last Postgres backup is failed + annotations: + description: 'Last backup is failed on default' + summary: Targets are down + expr: ma_storage_last_failed{namespace="default", service_name="postgres-backup-daemon"} > 0 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Last Successful Postgres backup is too old + annotations: + description: 'Last Successful backup is too old on default' + summary: Last Successful Postgres backup is too old + expr: time() - (ma_storage_lastSuccessful_metrics_end_backup_timestamp{namespace="default", service_name="postgres-backup-daemon"}/1000) > 86400 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: PostgreSQL backup agent has problem + annotations: + description: 'PostgreSQL backup agent has problem on default' + 
summary: PostgreSQL backup agent has problem + expr: ma_status{namespace="default", service_name="postgres-backup-daemon"} == 6 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Unable to collect metrics from PostgreSQL backup agent + annotations: + description: 'Unable to collect metrics from PostgreSQL backup agent on default' + summary: Unable to collect metrics from PostgreSQL backup agent + expr: ma_status{namespace="default", service_name="postgres-backup-daemon"} < 0 or absent(ma_status{namespace="default", service_name="postgres-backup-daemon"}) == 1 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Patroni first Node is not running + annotations: + description: 'Patroni status First Node is not running on default' + summary: Patroni first Node is not running + expr: ma_pg_patroni_patroni_status{namespace="default", pg_node="node1"} == 0 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Patroni fourth Node is not running + annotations: + description: 'Patroni status Fourth Node is not running on default' + summary: Patroni fourth Node is not running + expr: ma_pg_patroni_patroni_status{namespace="default", pg_node="node4"} == 0 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Patroni second Node is not running + annotations: + description: 'Patroni status Second Node is not running on default' + summary: Patroni second Node is not running + expr: ma_pg_patroni_patroni_status{namespace="default", pg_node="node2"} == 0 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Patroni third Node is not running + annotations: + description: 'Patroni status Third Node is not running on default' + summary: Patroni third Node is not running + expr: ma_pg_patroni_patroni_status{namespace="default", pg_node="node3"} == 0 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Postgres First Node Disk is almost full + annotations: + description: 'Disk space Postgres First Node Disk is almost full on default' + summary: Postgres First Node Disk is almost full + expr: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node1"} > 90 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Postgres Fourth Node Disk is almost full + annotations: + description: 'Disk space Postgres Fourth Node Disk is almost full on default' + summary: Postgres Fourth Node Disk is almost full + expr: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node4"} > 90 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Postgre Second Node Disk is almost full + annotations: + description: 'Disk space Postgre Second Node Disk is almost full on default' + summary: Postgre Second Node Disk is almost full + expr: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node2"} > 90 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Postgres Third Node Disk is almost full + annotations: + description: 'Disk space Postgres Third Node Disk is almost full on default' + summary: Postgres Third Node Disk is almost full + expr: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node3"} > 90 + for: 3m + labels: + severity: warning + 
namespace: default + service: pgskipper-montemplate + - alert: Locks on First Node more then acceptable threshold + annotations: + description: 'Locks on First Node more then 500 on default' + summary: Locks on First Node more then acceptable threshold + expr: ma_pg_metrics_locks{namespace="default", pg_node="node1"} > 500 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Locks on Fourth Node more then acceptable threshold + annotations: + description: 'Locks on Fourth Node more then 500 on default' + summary: Locks on Fourth Node more then acceptable threshold + expr: ma_pg_metrics_locks{namespace="default", pg_node="node4"} > 500 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Locks on Second Node more then acceptable threshold + annotations: + description: 'Locks on Second Node more then 500 on default' + summary: Locks on Second Node more then acceptable threshold + expr: ma_pg_metrics_locks{namespace="default", pg_node="node2"} > 500 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Locks on Third Node more then acceptable threshold + annotations: + description: 'Locks on Third Node more then 500 on default' + summary: Locks on Third Node more then acceptable threshold + expr: ma_pg_metrics_locks{namespace="default", pg_node="node3"} > 500 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Memory on Postgres First Node is more than 95% busy + annotations: + description: 'Memory Postgres First Node is more than 95% busy on default' + summary: Memory on Postgres First Node is more than 95% busy + expr: container_memory_working_set_bytes{namespace="default", container=~".*-node1"} / container_spec_memory_limit_bytes{namespace="default", container=~".*-node1"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="default", container=~".*-node1"} > -1 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Memory on Postgres Fourth Node is more than 95% busy + annotations: + description: 'Memory Postgres Fourth Node is more than 95% busy on default' + summary: Memory on Postgres Fourth Node is more than 95% busy + expr: container_memory_working_set_bytes{namespace="default", container=~".*-node4"} / container_spec_memory_limit_bytes{namespace="default", container=~".*-node4"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="default", container=~".*-node4"} > -1 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Memory on Postgres Second Node is more than 95% busy + annotations: + description: 'Memory Postgres Second Node is more than 95% busy on default' + summary: Memory on Postgres Second Node is more than 95% busy + expr: container_memory_working_set_bytes{namespace="default", container=~".*-node2"} / container_spec_memory_limit_bytes{namespace="default", container=~".*-node2"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="default", container=~".*-node2"} > -1 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Memory on Postgres Third Node is more than 95% busy + annotations: + description: 'Memory Postgres Third Node is more than 95% busy on default' + summary: Memory on Postgres Third Node is more than 95% busy + expr: container_memory_working_set_bytes{namespace="default", 
container=~".*-node3"} / container_spec_memory_limit_bytes{namespace="default", container=~".*-node3"} * 100 > 95 and container_spec_memory_limit_bytes{namespace="default", container=~".*-node3"} > -1 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: There are long running queries on First Node + annotations: + description: 'There are long running queries First Node. Execution time is more than 3600 second(s) on default' + summary: There are long running queries on First Node + expr: ma_pg_metrics_query_max_time{namespace="default", pg_node="node1"} > 3600 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: There are long running queries on Fourth Node + annotations: + description: 'There are long running queries Fourth Node. Execution time is more than 3600 second(s) on default' + summary: There are long running queries on Fourth Node + expr: ma_pg_metrics_query_max_time{namespace="default", pg_node="node4"} > 3600 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: There are long running queries on Second Node + annotations: + description: 'There are long running queries Second Node. Execution time is more than 3600 second(s) on default' + summary: There are long running queries on Second Node + expr: ma_pg_metrics_query_max_time{namespace="default", pg_node="node2"} > 3600 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: There are long running queries on Third Node + annotations: + description: 'There are long running queries Third Node. Execution time is more than 3600 second(s) on default' + summary: There are long running queries on Third Node + expr: ma_pg_metrics_query_max_time{namespace="default", pg_node="node3"} > 3600 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: PostgreSql Large Object Size High + annotations: + summary: PostgreSQL Large Object Size High + description: 'Large object total size has exceeded the warning threshold in namespace default.' 
+ expr: ma_pg_large_object_total_size_bytes{namespace="default"} > 1.048576e+08 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: CPU on Postgres First Node is more than 95% busy + annotations: + description: 'CPU Postgres First Node is more than 95% busy on default' + summary: CPU on Postgres First Node is more than 95% busy + expr: max(rate(container_cpu_usage_seconds_total{namespace="default", container=~".*-node1"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container=~".*-node1"}) > 0.95 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: CPU on Postgres Second Node is more than 95% busy + annotations: + description: 'CPU Postgres Second Node is more than 95% busy on default' + summary: CPU on Postgres Second Node is more than 95% busy + expr: max(rate(container_cpu_usage_seconds_total{namespace="default", container=~".*-node2"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container=~".*-node2"}) > 0.95 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: CPU on Postgres Third Node is more than 95% busy + annotations: + description: 'CPU Postgres Third Node is more than 95% busy on default' + summary: CPU on Postgres Third Node is more than 95% busy + expr: max(rate(container_cpu_usage_seconds_total{namespace="default", container=~".*-node3"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container=~".*-node3"}) > 0.95 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: CPU on Postgres Fourth Node is more than 95% busy + annotations: + description: 'CPU Postgres Fourth Node is more than 95% busy on default' + summary: CPU on Postgres Fourth Node is more than 95% busy + expr: max(rate(container_cpu_usage_seconds_total{namespace="default", container=~".*-node4"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container=~".*-node4"}) > 0.95 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Patroni Replica Is Lagging + annotations: + description: Patroni Replica Is Lagging + summary: >- + "Patroni Replica \{\{ \$labels.hostname \}\} Is Lagging in \{\{ \$labels.namespace \}\} namespace" + expr: ma_pg_patroni_replication_lag{namespace="default"} > 3.3554432e+07 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: PostgreSQL Replica Is Lagging + annotations: + description: PostgreSQL Replica Is Lagging + summary: >- + "PostgreSQL Replica \{\{ \$labels.hostname \}\} Is Lagging in \{\{ \$labels.namespace \}\} namespace" + expr: ma_pg_patroni_replication_state_sent_replay_lag{namespace="default"} > 3.3554432e+07 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Patroni Standby Leader Is Not Connected + annotations: + description: Patroni Standby Leader Is Not Connected + summary: >- + "Patroni Standby Leader Is Not Connected" + expr: ma_pg_patroni_replication_state_sm_replication_state{namespace="default"} == 0 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Current overall connections exceed max_connection percentage + annotations: + description: 'Current overall 
connections are above the max_connection percentage threshold on default.' + summary: Current overall connections exceed max_connection percentage + expr: (ma_pg_metrics_current_connections{namespace="default"}/ma_pg_metrics_postgres_max_connections{namespace="default"} * 100) > 90 + for: 3m + labels: + severity: critical + namespace: default + service: pgskipper-montemplate + - alert: Current overall connections reached warning max_connection percentage + annotations: + description: 'Current overall connections reached warning of the max_connection percentage threshold on default.' + summary: Current overall connections reached warning max_connection percentage + expr: (ma_pg_metrics_current_connections{namespace="default"}/ma_pg_metrics_postgres_max_connections{namespace="default"} * 100) > 80 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Current connections exceed max_connection + annotations: + description: 'Current connections are above the max_connection threshold on default.' + summary: Current connections exceed max_connection + expr: ma_pg_metrics_current_connections{namespace="default"} >= ma_pg_metrics_postgres_max_connections{namespace="default"} + for: 3m + labels: + severity: critical + namespace: default + service: pgskipper-montemplate + - alert: DB Connection exceeding more than specified limit + annotations: + description: 'DB Connections exceeding more than specified limit on default.' + summary: DB Connection exceeding more than specified limit + expr: ma_pg_connection_by_database{namespace="default"} >= 250 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Wait event warning threshold for SubtransBuffer or SubtransSLRU + annotations: + description: 'Wait event SubtransBuffer or SubtransSLRU hit warning threshold 5 on default' + summary: Wait event warning threshold for SubtransBuffer or SubtransSLRU + expr: wait_event_metric{namespace="default", wait_event="SubtransBuffer"} > 5 or wait_event_metric{namespace="default", wait_event="SubtransSLRU"} > 5 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate + - alert: Wait event critical threshold for SubtransBuffer or SubtransSLRU + annotations: + description: 'Wait event SubtransBuffer or SubtransSLRU hit critical threshold 20 on default' + summary: Wait event critical threshold for SubtransBuffer or SubtransSLRU + expr: wait_event_metric{namespace="default", wait_event="SubtransBuffer"} > 20 or wait_event_metric{namespace="default", wait_event="SubtransSLRU"} > 20 + for: 3m + labels: + severity: critical + namespace: default + service: pgskipper-montemplate + - alert: Pct used warning threshold + annotations: + description: 'Pct used hit warning threshold 50 on default' + summary: Pct used warning threshold + expr: pct_used_metric{namespace="default"} > 50 + for: 3m + labels: + severity: warning + namespace: default + service: pgskipper-montemplate +--- +# Source: patroni-services/templates/monitoring-templates/service-monitor.yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: postgres-service-monitor + labels: + name: postgres-service-monitor + k8s-app: postgres-service-monitor + app.kubernetes.io/name: postgres-service-monitor + app.kubernetes.io/component: monitoring + app.kubernetes.io/part-of: platform-monitoring + app.kubernetes.io/managed-by: platform-monitoring-operator +spec: + endpoints: + - interval: 60s + scrapeTimeout: 20s + 
port: prometheus-port + scheme: http + jobLabel: k8s-app + namespaceSelector: + + matchNames: + - default + + selector: + matchLabels: + app: monitoring-collector diff --git a/tests/alerts-tests/test.yaml b/tests/alerts-tests/test.yaml new file mode 100644 index 00000000..3fee917b --- /dev/null +++ b/tests/alerts-tests/test.yaml @@ -0,0 +1,1446 @@ +rule_files: +- rules.yaml +evaluation_interval: 1m +tests: +- interval: 1m + input_series: + - series: ma_pg_patroni_cluster_status + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL metrics are absent + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "PostgreSQL metrics are absent" + description: "PostgreSQL metrics are absent on default." + +- interval: 1m + input_series: + - series: ma_pg_patroni_cluster_status{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL metrics are absent + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_cluster_status{namespace="default"} + values: "10x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL is Down + exp_alerts: + - exp_labels: + severity: critical + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "PostgreSQL is Down" + description: "PostgreSQL is Down on default." + +- interval: 1m + input_series: + - series: ma_pg_patroni_cluster_status{namespace="default"} + values: "-1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL is Down + exp_alerts: + - exp_labels: + severity: critical + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "PostgreSQL is Down" + description: "PostgreSQL is Down on default." + +- interval: 1m + input_series: + - series: ma_pg_patroni_cluster_status{namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL is Down + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_cluster_status{namespace="default"} + values: "6x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL is Degraded + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "PostgreSQL is Degraded" + description: "PostgreSQL is Degraded on default." 
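Each entry in test.yaml follows the same vmalert-tool unittest shape: synthetic input_series, an alert_rule_test evaluation, and the expected labels and annotations. As a sketch of that pattern (assuming the "Pct used warning threshold" rule rendered in rules.yaml with its default threshold of 50, and the group/service names used by the surrounding tests), a firing case could be written as:

```yaml
# Sketch only: a unit test for the "Pct used warning threshold" rule from rules.yaml,
# in the same style as the surrounding tests; not part of the committed test.yaml.
- interval: 1m
  input_series:
    - series: pct_used_metric{namespace="default"}
      values: "51x5"                 # stays above the rendered threshold of 50 for the whole window
  alert_rule_test:
    - eval_time: 5m
      groupname: default-pgskipper-montemplates
      alertname: Pct used warning threshold
      exp_alerts:
        - exp_labels:
            severity: warning
            namespace: default
            service: pgskipper-montemplates
          exp_annotations:
            summary: "Pct used warning threshold"
            description: "Pct used hit warning threshold 50 on default"
```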
+ +- interval: 1m + input_series: + - series: ma_pg_patroni_cluster_status{namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL is Degraded + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_storage_free_space{namespace="default", service_name='postgres-backup-daemon'} + values: "0x5" + - series: ma_storage_total_space{namespace="default", service_name='postgres-backup-daemon'} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Space for Postgres backup is less than acceptable critical threshold + exp_alerts: + - exp_labels: + severity: critical + namespace: default + service: pgskipper-montemplates + service_name: postgres-backup-daemon + exp_annotations: + summary: "Space for Postgres backup is less than acceptable critical threshold" + description: "Backup space is less than 5 percent free on default" + +- interval: 1m + input_series: + - series: ma_storage_free_space{namespace="default", service_name='postgres-backup-daemon'} + values: "1x5" + - series: ma_storage_total_space{namespace="default", service_name='postgres-backup-daemon'} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Space for Postgres backup is less than acceptable critical threshold + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_storage_free_space{namespace="default", service_name='postgres-backup-daemon'} + values: "1x5" + - series: ma_storage_total_space{namespace="default", service_name='postgres-backup-daemon'} + values: "10x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Space for Postgres backup is less than acceptable warning threshold + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + service_name: postgres-backup-daemon + exp_annotations: + summary: "Space for Postgres backup is less than acceptable warning threshold" + description: "Backup space is less than 20 percent free on default" + +- interval: 1m + input_series: + - series: ma_storage_free_space{namespace="default", service_name='postgres-backup-daemon'} + values: "1x5" + - series: ma_storage_total_space{namespace="default", service_name='postgres-backup-daemon'} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Space for Postgres backup is less than acceptable warning threshold + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_storage_last_failed{namespace="default", service_name="postgres-backup-daemon"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Last Postgres backup is failed + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + service_name: postgres-backup-daemon + exp_annotations: + summary: "Targets are down" + description: "Last backup is failed on default" + +- interval: 1m + input_series: + - series: ma_storage_last_failed{namespace="default", service_name="postgres-backup-daemon"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Last Postgres backup is failed + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_storage_lastSuccessful_metrics_end_backup_timestamp{namespace="default", service_name="postgres-backup-daemon"} + values: 
"-86500000x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Last Successful Postgres backup is too old + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + service_name: postgres-backup-daemon + exp_annotations: + summary: "Last Successful Postgres backup is too old" + description: "Last Successful backup is too old on default" + +- interval: 1m + input_series: + - series: ma_storage_lastSuccessful_metrics_end_backup_timestamp{namespace="default", service_name="postgres-backup-daemon"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Last Successful Postgres backup is too old + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_status{namespace="default", service_name="postgres-backup-daemon"} + values: "6x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL backup agent has problem + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + service_name: postgres-backup-daemon + exp_annotations: + summary: "PostgreSQL backup agent has problem" + description: "PostgreSQL backup agent has problem on default" + +- interval: 1m + input_series: + - series: ma_status{namespace="default", service_name="postgres-backup-daemon"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL backup agent has problem + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_status{namespace="default", service_name="postgres-backup-daemon"} + values: "-1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Unable to collect metrics from PostgreSQL backup agent + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + service_name: postgres-backup-daemon + exp_annotations: + summary: "Unable to collect metrics from PostgreSQL backup agent" + description: "Unable to collect metrics from PostgreSQL backup agent on default" + +- interval: 1m + input_series: + - series: ma_status + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Unable to collect metrics from PostgreSQL backup agent + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + service_name: postgres-backup-daemon + exp_annotations: + summary: "Unable to collect metrics from PostgreSQL backup agent" + description: "Unable to collect metrics from PostgreSQL backup agent on default" + +- interval: 1m + input_series: + - series: ma_status{namespace="default", service_name="postgres-backup-daemon"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Unable to collect metrics from PostgreSQL backup agent + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_patroni_status{namespace="default", pg_node="node1"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni first Node is not running + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node1 + exp_annotations: + summary: "Patroni first Node is not running" + description: "Patroni status First Node is not running on default" + +- interval: 1m + 
input_series: + - series: ma_pg_patroni_patroni_status{namespace="default", pg_node="node1"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni first Node is not running + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_patroni_status{namespace="default", pg_node="node4"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni fourth Node is not running + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node4 + exp_annotations: + summary: "Patroni fourth Node is not running" + description: "Patroni status Fourth Node is not running on default" + +- interval: 1m + input_series: + - series: ma_pg_patroni_patroni_status{namespace="default", pg_node="node4"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni fourth Node is not running + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_patroni_status{namespace="default", pg_node="node2"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni second Node is not running + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node2 + exp_annotations: + summary: "Patroni second Node is not running" + description: "Patroni status Second Node is not running on default" + +- interval: 1m + input_series: + - series: ma_pg_patroni_patroni_status{namespace="default", pg_node="node2"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni second Node is not running + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_patroni_status{namespace="default", pg_node="node3"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni third Node is not running + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node3 + exp_annotations: + summary: "Patroni third Node is not running" + description: "Patroni status Third Node is not running on default" + +- interval: 1m + input_series: + - series: ma_pg_patroni_patroni_status{namespace="default", pg_node="node3"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni third Node is not running + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node1"} + values: "91x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Postgres First Node Disk is almost full + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node1 + exp_annotations: + summary: "Postgres First Node Disk is almost full" + description: "Disk space Postgres First Node Disk is almost full on default" + +- interval: 1m + input_series: + - series: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node1"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Postgres First Node Disk is almost full + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_metrics_df_pcent{namespace="default", 
pg_node="node4"} + values: "91x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Postgres Fourth Node Disk is almost full + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node4 + exp_annotations: + summary: "Postgres Fourth Node Disk is almost full" + description: "Disk space Postgres Fourth Node Disk is almost full on default" + +- interval: 1m + input_series: + - series: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node4"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Postgres Fourth Node Disk is almost full + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node2"} + values: "91x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Postgre Second Node Disk is almost full + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node2 + exp_annotations: + summary: "Postgre Second Node Disk is almost full" + description: "Disk space Postgre Second Node Disk is almost full on default" + +- interval: 1m + input_series: + - series: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node2"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Postgre Second Node Disk is almost full + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node3"} + values: "91x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Postgres Third Node Disk is almost full + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node3 + exp_annotations: + summary: "Postgres Third Node Disk is almost full" + description: "Disk space Postgres Third Node Disk is almost full on default" + +- interval: 1m + input_series: + - series: ma_pg_patroni_metrics_df_pcent{namespace="default", pg_node="node3"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Postgres Third Node Disk is almost full + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_locks{namespace="default", pg_node="node1"} + values: "501x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Locks on First Node more then acceptable threshold + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node1 + exp_annotations: + summary: "Locks on First Node more then acceptable threshold" + description: "Locks on First Node more then 500 on default" + +- interval: 1m + input_series: + - series: ma_pg_metrics_locks{namespace="default", pg_node="node1"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Locks on First Node more then acceptable threshold + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_locks{namespace="default", pg_node="node4"} + values: "501x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Locks on Fourth Node more then acceptable threshold + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: 
pgskipper-montemplates + pg_node: node4 + exp_annotations: + summary: "Locks on Fourth Node more then acceptable threshold" + description: "Locks on Fourth Node more then 500 on default" + +- interval: 1m + input_series: + - series: ma_pg_metrics_locks{namespace="default", pg_node="node4"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Locks on Fourth Node more then acceptable threshold + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_locks{namespace="default", pg_node="node2"} + values: "501x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Locks on Second Node more then acceptable threshold + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node2 + exp_annotations: + summary: "Locks on Second Node more then acceptable threshold" + description: "Locks on Second Node more then 500 on default" + +- interval: 1m + input_series: + - series: ma_pg_metrics_locks{namespace="default", pg_node="node2"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Locks on Second Node more then acceptable threshold + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_locks{namespace="default", pg_node="node3"} + values: "501x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Locks on Third Node more then acceptable threshold + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node3 + exp_annotations: + summary: "Locks on Third Node more then acceptable threshold" + description: "Locks on Third Node more then 500 on default" + +- interval: 1m + input_series: + - series: ma_pg_metrics_locks{namespace="default", pg_node="node3"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Locks on Third Node more then acceptable threshold + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default", container="test-node1"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container="test-node1"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container="test-node1"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Memory on Postgres First Node is more than 95% busy + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + container: test-node1 + exp_annotations: + summary: "Memory on Postgres First Node is more than 95 percent busy" + description: "Memory Postgres First Node is more than 95 percent busy on default" + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default", container="node1"} + values: "0x5" + - series: container_spec_memory_limit_bytes{namespace="default", container=~"node1"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container=~"node1"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Memory on Postgres First Node is more than 95% busy + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default", container="test-node4"} + values: 
"1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container="test-node4"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container="test-node4"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Memory on Postgres Fourth Node is more than 95% busy + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + container: test-node4 + exp_annotations: + summary: "Memory on Postgres Fourth Node is more than 95 percent busy" + description: "Memory Postgres Fourth Node is more than 95 percent busy on default" + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default", container="node4"} + values: "0x5" + - series: container_spec_memory_limit_bytes{namespace="default", container=~"node4"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container=~"node4"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Memory on Postgres Fourth Node is more than 95% busy + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default", container="test-node2"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container="test-node2"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container="test-node2"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Memory on Postgres Second Node is more than 95% busy + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + container: test-node2 + exp_annotations: + summary: "Memory on Postgres Second Node is more than 95 percent busy" + description: "Memory Postgres Second Node is more than 95 percent busy on default" + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default", container="node2"} + values: "0x5" + - series: container_spec_memory_limit_bytes{namespace="default", container=~"node2"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container=~"node2"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Memory on Postgres Second Node is more than 95% busy + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default", container="test-node3"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container="test-node3"} + values: "1x5" + - series: container_spec_memory_limit_bytes{namespace="default", container="test-node3"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Memory on Postgres Third Node is more than 95% busy + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + container: test-node3 + exp_annotations: + summary: "Memory on Postgres Third Node is more than 95 percent busy" + description: "Memory Postgres Third Node is more than 95 percent busy on default" + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default", container="node3"} + values: "0x5" + - series: container_spec_memory_limit_bytes{namespace="default", container=~"node3"} + values: "1x5" + - series: 
container_spec_memory_limit_bytes{namespace="default", container=~"node3"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Memory on Postgres Third Node is more than 95% busy + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_query_max_time{namespace="default", pg_node="node1"} + values: "3601x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: There are long running queries on First Node + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node1 + exp_annotations: + summary: "There are long running queries on First Node" + description: "There are long running queries First Node. Execution time is more than 3600 second(s) on default" + +- interval: 1m + input_series: + - series: ma_pg_metrics_query_max_time{namespace="default", pg_node="node1"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: There are long running queries on Fourth Node + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_query_max_time{namespace="default", pg_node="node4"} + values: "3601x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: There are long running queries on Fourth Node + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node4 + exp_annotations: + summary: "There are long running queries on Fourth Node" + description: "There are long running queries Fourth Node. Execution time is more than 3600 second(s) on default" + +- interval: 1m + input_series: + - series: ma_pg_metrics_query_max_time{namespace="default", pg_node="node4"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: There are long running queries on First Node + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_query_max_time{namespace="default", pg_node="node2"} + values: "3601x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: There are long running queries on Second Node + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node2 + exp_annotations: + summary: "There are long running queries on Second Node" + description: "There are long running queries Second Node. Execution time is more than 3600 second(s) on default" + +- interval: 1m + input_series: + - series: ma_pg_metrics_query_max_time{namespace="default", pg_node="node2"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: There are long running queries on Second Node + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_query_max_time{namespace="default", pg_node="node3"} + values: "3601x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: There are long running queries on Third Node + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + pg_node: node3 + exp_annotations: + summary: "There are long running queries on Third Node" + description: "There are long running queries Third Node. 
Execution time is more than 3600 second(s) on default" + +- interval: 1m + input_series: + - series: ma_pg_metrics_query_max_time{namespace="default", pg_node="node3"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: There are long running queries on Third Node + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_large_object_total_size_bytes{namespace="default"} + values: "200000000x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSql Large Object Size High + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "PostgreSQL Large Object Size High" + description: "Large object total size has exceeded the warning threshold in namespace default." + +- interval: 1m + input_series: + - series: ma_pg_large_object_total_size_bytes{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSql Large Object Size High + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default", container="test-node1"} + values: "60+60x5" + - series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container="test-node1"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: CPU on Postgres First Node is more than 95% busy + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "CPU on Postgres First Node is more than 95 percent busy" + description: "CPU Postgres First Node is more than 95 percent busy on default" + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default", container="test-node1"} + values: "0x5" + - series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container="test-node1"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: CPU on Postgres First Node is more than 95% busy + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default", container="test-node2"} + values: "60+60x5" + - series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container="test-node2"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: CPU on Postgres Second Node is more than 95% busy + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "CPU on Postgres Second Node is more than 95 percent busy" + description: "CPU Postgres Second Node is more than 95 percent busy on default" + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default", container="test-node2"} + values: "0x5" + - series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container="test-node2"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: CPU on Postgres Second Node is more than 95% busy + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default", container="test-node3"} + values: "60+60x5" + - 
series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container="test-node3"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: CPU on Postgres Third Node is more than 95% busy + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "CPU on Postgres Third Node is more than 95 percent busy" + description: "CPU Postgres Third Node is more than 95 percent busy on default" + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default", container="test-node3"} + values: "0x5" + - series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container="test-node3"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: CPU on Postgres Third Node is more than 95% busy + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default", container="test-node4"} + values: "60+60x5" + - series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container="test-node4"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: CPU on Postgres Fourth Node is more than 95% busy + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "CPU on Postgres Fourth Node is more than 95 percent busy" + description: "CPU Postgres Fourth Node is more than 95 percent busy on default" + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default", container="test-node4"} + values: "0x5" + - series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default", exported_container="test-node4"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: CPU on Postgres Fourth Node is more than 95% busy + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_replication_lag{namespace="default"} + values: "40000000x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni Replica Is Lagging + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: >- + "Patroni Replica \{\{ \$labels.hostname \}\} Is Lagging in \{\{ \$labels.namespace \}\} namespace" + description: "Patroni Replica Is Lagging" + +- interval: 1m + input_series: + - series: ma_pg_patroni_replication_lag{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni Replica Is Lagging + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_replication_state_sent_replay_lag{namespace="default"} + values: "40000000x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL Replica Is Lagging + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: >- + "PostgreSQL Replica \{\{ \$labels.hostname \}\} Is Lagging in \{\{ \$labels.namespace \}\} namespace" + description: "PostgreSQL Replica Is Lagging" + +- interval: 1m + input_series: + - series: 
ma_pg_patroni_replication_state_sent_replay_lag{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: PostgreSQL Replica Is Lagging + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_patroni_replication_state_sm_replication_state{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni Standby Leader Is Not Connected + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: >- + "Patroni Standby Leader Is Not Connected" + description: "Patroni Standby Leader Is Not Connected" + +- interval: 1m + input_series: + - series: ma_pg_patroni_replication_state_sm_replication_state{namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Patroni Standby Leader Is Not Connected + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_current_connections{namespace="default"} + values: "1x5" + - series: ma_pg_metrics_postgres_max_connections{namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Current overall connections exceed max_connection percentage + exp_alerts: + - exp_labels: + severity: critical + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "Current overall connections exceed max_connection percentage" + description: "Current overall connections are above the max_connection percentage threshold on default." + +- interval: 1m + input_series: + - series: ma_pg_metrics_current_connections{namespace="default"} + values: "0x5" + - series: ma_pg_metrics_postgres_max_connections{namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Current overall connections exceed max_connection percentage + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_current_connections{namespace="default"} + values: "9x5" + - series: ma_pg_metrics_postgres_max_connections{namespace="default"} + values: "10x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Current overall connections reached warning max_connection percentage + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "Current overall connections reached warning max_connection percentage" + description: "Current overall connections reached warning of the max_connection percentage threshold on default." 
+ +- interval: 1m + input_series: + - series: ma_pg_metrics_current_connections{namespace="default"} + values: "0x5" + - series: ma_pg_metrics_postgres_max_connections{namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Current overall connections reached warning max_connection percentage + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_metrics_current_connections{namespace="default"} + values: "1x5" + - series: ma_pg_metrics_postgres_max_connections{namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Current connections exceed max_connection + exp_alerts: + - exp_labels: + severity: critical + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "Current connections exceed max_connection" + description: "Current connections are above the max_connection threshold on default." + +- interval: 1m + input_series: + - series: ma_pg_metrics_current_connections{namespace="default"} + values: "0x5" + - series: ma_pg_metrics_postgres_max_connections{namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Current connections exceed max_connection + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_connection_by_database{namespace="default"} + values: "250x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: DB Connection exceeding more than specified limit + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "DB Connection exceeding more than specified limit" + description: "DB Connections exceeding more than specified limit on default." 
+ +- interval: 1m + input_series: + - series: ma_pg_connection_by_database{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: DB Connection exceeding more than specified limit + exp_alerts: [] + +- interval: 1m + input_series: + - series: wait_event_metric{namespace="default", wait_event="SubtransBuffer"} + values: "6x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Wait event warning threshold for SubtransBuffer or SubtransSLRU + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + wait_event: SubtransBuffer + exp_annotations: + summary: "Wait event warning threshold for SubtransBuffer or SubtransSLRU" + description: "Wait event SubtransBuffer or SubtransSLRU hit warning threshold 5 on default" + +- interval: 1m + input_series: + - series: wait_event_metric{namespace="default", wait_event="SubtransSLRU"} + values: "6x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Wait event warning threshold for SubtransBuffer or SubtransSLRU + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + wait_event: SubtransSLRU + exp_annotations: + summary: "Wait event warning threshold for SubtransBuffer or SubtransSLRU" + description: "Wait event SubtransBuffer or SubtransSLRU hit warning threshold 5 on default" + +- interval: 1m + input_series: + - series: wait_event_metric{namespace="default", wait_event="SubtransBuffer"} + values: "0x5" + - series: wait_event_metric{namespace="default", wait_event="SubtransSLRU"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Wait event warning threshold for SubtransBuffer or SubtransSLRU + exp_alerts: [] + +- interval: 1m + input_series: + - series: wait_event_metric{namespace="default", wait_event="SubtransBuffer"} + values: "21x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Wait event critical threshold for SubtransBuffer or SubtransSLRU + exp_alerts: + - exp_labels: + severity: critical + namespace: default + service: pgskipper-montemplates + wait_event: SubtransBuffer + exp_annotations: + summary: "Wait event critical threshold for SubtransBuffer or SubtransSLRU" + description: "Wait event SubtransBuffer or SubtransSLRU hit critical threshold 20 on default" + +- interval: 1m + input_series: + - series: wait_event_metric{namespace="default", wait_event="SubtransSLRU"} + values: "21x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Wait event critical threshold for SubtransBuffer or SubtransSLRU + exp_alerts: + - exp_labels: + severity: critical + namespace: default + service: pgskipper-montemplates + wait_event: SubtransSLRU + exp_annotations: + summary: "Wait event critical threshold for SubtransBuffer or SubtransSLRU" + description: "Wait event SubtransBuffer or SubtransSLRU hit critical threshold 20 on default" + +- interval: 1m + input_series: + - series: wait_event_metric{namespace="default", wait_event="SubtransBuffer"} + values: "0x5" + - series: wait_event_metric{namespace="default", wait_event="SubtransSLRU"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Wait event critical threshold for SubtransBuffer or SubtransSLRU + exp_alerts: [] + +- interval: 1m + 
input_series: + - series: pct_used_metric{namespace="default"} + values: "51x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Pct used warning threshold + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "Pct used warning threshold" + description: "Pct used hit warning threshold 50 on default" + +- interval: 1m + input_series: + - series: pct_used_metric{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Pct used warning threshold + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_standby_leader_count{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Standby cluster is not streaming + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "Standby cluster is not streaming" + description: "Standby cluster is not streaming from default" + +- interval: 1m + input_series: + - series: ma_pg_standby_leader_count{namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Standby cluster is not streaming + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_standby_replication_lag_in_bytes{namespace="default"} + values: "1073741825x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Standby cluster has big lag in bytes + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "Standby cluster has big lag in bytes" + description: "Standby cluster has big lag in bytes for default" + +- interval: 1m + input_series: + - series: ma_pg_standby_replication_lag_in_bytes{namespace="default"} + values: "1073741823x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Standby cluster has big lag in bytes + exp_alerts: [] + +- interval: 1m + input_series: + - series: ma_pg_standby_replication_lag_in_ms{namespace="default"} + values: "600001x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Standby cluster has big lag in milliseconds + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: pgskipper-montemplates + exp_annotations: + summary: "Standby cluster has big lag in milliseconds" + description: "Standby cluster has big lag in milliseconds for default" + +- interval: 1m + input_series: + - series: ma_pg_standby_replication_lag_in_ms{namespace="default"} + values: "600000x5" + alert_rule_test: + - eval_time: 5m + groupname: default-pgskipper-montemplates + alertname: Standby cluster has big lag in milliseconds + exp_alerts: [] \ No newline at end of file diff --git a/tests/alerts-tests/tests-checker.sh b/tests/alerts-tests/tests-checker.sh new file mode 100644 index 00000000..21fa71e8 --- /dev/null +++ b/tests/alerts-tests/tests-checker.sh @@ -0,0 +1,33 @@ +rules=() +readarray -t rules < <(yq eval '.groups[].rules[].alert' ./rules.yaml) +tests=() +readarray -t tests < <(yq '.tests[].alert_rule_test[].alertname' ./test.yaml) +errorrules=() +errorcount=() +i=0 + +for item in "${rules[@]}"; do +count=0 + + for j in "${tests[@]}"; do + if [[ "$j" == "$item" ]]; then + ((count++)) + fi + done +if [[ "$count" -lt 2 ]]; then 
+errorrules[i]="$item" +errorcount[i]="$count" +((i++)) +fi +done + +if [[ "$i" -gt 0 ]]; then +echo "These alert rules don't have all required tests (minimum 2 tests per rule needed):" + for k in "${!errorrules[@]}"; do + echo "Alert: ${errorrules[k]}, Tests found: ${errorcount[k]}" + done +exit 1 +else +echo "All alert rules have required tests" +exit 0 +fi \ No newline at end of file