From 381e252aa4cf42b8a085d7049dddcff7fbe16240 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 24 Oct 2024 17:01:24 +0200 Subject: [PATCH 01/13] Add Prime install Signed-off-by: Jeroen van Erp --- scripts/README.md | 6 +++++ scripts/rancher/manager_lifecycle.sh | 33 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/scripts/README.md b/scripts/README.md index f27ab74..c6e864d 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -53,6 +53,12 @@ Name | Source `rancher_update_serverurl` | [rancher/manager_settings.sh](rancher/manager_settings.sh) `rancher_wait_capiready` | [rancher/manager_lifecycle.sh](rancher/manager_lifecycle.sh) +### Rancher Prime + +Name | Source +-----------------------------------------------|------------------------------------------------------------- +`rancherprime_install_withcertmanagerclusterissuer` | [rancher/manager_lifecycle.sh](rancher/manager_lifecycle.sh) + ### SUSE Observability Name | Source diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh index 180ee48..2e24026 100644 --- a/scripts/rancher/manager_lifecycle.sh +++ b/scripts/rancher/manager_lifecycle.sh @@ -36,6 +36,39 @@ rancher_install_withcertmanagerclusterissuer() { sleep 10 } +####################################### +# Installs Rancher Prime with a certificate generated by a cluster issuer +# Arguments: +# Version +# Number of replicas +# Hostname +# Cluster issuer name (managed by cert-manager) +# Examples: +# rancher_install_withcertmanagerclusterissuer latest "2.8.2" 1 rancher.random_string.geek letsencrypt-prod +####################################### +rancherprime_install_withcertmanagerclusterissuer() { + local version=$2 + local replicas=$3 + local hostname=$4 + local clusterissuer=$5 + + echo "Installing Rancher..." + helm repo add rancher-prime https://charts.rancher.com/server-charts/prime + helm repo update + helm upgrade --install rancher rancher-prime/rancher --namespace cattle-system --create-namespace \ + --version ${version} \ + --set replicas=${replicas} \ + --set hostname=${hostname} \ + --set ingress.extraAnnotations.'cert-manager\.io/cluster-issuer'=${clusterissuer} \ + --set ingress.tls.source=secret \ + --set ingress.tls.secretName=rancher-tls \ + --set agentTLSMode="system-store" + kubectl wait pods -n cattle-system -l app=rancher --for condition=Ready --timeout=180s + echo "Waiting for Rancher web app to be running with a valid certificate..." + while ! 
kubectl get secret rancher-tls --namespace cattle-system 2>/dev/null; do sleep 1; done + sleep 10 +} + ####################################### # Do the first log in Rancher (will update admin password and set server URL) # Arguments: From 6f2f4d35e05a6fafdbf5da77077530edd2d140c1 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 24 Oct 2024 17:15:50 +0200 Subject: [PATCH 02/13] Fix param order Signed-off-by: Jeroen van Erp --- scripts/rancher/manager_lifecycle.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh index 2e24026..6822b5e 100644 --- a/scripts/rancher/manager_lifecycle.sh +++ b/scripts/rancher/manager_lifecycle.sh @@ -47,10 +47,10 @@ rancher_install_withcertmanagerclusterissuer() { # rancher_install_withcertmanagerclusterissuer latest "2.8.2" 1 rancher.random_string.geek letsencrypt-prod ####################################### rancherprime_install_withcertmanagerclusterissuer() { - local version=$2 - local replicas=$3 - local hostname=$4 - local clusterissuer=$5 + local version=$1 + local replicas=$2 + local hostname=$3 + local clusterissuer=$4 echo "Installing Rancher..." helm repo add rancher-prime https://charts.rancher.com/server-charts/prime From b73ad1d30ea570ea6c4bbdbc8665da108b0ac7f0 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 25 Oct 2024 11:17:37 +0200 Subject: [PATCH 03/13] Add observability service token methods Signed-off-by: Jeroen van Erp --- scripts/README.md | 2 + scripts/observability/service_token.sh | 51 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 scripts/observability/service_token.sh diff --git a/scripts/README.md b/scripts/README.md index c6e864d..fbe7c65 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -70,6 +70,8 @@ Name | Source `observability_get_component_snapshot` | [observability/stql.sh](observability/stql.sh) `observability_get_component_state` | [observability/stql.sh](observability/stql.sh) `observability_install_cli` | [observability/cli.sh](observability/cli.sh) +`observability_create_service_token` | [observability/service_token.sh](observability/service_token.sh) +`observability_delete_service_token` | [observability/service_token.sh](observability/service_token.sh) ### SUSE Linux (previously SLES, SLE Micro) diff --git a/scripts/observability/service_token.sh b/scripts/observability/service_token.sh new file mode 100644 index 0000000..451a1e7 --- /dev/null +++ b/scripts/observability/service_token.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +####################################### +# Create a service token for SUSE Observability +# Output: +# The service token +# Arguments: +# url (SUSE Observability) +# service_token (SUSE Observability) +# cluster_name +# role +# Examples: +# observability_create_service_token https://obs.suse.com/ xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx lab-dsu37834 stackstate-k8s-troubleshooter +####################################### +observability_create_service_token() { + local url=$1 + local service_token=$2 + local cluster_name=$3 + local role=$4 + + local resp + resp=$(/usr/local/bin/sts service-token create --name $cluster_name --roles $role -o json --url $url --service-token $service_token) + + echo $resp | jq -r '."service-token".token' +} + +####################################### +# Delete a service token for SUSE Observability +# Arguments: +# url (SUSE Observability) +# service_token (SUSE Observability) +# cluster_name +# Examples: +# 
observability_delete_service_token https://obs.suse.com/ xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx lab-dsu37834 +####################################### +observability_delete_service_token() { + local url=$1 + local service_token=$2 + local cluster_name=$3 + + local tokens token_id + + tokens=$(/usr/local/bin/sts service-token list -o json --url $url --service-token $service_token) + token_id=$(echo $tokens | jq -r '."service-tokens"[] | select(.name == "'$cluster_name'") | .id') + if [ -n "$token_id" ]; then + /usr/local/bin/sts service-token delete --id $token_id --url $url --service-token $service_token + echo ">>> Service token named '${cluster_name}' deleted" + else + echo ">>> Service token named '${cluster_name}' not found" + fi +} From 4f55817b3dd4a8ad8177d66332485936ff344162 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Mon, 4 Nov 2024 23:08:31 +0100 Subject: [PATCH 04/13] Add ai-model workload Signed-off-by: Jeroen van Erp --- charts/ai-model/Chart.yaml | 9 +++ charts/ai-model/templates/_helpers.tpl | 62 +++++++++++++++++++ charts/ai-model/templates/ai-model-cm.yaml | 32 ++++++++++ .../templates/ai-model-deployment.yaml | 41 ++++++++++++ .../ai-model/templates/ai-model-ingress.yaml | 27 ++++++++ charts/ai-model/templates/ai-model-svc.yaml | 16 +++++ charts/ai-model/values.yaml | 12 ++++ 7 files changed, 199 insertions(+) create mode 100644 charts/ai-model/Chart.yaml create mode 100644 charts/ai-model/templates/_helpers.tpl create mode 100644 charts/ai-model/templates/ai-model-cm.yaml create mode 100644 charts/ai-model/templates/ai-model-deployment.yaml create mode 100644 charts/ai-model/templates/ai-model-ingress.yaml create mode 100644 charts/ai-model/templates/ai-model-svc.yaml create mode 100644 charts/ai-model/values.yaml diff --git a/charts/ai-model/Chart.yaml b/charts/ai-model/Chart.yaml new file mode 100644 index 0000000..17216aa --- /dev/null +++ b/charts/ai-model/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: ai-model +description: A Helm chart for ai-model Mackroservices +type: application +version: 0.1.0 +appVersion: "0.1.0" +keywords: +- challenge +- observability diff --git a/charts/ai-model/templates/_helpers.tpl b/charts/ai-model/templates/_helpers.tpl new file mode 100644 index 0000000..5c3f420 --- /dev/null +++ b/charts/ai-model/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "common.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "common.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "common.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "common.labels" -}} +helm.sh/chart: {{ include "common.chart" . }} +{{ include "common.selectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "common.selectorLabels" -}} +app.kubernetes.io/name: {{ include "common.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "common.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "common.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/ai-model/templates/ai-model-cm.yaml b/charts/ai-model/templates/ai-model-cm.yaml new file mode 100644 index 0000000..da05242 --- /dev/null +++ b/charts/ai-model/templates/ai-model-cm.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ai-model-cm + labels: + {{- include "common.labels" . | nindent 4 }} +data: + config.toml: | + # General configuration + port = 11434 + address = "0.0.0.0" + serviceName = "AI Model" + logLevel = "info" + + # Endpoints + [[endpoints]] + uri = "/api/chat" + delay = "1000ms" + body.status = "success" + body.msg = "Your dino is a T-Rex" + + [endpoints.logging] + before = "Processing [[.Endpoint.Uri]] request" + beforeLevel = "Info" + after = "Completed [[.Endpoint.Uri]] request" + afterLevel = "Info" + + # OpenTelemetry + [otel.trace] + enabled = false + tracer-name = "ai-model" + diff --git a/charts/ai-model/templates/ai-model-deployment.yaml b/charts/ai-model/templates/ai-model-deployment.yaml new file mode 100644 index 0000000..fe5616e --- /dev/null +++ b/charts/ai-model/templates/ai-model-deployment.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ai-model + labels: + service: ai-model + {{- include "common.labels" . | nindent 4 }} +spec: + replicas: 1 + selector: + matchLabels: + service: ai-model + {{- include "common.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "common.labels" . | nindent 8 }} + service: ai-model + annotations: + checksum/config: '{{ include (print $.Template.BasePath "/ai-model-cm.yaml") . | sha256sum}}' + spec: + containers: + - name: ai-model + image: {{.Values.image}} + env: + - name: CONFIG_FILE + value: /etc/app/config.toml + ports: + - containerPort: 8080 + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: config-volume + mountPath: /etc/app + volumes: + - name: config-volume + configMap: + name: ai-model-cm + items: + - key: config.toml + path: config.toml diff --git a/charts/ai-model/templates/ai-model-ingress.yaml b/charts/ai-model/templates/ai-model-ingress.yaml new file mode 100644 index 0000000..a162c0b --- /dev/null +++ b/charts/ai-model/templates/ai-model-ingress.yaml @@ -0,0 +1,27 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/proxy-body-size: 50m + labels: + service: ai-model + {{- include "common.labels" . 
| nindent 4 }} + name: ai-model +spec: + ingressClassName: traefik + rules: + - host: {{ .Values.ingress.host }} + http: + paths: + - backend: + service: + name: ai-model + port: + number: 11434 + path: / + pathType: Prefix + tls: + - hosts: + - {{ .Values.ingress.host }} + secretName: tls-secret diff --git a/charts/ai-model/templates/ai-model-svc.yaml b/charts/ai-model/templates/ai-model-svc.yaml new file mode 100644 index 0000000..5613d2a --- /dev/null +++ b/charts/ai-model/templates/ai-model-svc.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: ai-model + labels: + service: ai-model + {{- include "common.labels" . | nindent 4 }} +spec: + selector: + service: ai-model + {{- include "common.selectorLabels" . | nindent 4 }} + ports: + - protocol: TCP + port: 80 # Service port + targetPort: 8080 # Container port + type: ClusterIP # Internal service within the Kubernetes cluster diff --git a/charts/ai-model/values.yaml b/charts/ai-model/values.yaml new file mode 100644 index 0000000..d33b91c --- /dev/null +++ b/charts/ai-model/values.yaml @@ -0,0 +1,12 @@ +nameOverride: '' +fullnameOverride: '' +image: ravan/mockroservice:0.0.23 +resources: + requests: + memory: '8Mi' + cpu: '5m' + limits: + memory: '10Mi' + cpu: '10m' +ingress: + host: From 82fea318ba7fe9f1664ad53b6e60d669eac8a320 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 5 Nov 2024 11:07:15 +0100 Subject: [PATCH 05/13] Fixup helm charts and add cli install Signed-off-by: Jeroen van Erp --- charts/ai-model/templates/ai-model-deployment.yaml | 6 +++--- charts/ai-model/templates/ai-model-svc.yaml | 2 +- scripts/observability/cli.sh | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/charts/ai-model/templates/ai-model-deployment.yaml b/charts/ai-model/templates/ai-model-deployment.yaml index fe5616e..bf0c4e7 100644 --- a/charts/ai-model/templates/ai-model-deployment.yaml +++ b/charts/ai-model/templates/ai-model-deployment.yaml @@ -14,8 +14,8 @@ spec: template: metadata: labels: - {{- include "common.labels" . | nindent 8 }} - service: ai-model + {{- include "common.labels" . | nindent 8 }} + service: ai-model annotations: checksum/config: '{{ include (print $.Template.BasePath "/ai-model-cm.yaml") . | sha256sum}}' spec: @@ -28,7 +28,7 @@ spec: ports: - containerPort: 8080 resources: - {{- toYaml .Values.resources | nindent 12 }} + {{- toYaml .Values.resources | nindent 12 }} volumeMounts: - name: config-volume mountPath: /etc/app diff --git a/charts/ai-model/templates/ai-model-svc.yaml b/charts/ai-model/templates/ai-model-svc.yaml index 5613d2a..cd33a02 100644 --- a/charts/ai-model/templates/ai-model-svc.yaml +++ b/charts/ai-model/templates/ai-model-svc.yaml @@ -8,7 +8,7 @@ metadata: spec: selector: service: ai-model - {{- include "common.selectorLabels" . | nindent 4 }} + {{- include "common.selectorLabels" . | nindent 4 }} ports: - protocol: TCP port: 80 # Service port diff --git a/scripts/observability/cli.sh b/scripts/observability/cli.sh index 40c2030..a228b30 100644 --- a/scripts/observability/cli.sh +++ b/scripts/observability/cli.sh @@ -5,7 +5,7 @@ ####################################### observability_install_cli() { if ! 
[ -x "$(command -v sts)" ]; then - curl -o- https://dl.stackstate.com/stackstate-cli/install.sh | STS_CLI_LOCATION=/usr/local/bin bash + curl -s -o- https://dl.stackstate.com/stackstate-cli/install.sh | STS_CLI_LOCATION=/usr/local/bin bash else echo ">>> sts CLI already installed" fi From afe86ea1969ac6789a389e320b2b6e7c0bdf15ba Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 5 Nov 2024 11:28:35 +0100 Subject: [PATCH 06/13] Make lint happy Signed-off-by: Jeroen van Erp --- charts/ai-model/Chart.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/charts/ai-model/Chart.yaml b/charts/ai-model/Chart.yaml index 17216aa..d35c96a 100644 --- a/charts/ai-model/Chart.yaml +++ b/charts/ai-model/Chart.yaml @@ -4,6 +4,9 @@ description: A Helm chart for ai-model Mackroservices type: application version: 0.1.0 appVersion: "0.1.0" +maintainers: + - name: hierynomus + email: jeroen.vanerp@suse.com keywords: - challenge - observability From 24d8f60329946ff872983a254f6caa875619d29f Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 5 Nov 2024 11:31:18 +0100 Subject: [PATCH 07/13] Remove hardcoded clusterissuer Signed-off-by: Jeroen van Erp --- charts/ai-model/templates/ai-model-ingress.yaml | 2 +- charts/ai-model/values.yaml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/charts/ai-model/templates/ai-model-ingress.yaml b/charts/ai-model/templates/ai-model-ingress.yaml index a162c0b..c9d5bd5 100644 --- a/charts/ai-model/templates/ai-model-ingress.yaml +++ b/charts/ai-model/templates/ai-model-ingress.yaml @@ -2,7 +2,7 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: annotations: - cert-manager.io/cluster-issuer: letsencrypt-prod + cert-manager.io/cluster-issuer: {{ .Values.ingress.certmanager.issuer }} nginx.ingress.kubernetes.io/proxy-body-size: 50m labels: service: ai-model diff --git a/charts/ai-model/values.yaml b/charts/ai-model/values.yaml index d33b91c..70f1596 100644 --- a/charts/ai-model/values.yaml +++ b/charts/ai-model/values.yaml @@ -10,3 +10,5 @@ resources: cpu: '10m' ingress: host: + certmanager: + issuer: From dae759c008cc1bf19032bb31a4637ed02f8a4448 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 5 Nov 2024 11:32:42 +0100 Subject: [PATCH 08/13] Remove hardcoded annotation Signed-off-by: Jeroen van Erp --- charts/ai-model/templates/ai-model-ingress.yaml | 3 +-- charts/ai-model/values.yaml | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/charts/ai-model/templates/ai-model-ingress.yaml b/charts/ai-model/templates/ai-model-ingress.yaml index c9d5bd5..03476ab 100644 --- a/charts/ai-model/templates/ai-model-ingress.yaml +++ b/charts/ai-model/templates/ai-model-ingress.yaml @@ -2,8 +2,7 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: annotations: - cert-manager.io/cluster-issuer: {{ .Values.ingress.certmanager.issuer }} - nginx.ingress.kubernetes.io/proxy-body-size: 50m + {{- .Values.ingress.annotations | toYaml | nindent 4 }} labels: service: ai-model {{- include "common.labels" . 
| nindent 4 }}
   name: ai-model
diff --git a/charts/ai-model/values.yaml b/charts/ai-model/values.yaml
index 70f1596..37ecdbd 100644
--- a/charts/ai-model/values.yaml
+++ b/charts/ai-model/values.yaml
@@ -9,6 +9,5 @@ resources:
     memory: '10Mi'
     cpu: '10m'
 ingress:
+  annotations:
   host:
-  certmanager:
-    issuer:

From 37edf3291dd615b548eb4df889c1726cd12e1e04 Mon Sep 17 00:00:00 2001
From: Jeroen van Erp
Date: Thu, 7 Nov 2024 12:28:09 +0100
Subject: [PATCH 09/13] Added monitor and assets

Signed-off-by: Jeroen van Erp
---
 assets/monitors/pods-in-waiting-state.yaml | 236 +++++++++++++++++++++
 scripts/download.sh                        |   2 +
 scripts/observability/monitors.sh          |  23 ++
 3 files changed, 261 insertions(+)
 create mode 100644 assets/monitors/pods-in-waiting-state.yaml
 create mode 100644 scripts/observability/monitors.sh

diff --git a/assets/monitors/pods-in-waiting-state.yaml b/assets/monitors/pods-in-waiting-state.yaml
new file mode 100644
index 0000000..6b62c11
--- /dev/null
+++ b/assets/monitors/pods-in-waiting-state.yaml
@@ -0,0 +1,236 @@
nodes:
- _type: Monitor
  arguments:
    failureState: CRITICAL
    loggingLevel: WARN
  description: |
    If a pod is in a waiting state with a reason of CreateContainerConfigError, CreateContainerError,
    CrashLoopBackOff, or ImagePullBackOff, it will be seen as deviating.
  function: {{ get "urn:stackpack:kubernetes-v2:shared:monitor-function:pods-in-waiting-state" }}
  id: -6
  identifier: urn:custom:monitor:pods-in-waiting-state-v2
  intervalSeconds: 30
  name: Pods in Waiting State V2
  remediationHint: |-
    \{{#if reasons\}}
    \{{#if reasons.CreateContainerConfigError\}}
    ## CreateContainerConfigError

    In case of CreateContainerConfigError, a common cause is a Secret or ConfigMap that is referenced in [your pod](/#/components/\{{ componentUrnForUrl \}}) but does not exist.

    ### Missing ConfigMap

    In case of a missing ConfigMap you will see an error like `Error: configmap "mydb-config" not found` mentioned in the message of this monitor.

    To solve this you should reference an existing ConfigMap.

    An example:

    ```markdown
    # See if the configmap exists
    kubectl get configmap mydb-config

    # Create the correct configmap, this is just an example
    kubectl create configmap mydb-config --from-literal=database_name=mydb

    # Delete and recreate the pod using this configmap
    kubectl delete -f mydb_pod.yaml
    kubectl create -f mydb_pod.yaml

    # After recreating the pod this pod should be in a running state.
    # This is visible because the waiting pod monitor will not trigger anymore on this condition.
    ```

    ### Missing Secret

    In case of a missing Secret you will see an error like `Error from server (NotFound): secrets "my-secret" not found`
    mentioned in the message of this monitor.

    To solve this you should reference an existing Secret.

    An example:

    ```markdown
    # See if the secret exists
    kubectl get secret mydb-secret

    # Create the correct secret, this is just an example
    kubectl create secret generic mydb-secret --from-literal=password=mysupersecretpassword

    # Delete and recreate the pod using this secret
    kubectl delete -f mydb_pod.yaml
    kubectl create -f mydb_pod.yaml

    # After recreating the pod this pod should be in a running state.
    # This is visible because the waiting pod monitor will not trigger anymore on this condition.
+ ``` + \{{/if\}} + \{{#if reasons.CreateContainerError\}} + ## CreateContainerError + + Common causes for a CreateContainerError are: + + - Command Not Available + - Issues Mounting a Volume + - Container Runtime Not Cleaning Up Old Containers + + ### Command Not Available + + In case of ‘`Command Not Available`’ you will find this in the reason field at the top of this monitor (full screen). + If this is the case, the first thing you need to investigate is to check that you have a valid ENTRYPOINT in the Dockerfile + used to build your container image. + + If you don’t have access to the Dockerfile, you can configure your pod object by using + a valid command in the command attribute of the object. + + Check if your pod has a command set by inspecting the [Configuration"](/#/components/\{{ componentUrnForUrl \}}#configuration) on the pod, e.g.: + + ```markdown + apiVersion: v1 + kind: Pod + metadata: + name: nodeapp + labels: + app: nodeapp + spec: + containers: + - image: myimage/wrong-node-app + name: nodeapp + ports: + - containerPort: 80 + **command: ["node", "index.js"]** + ``` + + If the pod does not have a command set, check the container definition to see if an ENTRYPOINT is set, here you see an example without an existing ENTRYPOINT. + + if no exisiting ENTRYPOINT is set and the pod does not have a command the solution is to use a valid command in the pod definition: + + ```markdown + FROM ****node:16.3.0-alpine + WORKDIR /usr/src/app + COPY package*.json ./ + + RUN npm install + COPY . . + + EXPOSE 8080 + + **ENTRYPOINT []** + ``` + + ### Issues Mounting a Volume + + In the case of a `volume mount problem` the message of this monitor will give you a hint. For example, if you have a message like: + + ``` + Error: Error response from daemon: create \mnt\data: "\\mnt\\data" includes invalid characters for a local volume name, only "[a-zA-Z0-9][a-zA-Z0-9_.-]" are allowed. If you intended to pass a host directory, use absolute path + ``` + + In this case you should use a change the path in the PersistentVolume definition to a valid path. e.g. /mnt/data + + ### Container Runtime Not Cleaning Up Old Containers + + In this case you will see a message like: + + ``` + The container name "/myapp_ed236ae738" is already in use by container "22f4edaec41cb193857aefcead3b86cdb69edfd69b2ab57486dff63102b24d29". You have to remove (or rename) that container to be able to reuse that name. + ``` + + This is an indication that the [container runtime](https://kubernetes.io/docs/setup/production-environment/container-runtimes/) + doesn’t clean up old containers. + In this case the node should be removed from the cluster and the node container runtime should be reinstalled + (or be recreated). After that the node should be (re)assigned to the cluster. + + \{{/if\}} + \{{#if reasons.CrashLoopBackOff\}} + ## CrashLoopBackOff + + When a Kubernetes container has errors, it can enter into a state called CrashLoopBackOff, where Kubernetes attempts to restart the container to resolve the issue. + + The container will continue to restart until the problem is resolved. + + Take the following steps to diagnose the problem: + + ### Container Logs + Check the container logs for any explicit errors or warnings + + 1. Inspect the [Logs](/#/components/\{{ componentUrnForUrl \}}#logs) of all the containers in this pod. + 2. Scroll through it and validate if there is an excessive amount of errors. + 1. 
If a container is crashing due to an out-of-memory error, the logs may show errors related to memory allocation or exhaustion.
        - If this is the case, check whether the memory limits are too low; if so, you can raise them.
        - If the memory problem is not resolved, you might have introduced a memory leak; in that case, take a look at the last deployment.
        - If there are no limits, you might have a problem with the physical memory on the node running the pod.
      2. If a container is crashing due to a configuration error, the logs may show errors related to the incorrect configuration.

    ### Understand application

    It is important to understand what the intended behaviour of the application should be.
    A good place to start is the [configuration](/#/components/\{{ componentUrnForUrl\}}#configuration).
    Pay attention to environment variables and volume mounts, as these are the mechanisms used to configure the application.
    We can use references to ConfigMaps and Secrets to further explore configuration information.

    ### Pod Events
    Check the pod events to identify any explicit errors or warnings.
    1. Go to the [Pod events page](/#/components/\{{ componentUrnForUrl \}}/events).
    2. Check if there is a large number of events like `BackOff`, `FailedScheduling` or `FailedAttachVolume`.
    3. If this is the case, see if the event details (click on the event) contain more information about this issue.

    ### Recent Deployment
    Look at the pod age in the "About" section on the [Pod highlight page](/#/components/\{{ componentUrnForUrl \}}) to identify any recent deployments that might have caused the issue.

    1. The "Age" is shown in the "About" section on the left side of the screen.
    2. If the "Age" and the time that the monitor was triggered are in close proximity, then take a look at the most recent deployment by clicking on [Show last change](/#/components/\{{ componentUrnForUrl \}}#lastChange).
    \{{/if\}}
    \{{#if reasons.ImagePullBackOff\}}
    ## ImagePullBackOff

    If you see the "ImagePullBackOff" error message while trying to pull a container image from a registry, it means that
    the Docker engine was unable to pull the requested image for some reason.

    The reason field at the top of this monitor (full screen) might give you more information about the specific issue at hand.

    ## Diagnose

    To diagnose the problem, try the following actions:

    - Go to the [pod events page filtered by failed or unhealthy events](/#/components/\{{ componentUrnForUrl \}}/events?view=eventTypes--Unhealthy,Created,FailedMount,Failed)

    If there are no "Failed" events shown, increase the time range by clicking on the Zoom-out button next to the telemetry time interval on the bottom left of the timeline.

    On the left side of the [Pod highlight page](/#/components/\{{ componentUrnForUrl \}}), click on "Containers" in the "Related resources" section
    to view the `containers` and the `Image URL`.

    ## Common causes

    ### Rate Limit
    A Docker Hub rate limit has been reached.

    The typical resolution is to authenticate using Docker Hub credentials (this increases the rate limit from 100 to 200 pulls per 6 hours)
    or to get a paid account and authenticate with that (bumping the limit to 5000 pulls per day).

    ### Network connectivity issues
    Check your internet connection and the connection to the registry where the image is hosted.
+ + ### Authentication problems + If the registry requires authentication, make sure that your credentials are correct and that + you have the necessary permissions to access the image. + + ### Image availability + Verify that the image you are trying to pull exists in the registry and that you have specified the correct image name and tag. + + Here are some steps you can take to resolve the "ImagePullBackOff" error: + + 1. Check the registry logs for any error messages that might provide more information about the issue. + 2. Verify that the image exists in the registry and that you have the correct image name and tag. + 3. Check your network connectivity to ensure that you can reach the registry. + 4. Check the authentication credentials to ensure that they are correct and have the necessary permissions. + + If none of these steps work, you may need to consult the Docker documentation or contact support for the registry or Docker + itself for further assistance. + \{{/if\}} + \{{/if\}} + status: ENABLED + tags: + - pods + - containers +timestamp: 2024-10-17T10:15:31.714348Z[Etc/UTC] diff --git a/scripts/download.sh b/scripts/download.sh index 45717e8..7612b77 100644 --- a/scripts/download.sh +++ b/scripts/download.sh @@ -68,6 +68,8 @@ download() { rm -rf ${OUTPUT_FOLDER}/scripts fi mv ${GIT_REPO_NAME}-${GIT_FOLDER}/scripts ${OUTPUT_FOLDER} + mkdir -p ${OUTPUT_FOLDER}/assets + mv ${GIT_REPO_NAME}-${GIT_FOLDER}/assets ${OUTPUT_FOLDER}/assets } cleanup() { diff --git a/scripts/observability/monitors.sh b/scripts/observability/monitors.sh new file mode 100644 index 0000000..7bd8e23 --- /dev/null +++ b/scripts/observability/monitors.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +observability_disable_monitor() { + local url=$1 + local service_token=$2 + local monitor_identifier=$3 + /usr/local/bin/sts monitor disable --identifier $monitor_identifier --service-token $service_token --url $url +} + +observability_deploy_monitor() { + local url=$1 + local service_token=$2 + local file $3 + /usr/local/bin/sts monitor apply -f $file --service-token $service_token --url $url +} + +observability_enable_monitor() { + local url=$1 + local service_token=$2 + local monitor_identifier=$3 + /usr/local/bin/sts monitor enable --identifier $monitor_identifier --service-token $service_token --url $url +} + From 1affbcbaf1d4243c39e8179b236e12c042e45c1a Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 7 Nov 2024 12:31:22 +0100 Subject: [PATCH 10/13] Switched parameters Signed-off-by: Jeroen van Erp --- scripts/observability/monitors.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/observability/monitors.sh b/scripts/observability/monitors.sh index 7bd8e23..0aae47c 100644 --- a/scripts/observability/monitors.sh +++ b/scripts/observability/monitors.sh @@ -1,23 +1,23 @@ #!/bin/bash observability_disable_monitor() { - local url=$1 - local service_token=$2 - local monitor_identifier=$3 + local monitor_identifier=$1 + local url=$2 + local service_token=$3 /usr/local/bin/sts monitor disable --identifier $monitor_identifier --service-token $service_token --url $url } observability_deploy_monitor() { - local url=$1 - local service_token=$2 - local file $3 + local file $1 + local url=$2 + local service_token=$3 /usr/local/bin/sts monitor apply -f $file --service-token $service_token --url $url } observability_enable_monitor() { - local url=$1 - local service_token=$2 - local monitor_identifier=$3 + local monitor_identifier=$1 + local url=$2 + local service_token=$3 
/usr/local/bin/sts monitor enable --identifier $monitor_identifier --service-token $service_token --url $url } From a78615b8f891d0bd59e553a914d06e0e6c20088c Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 7 Nov 2024 15:30:33 +0100 Subject: [PATCH 11/13] Fix download script Signed-off-by: Jeroen van Erp --- scripts/download.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/download.sh b/scripts/download.sh index 7612b77..9d54438 100644 --- a/scripts/download.sh +++ b/scripts/download.sh @@ -68,8 +68,7 @@ download() { rm -rf ${OUTPUT_FOLDER}/scripts fi mv ${GIT_REPO_NAME}-${GIT_FOLDER}/scripts ${OUTPUT_FOLDER} - mkdir -p ${OUTPUT_FOLDER}/assets - mv ${GIT_REPO_NAME}-${GIT_FOLDER}/assets ${OUTPUT_FOLDER}/assets + mv ${GIT_REPO_NAME}-${GIT_FOLDER}/assets ${OUTPUT_FOLDER} } cleanup() { From 4b159679cd01fe3b51259fed56b9d354581f16e3 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 7 Nov 2024 17:59:32 +0100 Subject: [PATCH 12/13] Add fleet assets Signed-off-by: Jeroen van Erp --- assets/fleet/clustergroup.yaml | 16 ++++++++++++++++ assets/fleet/gitrepo.yaml | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 assets/fleet/clustergroup.yaml create mode 100644 assets/fleet/gitrepo.yaml diff --git a/assets/fleet/clustergroup.yaml b/assets/fleet/clustergroup.yaml new file mode 100644 index 0000000..b51a22b --- /dev/null +++ b/assets/fleet/clustergroup.yaml @@ -0,0 +1,16 @@ +apiVersion: fleet.cattle.io/v1alpha1 +kind: ClusterGroup +metadata: + name: build-a-dino + annotations: + {} + # key: string + labels: + {} + # key: string + namespace: fleet-default +spec: + selector: + matchLabels: + gpu-enabled: 'true' + app: build-a-dino diff --git a/assets/fleet/gitrepo.yaml b/assets/fleet/gitrepo.yaml new file mode 100644 index 0000000..7d3702e --- /dev/null +++ b/assets/fleet/gitrepo.yaml @@ -0,0 +1,24 @@ +apiVersion: fleet.cattle.io/v1alpha1 +kind: GitRepo +metadata: + name: build-a-dino + annotations: + {} + # key: string + labels: + {} + # key: string + namespace: fleet-default +spec: + branch: main + correctDrift: + enabled: true +# force: boolean +# keepFailHistory: boolean + insecureSkipTLSVerify: false + paths: + - /fleet/build-a-dino +# - string + repo: https://github.com/wiredquill/prime-rodeo + targets: + - clusterGroup: build-a-dino From 05e0290f168c6c60308f1eda75644c71d05db235 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 12 Nov 2024 06:50:11 -0700 Subject: [PATCH 13/13] Added cpu throttling monitor in assets Signed-off-by: Jeroen van Erp --- assets/monitors/cpu-throttling.yaml | 85 +++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 assets/monitors/cpu-throttling.yaml diff --git a/assets/monitors/cpu-throttling.yaml b/assets/monitors/cpu-throttling.yaml new file mode 100644 index 0000000..0164a01 --- /dev/null +++ b/assets/monitors/cpu-throttling.yaml @@ -0,0 +1,85 @@ +nodes: +- _type: Monitor + arguments: + comparator: GT + failureState: DEVIATING + metric: + aliasTemplate: CPU Throttling for ${container} of ${pod_name} + query: 100 * sum by (cluster_name, namespace, pod_name, container) (container_cpu_throttled_periods{}) + / sum by (cluster_name, namespace, pod_name, container) (container_cpu_elapsed_periods{}) + unit: percent + threshold: 95.0 + urnTemplate: urn:kubernetes:/${cluster_name}:${namespace}:pod/${pod_name} + description: |- + In Kubernetes, CPU throttling refers to the process where limits are applied to the amount of CPU resources a container can use. 
    This typically occurs when a container approaches the maximum CPU resources allocated to it, causing the system to throttle or restrict
    its CPU usage to prevent a crash.

    While CPU throttling can help maintain system stability by avoiding crashes due to CPU exhaustion, it can also significantly slow down workload
    performance. Ideally, CPU throttling should be avoided by ensuring that containers have access to sufficient CPU resources.
    This proactive approach helps maintain optimal performance and prevents the slowdown associated with throttling.
  function: {{ get "urn:stackpack:common:monitor-function:threshold" }}
  id: -13
  identifier: urn:custom:monitor:pod-cpu-throttling-v2
  intervalSeconds: 60
  name: CPU Throttling V2
  remediationHint: |-

    ### Application behaviour

    Check the container [Logs](/#/components/\{{ componentUrnForUrl \}}#logs) for any hints on how the application is behaving under CPU throttling.

    ### Understanding CPU Usage and CPU Throttling

    On the [pod metrics page](/#/components/\{{ componentUrnForUrl \}}/metrics) you will find the CPU Usage and CPU Throttling charts.

    #### CPU Throttling

    The percentage of CPU throttling over time. CPU throttling occurs when a container reaches its CPU limit, restricting its CPU usage to
    prevent it from exceeding the specified limit. The higher the percentage, the more throttling is occurring, which means the container's
    performance is being constrained.

    #### CPU Usage

    This chart shows three key CPU metrics over time:

    1. Request: The amount of CPU the container requests as its minimum requirement. This sets the baseline CPU resources the container is guaranteed to receive.
    2. Limit: The maximum amount of CPU the container can use. If the container's usage reaches this limit, throttling will occur.
    3. Current: The actual CPU usage of the container in real-time.

    The `Request` and `Limit` settings of the container can be seen in the `Resources` section of the [configuration](/#/components/\{{ componentUrnForUrl\}}#configuration).

    #### Correlation

    The two charts are correlated in the following way:

    - As the `Current` CPU usage approaches the CPU `Limit`, the CPU throttling percentage increases. This is because the container tries to use more CPU than it is allowed, and the system restricts it, causing throttling.
    - The aim is to keep the `Current` usage below the `Limit` to minimize throttling. If you see frequent high percentages in the CPU throttling chart, it suggests that you may need to adjust the CPU limits or optimize the container's workload to reduce CPU demand.

    ### Adjust CPU Requests and Limits

    Start on the [pod highlights page](/#/components/\{{ componentUrnForUrl \}}/highlights) by checking whether a `Deployment` event happened recently after which the CPU usage behaviour changed.

    You can investigate which change led to the CPU throttling by checking the [Show last change](/#/components/\{{ componentUrnForUrl \}}#lastChange),
    which will highlight the latest changeset for the deployment. You can then revert the change or fix the CPU request and limit.

    Review the pod's resource requests and limits to ensure they are set appropriately,
    as shown in the component [configuration](/#/components/\{{ componentUrnForUrl \}}#configuration).

    If the CPU usage consistently hits the limit, consider increasing the CPU limit of the pod.
+ Edit the pod or deployment configuration file to modify the `resources.limits.cpu` and `resources.requests.cpu` as needed. + ``` + resources: + requests: + cpu: "500m" # Adjust this value based on analysis + limits: + cpu: "1" # Adjust this value based on analysis + ``` + If CPU throttling persists, consider horizontal pod autoscaling to distribute the workload across more pods, or adjust the cluster's node resources to meet the demands. Continuously monitor and fine-tune resource settings to optimize performance and prevent further throttling issues. + status: ENABLED + tags: + - cpu + - performance + - pod
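
Usage note: the observability helpers introduced in this series are meant to be composed. Below is a minimal sketch of how the monitor assets could be rolled out with them; the URL, admin service token, and cluster name are placeholder values, the scripts are assumed to be sourced into the same shell, and the monitor file is passed as the first argument to `observability_deploy_monitor` as in PATCH 10.

```bash
#!/bin/bash
# Sketch only: wires together the helpers from this patch series.
# OBS_URL, ADMIN_TOKEN and CLUSTER_NAME are illustrative placeholders.
source scripts/observability/cli.sh
source scripts/observability/service_token.sh
source scripts/observability/monitors.sh

OBS_URL="https://obs.example.com"
ADMIN_TOKEN="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"   # existing SUSE Observability service token
CLUSTER_NAME="lab-cluster"

# Install the sts CLI if it is not already present
observability_install_cli

# Create a scoped service token for this cluster (e.g. for the cluster agent)
CLUSTER_TOKEN=$(observability_create_service_token "$OBS_URL" "$ADMIN_TOKEN" "$CLUSTER_NAME" stackstate-k8s-troubleshooter)
echo "Service token for ${CLUSTER_NAME}: ${CLUSTER_TOKEN}"

# Deploy and enable the monitors shipped as assets, using the admin token
observability_deploy_monitor assets/monitors/pods-in-waiting-state.yaml "$OBS_URL" "$ADMIN_TOKEN"
observability_deploy_monitor assets/monitors/cpu-throttling.yaml "$OBS_URL" "$ADMIN_TOKEN"
observability_enable_monitor urn:custom:monitor:pods-in-waiting-state-v2 "$OBS_URL" "$ADMIN_TOKEN"
observability_enable_monitor urn:custom:monitor:pod-cpu-throttling-v2 "$OBS_URL" "$ADMIN_TOKEN"

# When the cluster is torn down, remove its token again
observability_delete_service_token "$OBS_URL" "$ADMIN_TOKEN" "$CLUSTER_NAME"
```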