Skip to content

Commit ed35937

Browse files
authored
K8SPS-73 - Add self healing tests (#424)
* K8SPS-73 - Add operator-self-healing test * K8SPS-73 - Add self-healing test * K8SPS-73 - Add gr-self-healing test * Some fixes for self-healing tests * Remove OPERATOR_NS from Jenkinsfile since not supported yet * Fix operator-self-healing test * Remove self-healing test which will be merged after fixes for K8SPS-288 and K8SPS-289
1 parent 20484fa commit ed35937

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

63 files changed

+2046
-22
lines changed

Jenkinsfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ void prepareNode() {
275275
# v0.15.0 kuttl version
276276
kubectl krew install --manifest-url https://raw.githubusercontent.com/kubernetes-sigs/krew-index/a67f31ecb2e62f15149ca66d096357050f07b77d/plugins/kuttl.yaml
277277
printf "%s is installed" "$(kubectl kuttl --version)"
278+
kubectl krew install assert
278279
'''
279280
}
280281

@@ -287,7 +288,6 @@ pipeline {
287288
environment {
288289
CLOUDSDK_CORE_DISABLE_PROMPTS = 1
289290
CLEAN_NAMESPACE = 1
290-
OPERATOR_NS = 'ps-operator'
291291
GIT_SHORT_COMMIT = sh(script: 'git rev-parse --short HEAD', returnStdout: true).trim()
292292
VERSION = "${env.GIT_BRANCH}-${env.GIT_SHORT_COMMIT}"
293293
CLUSTER_NAME = sh(script: "echo jen-ps-${env.CHANGE_ID}-${GIT_SHORT_COMMIT}-${env.BUILD_NUMBER} | tr '[:upper:]' '[:lower:]'", returnStdout: true).trim()

e2e-tests/conf/chaos-network-loss.yml

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
apiVersion: chaos-mesh.org/v1alpha1
2+
kind: NetworkChaos
3+
metadata:
4+
name: network-loss-example
5+
spec:
6+
action: loss
7+
mode: one
8+
selector:
9+
pods:
10+
test-namespace:
11+
- pod-name
12+
loss:
13+
loss: "100"
14+
correlation: "100"
15+
duration: "60s"

e2e-tests/conf/chaos-pod-failure.yml

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
apiVersion: chaos-mesh.org/v1alpha1
2+
kind: PodChaos
3+
metadata:
4+
name: pod-failure-example
5+
spec:
6+
action: pod-failure
7+
mode: one
8+
value: ""
9+
duration: "60s"
10+
selector:
11+
pods:
12+
test-namespace:
13+
- pod-name

e2e-tests/conf/chaos-pod-kill.yml

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
apiVersion: chaos-mesh.org/v1alpha1
2+
kind: PodChaos
3+
metadata:
4+
name: pod-kill-example
5+
spec:
6+
action: pod-kill
7+
mode: one
8+
selector:
9+
pods:
10+
test-namespace:
11+
- pod-name

e2e-tests/functions

+123-21
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ deploy_pmm_server() {
5050
--set platform="${platform}" \
5151
"https://percona-charts.storage.googleapis.com/pmm-server-${PMM_SERVER_VERSION}.tgz"
5252
fi
53-
SERVICE="postgres"
53+
local SERVICE="postgres"
5454
until kubectl -n "${NAMESPACE}" exec monitoring-0 -- bash -c "pgrep -x $SERVICE >/dev/null"; do
5555
echo "Retry $retry"
5656
sleep 5
@@ -63,13 +63,13 @@ deploy_pmm_server() {
6363
}
6464

6565
get_pmm_api_key() {
66-
ADMIN_PASSWORD=$(kubectl -n "${NAMESPACE}" exec monitoring-0 -- bash -c "printenv | grep ADMIN_PASSWORD | cut -d '=' -f2")
66+
local ADMIN_PASSWORD=$(kubectl -n "${NAMESPACE}" exec monitoring-0 -- bash -c "printenv | grep ADMIN_PASSWORD | cut -d '=' -f2")
6767
echo $(curl --insecure -X POST -H "Content-Type: application/json" -d '{"name":"operator", "role": "Admin"}' "https://admin:$ADMIN_PASSWORD@"$(get_service_ip monitoring-service)"/graph/api/auth/keys" | jq .key)
6868
}
6969

7070
deploy_minio() {
71-
accessKey="$(kubectl -n "${NAMESPACE}" get secret minio-secret -o jsonpath='{.data.AWS_ACCESS_KEY_ID}' | base64 -d)"
72-
secretKey="$(kubectl -n "${NAMESPACE}" get secret minio-secret -o jsonpath='{.data.AWS_SECRET_ACCESS_KEY}' | base64 -d)"
71+
local accessKey="$(kubectl -n "${NAMESPACE}" get secret minio-secret -o jsonpath='{.data.AWS_ACCESS_KEY_ID}' | base64 -d)"
72+
local secretKey="$(kubectl -n "${NAMESPACE}" get secret minio-secret -o jsonpath='{.data.AWS_SECRET_ACCESS_KEY}' | base64 -d)"
7373

7474
helm uninstall -n "${NAMESPACE}" minio-service || :
7575
helm repo remove minio || :
@@ -312,6 +312,7 @@ get_mysql_users() {
312312

313313
get_service_ip() {
314314
local service=$1
315+
315316
while (kubectl get service/$service -n "${NAMESPACE}" -o 'jsonpath={.spec.type}' 2>&1 || :) | grep -q NotFound; do
316317
sleep 1
317318
done
@@ -392,16 +393,43 @@ wait_pod() {
392393
set -o xtrace
393394
}
394395

396+
wait_deployment() {
397+
local name=$1
398+
local target_namespace=${2:-"$namespace"}
399+
400+
sleep 10
401+
set +o xtrace
402+
retry=0
403+
echo -n $name
404+
until [ -n "$(kubectl -n ${target_namespace} get deployment $name -o jsonpath='{.status.replicas}')" \
405+
-a "$(kubectl -n ${target_namespace} get deployment $name -o jsonpath='{.status.replicas}')" \
406+
== "$(kubectl -n ${target_namespace} get deployment $name -o jsonpath='{.status.readyReplicas}')" ]; do
407+
sleep 1
408+
echo -n .
409+
let retry+=1
410+
if [ $retry -ge 360 ]; then
411+
kubectl logs $(get_operator_pod) -c operator \
412+
| grep -v 'level=info' \
413+
| grep -v 'level=debug' \
414+
| tail -100
415+
echo max retry count $retry reached. something went wrong with operator or kubernetes cluster
416+
exit 1
417+
fi
418+
done
419+
echo
420+
set -o xtrace
421+
}
422+
395423
check_auto_tuning() {
396-
RAM_SIZE=$1
397-
RDS_MEM_INSTANCE=12582880
398-
CUSTOM_INNODB_SIZE=$2
399-
CUSTOM_CONNECTIONS=$3
424+
local RAM_SIZE=$1
425+
local RDS_MEM_INSTANCE=12582880
426+
local CUSTOM_INNODB_SIZE=$2
427+
local CUSTOM_CONNECTIONS=$3
400428

401-
INNODB_SIZE=$(run_mysql \
429+
local INNODB_SIZE=$(run_mysql \
402430
'SELECT @@innodb_buffer_pool_size;' \
403431
"-h $(get_haproxy_svc "$(get_cluster_name)") -uroot -proot_password")
404-
CONNECTIONS=$(run_mysql \
432+
local CONNECTIONS=$(run_mysql \
405433
'SELECT @@max_connections;' \
406434
"-h $(get_haproxy_svc "$(get_cluster_name)") -uroot -proot_password")
407435

@@ -461,12 +489,15 @@ get_primary_from_haproxy() {
461489
run_mysql "SHOW VARIABLES LIKE '%hostname%';" "-h ${haproxy_pod_ip} -P3306 -uroot -proot_password" | awk '{print $2}'
462490
}
463491

492+
get_primary_from_group_replication() {
493+
run_mysql "SELECT MEMBER_HOST FROM performance_schema.replication_group_members where MEMBER_ROLE='PRIMARY';" "-h $(get_mysql_router_service $(get_cluster_name)) -P 6446 -uroot -proot_password" | cut -d'.' -f1
494+
}
495+
464496
verify_certificate_sans() {
465497
local certificate=$1
466498
local expected_sans=$2
467-
468-
have=$(mktemp)
469-
want=$(mktemp)
499+
local have=$(mktemp)
500+
local want=$(mktemp)
470501

471502
kubectl -n "${NAMESPACE}" get certificate "${certificate}" -o jsonpath='{.spec.dnsNames}' | jq '.' >"${have}"
472503
echo "${expected_sans}" | jq '.' >"${want}"
@@ -475,21 +506,19 @@ verify_certificate_sans() {
475506
}
476507

477508
check_passwords_leak() {
478-
479-
secrets=$(kubectl get secrets -o json | jq -r '.items[].data | to_entries | .[] | select(.key | (endswith(".crt") or endswith(".key") or endswith(".pub") or endswith(".pem") or endswith(".p12")) | not) | .value')
480-
481-
passwords="$(for i in $secrets; do base64 -d <<< $i; echo; done) $secrets"
482-
pods=$(kubectl -n "${NAMESPACE}" get pods -o name | awk -F "/" '{print $2}')
509+
local secrets=$(kubectl get secrets -o json | jq -r '.items[].data | to_entries | .[] | select(.key | (endswith(".crt") or endswith(".key") or endswith(".pub") or endswith(".pem") or endswith(".p12")) | not) | .value')
510+
local passwords="$(for i in $secrets; do base64 -d <<< $i; echo; done) $secrets"
511+
local pods=$(kubectl -n "${NAMESPACE}" get pods -o name | awk -F "/" '{print $2}')
483512

484513
collect_logs() {
485514
NS=$1
486515
for p in $pods; do
487-
containers=$(kubectl -n "$NS" get pod $p -o jsonpath='{.spec.containers[*].name}')
516+
local containers=$(kubectl -n "$NS" get pod $p -o jsonpath='{.spec.containers[*].name}')
488517
for c in $containers; do
489518
kubectl -n "$NS" logs $p -c $c >${TEMP_DIR}/logs_output-$p-$c.txt
490519
echo logs saved in: ${TEMP_DIR}/logs_output-$p-$c.txt
491520
for pass in $passwords; do
492-
count=$(grep -c --fixed-strings -- "$pass" ${TEMP_DIR}/logs_output-$p-$c.txt || :)
521+
local count=$(grep -c --fixed-strings -- "$pass" ${TEMP_DIR}/logs_output-$p-$c.txt || :)
493522
if [[ $count != 0 ]]; then
494523
echo leaked passwords are found in log ${TEMP_DIR}/logs_output-$p-$c.txt
495524
false
@@ -502,7 +531,80 @@ check_passwords_leak() {
502531

503532
collect_logs $NAMESPACE
504533
if [ -n "$OPERATOR_NS" ]; then
505-
pods=$(kubectl -n "${OPERATOR_NS}" get pods -o name | awk -F "/" '{print $2}')
534+
local pods=$(kubectl -n "${OPERATOR_NS}" get pods -o name | awk -F "/" '{print $2}')
506535
collect_logs $OPERATOR_NS
507536
fi
508537
}
538+
539+
deploy_chaos_mesh() {
540+
destroy_chaos_mesh
541+
542+
helm repo add chaos-mesh https://charts.chaos-mesh.org
543+
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${NAMESPACE} --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/run/containerd/containerd.sock --set dashboard.create=false --version 2.5.1
544+
sleep 10
545+
}
546+
547+
destroy_chaos_mesh() {
548+
local chaos_mesh_ns=$(helm list --all-namespaces --filter chaos-mesh | tail -n1 | awk -F' ' '{print $2}' | sed 's/NAMESPACE//')
549+
550+
for i in $(kubectl api-resources | grep chaos-mesh | awk '{print $1}'); do timeout 30 kubectl delete ${i} --all --all-namespaces || :; done
551+
if [ -n "${chaos_mesh_ns}" ]; then
552+
helm uninstall chaos-mesh --namespace ${chaos_mesh_ns} || :
553+
fi
554+
timeout 30 kubectl delete crd $(kubectl get crd | grep 'chaos-mesh.org' | awk '{print $1}') || :
555+
timeout 30 kubectl delete clusterrolebinding $(kubectl get clusterrolebinding | grep 'chaos-mesh' | awk '{print $1}') || :
556+
timeout 30 kubectl delete clusterrole $(kubectl get clusterrole | grep 'chaos-mesh' | awk '{print $1}') || :
557+
timeout 30 kubectl delete MutatingWebhookConfiguration $(kubectl get MutatingWebhookConfiguration | grep 'chaos-mesh' | awk '{print $1}') || :
558+
timeout 30 kubectl delete ValidatingWebhookConfiguration $(kubectl get ValidatingWebhookConfiguration | grep 'chaos-mesh' | awk '{print $1}') || :
559+
timeout 30 kubectl delete ValidatingWebhookConfiguration $(kubectl get ValidatingWebhookConfiguration | grep 'validate-auth' | awk '{print $1}') || :
560+
}
561+
562+
kill_pods() {
563+
local ns=$1
564+
local selector=$2
565+
local pod_label=$3
566+
local label_value=$4
567+
local chaos_suffix=$5
568+
569+
if [ "${selector}" == "pod" ]; then
570+
yq eval '
571+
.metadata.name = "chaos-pod-kill-'${chaos_suffix}'" |
572+
del(.spec.selector.pods.test-namespace) |
573+
.spec.selector.pods.'${ns}'[0] = "'${pod_label}'"' ${TESTS_CONFIG_DIR}/chaos-pod-kill.yml \
574+
| kubectl apply --namespace ${ns} -f -
575+
elif [ "${selector}" == "label" ]; then
576+
yq eval '
577+
.metadata.name = "chaos-kill-label-'${chaos_suffix}'" |
578+
.spec.mode = "all" |
579+
del(.spec.selector.pods) |
580+
.spec.selector.labelSelectors."'${pod_label}'" = "'${label_value}'"' ${TESTS_CONFIG_DIR}/chaos-pod-kill.yml \
581+
| kubectl apply --namespace ${ns} -f -
582+
fi
583+
sleep 5
584+
}
585+
586+
failure_pod() {
587+
local ns=$1
588+
local pod=$2
589+
local chaos_suffix=$3
590+
591+
yq eval '
592+
.metadata.name = "chaos-pod-failure-'${chaos_suffix}'" |
593+
del(.spec.selector.pods.test-namespace) |
594+
.spec.selector.pods.'${ns}'[0] = "'${pod}'"' ${TESTS_CONFIG_DIR}/chaos-pod-failure.yml \
595+
| kubectl apply --namespace ${ns} -f -
596+
sleep 5
597+
}
598+
599+
network_loss() {
600+
local ns=$1
601+
local pod=$2
602+
local chaos_suffix=$3
603+
604+
yq eval '
605+
.metadata.name = "chaos-pod-network-loss-'${chaos_suffix}'" |
606+
del(.spec.selector.pods.test-namespace) |
607+
.spec.selector.pods.'${ns}'[0] = "'${pod}'"' ${TESTS_CONFIG_DIR}/chaos-network-loss.yml \
608+
| kubectl apply --namespace ${ns} -f -
609+
sleep 5
610+
}

e2e-tests/run-distro.csv

+2
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@ gr-haproxy
77
gr-init-deploy
88
gr-one-pod
99
gr-scaling
10+
gr-self-healing
1011
gr-tls-cert-manager
1112
gr-users
1213
haproxy
1314
init-deploy
1415
monitoring
1516
one-pod
17+
operator-self-healing
1618
scaling
1719
service-per-pod
1820
sidecars

e2e-tests/run-minikube.csv

+2
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@ gr-haproxy
77
gr-init-deploy
88
gr-one-pod
99
gr-scaling
10+
gr-self-healing
1011
gr-tls-cert-manager
1112
gr-users
1213
haproxy
1314
init-deploy
1415
one-pod
16+
operator-self-healing
1517
sidecars
1618
smart-update
1719
tls-cert-manager

e2e-tests/run-pr.csv

+2
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@ gr-ignore-annotations
1010
gr-init-deploy
1111
gr-one-pod
1212
gr-scaling
13+
gr-self-healing
1314
gr-tls-cert-manager
1415
gr-users
1516
haproxy
1617
init-deploy
1718
limits
1819
monitoring
1920
one-pod
21+
operator-self-healing
2022
scaling
2123
service-per-pod
2224
sidecars

e2e-tests/run-release.csv

+2
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@ gr-ignore-annotations
99
gr-init-deploy
1010
gr-one-pod
1111
gr-scaling
12+
gr-self-healing
1213
gr-tls-cert-manager
1314
gr-users
1415
haproxy
1516
init-deploy
1617
limits
1718
monitoring
1819
one-pod
20+
operator-self-healing
1921
scaling
2022
service-per-pod
2123
sidecars
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
apiVersion: kuttl.dev/v1beta1
2+
kind: TestAssert
3+
timeout: 120
4+
---
5+
apiVersion: apiextensions.k8s.io/v1
6+
kind: CustomResourceDefinition
7+
metadata:
8+
name: perconaservermysqls.ps.percona.com
9+
spec:
10+
group: ps.percona.com
11+
names:
12+
kind: PerconaServerMySQL
13+
listKind: PerconaServerMySQLList
14+
plural: perconaservermysqls
15+
shortNames:
16+
- ps
17+
singular: perconaservermysql
18+
scope: Namespaced
19+
---
20+
apiVersion: kuttl.dev/v1beta1
21+
kind: TestAssert
22+
metadata:
23+
name: check-operator-deploy-status
24+
timeout: 120
25+
commands:
26+
- script: kubectl assert exist-enhanced deployment percona-server-mysql-operator -n ${OPERATOR_NS:-$NAMESPACE} --field-selector status.readyReplicas=1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
apiVersion: kuttl.dev/v1beta1
2+
kind: TestStep
3+
timeout: 10
4+
commands:
5+
- script: |-
6+
set -o errexit
7+
set -o xtrace
8+
9+
source ../../functions
10+
11+
deploy_operator
12+
deploy_non_tls_cluster_secrets
13+
deploy_tls_cluster_secrets
14+
deploy_client

0 commit comments

Comments
 (0)