Skip to content

Commit

Permalink
Add e2e CI
Browse files Browse the repository at this point in the history
Signed-off-by: Renaud Gaubert <[email protected]>
  • Loading branch information
Renaud Gaubert committed Jun 1, 2020
1 parent 4fbdafd commit 77083c5
Show file tree
Hide file tree
Showing 9 changed files with 273 additions and 6 deletions.
45 changes: 41 additions & 4 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,16 @@ services:
- docker:dind

stages:
- build
- release
- aws_kube_setup
- e2e_tests
- aws_kube_clean

variables:
GIT_SUBMODULE_STRATEGY: recursive
TF_VAR_FILE: "$CI_PROJECT_DIR/tests/variables.tfvars"

build:
stage: build
stage: aws_kube_setup
script:
- apk add make
- docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
Expand All @@ -16,9 +21,36 @@ build:
- make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push
- make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push-short
- make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push-latest
- make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push-ci

# Cluster provisioning job. Its definition comes entirely from the hidden
# `.aws_kube_setup` template; that template is not defined in this file
# (presumably supplied by the included aws-kube-ci.yml -- TODO confirm).
# Restricted to the master branch and to tags.
aws_kube_setup:
extends: .aws_kube_setup
only:
- master
- tags

# End-to-end test job: runs in the e2e_tests stage, on the master branch only.
e2e:
stage: e2e_tests
only:
- master
script:
# Load the file written under aws-kube-ci/ (presumably it defines the
# `instance_hostname` variable used below -- TODO confirm against aws-kube-ci).
- source aws-kube-ci/hostname
- apk add --no-cache openssh-client rsync
# Copy the whole project directory (minus vendor/) into the remote home directory.
- rsync -e "ssh -i aws-kube-ci/key -o StrictHostKeyChecking=no" -av --exclude="vendor/" "${CI_PROJECT_DIR}" "${instance_hostname}:~/"
# NOTE(review): `rc` is assigned here but never read anywhere in this job.
- rc=0
# Run the e2e script on the remote instance under sudo, forwarding the commit
# SHA and registry image through exported variables (`sudo -E` keeps them).
- ssh -i aws-kube-ci/key ${instance_hostname} \
"export CI_COMMIT_SHORT_SHA=${CI_COMMIT_SHORT_SHA} &&
export CI_REGISTRY_IMAGE=${CI_REGISTRY_IMAGE} &&
cd ~/gpu-monitoring-tools && sudo -E ./tests/ci-run-e2e.sh"

# Cluster teardown job. Its definition comes entirely from the hidden
# `.aws_kube_clean` template; that template is not defined in this file
# (presumably supplied by the included aws-kube-ci.yml -- TODO confirm).
# Restricted to the master branch and to tags.
aws_kube_clean:
extends: .aws_kube_clean
only:
- master
- tags

release:
stage: release
stage: aws_kube_clean
only:
- tags
script:
Expand All @@ -29,3 +61,8 @@ release:
- make VERSION="${CI_COMMIT_TAG}" push
- make VERSION="${CI_COMMIT_TAG}" push-short
- make VERSION="${CI_COMMIT_TAG}" push-latest

# Pull in aws-kube-ci.yml from the aws-kube-ci project, pinned to ref 20.05.20.
include:
project: nvidia/container-infrastructure/aws-kube-ci
file: aws-kube-ci.yml
ref: 20.05.20
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "aws-kube-ci"]
path = aws-kube-ci
url = git@gitlab.com:nvidia/container-infrastructure/aws-kube-ci.git
url = https://gitlab.com/nvidia/container-infrastructure/aws-kube-ci.git
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ push-short:
$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(DCGM_VERSION)"
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(DCGM_VERSION)"

# push-ci: re-tag the $(FULL_VERSION)-ubuntu18.04 dcgm-exporter image as
# $(REGISTRY)/dcgm-exporter:$(VERSION) and push that tag. The CI build job
# invokes it with VERSION set to the short commit SHA.
push-ci:
$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(VERSION)"
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)"

push-latest:
$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:latest"
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:latest"
Expand Down
3 changes: 2 additions & 1 deletion RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ The DCGM container possesses three major components:
- The Exporter Version (e.g: 2.0.0)
- The platform of the container (e.g: ubuntu18.04)

The overall version of the Vulkan container has two forms:
The overall version of the DCGM container has four forms:
- The long form: `${DCGM_VERSION}-${EXPORTER_VERSION}-${PLATFORM}`
- The short form: `${DCGM_VERSION}`
- The latest tag: `latest`
- The commit form: `${CI_COMMIT_SHORT_SHA}` (only available on the GitLab registry)

The long form is a unique tag that once pushed will always refer to the same container.
This means that no updates will be made to that tag and it will always point to the same container.
Expand Down
85 changes: 85 additions & 0 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#! /bin/bash -x
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Strict mode: stop on errors, unset variables and failed pipelines, and trace
# every command as it runs.
set -o errexit -o nounset -o xtrace -o pipefail
shopt -s lastpipe

# Directory containing this script; the helper libraries sit next to it.
readonly basedir="$(dirname "$(realpath "$0")")"

# shellcheck source=tests/common.sh
. "${basedir}/common.sh"

# shellcheck source=tests/metrics.sh
. "${basedir}/metrics.sh"

# Fall back to the placeholder "undefined" when the CI variables are unset or empty.
: "${CI_REGISTRY_IMAGE:=undefined}"
: "${CI_COMMIT_SHORT_SHA:=undefined}"

# Install jq from the distribution repositories.
#
# Uses apt-get rather than apt: the apt front-end is meant for interactive use
# and does not guarantee a stable command-line interface for scripts.
install::jq() {
    apt-get update && apt-get install -y --no-install-recommends jq
}

# Install Helm 3 by running the upstream get-helm-3 installer script.
#
# -f makes curl exit non-zero on an HTTP error instead of handing the error
# page to bash; -sS silences progress output but keeps error messages; -L
# follows redirects.
install::helm() {
    curl -fsSL https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
}

# Package the dcgm-exporter chart and install it as release "dcgm-exporter",
# pointing the image at the repository/tag given by CI_REGISTRY_IMAGE and
# CI_COMMIT_SHORT_SHA and setting serviceMonitor=true.
install::dcgm::exporter() {
    helm package deployment/dcgm-exporter

    # The glob is expanded here, after `helm package` has written the archive.
    local -a helm_args=(
        --wait dcgm-exporter ./*.tgz
        --set "image.repository=${CI_REGISTRY_IMAGE}/dcgm-exporter"
        --set "image.tag=${CI_COMMIT_SHORT_SHA}"
        --set "serviceMonitor=true"
    )
    helm install "${helm_args[@]}"
}

# Register the "stable" chart repository and install the prometheus-operator
# chart from it under a generated release name, waiting for it to become ready.
install::prom() {
    local -r charts_url="https://kubernetes-charts.storage.googleapis.com"
    local -a install_args=(
        --wait stable/prometheus-operator --generate-name
        --set "prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false"
    )

    helm repo add stable "${charts_url}"
    helm install "${install_args[@]}"
}

# Run an instant query against the in-cluster Prometheus service.
#
# Arguments:
#   $1  query expression, sent as the `query` parameter of /api/v1/query
# Outputs:
#   the `.data.result` JSON array on stdout
# Returns:
#   0 when the result is non-empty; 1 when the response carries no result
#   (missing/null `.data.result`, empty output, or an empty array).
query::prom() {
    local ip val

    ip="$(kubectl get svc -l app=prometheus-operator-prometheus -o jsonpath='{.items[0].spec.clusterIP}')"

    # -G with --data-urlencode keeps the request a GET while percent-encoding
    # the expression, so queries containing brackets, braces or spaces work.
    # `// empty` turns a missing/null result into no output instead of the
    # literal string "null", which would otherwise be reported as success.
    val="$(curl -sL -G --data-urlencode "query=$1" "http://${ip}:9090/api/v1/query" \
        | jq -r '.data.result // empty')"

    [ "${val}" != "" ] || return 1
    [ "${val}" != "[]" ] || return 1

    echo "$val"
}

# Succeed only when pod $1 currently reports phase $2 (e.g. "Running").
query::pod::phase() {
    state="$(kubectl get pods "$1" -o jsonpath='{.status.phase}')"

    if [ "$state" != "$2" ]; then
        return 1
    fi
}

# Print cluster state for debugging: the pod, service and serviceMonitor
# listings, followed by the full YAML of the dcgm-exporter pods.
testing::log::kube() {
    local resource

    for resource in pods svc serviceMonitor; do
        kubectl get "${resource}"
    done

    kubectl get pods -l "app.kubernetes.io/component=dcgm-exporter" -o yaml
}

# Install the tooling and cluster components the tests rely on, in order.
for installer in jq helm prom dcgm::exporter; do
    "install::${installer}"
done

# From this point on, print cluster state when the ERR trap fires.
trap 'testing::log::kube' ERR

# Every test case provides testing::<name>::setup, ::main and ::cleanup, which
# are run in that order with the script's own arguments.
for test_case in "metrics"; do
    log INFO "=================Testing ${test_case}================="
    for phase in setup main cleanup; do
        "testing::${test_case}::${phase}" "$@"
    done
done

69 changes: 69 additions & 0 deletions tests/common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#! /bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Record whether stderr (file descriptor 2) is a terminal: LOG_TTY=1 when it
# is, LOG_NO_TTY=1 otherwise. Both are declared read-only.
# shellcheck disable=SC2015
[ -t 2 ] && readonly LOG_TTY=1 || readonly LOG_NO_TTY=1

# Define the FMT_* escape sequences only when logging to a terminal for which
# `tput colors` reports at least 15 colours. In every other case the FMT_*
# variables stay unset, which is why log() expands them with a `-` default.
if [ "${LOG_TTY-0}" -eq 1 ] && [ "$(tput colors)" -ge 15 ]; then
readonly FMT_BOLD=$(tput bold)
readonly FMT_RED=$(tput setaf 1)
readonly FMT_YELLOW=$(tput setaf 3)
readonly FMT_BLUE=$(tput setaf 12)
readonly FMT_CLEAR=$(tput sgr0)
fi

# Write a "[LEVEL] message" line to stderr.
#
# Arguments:
#   $1    level tag; INFO, WARN and ERROR select a colour when the FMT_*
#         variables are defined, any other tag uses the reset sequence
#   $2..  message words, joined with spaces; backslash escapes in the message
#         are interpreted (printf %b)
log() {
    local -r severity="$1"
    shift
    local -r text="$*"

    local -r reset="${FMT_CLEAR-}"
    local colour="${reset}"

    if [[ "${severity}" == "INFO" ]]; then
        colour="${FMT_BLUE-}"
    elif [[ "${severity}" == "WARN" ]]; then
        colour="${FMT_YELLOW-}"
    elif [[ "${severity}" == "ERROR" ]]; then
        colour="${FMT_RED-}"
    fi

    printf "%s[%s]%s %b\n" "${colour}" "${severity}" "${reset}" "${text}" >&2
}

# Run a command repeatedly until it succeeds or the attempt budget is used up.
#
# Arguments:
#   $1    max_attempts: maximum number of attempts; a value <= 0 retries forever
#   $2    delay: argument handed to `sleep` between attempts (e.g. 10s)
#   $3..  the command and its arguments
# Outputs:
#   progress messages on stdout (in addition to the command's own output)
# Returns:
#   0 as soon as one attempt succeeds, 1 once every attempt has failed.
with_retry() {
    local -r max_attempts="$1" delay="$2"
    shift 2
    local count=0 rc

    while true; do
        # Running the command as an `if` condition keeps a failure from
        # tripping errexit without touching the shell options. The previous
        # `set +e` / `set -e` pair switched errexit ON for callers that had it
        # off.
        if "$@"; then
            rc=0
        else
            rc="$?"
        fi
        count="$((count+1))"

        if [[ "${rc}" -eq 0 ]]; then
            echo "'$*' SUCCEEDED in ${count} attempts !"
            return 0
        fi

        if [[ "${max_attempts}" -le 0 ]] || [[ "${count}" -lt "${max_attempts}" ]]; then
            echo "'$*' FAILED at attempt ${count}, will retry in ${delay} seconds ..."
            sleep "${delay}"
        else
            break
        fi
    done

    echo "'$*' FAILED in ${count} attempts !"
    return 1
}
14 changes: 14 additions & 0 deletions tests/gpu-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# GPU workload used by the metrics test: a single-container pod named
# "nbody-pod" that runs the CUDA nbody sample in benchmark mode and is limited
# to one nvidia.com/gpu resource.
apiVersion: v1
kind: Pod
metadata:
name: nbody-pod
spec:
restartPolicy: OnFailure
containers:
- name: nbody
image: "nvidia/samples:cuda10.2-ubuntu16.04-nbody"
command: ["/usr/local/cuda/samples/5_Simulations/nbody/nbody"]
# -i sets the iteration count (presumably large so the pod stays busy for
# the duration of the test -- TODO confirm).
args: ["-benchmark", "-i=10000000"]
resources:
limits:
nvidia.com/gpu: 1
54 changes: 54 additions & 0 deletions tests/metrics.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#! /bin/bash -x
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The metrics test needs no preparation; this hook always succeeds.
testing::metrics::setup() {
    true
}

# Delete the resources declared in the GPU test pod manifest.
testing::metrics::cleanup() {
    local -r manifest="tests/gpu-pod.yaml"

    kubectl delete -f "${manifest}"
}

# Succeed once Prometheus reports a GPU utilization above zero.
#
# The previous check (`-ge 0`) was satisfied by any numeric sample, including
# an idle GPU at 0, so it could not detect that the workload was missing from
# the metrics.
#
# Returns:
#   0 when the last DCGM_FI_DEV_GPU_UTIL sample is a positive integer,
#   1 otherwise (no data, non-numeric value, or zero).
testing::metrics::utilization::increase() {
    local val

    # For a short while we might have multiple values returned
    # In this case it seems like the first item is the oldest
    val="$(query::prom "DCGM_FI_DEV_GPU_UTIL" | jq -r '.[-1].value[1]')"

    # Reject "null" and other non-integer output before the numeric comparison.
    [[ "${val}" =~ ^[0-9]+$ ]] || return 1
    [ "${val}" -gt 0 ] || return 1
}

# Check the labels on the first DCGM_FI_DEV_GPU_UTIL series returned by
# Prometheus.
#
# Returns:
#   0 when the series has non-empty UUID and gpu labels and is attributed to
#   pod "nbody-pod" in namespace "default"; 1 otherwise.
testing::metrics::ensure::kube::labels() {
    local val UUID gpu pod namespace

    val="$(query::prom "DCGM_FI_DEV_GPU_UTIL")"

    # `jq -r` prints the string "null" for a missing label; `// empty` prints
    # nothing instead, so the non-empty checks below catch absent labels.
    UUID="$(echo "${val}" | jq -r '.[0].metric.UUID // empty')"
    gpu="$(echo "${val}" | jq -r '.[0].metric.gpu // empty')"
    pod="$(echo "${val}" | jq -r '.[0].metric.exported_pod // empty')"
    namespace="$(echo "${val}" | jq -r '.[0].metric.exported_namespace // empty')"

    [ "$UUID" != "" ] || return 1
    [ "$gpu" != "" ] || return 1

    [ "$pod" = "nbody-pod" ] || return 1
    [ "$namespace" = "default" ] || return 1
}

# Metrics test body: wait for exporter data to reach Prometheus, start the GPU
# workload pod, then verify utilization and Kubernetes labels on the metrics.
testing::metrics::main() {
    local -r manifest="tests/gpu-pod.yaml" pod_name="nbody-pod"
    local check

    # Prometheus can take a while to pickup the exporter
    with_retry 30 10s query::prom "DCGM_FI_DEV_MEMORY_TEMP"

    kubectl create -f "${manifest}"
    with_retry 30 10s query::pod::phase "${pod_name}" "Running"

    for check in utilization::increase ensure::kube::labels; do
        with_retry 10 10s "testing::metrics::${check}"
    done
}
3 changes: 3 additions & 0 deletions tests/variables.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Variable values for the CI infrastructure; .gitlab-ci.yml points TF_VAR_FILE
# at this file.
# NOTE(review): the meaning of each value is defined by the aws-kube-ci project
# that consumes them -- confirm there before changing (instance_type looks like
# an AWS instance type, setup_params like flags for its setup script).
instance_type = "p3.2xlarge"
project_name = "gpu-monitoring-tools"
setup_params = "--driver --k8s-plugin --nvcr"

0 comments on commit 77083c5

Please sign in to comment.