Skip to content

Commit

Permalink
Testing: pre-compiled end-to-end GPU driver validation
Browse files Browse the repository at this point in the history
Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Aug 31, 2024
1 parent eaf6ca8 commit caa9a5d
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-precompiled.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ jobs:
cmd: yq '.status.properties[] | select(.name == "instance-id") | .value' /github/workspace/.cache/holodeck.yaml

- name: print holodeck cache file
uses: mikefarah/yq@master
run: |
cat /github/workspace/.cache/holodeck.yaml
Expand Down Expand Up @@ -178,6 +177,7 @@ jobs:
rc=$status
exit 1
fi
# sleep 120
./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$?
if [ $status -eq 1 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
Expand Down
1 change: 0 additions & 1 deletion tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,5 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"

: ${OPERATOR_OPTIONS:="--set driver.repository=ghcr.io/nvidia"}
: ${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}
: ${AWS_SYSTEM_RUNNING_STATUS_TIMEOUT:="60"}

: ${BASE_TARGET:="jammy"}
26 changes: 21 additions & 5 deletions tests/scripts/remote_retry.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,33 @@
#!/bin/bash
# Checks, via the aws CLI, whether the EC2 instance named by ${instance_id}
# is up: exits 0 when the instance is considered available, 1 otherwise.
# Timeout defaults are read from .definitions.sh.
#
# NOTE(review): this text appears to be a rendered diff in which removed and
# added lines are interleaved without +/- markers (see the two near-identical
# describe-instances calls and the two "not in running condition" messages
# below). Confirm which lines are live before relying on the flow as written.

# SCRIPT_DIR and instance_id are not assigned anywhere in this file;
# presumably both are provided by the calling environment -- TODO confirm.
source ${SCRIPT_DIR}/.definitions.sh

# Prints both timeout values; looks like a leftover debug trace.
# NOTE(review): verify AWS_SYSTEM_RUNNING_STATUS_TIMEOUT still has a default
# in .definitions.sh; if it is unset, it expands to an empty string here.
echo "SHIVA ${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT} ${AWS_SYSTEM_RUNNING_STATUS_TIMEOUT}"
# Epoch seconds at script start; the polling loop further down measures its
# elapsed time against this value.
START_TIME=$(date +%s)

# Block on the EC2 "instance-status-ok" waiter, bounded by
# AWS_SYSTEM_ONLINE_CHECK_TIMEOUT (passed to timeout(1) as the duration).
timeout "${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT}" aws ec2 wait instance-status-ok --instance-ids "${instance_id}"
# $? is the exit status of the timeout command above: non-zero covers both
# the waiter failing and the time limit being hit.
if [ $? -ne 0 ]; then
echo "AWS Instance did not become available within ${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT} seconds."
exit 1
fi
# The next two calls print the instance state name to stdout; their output is
# not captured and their exit status is not checked. They differ only in which
# timeout variable bounds the call.
timeout "${AWS_SYSTEM_RUNNING_STATUS_TIMEOUT}" aws ec2 describe-instances --instance-ids "${instance_id}" --query 'Reservations[0].Instances[0].State.Name' --output text
timeout "${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT}" aws ec2 describe-instances --instance-ids "${instance_id}" --query 'Reservations[0].Instances[0].State.Name' --output text
# Same query again, this time captured for the comparison below (no timeout
# wrapper on this call).
STATE=$(aws ec2 describe-instances --instance-ids "${instance_id}" --query 'Reservations[0].Instances[0].State.Name' --output text)
# Anything other than the literal state name "running" is treated as failure.
if [ "$STATE" != "running" ]; then
echo "AWS Instance is not in running condition"
echo "AWS Instance is not in running condition within ${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT} seconds."
exit 1
fi
# NOTE(review): as written, this unconditional exit makes everything below
# unreachable -- confirm whether this line is still part of the script.
exit 0

# Poll the instance and system status checks until both report "ok", or until
# AWS_SYSTEM_ONLINE_CHECK_TIMEOUT seconds have elapsed since START_TIME.
while true; do
# Two separate describe-instance-status calls, one per status field.
INSTANCE_STATUS=$(aws ec2 describe-instance-status --instance-ids "${instance_id}" --query 'InstanceStatuses[0].InstanceStatus.Status' --output text)
SYSTEM_STATUS=$(aws ec2 describe-instance-status --instance-ids "${instance_id}" --query 'InstanceStatuses[0].SystemStatus.Status' --output text)
# Success requires both checks to be exactly "ok".
if [ "$INSTANCE_STATUS" == "ok" ] && [ "$SYSTEM_STATUS" == "ok" ]; then
echo "AWS Instance and System Status: ok"
exit 0
fi
# Elapsed wall-clock seconds since the script started (not since the loop
# started), so time spent in the waiter above counts toward the limit.
ELAPSED_TIME=$(($(date +%s) - START_TIME))
if [ "$ELAPSED_TIME" -ge "$AWS_SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
echo "Failed to connect to aws within ${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT} seconds after reboot."
exit 1
fi
# Pause 10 seconds between polls.
sleep 10
done

# Not reached: the loop above has no break and only leaves via exit.
exit 1

0 comments on commit caa9a5d

Please sign in to comment.