6 changes: 5 additions & 1 deletion cloudbuild/Dockerfile
@@ -22,11 +22,15 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
dd of="${bazel_repo_file}" status=none && \
apt-get update -qq
RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
apt-get install -y -qq default-jdk python3-setuptools python3-pip bazel-${bazel_version} > /dev/null 2>&1 && \
apt-get clean

# Set bazel-${bazel_version} as the default bazel alternative in this container
RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_version} 1 && \
update-alternatives --set bazel /usr/bin/bazel-${bazel_version}

USER ia-tests

# Install Python dependencies
RUN python3 -m pip install --upgrade pip && \
python3 -m pip install -r /init-actions/requirements.txt
1 change: 1 addition & 0 deletions cloudbuild/presubmit.sh
@@ -70,6 +70,7 @@ determine_tests_to_run() {
changed_dir="${changed_dir%%/*}/"
# Run all tests if common directories modified
if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
echo "All tests will be run: '${changed_dir}' was changed"
TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
return 0
16 changes: 13 additions & 3 deletions cloudbuild/run-presubmit-on-k8s.sh
@@ -66,11 +66,21 @@ kubectl wait --for=condition=Ready "pod/${POD_NAME}" --timeout=15m

# To mitigate problems with early test failure, retry kubectl logs
sleep 10s
while ! kubectl describe "pod/${POD_NAME}" | grep -q Terminated; do
# Try to stream logs, but primary log capture is now in the trap
while true; do
if ! kubectl describe "pod/${POD_NAME}" > /dev/null 2>&1; then
echo "Pod ${POD_NAME} not found, assuming it has been deleted."
break # Exit the loop if the pod doesn't exist
fi

if kubectl describe "pod/${POD_NAME}" | grep -q Terminated; then
echo "Pod ${POD_NAME} is Terminated."
break # Exit the loop if the pod is Terminated
fi

# Try to stream logs
kubectl logs -f "${POD_NAME}" --since-time="${LOGS_SINCE_TIME}" --timestamps=true || true
LOGS_SINCE_TIME=$(date --iso-8601=seconds)
sleep 2 # Short sleep to avoid busy waiting if logs -f exits
sleep 2
done

# Final check on the pod exit code
14 changes: 13 additions & 1 deletion gpu/BUILD
@@ -2,6 +2,17 @@ package(default_visibility = ["//visibility:public"])

exports_files(["install_gpu_driver.sh", "mig.sh"])

py_library(
name = "gpu_test_case_base",
srcs = ["gpu_test_case_base.py"],
srcs_version = "PY3",
    testonly = True,
deps = [
"//integration_tests:dataproc_test_case",
"@io_abseil_py//absl/testing:parameterized",
],
)

py_test(
name = "test_gpu",
size = "enormous",
@@ -10,7 +21,8 @@ py_test(
local = True,
shard_count = 15,
    deps = [
        ":gpu_test_case_base",
"//integration_tests:dataproc_test_case",
"@io_abseil_py//absl/testing:parameterized",
],
)
1 change: 1 addition & 0 deletions gpu/Dockerfile
@@ -40,6 +40,7 @@ RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-ge
WORKDIR /init-actions

USER ia-tests
COPY --chown=ia-tests:ia-tests "cloudbuild/key.json" /key.json
COPY --chown=ia-tests:ia-tests . ${WORKDIR}

ENTRYPOINT ["/bin/bash"]
526 changes: 254 additions & 272 deletions gpu/README.md

Large diffs are not rendered by default.

136 changes: 136 additions & 0 deletions gpu/gpu_test_case_base.py
@@ -0,0 +1,136 @@
import os
import time
import random
from packaging import version
from integration_tests.dataproc_test_case import DataprocTestCase

DEFAULT_TIMEOUT = 45 # minutes

class GpuTestCaseBase(DataprocTestCase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def run_dataproc_job(self,
cluster_name,
job_type,
job_params,
timeout_in_minutes=DEFAULT_TIMEOUT):
"""Executes Dataproc job on a cluster and returns results.

Args:
cluster_name: cluster name to submit job to
job_type: type of the job, e.g. spark, hadoop, pyspark
job_params: job parameters
timeout_in_minutes: timeout in minutes

Returns:
ret_code: the return code of the job
stdout: standard output of the job
stderr: error output of the job
"""

ret_code, stdout, stderr = DataprocTestCase.run_command(
'gcloud dataproc jobs submit {} --cluster={} --region={} {}'.
format(job_type, cluster_name, self.REGION,
job_params), timeout_in_minutes)
return ret_code, stdout, stderr
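    # Illustrative usage only (not part of this diff): a pyspark job submitted
    # through the helper above. The job file path is hypothetical, and
    # getClusterName() is assumed to be provided by DataprocTestCase.
    #
    #   ret_code, stdout, stderr = self.run_dataproc_job(
    #       self.getClusterName(), "pyspark", "gs://<bucket>/check_gpus.py")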

# Tests for PyTorch
TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py"

# Tests for TensorFlow
TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py"

def assert_instance_command(self,
instance,
cmd,
timeout_in_minutes=DEFAULT_TIMEOUT):
retry_count = 5
ssh_cmd = 'gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60 -o StrictHostKeyChecking=no'.format(
instance, self.cluster_zone, cmd.replace('"', '\"'))

while retry_count > 0:
try:
# Use self.assert_command from DataprocTestCase
ret_code, stdout, stderr = self.assert_command(ssh_cmd, timeout_in_minutes)
return ret_code, stdout, stderr
except Exception as e:
print(f"An error occurred in assert_instance_command: {e}")
retry_count -= 1
if retry_count > 0:
print(f"Retrying in 10 seconds...")
time.sleep(10)
continue
else:
print("Max retries reached.")
raise

def verify_instance(self, name):
# Verify that nvidia-smi works
self.assert_instance_command(name, "nvidia-smi", 1)
print(f"OK: nvidia-smi on {name}")

def verify_instance_gpu_agent(self, name):
print(f"--- Verifying GPU Agent on {name} ---")
self.assert_instance_command(
name, "systemctl is-active gpu-utilization-agent.service")
print(f"OK: GPU Agent on {name}")

def get_dataproc_image_version(self, instance):
_, stdout, _ = self.assert_instance_command(instance, "grep DATAPROC_IMAGE_VERSION /etc/environment | cut -d= -f2")
return stdout.strip()

def version_lt(self, v1, v2):
return version.parse(v1) < version.parse(v2)

def verify_pytorch(self, name):
print(f"--- Verifying PyTorch on {name} ---")
test_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "gpu",
self.TORCH_TEST_SCRIPT_FILE_NAME)
self.upload_test_file(test_filename, name)

image_version = self.get_dataproc_image_version(name)
conda_root_path = "/opt/conda/miniconda3"
if not self.version_lt(image_version, "2.3"):
conda_root_path = "/opt/conda"

conda_env = "dpgce"
env_path = f"{conda_root_path}/envs/{conda_env}"
python_bin = f"{env_path}/bin/python3"

verify_cmd = (
f"for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node; do "
f" if [[ -e \\\"$f\\\" ]]; then echo 0 > \\\"$f\\\"; fi; "
f"done; "
f"if /usr/share/google/get_metadata_value attributes/include-pytorch; then"
f" {python_bin} {self.TORCH_TEST_SCRIPT_FILE_NAME}; "
f"else echo 'PyTorch test skipped as include-pytorch is not set'; fi"
)
_, stdout, _ = self.assert_instance_command(name, verify_cmd)
if "PyTorch test skipped" not in stdout:
self.assertTrue("True" in stdout, f"PyTorch CUDA not available or python not found in {env_path}")
print(f"OK: PyTorch on {name}")
self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)
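    # For reference, a minimal verify_pytorch.py could look like the sketch
    # below (an assumption; the script itself is not part of this diff).
    # Printing torch.cuda.is_available() is what the '"True" in stdout'
    # assertion above relies on:
    #
    #   import torch
    #   print(torch.cuda.is_available())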

def verify_tensorflow(self, name):
print(f"--- Verifying TensorFlow on {name} ---")
test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "gpu",
self.TF_TEST_SCRIPT_FILE_NAME)
self.upload_test_file(test_filename, name)

image_version = self.get_dataproc_image_version(name)
conda_root_path = "/opt/conda/miniconda3"
if not self.version_lt(image_version, "2.3"):
conda_root_path = "/opt/conda"

conda_env="dpgce"
env_path = f"{conda_root_path}/envs/{conda_env}"
python_bin = f"{env_path}/bin/python3"

verify_cmd = (
f"for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${{f}} ; done ;"
f"{python_bin} {self.TF_TEST_SCRIPT_FILE_NAME}"
)
self.assert_instance_command(name, verify_cmd)
print(f"OK: TensorFlow on {name}")
self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)
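
For orientation, here is a minimal sketch of how a test module could consume the new base class, mirroring the ":gpu_test_case_base" dependency added to py_test in gpu/BUILD above. The createCluster/getClusterName helpers, the accelerator flag values, and the machine-name suffixes are assumptions about DataprocTestCase and the existing test_gpu.py, not something this diff confirms:

from absl.testing import absltest
from absl.testing import parameterized

from gpu.gpu_test_case_base import GpuTestCaseBase


class NvidiaGpuDriverTestCase(GpuTestCaseBase, parameterized.TestCase):
    INIT_ACTIONS = ["gpu/install_gpu_driver.sh"]

    @parameterized.parameters(("STANDARD", ["m", "w-0"]))
    def test_install_gpu_default_agent(self, configuration, machine_suffixes):
        # createCluster/getClusterName are assumed DataprocTestCase helpers.
        self.createCluster(
            configuration,
            self.INIT_ACTIONS,
            master_accelerator="type=nvidia-tesla-t4",
            worker_accelerator="type=nvidia-tesla-t4")
        for suffix in machine_suffixes:
            instance = "{}-{}".format(self.getClusterName(), suffix)
            self.verify_instance(instance)
            self.verify_instance_gpu_agent(instance)


if __name__ == "__main__":
    absltest.main()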