From 6c43fb4f16bf23c229197c8452ffbd2afcb1efaf Mon Sep 17 00:00:00 2001 From: omkar kakarparthi Date: Thu, 4 Jun 2026 16:42:18 +0000 Subject: [PATCH] rocm-ci: scope test container to pod-allocated GPUs via podinfo The sGPU/mGPU jobs launched the test container with '--device=/dev/dri --device=/dev/kfd', exposing ALL host GPUs to the nested (privileged-dind) container regardless of the GPUs Kubernetes allocated to the pod. Combined with the hard-coded absolute HIP_VISIBLE_DEVICES=0..3, two jobs co-scheduled on the same node both pinned physical GPUs 0-3 and collided (OOM/hangs/test failures) while 4-7 sat idle. Jobs only passed when the node was otherwise idle -- arch-independent (mi300x and mi35x). Build GPU_FLAG from /etc/podinfo/gha-render-devices, which the runner populates with this pod's allocated '--device /dev/dri/renderD*' flags (falls back to all GPUs on bare metal). /dev/kfd is always passed. GPU_FLAG additionally passes --device=/dev/mem, --cap-add=SYS_PTRACE and --security-opt seccomp=unconfined, and replaces the host video GID with '--group-add video --group-add daemon --group-add bin'. The container now sees only its allocated GPUs as 0..N-1, so the per-suite HIP_VISIBLE_DEVICES=0/1/2/3 split is correct and collision-free across co-scheduled pods. Requires the runner ScaleSet to populate /etc/podinfo/gha-render-devices (see companion rocOps change). --- .github/workflows/rocm-ci.yml | 38 +++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index e2fb09c15..c14793de3 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -144,15 +144,28 @@ jobs: - name: Run Container run: | + # All GPUs are visible to the runner; per-suite visibility is set later + # via HIP_VISIBLE_DEVICES in the test step. Add render group for the + # container. Under kubernetes, GPU isolation comes from DEVICE_FLAG: + # the runner writes this pod's allocated render devices into + # /etc/podinfo/gha-render-devices; fall back to all GPUs on bare metal.
+ render_gid=$(getent group render | cut -d: -f3) + if [ -f "/etc/podinfo/gha-render-devices" ]; then + DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) + else + DEVICE_FLAG="--device /dev/dri" + fi + # --group-add daemon/bin cover the video-group GID -> subgid 1 mapping + # across Ubuntu 24.04 / AlmaLinux base images. /dev/kfd is the single + # system-wide compute node and is always required. + GPU_FLAG="--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" docker run -dt \ --rm \ --name te-runner \ --network=host \ - --device=/dev/dri --device=/dev/kfd \ + $GPU_FLAG \ --shm-size=16G \ --pid=host \ - --group-add $(getent group render | cut -d: -f3) \ - --group-add $(getent group video | cut -d: -f3) \ -v "${{ github.workspace }}:/workspace" \ -w /workspace \ ${{ needs.select_image.outputs.image-tag }} @@ -341,15 +354,28 @@ jobs: - name: Run Container run: | + # All GPUs are visible to the runner; per-suite visibility is set later + # via HIP_VISIBLE_DEVICES in the test step. Add render group for the + # container. Under kubernetes, GPU isolation comes from DEVICE_FLAG: + # the runner writes this pod's allocated render devices into + # /etc/podinfo/gha-render-devices; fall back to all GPUs on bare metal. + render_gid=$(getent group render | cut -d: -f3) + if [ -f "/etc/podinfo/gha-render-devices" ]; then + DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) + else + DEVICE_FLAG="--device /dev/dri" + fi + # --group-add daemon/bin cover the video-group GID -> subgid 1 mapping + # across Ubuntu 24.04 / AlmaLinux base images. /dev/kfd is the single + # system-wide compute node and is always required.
+ GPU_FLAG="--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" docker run -dt \ --rm \ --name te-runner \ --network=host \ - --device=/dev/dri --device=/dev/kfd \ + $GPU_FLAG \ --shm-size=16G \ --pid=host \ - --group-add $(getent group render | cut -d: -f3) \ - --group-add $(getent group video | cut -d: -f3) \ -v "${{ github.workspace }}:/workspace" \ -w /workspace \ ${{ needs.select_image.outputs.image-tag }}