From 6c43fb4f16bf23c229197c8452ffbd2afcb1efaf Mon Sep 17 00:00:00 2001
From: omkar kakarparthi <okakarpa@amd.com>
Date: Thu, 4 Jun 2026 16:42:18 +0000
Subject: [PATCH] rocm-ci: scope test container to pod-allocated GPUs via
 podinfo

The sGPU/mGPU jobs launched the test container with
'--device=/dev/dri --device=/dev/kfd', exposing ALL host GPUs to the
nested (privileged-dind) container regardless of which GPUs Kubernetes
allocated to the pod. Combined with the hard-coded absolute
HIP_VISIBLE_DEVICES=0..3, two jobs co-scheduled on the same node both
pinned physical GPUs 0-3 and collided (OOM/hangs/test failures) while
GPUs 4-7 sat idle. Jobs only passed when the node was otherwise idle.
The problem is architecture-independent (mi300x and mi35x).

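Build DEVICE_FLAG from /etc/podinfo/gha-render-devices, which the runner
populates with this pod's allocated '--device /dev/dri/renderD*' flags
(falling back to '--device /dev/dri', i.e. all GPUs, on bare metal), and
fold it into GPU_FLAG, which is passed to 'docker run'. /dev/kfd is
always passed. The container now sees only its allocated GPUs as
0..N-1, so the per-suite HIP_VISIBLE_DEVICES=0/1/2/3 split is correct
and collision-free across co-scheduled pods.

GPU_FLAG additionally passes --device=/dev/mem, --group-add
video/daemon/bin, --cap-add=SYS_PTRACE and --security-opt
seccomp=unconfined to the test container.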

Requires the runner ScaleSet to populate /etc/podinfo/gha-render-devices
(see companion rocOps change).
---
 .github/workflows/rocm-ci.yml | 38 +++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml
index e2fb09c15..c14793de3 100644
--- a/.github/workflows/rocm-ci.yml
+++ b/.github/workflows/rocm-ci.yml
@@ -144,15 +144,28 @@ jobs:
 
       - name: Run Container
         run: |
+          # Start the test container with only the GPUs this job may use.
+          # Under Kubernetes the runner writes this pod's allocated render
+          # devices (as docker '--device ...' flags) into
+          # /etc/podinfo/gha-render-devices; when that file is absent (bare
+          # metal) fall back to exposing every GPU via /dev/dri.
+          render_gid=$(getent group render | cut -d: -f3 || true)
+          if [ -f "/etc/podinfo/gha-render-devices" ]; then
+            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+          else
+            DEVICE_FLAG="--device /dev/dri"
+          fi
+          # --group-add daemon/bin cover the video-group GID -> subgid 1 mapping
+          # across Ubuntu 24.04 / AlmaLinux base images. /dev/kfd is always
+          # required; the render group is added only when the host defines one.
+          GPU_FLAG="--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video ${render_gid:+--group-add $render_gid} --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
           docker run -dt \
             --rm \
             --name te-runner \
             --network=host \
-            --device=/dev/dri --device=/dev/kfd \
+            $GPU_FLAG \
             --shm-size=16G \
             --pid=host \
-            --group-add $(getent group render | cut -d: -f3) \
-            --group-add $(getent group video | cut -d: -f3) \
             -v "${{ github.workspace }}:/workspace" \
             -w /workspace \
             ${{ needs.select_image.outputs.image-tag }}
@@ -341,15 +354,28 @@ jobs:
 
       - name: Run Container
         run: |
+          # Start the test container with only the GPUs this job may use.
+          # Under Kubernetes the runner writes this pod's allocated render
+          # devices (as docker '--device ...' flags) into
+          # /etc/podinfo/gha-render-devices; when that file is absent (bare
+          # metal) fall back to exposing every GPU via /dev/dri.
+          render_gid=$(getent group render | cut -d: -f3 || true)
+          if [ -f "/etc/podinfo/gha-render-devices" ]; then
+            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+          else
+            DEVICE_FLAG="--device /dev/dri"
+          fi
+          # --group-add daemon/bin cover the video-group GID -> subgid 1 mapping
+          # across Ubuntu 24.04 / AlmaLinux base images. /dev/kfd is always
+          # required; the render group is added only when the host defines one.
+          GPU_FLAG="--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video ${render_gid:+--group-add $render_gid} --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
           docker run -dt \
             --rm \
             --name te-runner \
             --network=host \
-            --device=/dev/dri --device=/dev/kfd \
+            $GPU_FLAG \
             --shm-size=16G \
             --pid=host \
-            --group-add $(getent group render | cut -d: -f3) \
-            --group-add $(getent group video | cut -d: -f3) \
             -v "${{ github.workspace }}:/workspace" \
             -w /workspace \
             ${{ needs.select_image.outputs.image-tag }}