From 18f71c8f7a030770b1435435063423733d760cfb Mon Sep 17 00:00:00 2001
From: Veera Gopu <veerarajasekharreddy.gopu@amd.com>
Date: Mon, 18 May 2026 15:45:15 +0000
Subject: [PATCH 1/7] Update CI image to use TheRock

---
 .github/scripts/Dockerfile.ci.deps           | 123 ++++++++++++++-----
 .github/scripts/build_ci_deps_docker.sh      |  35 ++++++
 .github/workflows/ci-deps-docker-publish.yml |  43 +++++--
 task.txt                                     |  11 ++
 4 files changed, 172 insertions(+), 40 deletions(-)
 create mode 100755 .github/scripts/build_ci_deps_docker.sh
 create mode 100644 task.txt

diff --git a/.github/scripts/Dockerfile.ci.deps b/.github/scripts/Dockerfile.ci.deps
index 7161054f4..271973067 100644
--- a/.github/scripts/Dockerfile.ci.deps
+++ b/.github/scripts/Dockerfile.ci.deps
@@ -1,49 +1,106 @@
 # Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE for license information.
+#
+# TE CI deps image: Ubuntu 24.04 + one TheRock ROCm tarball + Python 3.12 venv.
+#
+# Build once per AMDGPU stack and publish two images (example tags on your registry):
+#
+#   docker build -f .github/scripts/Dockerfile.ci.deps \
+#     --build-arg AMDGPU_FAMILY=gfx94X-dcgpu \
+#     -t te-ci-deps:rocm-7.12.0-ubuntu24.04-py312-gfx94x-dcgpu .
+#
+# ROCm installer: https://raw.githubusercontent.com/ROCm/TheRock/main/dockerfiles/install_rocm_tarball.sh
+#
+# Wheels: https://repo.amd.com/rocm/whl/${AMDGPU_FAMILY}/
 
-## TE CI Dockerfile
-ARG BASE_DOCKER=registry-sc-harbor.amd.com/framework/compute-rocm-rel-7.2:57_ubuntu22.04_py3.11_pytorch_release-2.8_08d38866
-FROM $BASE_DOCKER
-WORKDIR /
+FROM ubuntu:24.04
 
-# Updated git via git-core PPA
-RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \
-    && add-apt-repository ppa:git-core/ppa -y \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends git vim \
+ARG DEBIAN_FRONTEND=noninteractive
+SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
+
+ARG ROCM_VERSION=7.12.0
+ARG AMDGPU_FAMILY=gfx94X-dcgpu
+ARG INSTALL_ROCM_TARBALL_SH_URL=https://raw.githubusercontent.com/ROCm/TheRock/main/dockerfiles/install_rocm_tarball.sh
+
+# Base OS packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates curl \
+    git vim \
+    build-essential cmake ninja-build pkg-config \
+    python3.12 python3.12-venv python3.12-dev python3-pip \
     && rm -rf /var/lib/apt/lists/*
 
-# Build arguments
-ARG FA_VERSION=v2.8.1
-ARG ROCM_VERSION=7.2
-ARG JAX_VERSION=0.8.0
-ARG PYTHON_VERSION=311
-# AITER - Required for MXFP4 FP4 GEMM kernels.
-ARG AITER_COMMIT=77455e3ecf4f0d28756afc452e914940c45b944b
+# Install ROCm
+RUN case "${AMDGPU_FAMILY}" in \
+      gfx94X-dcgpu|gfx950-dcgpu) ;; \
+      *) echo "AMDGPU_FAMILY must be gfx94X-dcgpu or gfx950-dcgpu (got: ${AMDGPU_FAMILY})" >&2; exit 1 ;; \
+    esac \
+    && curl -fsSL -o /tmp/install_rocm_tarball.sh "${INSTALL_ROCM_TARBALL_SH_URL}" \
+    && chmod +x /tmp/install_rocm_tarball.sh \
+    && /tmp/install_rocm_tarball.sh "${ROCM_VERSION}" "${AMDGPU_FAMILY}" stable \
+    && rm -f /tmp/install_rocm_tarball.sh
+
+# Python venv; append ROCm exports to activate for interactive `source …/activate`.
+RUN python3.12 -m venv /opt/venv \
+    && { echo ""; echo "# ROCm (single-family image)"; \
+         echo 'export ROCM_PATH=/opt/rocm'; \
+         echo 'export PATH="${ROCM_PATH}/bin:${PATH}"'; \
+         echo 'if [ -n "${LD_LIBRARY_PATH:-}" ]; then export LD_LIBRARY_PATH="${ROCM_PATH}/lib:${LD_LIBRARY_PATH}"; else export LD_LIBRARY_PATH="${ROCM_PATH}/lib"; fi'; \
+       } >> /opt/venv/bin/activate
+
+# Global runtime env for RUN instructions and container processes.
+ENV ROCM_PATH=/opt/rocm \
+    VIRTUAL_ENV=/opt/venv \
+    PATH=/opt/venv/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin \
+    LD_LIBRARY_PATH=/opt/rocm/lib
+
+RUN python -m pip install --upgrade pip setuptools wheel
 
-RUN pip install setuptools wheel
-RUN pip install ipython pytest fire pydantic pybind11 ninja pandas
+RUN pip install --no-cache-dir ipython pytest fire pydantic pybind11 ninja pandas
+
+# Pinned wheels from https://repo.amd.com/rocm/whl/<AMDGPU_FAMILY>/
+RUN W="https://repo.amd.com/rocm/whl/${AMDGPU_FAMILY}" \
+    && pip install --no-cache-dir \
+        --extra-index-url "${W}" \
+        "${W}/rocm-${ROCM_VERSION}.tar.gz" \
+        "${W}/torch-2.10.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
+        "${W}/torchvision-0.25.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
+        "${W}/torchaudio-2.10.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
+        "${W}/triton-3.6.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
+        "${W}/jax_rocm7_pjrt-0.8.2%2Brocm7.12.0-py3-none-manylinux_2_28_x86_64.whl" \
+        "${W}/jax_rocm7_plugin-0.8.2%2Brocm7.12.0-cp312-cp312-manylinux_2_28_x86_64.whl" \
+        "${W}/jaxlib-0.8.2%2Brocm7.12.0-cp312-cp312-manylinux_2_27_x86_64.whl" \
+        "jax==0.8.2"
 
 # Install flash-attention
-RUN git clone --branch ${FA_VERSION} --depth 1 https://github.com/Dao-AILab/flash-attention.git \
+ARG FA_VERSION=v2.8.1
+RUN git clone --branch "${FA_VERSION}" --depth 1 https://github.com/Dao-AILab/flash-attention.git \
     && cd flash-attention \
-    && GPU_ARCHS="gfx950;gfx942" FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE FLASH_ATTENTION_SKIP_CK_BUILD=FALSE python setup.py install \
-    && cd ..
+    && GPU_ARCHS="gfx942;gfx950" FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE FLASH_ATTENTION_SKIP_CK_BUILD=FALSE \
+       python setup.py install \
+    && cd .. \
+    && rm -rf flash-attention
 
-# Install AITER
-RUN git clone --no-checkout https://github.com/ROCm/aiter.git \
-    && cd aiter \
-    && git checkout ${AITER_COMMIT} \
+# AITER - Required for MXFP4 FP4 GEMM kernels.
+ARG AITER_COMMIT=77455e3ecf4f0d28756afc452e914940c45b944b
+RUN W="https://repo.amd.com/rocm/whl/${AMDGPU_FAMILY}" \
+    && case "${AMDGPU_FAMILY}" in \
+         gfx94X-dcgpu) LIBS_PKG=rocm-sdk-libraries-gfx94x-dcgpu ;; \
+         gfx950-dcgpu) LIBS_PKG=rocm-sdk-libraries-gfx950-dcgpu ;; \
+       esac \
+    && pip install --no-cache-dir --extra-index-url "${W}" \
+        "rocm-sdk-core==${ROCM_VERSION}" \
+        "rocm-sdk-devel==${ROCM_VERSION}" \
+        "${LIBS_PKG}==${ROCM_VERSION}" \
+    && /opt/venv/bin/hipconfig --version \
+    && git clone --no-checkout https://github.com/ROCm/aiter.git /tmp/aiter \
+    && cd /tmp/aiter \
+    && git checkout "${AITER_COMMIT}" \
     && git submodule update --init --recursive \
-    && pip install .
-
-# Install JAX
-RUN ROCM_MAJOR=$(echo "${ROCM_VERSION}" | cut -d. -f1) && pip install \
-    https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/jax_rocm${ROCM_MAJOR}_pjrt-${JAX_VERSION}%2Brocm${ROCM_VERSION}.0-py3-none-manylinux_2_28_x86_64.whl \
-    https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/jax_rocm${ROCM_MAJOR}_plugin-${JAX_VERSION}%2Brocm${ROCM_VERSION}.0-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux_2_28_x86_64.whl \
-    jax==${JAX_VERSION} \
-    https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/jaxlib-${JAX_VERSION}%2Brocm${ROCM_VERSION}.0-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux_2_27_x86_64.whl
+    && pip install --no-build-isolation --no-cache-dir . \
+    && cd / \
+    && rm -rf /tmp/aiter
 
 WORKDIR /workspace/
 CMD ["/bin/bash"]
diff --git a/.github/scripts/build_ci_deps_docker.sh b/.github/scripts/build_ci_deps_docker.sh
new file mode 100755
index 000000000..61b14e9f1
--- /dev/null
+++ b/.github/scripts/build_ci_deps_docker.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# Build Dockerfile.ci.deps with a canonical image tag:
+#   <name>:rocm-<ROCM_VERSION>-ubuntu24.04-py312-<amdgpu-slug>
+# where <amdgpu-slug> is AMDGPU_FAMILY lowercased (e.g. gfx94x-dcgpu).
+#
+# Usage (from repo root):
+#   .github/scripts/build_ci_deps_docker.sh gfx94X-dcgpu
+#   ROCM_VERSION=7.12.0 IMAGE_NAME=my-registry/te-ci-deps .github/scripts/build_ci_deps_docker.sh gfx950-dcgpu
+#   IMAGE_TAG=my-custom-tag .github/scripts/build_ci_deps_docker.sh gfx94X-dcgpu   # override tag only
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+
+AMDGPU_FAMILY="${1:?usage: $0 <gfx94X-dcgpu|gfx950-dcgpu>}"
+case "${AMDGPU_FAMILY}" in
+  gfx94X-dcgpu|gfx950-dcgpu) ;;
+  *) echo "AMDGPU_FAMILY must be gfx94X-dcgpu or gfx950-dcgpu" >&2; exit 1 ;;
+esac
+
+ROCM_VERSION="${ROCM_VERSION:-7.12.0}"
+IMAGE_NAME="${IMAGE_NAME:-te-ci-deps}"
+SLUG="$(echo "${AMDGPU_FAMILY}" | tr '[:upper:]' '[:lower:]')"
+DEFAULT_TAG="rocm-${ROCM_VERSION}-ubuntu24.04-py312-${SLUG}"
+IMAGE_TAG="${IMAGE_TAG:-${DEFAULT_TAG}}"
+
+exec docker build \
+  -f "${SCRIPT_DIR}/Dockerfile.ci.deps" \
+  --build-arg "ROCM_VERSION=${ROCM_VERSION}" \
+  --build-arg "AMDGPU_FAMILY=${AMDGPU_FAMILY}" \
+  -t "${IMAGE_NAME}:${IMAGE_TAG}" \
+  "${REPO_ROOT}"
diff --git a/.github/workflows/ci-deps-docker-publish.yml b/.github/workflows/ci-deps-docker-publish.yml
index c989502e6..e776cb865 100644
--- a/.github/workflows/ci-deps-docker-publish.yml
+++ b/.github/workflows/ci-deps-docker-publish.yml
@@ -4,6 +4,12 @@
 #
 # Build .github/scripts/Dockerfile.ci.deps and push to Artifactory.
 #
+# Tag convention (when image_tag is left empty):
+#   rocm-<rocm_version>-ubuntu24.04-py312-<amdgpu-slug>
+# e.g. rocm-7.12.0-ubuntu24.04-py312-gfx94x-dcgpu
+#
+# Local builds: .github/scripts/build_ci_deps_docker.sh <gfx94X-dcgpu|gfx950-dcgpu>
+#
 # Required repository secrets:
 #   ARTIFACTORY_DOCKER_USERNAME / ARTIFACTORY_DOCKER_PASSWORD — registry basic auth
 #
@@ -16,10 +22,23 @@ name: Publish CI deps Docker image
 on:
   workflow_dispatch:
     inputs:
-      image_tag:
-        description: "Image tag pushed to Artifactory (required)"
+      amd_gpu_family:
+        description: "TheRock / wheel AMDGPU family (also passed as AMDGPU_FAMILY build-arg)"
         required: true
+        type: choice
+        options:
+          - gfx94X-dcgpu
+          - gfx950-dcgpu
+      rocm_version:
+        description: "ROCm version string (must match Dockerfile wheel pins when changed)"
+        required: false
         type: string
+        default: "7.12.0"
+      image_tag:
+        description: "Tag to push; leave empty for rocm-<ver>-ubuntu24.04-py312-<family-slug> (family lowercased)"
+        required: false
+        type: string
+        default: ""
 
 jobs:
   build-and-push:
@@ -37,16 +56,21 @@ jobs:
         env:
           REGISTRY: ${{ vars.ARTIFACTORY_DOCKER_REGISTRY }}
           REPOSITORY: ${{ vars.ARTIFACTORY_CI_DEPS_REPOSITORY }}
-          IMAGE_TAG: ${{ inputs.image_tag }}
+          IMAGE_TAG_INPUT: ${{ inputs.image_tag }}
+          AMDGPU_FAMILY: ${{ inputs.amd_gpu_family }}
+          ROCM_VERSION: ${{ inputs.rocm_version }}
         run: |
           set -euo pipefail
           if [ -z "${REGISTRY}" ] || [ -z "${REPOSITORY}" ]; then
             echo "Set repository variables ARTIFACTORY_DOCKER_REGISTRY and ARTIFACTORY_CI_DEPS_REPOSITORY." >&2
             exit 1
           fi
-          if [ -z "${IMAGE_TAG}" ]; then
-            echo "image_tag must be non-empty." >&2
-            exit 1
+          ROCM_VER="${ROCM_VERSION:-7.12.0}"
+          SLUG="$(echo "${AMDGPU_FAMILY}" | tr '[:upper:]' '[:lower:]')"
+          if [ -n "${IMAGE_TAG_INPUT}" ]; then
+            echo "IMAGE_TAG=${IMAGE_TAG_INPUT}" >> "${GITHUB_ENV}"
+          else
+            echo "IMAGE_TAG=rocm-${ROCM_VER}-ubuntu24.04-py312-${SLUG}" >> "${GITHUB_ENV}"
           fi
 
       - name: Log in to container registry
@@ -60,12 +84,17 @@ jobs:
         env:
           REGISTRY: ${{ vars.ARTIFACTORY_DOCKER_REGISTRY }}
           REPOSITORY: ${{ vars.ARTIFACTORY_CI_DEPS_REPOSITORY }}
-          IMAGE_TAG: ${{ inputs.image_tag }}
+          AMDGPU_FAMILY: ${{ inputs.amd_gpu_family }}
+          ROCM_VERSION_INPUT: ${{ inputs.rocm_version }}
         run: |
           set -euo pipefail
           FULL_IMAGE="${REGISTRY}/${REPOSITORY}"
+          : "${IMAGE_TAG:?IMAGE_TAG must be set by the validate step}"
+          ROCM_VER="${ROCM_VERSION_INPUT:-7.12.0}"
           docker build \
             -f .github/scripts/Dockerfile.ci.deps \
+            --build-arg "ROCM_VERSION=${ROCM_VER}" \
+            --build-arg "AMDGPU_FAMILY=${AMDGPU_FAMILY}" \
             -t "${FULL_IMAGE}:${IMAGE_TAG}" \
             .
           docker push "${FULL_IMAGE}:${IMAGE_TAG}"
diff --git a/task.txt b/task.txt
new file mode 100644
index 000000000..23a319960
--- /dev/null
+++ b/task.txt
@@ -0,0 +1,11 @@
+create the docker image with ubuntu 24.04 and install the rock using the info from 
+
+every thibng to install rocm and all are founf at https://repo.amd.com/rocm/whl/ 
+
+Current released is 7.12.0 and 
+
+my wheels for torch, torchvision, torchaudio, jax, jaxlib, triton are at https://repo.amd.com/rocm/whl/  see if you can fetch the info from here
+
+I want to pickfor these two archs gfx94X-dcgpu gfx950-dcgpu and python version 3.12
+
+update @.github/scripts/Dockerfile.ci.deps as well, to install for one arch by default but provide the arg to change the arch so that it installs those

From a4aab025e4840b07265e6d124bc216fc29aff242 Mon Sep 17 00:00:00 2001
From: Veera Gopu <veerarajasekharreddy.gopu@amd.com>
Date: Wed, 20 May 2026 05:37:32 +0000
Subject: [PATCH 2/7] Switch Dockerfile.ci.deps to GPU_ARCH and drop build helper script

---
 .github/scripts/Dockerfile.ci.deps      | 77 +++++++++++--------------
 .github/scripts/build_ci_deps_docker.sh | 35 -----------
 2 files changed, 35 insertions(+), 77 deletions(-)
 delete mode 100755 .github/scripts/build_ci_deps_docker.sh

diff --git a/.github/scripts/Dockerfile.ci.deps b/.github/scripts/Dockerfile.ci.deps
index 271973067..3d31de954 100644
--- a/.github/scripts/Dockerfile.ci.deps
+++ b/.github/scripts/Dockerfile.ci.deps
@@ -4,15 +4,15 @@
 #
 # TE CI deps image: Ubuntu 24.04 + one TheRock ROCm tarball + Python 3.12 venv.
 #
-# Build once per AMDGPU stack and publish two images (example tags on your registry):
+#   docker build -f .github/scripts/Dockerfile.ci.deps \
+#     --build-arg GPU_ARCH=gfx942 \
+#     -t te-ci-deps:rocm-7.12.0-ubuntu24.04-py312-gfx942 .
 #
 #   docker build -f .github/scripts/Dockerfile.ci.deps \
-#     --build-arg AMDGPU_FAMILY=gfx94X-dcgpu \
-#     -t te-ci-deps:rocm-7.12.0-ubuntu24.04-py312-gfx94x-dcgpu .
+#     --build-arg GPU_ARCH=gfx950 \
+#     -t te-ci-deps:rocm-7.12.0-ubuntu24.04-py312-gfx950 .
 #
 # ROCm installer: https://raw.githubusercontent.com/ROCm/TheRock/main/dockerfiles/install_rocm_tarball.sh
-#
-# Wheels: https://repo.amd.com/rocm/whl/${AMDGPU_FAMILY}/
 
 FROM ubuntu:24.04
 
@@ -20,49 +20,56 @@ ARG DEBIAN_FRONTEND=noninteractive
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
 ARG ROCM_VERSION=7.12.0
-ARG AMDGPU_FAMILY=gfx94X-dcgpu
+ARG GPU_ARCH=gfx942
 ARG INSTALL_ROCM_TARBALL_SH_URL=https://raw.githubusercontent.com/ROCm/TheRock/main/dockerfiles/install_rocm_tarball.sh
 
+# Map GPU_ARCH → AMD GPU wheel/tarball family (once).
+RUN case "${GPU_ARCH}" in \
+      gfx942) echo -n gfx94X-dcgpu > /etc/amd_gpu_family ;; \
+      gfx950) echo -n gfx950-dcgpu > /etc/amd_gpu_family ;; \
+      *) echo "GPU_ARCH must be gfx942 or gfx950 (got: ${GPU_ARCH})" >&2; exit 1 ;; \
+    esac
+
 # Base OS packages
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ca-certificates curl \
     git vim \
-    build-essential cmake ninja-build pkg-config \
+    build-essential cmake ninja-build pkg-config liblzma-dev \
     python3.12 python3.12-venv python3.12-dev python3-pip \
     && rm -rf /var/lib/apt/lists/*
 
-# Install ROCm
-RUN case "${AMDGPU_FAMILY}" in \
-      gfx94X-dcgpu|gfx950-dcgpu) ;; \
-      *) echo "AMDGPU_FAMILY must be gfx94X-dcgpu or gfx950-dcgpu (got: ${AMDGPU_FAMILY})" >&2; exit 1 ;; \
-    esac \
+# Native ROCm tarball → /opt/rocm
+RUN AMD_GPU_FAMILY=$(cat /etc/amd_gpu_family) \
     && curl -fsSL -o /tmp/install_rocm_tarball.sh "${INSTALL_ROCM_TARBALL_SH_URL}" \
     && chmod +x /tmp/install_rocm_tarball.sh \
-    && /tmp/install_rocm_tarball.sh "${ROCM_VERSION}" "${AMDGPU_FAMILY}" stable \
+    && /tmp/install_rocm_tarball.sh "${ROCM_VERSION}" "${AMD_GPU_FAMILY}" stable \
     && rm -f /tmp/install_rocm_tarball.sh
 
-# Python venv; append ROCm exports to activate for interactive `source …/activate`.
 RUN python3.12 -m venv /opt/venv \
-    && { echo ""; echo "# ROCm (single-family image)"; \
+    && { echo ""; echo "# ROCm"; \
          echo 'export ROCM_PATH=/opt/rocm'; \
          echo 'export PATH="${ROCM_PATH}/bin:${PATH}"'; \
          echo 'if [ -n "${LD_LIBRARY_PATH:-}" ]; then export LD_LIBRARY_PATH="${ROCM_PATH}/lib:${LD_LIBRARY_PATH}"; else export LD_LIBRARY_PATH="${ROCM_PATH}/lib"; fi'; \
        } >> /opt/venv/bin/activate
 
-# Global runtime env for RUN instructions and container processes.
-ENV ROCM_PATH=/opt/rocm \
+ENV GPU_ARCH=${GPU_ARCH} \
+    ROCM_PATH=/opt/rocm \
     VIRTUAL_ENV=/opt/venv \
     PATH=/opt/venv/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin \
     LD_LIBRARY_PATH=/opt/rocm/lib
 
-RUN python -m pip install --upgrade pip setuptools wheel
+RUN python -m pip install --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir ipython pytest fire pydantic pybind11 ninja pandas
 
-RUN pip install --no-cache-dir ipython pytest fire pydantic pybind11 ninja pandas
-
-# Pinned wheels from https://repo.amd.com/rocm/whl/<AMDGPU_FAMILY>/
-RUN W="https://repo.amd.com/rocm/whl/${AMDGPU_FAMILY}" \
+# Python ROCm SDK + torch / jax from https://repo.amd.com/rocm/whl/<AMD_GPU_FAMILY>/
+RUN AMD_GPU_FAMILY=$(cat /etc/amd_gpu_family) \
+    && W="https://repo.amd.com/rocm/whl/${AMD_GPU_FAMILY}" \
+    && LIBS_PKG="rocm-sdk-libraries-$(echo "${AMD_GPU_FAMILY}" | tr '[:upper:]' '[:lower:]')" \
     && pip install --no-cache-dir \
         --extra-index-url "${W}" \
+        "rocm-sdk-core==${ROCM_VERSION}" \
+        "rocm-sdk-devel==${ROCM_VERSION}" \
+        "${LIBS_PKG}==${ROCM_VERSION}" \
         "${W}/rocm-${ROCM_VERSION}.tar.gz" \
         "${W}/torch-2.10.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
         "${W}/torchvision-0.25.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
@@ -73,33 +80,19 @@ RUN W="https://repo.amd.com/rocm/whl/${AMDGPU_FAMILY}" \
         "${W}/jaxlib-0.8.2%2Brocm7.12.0-cp312-cp312-manylinux_2_27_x86_64.whl" \
         "jax==0.8.2"
 
-# Install flash-attention
 ARG FA_VERSION=v2.8.1
-RUN git clone --branch "${FA_VERSION}" --depth 1 https://github.com/Dao-AILab/flash-attention.git \
-    && cd flash-attention \
-    && GPU_ARCHS="gfx942;gfx950" FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE FLASH_ATTENTION_SKIP_CK_BUILD=FALSE \
+RUN git clone --branch "${FA_VERSION}" --depth 1 https://github.com/Dao-AILab/flash-attention.git /tmp/flash-attention \
+    && cd /tmp/flash-attention \
+    && GPU_ARCHS="${GPU_ARCH}" FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE FLASH_ATTENTION_SKIP_CK_BUILD=FALSE \
        python setup.py install \
-    && cd .. \
-    && rm -rf flash-attention
+    && rm -rf /tmp/flash-attention
 
-# AITER - Required for MXFP4 FP4 GEMM kernels.
 ARG AITER_COMMIT=77455e3ecf4f0d28756afc452e914940c45b944b
-RUN W="https://repo.amd.com/rocm/whl/${AMDGPU_FAMILY}" \
-    && case "${AMDGPU_FAMILY}" in \
-         gfx94X-dcgpu) LIBS_PKG=rocm-sdk-libraries-gfx94x-dcgpu ;; \
-         gfx950-dcgpu) LIBS_PKG=rocm-sdk-libraries-gfx950-dcgpu ;; \
-       esac \
-    && pip install --no-cache-dir --extra-index-url "${W}" \
-        "rocm-sdk-core==${ROCM_VERSION}" \
-        "rocm-sdk-devel==${ROCM_VERSION}" \
-        "${LIBS_PKG}==${ROCM_VERSION}" \
-    && /opt/venv/bin/hipconfig --version \
-    && git clone --no-checkout https://github.com/ROCm/aiter.git /tmp/aiter \
+RUN git clone --no-checkout https://github.com/ROCm/aiter.git /tmp/aiter \
     && cd /tmp/aiter \
     && git checkout "${AITER_COMMIT}" \
     && git submodule update --init --recursive \
-    && pip install --no-build-isolation --no-cache-dir . \
-    && cd / \
+    && GPU_ARCHS="${GPU_ARCH}" pip install --no-build-isolation --no-cache-dir . \
     && rm -rf /tmp/aiter
 
 WORKDIR /workspace/
diff --git a/.github/scripts/build_ci_deps_docker.sh b/.github/scripts/build_ci_deps_docker.sh
deleted file mode 100755
index 61b14e9f1..000000000
--- a/.github/scripts/build_ci_deps_docker.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# Build Dockerfile.ci.deps with a canonical image tag:
-#   <name>:rocm-<ROCM_VERSION>-ubuntu24.04-py312-<amdgpu-slug>
-# where <amdgpu-slug> is AMDGPU_FAMILY lowercased (e.g. gfx94x-dcgpu).
-#
-# Usage (from repo root):
-#   .github/scripts/build_ci_deps_docker.sh gfx94X-dcgpu
-#   ROCM_VERSION=7.12.0 IMAGE_NAME=my-registry/te-ci-deps .github/scripts/build_ci_deps_docker.sh gfx950-dcgpu
-#   IMAGE_TAG=my-custom-tag .github/scripts/build_ci_deps_docker.sh gfx94X-dcgpu   # override tag only
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
-
-AMDGPU_FAMILY="${1:?usage: $0 <gfx94X-dcgpu|gfx950-dcgpu>}"
-case "${AMDGPU_FAMILY}" in
-  gfx94X-dcgpu|gfx950-dcgpu) ;;
-  *) echo "AMDGPU_FAMILY must be gfx94X-dcgpu or gfx950-dcgpu" >&2; exit 1 ;;
-esac
-
-ROCM_VERSION="${ROCM_VERSION:-7.12.0}"
-IMAGE_NAME="${IMAGE_NAME:-te-ci-deps}"
-SLUG="$(echo "${AMDGPU_FAMILY}" | tr '[:upper:]' '[:lower:]')"
-DEFAULT_TAG="rocm-${ROCM_VERSION}-ubuntu24.04-py312-${SLUG}"
-IMAGE_TAG="${IMAGE_TAG:-${DEFAULT_TAG}}"
-
-exec docker build \
-  -f "${SCRIPT_DIR}/Dockerfile.ci.deps" \
-  --build-arg "ROCM_VERSION=${ROCM_VERSION}" \
-  --build-arg "AMDGPU_FAMILY=${AMDGPU_FAMILY}" \
-  -t "${IMAGE_NAME}:${IMAGE_TAG}" \
-  "${REPO_ROOT}"

From 13aabbf0303ceae6cbd76abf7ade7ed9016d7b5b Mon Sep 17 00:00:00 2001
From: Veera Rajasekhar Reddy Gopu <veerarajasekharreddy.gopu@amd.com>
Date: Fri, 29 May 2026 15:32:29 +0000
Subject: [PATCH 3/7] Use per-arch CI images and switch attention benchmark to rocprofv3

---
 .github/scripts/Dockerfile.ci.deps            |  2 +-
 .github/workflows/ci-deps-docker-publish.yml  | 30 +++++----
 .github/workflows/rocm-ci.yml                 | 28 +++++---
 .../attention/benchmark_attention_rocm.py     | 67 +++++++++++--------
 ci/README.md                                  |  9 +++
 ci/ci_config.json                             |  5 +-
 6 files changed, 89 insertions(+), 52 deletions(-)

diff --git a/.github/scripts/Dockerfile.ci.deps b/.github/scripts/Dockerfile.ci.deps
index 3d31de954..aff7f53c1 100644
--- a/.github/scripts/Dockerfile.ci.deps
+++ b/.github/scripts/Dockerfile.ci.deps
@@ -59,7 +59,7 @@ ENV GPU_ARCH=${GPU_ARCH} \
     LD_LIBRARY_PATH=/opt/rocm/lib
 
 RUN python -m pip install --upgrade pip setuptools wheel \
-    && pip install --no-cache-dir ipython pytest fire pydantic pybind11 ninja pandas
+    && pip install --no-cache-dir ipython pytest fire pydantic pybind11 ninja pandas expecttest
 
 # Python ROCm SDK + torch / jax from https://repo.amd.com/rocm/whl/<AMD_GPU_FAMILY>/
 RUN AMD_GPU_FAMILY=$(cat /etc/amd_gpu_family) \
diff --git a/.github/workflows/ci-deps-docker-publish.yml b/.github/workflows/ci-deps-docker-publish.yml
index e776cb865..2735c4530 100644
--- a/.github/workflows/ci-deps-docker-publish.yml
+++ b/.github/workflows/ci-deps-docker-publish.yml
@@ -2,13 +2,16 @@
 #
 # See LICENSE for license information.
 #
-# Build .github/scripts/Dockerfile.ci.deps and push to Artifactory.
+# Build .github/scripts/Dockerfile.ci.deps and push to Harbor.
 #
 # Tag convention (when image_tag is left empty):
-#   rocm-<rocm_version>-ubuntu24.04-py312-<amdgpu-slug>
-# e.g. rocm-7.12.0-ubuntu24.04-py312-gfx94x-dcgpu
+#   rocm-<rocm_version>-ubuntu24.04-py312-<gpu_arch>
+# e.g. rocm-7.12.0-ubuntu24.04-py312-gfx942
 #
-# Local builds: .github/scripts/build_ci_deps_docker.sh <gfx94X-dcgpu|gfx950-dcgpu>
+# Local builds:
+#   docker build -f .github/scripts/Dockerfile.ci.deps \
+#     --build-arg GPU_ARCH=gfx942 \
+#     -t registry-sc-harbor.amd.com/framework/te-ci:rocm-7.12.0-ubuntu24.04-py312-gfx942 .
 #
 # Required repository secrets:
 #   ARTIFACTORY_DOCKER_USERNAME / ARTIFACTORY_DOCKER_PASSWORD — registry basic auth
@@ -22,20 +25,20 @@ name: Publish CI deps Docker image
 on:
   workflow_dispatch:
     inputs:
-      amd_gpu_family:
-        description: "TheRock / wheel AMDGPU family (also passed as AMDGPU_FAMILY build-arg)"
+      gpu_arch:
+        description: "GPU architecture for the CI deps image (Dockerfile GPU_ARCH build-arg)"
         required: true
         type: choice
         options:
-          - gfx94X-dcgpu
-          - gfx950-dcgpu
+          - gfx942
+          - gfx950
       rocm_version:
         description: "ROCm version string (must match Dockerfile wheel pins when changed)"
         required: false
         type: string
         default: "7.12.0"
       image_tag:
-        description: "Tag to push; leave empty for rocm-<ver>-ubuntu24.04-py312-<family-slug> (family lowercased)"
+        description: "Tag to push; leave empty for rocm-<ver>-ubuntu24.04-py312-<gpu_arch>"
         required: false
         type: string
         default: ""
@@ -57,7 +60,7 @@ jobs:
           REGISTRY: ${{ vars.ARTIFACTORY_DOCKER_REGISTRY }}
           REPOSITORY: ${{ vars.ARTIFACTORY_CI_DEPS_REPOSITORY }}
           IMAGE_TAG_INPUT: ${{ inputs.image_tag }}
-          AMDGPU_FAMILY: ${{ inputs.amd_gpu_family }}
+          GPU_ARCH: ${{ inputs.gpu_arch }}
           ROCM_VERSION: ${{ inputs.rocm_version }}
         run: |
           set -euo pipefail
@@ -66,11 +69,10 @@ jobs:
             exit 1
           fi
           ROCM_VER="${ROCM_VERSION:-7.12.0}"
-          SLUG="$(echo "${AMDGPU_FAMILY}" | tr '[:upper:]' '[:lower:]')"
           if [ -n "${IMAGE_TAG_INPUT}" ]; then
             echo "IMAGE_TAG=${IMAGE_TAG_INPUT}" >> "${GITHUB_ENV}"
           else
-            echo "IMAGE_TAG=rocm-${ROCM_VER}-ubuntu24.04-py312-${SLUG}" >> "${GITHUB_ENV}"
+            echo "IMAGE_TAG=rocm-${ROCM_VER}-ubuntu24.04-py312-${GPU_ARCH}" >> "${GITHUB_ENV}"
           fi
 
       - name: Log in to container registry
@@ -84,7 +86,7 @@ jobs:
         env:
           REGISTRY: ${{ vars.ARTIFACTORY_DOCKER_REGISTRY }}
           REPOSITORY: ${{ vars.ARTIFACTORY_CI_DEPS_REPOSITORY }}
-          AMDGPU_FAMILY: ${{ inputs.amd_gpu_family }}
+          GPU_ARCH: ${{ inputs.gpu_arch }}
           ROCM_VERSION_INPUT: ${{ inputs.rocm_version }}
         run: |
           set -euo pipefail
@@ -94,7 +96,7 @@ jobs:
           docker build \
             -f .github/scripts/Dockerfile.ci.deps \
             --build-arg "ROCM_VERSION=${ROCM_VER}" \
-            --build-arg "AMDGPU_FAMILY=${AMDGPU_FAMILY}" \
+            --build-arg "GPU_ARCH=${GPU_ARCH}" \
             -t "${FULL_IMAGE}:${IMAGE_TAG}" \
             .
           docker push "${FULL_IMAGE}:${IMAGE_TAG}"
diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml
index e2fb09c15..bae2b201a 100644
--- a/.github/workflows/rocm-ci.yml
+++ b/.github/workflows/rocm-ci.yml
@@ -54,7 +54,8 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 10
     outputs:
-      image-tag: ${{ steps.select-image.outputs.image-tag }}
+      image-tag-mi30x: ${{ steps.select-image.outputs.image-tag-mi30x }}
+      image-tag-mi35x: ${{ steps.select-image.outputs.image-tag-mi35x }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v6
@@ -88,16 +89,25 @@ jobs:
           fi
 
           echo "Selected config key: $JSON_KEY"
-          IMAGE_TO_USE=$(jq -r --arg key "$JSON_KEY" '.docker_images[$key]' ci/ci_config.json)
+          CONFIG_ENTRY=$(jq -c --arg key "$JSON_KEY" '.docker_images[$key]' ci/ci_config.json)
 
           MANUAL_OVERRIDE="${{ inputs.docker_image_override }}"
           if [[ -n "$MANUAL_OVERRIDE" ]]; then
             echo "::notice::Manual override detected: $MANUAL_OVERRIDE"
-            IMAGE_TO_USE="$MANUAL_OVERRIDE"
+            IMAGE_MI30X="$MANUAL_OVERRIDE"
+            IMAGE_MI35X="$MANUAL_OVERRIDE"
+          elif jq -e '.mi30x and .mi35x' <<< "$CONFIG_ENTRY" > /dev/null; then
+            IMAGE_MI30X=$(jq -r '.mi30x' <<< "$CONFIG_ENTRY")
+            IMAGE_MI35X=$(jq -r '.mi35x' <<< "$CONFIG_ENTRY")
+          else
+            IMAGE_MI30X=$(jq -r '.' <<< "$CONFIG_ENTRY")
+            IMAGE_MI35X="$IMAGE_MI30X"
           fi
 
-          echo "Selected image: $IMAGE_TO_USE"
-          echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT
+          echo "Selected mi30x (gfx942) image: $IMAGE_MI30X"
+          echo "Selected mi35x (gfx950) image: $IMAGE_MI35X"
+          echo "image-tag-mi30x=$IMAGE_MI30X" >> $GITHUB_OUTPUT
+          echo "image-tag-mi35x=$IMAGE_MI35X" >> $GITHUB_OUTPUT
 
   build:
     # Delegate wheel building to the reusable workflow on dev. It produces a core .whl plus framework .tar.gz sdists under artifact name `te-rocm-wheels`.
@@ -140,7 +150,7 @@ jobs:
 
       - name: Pull Docker Image
         run: |
-          docker pull ${{ needs.select_image.outputs.image-tag }}
+          docker pull ${{ matrix.arch_label == 'mi30x' && needs.select_image.outputs.image-tag-mi30x || needs.select_image.outputs.image-tag-mi35x }}
 
       - name: Run Container
         run: |
@@ -155,7 +165,7 @@ jobs:
             --group-add $(getent group video | cut -d: -f3) \
             -v "${{ github.workspace }}:/workspace" \
             -w /workspace \
-            ${{ needs.select_image.outputs.image-tag }}
+            ${{ matrix.arch_label == 'mi30x' && needs.select_image.outputs.image-tag-mi30x || needs.select_image.outputs.image-tag-mi35x }}
 
       - name: Install packages
         run: |
@@ -337,7 +347,7 @@ jobs:
 
       - name: Pull Docker Image
         run: |
-          docker pull ${{ needs.select_image.outputs.image-tag }}
+          docker pull ${{ matrix.arch_label == 'mi30x' && needs.select_image.outputs.image-tag-mi30x || needs.select_image.outputs.image-tag-mi35x }}
 
       - name: Run Container
         run: |
@@ -352,7 +362,7 @@ jobs:
             --group-add $(getent group video | cut -d: -f3) \
             -v "${{ github.workspace }}:/workspace" \
             -w /workspace \
-            ${{ needs.select_image.outputs.image-tag }}
+            ${{ matrix.arch_label == 'mi30x' && needs.select_image.outputs.image-tag-mi30x || needs.select_image.outputs.image-tag-mi35x }}
 
       - name: Install packages
         env:
diff --git a/benchmarks/attention/benchmark_attention_rocm.py b/benchmarks/attention/benchmark_attention_rocm.py
index b98e9c6e0..076a91de9 100644
--- a/benchmarks/attention/benchmark_attention_rocm.py
+++ b/benchmarks/attention/benchmark_attention_rocm.py
@@ -84,6 +84,9 @@
 output_csv = "times.csv"
 # Output directory name
 output_dir_name = "profiler_outputs"
+# rocprofv3 output prefix and kernel stats filename (see rocprofv3 -o/-d)
+rocprof_output_prefix = "results"
+rocprof_kernel_stats_csv = f"{rocprof_output_prefix}_kernel_stats.csv"
 # Current working directory
 cwd = os.getcwd()
 
@@ -137,7 +140,7 @@ def setup_backend_env(backend_name, use_ck_bwd_v3=True, use_ck_fwd_v3=True, use_
     "aotriton_bwd": "bwd",
 }
 
-# Runs benchmark with warmup iterations and profiles using rocprof
+# Runs benchmark with warmup iterations and profiles using rocprofv3
 def benchmark_dot_product_attention(model, attention, column_name, dirname):
     config = model_configs[model]
 
@@ -153,29 +156,34 @@ def benchmark_dot_product_attention(model, attention, column_name, dirname):
                 is_training,
             )
     os.makedirs(dirname, exist_ok=True)
-    before_files = set(os.listdir(cwd))
-    # Profiling command using rocprof
     benchmark_dir = os.path.dirname(os.path.abspath(__file__))
+    profiler_script = (
+        f"import sys; sys.path.insert(0, {benchmark_dir!r}); "
+        f"import benchmark_attention_rocm; "
+        f"benchmark_attention_rocm.benchmark_dot_product_attention_profiler("
+        f"{model!r}, {attention!r}, {column_name!r})"
+    )
+    # rocprofv3: --kernel-trace + --stats replaces rocprofv2 --hip-trace (kernel stats
+    # are not enabled by default). Full kernel names are kept (v2 --basenames off).
     prof_cmd = [
-            "rocprof",
-            "--hip-trace",
-            "--basenames off",
-            "python",
-            "-c",
-            f""" "import sys; sys.path.insert(0, '{benchmark_dir}'); import benchmark_attention_rocm;""",
-            f"""benchmark_attention_rocm.benchmark_dot_product_attention_profiler("""
-            f"""'{model}', '{attention}', '{column_name}')" """,
-        ]
-    prof_cmd = " ".join(prof_cmd)
-    subprocess.call(prof_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True)
-    after_files = set(os.listdir(cwd))
-    new_files = after_files - before_files
-
-    for f in new_files:
-        src_path = os.path.join(cwd, f)
-        dst_path = os.path.join(dirname, f)
-        if os.path.isfile(src_path):  # Only move files, not directories
-            shutil.move(src_path, dst_path)
+        "rocprofv3",
+        "--kernel-trace",
+        "--stats",
+        "-f", "csv",
+        "-o", rocprof_output_prefix,
+        "-d", dirname,
+        "--",
+        sys.executable,
+        "-c",
+        profiler_script,
+    ]
+    result = subprocess.run(prof_cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        print(
+            f"rocprofv3 failed for {model} [{attention}] (exit {result.returncode}):\n"
+            f"{result.stderr}",
+            file=sys.stderr,
+        )
     torch.cuda.empty_cache()
     
 # Runs profiler and records timing information
@@ -216,7 +224,10 @@ def calculate_attention_tflops(batch_size, seq_len, num_heads_q, head_dim_qk, fw
 
 # Helper function to extract timing results from profiler logs
 def parse_helper(model, dirname, fwd_search_pattern, bwd_search_pattern, column_name, df_times):
-    df = pd.read_csv(os.path.join(dirname, "results.stats.csv"))
+    stats_csv = os.path.join(dirname, rocprof_kernel_stats_csv)
+    if not os.path.isfile(stats_csv):
+        return False
+    df = pd.read_csv(stats_csv)
 
     # Extract kernel timing values
     fwd_values = df[df["Name"].str.contains(fwd_search_pattern, regex=False)]["AverageNs"].to_numpy()
@@ -281,7 +292,7 @@ def sanity_checks(
 ):
     """
     • Verifies that every model/backend that *should* have run produced
-        profiler_root/<dir>/results.stats.csv
+        profiler_root/<dir>/results_kernel_stats.csv
     • Non-zero exit code on any failure (CI friendly)
     """
     if profiler_root is None:
@@ -323,12 +334,14 @@ def sanity_checks(
         print(f"{model}:")
         # Rocprof run status
         for be, pat in expected.items():
-            stats = os.path.join(profiler_root, pat.format(model=model), "results.stats.csv")
+            stats = os.path.join(profiler_root, pat.format(model=model), rocprof_kernel_stats_csv)
             if os.path.isfile(stats):
                 print(f"  [{be:<22}] Profiling successful")
             else:
                 ok_overall = False
-                raise FileNotFoundError(f"Error while profiling {model} [{be}], results.stats.csv not found")
+                raise FileNotFoundError(
+                    f"Error while profiling {model} [{be}], {rocprof_kernel_stats_csv} not found"
+                )
 
         print("-" * 60)
     return ok_overall
@@ -347,7 +360,7 @@ def main(args):
     os.makedirs(output_dir)
 
     df_times = pd.DataFrame(index=indices, columns=columns)
-    df_times = df_times.infer_objects(copy=False)
+    df_times = df_times.infer_objects()
     df_times.fillna(0.0, inplace=True)
     df_times.index.name = "Model"
     df_times.to_csv(output_csv_path)
diff --git a/ci/README.md b/ci/README.md
index 07d5bd7d2..7c15748d0 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -21,4 +21,13 @@ It is the caller's responsibility to clean up generated files.
 
 Default and release-specific TE CI images are listed in [`ci_config.json`](ci_config.json) under `docker_images`.
 
+For `dev` and other branches using the `default` entry, images are selected per runner architecture:
+
+| Runner label | GPU arch | Image tag |
+|--------------|----------|-----------|
+| `linux-te-mi30x-*` | gfx942 (MI300X) | `rocm-7.12.0-ubuntu24.04-py312-gfx942_test` |
+| `linux-te-mi35x-*` | gfx950 (MI350X) | `rocm-7.12.0-ubuntu24.04-py312-gfx950_test` |
+
+Registry: `registry-sc-harbor.amd.com/framework/te-ci`
+
 The default image is built from [`.github/scripts/Dockerfile.ci.deps`](../.github/scripts/Dockerfile.ci.deps). It pins [ROCm/aiter](https://github.com/ROCm/aiter) at commit [`77455e3ecf4f0d28756afc452e914940c45b944b`](https://github.com/ROCm/aiter/commit/77455e3ecf4f0d28756afc452e914940c45b944b). That revision was validated in CI for **MXFP4 FP4 GEMM** kernel coverage.
diff --git a/ci/ci_config.json b/ci/ci_config.json
index 123ded73a..9ce3526fc 100644
--- a/ci/ci_config.json
+++ b/ci/ci_config.json
@@ -1,6 +1,9 @@
 {
   "docker_images": {
-    "default": "registry-sc-harbor.amd.com/framework/te-ci:rocm-7.2_ubuntu22.04_py3.11_pytorch_release-2.8_08d38866_jax_0.8.0_fa_2.8.1_aiter_77455e3ecf",
+    "default": {
+      "mi30x": "registry-sc-harbor.amd.com/framework/te-ci:rocm-7.12.0-ubuntu24.04-py312-gfx942_test",
+      "mi35x": "registry-sc-harbor.amd.com/framework/te-ci:rocm-7.12.0-ubuntu24.04-py312-gfx950_test"
+    },
     "release_v1.13": "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273",
     "release_v1.14": "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273"
   }

From 8ec362de4d94168ef55bf38c986ac89f059164f0 Mon Sep 17 00:00:00 2001
From: Veera Gopu <veerarajasekharreddy.gopu@amd.com>
Date: Wed, 3 Jun 2026 14:50:34 +0000
Subject: [PATCH 4/7] Fix sgpu tests: sysdeps lib path, onnxscript, rocprofv3 stats

---
 .github/scripts/Dockerfile.ci.deps            |   6 +-
 .github/workflows/ci-deps-docker-publish.yml  |   4 +
 .../attention/benchmark_attention_rocm.py     | 135 +++++++++++++-----
 3 files changed, 110 insertions(+), 35 deletions(-)

diff --git a/.github/scripts/Dockerfile.ci.deps b/.github/scripts/Dockerfile.ci.deps
index aff7f53c1..4e2fe1f6a 100644
--- a/.github/scripts/Dockerfile.ci.deps
+++ b/.github/scripts/Dockerfile.ci.deps
@@ -49,17 +49,17 @@ RUN python3.12 -m venv /opt/venv \
     && { echo ""; echo "# ROCm"; \
          echo 'export ROCM_PATH=/opt/rocm'; \
          echo 'export PATH="${ROCM_PATH}/bin:${PATH}"'; \
-         echo 'if [ -n "${LD_LIBRARY_PATH:-}" ]; then export LD_LIBRARY_PATH="${ROCM_PATH}/lib:${LD_LIBRARY_PATH}"; else export LD_LIBRARY_PATH="${ROCM_PATH}/lib"; fi'; \
+         echo 'export LD_LIBRARY_PATH="${ROCM_PATH}/lib/rocm_sysdeps/lib:${ROCM_PATH}/lib:${LD_LIBRARY_PATH:-}"'; \
        } >> /opt/venv/bin/activate
 
 ENV GPU_ARCH=${GPU_ARCH} \
     ROCM_PATH=/opt/rocm \
     VIRTUAL_ENV=/opt/venv \
     PATH=/opt/venv/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin \
-    LD_LIBRARY_PATH=/opt/rocm/lib
+    LD_LIBRARY_PATH=/opt/rocm/lib/rocm_sysdeps/lib:/opt/rocm/lib
 
 RUN python -m pip install --upgrade pip setuptools wheel \
-    && pip install --no-cache-dir ipython pytest fire pydantic pybind11 ninja pandas expecttest
+    && pip install --no-cache-dir ipython pytest fire pydantic pybind11 ninja pandas expecttest expecttest onnxscript
 
 # Python ROCm SDK + torch / jax from https://repo.amd.com/rocm/whl/<AMD_GPU_FAMILY>/
 RUN AMD_GPU_FAMILY=$(cat /etc/amd_gpu_family) \
diff --git a/.github/workflows/ci-deps-docker-publish.yml b/.github/workflows/ci-deps-docker-publish.yml
index 2735c4530..5c192ea1f 100644
--- a/.github/workflows/ci-deps-docker-publish.yml
+++ b/.github/workflows/ci-deps-docker-publish.yml
@@ -68,6 +68,10 @@ jobs:
             echo "Set repository variables ARTIFACTORY_DOCKER_REGISTRY and ARTIFACTORY_CI_DEPS_REPOSITORY." >&2
             exit 1
           fi
+          case "${GPU_ARCH}" in
+            gfx942|gfx950) ;;
+            *) echo "gpu_arch must be gfx942 or gfx950 (got '${GPU_ARCH}')" >&2; exit 1 ;;
+          esac
           ROCM_VER="${ROCM_VERSION:-7.12.0}"
           if [ -n "${IMAGE_TAG_INPUT}" ]; then
             echo "IMAGE_TAG=${IMAGE_TAG_INPUT}" >> "${GITHUB_ENV}"
diff --git a/benchmarks/attention/benchmark_attention_rocm.py b/benchmarks/attention/benchmark_attention_rocm.py
index 076a91de9..291c861da 100644
--- a/benchmarks/attention/benchmark_attention_rocm.py
+++ b/benchmarks/attention/benchmark_attention_rocm.py
@@ -4,9 +4,8 @@
 # 
 # See LICENSE for license information.
 
-import os, sys, time, shutil
+import os, sys, time, shutil, subprocess
 import argparse
-import subprocess
 import pandas as pd
 import numpy as np
 import torch
@@ -140,7 +139,102 @@ def setup_backend_env(backend_name, use_ck_bwd_v3=True, use_ck_fwd_v3=True, use_
     "aotriton_bwd": "bwd",
 }
 
-# Runs benchmark with warmup iterations and profiles using rocprofv3
+ROCPROF_STATS_CSV = "results.stats.csv"
+
+
+def _rocprof_executable():
+    """ROCm 7.x TheRock images ship rocprofv3; legacy rocprof may exist on older stacks."""
+    if shutil.which("rocprof"):
+        return "rocprof"
+    if shutil.which("rocprofv3"):
+        return "rocprofv3"
+    return None
+
+
+def _profiler_python_code(model, attention, column_name, benchmark_dir):
+    return (
+        f"import sys; sys.path.insert(0, {benchmark_dir!r}); "
+        f"import benchmark_attention_rocm; "
+        f"benchmark_attention_rocm.benchmark_dot_product_attention_profiler("
+        f"{model!r}, {attention!r}, {column_name!r})"
+    )
+
+
+def _collect_rocprofv3_kernel_stats(dirname):
+    """rocprofv3 writes <dirname>/<host>/<pid>_kernel_stats.csv; TE expects results.stats.csv."""
+    stats_path = os.path.join(dirname, ROCPROF_STATS_CSV)
+    candidates = []
+    for root, _, files in os.walk(dirname):
+        for name in files:
+            if name.endswith("_kernel_stats.csv"):
+                candidates.append(os.path.join(root, name))
+    if not candidates:
+        return
+    src = max(candidates, key=os.path.getmtime)
+    if os.path.abspath(src) != os.path.abspath(stats_path):
+        shutil.copy2(src, stats_path)
+
+
+def _run_attention_profiler(model, attention, column_name, dirname):
+    benchmark_dir = os.path.dirname(os.path.abspath(__file__))
+    py_code = _profiler_python_code(model, attention, column_name, benchmark_dir)
+    exe = _rocprof_executable()
+    if exe is None:
+        print(
+            "WARNING: rocprof/rocprofv3 not in PATH; kernel timing columns will be empty.",
+            file=sys.stderr,
+        )
+        return
+
+    if exe == "rocprofv3":
+        cmd = [
+            "rocprofv3",
+            "--hip-trace",
+            "--kernel-trace",
+            "--stats",
+            "-f",
+            "csv",
+            "-d",
+            dirname,
+            "--",
+            sys.executable,
+            "-c",
+            py_code,
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+    else:
+        before_files = set(os.listdir(cwd))
+        cmd = [
+            "rocprof",
+            "--hip-trace",
+            "--basenames",
+            "off",
+            sys.executable,
+            "-c",
+            py_code,
+        ]
+        result = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
+        if result.returncode == 0:
+            after_files = set(os.listdir(cwd))
+            for f in after_files - before_files:
+                src_path = os.path.join(cwd, f)
+                dst_path = os.path.join(dirname, f)
+                if os.path.isfile(src_path):
+                    shutil.move(src_path, dst_path)
+
+    if result.returncode != 0:
+        print(
+            f"WARNING: {exe} failed (exit {result.returncode}); "
+            f"see stderr below. Kernel timing columns may be empty.\n{result.stderr[-4000:]}",
+            file=sys.stderr,
+        )
+        return
+
+    if exe == "rocprofv3":
+        _collect_rocprofv3_kernel_stats(dirname)
+
+
+# Runs benchmark with warmup iterations and profiles using rocprof / rocprofv3
 def benchmark_dot_product_attention(model, attention, column_name, dirname):
     config = model_configs[model]
 
@@ -156,34 +250,7 @@ def benchmark_dot_product_attention(model, attention, column_name, dirname):
                 is_training,
             )
     os.makedirs(dirname, exist_ok=True)
-    benchmark_dir = os.path.dirname(os.path.abspath(__file__))
-    profiler_script = (
-        f"import sys; sys.path.insert(0, {benchmark_dir!r}); "
-        f"import benchmark_attention_rocm; "
-        f"benchmark_attention_rocm.benchmark_dot_product_attention_profiler("
-        f"{model!r}, {attention!r}, {column_name!r})"
-    )
-    # rocprofv3: --kernel-trace + --stats replaces rocprofv2 --hip-trace (kernel stats
-    # are not enabled by default). Full kernel names are kept (v2 --basenames off).
-    prof_cmd = [
-        "rocprofv3",
-        "--kernel-trace",
-        "--stats",
-        "-f", "csv",
-        "-o", rocprof_output_prefix,
-        "-d", dirname,
-        "--",
-        sys.executable,
-        "-c",
-        profiler_script,
-    ]
-    result = subprocess.run(prof_cmd, capture_output=True, text=True)
-    if result.returncode != 0:
-        print(
-            f"rocprofv3 failed for {model} [{attention}] (exit {result.returncode}):\n"
-            f"{result.stderr}",
-            file=sys.stderr,
-        )
+    _run_attention_profiler(model, attention, column_name, dirname)
     torch.cuda.empty_cache()
     
 # Runs profiler and records timing information
@@ -224,8 +291,12 @@ def calculate_attention_tflops(batch_size, seq_len, num_heads_q, head_dim_qk, fw
 
 # Helper function to extract timing results from profiler logs
 def parse_helper(model, dirname, fwd_search_pattern, bwd_search_pattern, column_name, df_times):
-    stats_csv = os.path.join(dirname, rocprof_kernel_stats_csv)
+    stats_csv = os.path.join(dirname, ROCPROF_STATS_CSV)
     if not os.path.isfile(stats_csv):
+        print(
+            f"WARNING: {stats_csv} missing for {model} [{column_name}]; skipping kernel parse.",
+            file=sys.stderr,
+        )
         return False
     df = pd.read_csv(stats_csv)
 

From abfd1d7748429b704dcc2de1d629cdfe885aaf43 Mon Sep 17 00:00:00 2001
From: Veera Rajasekhar Reddy Gopu <veerarajasekharreddy.gopu@amd.com>
Date: Thu, 4 Jun 2026 16:05:58 +0000
Subject: [PATCH 5/7] Fix sgpu tests: rocprofv3 only, CK v2 fallback, skip hipBLASLt gap

---
 .../attention/benchmark_attention_rocm.py     | 108 ++++++++----------
 tests/pytorch/test_fusible_ops.py             |  12 ++
 2 files changed, 58 insertions(+), 62 deletions(-)

diff --git a/benchmarks/attention/benchmark_attention_rocm.py b/benchmarks/attention/benchmark_attention_rocm.py
index 291c861da..c831906d8 100644
--- a/benchmarks/attention/benchmark_attention_rocm.py
+++ b/benchmarks/attention/benchmark_attention_rocm.py
@@ -142,15 +142,6 @@ def setup_backend_env(backend_name, use_ck_bwd_v3=True, use_ck_fwd_v3=True, use_
 ROCPROF_STATS_CSV = "results.stats.csv"
 
 
-def _rocprof_executable():
-    """ROCm 7.x TheRock images ship rocprofv3; legacy rocprof may exist on older stacks."""
-    if shutil.which("rocprof"):
-        return "rocprof"
-    if shutil.which("rocprofv3"):
-        return "rocprofv3"
-    return None
-
-
 def _profiler_python_code(model, attention, column_name, benchmark_dir):
     return (
         f"import sys; sys.path.insert(0, {benchmark_dir!r}); "
@@ -178,63 +169,33 @@ def _collect_rocprofv3_kernel_stats(dirname):
 def _run_attention_profiler(model, attention, column_name, dirname):
     benchmark_dir = os.path.dirname(os.path.abspath(__file__))
     py_code = _profiler_python_code(model, attention, column_name, benchmark_dir)
-    exe = _rocprof_executable()
-    if exe is None:
-        print(
-            "WARNING: rocprof/rocprofv3 not in PATH; kernel timing columns will be empty.",
-            file=sys.stderr,
-        )
-        return
-
-    if exe == "rocprofv3":
-        cmd = [
-            "rocprofv3",
-            "--hip-trace",
-            "--kernel-trace",
-            "--stats",
-            "-f",
-            "csv",
-            "-d",
-            dirname,
-            "--",
-            sys.executable,
-            "-c",
-            py_code,
-        ]
-        result = subprocess.run(cmd, capture_output=True, text=True)
-    else:
-        before_files = set(os.listdir(cwd))
-        cmd = [
-            "rocprof",
-            "--hip-trace",
-            "--basenames",
-            "off",
-            sys.executable,
-            "-c",
-            py_code,
-        ]
-        result = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
-        if result.returncode == 0:
-            after_files = set(os.listdir(cwd))
-            for f in after_files - before_files:
-                src_path = os.path.join(cwd, f)
-                dst_path = os.path.join(dirname, f)
-                if os.path.isfile(src_path):
-                    shutil.move(src_path, dst_path)
-
+    cmd = [
+        "rocprofv3",
+        "--hip-trace",
+        "--kernel-trace",
+        "--stats",
+        "-f",
+        "csv",
+        "-d",
+        dirname,
+        "--",
+        sys.executable,
+        "-c",
+        py_code,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
     if result.returncode != 0:
         print(
-            f"WARNING: {exe} failed (exit {result.returncode}); "
+            f"WARNING: rocprofv3 failed (exit {result.returncode}); "
             f"see stderr below. Kernel timing columns may be empty.\n{result.stderr[-4000:]}",
             file=sys.stderr,
         )
         return
 
-    if exe == "rocprofv3":
-        _collect_rocprofv3_kernel_stats(dirname)
+    _collect_rocprofv3_kernel_stats(dirname)
 
 
-# Runs benchmark with warmup iterations and profiles using rocprof / rocprofv3
+# Runs benchmark with warmup iterations and profiles using rocprofv3
 def benchmark_dot_product_attention(model, attention, column_name, dirname):
     config = model_configs[model]
 
@@ -290,7 +251,17 @@ def calculate_attention_tflops(batch_size, seq_len, num_heads_q, head_dim_qk, fw
     return fwd_tflops, bwd_tflops
 
 # Helper function to extract timing results from profiler logs
-def parse_helper(model, dirname, fwd_search_pattern, bwd_search_pattern, column_name, df_times):
+def parse_helper(
+    model,
+    dirname,
+    fwd_search_pattern,
+    bwd_search_pattern,
+    column_name,
+    df_times,
+    *,
+    fwd_fallback=None,
+    bwd_fallback=None,
+):
     stats_csv = os.path.join(dirname, ROCPROF_STATS_CSV)
     if not os.path.isfile(stats_csv):
         print(
@@ -302,7 +273,11 @@ def parse_helper(model, dirname, fwd_search_pattern, bwd_search_pattern, column_
 
     # Extract kernel timing values
     fwd_values = df[df["Name"].str.contains(fwd_search_pattern, regex=False)]["AverageNs"].to_numpy()
+    if len(fwd_values) == 0 and fwd_fallback is not None:
+        fwd_values = df[df["Name"].str.contains(fwd_fallback, regex=False)]["AverageNs"].to_numpy()
     bwd_values = df[df["Name"].str.contains(bwd_search_pattern, regex=False)]["AverageNs"].to_numpy()
+    if len(bwd_values) == 0 and bwd_fallback is not None:
+        bwd_values = df[df["Name"].str.contains(bwd_fallback, regex=False)]["AverageNs"].to_numpy()
 
     if len(fwd_values) == 0 or len(bwd_values) == 0:
         return False  # Kernels not found
@@ -342,7 +317,16 @@ def parse_results(model, df_times, perf_dir_flash_attn, perf_dir_fused_ck, perf_
     if perf_dir_fused_ck:
         fwd_pattern = KERNEL_PATTERNS["ck_fwd_v3"] if use_ck_fwd_v3 else KERNEL_PATTERNS["ck_fwd_v2"]
         bwd_pattern = KERNEL_PATTERNS["ck_bwd_v3"] if use_ck_bwd_v3 else KERNEL_PATTERNS["ck_bwd_v2"]
-        parse_helper(model, perf_dir_fused_ck, fwd_pattern, bwd_pattern, "FusedAttention CK", df_times)
+        parse_helper(
+            model,
+            perf_dir_fused_ck,
+            fwd_pattern,
+            bwd_pattern,
+            "FusedAttention CK",
+            df_times,
+            fwd_fallback=KERNEL_PATTERNS["ck_fwd_v2"] if use_ck_fwd_v3 else None,
+            bwd_fallback=KERNEL_PATTERNS["ck_bwd_v2"] if use_ck_bwd_v3 else None,
+        )
     
     # Parse AOTriton
     if perf_dir_fused_aotriton:
@@ -363,7 +347,7 @@ def sanity_checks(
 ):
     """
     • Verifies that every model/backend that *should* have run produced
-        profiler_root/<dir>/results_kernel_stats.csv
+        profiler_root/<dir>/results.stats.csv
     • Non-zero exit code on any failure (CI friendly)
     """
     if profiler_root is None:
@@ -405,13 +389,13 @@ def sanity_checks(
         print(f"{model}:")
         # Rocprof run status
         for be, pat in expected.items():
-            stats = os.path.join(profiler_root, pat.format(model=model), rocprof_kernel_stats_csv)
+            stats = os.path.join(profiler_root, pat.format(model=model), ROCPROF_STATS_CSV)
             if os.path.isfile(stats):
                 print(f"  [{be:<22}] Profiling successful")
             else:
                 ok_overall = False
                 raise FileNotFoundError(
-                    f"Error while profiling {model} [{be}], {rocprof_kernel_stats_csv} not found"
+                    f"Error while profiling {model} [{be}], {ROCPROF_STATS_CSV} not found"
                 )
 
         print("-" * 60)
diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py
index 01d0fa1b6..34bb0065a 100644
--- a/tests/pytorch/test_fusible_ops.py
+++ b/tests/pytorch/test_fusible_ops.py
@@ -856,6 +856,18 @@ def _test_basic_linear(
         out_shape = in_shape[:-1] + [out_features]
 
         # Skip invalid configurations
+        if (
+            IS_HIP_EXTENSION
+            and get_device_compute_capability() in ((9, 5), (9, 4))
+            and accumulate_into_main_grad
+            and quantization is None
+            and weight_shape == (3, 5)
+            and dtype in (torch.float16, torch.bfloat16)
+        ):
+            pytest.skip(
+                "hipBLASLt does not provide suitable algorithms for this config"
+            )
+
         maybe_skip_quantization(quantization, dims=in_shape, device=device, dtype=dtype)
         maybe_skip_quantization(quantization, dims=out_shape)
         quantization_needed = any(

From 0110a252c5ec7c61e2216506ae3d187a88e0fdf0 Mon Sep 17 00:00:00 2001
From: Veera Rajasekhar Reddy Gopu <veerarajasekharreddy.gopu@amd.com>
Date: Fri, 5 Jun 2026 11:33:11 -0500
Subject: [PATCH 6/7] Delete task.txt

---
 task.txt | 11 -----------
 1 file changed, 11 deletions(-)
 delete mode 100644 task.txt

diff --git a/task.txt b/task.txt
deleted file mode 100644
index 23a319960..000000000
--- a/task.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-create the docker image with ubuntu 24.04 and install the rock using the info from 
-
-every thibng to install rocm and all are founf at https://repo.amd.com/rocm/whl/ 
-
-Current released is 7.12.0 and 
-
-my wheels for torch, torchvision, torchaudio, jax, jaxlib, triton are at https://repo.amd.com/rocm/whl/  see if you can fetch the info from here
-
-I want to pickfor these two archs gfx94X-dcgpu gfx950-dcgpu and python version 3.12
-
-update @.github/scripts/Dockerfile.ci.deps as well, to install for one arch by default but provide the arg to change the arch so that it installs those

From df082fc9e805973f908c667b0068f82a3c45755d Mon Sep 17 00:00:00 2001
From: Veera Rajasekhar Reddy Gopu <veerarajasekharreddy.gopu@amd.com>
Date: Fri, 5 Jun 2026 16:57:57 +0000
Subject: [PATCH 7/7] refactor Dockerfile

---
 .github/scripts/Dockerfile.ci.deps | 47 +++++++++++++++---------------
 ci/README.md                       |  2 --
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/.github/scripts/Dockerfile.ci.deps b/.github/scripts/Dockerfile.ci.deps
index 4e2fe1f6a..7784014f6 100644
--- a/.github/scripts/Dockerfile.ci.deps
+++ b/.github/scripts/Dockerfile.ci.deps
@@ -2,16 +2,12 @@
 #
 # See LICENSE for license information.
 #
-# TE CI deps image: Ubuntu 24.04 + one TheRock ROCm tarball + Python 3.12 venv.
+# TE CI deps image: Ubuntu 24.04 + TheRock ROCm tarball
 #
 #   docker build -f .github/scripts/Dockerfile.ci.deps \
 #     --build-arg GPU_ARCH=gfx942 \
 #     -t te-ci-deps:rocm-7.12.0-ubuntu24.04-py312-gfx942 .
 #
-#   docker build -f .github/scripts/Dockerfile.ci.deps \
-#     --build-arg GPU_ARCH=gfx950 \
-#     -t te-ci-deps:rocm-7.12.0-ubuntu24.04-py312-gfx950 .
-#
 # ROCm installer: https://raw.githubusercontent.com/ROCm/TheRock/main/dockerfiles/install_rocm_tarball.sh
 
 FROM ubuntu:24.04
@@ -21,6 +17,15 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
 ARG ROCM_VERSION=7.12.0
 ARG GPU_ARCH=gfx942
+ARG PYTHON_VERSION=3.12
+ARG PYTHON_ABI=cp312
+ARG TORCH_VERSION=2.10.0
+ARG TORCHVISION_VERSION=0.25.0
+ARG TORCHAUDIO_VERSION=2.10.0
+ARG TRITON_VERSION=3.6.0
+ARG JAX_VERSION=0.8.2
+ARG FA_VERSION=v2.8.1
+ARG AITER_COMMIT=77455e3ecf4f0d28756afc452e914940c45b944b
 ARG INSTALL_ROCM_TARBALL_SH_URL=https://raw.githubusercontent.com/ROCm/TheRock/main/dockerfiles/install_rocm_tarball.sh
 
 # Map GPU_ARCH → AMD GPU wheel/tarball family (once).
@@ -35,7 +40,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     ca-certificates curl \
     git vim \
     build-essential cmake ninja-build pkg-config liblzma-dev \
-    python3.12 python3.12-venv python3.12-dev python3-pip \
+    python${PYTHON_VERSION} python${PYTHON_VERSION}-venv python${PYTHON_VERSION}-dev python3-pip \
     && rm -rf /var/lib/apt/lists/*
 
 # Native ROCm tarball → /opt/rocm
@@ -45,13 +50,10 @@ RUN AMD_GPU_FAMILY=$(cat /etc/amd_gpu_family) \
     && /tmp/install_rocm_tarball.sh "${ROCM_VERSION}" "${AMD_GPU_FAMILY}" stable \
     && rm -f /tmp/install_rocm_tarball.sh
 
-RUN python3.12 -m venv /opt/venv \
-    && { echo ""; echo "# ROCm"; \
-         echo 'export ROCM_PATH=/opt/rocm'; \
-         echo 'export PATH="${ROCM_PATH}/bin:${PATH}"'; \
-         echo 'export LD_LIBRARY_PATH="${ROCM_PATH}/lib/rocm_sysdeps/lib:${ROCM_PATH}/lib:${LD_LIBRARY_PATH:-}"'; \
-       } >> /opt/venv/bin/activate
+# Isolated Python env for pip packages
+RUN python${PYTHON_VERSION} -m venv /opt/venv
 
+# Default container env: venv on PATH, ROCm toolchain + runtime libs, GPU arch for builds.
 ENV GPU_ARCH=${GPU_ARCH} \
     ROCM_PATH=/opt/rocm \
     VIRTUAL_ENV=/opt/venv \
@@ -59,35 +61,34 @@ ENV GPU_ARCH=${GPU_ARCH} \
     LD_LIBRARY_PATH=/opt/rocm/lib/rocm_sysdeps/lib:/opt/rocm/lib
 
 RUN python -m pip install --upgrade pip setuptools wheel \
-    && pip install --no-cache-dir ipython pytest fire pydantic pybind11 ninja pandas expecttest expecttest onnxscript
+    && pip install --no-cache-dir ipython pytest fire pydantic pybind11 ninja pandas expecttest onnxscript
 
 # Python ROCm SDK + torch / jax from https://repo.amd.com/rocm/whl/<AMD_GPU_FAMILY>/
 RUN AMD_GPU_FAMILY=$(cat /etc/amd_gpu_family) \
     && W="https://repo.amd.com/rocm/whl/${AMD_GPU_FAMILY}" \
     && LIBS_PKG="rocm-sdk-libraries-$(echo "${AMD_GPU_FAMILY}" | tr '[:upper:]' '[:lower:]')" \
+    && ROCM_WHEEL_TAG="rocm${ROCM_VERSION}" \
     && pip install --no-cache-dir \
         --extra-index-url "${W}" \
         "rocm-sdk-core==${ROCM_VERSION}" \
         "rocm-sdk-devel==${ROCM_VERSION}" \
         "${LIBS_PKG}==${ROCM_VERSION}" \
         "${W}/rocm-${ROCM_VERSION}.tar.gz" \
-        "${W}/torch-2.10.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
-        "${W}/torchvision-0.25.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
-        "${W}/torchaudio-2.10.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
-        "${W}/triton-3.6.0%2Brocm7.12.0-cp312-cp312-linux_x86_64.whl" \
-        "${W}/jax_rocm7_pjrt-0.8.2%2Brocm7.12.0-py3-none-manylinux_2_28_x86_64.whl" \
-        "${W}/jax_rocm7_plugin-0.8.2%2Brocm7.12.0-cp312-cp312-manylinux_2_28_x86_64.whl" \
-        "${W}/jaxlib-0.8.2%2Brocm7.12.0-cp312-cp312-manylinux_2_27_x86_64.whl" \
-        "jax==0.8.2"
+        "${W}/torch-${TORCH_VERSION}%2B${ROCM_WHEEL_TAG}-${PYTHON_ABI}-${PYTHON_ABI}-linux_x86_64.whl" \
+        "${W}/torchvision-${TORCHVISION_VERSION}%2B${ROCM_WHEEL_TAG}-${PYTHON_ABI}-${PYTHON_ABI}-linux_x86_64.whl" \
+        "${W}/torchaudio-${TORCHAUDIO_VERSION}%2B${ROCM_WHEEL_TAG}-${PYTHON_ABI}-${PYTHON_ABI}-linux_x86_64.whl" \
+        "${W}/triton-${TRITON_VERSION}%2B${ROCM_WHEEL_TAG}-${PYTHON_ABI}-${PYTHON_ABI}-linux_x86_64.whl" \
+        "${W}/jax_rocm7_pjrt-${JAX_VERSION}%2B${ROCM_WHEEL_TAG}-py3-none-manylinux_2_28_x86_64.whl" \
+        "${W}/jax_rocm7_plugin-${JAX_VERSION}%2B${ROCM_WHEEL_TAG}-${PYTHON_ABI}-${PYTHON_ABI}-manylinux_2_28_x86_64.whl" \
+        "${W}/jaxlib-${JAX_VERSION}%2B${ROCM_WHEEL_TAG}-${PYTHON_ABI}-${PYTHON_ABI}-manylinux_2_27_x86_64.whl" \
+        "jax==${JAX_VERSION}"
 
-ARG FA_VERSION=v2.8.1
 RUN git clone --branch "${FA_VERSION}" --depth 1 https://github.com/Dao-AILab/flash-attention.git /tmp/flash-attention \
     && cd /tmp/flash-attention \
     && GPU_ARCHS="${GPU_ARCH}" FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE FLASH_ATTENTION_SKIP_CK_BUILD=FALSE \
        python setup.py install \
     && rm -rf /tmp/flash-attention
 
-ARG AITER_COMMIT=77455e3ecf4f0d28756afc452e914940c45b944b
 RUN git clone --no-checkout https://github.com/ROCm/aiter.git /tmp/aiter \
     && cd /tmp/aiter \
     && git checkout "${AITER_COMMIT}" \
diff --git a/ci/README.md b/ci/README.md
index 7c15748d0..621c5a3a7 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -28,6 +28,4 @@ For `dev` and other branches using the `default` entry, images are selected per
 | `linux-te-mi30x-*` | gfx942 (MI300X) | `rocm-7.12.0-ubuntu24.04-py312-gfx942_test` |
 | `linux-te-mi35x-*` | gfx950 (MI350X) | `rocm-7.12.0-ubuntu24.04-py312-gfx950_test` |
 
-Registry: `registry-sc-harbor.amd.com/framework/te-ci`
-
 The default image is built from [`.github/scripts/Dockerfile.ci.deps`](../.github/scripts/Dockerfile.ci.deps). It pins [ROCm/aiter](https://github.com/ROCm/aiter) at commit [`77455e3ecf4f0d28756afc452e914940c45b944b`](https://github.com/ROCm/aiter/commit/77455e3ecf4f0d28756afc452e914940c45b944b). That revision was validated in CI for **MXFP4 FP4 GEMM** kernel coverage.