C2SM · edopao · Mar 12, 2026 · Mar 13, 2026
diff --git a/AMD_INTRODUCTION.md b/AMD_INTRODUCTION.md
@@ -0,0 +1,44 @@
+# Icon4py performance on MI300
+
+## Quickstart
+
+```
+# Connect to Beverin (CSCS system with MI300A)
+ssh beverin.cscs.ch
+```
+
+On Beverin:
+
+```
+# Enter scratch directory
+cd $SCRATCH
+
+# Clone icon4py and checkout the correct branch
+git clone [email protected]:C2SM/icon4py.git
+cd icon4py
+git checkout amd_profiling
+
+# Pull the correct `uenv` image. *!* NECESSARY ONLY ONCE *!*
+uenv image pull build::prgenv-gnu/25.12:2333839235
+
+# Start the uenv and mount the ROCm 7.1.0 environment. *!* This must be executed every time before running anything *!*
+uenv start --view default prgenv-gnu/25.12:2333839235
+
+# Install the necessary venv
+bash amd_scripts/install_icon4py_venv.sh
+
+# Source venv
+source .venv/bin/activate
+
+# Source other necessary environment variables
+source amd_scripts/setup_env.sh
+
+# Set GT4Py related environment variables
+export GT4PY_UNSTRUCTURED_HORIZONTAL_HAS_UNIT_STRIDE="1"
+export GT4PY_BUILD_CACHE_LIFETIME=persistent
+export GT4PY_BUILD_CACHE_DIR=amd_profiling_granule
+export GT4PY_COLLECT_METRICS_LEVEL=10
+export GT4PY_DYCORE_ENABLE_METRICS="1"
+export GT4PY_ADD_GPU_TRACE_MARKERS="1"
+export HIPFLAGS="-std=c++17 -fPIC -O3 -march=native -Wno-unused-parameter -save-temps -Rpass-analysis=kernel-resource-usage"
+```
diff --git a/amd_scripts/install_icon4py_venv.sh b/amd_scripts/install_icon4py_venv.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+
+date
+
+# Go to the root of the icon4py repository to run the installation from there
+ICON4PY_GIT_ROOT=$(git rev-parse --show-toplevel)
+cd $ICON4PY_GIT_ROOT
+
+# Set necessary flags for compilation
+source $ICON4PY_GIT_ROOT/amd_scripts/setup_env.sh
+
+# Install uv locally
+export PATH="$PWD/bin:$PATH"
+if [ ! -x "$PWD/bin/uv" ]; then
+    curl -LsSf https://astral.sh/uv/install.sh | UV_UNMANAGED_INSTALL="$PWD/bin" sh
+else
+    echo "# uv already installed at $PWD/bin/uv"
+fi
+
+# Install icon4py, gt4py, DaCe and other basic dependencies using uv
+uv sync --extra rocm7_0 --python $(which python3.12)
+
+# Activate virtual environment
+source .venv/bin/activate
+
+# Install the requirements for rocprofiler-compute so we can run the profiler from the same environment
+uv pip install -r /user-environment/linux-zen3/rocprofiler-compute-7.1.0-rjjjgkz67w66bp46jw7bvlfyduzr6vhv/libexec/rocprofiler-compute/requirements.txt
+
+echo "# install done"
+date
diff --git a/amd_scripts/setup_env.sh b/amd_scripts/setup_env.sh
@@ -0,0 +1,13 @@
+export CC="$(which gcc)"
+export MPICH_CC="$(which gcc)"
+export CXX="$(which g++)"
+export MPICH_CXX="$(which g++)"
+export HUGETLB_ELFMAP="no"
+export HUGETLB_MORECORE="no"
+export PYTHONOPTIMIZE="2"
+export HCC_AMDGPU_TARGET="gfx942"
+export ROCM_HOME="/user-environment/env/default"
+export HIPCC=$(which hipcc)
+export ROCM_VERSION="7.1.0"
+export LD_LIBRARY_PATH=/user-environment/linux-zen3/rocprofiler-dev-7.1.0-i7wbbbgrx7jjp4o2xroyj5j263dkzplv/lib:$LD_LIBRARY_PATH
+export LD_PRELOAD=/user-environment/env/default/lib/libomp.so:$LD_PRELOAD
diff --git a/model/common/pyproject.toml b/model/common/pyproject.toml
@@ -60,6 +60,7 @@ io = [
   "uxarray==2024.3.0",
   "xarray[complete]>=2024.3.0"
 ]
+rocm7_0 = ['amd-cupy>=13.0']  # TODO(havogt): add gt4py[rocm7_0] once available
 
 [project.urls]
 repository = "https://github.com/C2SM/icon4py"

diff --git a/model/common/src/icon4py/model/common/model_options.py b/model/common/src/icon4py/model/common/model_options.py
@@ -28,6 +28,7 @@ def dict_values_to_list(d: dict[str, Any]) -> dict[str, list]:
 def get_dace_options(
     program_name: str, **backend_descriptor: Any
 ) -> model_backends.BackendDescriptor:
+    is_rocm_backend = backend_descriptor.get("device") == model_backends.DeviceType.ROCM
     optimization_args = backend_descriptor.get("optimization_args", {})
     optimization_hooks = optimization_args.get("optimization_hooks", {})
     if program_name in [
@@ -54,8 +55,9 @@ def get_dace_options(
         backend_descriptor["use_zero_origin"] = True
     if program_name == "graupel_run":
         optimization_args["fuse_tasklets"] = True
-        optimization_args["gpu_maxnreg"] = 80
-        optimization_args["gpu_block_size_2d"] = (64, 6)
+        if not is_rocm_backend:
+            optimization_args["gpu_maxnreg"] = 80
+            optimization_args["gpu_block_size_2d"] = (64, 6)
         optimization_args["gpu_memory_pool"] = False
         optimization_args["make_persistent"] = True
     if optimization_hooks:

diff --git a/pyproject.toml b/pyproject.toml
@@ -106,6 +106,7 @@ distributed = ["icon4py-common[distributed]"]
 fortran = ["icon4py-tools>=0.0.6"]
 io = ["icon4py-common[io]"]
 profiling = ['viztracer>=1.1.0']
+rocm7_0 = ["icon4py-common[rocm7_0]"]
 testing = ["icon4py-testing"]
 
 [project.urls]
@@ -405,7 +406,13 @@ explicit = true
 name = 'gridtools'
 url = 'https://gridtools.github.io/pypi/'
 
+[[tool.uv.index]]
+explicit = true
+name = 'amd'
+url = 'https://pypi.amd.com/rocm-7.0.2/simple'
+
 [tool.uv.sources]
+amd-cupy = {index = "amd"}
 dace = {index = "gridtools"}
 ghex = {git = "https://github.com/msimberg/GHEX.git", branch = "async-mpi"}
 # gt4py = {git = "https://github.com/GridTools/gt4py", branch = "main"}

diff --git a/uv.lock b/uv.lock