C2SM · iomaganaris · Jan 29, 2026 · Jan 29, 2026 · Jan 29, 2026 · Jan 29, 2026
diff --git a/AMD_INTRODUCTION.md b/AMD_INTRODUCTION.md
diff --git a/amd_scripts/benchmark_dycore.sh b/amd_scripts/benchmark_dycore.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+#SBATCH --job-name=dycore_granule_profile
+#SBATCH --ntasks=1
+#SBATCH --time=08:00:00
+#SBATCH --gres=gpu:1
+#SBATCH --partition=mi300
+
+# Go to the root of the icon4py repository to run the script from there (abort if that is not possible)
+ICON4PY_GIT_ROOT=$(git rev-parse --show-toplevel) || exit 1
+cd "$ICON4PY_GIT_ROOT" || exit 1
+
+# Set necessary flags for compilation
+source amd_scripts/setup_env.sh || exit 1
+
+source .venv/bin/activate || exit 1
+
+export GT4PY_UNSTRUCTURED_HORIZONTAL_HAS_UNIT_STRIDE="1"
+export GT4PY_BUILD_CACHE_LIFETIME=persistent
+export GT4PY_BUILD_CACHE_DIR=amd_profiling_granule_regional
+export GT4PY_COLLECT_METRICS_LEVEL=10
+export GT4PY_ADD_GPU_TRACE_MARKERS="1"
+export HIPFLAGS="-std=c++17 -fPIC -O3 -march=native -Wno-unused-parameter -save-temps -Rpass-analysis=kernel-resource-usage"
+
+export ICON_GRID="icon_benchmark_regional" # TODO(CSCS): Fix `icon_benchmark_global` GPU memory issue: `Memory access fault by GPU node-4 (Agent handle: 0x5514890) on address 0x1463a8000000. Reason: Unknown. Failed to allocate file: Bad file descriptor`
+
+export DYCORE_GT4PY_PROGRAMS_TIMER_FILE="dycore_gt4py_program_metrics.json"
+# Remove metrics left over from a previous run (`-f`: a missing file is not an error)
+rm -f -- "${DYCORE_GT4PY_PROGRAMS_TIMER_FILE}"
+# The test id is quoted because its `[...]` suffix would otherwise be treated as a glob pattern
+pytest -sv \
+    -m continuous_benchmarking \
+    -p no:tach \
+    --benchmark-only \
+    --benchmark-warmup=on \
+    --benchmark-warmup-iterations=30 \
+    --backend=dace_gpu \
+    --grid="${ICON_GRID}" \
+    --benchmark-time-unit=ms \
+    --benchmark-min-rounds 100 \
+    "model/atmosphere/dycore/tests/dycore/integration_tests/test_benchmark_solve_nonhydro.py::test_benchmark_solve_nonhydro[False-False]"
+
+python amd_scripts/print_gt4py_timers.py "${DYCORE_GT4PY_PROGRAMS_TIMER_FILE}"
+
+# TODO(AMD): The trace generated by the following command doesn't include the GPU activity. The Perfetto UI reports warnings about import errors and data losses.
+# rocprofv3 --kernel-trace on --hip-trace on --marker-trace on --memory-copy-trace on --memory-allocation-trace on --output-format pftrace -o rocprofv3_${GT4PY_BUILD_CACHE_DIR} -- \
+#     $(which python3.12) -m pytest -sv \
+#     -m continuous_benchmarking \
+#     -p no:tach \
+#     --benchmark-only \
+#     --benchmark-warmup=on \
+#     --benchmark-warmup-iterations=30 \
+#     --backend=dace_gpu \
+#     --grid=${ICON_GRID} \
+#     --benchmark-time-unit=ms \
+#     --benchmark-min-rounds 10 \
+#     model/atmosphere/dycore/tests/dycore/integration_tests/test_benchmark_solve_nonhydro.py::test_benchmark_solve_nonhydro[False-False]
diff --git a/amd_scripts/benchmark_solver.sh b/amd_scripts/benchmark_solver.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+#SBATCH --job-name=solver_benchmark
+#SBATCH --ntasks=1
+#SBATCH --time=08:00:00
+#SBATCH --gres=gpu:1
+#SBATCH --partition=mi300
+
+# Go to the root of the icon4py repository to run the script from there (abort if that is not possible)
+ICON4PY_GIT_ROOT=$(git rev-parse --show-toplevel) || exit 1
+cd "$ICON4PY_GIT_ROOT" || exit 1
+
+# Set necessary flags for compilation
+source amd_scripts/setup_env.sh || exit 1
+
+source .venv/bin/activate || exit 1
+
+export GT4PY_UNSTRUCTURED_HORIZONTAL_HAS_UNIT_STRIDE="1"
+export GT4PY_BUILD_CACHE_LIFETIME=persistent
+export GT4PY_BUILD_CACHE_DIR=amd_profiling_solver_regional
+export GT4PY_COLLECT_METRICS_LEVEL=10
+export GT4PY_ADD_GPU_TRACE_MARKERS="1"
+export ICON4PY_STENCIL_TEST_WARMUP_ROUNDS=3
+export ICON4PY_STENCIL_TEST_ITERATIONS=10
+export ICON4PY_STENCIL_TEST_BENCHMARK_ROUNDS=100
+export HIPFLAGS="-std=c++17 -fPIC -O3 -march=native -Wno-unused-parameter -save-temps -Rpass-analysis=kernel-resource-usage"
+
+export ICON_GRID="icon_benchmark_regional" # TODO(CSCS): Check also `icon_benchmark_global` when the dycore GPU memory issue is fixed
+
+# Run the benchmark and collect the runtime of the whole GT4Py program (see `GT4Py Timer Report` in the output)
+# The compiled GT4Py programs will be cached in the directory specified by `GT4PY_BUILD_CACHE_DIR` to be reused for running the profilers afterwards
+pytest -sv \
+    -m continuous_benchmarking \
+    -p no:tach \
+    --backend=dace_gpu \
+    --grid="${ICON_GRID}" \
+    model/atmosphere/dycore/tests/dycore/stencil_tests/test_vertically_implicit_dycore_solver_at_predictor_step.py \
+    -k "test_TestVerticallyImplicitSolverAtPredictorStep[compile_time_domain-at_first_substep[False]__is_iau_active[False]__divdamp_type[32]]"
+
+# Run the benchmark and collect its trace
+# TODO(AMD): Generating `rocpd` output fails with segfaults
+export ICON4PY_STENCIL_TEST_WARMUP_ROUNDS=30
+export ICON4PY_STENCIL_TEST_ITERATIONS=10
+export ICON4PY_STENCIL_TEST_BENCHMARK_ROUNDS=100
+# Can also add `--att` for thread tracing
+rocprofv3 --kernel-trace on --hip-trace on --marker-trace on --memory-copy-trace on --memory-allocation-trace on --output-format pftrace -o "rocprofv3_${GT4PY_BUILD_CACHE_DIR}" -- \
+    "$(command -v python3.12)" -m pytest -sv \
+    -m continuous_benchmarking \
+    -p no:tach \
+    --backend=dace_gpu \
+    --grid="${ICON_GRID}" \
+    model/atmosphere/dycore/tests/dycore/stencil_tests/test_vertically_implicit_dycore_solver_at_predictor_step.py \
+    -k "test_TestVerticallyImplicitSolverAtPredictorStep[compile_time_domain-at_first_substep[False]__is_iau_active[False]__divdamp_type[32]]"
+# Alternatively, export the data to `csv` and print kernel runtimes with the following command
+# python amd_scripts/median_rocprof_csv.py rocprofv3_${GT4PY_BUILD_CACHE_DIR}_kernel_trace.csv
+
+# Get the kernel names of the most recently modified GT4Py cache directory so that we can filter them with rocprof-compute (abort if there is none)
+LAST_COMPILED_DIRECTORY=$(realpath "$(ls -td "${GT4PY_BUILD_CACHE_DIR}"/.gt4py_cache/*/ | head -1)") || exit 1
+echo "# Last compiled GT4Py directory: $LAST_COMPILED_DIRECTORY"
+LAST_COMPILED_KERNEL_NAMES=$(grep -r -o -e "__global__ void.*map.*(" "${LAST_COMPILED_DIRECTORY}/src/cuda" | sed 's/.*\s\([a-zA-Z_][a-zA-Z0-9_]*\)(.*/\1/')
+printf '# Last compiled GT4Py kernel names:\n%s\n' "$LAST_COMPILED_KERNEL_NAMES"
+[[ -n "$LAST_COMPILED_KERNEL_NAMES" ]] || { echo "# No GT4Py kernels found in $LAST_COMPILED_DIRECTORY" >&2; exit 1; }
+ROCPROF_COMPUTE_KERNEL_NAME_FILTER="-k $LAST_COMPILED_KERNEL_NAMES"
+
+# Run rocprof-compute filtering the kernels of interest (the filter is left unquoted on purpose so that it splits into `-k name1 name2 ...`)
+export ICON4PY_STENCIL_TEST_WARMUP_ROUNDS=0
+export ICON4PY_STENCIL_TEST_ITERATIONS=1
+export ICON4PY_STENCIL_TEST_BENCHMARK_ROUNDS=1
+rocprof-compute profile --name "rcu_${GT4PY_BUILD_CACHE_DIR}" ${ROCPROF_COMPUTE_KERNEL_NAME_FILTER} --format-rocprof-output rocpd --kernel-names -R FP64 -- \
+    "$(command -v python3.12)" -m pytest -sv \
+    -m continuous_benchmarking \
+    -p no:tach \
+    --backend=dace_gpu \
+    --grid="${ICON_GRID}" \
+    model/atmosphere/dycore/tests/dycore/stencil_tests/test_vertically_implicit_dycore_solver_at_predictor_step.py \
+    -k "test_TestVerticallyImplicitSolverAtPredictorStep[compile_time_domain-at_first_substep[False]__is_iau_active[False]__divdamp_type[32]]"
+
+# TODO(AMD): Roofline generation fails with
+#   File "/user-environment/linux-zen3/rocprofiler-compute-7.1.0-rjjjgkz67w66bp46jw7bvlfyduzr6vhv/libexec/rocprofiler-compute/roofline.py", line 998, in standalone_roofline
+#    self.empirical_roofline(ret_df=t_df)
+#  File "/user-environment/linux-zen3/rocprofiler-compute-7.1.0-rjjjgkz67w66bp46jw7bvlfyduzr6vhv/libexec/rocprofiler-compute/utils/logger.py", line 66, in wrap_function
+#    result = function(*args, **kwargs)
+#             ^^^^^^^^^^^^^^^^^^^^^^^^^
+#  File "/user-environment/linux-zen3/rocprofiler-compute-7.1.0-rjjjgkz67w66bp46jw7bvlfyduzr6vhv/libexec/rocprofiler-compute/roofline.py", line 463, in empirical_roofline
+#    flops_figure.write_image(
+#  File "/capstor/scratch/cscs/ioannmag/HPCAIAdvisory/icon4py/.venv/lib/python3.12/site-packages/plotly/basedatatypes.py", line 3895, in write_image
+#    return pio.write_image(self, *args, **kwargs)
+#           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#  File "/capstor/scratch/cscs/ioannmag/HPCAIAdvisory/icon4py/.venv/lib/python3.12/site-packages/plotly/io/_kaleido.py", line 555, in write_image
+#    path.write_bytes(img_data)
+#  File "/user-environment/linux-zen3/python-3.12.12-jpkfwhqo6njvbpw7gjcs22qkvxwexnv5/lib/python3.12/pathlib.py", line 1036, in write_bytes
+#    with self.open(mode='wb') as f:
+#         ^^^^^^^^^^^^^^^^^^^^
+# File "/user-environment/linux-zen3/python-3.12.12-jpkfwhqo6njvbpw7gjcs22qkvxwexnv5/lib/python3.12/pathlib.py", line 1013, in open
+#    return io.open(self, mode, buffering, encoding, errors, newline)
+#           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# OSError: [Errno 36] File name too long: '/capstor/scratch/cscs/ioannmag/HPCAIAdvisory/icon4py/workloads/rcu_amd_profiling_solver/MI300A_A1/empirRoof_gpu-0_FP64_map_0_fieldop_0_0_500_map_100_fieldop_0_0_0_514_map_100_fieldop_1_0_0_0_520_map_115_fieldop_0_0_0_516_map_115_fieldop_1_0_0_518_map_13_fieldop_0_0_498_map_31_fieldop_0_0_0_512_map_35_fieldop_0_0_503_map_60_fieldop_0_0_504_map_85_fieldop_0_0_506_map_90_fieldop_0_0_508_map_91_fieldop_0_0_510.pdf'
diff --git a/amd_scripts/install_icon4py_venv.sh b/amd_scripts/install_icon4py_venv.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -eo pipefail # `pipefail` makes a failed download in `curl | sh` abort the installation
+
+date
+
+# Go to the root of the icon4py repository to run the installation from there
+ICON4PY_GIT_ROOT=$(git rev-parse --show-toplevel)
+cd "$ICON4PY_GIT_ROOT"
+
+# Set necessary flags for compilation
+source "$ICON4PY_GIT_ROOT/amd_scripts/setup_env.sh"
+
+# Install uv locally
+export PATH="$PWD/bin:$PATH"
+if [[ ! -x "$PWD/bin/uv" ]]; then
+    curl -LsSf https://astral.sh/uv/install.sh | UV_UNMANAGED_INSTALL="$PWD/bin" sh
+else
+    echo "# uv already installed at $PWD/bin/uv"
+fi
+
+# Install icon4py, gt4py, DaCe and other basic dependencies using uv
+uv sync --extra rocm7_0 --python "$(command -v python3.12)"
+
+# Activate virtual environment
+source .venv/bin/activate
+
+# Install the requirements for rocprofiler-compute so we can run the profiler from the same environment
+uv pip install -r /user-environment/linux-zen3/rocprofiler-compute-7.1.0-rjjjgkz67w66bp46jw7bvlfyduzr6vhv/libexec/rocprofiler-compute/requirements.txt
+
+echo "# install done"
+date
diff --git a/amd_scripts/median_rocprof_csv.py b/amd_scripts/median_rocprof_csv.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+# Report the median and standard deviation of the duration of each `map*` kernel in a kernel-trace CSV.
+import csv
+import statistics
+import sys
+
+if len(sys.argv) < 2:
+    print("Usage: python script.py <csv_file>", file=sys.stderr)
+    sys.exit(1)
+
+# Kernel name (text before the first "(") -> list of `End_Timestamp - Start_Timestamp` differences
+durations_by_kernel = {}
+
+with open(sys.argv[1], newline="") as trace_file:
+    for record in csv.DictReader(trace_file):
+        full_name = record["Kernel_Name"]
+        if not full_name.startswith("map"):
+            continue
+        elapsed = int(record["End_Timestamp"]) - int(record["Start_Timestamp"])
+        durations_by_kernel.setdefault(full_name.split("(")[0], []).append(elapsed)
+
+if durations_by_kernel:
+    # One `name,median,stdev` line per kernel, sorted by name (stdev is 0 for a single sample)
+    for kernel_name in sorted(durations_by_kernel):
+        samples = durations_by_kernel[kernel_name]
+        median = statistics.median(samples)
+        stdev = statistics.stdev(samples) if len(samples) > 1 else 0
+        print(f"{kernel_name},{median:.0f},{stdev:.0f}")
+else:
+    print("No kernels starting with 'map' found", file=sys.stderr)
diff --git a/amd_scripts/print_gt4py_timers.py b/amd_scripts/print_gt4py_timers.py
@@ -0,0 +1,31 @@
+# Summarise a GT4Py metrics JSON file: median and std of each entry's `compute` samples (the first sample is dropped).
+import csv
+import json
+import sys
+
+import numpy
+
+if len(sys.argv) < 2:
+    print("Usage: python print_gt4py_timers.py <input_file> [--csv]")
+    sys.exit(1)
+
+with open(sys.argv[1]) as metrics_file:
+    data = json.load(metrics_file)
+
+# Build the (name, median, std) rows once so that both output modes apply the same filtering
+rows = []
+for k, v in data.items():
+    # A missing or empty `metrics`/`compute` entry becomes an empty array and is skipped
+    arr = numpy.array(((v.get('metrics') or {}).get('compute') or [])[1:])
+    median = numpy.median(arr) if len(arr) > 0 else numpy.nan
+    if not numpy.isnan(median):
+        rows.append((k.split('<')[0], median, arr.std()))
+
+if len(sys.argv) > 2 and sys.argv[2] == '--csv':
+    with open('output.csv', 'w', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow(['Function', 'Median', 'Std'])
+        writer.writerows(rows)
+else:
+    for name, median, std in rows:
+        print(f"{name}: Median = {median}, Std = {std}")
diff --git a/amd_scripts/setup_env.sh b/amd_scripts/setup_env.sh
@@ -0,0 +1,14 @@
+export CC="$(which gcc)"
+export MPICH_CC="$(which gcc)"
+export CXX="$(which g++)"
+export MPICH_CXX="$(which g++)"
+export HUGETLB_ELFMAP="no"
+export HUGETLB_MORECORE="no"
+export PYTHONOPTIMIZE="2"
+export HCC_AMDGPU_TARGET="gfx942"
+export ROCM_HOME="/user-environment/env/default"
+export HIPCC="$(which hipcc)"
+export ROCM_VERSION="7.1.0"
+export LD_LIBRARY_PATH="/user-environment/linux-zen3/rocprofiler-dev-7.1.0-i7wbbbgrx7jjp4o2xroyj5j263dkzplv/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" # no trailing ':' when unset: an empty entry would add the current directory to the search path
+# export LD_LIBRARY_PATH=rocprof-trace-decoder-manylinux-2.28-0.1.6-Linux:$LD_LIBRARY_PATH # TODO(iomaganaris): Add package to uenv properly
+export LD_PRELOAD="/user-environment/env/default/lib/libomp.so${LD_PRELOAD:+:${LD_PRELOAD}}" # append the previous value only when there is one
diff --git a/model/atmosphere/dycore/tests/dycore/integration_tests/test_benchmark_solve_nonhydro.py b/model/atmosphere/dycore/tests/dycore/integration_tests/test_benchmark_solve_nonhydro.py
@@ -9,9 +9,11 @@
 from __future__ import annotations
 
 import functools
+import os
 from typing import TYPE_CHECKING, Any
 
 import gt4py.next as gtx
+from gt4py.next.instrumentation import metrics as gtx_metrics
 import pytest
 
 
@@ -346,3 +348,6 @@ def test_benchmark_solve_nonhydro(
         at_first_substep=at_first_substep,
         at_last_substep=at_last_substep,
     )
+
+    if gtx_metrics.sources:
+        gtx_metrics.dump_json("dycore_gt4py_program_metrics.json")
diff --git a/model/common/pyproject.toml b/model/common/pyproject.toml
@@ -42,6 +42,7 @@ version = "0.0.6"
 all = ["icon4py-common[distributed,io]"]
 cuda11 = ['cupy-cuda11x>=13.0', 'gt4py[cuda11]']
 cuda12 = ['cupy-cuda12x>=13.0', 'gt4py[cuda12]']
+rocm7_0 = ['amd-cupy>=13.0'] # TODO(havogt): add gt4py[rocm7_0] once available
 distributed = ["ghex>=0.5.0", "mpi4py>=3.1.5"]
 io = [
   # external dependencies

diff --git a/model/common/src/icon4py/model/common/model_options.py b/model/common/src/icon4py/model/common/model_options.py
@@ -52,12 +52,15 @@ def get_dace_options(
     # due to it falling into a less optimized code generation (on santis).
     if program_name == "compute_rho_theta_pgrad_and_update_vn":
         backend_descriptor["use_zero_origin"] = True
+    # TODO(AMD): For now disable problematic `hipMallocAsync` calls on each GT4Py Program call that have high runtime variability.
+    #            Needs to be fixed for realistic simulations due to increased memory footprint of persistent memory.
+    if backend_descriptor["device"] == model_backends.DeviceType.ROCM:
+        optimization_args["gpu_memory_pool"] = False
+        optimization_args["make_persistent"] = True
     if program_name == "graupel_run":
         backend_descriptor["use_zero_origin"] = True
         optimization_args["fuse_tasklets"] = True
         optimization_args["gpu_maxnreg"] = 128
-        optimization_args["gpu_memory_pool"] = False
-        optimization_args["make_persistent"] = True
     if optimization_hooks:
         optimization_args["optimization_hooks"] = optimization_hooks
     if optimization_args:

diff --git a/model/testing/src/icon4py/model/testing/stencil_tests.py b/model/testing/src/icon4py/model/testing/stencil_tests.py
@@ -133,9 +133,8 @@ def test_and_benchmark(
             # Get the pool key necessary to find the right metrics key. There should be only one compiled program in _configured_program
             pool_key = next(iter(compiled_programs.keys()))
             # Get the metrics key from the pool key to read the corresponding metrics
-            metrics_key = _configured_program._compiled_programs._metrics_key_from_pool_key(
-                pool_key
-            )
+            compiled_programs_root = _configured_program._compiled_programs.root
+            metrics_key = f"{compiled_programs_root[0]}<{compiled_programs_root[1]}>[{hash(pool_key)}]"
             metrics_data = gtx_metrics.sources
             compute_samples = metrics_data[metrics_key].metrics["compute"].samples
             # exclude warmup iterations, one extra iteration for calibrating pytest-benchmark and one for validation (if executed)

diff --git a/pyproject.toml b/pyproject.toml
@@ -97,6 +97,7 @@ version = "0.0.6"
 all = ["icon4py[distributed,fortran,io,testing,profiling]"]
 cuda11 = ["icon4py-common[cuda11]"]
 cuda12 = ["icon4py-common[cuda12]"]
+rocm7_0 = ["icon4py-common[rocm7_0]"]
 distributed = ["icon4py-common[distributed]"]
 fortran = ["icon4py-tools>=0.0.6"]
 io = ["icon4py-common[io]"]
@@ -359,11 +360,17 @@ explicit = true
 name = 'gridtools'
 url = 'https://gridtools.github.io/pypi/'
 
+[[tool.uv.index]]
+explicit = true
+name = 'amd'
+url = 'https://pypi.amd.com/rocm-7.0.2/simple'
+
 [tool.uv.sources]
 dace = {index = "gridtools"}
 ghex = {git = "https://github.com/msimberg/GHEX.git", branch = "async-mpi"}
-# gt4py = {git = "https://github.com/GridTools/gt4py", branch = "main"}
+gt4py = {git = "https://github.com/GridTools/gt4py", branch = "amd_profiling_staging"}
 # gt4py = {index = "test.pypi"}
+amd-cupy = {index = "amd" }
 icon4py-atmosphere-advection = {workspace = true}
 icon4py-atmosphere-diffusion = {workspace = true}
 icon4py-atmosphere-dycore = {workspace = true}