C2SM · msimberg · Jan 28, 2026 · Jan 28, 2026 · Jan 28, 2026 · Jan 29, 2026
diff --git a/ci/distributed.yml b/ci/distributed.yml
@@ -38,34 +38,41 @@ build_distributed_baseimage_aarch64:
     DOCKERFILE: ci/docker/checkout_mpi.Dockerfile
     DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]'
     PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi
-    USE_MPI: NO
-    SLURM_MPI_TYPE: pmix
-    PMIX_MCA_psec: native
-    PMIX_MCA_gds: "^shmem2"
 
-.build_distributed_cpu:
+.build_distributed:
   extends: [.build_distributed_template]
   variables:
     UV_PROJECT_ENVIRONMENT: venv_dist
 
-build_distributed_cpu:
+build_distributed:
   stage: image
-  extends: [.container-builder-cscs-gh200, .build_distributed_cpu]
+  extends: [.container-builder-cscs-gh200, .build_distributed]
   needs: [build_distributed_baseimage_aarch64]
 
 .test_template_distributed:
   timeout: 8h
   image: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi
-  extends: [.container-runner-santis-gh200, .build_distributed_cpu]
-  needs: [build_distributed_cpu]
+  extends: [.container-runner-santis-gh200, .build_distributed]
+  needs: [build_distributed]
   variables:
     SLURM_JOB_NUM_NODES: 1
     SLURM_CPU_BIND: 'verbose'
     SLURM_NTASKS: 4
+    SLURM_GPUS_PER_TASK: 1
     TEST_DATA_PATH: "/icon4py/testdata"
     ICON4PY_ENABLE_GRID_DOWNLOAD: false
     ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false
     CSCS_ADDITIONAL_MOUNTS: '["/capstor/store/cscs/userlab/d126/icon4py/ci/testdata_003:$TEST_DATA_PATH"]'
+    # Do not use libfabric from the host system. Libfabric with Slingshot
+    # support is built into the container image.
+    USE_MPI: 'NO'  # quoted: a bare NO is a boolean (false) to YAML 1.1 loaders
+    # Use the libfabric Slingshot (cxi) provider and the settings recommended
+    # in https://docs.cscs.ch/software/communication/openmpi.
+    SLURM_MPI_TYPE: pmix
+    PMIX_MCA_psec: native
+    FI_PROVIDER: cxi
+    OMPI_MCA_pml: cm
+    OMPI_MCA_mtl: ofi
 
 .test_distributed_aarch64:
   stage: test
@@ -80,14 +87,17 @@ build_distributed_cpu:
   parallel:
     matrix:
       - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
-        BACKEND: [embedded, gtfn_cpu, dace_cpu]
+        BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu, gtfn_gpu]
   rules:
     - if: $COMPONENT == 'atmosphere/diffusion'
       variables:
         SLURM_TIMELIMIT: '00:05:00'
-    - if: $COMPONENT == 'atmosphere/dycore' && $BACKEND == 'dace_cpu'
+    - if: $COMPONENT == 'atmosphere/dycore' && ($BACKEND == 'dace_cpu' || $BACKEND == 'dace_gpu')
+      variables:
+        SLURM_TIMELIMIT: '00:30:00'
+    - if: $COMPONENT == 'common' && $BACKEND == 'dace_gpu'
       variables:
-        SLURM_TIMELIMIT: '00:20:00'
+        SLURM_TIMELIMIT: '00:45:00'
     - if: $COMPONENT == 'atmosphere/dycore'
       variables:
         SLURM_TIMELIMIT: '00:15:00'

diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
@@ -1,27 +1,124 @@
-FROM ubuntu:25.04
+FROM ubuntu:25.10
 
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8
 
 ARG DEBIAN_FRONTEND=noninteractive
-RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
-    strace \
-    build-essential \
-    tar \
-    wget \
-    curl \
-    libboost-dev \
-    libnuma-dev \
-    libopenmpi-dev \
-    ca-certificates \
-    libssl-dev \
-    autoconf \
-    automake \
-    libtool \
-    pkg-config \
-    libreadline-dev \
-    git && \
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        autoconf \
+        automake \
+        build-essential \
+        ca-certificates \
+        curl \
+        git \
+        libboost-dev \
+        libconfig-dev \
+        libcurl4-openssl-dev \
+        libfuse-dev \
+        libjson-c-dev \
+        libnl-3-dev \
+        libnuma-dev \
+        libreadline-dev \
+        libsensors-dev \
+        libssl-dev \
+        libtool \
+        libuv1-dev \
+        libyaml-dev \
+        nvidia-cuda-dev \
+        nvidia-cuda-toolkit \
+        nvidia-cuda-toolkit-gcc \
+        pkg-config \
+        python3 \
+        strace \
+        tar \
+        wget && \
     rm -rf /var/lib/apt/lists/*
 
+ENV CC=/usr/bin/cuda-gcc
+ENV CXX=/usr/bin/cuda-g++
+ENV CUDAHOSTCXX=/usr/bin/cuda-g++
+
+# Install OpenMPI configured with libfabric, libcxi, and gdrcopy support for use
+# on Alps. This is based on examples in
+# https://github.com/eth-cscs/cray-network-stack.
+ARG gdrcopy_version=2.5.1
+RUN set -eux; \
+    git clone --depth 1 --branch "v${gdrcopy_version}" https://github.com/NVIDIA/gdrcopy.git; \
+    cd gdrcopy; \
+    make lib -j"$(nproc)" lib_install; \
+    cd /; \
+    rm -rf /gdrcopy; \
+    ldconfig
+
+ARG cassini_headers_version=release/shs-13.0.0
+RUN set -eux; \
+    git clone --depth 1 --branch "${cassini_headers_version}" https://github.com/HewlettPackard/shs-cassini-headers.git; \
+    cd shs-cassini-headers; \
+    cp -r include/* /usr/include/; \
+    cp -r share/* /usr/share/; \
+    rm -rf /shs-cassini-headers
+
+ARG cxi_driver_version=release/shs-13.0.0
+RUN set -eux; \
+    git clone --depth 1 --branch "${cxi_driver_version}" https://github.com/HewlettPackard/shs-cxi-driver.git; \
+    cd shs-cxi-driver; \
+    cp -r include/* /usr/include/; \
+    rm -rf /shs-cxi-driver
+
+ARG libcxi_version=release/shs-13.0.0
+RUN set -eux; \
+    git clone --depth 1 --branch "${libcxi_version}" https://github.com/HewlettPackard/shs-libcxi.git; \
+    cd shs-libcxi; \
+    ./autogen.sh; \
+    ./configure \
+      --with-cuda; \
+    make -j"$(nproc)" install; \
+    cd /; \
+    rm -rf /shs-libcxi; \
+    ldconfig
+
+ARG xpmem_version=0d0bad4e1d07b38d53ecc8f20786bb1328c446da
+RUN set -eux; \
+    git clone https://github.com/hpc/xpmem.git; \
+    cd xpmem; \
+    git checkout "${xpmem_version}"; \
+    ./autogen.sh; \
+    ./configure --disable-kernel-module; \
+    make -j"$(nproc)" install; \
+    cd /; \
+    rm -rf /xpmem; \
+    ldconfig
+
+# NOTE: xpmem is not found correctly without setting the prefix explicitly in
+# --enable-xpmem
+ARG libfabric_version=v2.4.0
+RUN set -eux; \
+    git clone --depth 1 --branch "${libfabric_version}" https://github.com/ofiwg/libfabric.git; \
+    cd libfabric; \
+    ./autogen.sh; \
+    ./configure \
+      --with-cuda \
+      --enable-xpmem=/usr \
+      --enable-tcp \
+      --enable-cxi; \
+    make -j"$(nproc)" install; \
+    cd /; \
+    rm -rf /libfabric; \
+    ldconfig
+
+ARG openmpi_version=5.0.9
+RUN set -eux; \
+    curl -fsSL "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${openmpi_version}.tar.gz" -o /tmp/ompi.tar.gz; \
+    tar -C /tmp -xzf /tmp/ompi.tar.gz; \
+    cd "/tmp/openmpi-${openmpi_version}"; \
+    ./configure \
+      --with-ofi \
+      --with-cuda=/usr; \
+    make -j"$(nproc)" install; \
+    cd /; \
+    rm -rf "/tmp/openmpi-${openmpi_version}" /tmp/ompi.tar.gz; \
+    ldconfig
+
 # Install uv: https://docs.astral.sh/uv/guides/integration/docker
 COPY --from=ghcr.io/astral-sh/uv:0.9.24@sha256:816fdce3387ed2142e37d2e56e1b1b97ccc1ea87731ba199dc8a25c04e4997c5 /uv /uvx /bin/
diff --git a/ci/docker/checkout_mpi.Dockerfile b/ci/docker/checkout_mpi.Dockerfile
@@ -7,5 +7,9 @@ WORKDIR /icon4py
 ARG PYVERSION
 ARG VENV
 ENV UV_PROJECT_ENVIRONMENT=$VENV
-ENV MPI4PY_BUILD_BACKEND="scikit-build-core"
-RUN uv sync --extra distributed --python=$PYVERSION
+ENV MPI4PY_BUILD_BACKEND=scikit-build-core
+ENV GHEX_USE_GPU=ON
+ENV GHEX_GPU_TYPE=NVIDIA
+ENV GHEX_GPU_ARCH=90
+ENV GHEX_TRANSPORT_BACKEND=MPI
+RUN uv sync --extra all --extra cuda12 --python=$PYVERSION
diff --git a/model/common/src/icon4py/model/common/grid/utils.py b/model/common/src/icon4py/model/common/grid/utils.py
@@ -5,21 +5,20 @@
 #
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
-from types import ModuleType
 
 import numpy as np
 
 from icon4py.model.common.grid import gridfile
 
 
-def revert_repeated_index_to_invalid(offset: np.ndarray, array_ns: ModuleType):
+def revert_repeated_index_to_invalid(offset: np.ndarray):
     num_elements = offset.shape[0]
     for i in range(num_elements):
         # convert repeated indices back into -1
-        for val in array_ns.flip(offset[i, :]):
-            if array_ns.count_nonzero(val == offset[i, :]) > 1:
-                unique_values, counts = array_ns.unique(offset[i, :], return_counts=True)
+        for val in np.flip(offset[i, :]):
+            if np.count_nonzero(val == offset[i, :]) > 1:
+                unique_values, counts = np.unique(offset[i, :], return_counts=True)
                 rep_values = unique_values[counts > 1]
-                rep_indices = array_ns.where(array_ns.isin(offset[i, :], rep_values))[0]
+                rep_indices = np.where(np.isin(offset[i, :], rep_values))[0]
                 offset[i, rep_indices[1:]] = gridfile.GridFile.INVALID_INDEX
     return offset
diff --git a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
@@ -280,6 +280,7 @@ def test_exchange_on_dummy_data(
 
 @pytest.mark.mpi
 @pytest.mark.datatest
+@pytest.mark.embedded_only
 @pytest.mark.parametrize("processor_props", [False], indirect=True)
 def test_halo_exchange_for_sparse_field(
     interpolation_savepoint: serialbox.InterpolationSavepoint,

diff --git a/model/testing/src/icon4py/model/testing/serialbox.py b/model/testing/src/icon4py/model/testing/serialbox.py
@@ -72,7 +72,7 @@ def wrapper(self, *args, **kwargs):
                         # as a workaround for the lack of support for optional fields in gt4py.
                         shp = (1,) * len(dims)
                         return gtx.as_field(
-                            dims, np.zeros(shp, dtype=dtype), allocator=self.backend
+                            dims, self.xp.zeros(shp, dtype=dtype), allocator=self.backend
                         )
                     else:
                         return None
@@ -503,9 +503,8 @@ def construct_icon_grid(
             def potentially_revert_icon_index_transformation(ar):
                 return ar
         else:
-            potentially_revert_icon_index_transformation = functools.partial(
-                grid_utils.revert_repeated_index_to_invalid,
-                array_ns=data_alloc.import_array_ns(backend),
+            potentially_revert_icon_index_transformation = (
+                grid_utils.revert_repeated_index_to_invalid
             )
 
         c2e2c = self.c2e2c()

diff --git a/pyproject.toml b/pyproject.toml
@@ -361,7 +361,7 @@ url = 'https://gridtools.github.io/pypi/'
 
 [tool.uv.sources]
 dace = {index = "gridtools"}
-ghex = {git = "https://github.com/msimberg/GHEX.git", branch = "async-mpi"}
+ghex = {git = "https://github.com/philip-paul-mueller/GHEX.git", branch = "phimuell__async-mpi-2"}
 # gt4py = {git = "https://github.com/GridTools/gt4py", branch = "main"}
 # gt4py = {index = "test.pypi"}
 icon4py-atmosphere-advection = {workspace = true}

diff --git a/uv.lock b/uv.lock