diff --git a/AMD_INTRODUCTION.md b/AMD_INTRODUCTION.md new file mode 100644 index 0000000000..b9f08f476a --- /dev/null +++ b/AMD_INTRODUCTION.md @@ -0,0 +1,44 @@ +# Icon4py performance on MI300 + +## Quickstart + +``` +# Connect to Beverin (CSCS system with MI300A) +ssh beverin.cscs.ch +``` + +In Beverin: + +``` +# Enter scratch directory +cd $SCRATCH + +# Clone icon4py and check out the correct branch +git clone git@github.com:C2SM/icon4py.git +cd icon4py +git checkout amd_profiling + +# Pull the correct `uenv` image. *!* NECESSARY ONLY ONCE *!* +uenv image pull build::prgenv-gnu/25.12:2333839235 + +# Start the uenv and mount the ROCm 7.1.0 environment. *!* This needs to be executed before running anything every time *!* +uenv start --view default prgenv-gnu/25.12:2333839235 + +# Install the necessary venv +bash amd_scripts/install_icon4py_venv.sh + +# Source venv +source .venv/bin/activate + +# Source other necessary environment variables +source amd_scripts/setup_env.sh + +# Set GT4Py related environment variables +export GT4PY_UNSTRUCTURED_HORIZONTAL_HAS_UNIT_STRIDE="1" +export GT4PY_BUILD_CACHE_LIFETIME=persistent +export GT4PY_BUILD_CACHE_DIR=amd_profiling_granule +export GT4PY_COLLECT_METRICS_LEVEL=10 +export GT4PY_DYCORE_ENABLE_METRICS="1" +export GT4PY_ADD_GPU_TRACE_MARKERS="1" +export HIPFLAGS="-std=c++17 -fPIC -O3 -march=native -Wno-unused-parameter -save-temps -Rpass-analysis=kernel-resource-usage" +``` diff --git a/amd_scripts/install_icon4py_venv.sh b/amd_scripts/install_icon4py_venv.sh new file mode 100755 index 0000000000..4c86fc481d --- /dev/null +++ b/amd_scripts/install_icon4py_venv.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e + +date + +# Go to the root of the icon4py repository to run the installation from there +ICON4PY_GIT_ROOT=$(git rev-parse --show-toplevel) +cd $ICON4PY_GIT_ROOT + +# Set necessary flags for compilation +source $ICON4PY_GIT_ROOT/amd_scripts/setup_env.sh + +# Install uv locally +export PATH="$PWD/bin:$PATH" +if [ ! 
-x "$PWD/bin/uv" ]; then + curl -LsSf https://astral.sh/uv/install.sh | UV_UNMANAGED_INSTALL="$PWD/bin" sh +else + echo "# uv already installed at $PWD/bin/uv" +fi + +# Install icon4py, gt4py, DaCe and other basic dependencies using uv +uv sync --extra rocm7_0 --python $(which python3.12) + +# Activate virtual environment +source .venv/bin/activate + +# Install the requirements for rocprofiler-compute so we can run the profiler from the same environment +uv pip install -r /user-environment/linux-zen3/rocprofiler-compute-7.1.0-rjjjgkz67w66bp46jw7bvlfyduzr6vhv/libexec/rocprofiler-compute/requirements.txt + +echo "# install done" +date diff --git a/amd_scripts/setup_env.sh b/amd_scripts/setup_env.sh new file mode 100644 index 0000000000..d8eeb99692 --- /dev/null +++ b/amd_scripts/setup_env.sh @@ -0,0 +1,13 @@ +export CC="$(which gcc)" +export MPICH_CC="$(which gcc)" +export CXX="$(which g++)" +export MPICH_CXX="$(which g++)" +export HUGETLB_ELFMAP="no" +export HUGETLB_MORECORE="no" +export PYTHONOPTIMIZE="2" +export HCC_AMDGPU_TARGET="gfx942" +export ROCM_HOME="/user-environment/env/default" +export HIPCC=$(which hipcc) +export ROCM_VERSION="7.1.0" +export LD_LIBRARY_PATH=/user-environment/linux-zen3/rocprofiler-dev-7.1.0-i7wbbbgrx7jjp4o2xroyj5j263dkzplv/lib:$LD_LIBRARY_PATH +export LD_PRELOAD=/user-environment/env/default/lib/libomp.so:$LD_PRELOAD diff --git a/model/common/pyproject.toml b/model/common/pyproject.toml index 65c55e9663..8e3a0635b1 100644 --- a/model/common/pyproject.toml +++ b/model/common/pyproject.toml @@ -60,6 +60,7 @@ io = [ "uxarray==2024.3.0", "xarray[complete]>=2024.3.0" ] +rocm7_0 = ['amd-cupy>=13.0'] # TODO(havogt): add gt4py[rocm7_0] once available [project.urls] repository = "https://github.com/C2SM/icon4py" diff --git a/model/common/src/icon4py/model/common/model_options.py b/model/common/src/icon4py/model/common/model_options.py index f08a154667..e9d9ea92a6 100644 --- a/model/common/src/icon4py/model/common/model_options.py +++ 
b/model/common/src/icon4py/model/common/model_options.py @@ -28,6 +28,7 @@ def dict_values_to_list(d: dict[str, Any]) -> dict[str, list]: def get_dace_options( program_name: str, **backend_descriptor: Any ) -> model_backends.BackendDescriptor: + is_rocm_backend = backend_descriptor.get("device") == model_backends.DeviceType.ROCM optimization_args = backend_descriptor.get("optimization_args", {}) optimization_hooks = optimization_args.get("optimization_hooks", {}) if program_name in [ @@ -54,8 +55,9 @@ def get_dace_options( backend_descriptor["use_zero_origin"] = True if program_name == "graupel_run": optimization_args["fuse_tasklets"] = True - optimization_args["gpu_maxnreg"] = 80 - optimization_args["gpu_block_size_2d"] = (64, 6) + if not is_rocm_backend: + optimization_args["gpu_maxnreg"] = 80 + optimization_args["gpu_block_size_2d"] = (64, 6) optimization_args["gpu_memory_pool"] = False optimization_args["make_persistent"] = True if optimization_hooks: diff --git a/pyproject.toml b/pyproject.toml index 73dc550447..4b7829aad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,6 +106,7 @@ distributed = ["icon4py-common[distributed]"] fortran = ["icon4py-tools>=0.0.6"] io = ["icon4py-common[io]"] profiling = ['viztracer>=1.1.0'] +rocm7_0 = ["icon4py-common[rocm7_0]"] testing = ["icon4py-testing"] [project.urls] @@ -405,7 +406,13 @@ explicit = true name = 'gridtools' url = 'https://gridtools.github.io/pypi/' +[[tool.uv.index]] +explicit = true +name = 'amd' +url = 'https://pypi.amd.com/rocm-7.0.2/simple' + [tool.uv.sources] +amd-cupy = {index = "amd"} dace = {index = "gridtools"} ghex = {git = "https://github.com/msimberg/GHEX.git", branch = "async-mpi"} # gt4py = {git = "https://github.com/GridTools/gt4py", branch = "main"} diff --git a/uv.lock b/uv.lock index 3b645f86e4..590b69fb6e 100644 --- a/uv.lock +++ b/uv.lock @@ -36,6 +36,20 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/32/34/d4e1c02d3bee589efb5dfa17f88ea08bdb3e3eac12bc475462aec52ed223/alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92", size = 13511, upload-time = "2024-01-10T00:56:08.388Z" }, ] +[[package]] +name = "amd-cupy" +version = "13.5.1" +source = { registry = "https://pypi.amd.com/rocm-7.0.2/simple" } +dependencies = [ + { name = "fastrlock" }, + { name = "numpy" }, +] +wheels = [ + { url = "https://pypi.amd.com/rocm-7.0.2/packages/amd-cupy/amd_cupy-13.5.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:eca984c7b8176eecaff0dd84504b322828bedd40c177d736753295e8a4b672de" }, + { url = "https://pypi.amd.com/rocm-7.0.2/packages/amd-cupy/amd_cupy-13.5.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:468ca95416f57d5bbf6663ad8ca69a6ac46b4a34166833f01e5535068fa1b4e8" }, + { url = "https://pypi.amd.com/rocm-7.0.2/packages/amd-cupy/amd_cupy-13.5.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:de3138281e2711e06efaf49a31310d0d4824998e18d43e13e288a0e52ca75ec0" }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -1588,6 +1602,9 @@ io = [ profiling = [ { name = "viztracer" }, ] +rocm7-0 = [ + { name = "icon4py-common", extra = ["rocm7-0"] }, +] testing = [ { name = "icon4py-testing" }, ] @@ -1689,13 +1706,14 @@ requires-dist = [ { name = "icon4py-common", extras = ["cuda12"], marker = "extra == 'cuda12'", editable = "model/common" }, { name = "icon4py-common", extras = ["distributed"], marker = "extra == 'distributed'", editable = "model/common" }, { name = "icon4py-common", extras = ["io"], marker = "extra == 'io'", editable = "model/common" }, + { name = "icon4py-common", extras = ["rocm7-0"], marker = "extra == 'rocm7-0'", editable = "model/common" }, { name = "icon4py-driver", editable = "model/driver" }, { name = "icon4py-standalone-driver", editable = "model/standalone_driver" }, { name = "icon4py-testing", marker = "extra == 'testing'", 
editable = "model/testing" }, { name = "icon4py-tools", marker = "extra == 'fortran'", editable = "tools" }, { name = "viztracer", marker = "extra == 'profiling'", specifier = ">=1.1.0" }, ] -provides-extras = ["all", "cuda11", "cuda12", "distributed", "fortran", "io", "profiling", "testing"] +provides-extras = ["all", "cuda11", "cuda12", "distributed", "fortran", "io", "profiling", "rocm7-0", "testing"] [package.metadata.requires-dev] build = [ @@ -1921,9 +1939,13 @@ io = [ { name = "uxarray" }, { name = "xarray", extra = ["complete"] }, ] +rocm7-0 = [ + { name = "amd-cupy" }, +] [package.metadata] requires-dist = [ + { name = "amd-cupy", marker = "extra == 'rocm7-0'", specifier = ">=13.0", index = "https://pypi.amd.com/rocm-7.0.2/simple" }, { name = "array-api-compat", specifier = ">=1.13.0" }, { name = "cartopy", marker = "extra == 'io'", specifier = ">=0.22.0" }, { name = "cftime", marker = "extra == 'io'", specifier = ">=1.6.3" }, @@ -1949,7 +1971,7 @@ requires-dist = [ { name = "uxarray", marker = "extra == 'io'", specifier = "==2024.3.0" }, { name = "xarray", extras = ["complete"], marker = "extra == 'io'", specifier = ">=2024.3.0" }, ] -provides-extras = ["all", "cuda11", "cuda12", "distributed", "io"] +provides-extras = ["all", "cuda11", "cuda12", "distributed", "io", "rocm7-0"] [[package]] name = "icon4py-driver"