GridTools · egparedes · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,6 +18,7 @@ dev = [
   {include-group = 'docs'},
   {include-group = 'frameworks'},
   {include-group = 'lint'},
+  {include-group = 'profiling'},
   {include-group = 'scripts'},
   {include-group = 'test'},
   {include-group = 'typing'}
@@ -42,6 +43,10 @@ lint = [
   'tach>=0.23.0',
   'validate-pyproject-schema-store[all]>=2025.06.13'
 ]
+profiling = [
+  'nvtx>=0.2.14',
+  'viztracer>=1.1.1'
+]
 scripts = ["pyyaml>=6.0.1", "typer>=0.12.3", "packaging"]
 test = [
   'coverage[toml]>=7.6.1',

diff --git a/src/gt4py/next/instrumentation/gpu_profiler.py b/src/gt4py/next/instrumentation/gpu_profiler.py
@@ -0,0 +1,104 @@
+# GT4Py - GridTools Framework
+#
+# Copyright (c) 2014-2024, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+
+
+from __future__ import annotations
+
+import contextlib
+import warnings
+from collections.abc import Callable
+from typing import Any, ClassVar
+
+from gt4py._core import definitions as core_definitions, types as core_types
+from gt4py.next import common, typing as gtx_typing
+from gt4py.next.instrumentation import hooks
+
+
+if core_definitions.CUPY_DEVICE_TYPE is not None:
+    import cupyx.profiler as cupy_profiler
+
+    # A CuPy device type is configured: re-export CuPy's range marker unchanged.
+    time_range = cupy_profiler.time_range
+
+else:
+
+    class time_range(contextlib.AbstractContextManager):
+        """No-op stand-in for ``cupyx.profiler.time_range``.
+
+        Used when no CuPy device type is configured. All arguments are accepted
+        and ignored; constructing an instance only emits a ``UserWarning``.
+        """
+
+        def __init__(
+            self,
+            message: str | None = None,
+            color_id: int | None = None,
+            argb_color: core_types.int32 | None = None,
+            sync: bool = False,
+        ) -> None:
+            warnings.warn(
+                "GT4Py profiling is only supported when using a GPU.",
+                UserWarning,
+                stacklevel=2,
+            )
+
+        def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+            # 'AbstractContextManager.__exit__' is abstract: without this override the
+            # class cannot be instantiated at all. Returning None never suppresses
+            # an exception raised inside the 'with' body.
+            return None
+
+
+@contextlib.contextmanager
+def profile_calls():
+    """Enable call profiling for the duration of the ``with`` block.
+
+    Calls ``start_profiling_calls()`` on entry and ``stop_profiling_calls()`` on
+    exit. The profiler hooks are removed even if the body raises.
+    """
+    start_profiling_calls()
+    try:
+        yield
+    finally:
+        # Always undo the registration, otherwise a failing body would leave the
+        # profilers installed for all subsequent calls.
+        stop_profiling_calls()
+
+
+def start_profiling_calls() -> None:
+    """Register the profiler context managers at index 0 of both call hooks."""
+    registrations = (
+        (hooks.program_call_context, ProgramCallProfiler),
+        (hooks.compiled_program_call_context, CompiledProgramCallProfiler),
+    )
+    for call_hook, profiler_cls in registrations:
+        call_hook.register(profiler_cls, index=0)
+
+
+def stop_profiling_calls() -> None:
+    """Remove the profiler context managers from both call hooks."""
+    registrations = (
+        (hooks.program_call_context, ProgramCallProfiler),
+        (hooks.compiled_program_call_context, CompiledProgramCallProfiler),
+    )
+    for call_hook, profiler_cls in registrations:
+        call_hook.remove(profiler_cls)
+
+
+class ProgramProfiler(contextlib.AbstractContextManager):
+    """Base context manager that wraps a call in a named ``time_range``.
+
+    Subclasses are expected to set ``name`` in ``__init__`` and to define
+    ``COLOR_ID``, which is forwarded as ``color_id`` to ``time_range``.
+    """
+
+    # Label passed to 'time_range' as the range message.
+    name: str
+    # Range context manager created in '__enter__'. Typed as 'Any' because the
+    # concrete class differs between the CuPy and the fallback implementation.
+    time_range: Any
+
+    COLOR_ID: ClassVar[int]
+
+    __slots__ = ("name", "time_range")
+
+    def __enter__(self) -> ProgramProfiler:
+        """Open the range for this call and return the profiler itself."""
+        # Keep the context manager object itself rather than whatever its
+        # '__enter__' returns, so '__exit__' is always called on the right object.
+        self.time_range = time_range(self.name, color_id=self.COLOR_ID)
+        self.time_range.__enter__()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Close the range; exceptions from the profiled call are not suppressed."""
+        self.time_range.__exit__(exc_type, exc_val, exc_tb)
+
+
+class ProgramCallProfiler(ProgramProfiler):
+    """Profiler registered on ``hooks.program_call_context``.
+
+    The range is labelled with the ``__name__`` of the called program.
+    """
+
+    COLOR_ID: ClassVar[int] = 1
+
+    def __init__(
+        self,
+        program: gtx_typing.Program,
+        args: tuple[Any, ...],
+        offset_provider: common.OffsetProvider,
+        enable_jit: bool,
+        kwargs: dict[str, Any],
+    ) -> None:
+        # Only the program name is used; the other hook arguments are ignored.
+        program_name = program.__name__
+        self.name = program_name
+
+
+class CompiledProgramCallProfiler(ProgramProfiler):
+    """Profiler registered on ``hooks.compiled_program_call_context``.
+
+    The range is labelled ``"<root[0]><<root[1]>>"``, i.e. the first element of
+    ``root`` followed by the second one in angle brackets.
+    """
+
+    COLOR_ID: ClassVar[int] = 2
+
+    def __init__(
+        self,
+        compiled_program: Callable,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+        offset_provider: common.OffsetProvider,
+        root: tuple[str, str],
+        key: gtx_typing.CompiledProgramsKey,
+    ) -> None:
+        # Only 'root' contributes to the label; the other hook arguments are ignored.
+        self.name = "{}<{}>".format(root[0], root[1])
diff --git a/tests/next_tests/integration_tests/feature_tests/instrumentation_tests/test_gpu_profiler.py b/tests/next_tests/integration_tests/feature_tests/instrumentation_tests/test_gpu_profiler.py
@@ -0,0 +1,19 @@
+# GT4Py - GridTools Framework
+#
+# Copyright (c) 2014-2024, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+
+from __future__ import annotations
+
+from gt4py.next.instrumentation import gpu_profiler
+
+from ...multi_feature_tests.ffront_tests.test_ffront_fvm_nabla import pnabla
+
+
+
+def test_profile_calls_smoke():
+    """Entering and leaving ``profile_calls`` without running a program must not raise."""
+    # Wrapped in a test function so nothing runs at import/collection time.
+    with gpu_profiler.profile_calls():
+        pass
+
diff --git a/tests/next_tests/integration_tests/feature_tests/instrumentation_tests/test_hooks.py b/tests/next_tests/integration_tests/feature_tests/instrumentation_tests/test_hooks.py
@@ -16,7 +16,7 @@
 
 import gt4py.next as gtx
 from gt4py.next import common, Dims, gtfn_cpu, typing as gtx_typing
-from gt4py.next.instrumentation import hooks
+from gt4py.next.instrumentation import gpu_profiler, hooks
 
 try:
     from gt4py.next.program_processors.runners import dace as dace_backends
@@ -153,7 +153,9 @@ def test_program_call_hooks(backend: gtx_typing.Backend):
     hooks.program_call_context.register(custom_program_callback)
     hooks.embedded_program_call_context.register(custom_embedded_program_callback)
     hooks.compiled_program_call_context.register(custom_compiled_program_callback)
-    test_program(True, a_field, b_field, out=out_field)
+    # 'gpu_profiler' is already imported at module level; run the call with the
+    # profiler hooks installed on top of the custom callbacks registered above.
+    with gpu_profiler.profile_calls():
+        test_program(True, a_field, b_field, out=out_field)
 
     # Check that the callbacks were called
     assert len(callback_results) == 2

diff --git a/tests/next_tests/integration_tests/multi_feature_tests/ffront_tests/test_ffront_fvm_nabla.py b/tests/next_tests/integration_tests/multi_feature_tests/ffront_tests/test_ffront_fvm_nabla.py
@@ -104,3 +104,35 @@ def test_ffront_nabla(exec_alloc_descriptor):
     assert_close(3.5455427772565435e-003, np.max(pnabla_MXX.asnumpy()))
     assert_close(-3.3540113705465301e-003, np.min(pnabla_MYY.asnumpy()))
     assert_close(3.3540113705465301e-003, np.max(pnabla_MYY.asnumpy()))
+
+
+@pytest.mark.requires_atlas
+def test_ffront_nabla_profiler(exec_alloc_descriptor):
+    """Smoke test: run ``pnabla`` with call profiling and an explicit time range."""
+    # Skip, instead of erroring with ImportError, when CuPy is not installed.
+    cupy_profiler = pytest.importorskip("cupyx.profiler")
+
+    from gt4py.next.instrumentation import gpu_profiler
+
+    allocator = exec_alloc_descriptor.allocator
+
+    with gpu_profiler.profile_calls():
+        # Group setup, allocation and compilation under one range so they show
+        # up separately from the actual program call below.
+        with cupy_profiler.time_range("pnabla-preparation", color_id=3):
+            setup = nabla_setup(allocator=allocator)
+
+            pnabla_MXX = gtx.zeros({Vertex: setup.nodes_size}, allocator=allocator)
+            pnabla_MYY = gtx.zeros({Vertex: setup.nodes_size}, allocator=allocator)
+
+            offset_provider = {
+                "E2V": setup.edges2node_connectivity,
+                "V2E": setup.nodes2edge_connectivity,
+            }
+
+            pnabla_prog = pnabla.with_backend(exec_alloc_descriptor)
+            pnabla_prog.compile(offset_provider=offset_provider)
+
+        pnabla_prog(
+            setup.input_field,
+            setup.S_fields,
+            setup.sign_field,
+            setup.vol_field,
+            out=(pnabla_MXX, pnabla_MYY),
+            offset_provider=offset_provider,
+        )
+
+
diff --git a/tests/next_tests/unit_tests/instrumentation_tests/test_metrics.py b/tests/next_tests/unit_tests/instrumentation_tests/test_metrics.py
@@ -6,6 +6,8 @@
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
 
+from __future__ import annotations
+
 import json
 import pathlib
 import unittest.mock