def annotate(message=None, domain=None, category=None, color=None):
    """
    Decorator that runs a callable inside an NVTX range.

    Each call to the decorated callable is executed within
    ``nvtx.annotate(...)`` used as a context manager, so the context manager
    is exited both on normal return and when the callable raises.

    Supported spellings::

        @annotate(message="...", ...)
        @annotate()
        @annotate            # bare form, no parentheses

    Args:
        message: Optional message to display. If None, the callable's
            ``__name__`` is used; callables without a ``__name__`` (for
            example ``functools.partial`` objects or callable instances)
            fall back to the name of their type.
        domain: Optional NVTX domain string. Defaults to ``COMPUTE_DOMAIN``.
        category: Optional category, forwarded unchanged to ``nvtx.annotate``.
        color: Optional color in hexadecimal format (0xRRGGBB). Defaults to
            ``NVIDIA_GREEN``.

    Returns:
        A decorator that wraps its argument, or - in the bare ``@annotate``
        form - the already wrapped callable.
    """
    if callable(message):
        # Bare ``@annotate``: Python hands the decorated callable to us as the
        # first positional argument. Without this branch the decorated name
        # would silently be rebound to ``decorator`` instead of a wrapper.
        return annotate(None, domain=domain, category=category, color=color)(message)

    def decorator(func):
        if message is not None:
            range_message = message
        else:
            # Not every callable carries ``__name__``; avoid an AttributeError
            # at decoration time by falling back to the type name.
            range_message = getattr(func, "__name__", None) or type(func).__name__
        range_domain = COMPUTE_DOMAIN if domain is None else domain
        range_color = NVIDIA_GREEN if color is None else color

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with nvtx.annotate(
                range_message,
                domain=range_domain,
                color=range_color,
                category=category,
            ):
                return func(*args, **kwargs)

        return wrapper

    return decorator
set_cccl_iterator_state, to_cccl_value_state, ) +from .._nvtx import annotate from .._utils import protocols from .._utils.protocols import get_data_pointer, validate_and_get_stream from .._utils.temp_storage_buffer import TempStorageBuffer @@ -59,6 +60,7 @@ def __init__( self.h_init_cccl, ) + @annotate(message="_Reduce.__call__") def __call__( self, temp_storage, @@ -119,6 +121,7 @@ def make_cache_key( # TODO Figure out `sum` without operator and initial value # TODO Accept stream +@annotate() @cache_with_key(make_cache_key) def make_reduce_into( d_in: DeviceArrayLike | IteratorBase, @@ -148,6 +151,7 @@ def make_reduce_into( return _Reduce(d_in, d_out, op, h_init) +@annotate() def reduce_into( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_scan.py b/python/cuda_cccl/cuda/compute/algorithms/_scan.py index 57bb152e24b..78238c71705 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_scan.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_scan.py @@ -17,6 +17,7 @@ set_cccl_iterator_state, to_cccl_value_state, ) +from .._nvtx import annotate from .._utils import protocols from .._utils.protocols import get_data_pointer, validate_and_get_stream from .._utils.temp_storage_buffer import TempStorageBuffer @@ -116,6 +117,7 @@ def __init__( case (False, _bindings.InitKind.NO_INIT): raise ValueError("Exclusive scan with No init value is not supported") + @annotate(message="_Scan.__call__") def __call__( self, temp_storage, @@ -201,6 +203,7 @@ def make_cache_key( # TODO Figure out `sum` without operator and initial value # TODO Accept stream +@annotate() @cache_with_key(make_cache_key) def make_exclusive_scan( d_in: DeviceArrayLike | IteratorBase, @@ -230,6 +233,7 @@ def make_exclusive_scan( return _Scan(d_in, d_out, op, init_value, False) +@annotate() def exclusive_scan( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -267,6 +271,7 @@ def exclusive_scan( # 
TODO Figure out `sum` without operator and initial value # TODO Accept stream +@annotate() @cache_with_key(make_cache_key) def make_inclusive_scan( d_in: DeviceArrayLike | IteratorBase, @@ -296,6 +301,7 @@ def make_inclusive_scan( return _Scan(d_in, d_out, op, init_value, True) +@annotate() def inclusive_scan( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py index 84ffb5f7e32..fd002458fbd 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py @@ -11,6 +11,7 @@ set_cccl_iterator_state, to_cccl_value_state, ) +from .._nvtx import annotate from .._utils import protocols from .._utils.protocols import ( get_data_pointer, @@ -84,6 +85,7 @@ def __init__( self.h_init_cccl, ) + @annotate(message="_SegmentedReduce.__call__") def __call__( self, temp_storage, @@ -166,6 +168,7 @@ def make_cache_key( ) +@annotate() @cache_with_key(make_cache_key) def make_segmented_reduce( d_in: DeviceArrayLike | IteratorBase, @@ -199,6 +202,7 @@ def make_segmented_reduce( return _SegmentedReduce(d_in, d_out, start_offsets_in, end_offsets_in, op, h_init) +@annotate() def segmented_reduce( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_select.py b/python/cuda_cccl/cuda/compute/algorithms/_select.py index 83afb2c3053..4727b32f3fd 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_select.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_select.py @@ -6,6 +6,7 @@ from typing import Callable from .._caching import CachableFunction, cache_with_key +from .._nvtx import annotate from .._utils import protocols from ..iterators._factories import DiscardIterator from ..iterators._iterators import IteratorBase @@ -60,6 +61,7 @@ def _cccl_always_false(x): 
_cccl_always_false, # select_second_part_op - always false ) + @annotate(message="_Select.__call__") def __call__( self, temp_storage, @@ -81,6 +83,7 @@ def __call__( ) +@annotate() @cache_with_key(make_cache_key) def make_select( d_in: DeviceArrayLike | IteratorBase, @@ -115,6 +118,7 @@ def make_select( return _Select(d_in, d_out, d_num_selected_out, cond) +@annotate() def select( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py index 1a8613a0f5a..333d2b923d2 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py @@ -11,6 +11,7 @@ from ... import _cccl_interop as cccl from ..._caching import CachableFunction, cache_with_key from ..._cccl_interop import call_build, set_cccl_iterator_state +from ..._nvtx import annotate from ..._utils import protocols from ..._utils.protocols import ( get_data_pointer, @@ -107,6 +108,7 @@ def __init__( self.op_wrapper, ) + @annotate(message="_MergeSort.__call__") def __call__( self, temp_storage, @@ -153,6 +155,7 @@ def __call__( return temp_storage_bytes +@annotate() @cache_with_key(make_cache_key) def make_merge_sort( d_in_keys: DeviceArrayLike | IteratorBase, @@ -184,6 +187,7 @@ def make_merge_sort( return _MergeSort(d_in_keys, d_in_items, d_out_keys, d_out_items, op) +@annotate() def merge_sort( d_in_keys: DeviceArrayLike | IteratorBase, d_in_items: DeviceArrayLike | IteratorBase | None, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py index 1080143018a..66c292b890a 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py @@ -7,6 +7,7 @@ from ... 
import _cccl_interop as cccl from ..._caching import cache_with_key from ..._cccl_interop import call_build, set_cccl_iterator_state +from ..._nvtx import annotate from ..._utils.protocols import ( get_data_pointer, get_dtype, @@ -94,6 +95,7 @@ def __init__( decomposer_return_type, ) + @annotate(message="_RadixSort.__call__") def __call__( self, temp_storage, @@ -164,6 +166,7 @@ def __call__( return temp_storage_bytes +@annotate() @cache_with_key(make_cache_key) def make_radix_sort( d_in_keys: DeviceArrayLike | DoubleBuffer, @@ -195,6 +198,7 @@ def make_radix_sort( return _RadixSort(d_in_keys, d_out_keys, d_in_values, d_out_values, order) +@annotate() def radix_sort( d_in_keys: DeviceArrayLike | DoubleBuffer, d_out_keys: DeviceArrayLike | None, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py index 2df9bcc3db7..d80a2532766 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py @@ -8,6 +8,7 @@ from ... 
import _cccl_interop as cccl from ..._caching import cache_with_key from ..._cccl_interop import call_build, set_cccl_iterator_state +from ..._nvtx import annotate from ..._utils.protocols import ( get_data_pointer, get_dtype, @@ -66,6 +67,7 @@ def __init__( self.end_offsets_in_cccl, ) + @annotate(message="_SegmentedSort.__call__") def __call__( self, temp_storage, @@ -166,6 +168,7 @@ def make_cache_key( ) +@annotate() @cache_with_key(make_cache_key) def make_segmented_sort( d_in_keys: DeviceArrayLike | DoubleBuffer, @@ -209,6 +212,7 @@ def make_segmented_sort( ) +@annotate() def segmented_sort( d_in_keys: DeviceArrayLike | DoubleBuffer, d_out_keys: DeviceArrayLike | None, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py index aa0d6d8d5e6..4795f87287d 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py @@ -11,6 +11,7 @@ from .. 
import _cccl_interop as cccl from .._caching import CachableFunction, cache_with_key from .._cccl_interop import call_build, set_cccl_iterator_state +from .._nvtx import annotate from .._utils import protocols from .._utils.temp_storage_buffer import TempStorageBuffer from ..iterators._iterators import IteratorBase @@ -108,6 +109,7 @@ def __init__( self.select_second_part_op_wrapper, ) + @annotate(message="_ThreeWayPartition.__call__") def __call__( self, temp_storage, @@ -149,6 +151,7 @@ def __call__( return temp_storage_bytes +@annotate() @cache_with_key(make_cache_key) def make_three_way_partition( d_in: DeviceArrayLike | IteratorBase, @@ -192,6 +195,7 @@ def make_three_way_partition( ) +@annotate() def three_way_partition( d_in: DeviceArrayLike | IteratorBase, d_first_part_out: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_transform.py b/python/cuda_cccl/cuda/compute/algorithms/_transform.py index c6c451c0859..005aa8ec586 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_transform.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_transform.py @@ -9,6 +9,7 @@ from .. 
import _cccl_interop as cccl from .._caching import CachableFunction, cache_with_key from .._cccl_interop import set_cccl_iterator_state +from .._nvtx import annotate from .._utils import protocols from ..iterators._iterators import IteratorBase from ..numba_utils import get_inferred_return_type, signature_from_annotations @@ -54,6 +55,7 @@ def __init__( self.op_wrapper, ) + @annotate(message="_UnaryTransform.__call__") def __call__( self, d_in, @@ -118,6 +120,7 @@ def __init__( self.op_wrapper, ) + @annotate(message="_BinaryTransform.__call__") def __call__( self, d_in1, @@ -189,6 +192,7 @@ def make_binary_transform_cache_key( return (d_in1_key, d_in2_key, d_out_key, op_key) +@annotate() @cache_with_key(make_unary_transform_cache_key) def make_unary_transform( d_in: DeviceArrayLike | IteratorBase, @@ -219,6 +223,7 @@ def make_unary_transform( return _UnaryTransform(d_in, d_out, op) +@annotate() @cache_with_key(make_binary_transform_cache_key) def make_binary_transform( d_in1: DeviceArrayLike | IteratorBase, @@ -251,6 +256,7 @@ def make_binary_transform( return _BinaryTransform(d_in1, d_in2, d_out, op) +@annotate() def unary_transform( d_in: DeviceArrayLike | IteratorBase, d_out: DeviceArrayLike | IteratorBase, @@ -289,6 +295,7 @@ def unary_transform( transformer(d_in, d_out, num_items, stream) +@annotate() def binary_transform( d_in1: DeviceArrayLike | IteratorBase, d_in2: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py index ce5855bddd8..b62679ab0cc 100644 --- a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py +++ b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py @@ -11,6 +11,7 @@ from .. 
import _cccl_interop as cccl from .._caching import CachableFunction, cache_with_key from .._cccl_interop import call_build, set_cccl_iterator_state +from .._nvtx import annotate from .._utils import protocols from .._utils.protocols import ( get_data_pointer, @@ -115,6 +116,7 @@ def __init__( self.op_wrapper, ) + @annotate(message="_UniqueByKey.__call__") def __call__( self, temp_storage, @@ -157,6 +159,7 @@ def __call__( return temp_storage_bytes +@annotate() @cache_with_key(make_cache_key) def make_unique_by_key( d_in_keys: DeviceArrayLike | IteratorBase, @@ -193,6 +196,7 @@ def make_unique_by_key( ) +@annotate() def unique_by_key( d_in_keys: DeviceArrayLike | IteratorBase, d_in_items: DeviceArrayLike | IteratorBase, diff --git a/python/cuda_cccl/cuda/compute/iterators/_factories.py b/python/cuda_cccl/cuda/compute/iterators/_factories.py index 5023e660600..d398498b1cc 100644 --- a/python/cuda_cccl/cuda/compute/iterators/_factories.py +++ b/python/cuda_cccl/cuda/compute/iterators/_factories.py @@ -1,5 +1,6 @@ import numba +from .._nvtx import annotate from ._iterators import ( CacheModifiedPointer as _CacheModifiedPointer, ) @@ -20,6 +21,7 @@ from ._zip_iterator import make_zip_iterator +@annotate() def CacheModifiedInputIterator(device_array, modifier): """Random Access Cache Modified Iterator that wraps a native device pointer. @@ -50,6 +52,7 @@ def CacheModifiedInputIterator(device_array, modifier): ) +@annotate() def ConstantIterator(value): """Returns an Iterator representing a sequence of constant values. @@ -73,6 +76,7 @@ def ConstantIterator(value): return _ConstantIterator(value) +@annotate() def CountingIterator(offset): """Returns an Iterator representing a sequence of incrementing values. @@ -96,6 +100,7 @@ def CountingIterator(offset): return _CountingIterator(offset) +@annotate() def DiscardIterator(reference_iterator=None): """Returns an Input or Output Iterator that discards all values written to it. 
@@ -119,6 +124,7 @@ def DiscardIterator(reference_iterator=None): return _DiscardIterator(reference_iterator) +@annotate() def ReverseIterator(sequence): """Returns an Iterator over an array or another iterator in reverse. @@ -147,6 +153,7 @@ def ReverseIterator(sequence): return make_reverse_iterator(sequence) +@annotate() def TransformIterator(it, op): """An iterator that applies a user-defined unary function to the elements of an underlying iterator as they are read. @@ -169,6 +176,7 @@ def TransformIterator(it, op): return make_transform_iterator(it, op, "input") +@annotate() def TransformOutputIterator(it, op): """An iterator that applies a user-defined unary function to values before writing them to an underlying iterator. @@ -192,6 +200,7 @@ def TransformOutputIterator(it, op): return make_transform_iterator(it, op, "output") +@annotate() def PermutationIterator(values, indices): """Returns an Iterator that accesses values through an index mapping. @@ -219,6 +228,7 @@ def PermutationIterator(values, indices): return make_permutation_iterator(values, indices) +@annotate() def ZipIterator(*iterators): """Returns an Iterator representing a zipped sequence of values from N iterators. diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index e414dfda371..8f81bca514a 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "cuda-core", "numba-cuda>=0.20.0,!=0.21.2", "typing_extensions", + "nvtx", ] dynamic = ["version"] @@ -109,6 +110,7 @@ module = [ "cuda.bindings.*", "cuda.core.*", "cuda.pathfinder.*", + "nvtx", ] ignore_missing_imports = true follow_imports = "skip"