diff --git a/python/cuda_cccl/cuda/compute/_nvtx.py b/python/cuda_cccl/cuda/compute/_nvtx.py
new file mode 100644
index 00000000000..364af934b6b
--- /dev/null
+++ b/python/cuda_cccl/cuda/compute/_nvtx.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+NVTX annotation utilities for cuda.compute module.
+Uses NVIDIA green (76B900) color and cuda.compute domain.
+"""
+
+import functools
+
+import nvtx
+
+# NVIDIA green color hex value (76B900)
+NVIDIA_GREEN = 0x76B900
+
+# Domain name for cuda.compute annotations
+COMPUTE_DOMAIN = "cuda.compute"
+
+
+def annotate(message=None, domain=None, category=None, color=None):
+    """
+    Decorator to annotate functions with NVTX ranges.
+
+    Works both as ``@annotate(...)`` and as bare ``@annotate``.
+    The NVTX range object is built once, when the function is decorated, and
+    re-entered on every call of the decorated function.
+
+    Args:
+        message: Optional message to display. If None, uses the function name.
+        domain: Optional NVTX domain string. Defaults to "cuda.compute".
+        category: Optional category for the annotation.
+        color: Optional color in hexadecimal format (0xRRGGBB). Defaults to NVIDIA green (0x76B900).
+
+    Returns:
+        A decorator; or, for bare ``@annotate``, the decorated function itself.
+    """
+    # Bare ``@annotate``: the function arrives in the ``message`` slot. Without
+    # this check the function would be silently replaced by ``decorator``.
+    if callable(message):
+        return annotate(domain=domain, category=category, color=color)(message)
+
+    def decorator(func):
+        # Every argument below is fixed at decoration time, so build the NVTX
+        # range object once here rather than on each invocation of ``func``.
+        nvtx_range = nvtx.annotate(
+            message if message is not None else func.__name__,
+            domain=domain if domain is not None else COMPUTE_DOMAIN,
+            color=color if color is not None else NVIDIA_GREEN,
+            category=category,
+        )
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # ``with`` runs the range's exit step even when ``func`` raises.
+            with nvtx_range:
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
index 9e15d3c3ff0..563fd30d418 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_histogram.py
@@ -11,6 +11,7 @@
 from .. import _cccl_interop as cccl
 from .._caching import cache_with_key
 from .._cccl_interop import call_build, set_cccl_iterator_state, to_cccl_value_state
+from .._nvtx import annotate
 from .._utils.protocols import get_data_pointer, get_dtype, validate_and_get_stream
 from .._utils.temp_storage_buffer import TempStorageBuffer
 from ..iterators._iterators import IteratorBase
@@ -90,6 +91,7 @@ def __init__(
             is_evenly_segmented,
         )
 
+    @annotate(message="_Histogram.__call__")
     def __call__(
         self,
         temp_storage,
@@ -134,6 +136,7 @@ def __call__(
         return temp_storage_bytes
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_histogram_even(
     d_samples: DeviceArrayLike | IteratorBase,
@@ -173,6 +176,7 @@ def make_histogram_even(
     )
 
 
+@annotate()
 def histogram_even(
     d_samples: DeviceArrayLike | IteratorBase,
     d_histogram: DeviceArrayLike,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
index 9b1c0f908c9..382ab5f2e7d 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_reduce.py
@@ -16,6 +16,7 @@
     set_cccl_iterator_state,
     to_cccl_value_state,
 )
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.protocols import get_data_pointer, validate_and_get_stream
 from .._utils.temp_storage_buffer import TempStorageBuffer
@@ -59,6 +60,7 @@ def __init__(
             self.h_init_cccl,
         )
 
+    @annotate(message="_Reduce.__call__")
     def __call__(
         self,
         temp_storage,
@@ -119,6 +121,7 @@ def make_cache_key(
 
 # TODO Figure out `sum` without operator and initial value
 # TODO Accept stream
+@annotate()
 @cache_with_key(make_cache_key)
 def make_reduce_into(
     d_in: DeviceArrayLike | IteratorBase,
@@ -148,6 +151,7 @@ def make_reduce_into(
     return _Reduce(d_in, d_out, op, h_init)
 
 
+@annotate()
 def reduce_into(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_scan.py b/python/cuda_cccl/cuda/compute/algorithms/_scan.py
index 57bb152e24b..78238c71705 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_scan.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_scan.py
@@ -17,6 +17,7 @@
     set_cccl_iterator_state,
     to_cccl_value_state,
 )
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.protocols import get_data_pointer, validate_and_get_stream
 from .._utils.temp_storage_buffer import TempStorageBuffer
@@ -116,6 +117,7 @@ def __init__(
             case (False, _bindings.InitKind.NO_INIT):
                 raise ValueError("Exclusive scan with No init value is not supported")
 
+    @annotate(message="_Scan.__call__")
     def __call__(
         self,
         temp_storage,
@@ -201,6 +203,7 @@ def make_cache_key(
 
 # TODO Figure out `sum` without operator and initial value
 # TODO Accept stream
+@annotate()
 @cache_with_key(make_cache_key)
 def make_exclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
@@ -230,6 +233,7 @@ def make_exclusive_scan(
     return _Scan(d_in, d_out, op, init_value, False)
 
 
+@annotate()
 def exclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -267,6 +271,7 @@ def exclusive_scan(
 
 # TODO Figure out `sum` without operator and initial value
 # TODO Accept stream
+@annotate()
 @cache_with_key(make_cache_key)
 def make_inclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
@@ -296,6 +301,7 @@ def make_inclusive_scan(
     return _Scan(d_in, d_out, op, init_value, True)
 
 
+@annotate()
 def inclusive_scan(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py
index 84ffb5f7e32..fd002458fbd 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_segmented_reduce.py
@@ -11,6 +11,7 @@
     set_cccl_iterator_state,
     to_cccl_value_state,
 )
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.protocols import (
     get_data_pointer,
@@ -84,6 +85,7 @@ def __init__(
             self.h_init_cccl,
         )
 
+    @annotate(message="_SegmentedReduce.__call__")
     def __call__(
         self,
         temp_storage,
@@ -166,6 +168,7 @@ def make_cache_key(
     )
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_segmented_reduce(
     d_in: DeviceArrayLike | IteratorBase,
@@ -199,6 +202,7 @@ def make_segmented_reduce(
     return _SegmentedReduce(d_in, d_out, start_offsets_in, end_offsets_in, op, h_init)
 
 
+@annotate()
 def segmented_reduce(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_select.py b/python/cuda_cccl/cuda/compute/algorithms/_select.py
index 83afb2c3053..4727b32f3fd 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_select.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_select.py
@@ -6,6 +6,7 @@
 from typing import Callable
 
 from .._caching import CachableFunction, cache_with_key
+from .._nvtx import annotate
 from .._utils import protocols
 from ..iterators._factories import DiscardIterator
 from ..iterators._iterators import IteratorBase
@@ -60,6 +61,7 @@ def _cccl_always_false(x):
             _cccl_always_false,  # select_second_part_op - always false
         )
 
+    @annotate(message="_Select.__call__")
     def __call__(
         self,
         temp_storage,
@@ -81,6 +83,7 @@ def __call__(
         )
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_select(
     d_in: DeviceArrayLike | IteratorBase,
@@ -115,6 +118,7 @@ def make_select(
     return _Select(d_in, d_out, d_num_selected_out, cond)
 
 
+@annotate()
 def select(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py
index 1a8613a0f5a..333d2b923d2 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_merge_sort.py
@@ -11,6 +11,7 @@
 from ... import _cccl_interop as cccl
 from ..._caching import CachableFunction, cache_with_key
 from ..._cccl_interop import call_build, set_cccl_iterator_state
+from ..._nvtx import annotate
 from ..._utils import protocols
 from ..._utils.protocols import (
     get_data_pointer,
@@ -107,6 +108,7 @@ def __init__(
             self.op_wrapper,
         )
 
+    @annotate(message="_MergeSort.__call__")
     def __call__(
         self,
         temp_storage,
@@ -153,6 +155,7 @@ def __call__(
         return temp_storage_bytes
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_merge_sort(
     d_in_keys: DeviceArrayLike | IteratorBase,
@@ -184,6 +187,7 @@ def make_merge_sort(
     return _MergeSort(d_in_keys, d_in_items, d_out_keys, d_out_items, op)
 
 
+@annotate()
 def merge_sort(
     d_in_keys: DeviceArrayLike | IteratorBase,
     d_in_items: DeviceArrayLike | IteratorBase | None,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py
index 1080143018a..66c292b890a 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_radix_sort.py
@@ -7,6 +7,7 @@
 from ... import _cccl_interop as cccl
 from ..._caching import cache_with_key
 from ..._cccl_interop import call_build, set_cccl_iterator_state
+from ..._nvtx import annotate
 from ..._utils.protocols import (
     get_data_pointer,
     get_dtype,
@@ -94,6 +95,7 @@ def __init__(
             decomposer_return_type,
         )
 
+    @annotate(message="_RadixSort.__call__")
     def __call__(
         self,
         temp_storage,
@@ -164,6 +166,7 @@ def __call__(
         return temp_storage_bytes
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_radix_sort(
     d_in_keys: DeviceArrayLike | DoubleBuffer,
@@ -195,6 +198,7 @@ def make_radix_sort(
     return _RadixSort(d_in_keys, d_out_keys, d_in_values, d_out_values, order)
 
 
+@annotate()
 def radix_sort(
     d_in_keys: DeviceArrayLike | DoubleBuffer,
     d_out_keys: DeviceArrayLike | None,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py
index 2df9bcc3db7..d80a2532766 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_sort/_segmented_sort.py
@@ -8,6 +8,7 @@
 from ... import _cccl_interop as cccl
 from ..._caching import cache_with_key
 from ..._cccl_interop import call_build, set_cccl_iterator_state
+from ..._nvtx import annotate
 from ..._utils.protocols import (
     get_data_pointer,
     get_dtype,
@@ -66,6 +67,7 @@ def __init__(
             self.end_offsets_in_cccl,
         )
 
+    @annotate(message="_SegmentedSort.__call__")
     def __call__(
         self,
         temp_storage,
@@ -166,6 +168,7 @@ def make_cache_key(
     )
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_segmented_sort(
     d_in_keys: DeviceArrayLike | DoubleBuffer,
@@ -209,6 +212,7 @@ def make_segmented_sort(
     )
 
 
+@annotate()
 def segmented_sort(
     d_in_keys: DeviceArrayLike | DoubleBuffer,
     d_out_keys: DeviceArrayLike | None,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py
index aa0d6d8d5e6..4795f87287d 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_three_way_partition.py
@@ -11,6 +11,7 @@
 from .. import _cccl_interop as cccl
 from .._caching import CachableFunction, cache_with_key
 from .._cccl_interop import call_build, set_cccl_iterator_state
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.temp_storage_buffer import TempStorageBuffer
 from ..iterators._iterators import IteratorBase
@@ -108,6 +109,7 @@ def __init__(
             self.select_second_part_op_wrapper,
         )
 
+    @annotate(message="_ThreeWayPartition.__call__")
     def __call__(
         self,
         temp_storage,
@@ -149,6 +151,7 @@ def __call__(
         return temp_storage_bytes
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_three_way_partition(
     d_in: DeviceArrayLike | IteratorBase,
@@ -192,6 +195,7 @@ def make_three_way_partition(
     )
 
 
+@annotate()
 def three_way_partition(
     d_in: DeviceArrayLike | IteratorBase,
     d_first_part_out: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_transform.py b/python/cuda_cccl/cuda/compute/algorithms/_transform.py
index c6c451c0859..005aa8ec586 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_transform.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_transform.py
@@ -9,6 +9,7 @@
 from .. import _cccl_interop as cccl
 from .._caching import CachableFunction, cache_with_key
 from .._cccl_interop import set_cccl_iterator_state
+from .._nvtx import annotate
 from .._utils import protocols
 from ..iterators._iterators import IteratorBase
 from ..numba_utils import get_inferred_return_type, signature_from_annotations
@@ -54,6 +55,7 @@ def __init__(
             self.op_wrapper,
         )
 
+    @annotate(message="_UnaryTransform.__call__")
     def __call__(
         self,
         d_in,
@@ -118,6 +120,7 @@ def __init__(
             self.op_wrapper,
         )
 
+    @annotate(message="_BinaryTransform.__call__")
     def __call__(
         self,
         d_in1,
@@ -189,6 +192,7 @@ def make_binary_transform_cache_key(
     return (d_in1_key, d_in2_key, d_out_key, op_key)
 
 
+@annotate()
 @cache_with_key(make_unary_transform_cache_key)
 def make_unary_transform(
     d_in: DeviceArrayLike | IteratorBase,
@@ -219,6 +223,7 @@ def make_unary_transform(
     return _UnaryTransform(d_in, d_out, op)
 
 
+@annotate()
 @cache_with_key(make_binary_transform_cache_key)
 def make_binary_transform(
     d_in1: DeviceArrayLike | IteratorBase,
@@ -251,6 +256,7 @@ def make_binary_transform(
     return _BinaryTransform(d_in1, d_in2, d_out, op)
 
 
+@annotate()
 def unary_transform(
     d_in: DeviceArrayLike | IteratorBase,
     d_out: DeviceArrayLike | IteratorBase,
@@ -289,6 +295,7 @@ def unary_transform(
     transformer(d_in, d_out, num_items, stream)
 
 
+@annotate()
 def binary_transform(
     d_in1: DeviceArrayLike | IteratorBase,
     d_in2: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py
index ce5855bddd8..b62679ab0cc 100644
--- a/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py
+++ b/python/cuda_cccl/cuda/compute/algorithms/_unique_by_key.py
@@ -11,6 +11,7 @@
 from .. import _cccl_interop as cccl
 from .._caching import CachableFunction, cache_with_key
 from .._cccl_interop import call_build, set_cccl_iterator_state
+from .._nvtx import annotate
 from .._utils import protocols
 from .._utils.protocols import (
     get_data_pointer,
@@ -115,6 +116,7 @@ def __init__(
             self.op_wrapper,
         )
 
+    @annotate(message="_UniqueByKey.__call__")
     def __call__(
         self,
         temp_storage,
@@ -157,6 +159,7 @@ def __call__(
         return temp_storage_bytes
 
 
+@annotate()
 @cache_with_key(make_cache_key)
 def make_unique_by_key(
     d_in_keys: DeviceArrayLike | IteratorBase,
@@ -193,6 +196,7 @@ def make_unique_by_key(
     )
 
 
+@annotate()
 def unique_by_key(
     d_in_keys: DeviceArrayLike | IteratorBase,
     d_in_items: DeviceArrayLike | IteratorBase,
diff --git a/python/cuda_cccl/cuda/compute/iterators/_factories.py b/python/cuda_cccl/cuda/compute/iterators/_factories.py
index 5023e660600..d398498b1cc 100644
--- a/python/cuda_cccl/cuda/compute/iterators/_factories.py
+++ b/python/cuda_cccl/cuda/compute/iterators/_factories.py
@@ -1,5 +1,6 @@
 import numba
 
+from .._nvtx import annotate
 from ._iterators import (
     CacheModifiedPointer as _CacheModifiedPointer,
 )
@@ -20,6 +21,7 @@
 from ._zip_iterator import make_zip_iterator
 
 
+@annotate()
 def CacheModifiedInputIterator(device_array, modifier):
     """Random Access Cache Modified Iterator that wraps a native device pointer.
 
@@ -50,6 +52,7 @@ def CacheModifiedInputIterator(device_array, modifier):
     )
 
 
+@annotate()
 def ConstantIterator(value):
     """Returns an Iterator representing a sequence of constant values.
 
@@ -73,6 +76,7 @@ def ConstantIterator(value):
     return _ConstantIterator(value)
 
 
+@annotate()
 def CountingIterator(offset):
     """Returns an Iterator representing a sequence of incrementing values.
 
@@ -96,6 +100,7 @@ def CountingIterator(offset):
     return _CountingIterator(offset)
 
 
+@annotate()
 def DiscardIterator(reference_iterator=None):
     """Returns an Input or Output Iterator that discards all values written to it.
 
@@ -119,6 +124,7 @@ def DiscardIterator(reference_iterator=None):
     return _DiscardIterator(reference_iterator)
 
 
+@annotate()
 def ReverseIterator(sequence):
     """Returns an Iterator over an array or another iterator in reverse.
 
@@ -147,6 +153,7 @@ def ReverseIterator(sequence):
     return make_reverse_iterator(sequence)
 
 
+@annotate()
 def TransformIterator(it, op):
     """An iterator that applies a user-defined unary function to the elements of an underlying iterator as they are read.
 
@@ -169,6 +176,7 @@ def TransformIterator(it, op):
     return make_transform_iterator(it, op, "input")
 
 
+@annotate()
 def TransformOutputIterator(it, op):
     """An iterator that applies a user-defined unary function to values before writing them to an underlying iterator.
 
@@ -192,6 +200,7 @@ def TransformOutputIterator(it, op):
     return make_transform_iterator(it, op, "output")
 
 
+@annotate()
 def PermutationIterator(values, indices):
     """Returns an Iterator that accesses values through an index mapping.
 
@@ -219,6 +228,7 @@ def PermutationIterator(values, indices):
     return make_permutation_iterator(values, indices)
 
 
+@annotate()
 def ZipIterator(*iterators):
     """Returns an Iterator representing a zipped sequence of values from N iterators.
 
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index e414dfda371..8f81bca514a 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
   "cuda-core",
   "numba-cuda>=0.20.0,!=0.21.2",
   "typing_extensions",
+  "nvtx",
 ]
 
 dynamic = ["version"]
@@ -109,6 +110,7 @@ module = [
   "cuda.bindings.*",
   "cuda.core.*",
   "cuda.pathfinder.*",
+  "nvtx",
 ]
 ignore_missing_imports = true
 follow_imports = "skip"