Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f1f7cfa
Implement the new tuning API for DeviceTransform
bernhardmgruber Dec 8, 2025
933848d
lol, this works
bernhardmgruber Dec 8, 2025
2449cd5
CCCL.C
bernhardmgruber Dec 8, 2025
5f9499a
missing operators and other CCCL.C fixes
bernhardmgruber Dec 8, 2025
8dcfdea
fixes for CCCL.C
bernhardmgruber Dec 8, 2025
5ece160
fixes for CCCL.C
bernhardmgruber Dec 8, 2025
c7b3553
not needed
bernhardmgruber Dec 8, 2025
118ce1f
drop comment
bernhardmgruber Dec 9, 2025
a25155b
Return policy by ref
bernhardmgruber Dec 9, 2025
ea66665
Cleanup
bernhardmgruber Dec 9, 2025
3eb8714
Implement tuning query
bernhardmgruber Dec 9, 2025
fd107c2
Try to make babelstream tunable
bernhardmgruber Dec 10, 2025
8bac0f9
Try to make other benchmarks tunable
bernhardmgruber Dec 10, 2025
bdb31d3
TUNE_BIF_BIAS
bernhardmgruber Dec 10, 2025
f1543db
Refactor
bernhardmgruber Dec 10, 2025
27020b2
fixes
bernhardmgruber Dec 10, 2025
1375b61
apply reviewer feedback
bernhardmgruber Dec 10, 2025
7594e24
Fix clang CUDA
bernhardmgruber Dec 10, 2025
f454f6a
nvcc compiler crash workaround
bernhardmgruber Dec 10, 2025
28c469b
gcc warning fix
bernhardmgruber Dec 11, 2025
5c91f23
nvcc crash fixes
bernhardmgruber Dec 11, 2025
d8d689c
MSVC and a bit of renaming
bernhardmgruber Dec 11, 2025
2ad96ef
Disable sm120 fill policy
bernhardmgruber Dec 11, 2025
d7809f2
Disable vector fill policies when we have inputs
bernhardmgruber Dec 11, 2025
c8b2ef6
MSVC < 14.44 workaround
bernhardmgruber Dec 11, 2025
631e275
MSVC: Replace lambda by struct
bernhardmgruber Dec 11, 2025
3880d5b
REMOVE ME: only test MSVC for now
bernhardmgruber Dec 11, 2025
8b01df7
Try to further simplify
bernhardmgruber Dec 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion c/parallel/src/kernels/iterators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ struct {0}_proxy_t {{
struct {0} {{
using iterator_category = cuda::std::random_access_iterator_tag;
using difference_type = DIFF_T;
using value_type = void;
using value_type = VALUE_T;
using pointer = {0}_proxy_t*;
using reference = {0}_proxy_t;
__device__ {0}_proxy_t operator*() const {{ return {{state}}; }}
Expand Down
212 changes: 76 additions & 136 deletions c/parallel/src/transform.cu

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ workflows:
# args: '--preset libcudacxx --lit-tests "cuda/utility/basic_any.pass.cpp"' }
#
override:
# NOTE(review): temporary MSVC-only CI override from commit 3880d5b ("REMOVE ME: only test MSVC for now") — revert before merge
- {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['msvc2019', 'msvc14.39']}
- {jobs: ['build'], std: 'minmax', ctk: '12.X', cxx: ['msvc2019', 'msvc' ]}
- {jobs: ['build'], std: 'minmax', ctk: '13.0', cxx: ['msvc2019', 'msvc' ]}

pull_request:
# Old CTK: Oldest/newest supported host compilers:
Expand Down
10 changes: 6 additions & 4 deletions cub/benchmarks/bench/transform/babelstream.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 1:4:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:2:1

// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE ipt 1:32:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#include "common.h"

Expand Down
79 changes: 48 additions & 31 deletions cub/benchmarks/bench/transform/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

// keep checks at the top so compilation of discarded variants fails really fast
#include <cub/device/dispatch/dispatch_transform.cuh>
#if !TUNE_BASE && TUNE_ALGORITHM == 2
#if !TUNE_BASE && TUNE_ALGORITHM == 3
# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# error "When tuning, this benchmark does not support being compiled for multiple architectures"
# endif
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 2 (ublkcp) below sm90"
# error "Cannot compile algorithm 3 (ublkcp) below sm90"
# endif
#endif

Expand All @@ -23,32 +23,47 @@

#include <nvbench_helper.cuh>

template <typename RandomAccessIteratorOut, typename... RandomAccessIteratorsIn>
#if TUNE_BASE
using policy_hub_t =
cub::detail::transform::policy_hub</* stable address */ false,
/* dense output */ true,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut>;
#else
struct policy_hub_t
#if !TUNE_BASE
struct arch_policies
{
struct max_policy : cub::ChainedPolicy<500, max_policy, max_policy>
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_arch_policy
{
static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__);
const int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__) + TUNE_BIF_BIAS;
# if TUNE_ALGORITHM == 0
static constexpr auto algorithm = cub::detail::transform::Algorithm::prefetch;
const auto algorithm = cub::detail::transform::Algorithm::prefetch;
const auto policy = prefetch_policy{
TUNE_THREADS
# ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT
,
TUNE_ITEMS_PER_THREAD_NO_INPUT
# endif
};
return {min_bif, algorithm, policy, {}, {}};
# elif TUNE_ALGORITHM == 1
static constexpr auto algorithm = cub::detail::transform::Algorithm::ublkcp;
const auto algorithm = cub::detail::transform::Algorithm::vectorized;
const auto policy = vectorized_policy{
TUNE_THREADS,
TUNE_VEC_SIZE * TUNE_VECTORS_PER_THREAD,
TUNE_VEC_SIZE
# ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT
,
TUNE_ITEMS_PER_THREAD_NO_INPUT
# endif
};
return {min_bif, algorithm, {}, policy, {}};
# elif TUNE_ALGORITHM == 2
const auto algorithm = cub::detail::transform::Algorithm::memcpy_async;
const auto policy = async_copy_policy{TUNE_THREADS, cub::detail::transform::ldgsts_size_and_align};
return {min_bif, algorithm, {}, {}, policy};
# elif TUNE_ALGORITHM == 3
const auto algorithm = cub::detail::transform::Algorithm::ublkcp;
const auto policy =
async_copy_policy{TUNE_THREADS, cub::detail::transform::bulk_copy_alignment(__CUDA_ARCH_LIST__)};
return {min_bif, algorithm, {}, {}, policy};
# else
# error Policy hub does not yet implement the specified value for algorithm
# endif

using algo_policy =
::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::prefetch,
cub::detail::transform::prefetch_policy_t<TUNE_THREADS>,
cub::detail::transform::async_copy_policy_t<TUNE_THREADS, __CUDA_ARCH_LIST__ == 900 ? 128 : 16>>;
};
}
};
#endif

Expand All @@ -60,15 +75,17 @@ void bench_transform(nvbench::state& state,
TransformOp transform_op)
{
state.exec(nvbench::exec_tag::gpu, [&](const nvbench::launch& launch) {
cub::detail::transform::dispatch_t<
cub::detail::transform::requires_stable_address::no,
OffsetT,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut,
cub::detail::transform::always_true_predicate,
TransformOp,
policy_hub_t<RandomAccessIteratorOut, RandomAccessIteratorsIn...>>::
dispatch(
inputs, output, num_items, cub::detail::transform::always_true_predicate{}, transform_op, launch.get_stream());
cub::detail::transform::dispatch<cub::detail::transform::requires_stable_address::no>(
inputs,
output,
num_items,
cub::detail::transform::always_true_predicate{},
transform_op,
launch.get_stream()
#if !TUNE_BASE
,
arch_policies{}
#endif
);
});
}
10 changes: 6 additions & 4 deletions cub/benchmarks/bench/transform/complex_cmp.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 1:4:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE ipt 1:32:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#include "common.h"

Expand Down
10 changes: 6 additions & 4 deletions cub/benchmarks/bench/transform/fib.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 1:4:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE ipt 1:32:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#include "common.h"

Expand Down
14 changes: 10 additions & 4 deletions cub/benchmarks/bench/transform/fill.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// for filling, we can only use the prefetch and the vectorized algorithm
// %RANGE% TUNE_ALGORITHM alg 1:2:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:3:1

// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 0 (prefetch)
// %RANGE% TUNE_ITEMS_PER_THREAD_NO_INPUT iptni 1:32:1

// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE ipt 1:32:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#include "common.h"

Expand Down
8 changes: 8 additions & 0 deletions cub/benchmarks/bench/transform/grayscale.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 0:3:1
// %RANGE% TUNE_THREADS tpb 128:1024:128

// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE ipt 1:32:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#include "common.h"

template <typename T>
Expand Down
10 changes: 6 additions & 4 deletions cub/benchmarks/bench/transform/heavy.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this
// benchmark. Its main use is to detect regressions.

// %RANGE% TUNE_BIF_BIAS bif -16:16:4
// %RANGE% TUNE_ALGORITHM alg 1:4:1
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized)
// %RANGE% TUNE_VEC_SIZE ipt 1:32:1
// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1

#include "common.h"

Expand Down
67 changes: 44 additions & 23 deletions cub/cub/device/device_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <cub/device/dispatch/dispatch_transform.cuh>
#include <cub/util_namespace.cuh>

#include <cuda/__execution/tune.h>
#include <cuda/__functional/address_stability.h>
#include <cuda/__stream/get_stream.h>
#include <cuda/std/__execution/env.h>
Expand Down Expand Up @@ -44,25 +45,31 @@ struct ::cuda::proclaims_copyable_arguments<CUB_NS_QUALIFIER::detail::__return_c
{};

CUB_NAMESPACE_BEGIN
namespace detail::transform
{
struct get_tuning_query_t
{};
} // namespace detail::transform

//! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input
//! sequences into an output sequence.
struct DeviceTransform
{
private:
template <typename... RandomAccessIteratorsIn,
template <detail::transform::requires_stable_address StableAddress = detail::transform::requires_stable_address::no,
typename... RandomAccessIteratorsIn,
typename RandomAccessIteratorOut,
typename NumItemsT,
typename Predicate,
typename TransformOp,
typename StableAddress = cuda::std::false_type>
typename Env>
CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
NumItemsT num_items,
Predicate predicate,
TransformOp transform_op,
cudaStream_t stream,
StableAddress = {})
Env env)
{
using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
using offset_t = typename choose_offset_t::type;
Expand All @@ -73,17 +80,32 @@ private:
return error;
}

return detail::transform::dispatch_t < StableAddress::value
? detail::transform::requires_stable_address::yes
: detail::transform::requires_stable_address::no,
offset_t, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, Predicate,
TransformOp > ::dispatch(
::cuda::std::move(inputs),
::cuda::std::move(output),
num_items,
::cuda::std::move(predicate),
::cuda::std::move(transform_op),
stream);
using tuning_env_t =
::cuda::std::execution::__query_result_or_t<Env, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
using transform_tuning_t =
::cuda::std::execution::__query_result_or_t<tuning_env_t, detail::transform::get_tuning_query_t, int>;

if constexpr (!::cuda::std::is_same_v<transform_tuning_t, int>)
{
return detail::transform::dispatch<StableAddress>(
::cuda::std::move(inputs),
::cuda::std::move(output),
static_cast<offset_t>(num_items),
::cuda::std::move(predicate),
::cuda::std::move(transform_op),
get_stream(env),
transform_tuning_t{});
}
else
{
return detail::transform::dispatch<StableAddress>(
::cuda::std::move(inputs),
::cuda::std::move(output),
static_cast<offset_t>(num_items),
::cuda::std::move(predicate),
::cuda::std::move(transform_op),
get_stream(env));
}
}

template <typename Env>
Expand Down Expand Up @@ -146,7 +168,7 @@ public:
num_items,
detail::transform::always_true_predicate{},
::cuda::std::move(transform_op),
get_stream(env));
::cuda::std::move(env));
}

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
Expand Down Expand Up @@ -269,7 +291,7 @@ public:
num_items,
detail::transform::always_true_predicate{},
::cuda::std::move(generator),
get_stream(env));
::cuda::std::move(env));
}

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
Expand Down Expand Up @@ -322,7 +344,7 @@ public:
num_items,
detail::transform::always_true_predicate{},
detail::__return_constant<Value>{::cuda::std::move(value)},
get_stream(env));
::cuda::std::move(env));
}

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
Expand Down Expand Up @@ -401,7 +423,7 @@ public:
num_items,
::cuda::std::move(predicate),
::cuda::std::move(transform_op),
get_stream(env));
::cuda::std::move(env));
}

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
Expand Down Expand Up @@ -489,7 +511,7 @@ public:
num_items,
::cuda::std::move(predicate),
::cuda::std::move(transform_op),
get_stream(env));
::cuda::std::move(env));
}

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
Expand Down Expand Up @@ -567,14 +589,13 @@ public:
Env env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
return TransformInternal(
return TransformInternal<detail::transform::requires_stable_address::yes>(
::cuda::std::move(inputs),
::cuda::std::move(output),
num_items,
detail::transform::always_true_predicate{},
::cuda::std::move(transform_op),
get_stream(env),
::cuda::std::true_type{});
::cuda::std::move(env));
}

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
Expand Down
Loading
Loading