diff --git a/c/parallel/src/kernels/iterators.cpp b/c/parallel/src/kernels/iterators.cpp index e23b219486d..ff588bcadf7 100644 --- a/c/parallel/src/kernels/iterators.cpp +++ b/c/parallel/src/kernels/iterators.cpp @@ -114,7 +114,7 @@ struct {0}_proxy_t {{ struct {0} {{ using iterator_category = cuda::std::random_access_iterator_tag; using difference_type = DIFF_T; - using value_type = void; + using value_type = VALUE_T; using pointer = {0}_proxy_t*; using reference = {0}_proxy_t; __device__ {0}_proxy_t operator*() const {{ return {{state}}; }} diff --git a/c/parallel/src/transform.cu b/c/parallel/src/transform.cu index aae70bddd61..13404420236 100644 --- a/c/parallel/src/transform.cu +++ b/c/parallel/src/transform.cu @@ -120,48 +120,6 @@ get_kernel_name(cccl_iterator_t input1_it, cccl_iterator_t input2_it, cccl_itera namespace cdt = cub::detail::transform; -struct runtime_tuning_policy -{ - using max_policy = runtime_tuning_policy; - - cdt::Algorithm algorithm; - int min_bif; - cdt::RuntimeTransformAgentPrefetchPolicy prefetch_policy; - cdt::RuntimeTransformAgentVectorizedPolicy vectorized_policy; - cdt::RuntimeTransformAgentAsyncPolicy async_policy; - - cdt::Algorithm Algorithm() const - { - return algorithm; - } - - int MinBif() const - { - return min_bif; - } - - auto PrefetchPolicy() const - { - return prefetch_policy; - } - - auto VectorizedPolicy() const - { - return vectorized_policy; - } - - auto AsyncPolicy() const - { - return async_policy; - } - - template - cudaError_t Invoke([[maybe_unused]] int device_ptx_version, F& op) - { - return op.template Invoke(*this); - } -}; - struct cache { cuda::std::optional> async_config{}; @@ -172,7 +130,7 @@ template struct transform_kernel_source { cccl_device_transform_build_result_t& build; - std::array, NumInputs> it_value_sizes_alignments; + cuda::std::array inputs; template cub::detail::transform::cuda_expected @@ -208,9 +166,9 @@ struct transform_kernel_source return build.loaded_bytes_per_iteration; } - auto ItValueSizesAlignments() const + const auto& InputIteratorInfos() const { - return cuda::std::span(it_value_sizes_alignments); + return inputs; } template @@ -238,6 +196,14 @@ public: return (is_pointer_aligned(its, its.value_size * vec_size) && ...); } }; + +auto make_iterator_info(cccl_iterator_t input_it) -> cdt::iterator_info +{ + return {static_cast(input_it.value_type.size), + static_cast(input_it.value_type.alignment), + /* trivially_relocatable */ true, // TODO(bgruber): how to check this properly? + /* is contiguous */ input_it.type == CCCL_POINTER}; // TODO(bgruber): how to check this properly? 
+} } // namespace transform CUresult cccl_device_unary_transform_build_ex( @@ -259,7 +225,6 @@ CUresult cccl_device_unary_transform_build_ex( { const char* name = "test"; - const int cc = cc_major * 10 + cc_minor; const auto input_it_value_t = cccl_type_enum_to_name(input_it.value_type.type); const auto output_it_value_t = cccl_type_enum_to_name(output_it.value_type.type); const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_INT64); @@ -269,8 +234,17 @@ CUresult cccl_device_unary_transform_build_ex( make_kernel_output_iterator(offset_t, transform::output_iterator_name, output_it_value_t, output_it); const std::string op_src = make_kernel_user_unary_operator(input_it_value_t, output_it_value_t, op); + const auto inputs = + cuda::std::array{transform::make_iterator_info(input_it)}; + const auto output = transform::make_iterator_info(output_it); + const auto cub_arch_policies = cub::detail::transform::arch_policies<1>{false, true, inputs, output}; + + // TODO(bgruber): drop this if tuning policies become formattable + std::stringstream cub_arch_policies_str; + cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor})); + const auto policy_hub_expr = std::format( - "cub::detail::transform::policy_hub, {}>", + "cub::detail::transform::arch_policies_from_types, {}>", transform::get_iterator_name(input_it, transform::input_iterator_name), transform::get_iterator_name(output_it, transform::output_iterator_name)); @@ -287,13 +261,10 @@ struct __align__({3}) output_storage_t {{ {4} {5} {6} -using device_transform_policy = {7}::max_policy; - -#include -__device__ consteval auto& policy_generator() {{ - return ptx_json::id() - = cub::detail::transform::TransformPolicyWrapper::EncodedPolicy(); -}}; +using device_transform_policy = {7}; +using namespace cub; +using namespace cub::detail::transform; +static_assert(device_transform_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {8}, "Host generated and JIT compiled policy mismatch"); )XXX", input_it.value_type.size, // 0 input_it.value_type.alignment, // 1 @@ -302,7 +273,8 @@ __device__ consteval auto& policy_generator() {{ input_iterator_src, // 4 output_iterator_src, // 5 op_src, // 6 - policy_hub_expr); // 7 + policy_hub_expr, // 7 + cub_arch_policies_str.view()); // 8 #if false // CCCL_DEBUGGING_SWITCH fflush(stderr); @@ -328,7 +300,6 @@ __device__ consteval auto& policy_generator() {{ "-dlto", "-default-device", "-DCUB_DISABLE_CDP", - "-DCUB_ENABLE_POLICY_PTX_JSON", "-std=c++20"}; cccl::detail::extend_args_with_build_config(args, config); @@ -357,23 +328,12 @@ __device__ consteval auto& policy_generator() {{ cuLibraryLoadData(&build_ptr->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build_ptr->transform_kernel, build_ptr->library, kernel_lowered_name.c_str())); - nlohmann::json runtime_policy = - cub::detail::ptx_json::parse("device_transform_policy", {result.data.get(), result.size}); - - const auto algorithm = static_cast(runtime_policy["algorithm"].get()); - const auto min_bif = static_cast(runtime_policy["min_bif"].get()); - build_ptr->loaded_bytes_per_iteration = static_cast(input_it.value_type.size); - build_ptr->cc = cc; + build_ptr->cc = cc_major * 10 + cc_minor; build_ptr->cubin = (void*) result.data.release(); build_ptr->cubin_size = result.size; - build_ptr->runtime_policy = new transform::runtime_tuning_policy{ - algorithm, - min_bif, - transform::cdt::RuntimeTransformAgentPrefetchPolicy::from_json(runtime_policy, 
"prefetch_policy"), - transform::cdt::RuntimeTransformAgentVectorizedPolicy::from_json(runtime_policy, "vectorized_policy"), - transform::cdt::RuntimeTransformAgentAsyncPolicy::from_json(runtime_policy, "async_policy")}; - build_ptr->cache = new transform::cache(); + build_ptr->runtime_policy = new cub::detail::transform::arch_policies<1>{cub_arch_policies}; + build_ptr->cache = new transform::cache(); } catch (const std::exception& exc) { @@ -402,25 +362,16 @@ CUresult cccl_device_unary_transform( CUdevice cu_device; check(cuCtxGetDevice(&cu_device)); - error = static_cast( - transform::cdt::dispatch_t, - indirect_iterator_t, - transform::cdt::always_true_predicate, - indirect_arg_t, - transform::runtime_tuning_policy, - transform::transform_kernel_source<1>, - cub::detail::CudaDriverLauncherFactory>:: - dispatch(d_in, - d_out, - num_items, - {}, - op, - stream, - {build, {{{d_in.value_type.size, d_in.value_type.alignment}}}}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - *reinterpret_cast(build.runtime_policy))); + error = static_cast(transform::cdt::dispatch( + ::cuda::std::tuple{d_in}, + indirect_iterator_t{d_out}, + static_cast(num_items), + transform::cdt::always_true_predicate{}, + op, + stream, + *static_cast*>(build.runtime_policy), + transform::transform_kernel_source<1>{build, {transform::make_iterator_info(d_in)}}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc})); } catch (const std::exception& exc) { @@ -457,7 +408,6 @@ CUresult cccl_device_binary_transform_build_ex( { const char* name = "test"; - const int cc = cc_major * 10 + cc_minor; const auto input1_it_value_t = cccl_type_enum_to_name(input1_it.value_type.type); const auto input2_it_value_t = cccl_type_enum_to_name(input2_it.value_type.type); @@ -473,8 +423,17 @@ CUresult cccl_device_binary_transform_build_ex( const std::string op_src = make_kernel_user_binary_operator(input1_it_value_t, input2_it_value_t, output_it_value_t, op); + const auto inputs = cuda::std::array{ + transform::make_iterator_info(input1_it), transform::make_iterator_info(input2_it)}; + const auto output = transform::make_iterator_info(output_it); + const auto cub_arch_policies = cub::detail::transform::arch_policies<2>{false, true, inputs, output}; + + // TODO(bgruber): drop this if tuning policies become formattable + std::stringstream cub_arch_policies_str; + cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor})); + const auto policy_hub_expr = std::format( - "cub::detail::transform::policy_hub, {2}>", + "cub::detail::transform::arch_policies_from_types, {2}>", transform::get_iterator_name(input1_it, transform::input1_iterator_name), transform::get_iterator_name(input2_it, transform::input2_iterator_name), transform::get_iterator_name(output_it, transform::output_iterator_name)); @@ -496,13 +455,10 @@ struct __align__({5}) output_storage_t {{ {7} {8} {9} -using device_transform_policy = {10}::max_policy; - -#include -__device__ consteval auto& policy_generator() {{ - return ptx_json::id() - = cub::detail::transform::TransformPolicyWrapper::EncodedPolicy(); -}}; +using device_transform_policy = {10}; +using namespace cub; +using namespace cub::detail::transform; +static_assert(device_transform_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {11}, "Host generated and JIT compiled policy mismatch"); )XXX", input1_it.value_type.size, // 0 input1_it.value_type.alignment, // 1 @@ -514,7 +470,8 @@ __device__ consteval auto& policy_generator() {{ input2_iterator_src, // 
7 output_iterator_src, // 8 op_src, // 9 - policy_hub_expr); // 10 + policy_hub_expr, // 10 + cub_arch_policies_str.view()); // 11 #if false // CCCL_DEBUGGING_SWITCH fflush(stderr); @@ -537,7 +494,6 @@ __device__ consteval auto& policy_generator() {{ "-dlto", "-default-device", "-DCUB_DISABLE_CDP", - "-DCUB_ENABLE_POLICY_PTX_JSON", "-std=c++20"}; cccl::detail::extend_args_with_build_config(args, config); @@ -570,20 +526,12 @@ __device__ consteval auto& policy_generator() {{ nlohmann::json runtime_policy = cub::detail::ptx_json::parse("device_transform_policy", {result.data.get(), result.size}); - const auto algorithm = static_cast(runtime_policy["algorithm"].get()); - const auto min_bif = static_cast(runtime_policy["min_bif"].get()); - build_ptr->loaded_bytes_per_iteration = static_cast((input1_it.value_type.size + input2_it.value_type.size)); - build_ptr->cc = cc; + build_ptr->cc = cc_major * 10 + cc_minor; build_ptr->cubin = (void*) result.data.release(); build_ptr->cubin_size = result.size; - build_ptr->runtime_policy = new transform::runtime_tuning_policy{ - algorithm, - min_bif, - transform::cdt::RuntimeTransformAgentPrefetchPolicy::from_json(runtime_policy, "prefetch_policy"), - transform::cdt::RuntimeTransformAgentVectorizedPolicy::from_json(runtime_policy, "vectorized_policy"), - transform::cdt::RuntimeTransformAgentAsyncPolicy::from_json(runtime_policy, "async_policy")}; - build_ptr->cache = new transform::cache(); + build_ptr->runtime_policy = new cub::detail::transform::arch_policies<2>{cub_arch_policies}; + build_ptr->cache = new transform::cache(); } catch (const std::exception& exc) { @@ -614,27 +562,17 @@ CUresult cccl_device_binary_transform( CUdevice cu_device; check(cuCtxGetDevice(&cu_device)); - error = static_cast( - transform::cdt::dispatch_t, - indirect_iterator_t, - transform::cdt::always_true_predicate, - indirect_arg_t, - transform::runtime_tuning_policy, - transform::transform_kernel_source<2>, - cub::detail::CudaDriverLauncherFactory>:: - dispatch( - ::cuda::std::make_tuple(d_in1, d_in2), - d_out, - num_items, - {}, - op, - stream, - {build, - {{{d_in1.value_type.size, d_in1.value_type.alignment}, {d_in2.value_type.size, d_in2.value_type.alignment}}}}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - *reinterpret_cast(build.runtime_policy))); + error = static_cast(transform::cdt::dispatch( + ::cuda::std::make_tuple(d_in1, d_in2), + indirect_iterator_t{d_out}, + static_cast(num_items), + transform::cdt::always_true_predicate{}, + op, + stream, + *static_cast*>(build.runtime_policy), + transform::transform_kernel_source<2>{ + build, {transform::make_iterator_info(d_in1), transform::make_iterator_info(d_in2)}}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc})); } catch (const std::exception& exc) { @@ -692,10 +630,12 @@ CUresult cccl_device_transform_cleanup(cccl_device_transform_build_result_t* bui { return CUDA_ERROR_INVALID_VALUE; } - std::unique_ptr cubin(reinterpret_cast(build_ptr->cubin)); - std::unique_ptr rtp( - reinterpret_cast(build_ptr->runtime_policy)); - std::unique_ptr cache(reinterpret_cast(build_ptr->cache)); + using namespace cub::detail::transform; + std::unique_ptr cubin(static_cast(build_ptr->cubin)); + std::unique_ptr> rtp(static_cast*>(build_ptr->runtime_policy)); // FIXME(bgruber): + // handle <2> as + // well + std::unique_ptr cache(static_cast(build_ptr->cache)); check(cuLibraryUnload(build_ptr->library)); } catch (const std::exception& exc) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 
34e39cc9078..c228702750f 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -21,6 +21,9 @@ workflows: # args: '--preset libcudacxx --lit-tests "cuda/utility/basic_any.pass.cpp"' } # override: + - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['msvc2019', 'msvc14.39']} + - {jobs: ['build'], std: 'minmax', ctk: '12.X', cxx: ['msvc2019', 'msvc' ]} + - {jobs: ['build'], std: 'minmax', ctk: '13.0', cxx: ['msvc2019', 'msvc' ]} pull_request: # Old CTK: Oldest/newest supported host compilers: diff --git a/cub/benchmarks/bench/transform/babelstream.cu b/cub/benchmarks/bench/transform/babelstream.cu index 67462949425..84b96de3c04 100644 --- a/cub/benchmarks/bench/transform/babelstream.cu +++ b/cub/benchmarks/bench/transform/babelstream.cu @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions. - +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:2:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h index aa7472351f1..6f0e69ae096 100644 --- a/cub/benchmarks/bench/transform/common.h +++ b/cub/benchmarks/bench/transform/common.h @@ -5,12 +5,12 @@ // keep checks at the top so compilation of discarded variants fails really fast #include -#if !TUNE_BASE && TUNE_ALGORITHM == 2 +#if !TUNE_BASE && TUNE_ALGORITHM == 3 # if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 # error "When tuning, this benchmark does not support being compiled for multiple architectures" # endif # if (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 2 (ublkcp) below sm90" +# error "Cannot compile algorithm 3 (ublkcp) below sm90" # endif #endif @@ -23,32 +23,47 @@ #include -template -#if TUNE_BASE -using policy_hub_t = - cub::detail::transform::policy_hub, - RandomAccessIteratorOut>; -#else -struct policy_hub_t +#if !TUNE_BASE +struct arch_policies { - struct max_policy : cub::ChainedPolicy<500, max_policy, max_policy> + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_arch_policy { - static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__); + const int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__) + TUNE_BIF_BIAS; # if TUNE_ALGORITHM == 0 - static constexpr auto algorithm = cub::detail::transform::Algorithm::prefetch; + const auto algorithm = cub::detail::transform::Algorithm::prefetch; + const auto policy = prefetch_policy{ + TUNE_THREADS +# ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT + , + TUNE_ITEMS_PER_THREAD_NO_INPUT +# endif + }; + return {min_bif, algorithm, policy, {}, {}}; # elif TUNE_ALGORITHM == 1 - static constexpr auto algorithm = cub::detail::transform::Algorithm::ublkcp; + const auto algorithm = cub::detail::transform::Algorithm::vectorized; + const auto policy = vectorized_policy{ + TUNE_THREADS, + TUNE_VEC_SIZE * TUNE_VECTORS_PER_THREAD, + TUNE_VEC_SIZE +# ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT + , + TUNE_ITEMS_PER_THREAD_NO_INPUT +# endif + }; + return {min_bif, algorithm, {}, policy, {}}; +# elif 
TUNE_ALGORITHM == 2 + const auto algorithm = cub::detail::transform::Algorithm::memcpy_async; + const auto policy = async_copy_policy{TUNE_THREADS, cub::detail::transform::ldgsts_size_and_align}; + return {min_bif, algorithm, {}, {}, policy}; +# elif TUNE_ALGORITHM == 3 + const auto algorithm = cub::detail::transform::Algorithm::ublkcp; + const auto policy = + async_copy_policy{TUNE_THREADS, cub::detail::transform::bulk_copy_alignment(__CUDA_ARCH_LIST__)}; + return {min_bif, algorithm, {}, {}, policy}; # else # error Policy hub does not yet implement the specified value for algorithm # endif - - using algo_policy = - ::cuda::std::_If, - cub::detail::transform::async_copy_policy_t>; - }; + } }; #endif @@ -60,15 +75,17 @@ void bench_transform(nvbench::state& state, TransformOp transform_op) { state.exec(nvbench::exec_tag::gpu, [&](const nvbench::launch& launch) { - cub::detail::transform::dispatch_t< - cub::detail::transform::requires_stable_address::no, - OffsetT, - ::cuda::std::tuple, - RandomAccessIteratorOut, - cub::detail::transform::always_true_predicate, - TransformOp, - policy_hub_t>:: - dispatch( - inputs, output, num_items, cub::detail::transform::always_true_predicate{}, transform_op, launch.get_stream()); + cub::detail::transform::dispatch( + inputs, + output, + num_items, + cub::detail::transform::always_true_predicate{}, + transform_op, + launch.get_stream() +#if !TUNE_BASE + , + arch_policies{} +#endif + ); }); } diff --git a/cub/benchmarks/bench/transform/complex_cmp.cu b/cub/benchmarks/bench/transform/complex_cmp.cu index ab7b993f531..636ed40765c 100644 --- a/cub/benchmarks/bench/transform/complex_cmp.cu +++ b/cub/benchmarks/bench/transform/complex_cmp.cu @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions. - +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/benchmarks/bench/transform/fib.cu b/cub/benchmarks/bench/transform/fib.cu index f85ae86386c..5f40fba2abc 100644 --- a/cub/benchmarks/bench/transform/fib.cu +++ b/cub/benchmarks/bench/transform/fib.cu @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions.
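// ============================================================================
// [Illustrative aside, not part of the patch] The `%RANGE% NAME tag lo:hi:step`
// annotations in these benchmarks are consumed by the tuning harness, which
// builds one variant per parameter combination and injects each value as a
// preprocessor definition (e.g. -DTUNE_THREADS=256). A minimal sketch of how a
// benchmark translation unit can stay buildable outside the harness; the
// default values below are assumptions for illustration only:
#ifndef TUNE_THREADS
#  define TUNE_THREADS 256 // assumed default block size when not sweeping
#endif
#ifndef TUNE_ALGORITHM
#  define TUNE_ALGORITHM 1 // assumed default: 1 selects the vectorized path
#endif
static_assert(TUNE_THREADS % 32 == 0, "block size should be a multiple of the warp size");
// ============================================================================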
- +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/benchmarks/bench/transform/fill.cu b/cub/benchmarks/bench/transform/fill.cu index 84e53221bd3..c90ae6472cc 100644 --- a/cub/benchmarks/bench/transform/fill.cu +++ b/cub/benchmarks/bench/transform/fill.cu @@ -1,11 +1,17 @@ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions. - +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// for filling, we can only use the prefetch and the vectorized algorithm +// %RANGE% TUNE_ALGORITHM alg 1:2:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:3:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 0 (prefetch) +// %RANGE% TUNE_ITEMS_PER_THREAD_NO_INPUT ipt 1:32:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/benchmarks/bench/transform/grayscale.cu b/cub/benchmarks/bench/transform/grayscale.cu index 5d15205b36f..52a7783cdc1 100644 --- a/cub/benchmarks/bench/transform/grayscale.cu +++ b/cub/benchmarks/bench/transform/grayscale.cu @@ -1,6 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 +// %RANGE% TUNE_THREADS tpb 128:1024:128 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 + #include "common.h" template diff --git a/cub/benchmarks/bench/transform/heavy.cu b/cub/benchmarks/bench/transform/heavy.cu index ef7a9fa4430..9dbb78bc8c5 100644 --- a/cub/benchmarks/bench/transform/heavy.cu +++ b/cub/benchmarks/bench/transform/heavy.cu @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions. 
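// ============================================================================
// [Illustrative aside, not part of the patch] Why the vectorized algorithm
// (TUNE_ALGORITHM == 1) needs a runtime alignment check in addition to the
// compile-time ones (see CanVectorize in dispatch_transform.cuh below): a
// pointer may only be accessed through a wider vector type when it is
// suitably aligned. A minimal standalone CUDA sketch, not CUB's actual
// implementation:
#include <cstdint>

struct alignas(16) float4_chunk
{
  float v[4]; // 4-wide vector; one 16-byte load/store instead of four 4-byte ones
};

__global__ void scale_vectorized(const float4_chunk* in, float4_chunk* out, int num_chunks)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_chunks)
  {
    float4_chunk c = in[i];
    for (float& x : c.v)
    {
      x *= 2.0f;
    }
    out[i] = c;
  }
}

// host-side gate: fall back to a scalar path unless both buffers are 16-byte aligned
inline bool can_use_vectorized_path(const void* in, const void* out)
{
  return reinterpret_cast<std::uintptr_t>(in) % alignof(float4_chunk) == 0
      && reinterpret_cast<std::uintptr_t>(out) % alignof(float4_chunk) == 0;
}
// ============================================================================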
- +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 994670c9d33..a8a26ee5709 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -44,25 +45,31 @@ struct ::cuda::proclaims_copyable_arguments + typename Env> CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal( ::cuda::std::tuple inputs, RandomAccessIteratorOut output, NumItemsT num_items, Predicate predicate, TransformOp transform_op, - cudaStream_t stream, - StableAddress = {}) + Env env) { using choose_offset_t = detail::choose_signed_offset; using offset_t = typename choose_offset_t::type; @@ -73,17 +80,32 @@ private: return error; } - return detail::transform::dispatch_t < StableAddress::value - ? detail::transform::requires_stable_address::yes - : detail::transform::requires_stable_address::no, - offset_t, ::cuda::std::tuple, RandomAccessIteratorOut, Predicate, - TransformOp > ::dispatch( - ::cuda::std::move(inputs), - ::cuda::std::move(output), - num_items, - ::cuda::std::move(predicate), - ::cuda::std::move(transform_op), - stream); + using tuning_env_t = + ::cuda::std::execution::__query_result_or_t>; + using transform_tuning_t = + ::cuda::std::execution::__query_result_or_t; + + if constexpr (!::cuda::std::is_same_v) + { + return detail::transform::dispatch( + ::cuda::std::move(inputs), + ::cuda::std::move(output), + static_cast(num_items), + ::cuda::std::move(predicate), + ::cuda::std::move(transform_op), + get_stream(env), + transform_tuning_t{}); + } + else + { + return detail::transform::dispatch( + ::cuda::std::move(inputs), + ::cuda::std::move(output), + static_cast(num_items), + ::cuda::std::move(predicate), + ::cuda::std::move(transform_op), + get_stream(env)); + } } template @@ -146,7 +168,7 @@ public: num_items, detail::transform::always_true_predicate{}, ::cuda::std::move(transform_op), - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -269,7 +291,7 @@ public: num_items, detail::transform::always_true_predicate{}, ::cuda::std::move(generator), - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -322,7 +344,7 @@ public: num_items, detail::transform::always_true_predicate{}, detail::__return_constant{::cuda::std::move(value)}, - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -401,7 +423,7 @@ public: num_items, ::cuda::std::move(predicate), ::cuda::std::move(transform_op), - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -489,7 +511,7 @@ public: num_items, ::cuda::std::move(predicate), ::cuda::std::move(transform_op), - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -567,14 +589,13 @@ public: Env env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses"); - return TransformInternal( + return TransformInternal( ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, detail::transform::always_true_predicate{}, 
::cuda::std::move(transform_op), - get_stream(env), - ::cuda::std::true_type{}); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index c33053f12a6..2925fd88c85 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -41,8 +41,13 @@ #include #include #include +#include #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif + // On Windows, the `if CUB_DETAIL_CONSTEXPR_ISH` results in `warning C4702: unreachable code`. _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4702) @@ -67,30 +72,33 @@ struct prefetch_config int sm_count; }; -template + typename TransformOp> struct TransformKernelSource; -template -struct TransformKernelSource +struct TransformKernelSource, RandomAccessIteratorOut, Predicate, - TransformOp, - PolicyHub> + TransformOp> { + // ArchPolicies must be stateless, so we can pass the type to the kernel + static_assert(::cuda::std::is_empty_v); + CUB_DEFINE_KERNEL_GETTER( TransformKernel, - transform_kernel(); } - CUB_RUNTIME_FUNCTION static constexpr auto ItValueSizesAlignments() + CUB_RUNTIME_FUNCTION static constexpr auto InputIteratorInfos() { - return make_sizes_alignments(); + return ::cuda::std::array{ + make_iterator_info()...}; } template @@ -149,7 +158,7 @@ public: CUB_RUNTIME_FUNCTION constexpr static bool CanVectorize(int vec_size, const RandomAccessIteratorOut& out, const RandomAccessIteratorsIn&... in) { - return is_pointer_aligned(out, sizeof(it_value_t) * vec_size) + return is_pointer_aligned(out, size_of> * vec_size) && (is_pointer_aligned(in, sizeof(it_value_t) * vec_size) && ...); } }; @@ -160,367 +169,432 @@ enum class requires_stable_address yes }; -template < - requires_stable_address StableAddress, - typename Offset, - typename RandomAccessIteratorTupleIn, - typename RandomAccessIteratorOut, - typename Predicate, - typename TransformOp, - typename PolicyHub = policy_hub, - RandomAccessIteratorTupleIn, - RandomAccessIteratorOut>, - typename KernelSource = - TransformKernelSource, - typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> -struct dispatch_t; -template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int +spread_out_items_per_thread(Offset num_items, Policy policy, int items_per_thread, int sm_count, int max_occupancy) +{ + const int block_threads = policy.block_threads; + + const int items_per_thread_evenly_spread = static_cast(( + ::cuda::std::min) (Offset{items_per_thread}, ::cuda::ceil_div(num_items, sm_count * block_threads * max_occupancy))); + const int items_per_thread_clamped = + ::cuda::std::clamp(items_per_thread_evenly_spread, policy.min_items_per_thread, policy.max_items_per_thread); + return items_per_thread_clamped; +} + +template -struct dispatch_t, - RandomAccessIteratorOut, - Predicate, - TransformOp, - PolicyHub, - KernelSource, - KernelLauncherFactory> +CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_async_kernel( + Offset num_items, + int alignment, + SMemFunc dyn_smem_for_tile_size, + cudaStream_t stream, + PolicyGetter policy_getter, + KernelSource kernel_source, + KernelLauncherFactory launcher_factory) + -> cuda_expected< + ::cuda::std::tuple> { - static_assert(::cuda::std::is_same_v - || ::cuda::std::is_same_v, - "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit 
signed offset types"); + CUB_DETAIL_CONSTEXPR_ISH const transform_arch_policy policy = policy_getter(); + CUB_DETAIL_CONSTEXPR_ISH int block_threads = policy.async_copy_policy.block_threads; - ::cuda::std::tuple in; - RandomAccessIteratorOut out; - Offset num_items; - Predicate pred; - TransformOp op; - int bulk_copy_align; - cudaStream_t stream; - KernelSource kernel_source = {}; - KernelLauncherFactory launcher_factory = {}; - - // Reduces the items_per_thread when necessary to generate enough blocks to reach the maximum occupancy. - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int - spread_out_items_per_thread(AsyncPolicy async_policy, int items_per_thread, int sm_count, int max_occupancy) - { - const int block_threads = async_policy.BlockThreads(); - - const int items_per_thread_evenly_spread = static_cast( - (::cuda::std::min) (Offset{items_per_thread}, - ::cuda::ceil_div(num_items, sm_count * block_threads * max_occupancy))); - const int items_per_thread_clamped = ::cuda::std::clamp( - items_per_thread_evenly_spread, +async_policy.MinItemsPerThread(), +async_policy.MaxItemsPerThread()); - return items_per_thread_clamped; - } + _CCCL_ASSERT(block_threads % alignment == 0, "block_threads needs to be a multiple of the copy alignment"); + // ^ then tile_size is a multiple of it - template - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto - configure_async_kernel(int alignment, SMemFunc dyn_smem_for_tile_size, WrappedPolicy wrapped_policy = {}) - -> cuda_expected< - ::cuda::std::tuple> - { - int block_threads = wrapped_policy.AsyncPolicy().BlockThreads(); - - _CCCL_ASSERT(block_threads % alignment == 0, "block_threads needs to be a multiple of the copy alignment"); - // ^ then tile_size is a multiple of it + CUB_DETAIL_CONSTEXPR_ISH auto min_items_per_thread = policy.async_copy_policy.min_items_per_thread; + CUB_DETAIL_CONSTEXPR_ISH auto max_items_per_thread = policy.async_copy_policy.max_items_per_thread; - CUB_DETAIL_CONSTEXPR_ISH auto min_items_per_thread = wrapped_policy.AsyncPolicy().MinItemsPerThread(); - CUB_DETAIL_CONSTEXPR_ISH auto max_items_per_thread = wrapped_policy.AsyncPolicy().MaxItemsPerThread(); + // ensures the loop below runs at least once + // pulled outside of the lambda below to make MSVC happy + CUB_DETAIL_STATIC_ISH_ASSERT(min_items_per_thread <= max_items_per_thread, "invalid policy"); - // ensures the loop below runs at least once - // pulled outside of the lambda below to make MSVC happy - CUB_DETAIL_STATIC_ISH_ASSERT(min_items_per_thread <= max_items_per_thread, "invalid policy"); + auto determine_element_counts = [&]() -> cuda_expected { + int sm_count = 0; + auto error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)); + if (error != cudaSuccess) + { + return ::cuda::std::unexpected(error); + } - auto determine_element_counts = [&]() -> cuda_expected { - int sm_count = 0; - auto error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)); + // Increase the number of output elements per thread until we reach the required bytes in flight. + // Benchmarking shows that even for a few iteration, this loop takes around 4-7 us, so should not be a concern. + // This computation MUST NOT depend on any runtime state of the current API invocation (like num_items), since the + // result will be cached. 
+ async_config last_config{}; + for (int items_per_thread = +min_items_per_thread; items_per_thread <= +max_items_per_thread; ++items_per_thread) + { + const int tile_size = block_threads * items_per_thread; + const int dyn_smem_size = dyn_smem_for_tile_size(tile_size, alignment); + int max_occupancy = 0; + error = CubDebug( + launcher_factory.MaxSmOccupancy(max_occupancy, kernel_source.TransformKernel(), block_threads, dyn_smem_size)); if (error != cudaSuccess) { return ::cuda::std::unexpected(error); } - - // Increase the number of output elements per thread until we reach the required bytes in flight. - // Benchmarking shows that even for a few iteration, this loop takes around 4-7 us, so should not be a concern. - // This computation MUST NOT depend on any runtime state of the current API invocation (like num_items), since the - // result will be cached. - async_config last_config{}; - for (int items_per_thread = +min_items_per_thread; items_per_thread <= +max_items_per_thread; ++items_per_thread) + if (max_occupancy == 0) { - const int tile_size = block_threads * items_per_thread; - const int dyn_smem_size = dyn_smem_for_tile_size(tile_size, alignment); - int max_occupancy = 0; - error = CubDebug(launcher_factory.MaxSmOccupancy( - max_occupancy, kernel_source.TransformKernel(), block_threads, dyn_smem_size)); - if (error != cudaSuccess) - { - return ::cuda::std::unexpected(error); - } - if (max_occupancy == 0) - { - // assert should be prevented by smem check in policy - _CCCL_ASSERT(last_config.items_per_thread > 0, "min_items_per_thread exceeds available shared memory"); - return last_config; - } - - const auto config = async_config{items_per_thread, max_occupancy, sm_count}; - - const int bytes_in_flight_SM = max_occupancy * tile_size * kernel_source.LoadedBytesPerIteration(); - if (wrapped_policy.MinBif() <= bytes_in_flight_SM) - { - return config; - } - - last_config = config; + // assert should be prevented by smem check in policy + _CCCL_ASSERT(last_config.items_per_thread > 0, "min_items_per_thread exceeds available shared memory"); + return last_config; } - return last_config; - }; - cuda_expected config = kernel_source.CacheAsyncConfiguration(determine_element_counts); - if (!config) - { - return ::cuda::std::unexpected(config.error()); - } - _CCCL_ASSERT(config->items_per_thread > 0, ""); - _CCCL_ASSERT((config->items_per_thread * block_threads) % alignment == 0, ""); - - const int ipt = spread_out_items_per_thread( - wrapped_policy.AsyncPolicy(), config->items_per_thread, config->sm_count, config->max_occupancy); - const int tile_size = block_threads * ipt; - const int dyn_smem_size = dyn_smem_for_tile_size(tile_size, alignment); - _CCCL_ASSERT((sizeof...(RandomAccessIteratorsIn) == 0) != (dyn_smem_size != 0), ""); // logical xor - - const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{tile_size})); - // config->smem_size is 16 bytes larger than needed for UBLKCP because it's the total SMEM size, but 16 bytes are - // occupied by static shared memory and padding. But let's not complicate things. - return ::cuda::std::make_tuple( - launcher_factory(grid_dim, block_threads, dyn_smem_size, stream, true), kernel_source.TransformKernel(), ipt); - } - - // Avoid unnecessarily parsing these definitions when not needed. 
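// ============================================================================
// [Illustrative aside, not part of the patch] What spread_out_items_per_thread
// (defined earlier in this file and called further down) computes, as a
// standalone host sketch with made-up numbers: for small inputs, shrink
// items_per_thread so that enough blocks exist to occupy every SM.
#include <algorithm>
#include <cstdint>

inline int spread_out_items(
  std::int64_t num_items, int tuned_ipt, int block_threads, int sm_count, int max_occupancy, int min_ipt, int max_ipt)
{
  // threads that can be resident on the whole device at once
  const std::int64_t resident_threads = std::int64_t{sm_count} * block_threads * max_occupancy;
  // items per thread needed so that every resident thread receives work
  const std::int64_t evenly_spread = (num_items + resident_threads - 1) / resident_threads;
  const int ipt = static_cast<int>(std::min<std::int64_t>(tuned_ipt, evenly_spread));
  return std::clamp(ipt, min_ipt, max_ipt);
}
// Example: num_items = 1 << 20, block_threads = 256, sm_count = 132,
// max_occupancy = 8: only ceil(1048576 / 270336) = 4 items per thread are
// needed to cover the input, so a tuned value of 16 would be reduced to 4
// (subject to the policy's min/max clamp).
// ============================================================================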
-#if defined(CUB_DEFINE_RUNTIME_POLICIES) - template - struct is_valid_aligned_base_ptr_arg_impl : ::cuda::std::false_type - {}; - template - struct is_valid_aligned_base_ptr_arg_impl< - It, - ::cuda::std::void_t().MakeAlignedBasePtrKernelArg( - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::declval()), 0))>> : ::cuda::std::true_type - {}; + const auto config = async_config{items_per_thread, max_occupancy, sm_count}; - template - static constexpr auto is_valid_aligned_base_ptr_arg = is_valid_aligned_base_ptr_arg_impl::value; -#endif // CUB_DEFINE_RUNTIME_POLICIES + const int bytes_in_flight_SM = max_occupancy * tile_size * kernel_source.LoadedBytesPerIteration(); + if (policy.min_bif <= bytes_in_flight_SM) + { + return config; + } - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_async_algorithm( - int alignment, SMemFunc dyn_smem_for_tile_size, cuda::std::index_sequence, WrappedPolicy wrapped_policy = {}) - { - auto ret = configure_async_kernel(alignment, dyn_smem_for_tile_size, wrapped_policy); - if (!ret) - { - return ret.error(); + last_config = config; } -#if defined(CUB_DEFINE_RUNTIME_POLICIES) - // Normally, this check is handled by the if constexpr(ish) in Invoke. However, when runtime policies are - // defined (like by c.parallel), that if constexpr becomes a plain if, so we need to check the actual compile time - // condition again, this time asserting at runtime if we hit this point during dispatch. - if constexpr ((is_valid_aligned_base_ptr_arg && ...)) - { -#endif // CUB_DEFINE_RUNTIME_POLICIES - auto [launcher, kernel, items_per_thread] = *ret; - return launcher.doit( - kernel, - num_items, - items_per_thread, - false, - pred, - op, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out), - kernel_source.MakeAlignedBasePtrKernelArg( - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)), alignment)...); -#if defined(CUB_DEFINE_RUNTIME_POLICIES) - } - else - { - _CCCL_ASSERT_HOST(false, "ublkcp algorithm requires all input iterators to be contiguous"); - _CCCL_UNREACHABLE(); - } -#endif // CUB_DEFINE_RUNTIME_POLICIES + return last_config; + }; + cuda_expected config = kernel_source.CacheAsyncConfiguration(determine_element_counts); + if (!config) + { + return ::cuda::std::unexpected(config.error()); } + _CCCL_ASSERT(config->items_per_thread > 0, ""); + _CCCL_ASSERT((config->items_per_thread * block_threads) % alignment == 0, ""); + + const int ipt = spread_out_items_per_thread( + num_items, policy.async_copy_policy, config->items_per_thread, config->sm_count, config->max_occupancy); + const int tile_size = block_threads * ipt; + const int dyn_smem_size = dyn_smem_for_tile_size(tile_size, alignment); + _CCCL_ASSERT(NoInputs != (dyn_smem_size != 0), ""); // logical xor + + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{tile_size})); + // config->smem_size is 16 bytes larger than needed for UBLKCP because it's the total SMEM size, but 16 bytes are + // occupied by static shared memory and padding. But let's not complicate things. 
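// ============================================================================
// [Illustrative aside, not part of the patch] Shape of the
// dyn_smem_for_tile_size computation used above: each input stream gets a
// staging buffer of tile_size elements in shared memory, padded so the next
// buffer starts at the copy alignment. A simplified standalone sketch; the
// real code also accounts for head padding (the "16 bytes" mentioned above):
#include <cstddef>

struct input_info_sketch
{
  int value_size; // bytes per element of one input stream
};

template <std::size_t NumInputs>
int dyn_smem_for_tile(const input_info_sketch (&inputs)[NumInputs], int tile_size, int alignment)
{
  const auto round_up = [&](int bytes) {
    return (bytes + alignment - 1) / alignment * alignment;
  };
  int total = 0;
  for (const input_info_sketch& in : inputs)
  {
    total += round_up(tile_size * in.value_size); // one aligned staging buffer per input
  }
  return total;
}
// Example: two inputs of 4 and 8 bytes per element, tile_size = 2048,
// alignment = 128 -> 8192 + 16384 = 24576 bytes of dynamic shared memory.
// ============================================================================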
+ return ::cuda::std::make_tuple( + launcher_factory(grid_dim, block_threads, dyn_smem_size, stream, true), kernel_source.TransformKernel(), ipt); +} - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t - invoke_prefetch_or_vectorized_algorithm(::cuda::std::index_sequence, WrappedPolicy wrapped_policy) +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_async_algorithm( + ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + Predicate pred, + TransformOp op, + cudaStream_t stream, + int alignment, + SMemFunc dyn_smem_for_tile_size, + cuda::std::index_sequence, + PolicyGetter policy_getter, + KernelSource kernel_source, + KernelLauncherFactory launcher_factory) +{ + auto ret = configure_async_kernel<(sizeof...(RandomAccessIteratorsIn) == 0)>( + num_items, alignment, dyn_smem_for_tile_size, stream, policy_getter, kernel_source, launcher_factory); + if (!ret) { - const int block_threads = - wrapped_policy.Algorithm() == Algorithm::vectorized - ? wrapped_policy.VectorizedPolicy().BlockThreads() - : wrapped_policy.PrefetchPolicy().BlockThreads(); - - auto determine_config = [&]() -> cuda_expected { - int max_occupancy = 0; - auto error = - CubDebug(launcher_factory.MaxSmOccupancy(max_occupancy, kernel_source.TransformKernel(), block_threads, 0)); - if (error != cudaSuccess) - { - return ::cuda::std::unexpected(error); - } - int sm_count = 0; - error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)); - if (error != cudaSuccess) - { - return ::cuda::std::unexpected(error); - } - return prefetch_config{max_occupancy, sm_count}; - }; + return ret.error(); + } - cuda_expected config = kernel_source.CachePrefetchConfiguration(determine_config); - if (!config) + auto [launcher, kernel, items_per_thread] = *ret; + return launcher.doit( + kernel, + num_items, + items_per_thread, + false, + pred, + op, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out), + kernel_source.MakeAlignedBasePtrKernelArg( + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)), alignment)...); +} + +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_prefetch_or_vectorized_algorithm( + [[maybe_unused]] ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + Predicate pred, + TransformOp op, + cudaStream_t stream, + ::cuda::std::index_sequence, + PolicyGetter policy_getter, + KernelSource kernel_source, + KernelLauncherFactory launcher_factory) +{ + CUB_DETAIL_CONSTEXPR_ISH const transform_arch_policy policy = policy_getter(); + CUB_DETAIL_CONSTEXPR_ISH const int block_threads = + policy.algorithm == Algorithm::vectorized + ? policy.vectorized_policy.block_threads + : policy.prefetch_policy.block_threads; + + auto determine_config = [&]() -> cuda_expected { + int max_occupancy = 0; + auto error = + CubDebug(launcher_factory.MaxSmOccupancy(max_occupancy, kernel_source.TransformKernel(), block_threads, 0)); + if (error != cudaSuccess) { - return config.error(); + return ::cuda::std::unexpected(error); } - - auto can_vectorize = false; - // the policy already handles the compile-time checks if we can vectorize. 
Do the remaining alignment check here - if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == wrapped_policy.Algorithm()) + int sm_count = 0; + error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)); + if (error != cudaSuccess) { - const int vs = wrapped_policy.VectorizedPolicy().VecSize(); - can_vectorize = kernel_source.CanVectorize(vs, out, ::cuda::std::get(in)...); + return ::cuda::std::unexpected(error); } + return prefetch_config{max_occupancy, sm_count}; + }; - int ipt = 0; - bool ipt_found = false; - if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == wrapped_policy.Algorithm()) - { - if (can_vectorize) - { - ipt = wrapped_policy.VectorizedPolicy().ItemsPerThreadVectorized(); - ipt_found = true; - } - } + cuda_expected config = kernel_source.CachePrefetchConfiguration(determine_config); + if (!config) + { + return config.error(); + } - if (!ipt_found) + auto can_vectorize = false; + ::cuda::std::optional ipt; + + // the policy already handles the compile-time checks if we can vectorize. Do the remaining alignment check here + if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == policy.algorithm) + { + const int vs = policy.vectorized_policy.vec_size; + can_vectorize = kernel_source.CanVectorize(vs, out, ::cuda::std::get(in)...); + if (can_vectorize) { - // otherwise, set up the prefetch kernel - - auto loaded_bytes_per_iter = kernel_source.LoadedBytesPerIteration(); - const auto items_per_thread_no_input = - wrapped_policy.Algorithm() == Algorithm::vectorized - ? wrapped_policy.VectorizedPolicy().ItemsPerThreadNoInput() - : wrapped_policy.PrefetchPolicy().ItemsPerThreadNoInput(); - // choose items per thread to reach minimum bytes in flight - const int items_per_thread = - loaded_bytes_per_iter == 0 - ? items_per_thread_no_input - : ::cuda::ceil_div(wrapped_policy.MinBif(), config->max_occupancy * block_threads * loaded_bytes_per_iter); - - // but also generate enough blocks for full occupancy to optimize small problem sizes, e.g., 2^16/2^20 elements - if CUB_DETAIL_CONSTEXPR_ISH (wrapped_policy.Algorithm() == Algorithm::vectorized) - { - ipt = spread_out_items_per_thread( - wrapped_policy.VectorizedPolicy(), items_per_thread, config->sm_count, config->max_occupancy); - } - else - { - ipt = spread_out_items_per_thread( - wrapped_policy.PrefetchPolicy(), items_per_thread, config->sm_count, config->max_occupancy); - } + ipt = policy.vectorized_policy.items_per_thread_vectorized; } - const int tile_size = block_threads * ipt; - const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{tile_size})); - return CubDebug( - launcher_factory(grid_dim, block_threads, 0, stream, true) - .doit(kernel_source.TransformKernel(), - num_items, - ipt, - can_vectorize, - pred, - op, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out), - kernel_source.MakeIteratorKernelArg( - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)))...)); } - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {}) + if (!ipt) + { + // otherwise, set up the prefetch kernel + const auto fallback_prefetch_policy = prefetch_policy{ + policy.vectorized_policy.block_threads, + policy.vectorized_policy.prefetch_items_per_thread_no_input, + policy.vectorized_policy.prefetch_min_items_per_thread, + policy.vectorized_policy.prefetch_max_items_per_thread}; + const auto prefetch_policy = + policy.algorithm == Algorithm::prefetch ? 
policy.prefetch_policy : fallback_prefetch_policy; + + auto loaded_bytes_per_iter = kernel_source.LoadedBytesPerIteration(); + const auto items_per_thread_no_input = prefetch_policy.items_per_thread_no_input; + // choose items per thread to reach minimum bytes in flight + const int items_per_thread = + loaded_bytes_per_iter == 0 + ? items_per_thread_no_input + : ::cuda::ceil_div(policy.min_bif, config->max_occupancy * block_threads * loaded_bytes_per_iter); + + // but also generate enough blocks for full occupancy to optimize small problem sizes, e.g., 2^16/2^20 elements + ipt = spread_out_items_per_thread( + num_items, prefetch_policy, items_per_thread, config->sm_count, config->max_occupancy); + } + _CCCL_ASSERT(ipt, ""); + const int tile_size = block_threads * ipt.value(); + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{tile_size})); + return CubDebug( + launcher_factory(grid_dim, block_threads, 0, stream, true) + .doit(kernel_source.TransformKernel(), + num_items, + ipt.value(), + can_vectorize, + pred, + op, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out), + kernel_source.MakeIteratorKernelArg( + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)))...)); +} + +// This should ideally have been a lambda, but MSVC < 14.44 ICEs if we put a `(static) constexpr` variable inside +template +struct invoke_for_arch; + +template +struct invoke_for_arch<::cuda::std::tuple, + RandomAccessIteratorOut, + Offset, + Predicate, + TransformOp, + KernelSource, + KernelLauncherFactory> +{ + ::cuda::std::tuple in; + RandomAccessIteratorOut out; + Offset num_items; + Predicate pred; + TransformOp op; + cudaStream_t stream; + KernelSource kernel_source; + KernelLauncherFactory launcher_factory; + ::cuda::arch_id arch_id; + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t operator()(PolicyGetter policy_getter) const { - auto wrapped_policy = MakeTransformPolicyWrapper(active_policy); - const auto seq = ::cuda::std::index_sequence_for{}; - if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::ublkcp == wrapped_policy.Algorithm()) + CUB_DETAIL_CONSTEXPR_ISH transform_arch_policy active_policy = policy_getter(); + const auto seq = ::cuda::std::index_sequence_for{}; + +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET( + NV_IS_HOST, + (std::stringstream ss; ss << active_policy; + _CubLog("Dispatching DeviceTransform to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());)) +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + + if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::ublkcp == active_policy.algorithm) { return invoke_async_algorithm( - bulk_copy_align, - [this](int tile_size, int alignment) { - return bulk_copy_dyn_smem_for_tile_size(kernel_source.ItValueSizesAlignments(), tile_size, alignment); + ::cuda::std::move(in), + ::cuda::std::move(out), + num_items, + ::cuda::std::move(pred), + ::cuda::std::move(op), + stream, + bulk_copy_alignment(arch_id), + [&](int tile_size, int alignment) { + return bulk_copy_dyn_smem_for_tile_size( + kernel_source.InputIteratorInfos(), tile_size, alignment); }, seq, - wrapped_policy); + policy_getter, + kernel_source, + launcher_factory); } - else if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::memcpy_async == wrapped_policy.Algorithm()) + else if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::memcpy_async == active_policy.algorithm) { return invoke_async_algorithm( + ::cuda::std::move(in), + ::cuda::std::move(out), + num_items, + ::cuda::std::move(pred), + ::cuda::std::move(op), + stream, 
ldgsts_size_and_align, - [this](int tile_size, int alignment) { - return memcpy_async_dyn_smem_for_tile_size(kernel_source.ItValueSizesAlignments(), tile_size, alignment); + [&](int tile_size, int alignment) { + return memcpy_async_dyn_smem_for_tile_size( + kernel_source.InputIteratorInfos(), tile_size, alignment); }, seq, - wrapped_policy); + policy_getter, + kernel_source, + launcher_factory); } else { - return invoke_prefetch_or_vectorized_algorithm(seq, wrapped_policy); + return invoke_prefetch_or_vectorized_algorithm( + ::cuda::std::move(in), + ::cuda::std::move(out), + num_items, + ::cuda::std::move(pred), + ::cuda::std::move(op), + stream, + seq, + policy_getter, + kernel_source, + launcher_factory); } } +}; - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( - ::cuda::std::tuple in, - RandomAccessIteratorOut out, - Offset num_items, - Predicate pred, - TransformOp op, - cudaStream_t stream, - KernelSource kernel_source = {}, - KernelLauncherFactory launcher_factory = {}, - MaxPolicyT max_policy = {}) +template , + ::cuda::std::tuple, + RandomAccessIteratorOut>, + typename KernelSource = TransformKernelSource, + RandomAccessIteratorOut, + Predicate, + TransformOp>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +#if _CCCL_HAS_CONCEPTS() + requires transform_policy_hub +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + Predicate pred, + TransformOp op, + cudaStream_t stream, + ArchPolicies arch_policies = {}, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}) +{ + static_assert( + ::cuda::std::is_same_v || ::cuda::std::is_same_v, + "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types"); + + if (num_items == 0) { - if (num_items == 0) - { - return cudaSuccess; - } + return cudaSuccess; + } - int ptx_version = 0; - auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)); - if (cudaSuccess != error) - { - return error; - } + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(launcher_factory.PtxArchId(arch_id))) + { + return error; + } - dispatch_t dispatch{ + return dispatch_arch( + arch_policies, + arch_id, + invoke_for_arch<::cuda::std::tuple, + RandomAccessIteratorOut, + Offset, + Predicate, + TransformOp, + KernelSource, + KernelLauncherFactory>{ ::cuda::std::move(in), ::cuda::std::move(out), num_items, ::cuda::std::move(pred), ::cuda::std::move(op), - bulk_copy_alignment(ptx_version), stream, kernel_source, - launcher_factory}; - return CubDebug(max_policy.Invoke(ptx_version, dispatch)); - } -}; + launcher_factory, + arch_id}); +} } // namespace detail::transform CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh index 081bfff43d0..930d01de7eb 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -96,7 +97,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It begin, int items) // This kernel guarantees that objects passed as arguments to the user-provided transformation function f reside in // global memory. No intermediate copies are taken. If the parameter type of f is a reference, taking the address of the // parameter yields a global memory address. 
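// ============================================================================
// [Illustrative aside, not part of the patch] What the guarantee in the
// comment above buys the caller: a transformation function taking its
// argument by const& may take the argument's address and relate it to the
// base of the input buffer. A hypothetical user-side sketch:
#include <cub/device/device_transform.cuh>

struct index_of
{
  const float* base; // base pointer of the input buffer

  __device__ int operator()(const float& x) const
  {
    // valid only because &x points into global memory rather than into a
    // register or a shared-memory staging copy
    return static_cast<int>(&x - base);
  }
};
// Usage, with d_in/d_out device pointers and n the element count:
//   cub::DeviceTransform::TransformStableArgumentAddresses(d_in, d_out, n, index_of{d_in});
// ============================================================================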
-template (blockIdx.x) * tile_size; const int valid_items = static_cast((::cuda::std::min) (num_items - offset, Offset{tile_size})); @@ -201,11 +202,18 @@ _CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type() } } -template +// FIXME(bgruber): nvcc 12.0 - 13.1 crash with `error: Internal Compiler Error (codegen): "unexpected error in codegen +// for function: found previous definition of same function!"` when we pass a const& as template parameter (and the +// function template body contains a lambda). As a workaround, we pass the parts of the policy by value. +// TODO(bgruber): In C++20, we should just pass transform_arch_policy by value. +template < // const transform_arch_policy& Policy, + int block_threads, + int items_per_thread, + int vec_size, + typename Offset, + typename F, + typename RandomAccessIteratorOut, + typename... RandomAccessIteratorsIn> _CCCL_DEVICE void transform_kernel_vectorized( Offset num_items, int num_elem_per_thread_prefetch, @@ -214,18 +222,18 @@ _CCCL_DEVICE void transform_kernel_vectorized( RandomAccessIteratorOut out, RandomAccessIteratorsIn... ins) { - constexpr int block_dim = VectorizedPolicy::block_threads; - constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized; - constexpr int vec_size = VectorizedPolicy::vec_size; + // constexpr int block_threads = Policy.vectorized_policy.block_threads; + // constexpr int items_per_thread = Policy.vectorized_policy.items_per_thread_vectorized; + // constexpr int vec_size = Policy.vectorized_policy.vec_size; _CCCL_ASSERT(!can_vectorize || (items_per_thread == num_elem_per_thread_prefetch), ""); - constexpr int tile_size = block_dim * items_per_thread; + constexpr int tile_size = block_threads * items_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_size; const int valid_items = static_cast((::cuda::std::min) (num_items - offset, Offset{tile_size})); // if we cannot vectorize or don't have a full tile, fall back to prefetch kernel if (!can_vectorize || valid_items != tile_size) { - transform_kernel_prefetch( + transform_kernel_prefetch( num_items, num_elem_per_thread_prefetch, always_true_predicate{}, @@ -268,7 +276,7 @@ _CCCL_DEVICE void transform_kernel_vectorized( _CCCL_PRAGMA_UNROLL_FULL() for (int i = 0; i < load_store_count; ++i) { - input_vec[i] = in_vec[i * VectorizedPolicy::block_threads]; + input_vec[i] = in_vec[i * block_threads]; } } else @@ -280,7 +288,7 @@ _CCCL_DEVICE void transform_kernel_vectorized( _CCCL_PRAGMA_UNROLL_FULL() for (int j = 0; j < vec_size; ++j) { - input[i * vec_size + j] = in[i * vec_size * VectorizedPolicy::block_threads + j]; + input[i * vec_size + j] = in[i * vec_size * block_threads + j]; } } } @@ -310,7 +318,7 @@ _CCCL_DEVICE void transform_kernel_vectorized( _CCCL_PRAGMA_UNROLL_FULL() for (int i = 0; i < load_store_count; ++i) { - out_vec[i * VectorizedPolicy::block_threads] = output_vec[i]; + out_vec[i * block_threads] = output_vec[i]; } } else @@ -323,7 +331,7 @@ _CCCL_DEVICE void transform_kernel_vectorized( _CCCL_PRAGMA_UNROLL_FULL() for (int j = 0; j < vec_size; ++j) { - out[i * vec_size * VectorizedPolicy::block_threads + j] = output[i * vec_size + j]; + out[i * vec_size * block_threads + j] = output[i * vec_size + j]; } } } @@ -539,13 +547,18 @@ _CCCL_DEVICE auto copy_and_return_smem_dst_fallback( return reinterpret_cast(dst); } +// FIXME(bgruber): nvcc 12.0 - 13.1 crash with `error: Internal Compiler Error (codegen): "unexpected error in codegen +// for function: found previous definition of same function!"` when we pass a const& 
as template parameter (and the +// function template body contains a lambda). As a workaround, we pass the parts of the policy by value. +// TODO(bgruber): In C++20, we should just pass transform_arch_policy by value. // note: there is no PDL in this kernel since PDL is not supported below Hopper and this kernel is intended for Ampere -template +template < // const transform_arch_policy& Policy, + int block_threads, + typename Offset, + typename Predicate, + typename F, + typename RandomAccessIteratorOut, + typename... InTs> _CCCL_DEVICE void transform_kernel_ldgsts( Offset num_items, int num_elem_per_thread, @@ -559,10 +572,10 @@ _CCCL_DEVICE void transform_kernel_ldgsts( static_assert(ldgsts_size_and_align <= 16); _CCCL_ASSERT(reinterpret_cast(smem) % ldgsts_size_and_align == 0, ""); - constexpr int block_threads = LdgstsPolicy::block_threads; - const int tile_size = block_threads * num_elem_per_thread; - const Offset offset = static_cast(blockIdx.x) * tile_size; - const int valid_items = static_cast(::cuda::std::min(num_items - offset, Offset{tile_size})); + // constexpr int block_threads = Policy.async_copy_policy.block_threads; + const int tile_size = block_threads * num_elem_per_thread; + const Offset offset = static_cast(blockIdx.x) * tile_size; + const int valid_items = static_cast(::cuda::std::min(num_items - offset, Offset{tile_size})); [[maybe_unused]] int smem_offset = 0; // TODO(bgruber): drop checking first block, since gmem buffers are always sufficiently aligned. But this would not @@ -680,16 +693,26 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned( dst_ptr[bytes_to_copy - tail_bytes + threadIdx.x] = tail_byte; } } - +// FIXME(bgruber): nvcc 12.0 - 13.1 error with `function "void +// cub::_V_300300_SM_750_800_900_1000_1200::detail::transform::transform_kernel_ublkcp< ::policy, int, +// ::cub::_V_300300_SM_750_800_900_1000_1200::detail::transform::always_true_predicate, +// ::cuda::std::__4::logical_and , bool *, int, int > (T2, int, T3, T4, T5, +// ::cub::_V_300300_SM_750_800_900_1000_1200::detail::transform::aligned_base_ptr ...)::[lambda(T1) (instance +// 3)]::operator ()< ::cuda::std::__4::integral_constant > const" has already been defined` when we pass +// a const& as template parameter (and the function template body contains a lambda). As a workaround, we pass the parts +// of the policy by value. +// TODO(bgruber): In C++20, we should just pass transform_arch_policy by value. // Note: we tried implementing work stealing, aka. cluster launch control, aka. UGETNEXTWORKID, (see PR: // https://github.com/NVIDIA/cccl/pull/5099) and the slowdowns on some benchmarks outweighed the benefits on B200. So we // didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu. -template +template < // const transform_arch_policy& Policy, + int block_threads, + int bulk_copy_alignment, + typename Offset, + typename Predicate, + typename F, + typename RandomAccessIteratorOut, + typename... InTs> _CCCL_DEVICE void transform_kernel_ublkcp( Offset num_items, int num_elem_per_thread, @@ -698,8 +721,8 @@ _CCCL_DEVICE void transform_kernel_ublkcp( RandomAccessIteratorOut out, aligned_base_ptr... 
aligned_ptrs) { - constexpr int block_threads = BulkCopyPolicy::block_threads; - constexpr int bulk_copy_alignment = BulkCopyPolicy::bulk_copy_alignment; + // constexpr int block_threads = Policy.async_copy_policy.block_threads; + // constexpr int bulk_copy_alignment = Policy.async_copy_policy.bulk_copy_alignment; // add padding after a tile in shared memory to make space for the next tile's head padding, and retain alignment constexpr int max_alignment = ::cuda::std::max({int{alignof(InTs)}...}); @@ -950,52 +973,57 @@ _CCCL_HOST_DEVICE auto make_aligned_base_ptr_kernel_arg(It ptr, int alignment) - return arg; } -template +template _CCCL_API constexpr int get_block_threads_helper() { - if constexpr (ActivePolicy::algorithm == Algorithm::prefetch) + constexpr transform_arch_policy policy = ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + if constexpr (policy.algorithm == Algorithm::prefetch) { - return ActivePolicy::prefetch_policy::block_threads; + return policy.prefetch_policy.block_threads; } - else if constexpr (ActivePolicy::algorithm == Algorithm::vectorized) + else if constexpr (policy.algorithm == Algorithm::vectorized) { - return ActivePolicy::vectorized_policy::block_threads; + return policy.vectorized_policy.block_threads; } else { - return ActivePolicy::async_policy::block_threads; + return policy.async_copy_policy.block_threads; } } // need a variable template to force constant evaluation of get_block_threads_helper(), otherwise nvcc will give us a // "bad attribute argument substitution" error -template -inline constexpr int get_block_threads = get_block_threads_helper(); +template +inline constexpr int get_block_threads = get_block_threads_helper(); // There is only one kernel for all algorithms, that dispatches based on the selected policy. It must be instantiated // with the same arguments for each algorithm. Only the device compiler will then select the implementation. This // saves some compile-time and binary size. -template -__launch_bounds__(get_block_threads) - CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( - Offset num_items, - int num_elem_per_thread, - [[maybe_unused]] bool can_vectorize, - Predicate pred, - F f, - RandomAccessIteratorOut out, - kernel_arg... ins) + typename... RandomAccessIteratorsIn> +#if _CCCL_HAS_CONCEPTS() + requires transform_policy_hub +#endif // _CCCL_HAS_CONCEPTS() +__launch_bounds__(get_block_threads) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( + Offset num_items, + int num_elem_per_thread, + [[maybe_unused]] bool can_vectorize, + Predicate pred, + F f, + RandomAccessIteratorOut out, + kernel_arg... 
ins) { _CCCL_ASSERT(blockDim.y == 1 && blockDim.z == 1, "transform_kernel only supports 1D blocks"); - if constexpr (MaxPolicy::ActivePolicy::algorithm == Algorithm::prefetch) + static constexpr transform_arch_policy policy = ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + + if constexpr (policy.algorithm == Algorithm::prefetch) { - transform_kernel_prefetch( + transform_kernel_prefetch( num_items, num_elem_per_thread, ::cuda::std::move(pred), @@ -1003,11 +1031,14 @@ __launch_bounds__(get_block_threads) ::cuda::std::move(out), ::cuda::std::move(ins.iterator)...); } - else if constexpr (MaxPolicy::ActivePolicy::algorithm == Algorithm::vectorized) + else if constexpr (policy.algorithm == Algorithm::vectorized) { static_assert(::cuda::std::is_same_v, "Cannot vectorize transform with a predicate"); - transform_kernel_vectorized( + + transform_kernel_vectorized( num_items, num_elem_per_thread, can_vectorize, @@ -1015,11 +1046,11 @@ __launch_bounds__(get_block_threads) ::cuda::std::move(out), ::cuda::std::move(ins.iterator)...); } - else if constexpr (MaxPolicy::ActivePolicy::algorithm == Algorithm::memcpy_async) + else if constexpr (policy.algorithm == Algorithm::memcpy_async) { NV_IF_TARGET( NV_PROVIDES_SM_80, - (transform_kernel_ldgsts( + (transform_kernel_ldgsts( num_items, num_elem_per_thread, ::cuda::std::move(pred), @@ -1027,11 +1058,12 @@ __launch_bounds__(get_block_threads) ::cuda::std::move(out), ::cuda::std::move(ins.aligned_ptr)...);)); } - else if constexpr (MaxPolicy::ActivePolicy::algorithm == Algorithm::ublkcp) + else if constexpr (policy.algorithm == Algorithm::ublkcp) { NV_IF_TARGET( NV_PROVIDES_SM_90, - (transform_kernel_ublkcp( + (transform_kernel_ublkcp( num_items, num_elem_per_thread, ::cuda::std::move(pred), diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh index 5cf4f6ed375..403b8936093 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh @@ -20,10 +20,6 @@ #include #include -#if _CCCL_HAS_CONCEPTS() -# include -#endif // _CCCL_HAS_CONCEPTS() - #if !_CCCL_COMPILER(NVRTC) # include #endif @@ -95,14 +91,9 @@ struct reduce_arch_policy // equivalent of a policy for a single CUDA architectu }; #if _CCCL_HAS_CONCEPTS() -_CCCL_API consteval void __needs_a_constexpr_value(auto) {} - // TODO(bgruber): bikeshed name before we make the tuning API public template -concept reduce_policy_hub = requires(T hub, ::cuda::arch_id arch) { - { hub(arch) } -> _CCCL_CONCEPT_VSTD::same_as; - { __needs_a_constexpr_value(hub(arch)) }; -}; +concept reduce_policy_hub = policy_hub; #endif // _CCCL_HAS_CONCEPTS() template diff --git a/cub/cub/device/dispatch/tuning/tuning_transform.cuh b/cub/cub/device/dispatch/tuning/tuning_transform.cuh index 9199862ee25..00491b590dd 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform.cuh @@ -13,8 +13,6 @@ # pragma system_header #endif // no system header -#include -#include #include #include @@ -22,18 +20,21 @@ #include #include +#include #include #include #include -#include -#include -#include -#include -#include #include -#include #include +#if _CCCL_HAS_CONCEPTS() +# include +#endif // _CCCL_HAS_CONCEPTS() + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::transform { @@ -66,137 +67,187 @@ enum class Algorithm ublkcp }; -template -struct prefetch_policy_t +#if !_CCCL_COMPILER(NVRTC) +inline ::std::ostream& 
operator<<(::std::ostream& os, const Algorithm& algorithm) { - static constexpr int block_threads = BlockThreads; - // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy - static constexpr int items_per_thread_no_input = 2; // when there are no input iterators, the kernel is just filling - static constexpr int min_items_per_thread = 1; - static constexpr int max_items_per_thread = 32; - - // TODO: remove with C++20 - // The value of the below does not matter. - static constexpr int not_a_vectorized_policy = 0; -}; + switch (algorithm) + { + case Algorithm::prefetch: + return os << "Algorithm::prefetch"; + case Algorithm::vectorized: + return os << "Algorithm::vectorized"; + case Algorithm::memcpy_async: + return os << "Algorithm::memcpy_async"; + case Algorithm::ublkcp: + return os << "Algorithm::ublkcp"; + default: + return os << "Algorithm::"; + } +} +#endif // !_CCCL_COMPILER(NVRTC) -CUB_DETAIL_POLICY_WRAPPER_DEFINE( - TransformAgentPrefetchPolicy, - (always_true), - (block_threads, BlockThreads, int), - (items_per_thread_no_input, ItemsPerThreadNoInput, int), - (min_items_per_thread, MinItemsPerThread, int), - (max_items_per_thread, MaxItemsPerThread, int), - (not_a_vectorized_policy, NotAVectorizedPolicy, int) ) // TODO: remove with C++20 - -template -struct vectorized_policy_t : prefetch_policy_t +struct prefetch_policy { - static constexpr int items_per_thread_vectorized = Tuning::items_per_thread; - static constexpr int vec_size = Tuning::vec_size; + int block_threads; + // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy + int items_per_thread_no_input = 2; // when there are no input iterators, the kernel is just filling + int min_items_per_thread = 1; + int max_items_per_thread = 32; - using not_a_vectorized_policy = void; // TODO: remove with C++20, shadows the variable in prefetch_policy_t -}; + [[nodiscard]] _CCCL_API constexpr friend bool operator==(const prefetch_policy& lhs, const prefetch_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread_no_input == rhs.items_per_thread_no_input + && lhs.min_items_per_thread == rhs.min_items_per_thread && lhs.max_items_per_thread == rhs.max_items_per_thread; + } -CUB_DETAIL_POLICY_WRAPPER_DEFINE( - TransformAgentVectorizedPolicy, - (always_true), // TODO: restore with C++20: (TransformAgentPrefetchPolicy), - (block_threads, BlockThreads, int), - (items_per_thread_no_input, ItemsPerThreadNoInput, int), - (min_items_per_thread, MinItemsPerThread, int), - (max_items_per_thread, MaxItemsPerThread, int), - (items_per_thread_vectorized, ItemsPerThreadVectorized, int), - (vec_size, VecSize, int) ) - -template -struct async_copy_policy_t -{ - static constexpr int block_threads = BlockThreads; - // items per tile are determined at runtime. 
these (inclusive) bounds allow overriding that value via a tuning policy - static constexpr int min_items_per_thread = 1; - static constexpr int max_items_per_thread = 32; + [[nodiscard]] _CCCL_API constexpr friend bool operator!=(const prefetch_policy& lhs, const prefetch_policy& rhs) + { + return !(lhs == rhs); + } - static constexpr int bulk_copy_alignment = BulkCopyAlignment; +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const prefetch_policy& policy) + { + return os << "prefetch_policy { .block_threads = " << policy.block_threads << ", .items_per_thread_no_input = " + << policy.items_per_thread_no_input << ", .min_items_per_thread = " << policy.min_items_per_thread + << ", .max_items_per_thread = " << policy.max_items_per_thread << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) }; -CUB_DETAIL_POLICY_WRAPPER_DEFINE( - TransformAgentAsyncPolicy, - (always_true), - (block_threads, BlockThreads, int), - (min_items_per_thread, MinItemsPerThread, int), - (max_items_per_thread, MaxItemsPerThread, int), - (bulk_copy_alignment, BulkCopyAlignment, int) ) - -_CCCL_TEMPLATE(typename PolicyT) -_CCCL_REQUIRES((!TransformAgentPrefetchPolicy && !TransformAgentAsyncPolicy - && !TransformAgentVectorizedPolicy) ) -__host__ __device__ constexpr PolicyT MakePolicyWrapper(PolicyT policy) +struct vectorized_policy { - return policy; -} + int block_threads; + int items_per_thread_vectorized; + int vec_size; + // if we have to fall back to prefetching, use these values: + int prefetch_items_per_thread_no_input = 2; + int prefetch_min_items_per_thread = 1; + int prefetch_max_items_per_thread = 32; + + [[nodiscard]] _CCCL_API constexpr friend bool operator==(const vectorized_policy& lhs, const vectorized_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread_vectorized == rhs.items_per_thread_vectorized + && lhs.vec_size == rhs.vec_size + && lhs.prefetch_items_per_thread_no_input == rhs.prefetch_items_per_thread_no_input + && lhs.prefetch_min_items_per_thread == rhs.prefetch_min_items_per_thread + && lhs.prefetch_max_items_per_thread == rhs.prefetch_max_items_per_thread; + } -template -struct TransformPolicyWrapper : PolicyT -{ - _CCCL_HOST_DEVICE TransformPolicyWrapper(PolicyT base) - : PolicyT(base) - {} + [[nodiscard]] _CCCL_API constexpr friend bool operator!=(const vectorized_policy& lhs, const vectorized_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const vectorized_policy& policy) + { + return os + << "vectorized_policy { .block_threads = " << policy.block_threads << ", .items_per_thread_vectorized = " + << policy.items_per_thread_vectorized << ", .vec_size = " << policy.vec_size + << ", .prefetch_items_per_thread_no_input = " << policy.prefetch_items_per_thread_no_input + << ", .prefetch_min_items_per_thread = " << policy.prefetch_min_items_per_thread + << ", .prefetch_max_items_per_thread = " << policy.prefetch_max_items_per_thread << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) }; -template -struct TransformPolicyWrapper> : StaticPolicyT +struct async_copy_policy { - _CCCL_HOST_DEVICE TransformPolicyWrapper(StaticPolicyT base) - : StaticPolicyT(base) - {} + int block_threads; + int bulk_copy_alignment; // TODO(bgruber): this should probably be removed from the tuning policy + // items per tile are determined at runtime. 
these (inclusive) bounds allow overriding that value via a tuning policy + int min_items_per_thread = 1; + int max_items_per_thread = 32; - _CCCL_HOST_DEVICE static constexpr Algorithm Algorithm() + [[nodiscard]] _CCCL_API constexpr friend bool operator==(const async_copy_policy& lhs, const async_copy_policy& rhs) { - return StaticPolicyT::algorithm; + return lhs.block_threads == rhs.block_threads && lhs.bulk_copy_alignment == rhs.bulk_copy_alignment + && lhs.min_items_per_thread == rhs.min_items_per_thread && lhs.max_items_per_thread == rhs.max_items_per_thread; } - _CCCL_HOST_DEVICE static constexpr int MinBif() + [[nodiscard]] _CCCL_API constexpr friend bool operator!=(const async_copy_policy& lhs, const async_copy_policy& rhs) { - return StaticPolicyT::min_bif; + return !(lhs == rhs); } - _CCCL_HOST_DEVICE static constexpr auto PrefetchPolicy() +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const async_copy_policy& policy) { - return MakePolicyWrapper(typename StaticPolicyT::prefetch_policy()); + return os << "async_copy_policy { .block_threads = " << policy.block_threads << ", .bulk_copy_alignment = " + << policy.bulk_copy_alignment << ", .min_items_per_thread = " << policy.min_items_per_thread + << ", .max_items_per_thread = " << policy.max_items_per_thread << " }"; } +#endif // !_CCCL_COMPILER(NVRTC) +}; - _CCCL_HOST_DEVICE static constexpr auto VectorizedPolicy() +struct transform_arch_policy +{ + int min_bif; + Algorithm algorithm; + prefetch_policy prefetch_policy; + vectorized_policy vectorized_policy; + async_copy_policy async_copy_policy; + + [[nodiscard]] _CCCL_API constexpr friend bool + operator==(const transform_arch_policy& lhs, const transform_arch_policy& rhs) { - return MakePolicyWrapper(typename StaticPolicyT::vectorized_policy()); + return lhs.min_bif == rhs.min_bif && lhs.algorithm == rhs.algorithm && lhs.prefetch_policy == rhs.prefetch_policy + && lhs.vectorized_policy == rhs.vectorized_policy && lhs.async_copy_policy == rhs.async_copy_policy; } - _CCCL_HOST_DEVICE static constexpr auto AsyncPolicy() + [[nodiscard]] _CCCL_API constexpr friend bool + operator!=(const transform_arch_policy& lhs, const transform_arch_policy& rhs) { - return MakePolicyWrapper(typename StaticPolicyT::async_policy()); + return !(lhs == rhs); } -#if defined(CUB_ENABLE_POLICY_PTX_JSON) - _CCCL_DEVICE static constexpr auto EncodedPolicy() +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const transform_arch_policy& policy) { - using namespace ptx_json; - return object() = value(), - key<"algorithm">() = value(StaticPolicyT::algorithm)>(), - key<"prefetch_policy">() = PrefetchPolicy().EncodedPolicy(), - key<"vectorized_policy">() = VectorizedPolicy().EncodedPolicy(), - key<"async_policy">() = AsyncPolicy().EncodedPolicy()>(); + return os + << "transform_arch_policy { .min_bif = " << policy.min_bif << ", .algorithm = " << policy.algorithm + << ", .prefetch_policy = " << policy.prefetch_policy << ", .vectorized_policy = " << policy.vectorized_policy + << ", .async_copy_policy = " << policy.async_copy_policy << " }"; } -#endif // CUB_ENABLE_POLICY_PTX_JSON +#endif // !_CCCL_COMPILER(NVRTC) }; -template -_CCCL_HOST_DEVICE TransformPolicyWrapper MakeTransformPolicyWrapper(PolicyT base) +#if _CCCL_HAS_CONCEPTS() +// TODO(bgruber): bikeshed name before we make the tuning API public +template +concept transform_policy_hub = policy_hub; +#endif // _CCCL_HAS_CONCEPTS() + +struct iterator_info +{ + int value_type_size; + int 
value_type_alignment;
+  bool value_type_is_trivially_relocatable;
+  bool is_contiguous;
+};
+
+template <typename T>
+inline constexpr size_t size_of = sizeof(T);
+
+template <>
+inline constexpr size_t size_of<void> = 0;
+
+template <typename T>
+inline constexpr size_t align_of = alignof(T);
+
+template <>
+inline constexpr size_t align_of<void> = 0;
+
+template <typename It>
+[[nodiscard]] _CCCL_API constexpr auto make_iterator_info() -> iterator_info
 {
-  return TransformPolicyWrapper<PolicyT>(base);
+  using vt = it_value_t<It>;
+  return iterator_info{
+    static_cast<int>(size_of<vt>),
+    static_cast<int>(align_of<vt>),
+    THRUST_NS_QUALIFIER::is_trivially_relocatable_v<vt>,
+    THRUST_NS_QUALIFIER::is_contiguous_iterator_v<It>};
 }
 
 template <typename... RandomAccessIteratorsIn>
@@ -207,18 +258,20 @@ _CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int
 
 constexpr int ldgsts_size_and_align = 16;
 
-template <typename ItValueSizesAlignments>
+template <size_t NumInputs>
 _CCCL_HOST_DEVICE constexpr auto memcpy_async_dyn_smem_for_tile_size(
-  ItValueSizesAlignments it_value_sizes_alignments, int tile_size, int copy_alignment = ldgsts_size_and_align) -> int
+  const ::cuda::std::array<iterator_info, NumInputs>& inputs,
+  int tile_size,
+  int copy_alignment = ldgsts_size_and_align) -> int
 {
   int smem_size = 0;
-  for (auto&& [vt_size, vt_alignment] : it_value_sizes_alignments)
+  for (const auto& input : inputs)
   {
     smem_size =
-      static_cast<int>(::cuda::round_up(smem_size, ::cuda::std::max(static_cast<int>(vt_alignment), copy_alignment)));
+      static_cast<int>(::cuda::round_up(smem_size, ::cuda::std::max(input.value_type_alignment, copy_alignment)));
     // max head/tail padding is copy_alignment - sizeof(T) each
     const int max_bytes_to_copy =
-      static_cast<int>(vt_size) * tile_size + ::cuda::std::max(copy_alignment - static_cast<int>(vt_size), 0) * 2;
+      input.value_type_size * tile_size + ::cuda::std::max(copy_alignment - input.value_type_size, 0) * 2;
     smem_size += max_bytes_to_copy;
   };
   return smem_size;
@@ -226,111 +279,217 @@ _CCCL_HOST_DEVICE constexpr auto memcpy_async_dyn_smem_for_tile_size(
 
 constexpr int bulk_copy_size_multiple = 16;
 
-_CCCL_HOST_DEVICE constexpr auto bulk_copy_alignment(int sm_arch) -> int
+_CCCL_HOST_DEVICE constexpr auto bulk_copy_alignment(::cuda::arch_id arch) -> int
 {
-  return sm_arch < 1000 ? 128 : 16;
+  return arch < ::cuda::arch_id::sm_100 ? 128 : 16;
 }
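As a quick illustration of the iterator classification above (a sketch, not part of the patch; it assumes the
reconstructed make_iterator_info<It>() signature and uses thrust::counting_iterator as an example of a
non-contiguous iterator):

  #include <thrust/iterator/counting_iterator.h>

  // pointers are contiguous and their value types trivially relocatable
  constexpr auto ptr_info = cub::detail::transform::make_iterator_info<int*>();
  static_assert(ptr_info.value_type_size == 4 && ptr_info.value_type_alignment == 4);
  static_assert(ptr_info.value_type_is_trivially_relocatable && ptr_info.is_contiguous);

  // fancy iterators are not contiguous, which rules out the memcpy-based kernels for them
  static_assert(!cub::detail::transform::make_iterator_info<thrust::counting_iterator<int>>().is_contiguous);

The heuristics below consume only these four fields, so anything that fills in the same shape can drive them.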
-template <typename ItValueSizesAlignments>
-_CCCL_HOST_DEVICE constexpr auto
-bulk_copy_dyn_smem_for_tile_size(ItValueSizesAlignments it_value_sizes_alignments, int tile_size, int bulk_copy_align)
-  -> int
+template <size_t NumInputs>
+_CCCL_HOST_DEVICE constexpr auto bulk_copy_dyn_smem_for_tile_size(
+  const ::cuda::std::array<iterator_info, NumInputs>& inputs, int tile_size, int bulk_copy_align) -> int
 {
   // we rely on the tile_size being a multiple of alignments, so shifting offsets/pointers by it retains alignments
   _CCCL_ASSERT(tile_size % bulk_copy_align == 0, "");
   _CCCL_ASSERT(tile_size % bulk_copy_size_multiple == 0, "");
 
   int tile_padding = bulk_copy_align;
-  for (auto&& [_, vt_alignment] : it_value_sizes_alignments)
+  for (const auto& input : inputs)
   {
-    tile_padding = ::cuda::std::max(tile_padding, static_cast<int>(vt_alignment));
+    tile_padding = ::cuda::std::max(tile_padding, input.value_type_alignment);
   }
   int smem_size = tile_padding; // for the barrier and padding
-  for (auto&& [vt_size, _] : it_value_sizes_alignments)
+  for (const auto& input : inputs)
   {
-    smem_size += tile_padding + static_cast<int>(vt_size) * tile_size;
+    smem_size += tile_padding + input.value_type_size * tile_size;
   }
   return smem_size;
 }
 
-_CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
+[[nodiscard]] _CCCL_API constexpr int arch_to_min_bytes_in_flight(::cuda::arch_id arch)
 {
-  if (sm_arch >= 1000)
+  if (arch >= ::cuda::arch_id::sm_100)
   {
     return 64 * 1024; // B200
   }
-  if (sm_arch >= 900)
+  if (arch >= ::cuda::arch_id::sm_90)
   {
     return 48 * 1024; // 32 for H100, 48 for H200
   }
-  if (sm_arch >= 800)
+  if (arch >= ::cuda::arch_id::sm_80)
   {
     return 16 * 1024; // A100
   }
   return 12 * 1024; // V100 and below
 }
 
-template <typename T>
-inline constexpr size_t size_of = sizeof(T);
-
-template <>
-inline constexpr size_t size_of<void> = 0;
-
-template <typename... RandomAccessIteratorsIn>
-_CCCL_HOST_DEVICE static constexpr auto make_sizes_alignments()
+[[nodiscard]] _CCCL_API constexpr auto tuned_vectorized_policy(::cuda::arch_id arch, int store_size, bool is_filling)
 {
-  return ::cuda::std::array<::cuda::std::pair<::cuda::std::size_t, ::cuda::std::size_t>,
-                            sizeof...(RandomAccessIteratorsIn)>{
-    {{sizeof(it_value_t<RandomAccessIteratorsIn>), alignof(it_value_t<RandomAccessIteratorsIn>)}...}};
-}
+  if (is_filling)
+  {
+    // manually tuned fill on RTX 5090
+    // TODO(bgruber): re-enable this later! It's disabled to avoid SASS changes in PR #6914
+    // if (arch >= ::cuda::arch_id::sm_120)
+    // {
+    //   return vectorized_policy{256, 8, 4};
+    // }
+    // manually tuned fill on B200, same as H200
+    if (arch >= ::cuda::arch_id::sm_90)
+    {
+      return vectorized_policy{
+        store_size > 4 ? 128 : 256, 16, ::cuda::std::max(8 / store_size, 1) /* 64-bit instructions */};
+    }
+    // manually tuned fill on A100
+    if (arch >= ::cuda::arch_id::sm_80)
+    {
+      return vectorized_policy{256, 8, ::cuda::std::max(8 / store_size, 1) /* 64-bit instructions */};
+    }
+  }
 
-template <int PtxVersion, int StoreSize>
-struct tuning_vec
-{ // defaults from fill on RTX 5090, but can be changed
-  static constexpr int block_threads = 256;
-  static constexpr int vec_size = 4;
-  static constexpr int items_per_thread = 8;
-};
+  return vectorized_policy{256, 8, 4};
+}
 
-// manually tuned fill on A100
-template <int StoreSize>
-struct tuning_vec<800, StoreSize>
+template <size_t InputCount>
+struct arch_policies
 {
-  static constexpr int block_threads = 256;
-  static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
-  static constexpr int items_per_thread = 8;
-};
+  bool requires_stable_address;
+  bool dense_output;
+  ::cuda::std::array<iterator_info, InputCount> inputs;
+  iterator_info output;
 
-// manually tuned fill on H200
-template <int StoreSize>
-struct tuning_vec<900, StoreSize>
-{
-  static constexpr int block_threads = StoreSize > 4 ? 128 : 256;
-  static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
-  static constexpr int items_per_thread = 16;
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> transform_arch_policy
+  {
+    const bool no_input_streams = InputCount == 0;
+
+    bool all_inputs_contiguous = true;
+    bool all_input_values_trivially_reloc = true;
+    bool can_memcpy_contiguous_inputs = true;
+    bool all_value_types_have_power_of_two_size = ::cuda::is_power_of_two(::cuda::std::max(1, output.value_type_size));
+    for (const auto& input : inputs)
+    {
+      all_inputs_contiguous &= input.is_contiguous;
+      all_input_values_trivially_reloc &= input.value_type_is_trivially_relocatable;
+      // the vectorized kernel supports mixing contiguous and non-contiguous iterators
+      can_memcpy_contiguous_inputs &= !input.is_contiguous || input.value_type_is_trivially_relocatable;
+      all_value_types_have_power_of_two_size &= ::cuda::is_power_of_two(input.value_type_size);
+    }
+    const bool can_memcpy_all_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
+    const bool fallback_to_prefetch = requires_stable_address || !can_memcpy_contiguous_inputs
+                                   || !all_value_types_have_power_of_two_size || !dense_output;
+    const int min_bif = arch_to_min_bytes_in_flight(arch);
+
+    if (arch >= ::cuda::arch_id::sm_90) // handles sm_100 as well
+    {
+      const int async_block_size = arch < ::cuda::arch_id::sm_100 ? 256 : 128;
+      const int alignment = bulk_copy_alignment(arch);
+
+      const auto prefetch = prefetch_policy{256};
+      const auto vectorized =
+        tuned_vectorized_policy(arch, ::cuda::std::max(1, output.value_type_size), InputCount == 0);
+      const auto async = async_copy_policy{async_block_size, alignment};
+
+      // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
+      // forward compatible. If a user compiled for sm_xxx and we assume the available SMEM for that architecture, but
+      // then runs on the next architecture after that, which may have a smaller available SMEM, we get a crash.
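+      // For a concrete feel for the bound checked next (illustrative numbers only, not part of the heuristic):
+      // a single int input on sm_90 gives alignment = 128 and async_block_size = 256, so the smallest candidate
+      // tile holds 256 * 1 items, and bulk_copy_dyn_smem_for_tile_size charges 128 bytes of barrier/padding plus
+      // 128 + 4 * 256 = 1152 bytes for the input tile, i.e. 1280 bytes in total, far below max_smem_per_block.
+      // The check therefore only trips with many inputs or large value types.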
+ const bool exhaust_smem = + bulk_copy_dyn_smem_for_tile_size(inputs, async_block_size * async.min_items_per_thread, alignment) + > int{max_smem_per_block}; + + // if each tile size is a multiple of the bulk copy and maximum value type alignments, the alignment is retained + // if the base pointer is sufficiently aligned (the correct check would be if it's a multiple of all value types + // following the current tile). we would otherwise need to realign every SMEM tile individually, which is costly + // and complex, so let's fall back in this case. + int max_alignment = alignment; + for (const auto& input : inputs) + { + max_alignment = ::cuda::std::max({max_alignment, input.value_type_alignment}); + } + + bool tile_sizes_retain_alignment = true; + for (const auto& input : inputs) + { + tile_sizes_retain_alignment &= (input.value_type_size * async_block_size) % max_alignment == 0; + } + + // on Hopper, the vectorized kernel performs better for 1 and 2 byte values, except for BabelStream mul (1 input) + bool vector_kernel_is_faster = arch == ::cuda::arch_id::sm_90 && output.value_type_size < 4 && InputCount > 1; + for (const auto& input : inputs) + { + vector_kernel_is_faster &= input.value_type_size < 4; + } + + const bool enough_threads_for_peeling = async_block_size >= alignment; // head and tail bytes + const bool fallback_to_vectorized = exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling + || no_input_streams || !can_memcpy_all_inputs || vector_kernel_is_faster; + + const auto algorithm = + fallback_to_prefetch ? Algorithm::prefetch + : fallback_to_vectorized + ? Algorithm::vectorized + : Algorithm::ublkcp; + + return transform_arch_policy{ + min_bif, + algorithm, + prefetch, + vectorized, + async, + }; + } + else if (arch >= ::cuda::arch_id::sm_80) + { + const int block_threads = 256; + const auto prefetch = prefetch_policy{block_threads}; + const auto vectorized = + tuned_vectorized_policy(arch, ::cuda::std::max(1, output.value_type_size), InputCount == 0); + const auto async = async_copy_policy{block_threads, ldgsts_size_and_align}; + + // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not + // forward compatible. If a user compiled for sm_xxx and we assume the available SMEM for that architecture, but + // then runs on the next architecture after that, which may have a smaller available SMEM, we get a crash. + const bool exhaust_smem = + memcpy_async_dyn_smem_for_tile_size( + inputs, block_threads * async.min_items_per_thread, ldgsts_size_and_align) + > int{max_smem_per_block}; + const bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs; + + const auto algorithm = + fallback_to_prefetch ? Algorithm::prefetch + : fallback_to_vectorized + ? Algorithm::vectorized + : Algorithm::memcpy_async; + + return transform_arch_policy{ + min_bif, + algorithm, + prefetch, + vectorized, + async, + }; + } + + // fallback + return transform_arch_policy{ + min_bif, + fallback_to_prefetch ? 
Algorithm::prefetch : Algorithm::vectorized, + prefetch_policy{256}, + tuned_vectorized_policy(::cuda::arch_id::sm_60, ::cuda::std::max(1, output.value_type_size), InputCount == 0), + async_copy_policy{}, // never used + }; + } }; -// manually tuned fill on B200, same as H200 -template -struct tuning_vec<1000, StoreSize> : tuning_vec<900, StoreSize> -{}; - -// manually tuned fill on RTX 5090 -template -struct tuning_vec<1200, StoreSize> -{ - static constexpr int block_threads = 256; - static constexpr int vec_size = 4; - static constexpr int items_per_thread = 8; -}; +#if _CCCL_HAS_CONCEPTS() +static_assert(transform_policy_hub>); +#endif // _CCCL_HAS_CONCEPTS() +// stateless version which can be passed to kernels template -struct policy_hub +struct arch_policies_from_types { static_assert(sizeof(RandomAccessIteratorTupleIn) == 0, "Second parameter must be a tuple"); }; @@ -339,137 +498,20 @@ template -struct policy_hub, - RandomAccessIteratorOut> +struct arch_policies_from_types, + RandomAccessIteratorOut> { - static constexpr bool no_input_streams = sizeof...(RandomAccessIteratorsIn) == 0; - static constexpr bool all_inputs_contiguous = - (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...); - static constexpr bool all_input_values_trivially_reloc = - (THRUST_NS_QUALIFIER::is_trivially_relocatable_v> && ...); - static constexpr bool can_memcpy_all_inputs = all_inputs_contiguous && all_input_values_trivially_reloc; - // the vectorized kernel supports mixing contiguous and non-contiguous iterators - static constexpr bool can_memcpy_contiguous_inputs = - ((!THRUST_NS_QUALIFIER::is_contiguous_iterator_v - || THRUST_NS_QUALIFIER::is_trivially_relocatable_v>) - && ...); - - static constexpr bool all_value_types_have_power_of_two_size = - (::cuda::is_power_of_two(sizeof(it_value_t)) && ...) - && ::cuda::is_power_of_two(size_of>); - - static constexpr bool fallback_to_prefetch = - RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_value_types_have_power_of_two_size || !DenseOutput; - - // TODO(bgruber): consider a separate kernel for just filling - - struct policy300 : ChainedPolicy<300, policy300, policy300> + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> transform_arch_policy { - static constexpr int min_bif = arch_to_min_bytes_in_flight(300); - using prefetch_policy = prefetch_policy_t<256>; - using vectorized_policy = vectorized_policy_t< - tuning_vec<500, size_of>, sizeof(it_value_t)...>>; - using async_policy = async_copy_policy_t<256, 16>; // dummy policy, never used - static constexpr auto algorithm = fallback_to_prefetch ? Algorithm::prefetch : Algorithm::vectorized; - }; - - struct policy800 : ChainedPolicy<800, policy800, policy300> - { - private: - static constexpr int block_threads = 256; - - public: - static constexpr int min_bif = arch_to_min_bytes_in_flight(800); - using prefetch_policy = prefetch_policy_t; - using vectorized_policy = vectorized_policy_t< - tuning_vec<800, size_of>, sizeof(it_value_t)...>>; - using async_policy = async_copy_policy_t; - - private: - // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not - // forward compatible. If a user compiled for sm_xxx and we assume the available SMEM for that architecture, but - // then runs on the next architecture after that, which may have a smaller available SMEM, we get a crash. 
- static constexpr bool exhaust_smem = - memcpy_async_dyn_smem_for_tile_size( - make_sizes_alignments(), - block_threads* async_policy::min_items_per_thread, - ldgsts_size_and_align) - > int{max_smem_per_block}; - static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs; - - public: - static constexpr auto algorithm = - fallback_to_prefetch ? Algorithm::prefetch - : fallback_to_vectorized - ? Algorithm::vectorized - : Algorithm::memcpy_async; - }; - - template - struct bulk_copy_policy_base - { - private: - static constexpr int alignment = bulk_copy_alignment(PtxVersion); - - public: - static constexpr int min_bif = arch_to_min_bytes_in_flight(PtxVersion); - using prefetch_policy = prefetch_policy_t<256>; - using vectorized_policy = - vectorized_policy_t>, - sizeof(it_value_t)...>>; - using async_policy = async_copy_policy_t; - - private: - // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not - // forward compatible. If a user compiled for sm_xxx and we assume the available SMEM for that architecture, but - // then runs on the next architecture after that, which may have a smaller available SMEM, we get a crash. - static constexpr bool exhaust_smem = - bulk_copy_dyn_smem_for_tile_size( - make_sizes_alignments(), - AsyncBlockSize* async_policy::min_items_per_thread, - alignment) - > int{max_smem_per_block}; - - // on Hopper, the vectorized kernel performs better for 1 and 2 byte values - static constexpr bool use_vector_kernel_on_hopper = - ((size_of> < 4) && ...) && sizeof...(RandomAccessIteratorsIn) > 1 - && size_of> < 4; - - // if each tile size is a multiple of the bulk copy and maximum value type alignments, the alignment is retained if - // the base pointer is sufficiently aligned (the correct check would be if it's a multiple of all value types - // following the current tile). we would otherwise need to realign every SMEM tile individually, which is costly and - // complex, so let's fall back in this case. - static constexpr int max_alignment = - ::cuda::std::max({alignment, int{alignof(it_value_t)}...}); - static constexpr bool tile_sizes_retain_alignment = - (((int{sizeof(it_value_t)} * AsyncBlockSize) % max_alignment == 0) && ...); - static constexpr bool enough_threads_for_peeling = AsyncBlockSize >= alignment; // head and tail bytes - static constexpr bool fallback_to_vectorized = - exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams - || !can_memcpy_all_inputs || (PtxVersion == 900 && use_vector_kernel_on_hopper); - - public: - static constexpr auto algorithm = - fallback_to_prefetch ? Algorithm::prefetch - : fallback_to_vectorized - ? 
Algorithm::vectorized - : Algorithm::ublkcp; - }; - - struct policy900 - : bulk_copy_policy_base<256, 900> - , ChainedPolicy<900, policy900, policy800> - {}; - - struct policy1000 - : bulk_copy_policy_base<128, 1000> - , ChainedPolicy<1000, policy1000, policy900> - {}; - - using max_policy = policy1000; + constexpr auto policies = arch_policies{ + RequiresStableAddress, + DenseOutput, + {make_iterator_info()...}, + make_iterator_info()}; + return policies(arch); + } }; } // namespace detail::transform diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index cbccb3a0933..d1884a5a121 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -30,6 +30,10 @@ #include #include +#if _CCCL_HAS_CONCEPTS() +# include +#endif // _CCCL_HAS_CONCEPTS() + #if !_CCCL_COMPILER(NVRTC) # include // saves 146ms compile-time over (CCCL 3.1) # if defined(CUB_DEFINE_RUNTIME_POLICIES) @@ -831,6 +835,88 @@ private: } #endif // !_CCCL_COMPILER(NVRTC) }; + +namespace detail +{ +#if !defined(CUB_DEFINE_RUNTIME_POLICIES) && !_CCCL_COMPILER(NVRTC) + +template <::cuda::arch_id ArchId, typename ArchPolicies, typename FunctorT> +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t call_for_arch(ArchPolicies arch_policies, FunctorT&& f) +{ + static constexpr auto policy = arch_policies(ArchId); +# if _CCCL_STD_VER >= 2020 + // we instantiate f only for each distinct policy (!!) + return f(::cuda::std::integral_constant{}); +# else + return f([] { + return policy; + }); +# endif +} + +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t +dispatch_compiled_for_arches(ArchPolicies arch_policies, ::cuda::arch_id device_arch, FunctorT&& f) +{ + cudaError_t e = cudaErrorInvalidDeviceFunction; + (..., + (device_arch == ::cuda::arch_id{(CudaArches * ArchMult) / 10} + ? (e = call_for_arch<::cuda::arch_id{(CudaArches * ArchMult) / 10}>(arch_policies, f)) + : cudaSuccess)); + return e; +} + +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_all_arches( + ArchPolicies arch_policies, ::cuda::arch_id device_arch, FunctorT&& f, ::cuda::std::index_sequence) +{ + static constexpr auto all_arches = ::cuda::__all_arch_ids(); + cudaError_t e = cudaErrorInvalidDeviceFunction; + (..., (device_arch == all_arches[Is] ? 
(e = call_for_arch(arch_policies, f)) : cudaSuccess)); + return e; +} + +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t +dispatch_arch(ArchPolicies arch_policies, ::cuda::arch_id device_arch, F&& f) +{ +# ifdef __CUDA_ARCH_LIST__ + return dispatch_compiled_for_arches<1, __CUDA_ARCH_LIST__>(arch_policies, device_arch, ::cuda::std::forward(f)); +# elif defined(NV_TARGET_SM_INTEGER_LIST) + return dispatch_compiled_for_arches<10, NV_TARGET_SM_INTEGER_LIST>( + arch_policies, device_arch, ::cuda::std::forward(f)); +# else + // some compilers don't tell us what arches we are compiling for, so we use all of them + return dispatch_all_arches( + arch_policies, + device_arch, + ::cuda::std::forward(f), + ::cuda::std::make_index_sequence<::cuda::__all_arch_ids().size()>{}); +# endif +} +#else // !defined(CUB_DEFINE_RUNTIME_POLICIES) && !_CCCL_COMPILER(NVRTC) +_CCCL_EXEC_CHECK_DISABLE +template +_CCCL_API _CCCL_FORCEINLINE cudaError_t dispatch_arch(ArchPolicies arch_policies, ::cuda::arch_id device_arch, F&& f) +{ + return f([&] { + return arch_policies(device_arch); + }); +} +#endif // !defined(CUB_DEFINE_RUNTIME_POLICIES) && !_CCCL_COMPILER(NVRTC) + +#if _CCCL_HAS_CONCEPTS() +_CCCL_API consteval void __needs_a_constexpr_value(auto) {} + +// TODO(bgruber): bikeshed name before we make the tuning API public +template +concept policy_hub = requires(T hub, ::cuda::arch_id arch) { + { hub(arch) } -> _CCCL_CONCEPT_VSTD::same_as; + { __needs_a_constexpr_value(hub(arch)) }; +}; +#endif // _CCCL_HAS_CONCEPTS() +} // namespace detail + CUB_NAMESPACE_END #if _CCCL_CUDA_COMPILATION() && !_CCCL_COMPILER(NVRTC) diff --git a/thrust/thrust/system/cuda/detail/fill.h b/thrust/thrust/system/cuda/detail/fill.h index b18489b07cd..b41c0280b5b 100644 --- a/thrust/thrust/system/cuda/detail/fill.h +++ b/thrust/thrust/system/cuda/detail/fill.h @@ -73,13 +73,7 @@ fill_n(execution_policy& policy, OutputIterator first, Size count, cons cudaError_t status; THRUST_INDEX_TYPE_DISPATCH( status, - (CUB_NS_QUALIFIER::detail::transform::dispatch_t< - CUB_NS_QUALIFIER::detail::transform::requires_stable_address::no, - decltype(count_fixed), - ::cuda::std::tuple<>, - OutputIterator, - Predicate, - TransformOp>::dispatch), + (CUB_NS_QUALIFIER::detail::transform::dispatch), count, (::cuda::std::tuple<>{}, first, count_fixed, Predicate{}, TransformOp{value}, cuda_cub::stream(policy))); throw_on_error(status, "fill_n: failed inside CUB"); diff --git a/thrust/thrust/system/cuda/detail/generate.h b/thrust/thrust/system/cuda/detail/generate.h index 5f56a3d8352..f347565575a 100644 --- a/thrust/thrust/system/cuda/detail/generate.h +++ b/thrust/thrust/system/cuda/detail/generate.h @@ -34,13 +34,7 @@ generate_n(execution_policy& policy, OutputIt result, Size count, Gener cudaError_t status; THRUST_INDEX_TYPE_DISPATCH( status, - (CUB_NS_QUALIFIER::detail::transform::dispatch_t< - CUB_NS_QUALIFIER::detail::transform::requires_stable_address::no, - decltype(count_fixed), - ::cuda::std::tuple<>, - OutputIt, - Predicate, - Generator>::dispatch), + (CUB_NS_QUALIFIER::detail::transform::dispatch), count, (::cuda::std::tuple<>{}, result, count_fixed, Predicate{}, generator, cuda_cub::stream(policy))); throw_on_error(status, "generate_n: failed inside CUB"); diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 8b1d10695c4..6a735ad4c72 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -175,12 +175,7 @@ OutputIt _CCCL_API 
_CCCL_FORCEINLINE cub_transform_many( cudaError_t status; THRUST_INDEX_TYPE_DISPATCH( status, - (cub::detail::transform::dispatch_t, - OutputIt, - Predicate, - TransformOp>::dispatch), + (cub::detail::transform::dispatch), num_items, (firsts, result, num_items_fixed, pred, transform_op, cuda_cub::stream(policy))); throw_on_error(status, "transform: failed inside CUB");
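Taken together, the pieces above compose as follows: a stateless policy hub maps a ::cuda::arch_id to a constexpr
transform_arch_policy, and cub::detail::dispatch_arch hands that policy to a functor as a compile-time constant,
instantiating the functor once per distinct policy rather than once per architecture. A minimal host-side sketch
(hypothetical names fill_policies and configure_for_device, not part of this patch; assumes a C++20 build without
CUB_DEFINE_RUNTIME_POLICIES and the signatures shown above):

  // models the cub::detail::policy_hub concept
  struct fill_policies
  {
    [[nodiscard]] constexpr auto operator()(::cuda::arch_id arch) const
      -> cub::detail::transform::transform_arch_policy
    {
      // delegate to the real heuristic: no inputs (pure fill), dense int output
      return cub::detail::transform::arch_policies<0>{
        /* requires_stable_address */ false,
        /* dense_output */ true,
        {},
        cub::detail::transform::make_iterator_info<int*>()}(arch);
    }
  };

  cudaError_t configure_for_device(::cuda::arch_id device_arch)
  {
    return cub::detail::dispatch_arch(fill_policies{}, device_arch, [](auto policy_getter) {
      constexpr auto policy = policy_getter(); // the selected policy, as a constant expression
      static_assert(policy.prefetch_policy.block_threads > 0);
      // a real caller would derive its launch configuration (block size, dynamic SMEM) from `policy` here
      return cudaSuccess;
    });
  }

Because dispatch keys on the policy value rather than on the architecture, two architectures that map to the same
transform_arch_policy share a single instantiation of the lambda, mirroring the "we instantiate f only for each
distinct policy" comment in call_for_arch.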