diff --git a/c/parallel/src/kernels/iterators.cpp b/c/parallel/src/kernels/iterators.cpp index e23b219486d..ff588bcadf7 100644 --- a/c/parallel/src/kernels/iterators.cpp +++ b/c/parallel/src/kernels/iterators.cpp @@ -114,7 +114,7 @@ struct {0}_proxy_t {{ struct {0} {{ using iterator_category = cuda::std::random_access_iterator_tag; using difference_type = DIFF_T; - using value_type = void; + using value_type = VALUE_T; using pointer = {0}_proxy_t*; using reference = {0}_proxy_t; __device__ {0}_proxy_t operator*() const {{ return {{state}}; }} diff --git a/c/parallel/src/transform.cu b/c/parallel/src/transform.cu index aae70bddd61..13404420236 100644 --- a/c/parallel/src/transform.cu +++ b/c/parallel/src/transform.cu @@ -120,48 +120,6 @@ get_kernel_name(cccl_iterator_t input1_it, cccl_iterator_t input2_it, cccl_itera namespace cdt = cub::detail::transform; -struct runtime_tuning_policy -{ - using max_policy = runtime_tuning_policy; - - cdt::Algorithm algorithm; - int min_bif; - cdt::RuntimeTransformAgentPrefetchPolicy prefetch_policy; - cdt::RuntimeTransformAgentVectorizedPolicy vectorized_policy; - cdt::RuntimeTransformAgentAsyncPolicy async_policy; - - cdt::Algorithm Algorithm() const - { - return algorithm; - } - - int MinBif() const - { - return min_bif; - } - - auto PrefetchPolicy() const - { - return prefetch_policy; - } - - auto VectorizedPolicy() const - { - return vectorized_policy; - } - - auto AsyncPolicy() const - { - return async_policy; - } - - template - cudaError_t Invoke([[maybe_unused]] int device_ptx_version, F& op) - { - return op.template Invoke(*this); - } -}; - struct cache { cuda::std::optional> async_config{}; @@ -172,7 +130,7 @@ template struct transform_kernel_source { cccl_device_transform_build_result_t& build; - std::array, NumInputs> it_value_sizes_alignments; + cuda::std::array inputs; template cub::detail::transform::cuda_expected @@ -208,9 +166,9 @@ struct transform_kernel_source return build.loaded_bytes_per_iteration; } - auto ItValueSizesAlignments() const + const auto& InputIteratorInfos() const { - return cuda::std::span(it_value_sizes_alignments); + return inputs; } template @@ -238,6 +196,14 @@ public: return (is_pointer_aligned(its, its.value_size * vec_size) && ...); } }; + +auto make_iterator_info(cccl_iterator_t input_it) -> cdt::iterator_info +{ + return {static_cast(input_it.value_type.size), + static_cast(input_it.value_type.alignment), + /* trivially_relocatable */ true, // TODO(bgruber): how to check this properly? + /* is contiguous */ input_it.type == CCCL_POINTER}; // TODO(bgruber): how to check this properly? 
+} } // namespace transform CUresult cccl_device_unary_transform_build_ex( @@ -259,7 +225,6 @@ CUresult cccl_device_unary_transform_build_ex( { const char* name = "test"; - const int cc = cc_major * 10 + cc_minor; const auto input_it_value_t = cccl_type_enum_to_name(input_it.value_type.type); const auto output_it_value_t = cccl_type_enum_to_name(output_it.value_type.type); const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_INT64); @@ -269,8 +234,17 @@ CUresult cccl_device_unary_transform_build_ex( make_kernel_output_iterator(offset_t, transform::output_iterator_name, output_it_value_t, output_it); const std::string op_src = make_kernel_user_unary_operator(input_it_value_t, output_it_value_t, op); + const auto inputs = + cuda::std::array{transform::make_iterator_info(input_it)}; + const auto output = transform::make_iterator_info(output_it); + const auto cub_arch_policies = cub::detail::transform::arch_policies<1>{false, true, inputs, output}; + + // TODO(bgruber): drop this if tuning policies become formattable + std::stringstream cub_arch_policies_str; + cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor})); + const auto policy_hub_expr = std::format( - "cub::detail::transform::policy_hub, {}>", + "cub::detail::transform::arch_policies_from_types, {}>", transform::get_iterator_name(input_it, transform::input_iterator_name), transform::get_iterator_name(output_it, transform::output_iterator_name)); @@ -287,13 +261,10 @@ struct __align__({3}) output_storage_t {{ {4} {5} {6} -using device_transform_policy = {7}::max_policy; - -#include -__device__ consteval auto& policy_generator() {{ - return ptx_json::id() - = cub::detail::transform::TransformPolicyWrapper::EncodedPolicy(); -}}; +using device_transform_policy = {7}; +using namespace cub; +using namespace cub::detail::transform; +static_assert(device_transform_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {8}, "Host generated and JIT compiled policy mismatch"); )XXX", input_it.value_type.size, // 0 input_it.value_type.alignment, // 1 @@ -302,7 +273,8 @@ __device__ consteval auto& policy_generator() {{ input_iterator_src, // 4 output_iterator_src, // 5 op_src, // 6 - policy_hub_expr); // 7 + policy_hub_expr, // 7 + cub_arch_policies_str.view()); // 8 #if false // CCCL_DEBUGGING_SWITCH fflush(stderr); @@ -328,7 +300,6 @@ __device__ consteval auto& policy_generator() {{ "-dlto", "-default-device", "-DCUB_DISABLE_CDP", - "-DCUB_ENABLE_POLICY_PTX_JSON", "-std=c++20"}; cccl::detail::extend_args_with_build_config(args, config); @@ -357,23 +328,12 @@ __device__ consteval auto& policy_generator() {{ cuLibraryLoadData(&build_ptr->library, result.data.get(), nullptr, nullptr, 0, nullptr, nullptr, 0); check(cuLibraryGetKernel(&build_ptr->transform_kernel, build_ptr->library, kernel_lowered_name.c_str())); - nlohmann::json runtime_policy = - cub::detail::ptx_json::parse("device_transform_policy", {result.data.get(), result.size}); - - const auto algorithm = static_cast(runtime_policy["algorithm"].get()); - const auto min_bif = static_cast(runtime_policy["min_bif"].get()); - build_ptr->loaded_bytes_per_iteration = static_cast(input_it.value_type.size); - build_ptr->cc = cc; + build_ptr->cc = cc_major * 10 + cc_minor; build_ptr->cubin = (void*) result.data.release(); build_ptr->cubin_size = result.size; - build_ptr->runtime_policy = new transform::runtime_tuning_policy{ - algorithm, - min_bif, - transform::cdt::RuntimeTransformAgentPrefetchPolicy::from_json(runtime_policy, 
"prefetch_policy"), - transform::cdt::RuntimeTransformAgentVectorizedPolicy::from_json(runtime_policy, "vectorized_policy"), - transform::cdt::RuntimeTransformAgentAsyncPolicy::from_json(runtime_policy, "async_policy")}; - build_ptr->cache = new transform::cache(); + build_ptr->runtime_policy = new cub::detail::transform::arch_policies<1>{cub_arch_policies}; + build_ptr->cache = new transform::cache(); } catch (const std::exception& exc) { @@ -402,25 +362,16 @@ CUresult cccl_device_unary_transform( CUdevice cu_device; check(cuCtxGetDevice(&cu_device)); - error = static_cast( - transform::cdt::dispatch_t, - indirect_iterator_t, - transform::cdt::always_true_predicate, - indirect_arg_t, - transform::runtime_tuning_policy, - transform::transform_kernel_source<1>, - cub::detail::CudaDriverLauncherFactory>:: - dispatch(d_in, - d_out, - num_items, - {}, - op, - stream, - {build, {{{d_in.value_type.size, d_in.value_type.alignment}}}}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - *reinterpret_cast(build.runtime_policy))); + error = static_cast(transform::cdt::dispatch( + ::cuda::std::tuple{d_in}, + indirect_iterator_t{d_out}, + static_cast(num_items), + transform::cdt::always_true_predicate{}, + op, + stream, + *static_cast*>(build.runtime_policy), + transform::transform_kernel_source<1>{build, {transform::make_iterator_info(d_in)}}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc})); } catch (const std::exception& exc) { @@ -457,7 +408,6 @@ CUresult cccl_device_binary_transform_build_ex( { const char* name = "test"; - const int cc = cc_major * 10 + cc_minor; const auto input1_it_value_t = cccl_type_enum_to_name(input1_it.value_type.type); const auto input2_it_value_t = cccl_type_enum_to_name(input2_it.value_type.type); @@ -473,8 +423,17 @@ CUresult cccl_device_binary_transform_build_ex( const std::string op_src = make_kernel_user_binary_operator(input1_it_value_t, input2_it_value_t, output_it_value_t, op); + const auto inputs = cuda::std::array{ + transform::make_iterator_info(input1_it), transform::make_iterator_info(input2_it)}; + const auto output = transform::make_iterator_info(output_it); + const auto cub_arch_policies = cub::detail::transform::arch_policies<2>{false, true, inputs, output}; + + // TODO(bgruber): drop this if tuning policies become formattable + std::stringstream cub_arch_policies_str; + cub_arch_policies_str << cub_arch_policies(cuda::to_arch_id(cuda::compute_capability{cc_major, cc_minor})); + const auto policy_hub_expr = std::format( - "cub::detail::transform::policy_hub, {2}>", + "cub::detail::transform::arch_policies_from_types, {2}>", transform::get_iterator_name(input1_it, transform::input1_iterator_name), transform::get_iterator_name(input2_it, transform::input2_iterator_name), transform::get_iterator_name(output_it, transform::output_iterator_name)); @@ -496,13 +455,10 @@ struct __align__({5}) output_storage_t {{ {7} {8} {9} -using device_transform_policy = {10}::max_policy; - -#include -__device__ consteval auto& policy_generator() {{ - return ptx_json::id() - = cub::detail::transform::TransformPolicyWrapper::EncodedPolicy(); -}}; +using device_transform_policy = {10}; +using namespace cub; +using namespace cub::detail::transform; +static_assert(device_transform_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {11}, "Host generated and JIT compiled policy mismatch"); )XXX", input1_it.value_type.size, // 0 input1_it.value_type.alignment, // 1 @@ -514,7 +470,8 @@ __device__ consteval auto& policy_generator() {{ input2_iterator_src, // 
7 output_iterator_src, // 8 op_src, // 9 - policy_hub_expr); // 10 + policy_hub_expr, // 10 + cub_arch_policies_str.view()); // 11 #if false // CCCL_DEBUGGING_SWITCH fflush(stderr); @@ -537,7 +494,6 @@ __device__ consteval auto& policy_generator() {{ "-dlto", "-default-device", "-DCUB_DISABLE_CDP", - "-DCUB_ENABLE_POLICY_PTX_JSON", "-std=c++20"}; cccl::detail::extend_args_with_build_config(args, config); @@ -570,20 +526,12 @@ __device__ consteval auto& policy_generator() {{ nlohmann::json runtime_policy = cub::detail::ptx_json::parse("device_transform_policy", {result.data.get(), result.size}); - const auto algorithm = static_cast(runtime_policy["algorithm"].get()); - const auto min_bif = static_cast(runtime_policy["min_bif"].get()); - build_ptr->loaded_bytes_per_iteration = static_cast((input1_it.value_type.size + input2_it.value_type.size)); - build_ptr->cc = cc; + build_ptr->cc = cc_major * 10 + cc_minor; build_ptr->cubin = (void*) result.data.release(); build_ptr->cubin_size = result.size; - build_ptr->runtime_policy = new transform::runtime_tuning_policy{ - algorithm, - min_bif, - transform::cdt::RuntimeTransformAgentPrefetchPolicy::from_json(runtime_policy, "prefetch_policy"), - transform::cdt::RuntimeTransformAgentVectorizedPolicy::from_json(runtime_policy, "vectorized_policy"), - transform::cdt::RuntimeTransformAgentAsyncPolicy::from_json(runtime_policy, "async_policy")}; - build_ptr->cache = new transform::cache(); + build_ptr->runtime_policy = new cub::detail::transform::arch_policies<2>{cub_arch_policies}; + build_ptr->cache = new transform::cache(); } catch (const std::exception& exc) { @@ -614,27 +562,17 @@ CUresult cccl_device_binary_transform( CUdevice cu_device; check(cuCtxGetDevice(&cu_device)); - error = static_cast( - transform::cdt::dispatch_t, - indirect_iterator_t, - transform::cdt::always_true_predicate, - indirect_arg_t, - transform::runtime_tuning_policy, - transform::transform_kernel_source<2>, - cub::detail::CudaDriverLauncherFactory>:: - dispatch( - ::cuda::std::make_tuple(d_in1, d_in2), - d_out, - num_items, - {}, - op, - stream, - {build, - {{{d_in1.value_type.size, d_in1.value_type.alignment}, {d_in2.value_type.size, d_in2.value_type.alignment}}}}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - *reinterpret_cast(build.runtime_policy))); + error = static_cast(transform::cdt::dispatch( + ::cuda::std::make_tuple(d_in1, d_in2), + indirect_iterator_t{d_out}, + static_cast(num_items), + transform::cdt::always_true_predicate{}, + op, + stream, + *static_cast*>(build.runtime_policy), + transform::transform_kernel_source<2>{ + build, {transform::make_iterator_info(d_in1), transform::make_iterator_info(d_in2)}}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc})); } catch (const std::exception& exc) { @@ -692,10 +630,12 @@ CUresult cccl_device_transform_cleanup(cccl_device_transform_build_result_t* bui { return CUDA_ERROR_INVALID_VALUE; } - std::unique_ptr cubin(reinterpret_cast(build_ptr->cubin)); - std::unique_ptr rtp( - reinterpret_cast(build_ptr->runtime_policy)); - std::unique_ptr cache(reinterpret_cast(build_ptr->cache)); + using namespace cub::detail::transform; + std::unique_ptr cubin(static_cast(build_ptr->cubin)); + std::unique_ptr> rtp(static_cast*>(build_ptr->runtime_policy)); // FIXME(bgruber): + // handle <2> as + // well + std::unique_ptr cache(static_cast(build_ptr->cache)); check(cuLibraryUnload(build_ptr->library)); } catch (const std::exception& exc) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 
34e39cc9078..c228702750f 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -21,6 +21,9 @@ workflows: # args: '--preset libcudacxx --lit-tests "cuda/utility/basic_any.pass.cpp"' } # override: + - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['msvc2019', 'msvc14.39']} + - {jobs: ['build'], std: 'minmax', ctk: '12.X', cxx: ['msvc2019', 'msvc' ]} + - {jobs: ['build'], std: 'minmax', ctk: '13.0', cxx: ['msvc2019', 'msvc' ]} pull_request: # Old CTK: Oldest/newest supported host compilers: diff --git a/cub/benchmarks/bench/transform/babelstream.cu b/cub/benchmarks/bench/transform/babelstream.cu index 67462949425..84b96de3c04 100644 --- a/cub/benchmarks/bench/transform/babelstream.cu +++ b/cub/benchmarks/bench/transform/babelstream.cu @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions. - +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:2:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h index aa7472351f1..6f0e69ae096 100644 --- a/cub/benchmarks/bench/transform/common.h +++ b/cub/benchmarks/bench/transform/common.h @@ -5,12 +5,12 @@ // keep checks at the top so compilation of discarded variants fails really fast #include -#if !TUNE_BASE && TUNE_ALGORITHM == 2 +#if !TUNE_BASE && TUNE_ALGORITHM == 3 # if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 # error "When tuning, this benchmark does not support being compiled for multiple architectures" # endif # if (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 2 (ublkcp) below sm90" +# error "Cannot compile algorithm 3 (ublkcp) below sm90" # endif #endif @@ -23,32 +23,47 @@ #include -template -#if TUNE_BASE -using policy_hub_t = - cub::detail::transform::policy_hub, - RandomAccessIteratorOut>; -#else -struct policy_hub_t +#if !TUNE_BASE +struct arch_policies { - struct max_policy : cub::ChainedPolicy<500, max_policy, max_policy> + _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_arch_policy { - static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__); + const int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__) + TUNE_BIF_BIAS; # if TUNE_ALGORITHM == 0 - static constexpr auto algorithm = cub::detail::transform::Algorithm::prefetch; + const auto algorithm = cub::detail::transform::Algorithm::prefetch; + const auto policy = prefetch_policy{ + TUNE_THREADS +# ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT + , + TUNE_ITEMS_PER_THREAD_NO_INPUT +# endif + }; + return {min_bif, algorithm, policy, {}, {}}; # elif TUNE_ALGORITHM == 1 - static constexpr auto algorithm = cub::detail::transform::Algorithm::ublkcp; + const auto algorithm = cub::detail::transform::Algorithm::vectorized; + const auto policy = vectorized_policy{ + TUNE_THREADS, + TUNE_VEC_SIZE * TUNE_VECTORS_PER_THREAD, + TUNE_VEC_SIZE +# ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT + , + TUNE_ITEMS_PER_THREAD_NO_INPUT +# endif + }; + return {min_bif, algorithm, {}, policy, {}}; +# elif 
TUNE_ALGORITHM == 2 + const auto algorithm = cub::detail::transform::Algorithm::memcpy_async; + const auto policy = async_copy_policy{TUNE_THREADS, cub::detail::transform::ldgsts_size_and_align}; + return {min_bif, algorithm, {}, {}, policy}; +# elif TUNE_ALGORITHM == 3 + const auto algorithm = cub::detail::transform::Algorithm::ublkcp; + const auto policy = + async_copy_policy{TUNE_THREADS, cub::detail::transform::bulk_copy_alignment(__CUDA_ARCH_LIST__)}; + return {min_bif, algorithm, {}, {}, policy}; # else # error Policy hub does not yet implement the specified value for algorithm # endif - - using algo_policy = - ::cuda::std::_If, - cub::detail::transform::async_copy_policy_t>; - }; + } }; #endif @@ -60,15 +75,17 @@ void bench_transform(nvbench::state& state, TransformOp transform_op) { state.exec(nvbench::exec_tag::gpu, [&](const nvbench::launch& launch) { - cub::detail::transform::dispatch_t< - cub::detail::transform::requires_stable_address::no, - OffsetT, - ::cuda::std::tuple, - RandomAccessIteratorOut, - cub::detail::transform::always_true_predicate, - TransformOp, - policy_hub_t>:: - dispatch( - inputs, output, num_items, cub::detail::transform::always_true_predicate{}, transform_op, launch.get_stream()); + cub::detail::transform::dispatch( + inputs, + output, + num_items, + cub::detail::transform::always_true_predicate{}, + transform_op, + launch.get_stream() +#if !TUNE_BASE + , + arch_policies{} +#endif + ); }); } diff --git a/cub/benchmarks/bench/transform/complex_cmp.cu b/cub/benchmarks/bench/transform/complex_cmp.cu index ab7b993f531..636ed40765c 100644 --- a/cub/benchmarks/bench/transform/complex_cmp.cu +++ b/cub/benchmarks/bench/transform/complex_cmp.cu @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions. - +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/benchmarks/bench/transform/fib.cu b/cub/benchmarks/bench/transform/fib.cu index f85ae86386c..5f40fba2abc 100644 --- a/cub/benchmarks/bench/transform/fib.cu +++ b/cub/benchmarks/bench/transform/fib.cu @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions.
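// ============================================================================
// [Illustrative aside, not part of the patch] The `%RANGE% NAME tag lo:hi:step`
// annotations in these benchmarks are consumed by the tuning harness, which
// builds one variant per parameter combination and injects each value as a
// preprocessor definition (e.g. -DTUNE_THREADS=256). A minimal sketch of how a
// benchmark translation unit can stay buildable outside the harness; the
// default values below are assumptions for illustration only:
#ifndef TUNE_THREADS
#  define TUNE_THREADS 256 // assumed default block size when not sweeping
#endif
#ifndef TUNE_ALGORITHM
#  define TUNE_ALGORITHM 1 // assumed default: 1 selects the vectorized path
#endif
static_assert(TUNE_THREADS % 32 == 0, "block size should be a multiple of the warp size");
// ============================================================================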
- +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/benchmarks/bench/transform/fill.cu b/cub/benchmarks/bench/transform/fill.cu index 84e53221bd3..c90ae6472cc 100644 --- a/cub/benchmarks/bench/transform/fill.cu +++ b/cub/benchmarks/bench/transform/fill.cu @@ -1,11 +1,17 @@ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions. - +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// for filling, we can only use the prefetch and the vectorized algorithm +// %RANGE% TUNE_ALGORITHM alg 1:2:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:3:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 0 (prefetch) +// %RANGE% TUNE_ITEMS_PER_THREAD_NO_INPUT ipt 1:32:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/benchmarks/bench/transform/grayscale.cu b/cub/benchmarks/bench/transform/grayscale.cu index 5d15205b36f..52a7783cdc1 100644 --- a/cub/benchmarks/bench/transform/grayscale.cu +++ b/cub/benchmarks/bench/transform/grayscale.cu @@ -1,6 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 +// %RANGE% TUNE_THREADS tpb 128:1024:128 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 + #include "common.h" template diff --git a/cub/benchmarks/bench/transform/heavy.cu b/cub/benchmarks/bench/transform/heavy.cu index ef7a9fa4430..9dbb78bc8c5 100644 --- a/cub/benchmarks/bench/transform/heavy.cu +++ b/cub/benchmarks/bench/transform/heavy.cu @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause -// Because CUB cannot inspect the transformation function, we cannot add any tunings based on the results of this -// benchmark. Its main use is to detect regressions. 
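// ============================================================================
// [Illustrative aside, not part of the patch] Why the vectorized algorithm
// (TUNE_ALGORITHM == 1) needs a runtime alignment check in addition to the
// compile-time ones (see CanVectorize in dispatch_transform.cuh below): a
// pointer may only be accessed through a wider vector type when it is
// suitably aligned. A minimal standalone CUDA sketch, not CUB's actual
// implementation:
#include <cstdint>

struct alignas(16) float4_chunk
{
  float v[4]; // 4-wide vector; one 16-byte load/store instead of four 4-byte ones
};

__global__ void scale_vectorized(const float4_chunk* in, float4_chunk* out, int num_chunks)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_chunks)
  {
    float4_chunk c = in[i];
    for (float& x : c.v)
    {
      x *= 2.0f;
    }
    out[i] = c;
  }
}

// host-side gate: fall back to a scalar path unless both buffers are 16-byte aligned
inline bool can_use_vectorized_path(const void* in, const void* out)
{
  return reinterpret_cast<std::uintptr_t>(in) % alignof(float4_chunk) == 0
      && reinterpret_cast<std::uintptr_t>(out) % alignof(float4_chunk) == 0;
}
// ============================================================================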
- +// %RANGE% TUNE_BIF_BIAS alg -16:16:4 +// %RANGE% TUNE_ALGORITHM alg 1:4:1 // %RANGE% TUNE_THREADS tpb 128:1024:128 -// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// TODO(bgruber): those parameters only apply if TUNE_ALGORITHM == 1 (vectorized) +// %RANGE% TUNE_VEC_SIZE ipt 1:32:1 +// %RANGE% TUNE_VECTORS_PER_THREAD vpt 1:4:1 #include "common.h" diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 994670c9d33..a8a26ee5709 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -44,25 +45,31 @@ struct ::cuda::proclaims_copyable_arguments + typename Env> CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal( ::cuda::std::tuple inputs, RandomAccessIteratorOut output, NumItemsT num_items, Predicate predicate, TransformOp transform_op, - cudaStream_t stream, - StableAddress = {}) + Env env) { using choose_offset_t = detail::choose_signed_offset; using offset_t = typename choose_offset_t::type; @@ -73,17 +80,32 @@ private: return error; } - return detail::transform::dispatch_t < StableAddress::value - ? detail::transform::requires_stable_address::yes - : detail::transform::requires_stable_address::no, - offset_t, ::cuda::std::tuple, RandomAccessIteratorOut, Predicate, - TransformOp > ::dispatch( - ::cuda::std::move(inputs), - ::cuda::std::move(output), - num_items, - ::cuda::std::move(predicate), - ::cuda::std::move(transform_op), - stream); + using tuning_env_t = + ::cuda::std::execution::__query_result_or_t>; + using transform_tuning_t = + ::cuda::std::execution::__query_result_or_t; + + if constexpr (!::cuda::std::is_same_v) + { + return detail::transform::dispatch( + ::cuda::std::move(inputs), + ::cuda::std::move(output), + static_cast(num_items), + ::cuda::std::move(predicate), + ::cuda::std::move(transform_op), + get_stream(env), + transform_tuning_t{}); + } + else + { + return detail::transform::dispatch( + ::cuda::std::move(inputs), + ::cuda::std::move(output), + static_cast(num_items), + ::cuda::std::move(predicate), + ::cuda::std::move(transform_op), + get_stream(env)); + } } template @@ -146,7 +168,7 @@ public: num_items, detail::transform::always_true_predicate{}, ::cuda::std::move(transform_op), - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -269,7 +291,7 @@ public: num_items, detail::transform::always_true_predicate{}, ::cuda::std::move(generator), - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -322,7 +344,7 @@ public: num_items, detail::transform::always_true_predicate{}, detail::__return_constant{::cuda::std::move(value)}, - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -401,7 +423,7 @@ public: num_items, ::cuda::std::move(predicate), ::cuda::std::move(transform_op), - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -489,7 +511,7 @@ public: num_items, ::cuda::std::move(predicate), ::cuda::std::move(transform_op), - get_stream(env)); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document @@ -567,14 +589,13 @@ public: Env env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses"); - return TransformInternal( + return TransformInternal( ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, detail::transform::always_true_predicate{}, 
::cuda::std::move(transform_op), - get_stream(env), - ::cuda::std::true_type{}); + ::cuda::std::move(env)); } #ifndef _CCCL_DOXYGEN_INVOKED // Do not document diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index c33053f12a6..2925fd88c85 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -41,8 +41,13 @@ #include #include #include +#include #include +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) +# include +#endif + // On Windows, the `if CUB_DETAIL_CONSTEXPR_ISH` results in `warning C4702: unreachable code`. _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4702) @@ -67,30 +72,33 @@ struct prefetch_config int sm_count; }; -template + typename TransformOp> struct TransformKernelSource; -template -struct TransformKernelSource +struct TransformKernelSource, RandomAccessIteratorOut, Predicate, - TransformOp, - PolicyHub> + TransformOp> { + // ArchPolicies must be stateless, so we can pass the type to the kernel + static_assert(::cuda::std::is_empty_v); + CUB_DEFINE_KERNEL_GETTER( TransformKernel, - transform_kernel(); } - CUB_RUNTIME_FUNCTION static constexpr auto ItValueSizesAlignments() + CUB_RUNTIME_FUNCTION static constexpr auto InputIteratorInfos() { - return make_sizes_alignments(); + return ::cuda::std::array{ + make_iterator_info()...}; } template @@ -149,7 +158,7 @@ public: CUB_RUNTIME_FUNCTION constexpr static bool CanVectorize(int vec_size, const RandomAccessIteratorOut& out, const RandomAccessIteratorsIn&... in) { - return is_pointer_aligned(out, sizeof(it_value_t) * vec_size) + return is_pointer_aligned(out, size_of> * vec_size) && (is_pointer_aligned(in, sizeof(it_value_t) * vec_size) && ...); } }; @@ -160,367 +169,432 @@ enum class requires_stable_address yes }; -template < - requires_stable_address StableAddress, - typename Offset, - typename RandomAccessIteratorTupleIn, - typename RandomAccessIteratorOut, - typename Predicate, - typename TransformOp, - typename PolicyHub = policy_hub, - RandomAccessIteratorTupleIn, - RandomAccessIteratorOut>, - typename KernelSource = - TransformKernelSource, - typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> -struct dispatch_t; -template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int +spread_out_items_per_thread(Offset num_items, Policy policy, int items_per_thread, int sm_count, int max_occupancy) +{ + const int block_threads = policy.block_threads; + + const int items_per_thread_evenly_spread = static_cast(( + ::cuda::std::min) (Offset{items_per_thread}, ::cuda::ceil_div(num_items, sm_count * block_threads * max_occupancy))); + const int items_per_thread_clamped = + ::cuda::std::clamp(items_per_thread_evenly_spread, policy.min_items_per_thread, policy.max_items_per_thread); + return items_per_thread_clamped; +} + +template -struct dispatch_t, - RandomAccessIteratorOut, - Predicate, - TransformOp, - PolicyHub, - KernelSource, - KernelLauncherFactory> +CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_async_kernel( + Offset num_items, + int alignment, + SMemFunc dyn_smem_for_tile_size, + cudaStream_t stream, + PolicyGetter policy_getter, + KernelSource kernel_source, + KernelLauncherFactory launcher_factory) + -> cuda_expected< + ::cuda::std::tuple> { - static_assert(::cuda::std::is_same_v - || ::cuda::std::is_same_v, - "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit 
signed offset types"); + CUB_DETAIL_CONSTEXPR_ISH const transform_arch_policy policy = policy_getter(); + CUB_DETAIL_CONSTEXPR_ISH int block_threads = policy.async_copy_policy.block_threads; - ::cuda::std::tuple in; - RandomAccessIteratorOut out; - Offset num_items; - Predicate pred; - TransformOp op; - int bulk_copy_align; - cudaStream_t stream; - KernelSource kernel_source = {}; - KernelLauncherFactory launcher_factory = {}; - - // Reduces the items_per_thread when necessary to generate enough blocks to reach the maximum occupancy. - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int - spread_out_items_per_thread(AsyncPolicy async_policy, int items_per_thread, int sm_count, int max_occupancy) - { - const int block_threads = async_policy.BlockThreads(); - - const int items_per_thread_evenly_spread = static_cast( - (::cuda::std::min) (Offset{items_per_thread}, - ::cuda::ceil_div(num_items, sm_count * block_threads * max_occupancy))); - const int items_per_thread_clamped = ::cuda::std::clamp( - items_per_thread_evenly_spread, +async_policy.MinItemsPerThread(), +async_policy.MaxItemsPerThread()); - return items_per_thread_clamped; - } + _CCCL_ASSERT(block_threads % alignment == 0, "block_threads needs to be a multiple of the copy alignment"); + // ^ then tile_size is a multiple of it - template - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto - configure_async_kernel(int alignment, SMemFunc dyn_smem_for_tile_size, WrappedPolicy wrapped_policy = {}) - -> cuda_expected< - ::cuda::std::tuple> - { - int block_threads = wrapped_policy.AsyncPolicy().BlockThreads(); - - _CCCL_ASSERT(block_threads % alignment == 0, "block_threads needs to be a multiple of the copy alignment"); - // ^ then tile_size is a multiple of it + CUB_DETAIL_CONSTEXPR_ISH auto min_items_per_thread = policy.async_copy_policy.min_items_per_thread; + CUB_DETAIL_CONSTEXPR_ISH auto max_items_per_thread = policy.async_copy_policy.max_items_per_thread; - CUB_DETAIL_CONSTEXPR_ISH auto min_items_per_thread = wrapped_policy.AsyncPolicy().MinItemsPerThread(); - CUB_DETAIL_CONSTEXPR_ISH auto max_items_per_thread = wrapped_policy.AsyncPolicy().MaxItemsPerThread(); + // ensures the loop below runs at least once + // pulled outside of the lambda below to make MSVC happy + CUB_DETAIL_STATIC_ISH_ASSERT(min_items_per_thread <= max_items_per_thread, "invalid policy"); - // ensures the loop below runs at least once - // pulled outside of the lambda below to make MSVC happy - CUB_DETAIL_STATIC_ISH_ASSERT(min_items_per_thread <= max_items_per_thread, "invalid policy"); + auto determine_element_counts = [&]() -> cuda_expected { + int sm_count = 0; + auto error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)); + if (error != cudaSuccess) + { + return ::cuda::std::unexpected(error); + } - auto determine_element_counts = [&]() -> cuda_expected { - int sm_count = 0; - auto error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)); + // Increase the number of output elements per thread until we reach the required bytes in flight. + // Benchmarking shows that even for a few iteration, this loop takes around 4-7 us, so should not be a concern. + // This computation MUST NOT depend on any runtime state of the current API invocation (like num_items), since the + // result will be cached. 
+ async_config last_config{}; + for (int items_per_thread = +min_items_per_thread; items_per_thread <= +max_items_per_thread; ++items_per_thread) + { + const int tile_size = block_threads * items_per_thread; + const int dyn_smem_size = dyn_smem_for_tile_size(tile_size, alignment); + int max_occupancy = 0; + error = CubDebug( + launcher_factory.MaxSmOccupancy(max_occupancy, kernel_source.TransformKernel(), block_threads, dyn_smem_size)); if (error != cudaSuccess) { return ::cuda::std::unexpected(error); } - - // Increase the number of output elements per thread until we reach the required bytes in flight. - // Benchmarking shows that even for a few iteration, this loop takes around 4-7 us, so should not be a concern. - // This computation MUST NOT depend on any runtime state of the current API invocation (like num_items), since the - // result will be cached. - async_config last_config{}; - for (int items_per_thread = +min_items_per_thread; items_per_thread <= +max_items_per_thread; ++items_per_thread) + if (max_occupancy == 0) { - const int tile_size = block_threads * items_per_thread; - const int dyn_smem_size = dyn_smem_for_tile_size(tile_size, alignment); - int max_occupancy = 0; - error = CubDebug(launcher_factory.MaxSmOccupancy( - max_occupancy, kernel_source.TransformKernel(), block_threads, dyn_smem_size)); - if (error != cudaSuccess) - { - return ::cuda::std::unexpected(error); - } - if (max_occupancy == 0) - { - // assert should be prevented by smem check in policy - _CCCL_ASSERT(last_config.items_per_thread > 0, "min_items_per_thread exceeds available shared memory"); - return last_config; - } - - const auto config = async_config{items_per_thread, max_occupancy, sm_count}; - - const int bytes_in_flight_SM = max_occupancy * tile_size * kernel_source.LoadedBytesPerIteration(); - if (wrapped_policy.MinBif() <= bytes_in_flight_SM) - { - return config; - } - - last_config = config; + // assert should be prevented by smem check in policy + _CCCL_ASSERT(last_config.items_per_thread > 0, "min_items_per_thread exceeds available shared memory"); + return last_config; } - return last_config; - }; - cuda_expected config = kernel_source.CacheAsyncConfiguration(determine_element_counts); - if (!config) - { - return ::cuda::std::unexpected(config.error()); - } - _CCCL_ASSERT(config->items_per_thread > 0, ""); - _CCCL_ASSERT((config->items_per_thread * block_threads) % alignment == 0, ""); - - const int ipt = spread_out_items_per_thread( - wrapped_policy.AsyncPolicy(), config->items_per_thread, config->sm_count, config->max_occupancy); - const int tile_size = block_threads * ipt; - const int dyn_smem_size = dyn_smem_for_tile_size(tile_size, alignment); - _CCCL_ASSERT((sizeof...(RandomAccessIteratorsIn) == 0) != (dyn_smem_size != 0), ""); // logical xor - - const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{tile_size})); - // config->smem_size is 16 bytes larger than needed for UBLKCP because it's the total SMEM size, but 16 bytes are - // occupied by static shared memory and padding. But let's not complicate things. - return ::cuda::std::make_tuple( - launcher_factory(grid_dim, block_threads, dyn_smem_size, stream, true), kernel_source.TransformKernel(), ipt); - } - - // Avoid unnecessarily parsing these definitions when not needed. 
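// ============================================================================
// [Illustrative aside, not part of the patch] What spread_out_items_per_thread
// (defined earlier in this file and called further down) computes, as a
// standalone host sketch with made-up numbers: for small inputs, shrink
// items_per_thread so that enough blocks exist to occupy every SM.
#include <algorithm>
#include <cstdint>

inline int spread_out_items(
  std::int64_t num_items, int tuned_ipt, int block_threads, int sm_count, int max_occupancy, int min_ipt, int max_ipt)
{
  // threads that can be resident on the whole device at once
  const std::int64_t resident_threads = std::int64_t{sm_count} * block_threads * max_occupancy;
  // items per thread needed so that every resident thread receives work
  const std::int64_t evenly_spread = (num_items + resident_threads - 1) / resident_threads;
  const int ipt = static_cast<int>(std::min<std::int64_t>(tuned_ipt, evenly_spread));
  return std::clamp(ipt, min_ipt, max_ipt);
}
// Example: num_items = 1 << 20, block_threads = 256, sm_count = 132,
// max_occupancy = 8: only ceil(1048576 / 270336) = 4 items per thread are
// needed to cover the input, so a tuned value of 16 would be reduced to 4
// (subject to the policy's min/max clamp).
// ============================================================================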
-#if defined(CUB_DEFINE_RUNTIME_POLICIES) - template - struct is_valid_aligned_base_ptr_arg_impl : ::cuda::std::false_type - {}; - template - struct is_valid_aligned_base_ptr_arg_impl< - It, - ::cuda::std::void_t().MakeAlignedBasePtrKernelArg( - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::declval()), 0))>> : ::cuda::std::true_type - {}; + const auto config = async_config{items_per_thread, max_occupancy, sm_count}; - template - static constexpr auto is_valid_aligned_base_ptr_arg = is_valid_aligned_base_ptr_arg_impl::value; -#endif // CUB_DEFINE_RUNTIME_POLICIES + const int bytes_in_flight_SM = max_occupancy * tile_size * kernel_source.LoadedBytesPerIteration(); + if (policy.min_bif <= bytes_in_flight_SM) + { + return config; + } - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_async_algorithm( - int alignment, SMemFunc dyn_smem_for_tile_size, cuda::std::index_sequence, WrappedPolicy wrapped_policy = {}) - { - auto ret = configure_async_kernel(alignment, dyn_smem_for_tile_size, wrapped_policy); - if (!ret) - { - return ret.error(); + last_config = config; } -#if defined(CUB_DEFINE_RUNTIME_POLICIES) - // Normally, this check is handled by the if constexpr(ish) in Invoke. However, when runtime policies are - // defined (like by c.parallel), that if constexpr becomes a plain if, so we need to check the actual compile time - // condition again, this time asserting at runtime if we hit this point during dispatch. - if constexpr ((is_valid_aligned_base_ptr_arg && ...)) - { -#endif // CUB_DEFINE_RUNTIME_POLICIES - auto [launcher, kernel, items_per_thread] = *ret; - return launcher.doit( - kernel, - num_items, - items_per_thread, - false, - pred, - op, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out), - kernel_source.MakeAlignedBasePtrKernelArg( - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)), alignment)...); -#if defined(CUB_DEFINE_RUNTIME_POLICIES) - } - else - { - _CCCL_ASSERT_HOST(false, "ublkcp algorithm requires all input iterators to be contiguous"); - _CCCL_UNREACHABLE(); - } -#endif // CUB_DEFINE_RUNTIME_POLICIES + return last_config; + }; + cuda_expected config = kernel_source.CacheAsyncConfiguration(determine_element_counts); + if (!config) + { + return ::cuda::std::unexpected(config.error()); } + _CCCL_ASSERT(config->items_per_thread > 0, ""); + _CCCL_ASSERT((config->items_per_thread * block_threads) % alignment == 0, ""); + + const int ipt = spread_out_items_per_thread( + num_items, policy.async_copy_policy, config->items_per_thread, config->sm_count, config->max_occupancy); + const int tile_size = block_threads * ipt; + const int dyn_smem_size = dyn_smem_for_tile_size(tile_size, alignment); + _CCCL_ASSERT(NoInputs != (dyn_smem_size != 0), ""); // logical xor + + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{tile_size})); + // config->smem_size is 16 bytes larger than needed for UBLKCP because it's the total SMEM size, but 16 bytes are + // occupied by static shared memory and padding. But let's not complicate things. 
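// ============================================================================
// [Illustrative aside, not part of the patch] Shape of the
// dyn_smem_for_tile_size computation used above: each input stream gets a
// staging buffer of tile_size elements in shared memory, padded so the next
// buffer starts at the copy alignment. A simplified standalone sketch; the
// real code also accounts for head padding (the "16 bytes" mentioned above):
#include <cstddef>

struct input_info_sketch
{
  int value_size; // bytes per element of one input stream
};

template <std::size_t NumInputs>
int dyn_smem_for_tile(const input_info_sketch (&inputs)[NumInputs], int tile_size, int alignment)
{
  const auto round_up = [&](int bytes) {
    return (bytes + alignment - 1) / alignment * alignment;
  };
  int total = 0;
  for (const input_info_sketch& in : inputs)
  {
    total += round_up(tile_size * in.value_size); // one aligned staging buffer per input
  }
  return total;
}
// Example: two inputs of 4 and 8 bytes per element, tile_size = 2048,
// alignment = 128 -> 8192 + 16384 = 24576 bytes of dynamic shared memory.
// ============================================================================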
+ return ::cuda::std::make_tuple( + launcher_factory(grid_dim, block_threads, dyn_smem_size, stream, true), kernel_source.TransformKernel(), ipt); +} - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t - invoke_prefetch_or_vectorized_algorithm(::cuda::std::index_sequence, WrappedPolicy wrapped_policy) +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_async_algorithm( + ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + Predicate pred, + TransformOp op, + cudaStream_t stream, + int alignment, + SMemFunc dyn_smem_for_tile_size, + cuda::std::index_sequence, + PolicyGetter policy_getter, + KernelSource kernel_source, + KernelLauncherFactory launcher_factory) +{ + auto ret = configure_async_kernel<(sizeof...(RandomAccessIteratorsIn) == 0)>( + num_items, alignment, dyn_smem_for_tile_size, stream, policy_getter, kernel_source, launcher_factory); + if (!ret) { - const int block_threads = - wrapped_policy.Algorithm() == Algorithm::vectorized - ? wrapped_policy.VectorizedPolicy().BlockThreads() - : wrapped_policy.PrefetchPolicy().BlockThreads(); - - auto determine_config = [&]() -> cuda_expected { - int max_occupancy = 0; - auto error = - CubDebug(launcher_factory.MaxSmOccupancy(max_occupancy, kernel_source.TransformKernel(), block_threads, 0)); - if (error != cudaSuccess) - { - return ::cuda::std::unexpected(error); - } - int sm_count = 0; - error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)); - if (error != cudaSuccess) - { - return ::cuda::std::unexpected(error); - } - return prefetch_config{max_occupancy, sm_count}; - }; + return ret.error(); + } - cuda_expected config = kernel_source.CachePrefetchConfiguration(determine_config); - if (!config) + auto [launcher, kernel, items_per_thread] = *ret; + return launcher.doit( + kernel, + num_items, + items_per_thread, + false, + pred, + op, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out), + kernel_source.MakeAlignedBasePtrKernelArg( + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)), alignment)...); +} + +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_prefetch_or_vectorized_algorithm( + [[maybe_unused]] ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + Predicate pred, + TransformOp op, + cudaStream_t stream, + ::cuda::std::index_sequence, + PolicyGetter policy_getter, + KernelSource kernel_source, + KernelLauncherFactory launcher_factory) +{ + CUB_DETAIL_CONSTEXPR_ISH const transform_arch_policy policy = policy_getter(); + CUB_DETAIL_CONSTEXPR_ISH const int block_threads = + policy.algorithm == Algorithm::vectorized + ? policy.vectorized_policy.block_threads + : policy.prefetch_policy.block_threads; + + auto determine_config = [&]() -> cuda_expected { + int max_occupancy = 0; + auto error = + CubDebug(launcher_factory.MaxSmOccupancy(max_occupancy, kernel_source.TransformKernel(), block_threads, 0)); + if (error != cudaSuccess) { - return config.error(); + return ::cuda::std::unexpected(error); } - - auto can_vectorize = false; - // the policy already handles the compile-time checks if we can vectorize. 
Do the remaining alignment check here - if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == wrapped_policy.Algorithm()) + int sm_count = 0; + error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)); + if (error != cudaSuccess) { - const int vs = wrapped_policy.VectorizedPolicy().VecSize(); - can_vectorize = kernel_source.CanVectorize(vs, out, ::cuda::std::get(in)...); + return ::cuda::std::unexpected(error); } + return prefetch_config{max_occupancy, sm_count}; + }; - int ipt = 0; - bool ipt_found = false; - if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == wrapped_policy.Algorithm()) - { - if (can_vectorize) - { - ipt = wrapped_policy.VectorizedPolicy().ItemsPerThreadVectorized(); - ipt_found = true; - } - } + cuda_expected config = kernel_source.CachePrefetchConfiguration(determine_config); + if (!config) + { + return config.error(); + } - if (!ipt_found) + auto can_vectorize = false; + ::cuda::std::optional ipt; + + // the policy already handles the compile-time checks if we can vectorize. Do the remaining alignment check here + if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == policy.algorithm) + { + const int vs = policy.vectorized_policy.vec_size; + can_vectorize = kernel_source.CanVectorize(vs, out, ::cuda::std::get(in)...); + if (can_vectorize) { - // otherwise, set up the prefetch kernel - - auto loaded_bytes_per_iter = kernel_source.LoadedBytesPerIteration(); - const auto items_per_thread_no_input = - wrapped_policy.Algorithm() == Algorithm::vectorized - ? wrapped_policy.VectorizedPolicy().ItemsPerThreadNoInput() - : wrapped_policy.PrefetchPolicy().ItemsPerThreadNoInput(); - // choose items per thread to reach minimum bytes in flight - const int items_per_thread = - loaded_bytes_per_iter == 0 - ? items_per_thread_no_input - : ::cuda::ceil_div(wrapped_policy.MinBif(), config->max_occupancy * block_threads * loaded_bytes_per_iter); - - // but also generate enough blocks for full occupancy to optimize small problem sizes, e.g., 2^16/2^20 elements - if CUB_DETAIL_CONSTEXPR_ISH (wrapped_policy.Algorithm() == Algorithm::vectorized) - { - ipt = spread_out_items_per_thread( - wrapped_policy.VectorizedPolicy(), items_per_thread, config->sm_count, config->max_occupancy); - } - else - { - ipt = spread_out_items_per_thread( - wrapped_policy.PrefetchPolicy(), items_per_thread, config->sm_count, config->max_occupancy); - } + ipt = policy.vectorized_policy.items_per_thread_vectorized; } - const int tile_size = block_threads * ipt; - const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{tile_size})); - return CubDebug( - launcher_factory(grid_dim, block_threads, 0, stream, true) - .doit(kernel_source.TransformKernel(), - num_items, - ipt, - can_vectorize, - pred, - op, - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out), - kernel_source.MakeIteratorKernelArg( - THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)))...)); } - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {}) + if (!ipt) + { + // otherwise, set up the prefetch kernel + const auto fallback_prefetch_policy = prefetch_policy{ + policy.vectorized_policy.block_threads, + policy.vectorized_policy.prefetch_items_per_thread_no_input, + policy.vectorized_policy.prefetch_min_items_per_thread, + policy.vectorized_policy.prefetch_max_items_per_thread}; + const auto prefetch_policy = + policy.algorithm == Algorithm::prefetch ? 
policy.prefetch_policy : fallback_prefetch_policy; + + auto loaded_bytes_per_iter = kernel_source.LoadedBytesPerIteration(); + const auto items_per_thread_no_input = prefetch_policy.items_per_thread_no_input; + // choose items per thread to reach minimum bytes in flight + const int items_per_thread = + loaded_bytes_per_iter == 0 + ? items_per_thread_no_input + : ::cuda::ceil_div(policy.min_bif, config->max_occupancy * block_threads * loaded_bytes_per_iter); + + // but also generate enough blocks for full occupancy to optimize small problem sizes, e.g., 2^16/2^20 elements + ipt = spread_out_items_per_thread( + num_items, prefetch_policy, items_per_thread, config->sm_count, config->max_occupancy); + } + _CCCL_ASSERT(ipt, ""); + const int tile_size = block_threads * ipt.value(); + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{tile_size})); + return CubDebug( + launcher_factory(grid_dim, block_threads, 0, stream, true) + .doit(kernel_source.TransformKernel(), + num_items, + ipt.value(), + can_vectorize, + pred, + op, + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(out), + kernel_source.MakeIteratorKernelArg( + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)))...)); +} + +// This should ideally have been a lambda, but MSVC < 14.44 ICEs if we put a `(static) constexpr` variable inside +template +struct invoke_for_arch; + +template +struct invoke_for_arch<::cuda::std::tuple, + RandomAccessIteratorOut, + Offset, + Predicate, + TransformOp, + KernelSource, + KernelLauncherFactory> +{ + ::cuda::std::tuple in; + RandomAccessIteratorOut out; + Offset num_items; + Predicate pred; + TransformOp op; + cudaStream_t stream; + KernelSource kernel_source; + KernelLauncherFactory launcher_factory; + ::cuda::arch_id arch_id; + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t operator()(PolicyGetter policy_getter) const { - auto wrapped_policy = MakeTransformPolicyWrapper(active_policy); - const auto seq = ::cuda::std::index_sequence_for{}; - if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::ublkcp == wrapped_policy.Algorithm()) + CUB_DETAIL_CONSTEXPR_ISH transform_arch_policy active_policy = policy_getter(); + const auto seq = ::cuda::std::index_sequence_for{}; + +#if !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + NV_IF_TARGET( + NV_IS_HOST, + (std::stringstream ss; ss << active_policy; + _CubLog("Dispatching DeviceTransform to arch %d with tuning: %s\n", (int) arch_id, ss.str().c_str());)) +#endif // !_CCCL_COMPILER(NVRTC) && defined(CUB_DEBUG_LOG) + + if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::ublkcp == active_policy.algorithm) { return invoke_async_algorithm( - bulk_copy_align, - [this](int tile_size, int alignment) { - return bulk_copy_dyn_smem_for_tile_size(kernel_source.ItValueSizesAlignments(), tile_size, alignment); + ::cuda::std::move(in), + ::cuda::std::move(out), + num_items, + ::cuda::std::move(pred), + ::cuda::std::move(op), + stream, + bulk_copy_alignment(arch_id), + [&](int tile_size, int alignment) { + return bulk_copy_dyn_smem_for_tile_size( + kernel_source.InputIteratorInfos(), tile_size, alignment); }, seq, - wrapped_policy); + policy_getter, + kernel_source, + launcher_factory); } - else if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::memcpy_async == wrapped_policy.Algorithm()) + else if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::memcpy_async == active_policy.algorithm) { return invoke_async_algorithm( + ::cuda::std::move(in), + ::cuda::std::move(out), + num_items, + ::cuda::std::move(pred), + ::cuda::std::move(op), + stream, 
ldgsts_size_and_align, - [this](int tile_size, int alignment) { - return memcpy_async_dyn_smem_for_tile_size(kernel_source.ItValueSizesAlignments(), tile_size, alignment); + [&](int tile_size, int alignment) { + return memcpy_async_dyn_smem_for_tile_size( + kernel_source.InputIteratorInfos(), tile_size, alignment); }, seq, - wrapped_policy); + policy_getter, + kernel_source, + launcher_factory); } else { - return invoke_prefetch_or_vectorized_algorithm(seq, wrapped_policy); + return invoke_prefetch_or_vectorized_algorithm( + ::cuda::std::move(in), + ::cuda::std::move(out), + num_items, + ::cuda::std::move(pred), + ::cuda::std::move(op), + stream, + seq, + policy_getter, + kernel_source, + launcher_factory); } } +}; - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( - ::cuda::std::tuple in, - RandomAccessIteratorOut out, - Offset num_items, - Predicate pred, - TransformOp op, - cudaStream_t stream, - KernelSource kernel_source = {}, - KernelLauncherFactory launcher_factory = {}, - MaxPolicyT max_policy = {}) +template , + ::cuda::std::tuple, + RandomAccessIteratorOut>, + typename KernelSource = TransformKernelSource, + RandomAccessIteratorOut, + Predicate, + TransformOp>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +#if _CCCL_HAS_CONCEPTS() + requires transform_policy_hub +#endif // _CCCL_HAS_CONCEPTS() +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + Predicate pred, + TransformOp op, + cudaStream_t stream, + ArchPolicies arch_policies = {}, + KernelSource kernel_source = {}, + KernelLauncherFactory launcher_factory = {}) +{ + static_assert( + ::cuda::std::is_same_v || ::cuda::std::is_same_v, + "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types"); + + if (num_items == 0) { - if (num_items == 0) - { - return cudaSuccess; - } + return cudaSuccess; + } - int ptx_version = 0; - auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)); - if (cudaSuccess != error) - { - return error; - } + ::cuda::arch_id arch_id{}; + if (const auto error = CubDebug(launcher_factory.PtxArchId(arch_id))) + { + return error; + } - dispatch_t dispatch{ + return dispatch_arch( + arch_policies, + arch_id, + invoke_for_arch<::cuda::std::tuple, + RandomAccessIteratorOut, + Offset, + Predicate, + TransformOp, + KernelSource, + KernelLauncherFactory>{ ::cuda::std::move(in), ::cuda::std::move(out), num_items, ::cuda::std::move(pred), ::cuda::std::move(op), - bulk_copy_alignment(ptx_version), stream, kernel_source, - launcher_factory}; - return CubDebug(max_policy.Invoke(ptx_version, dispatch)); - } -}; + launcher_factory, + arch_id}); +} } // namespace detail::transform CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/kernels/kernel_transform.cuh b/cub/cub/device/dispatch/kernels/kernel_transform.cuh index 081bfff43d0..930d01de7eb 100644 --- a/cub/cub/device/dispatch/kernels/kernel_transform.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_transform.cuh @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -96,7 +97,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It begin, int items) // This kernel guarantees that objects passed as arguments to the user-provided transformation function f reside in // global memory. No intermediate copies are taken. If the parameter type of f is a reference, taking the address of the // parameter yields a global memory address. 
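// ============================================================================
// [Illustrative aside, not part of the patch] What the guarantee in the
// comment above buys the caller: a transformation function taking its
// argument by const& may take the argument's address and relate it to the
// base of the input buffer. A hypothetical user-side sketch:
#include <cub/device/device_transform.cuh>

struct index_of
{
  const float* base; // base pointer of the input buffer

  __device__ int operator()(const float& x) const
  {
    // valid only because &x points into global memory rather than into a
    // register or a shared-memory staging copy
    return static_cast<int>(&x - base);
  }
};
// Usage, with d_in/d_out device pointers and n the element count:
//   cub::DeviceTransform::TransformStableArgumentAddresses(d_in, d_out, n, index_of{d_in});
// ============================================================================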
-template (blockIdx.x) * tile_size; const int valid_items = static_cast((::cuda::std::min) (num_items - offset, Offset{tile_size})); @@ -201,11 +202,18 @@ _CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type() } } -template +// FIXME(bgruber): nvcc 12.0 - 13.1 crash with `error: Internal Compiler Error (codegen): "unexpected error in codegen +// for function: found previous definition of same function!"` when we pass a const& as template parameter (and the +// function template body contains a lambda). As a workaround, we pass the parts of the policy by value. +// TODO(bgruber): In C++20, we should just pass transform_arch_policy by value. +template < // const transform_arch_policy& Policy, + int block_threads, + int items_per_thread, + int vec_size, + typename Offset, + typename F, + typename RandomAccessIteratorOut, + typename... RandomAccessIteratorsIn> _CCCL_DEVICE void transform_kernel_vectorized( Offset num_items, int num_elem_per_thread_prefetch, @@ -214,18 +222,18 @@ _CCCL_DEVICE void transform_kernel_vectorized( RandomAccessIteratorOut out, RandomAccessIteratorsIn... ins) { - constexpr int block_dim = VectorizedPolicy::block_threads; - constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized; - constexpr int vec_size = VectorizedPolicy::vec_size; + // constexpr int block_threads = Policy.vectorized_policy.block_threads; + // constexpr int items_per_thread = Policy.vectorized_policy.items_per_thread_vectorized; + // constexpr int vec_size = Policy.vectorized_policy.vec_size; _CCCL_ASSERT(!can_vectorize || (items_per_thread == num_elem_per_thread_prefetch), ""); - constexpr int tile_size = block_dim * items_per_thread; + constexpr int tile_size = block_threads * items_per_thread; const Offset offset = static_cast(blockIdx.x) * tile_size; const int valid_items = static_cast((::cuda::std::min) (num_items - offset, Offset{tile_size})); // if we cannot vectorize or don't have a full tile, fall back to prefetch kernel if (!can_vectorize || valid_items != tile_size) { - transform_kernel_prefetch( + transform_kernel_prefetch( num_items, num_elem_per_thread_prefetch, always_true_predicate{}, @@ -268,7 +276,7 @@ _CCCL_DEVICE void transform_kernel_vectorized( _CCCL_PRAGMA_UNROLL_FULL() for (int i = 0; i < load_store_count; ++i) { - input_vec[i] = in_vec[i * VectorizedPolicy::block_threads]; + input_vec[i] = in_vec[i * block_threads]; } } else @@ -280,7 +288,7 @@ _CCCL_DEVICE void transform_kernel_vectorized( _CCCL_PRAGMA_UNROLL_FULL() for (int j = 0; j < vec_size; ++j) { - input[i * vec_size + j] = in[i * vec_size * VectorizedPolicy::block_threads + j]; + input[i * vec_size + j] = in[i * vec_size * block_threads + j]; } } } @@ -310,7 +318,7 @@ _CCCL_DEVICE void transform_kernel_vectorized( _CCCL_PRAGMA_UNROLL_FULL() for (int i = 0; i < load_store_count; ++i) { - out_vec[i * VectorizedPolicy::block_threads] = output_vec[i]; + out_vec[i * block_threads] = output_vec[i]; } } else @@ -323,7 +331,7 @@ _CCCL_DEVICE void transform_kernel_vectorized( _CCCL_PRAGMA_UNROLL_FULL() for (int j = 0; j < vec_size; ++j) { - out[i * vec_size * VectorizedPolicy::block_threads + j] = output[i * vec_size + j]; + out[i * vec_size * block_threads + j] = output[i * vec_size + j]; } } } @@ -539,13 +547,18 @@ _CCCL_DEVICE auto copy_and_return_smem_dst_fallback( return reinterpret_cast(dst); } +// FIXME(bgruber): nvcc 12.0 - 13.1 crash with `error: Internal Compiler Error (codegen): "unexpected error in codegen +// for function: found previous definition of same function!"` when we pass a const& 
as template parameter (and the +// function template body contains a lambda). As a workaround, we pass the parts of the policy by value. +// TODO(bgruber): In C++20, we should just pass transform_arch_policy by value. // note: there is no PDL in this kernel since PDL is not supported below Hopper and this kernel is intended for Ampere -template +template < // const transform_arch_policy& Policy, + int block_threads, + typename Offset, + typename Predicate, + typename F, + typename RandomAccessIteratorOut, + typename... InTs> _CCCL_DEVICE void transform_kernel_ldgsts( Offset num_items, int num_elem_per_thread, @@ -559,10 +572,10 @@ _CCCL_DEVICE void transform_kernel_ldgsts( static_assert(ldgsts_size_and_align <= 16); _CCCL_ASSERT(reinterpret_cast(smem) % ldgsts_size_and_align == 0, ""); - constexpr int block_threads = LdgstsPolicy::block_threads; - const int tile_size = block_threads * num_elem_per_thread; - const Offset offset = static_cast(blockIdx.x) * tile_size; - const int valid_items = static_cast(::cuda::std::min(num_items - offset, Offset{tile_size})); + // constexpr int block_threads = Policy.async_copy_policy.block_threads; + const int tile_size = block_threads * num_elem_per_thread; + const Offset offset = static_cast(blockIdx.x) * tile_size; + const int valid_items = static_cast(::cuda::std::min(num_items - offset, Offset{tile_size})); [[maybe_unused]] int smem_offset = 0; // TODO(bgruber): drop checking first block, since gmem buffers are always sufficiently aligned. But this would not @@ -680,16 +693,26 @@ _CCCL_DEVICE void bulk_copy_maybe_unaligned( dst_ptr[bytes_to_copy - tail_bytes + threadIdx.x] = tail_byte; } } - +// FIXME(bgruber): nvcc 12.0 - 13.1 error with `function "void +// cub::_V_300300_SM_750_800_900_1000_1200::detail::transform::transform_kernel_ublkcp< ::policy, int, +// ::cub::_V_300300_SM_750_800_900_1000_1200::detail::transform::always_true_predicate, +// ::cuda::std::__4::logical_and , bool *, int, int > (T2, int, T3, T4, T5, +// ::cub::_V_300300_SM_750_800_900_1000_1200::detail::transform::aligned_base_ptr ...)::[lambda(T1) (instance +// 3)]::operator ()< ::cuda::std::__4::integral_constant > const" has already been defined` when we pass +// a const& as template parameter (and the function template body contains a lambda). As a workaround, we pass the parts +// of the policy by value. +// TODO(bgruber): In C++20, we should just pass transform_arch_policy by value. // Note: we tried implementing work stealing, aka. cluster launch control, aka. UGETNEXTWORKID, (see PR: // https://github.com/NVIDIA/cccl/pull/5099) and the slowdowns on some benchmarks outweighed the benefits on B200. So we // didn't merge the changes. The problem was mostly a 25% increase in integer instructions, as shown by ncu. -template +template < // const transform_arch_policy& Policy, + int block_threads, + int bulk_copy_alignment, + typename Offset, + typename Predicate, + typename F, + typename RandomAccessIteratorOut, + typename... InTs> _CCCL_DEVICE void transform_kernel_ublkcp( Offset num_items, int num_elem_per_thread, @@ -698,8 +721,8 @@ _CCCL_DEVICE void transform_kernel_ublkcp( RandomAccessIteratorOut out, aligned_base_ptr... 
aligned_ptrs) { - constexpr int block_threads = BulkCopyPolicy::block_threads; - constexpr int bulk_copy_alignment = BulkCopyPolicy::bulk_copy_alignment; + // constexpr int block_threads = Policy.async_copy_policy.block_threads; + // constexpr int bulk_copy_alignment = Policy.async_copy_policy.bulk_copy_alignment; // add padding after a tile in shared memory to make space for the next tile's head padding, and retain alignment constexpr int max_alignment = ::cuda::std::max({int{alignof(InTs)}...}); @@ -950,52 +973,57 @@ _CCCL_HOST_DEVICE auto make_aligned_base_ptr_kernel_arg(It ptr, int alignment) - return arg; } -template +template _CCCL_API constexpr int get_block_threads_helper() { - if constexpr (ActivePolicy::algorithm == Algorithm::prefetch) + constexpr transform_arch_policy policy = ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + if constexpr (policy.algorithm == Algorithm::prefetch) { - return ActivePolicy::prefetch_policy::block_threads; + return policy.prefetch_policy.block_threads; } - else if constexpr (ActivePolicy::algorithm == Algorithm::vectorized) + else if constexpr (policy.algorithm == Algorithm::vectorized) { - return ActivePolicy::vectorized_policy::block_threads; + return policy.vectorized_policy.block_threads; } else { - return ActivePolicy::async_policy::block_threads; + return policy.async_copy_policy.block_threads; } } // need a variable template to force constant evaluation of get_block_threads_helper(), otherwise nvcc will give us a // "bad attribute argument substitution" error -template -inline constexpr int get_block_threads = get_block_threads_helper(); +template +inline constexpr int get_block_threads = get_block_threads_helper(); // There is only one kernel for all algorithms, that dispatches based on the selected policy. It must be instantiated // with the same arguments for each algorithm. Only the device compiler will then select the implementation. This // saves some compile-time and binary size. -template -__launch_bounds__(get_block_threads) - CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( - Offset num_items, - int num_elem_per_thread, - [[maybe_unused]] bool can_vectorize, - Predicate pred, - F f, - RandomAccessIteratorOut out, - kernel_arg... ins) + typename... RandomAccessIteratorsIn> +#if _CCCL_HAS_CONCEPTS() + requires transform_policy_hub +#endif // _CCCL_HAS_CONCEPTS() +__launch_bounds__(get_block_threads) CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( + Offset num_items, + int num_elem_per_thread, + [[maybe_unused]] bool can_vectorize, + Predicate pred, + F f, + RandomAccessIteratorOut out, + kernel_arg... 
ins) { _CCCL_ASSERT(blockDim.y == 1 && blockDim.z == 1, "transform_kernel only supports 1D blocks"); - if constexpr (MaxPolicy::ActivePolicy::algorithm == Algorithm::prefetch) + static constexpr transform_arch_policy policy = ArchPolicies{}(::cuda::arch_id{CUB_PTX_ARCH / 10}); + + if constexpr (policy.algorithm == Algorithm::prefetch) { - transform_kernel_prefetch( + transform_kernel_prefetch( num_items, num_elem_per_thread, ::cuda::std::move(pred), @@ -1003,11 +1031,14 @@ __launch_bounds__(get_block_threads) ::cuda::std::move(out), ::cuda::std::move(ins.iterator)...); } - else if constexpr (MaxPolicy::ActivePolicy::algorithm == Algorithm::vectorized) + else if constexpr (policy.algorithm == Algorithm::vectorized) { static_assert(::cuda::std::is_same_v, "Cannot vectorize transform with a predicate"); - transform_kernel_vectorized( + + transform_kernel_vectorized( num_items, num_elem_per_thread, can_vectorize, @@ -1015,11 +1046,11 @@ __launch_bounds__(get_block_threads) ::cuda::std::move(out), ::cuda::std::move(ins.iterator)...); } - else if constexpr (MaxPolicy::ActivePolicy::algorithm == Algorithm::memcpy_async) + else if constexpr (policy.algorithm == Algorithm::memcpy_async) { NV_IF_TARGET( NV_PROVIDES_SM_80, - (transform_kernel_ldgsts( + (transform_kernel_ldgsts( num_items, num_elem_per_thread, ::cuda::std::move(pred), @@ -1027,11 +1058,12 @@ __launch_bounds__(get_block_threads) ::cuda::std::move(out), ::cuda::std::move(ins.aligned_ptr)...);)); } - else if constexpr (MaxPolicy::ActivePolicy::algorithm == Algorithm::ublkcp) + else if constexpr (policy.algorithm == Algorithm::ublkcp) { NV_IF_TARGET( NV_PROVIDES_SM_90, - (transform_kernel_ublkcp( + (transform_kernel_ublkcp( num_items, num_elem_per_thread, ::cuda::std::move(pred), diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh index 5cf4f6ed375..403b8936093 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh @@ -20,10 +20,6 @@ #include #include -#if _CCCL_HAS_CONCEPTS() -# include -#endif // _CCCL_HAS_CONCEPTS() - #if !_CCCL_COMPILER(NVRTC) # include #endif @@ -95,14 +91,9 @@ struct reduce_arch_policy // equivalent of a policy for a single CUDA architectu }; #if _CCCL_HAS_CONCEPTS() -_CCCL_API consteval void __needs_a_constexpr_value(auto) {} - // TODO(bgruber): bikeshed name before we make the tuning API public template -concept reduce_policy_hub = requires(T hub, ::cuda::arch_id arch) { - { hub(arch) } -> _CCCL_CONCEPT_VSTD::same_as; - { __needs_a_constexpr_value(hub(arch)) }; -}; +concept reduce_policy_hub = policy_hub; #endif // _CCCL_HAS_CONCEPTS() template diff --git a/cub/cub/device/dispatch/tuning/tuning_transform.cuh b/cub/cub/device/dispatch/tuning/tuning_transform.cuh index 9199862ee25..00491b590dd 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform.cuh @@ -13,8 +13,6 @@ # pragma system_header #endif // no system header -#include -#include #include #include @@ -22,18 +20,21 @@ #include #include +#include #include #include #include -#include -#include -#include -#include -#include #include -#include #include +#if _CCCL_HAS_CONCEPTS() +# include +#endif // _CCCL_HAS_CONCEPTS() + +#if !_CCCL_COMPILER(NVRTC) +# include +#endif + CUB_NAMESPACE_BEGIN namespace detail::transform { @@ -66,137 +67,187 @@ enum class Algorithm ublkcp }; -template -struct prefetch_policy_t +#if !_CCCL_COMPILER(NVRTC) +inline ::std::ostream& 
operator<<(::std::ostream& os, const Algorithm& algorithm) { - static constexpr int block_threads = BlockThreads; - // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy - static constexpr int items_per_thread_no_input = 2; // when there are no input iterators, the kernel is just filling - static constexpr int min_items_per_thread = 1; - static constexpr int max_items_per_thread = 32; - - // TODO: remove with C++20 - // The value of the below does not matter. - static constexpr int not_a_vectorized_policy = 0; -}; + switch (algorithm) + { + case Algorithm::prefetch: + return os << "Algorithm::prefetch"; + case Algorithm::vectorized: + return os << "Algorithm::vectorized"; + case Algorithm::memcpy_async: + return os << "Algorithm::memcpy_async"; + case Algorithm::ublkcp: + return os << "Algorithm::ublkcp"; + default: + return os << "Algorithm::"; + } +} +#endif // !_CCCL_COMPILER(NVRTC) -CUB_DETAIL_POLICY_WRAPPER_DEFINE( - TransformAgentPrefetchPolicy, - (always_true), - (block_threads, BlockThreads, int), - (items_per_thread_no_input, ItemsPerThreadNoInput, int), - (min_items_per_thread, MinItemsPerThread, int), - (max_items_per_thread, MaxItemsPerThread, int), - (not_a_vectorized_policy, NotAVectorizedPolicy, int) ) // TODO: remove with C++20 - -template -struct vectorized_policy_t : prefetch_policy_t +struct prefetch_policy { - static constexpr int items_per_thread_vectorized = Tuning::items_per_thread; - static constexpr int vec_size = Tuning::vec_size; + int block_threads; + // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy + int items_per_thread_no_input = 2; // when there are no input iterators, the kernel is just filling + int min_items_per_thread = 1; + int max_items_per_thread = 32; - using not_a_vectorized_policy = void; // TODO: remove with C++20, shadows the variable in prefetch_policy_t -}; + [[nodiscard]] _CCCL_API constexpr friend bool operator==(const prefetch_policy& lhs, const prefetch_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread_no_input == rhs.items_per_thread_no_input + && lhs.min_items_per_thread == rhs.min_items_per_thread && lhs.max_items_per_thread == rhs.max_items_per_thread; + } -CUB_DETAIL_POLICY_WRAPPER_DEFINE( - TransformAgentVectorizedPolicy, - (always_true), // TODO: restore with C++20: (TransformAgentPrefetchPolicy), - (block_threads, BlockThreads, int), - (items_per_thread_no_input, ItemsPerThreadNoInput, int), - (min_items_per_thread, MinItemsPerThread, int), - (max_items_per_thread, MaxItemsPerThread, int), - (items_per_thread_vectorized, ItemsPerThreadVectorized, int), - (vec_size, VecSize, int) ) - -template -struct async_copy_policy_t -{ - static constexpr int block_threads = BlockThreads; - // items per tile are determined at runtime. 
these (inclusive) bounds allow overriding that value via a tuning policy - static constexpr int min_items_per_thread = 1; - static constexpr int max_items_per_thread = 32; + [[nodiscard]] _CCCL_API constexpr friend bool operator!=(const prefetch_policy& lhs, const prefetch_policy& rhs) + { + return !(lhs == rhs); + } - static constexpr int bulk_copy_alignment = BulkCopyAlignment; +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const prefetch_policy& policy) + { + return os << "prefetch_policy { .block_threads = " << policy.block_threads << ", .items_per_thread_no_input = " + << policy.items_per_thread_no_input << ", .min_items_per_thread = " << policy.min_items_per_thread + << ", .max_items_per_thread = " << policy.max_items_per_thread << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) }; -CUB_DETAIL_POLICY_WRAPPER_DEFINE( - TransformAgentAsyncPolicy, - (always_true), - (block_threads, BlockThreads, int), - (min_items_per_thread, MinItemsPerThread, int), - (max_items_per_thread, MaxItemsPerThread, int), - (bulk_copy_alignment, BulkCopyAlignment, int) ) - -_CCCL_TEMPLATE(typename PolicyT) -_CCCL_REQUIRES((!TransformAgentPrefetchPolicy && !TransformAgentAsyncPolicy - && !TransformAgentVectorizedPolicy) ) -__host__ __device__ constexpr PolicyT MakePolicyWrapper(PolicyT policy) +struct vectorized_policy { - return policy; -} + int block_threads; + int items_per_thread_vectorized; + int vec_size; + // if we have to fall back to prefetching, use these values: + int prefetch_items_per_thread_no_input = 2; + int prefetch_min_items_per_thread = 1; + int prefetch_max_items_per_thread = 32; + + [[nodiscard]] _CCCL_API constexpr friend bool operator==(const vectorized_policy& lhs, const vectorized_policy& rhs) + { + return lhs.block_threads == rhs.block_threads && lhs.items_per_thread_vectorized == rhs.items_per_thread_vectorized + && lhs.vec_size == rhs.vec_size + && lhs.prefetch_items_per_thread_no_input == rhs.prefetch_items_per_thread_no_input + && lhs.prefetch_min_items_per_thread == rhs.prefetch_min_items_per_thread + && lhs.prefetch_max_items_per_thread == rhs.prefetch_max_items_per_thread; + } -template -struct TransformPolicyWrapper : PolicyT -{ - _CCCL_HOST_DEVICE TransformPolicyWrapper(PolicyT base) - : PolicyT(base) - {} + [[nodiscard]] _CCCL_API constexpr friend bool operator!=(const vectorized_policy& lhs, const vectorized_policy& rhs) + { + return !(lhs == rhs); + } + +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const vectorized_policy& policy) + { + return os + << "vectorized_policy { .block_threads = " << policy.block_threads << ", .items_per_thread_vectorized = " + << policy.items_per_thread_vectorized << ", .vec_size = " << policy.vec_size + << ", .prefetch_items_per_thread_no_input = " << policy.prefetch_items_per_thread_no_input + << ", .prefetch_min_items_per_thread = " << policy.prefetch_min_items_per_thread + << ", .prefetch_max_items_per_thread = " << policy.prefetch_max_items_per_thread << " }"; + } +#endif // !_CCCL_COMPILER(NVRTC) }; -template -struct TransformPolicyWrapper> : StaticPolicyT +struct async_copy_policy { - _CCCL_HOST_DEVICE TransformPolicyWrapper(StaticPolicyT base) - : StaticPolicyT(base) - {} + int block_threads; + int bulk_copy_alignment; // TODO(bgruber): this should probably be removed from the tuning policy + // items per tile are determined at runtime. 
these (inclusive) bounds allow overriding that value via a tuning policy + int min_items_per_thread = 1; + int max_items_per_thread = 32; - _CCCL_HOST_DEVICE static constexpr Algorithm Algorithm() + [[nodiscard]] _CCCL_API constexpr friend bool operator==(const async_copy_policy& lhs, const async_copy_policy& rhs) { - return StaticPolicyT::algorithm; + return lhs.block_threads == rhs.block_threads && lhs.bulk_copy_alignment == rhs.bulk_copy_alignment + && lhs.min_items_per_thread == rhs.min_items_per_thread && lhs.max_items_per_thread == rhs.max_items_per_thread; } - _CCCL_HOST_DEVICE static constexpr int MinBif() + [[nodiscard]] _CCCL_API constexpr friend bool operator!=(const async_copy_policy& lhs, const async_copy_policy& rhs) { - return StaticPolicyT::min_bif; + return !(lhs == rhs); } - _CCCL_HOST_DEVICE static constexpr auto PrefetchPolicy() +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const async_copy_policy& policy) { - return MakePolicyWrapper(typename StaticPolicyT::prefetch_policy()); + return os << "async_copy_policy { .block_threads = " << policy.block_threads << ", .bulk_copy_alignment = " + << policy.bulk_copy_alignment << ", .min_items_per_thread = " << policy.min_items_per_thread + << ", .max_items_per_thread = " << policy.max_items_per_thread << " }"; } +#endif // !_CCCL_COMPILER(NVRTC) +}; - _CCCL_HOST_DEVICE static constexpr auto VectorizedPolicy() +struct transform_arch_policy +{ + int min_bif; + Algorithm algorithm; + prefetch_policy prefetch_policy; + vectorized_policy vectorized_policy; + async_copy_policy async_copy_policy; + + [[nodiscard]] _CCCL_API constexpr friend bool + operator==(const transform_arch_policy& lhs, const transform_arch_policy& rhs) { - return MakePolicyWrapper(typename StaticPolicyT::vectorized_policy()); + return lhs.min_bif == rhs.min_bif && lhs.algorithm == rhs.algorithm && lhs.prefetch_policy == rhs.prefetch_policy + && lhs.vectorized_policy == rhs.vectorized_policy && lhs.async_copy_policy == rhs.async_copy_policy; } - _CCCL_HOST_DEVICE static constexpr auto AsyncPolicy() + [[nodiscard]] _CCCL_API constexpr friend bool + operator!=(const transform_arch_policy& lhs, const transform_arch_policy& rhs) { - return MakePolicyWrapper(typename StaticPolicyT::async_policy()); + return !(lhs == rhs); } -#if defined(CUB_ENABLE_POLICY_PTX_JSON) - _CCCL_DEVICE static constexpr auto EncodedPolicy() +#if !_CCCL_COMPILER(NVRTC) + friend ::std::ostream& operator<<(::std::ostream& os, const transform_arch_policy& policy) { - using namespace ptx_json; - return object() = value(), - key<"algorithm">() = value(StaticPolicyT::algorithm)>(), - key<"prefetch_policy">() = PrefetchPolicy().EncodedPolicy(), - key<"vectorized_policy">() = VectorizedPolicy().EncodedPolicy(), - key<"async_policy">() = AsyncPolicy().EncodedPolicy()>(); + return os + << "transform_arch_policy { .min_bif = " << policy.min_bif << ", .algorithm = " << policy.algorithm + << ", .prefetch_policy = " << policy.prefetch_policy << ", .vectorized_policy = " << policy.vectorized_policy + << ", .async_copy_policy = " << policy.async_copy_policy << " }"; } -#endif // CUB_ENABLE_POLICY_PTX_JSON +#endif // !_CCCL_COMPILER(NVRTC) }; -template -_CCCL_HOST_DEVICE TransformPolicyWrapper MakeTransformPolicyWrapper(PolicyT base) +#if _CCCL_HAS_CONCEPTS() +// TODO(bgruber): bikeshed name before we make the tuning API public +template +concept transform_policy_hub = policy_hub; +#endif // _CCCL_HAS_CONCEPTS() + +struct iterator_info +{ + int value_type_size; + int 
value_type_alignment;
+  bool value_type_is_trivially_relocatable;
+  bool is_contiguous;
+};
+
+template <typename T>
+inline constexpr size_t size_of = sizeof(T);
+
+template <>
+inline constexpr size_t size_of<void> = 0;
+
+template <typename T>
+inline constexpr size_t align_of = alignof(T);
+
+template <>
+inline constexpr size_t align_of<void> = 0;
+
+template <typename It>
+[[nodiscard]] _CCCL_API constexpr auto make_iterator_info() -> iterator_info
 {
-  return TransformPolicyWrapper<PolicyT>(base);
+  using vt = it_value_t<It>;
+  return iterator_info{
+    static_cast<int>(size_of<vt>),
+    static_cast<int>(align_of<vt>),
+    THRUST_NS_QUALIFIER::is_trivially_relocatable_v<vt>,
+    THRUST_NS_QUALIFIER::is_contiguous_iterator_v<It>};
 }
 
 template <typename... RandomAccessIteratorsIn>
@@ -207,18 +258,20 @@ _CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int
 
 constexpr int ldgsts_size_and_align = 16;
 
-template <typename ItValueSizesAlignments>
+template <size_t NumInputs>
 _CCCL_HOST_DEVICE constexpr auto memcpy_async_dyn_smem_for_tile_size(
-  ItValueSizesAlignments it_value_sizes_alignments, int tile_size, int copy_alignment = ldgsts_size_and_align) -> int
+  const ::cuda::std::array<iterator_info, NumInputs>& inputs,
+  int tile_size,
+  int copy_alignment = ldgsts_size_and_align) -> int
 {
   int smem_size = 0;
-  for (auto&& [vt_size, vt_alignment] : it_value_sizes_alignments)
+  for (const auto& input : inputs)
   {
     smem_size =
-      static_cast<int>(::cuda::round_up(smem_size, ::cuda::std::max(static_cast<int>(vt_alignment), copy_alignment)));
+      static_cast<int>(::cuda::round_up(smem_size, ::cuda::std::max(input.value_type_alignment, copy_alignment)));
     // max head/tail padding is copy_alignment - sizeof(T) each
     const int max_bytes_to_copy =
-      static_cast<int>(vt_size) * tile_size + ::cuda::std::max(copy_alignment - static_cast<int>(vt_size), 0) * 2;
+      input.value_type_size * tile_size + ::cuda::std::max(copy_alignment - input.value_type_size, 0) * 2;
     smem_size += max_bytes_to_copy;
   };
   return smem_size;
@@ -226,111 +279,217 @@ _CCCL_HOST_DEVICE constexpr auto memcpy_async_dyn_smem_for_tile_size(
 
 constexpr int bulk_copy_size_multiple = 16;
 
-_CCCL_HOST_DEVICE constexpr auto bulk_copy_alignment(int sm_arch) -> int
+_CCCL_HOST_DEVICE constexpr auto bulk_copy_alignment(::cuda::arch_id arch) -> int
 {
-  return sm_arch < 1000 ? 128 : 16;
+  return arch < ::cuda::arch_id::sm_100 ? 128 : 16;
 }
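As a quick illustration of the iterator classification above (a sketch, not part of the patch; it assumes the
reconstructed make_iterator_info<It>() signature and uses thrust::counting_iterator as an example of a
non-contiguous iterator):

  #include <thrust/iterator/counting_iterator.h>

  // pointers are contiguous and their value types trivially relocatable
  constexpr auto ptr_info = cub::detail::transform::make_iterator_info<int*>();
  static_assert(ptr_info.value_type_size == 4 && ptr_info.value_type_alignment == 4);
  static_assert(ptr_info.value_type_is_trivially_relocatable && ptr_info.is_contiguous);

  // fancy iterators are not contiguous, which rules out the memcpy-based kernels for them
  static_assert(!cub::detail::transform::make_iterator_info<thrust::counting_iterator<int>>().is_contiguous);

The heuristics below consume only these four fields, so anything that fills in the same shape can drive them.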
-template <typename ItValueSizesAlignments>
-_CCCL_HOST_DEVICE constexpr auto
-bulk_copy_dyn_smem_for_tile_size(ItValueSizesAlignments it_value_sizes_alignments, int tile_size, int bulk_copy_align)
-  -> int
+template <size_t NumInputs>
+_CCCL_HOST_DEVICE constexpr auto bulk_copy_dyn_smem_for_tile_size(
+  const ::cuda::std::array<iterator_info, NumInputs>& inputs, int tile_size, int bulk_copy_align) -> int
 {
   // we rely on the tile_size being a multiple of alignments, so shifting offsets/pointers by it retains alignments
   _CCCL_ASSERT(tile_size % bulk_copy_align == 0, "");
   _CCCL_ASSERT(tile_size % bulk_copy_size_multiple == 0, "");
 
   int tile_padding = bulk_copy_align;
-  for (auto&& [_, vt_alignment] : it_value_sizes_alignments)
+  for (const auto& input : inputs)
   {
-    tile_padding = ::cuda::std::max(tile_padding, static_cast<int>(vt_alignment));
+    tile_padding = ::cuda::std::max(tile_padding, input.value_type_alignment);
   }
   int smem_size = tile_padding; // for the barrier and padding
-  for (auto&& [vt_size, _] : it_value_sizes_alignments)
+  for (const auto& input : inputs)
   {
-    smem_size += tile_padding + static_cast<int>(vt_size) * tile_size;
+    smem_size += tile_padding + input.value_type_size * tile_size;
   }
   return smem_size;
 }
 
-_CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
+[[nodiscard]] _CCCL_API constexpr int arch_to_min_bytes_in_flight(::cuda::arch_id arch)
 {
-  if (sm_arch >= 1000)
+  if (arch >= ::cuda::arch_id::sm_100)
   {
     return 64 * 1024; // B200
   }
-  if (sm_arch >= 900)
+  if (arch >= ::cuda::arch_id::sm_90)
   {
     return 48 * 1024; // 32 for H100, 48 for H200
   }
-  if (sm_arch >= 800)
+  if (arch >= ::cuda::arch_id::sm_80)
   {
     return 16 * 1024; // A100
   }
   return 12 * 1024; // V100 and below
 }
 
-template <typename T>
-inline constexpr size_t size_of = sizeof(T);
-
-template <>
-inline constexpr size_t size_of<void> = 0;
-
-template <typename... RandomAccessIteratorsIn>
-_CCCL_HOST_DEVICE static constexpr auto make_sizes_alignments()
+[[nodiscard]] _CCCL_API constexpr auto tuned_vectorized_policy(::cuda::arch_id arch, int store_size, bool is_filling)
 {
-  return ::cuda::std::array<::cuda::std::pair<::cuda::std::size_t, ::cuda::std::size_t>,
-                            sizeof...(RandomAccessIteratorsIn)>{
-    {{sizeof(it_value_t<RandomAccessIteratorsIn>), alignof(it_value_t<RandomAccessIteratorsIn>)}...}};
-}
+  if (is_filling)
+  {
+    // manually tuned fill on RTX 5090
+    // TODO(bgruber): re-enable this later! It's disabled to avoid SASS changes in PR #6914
+    // if (arch >= ::cuda::arch_id::sm_120)
+    // {
+    //   return vectorized_policy{256, 8, 4};
+    // }
+    // manually tuned fill on B200, same as H200
+    if (arch >= ::cuda::arch_id::sm_90)
+    {
+      return vectorized_policy{
+        store_size > 4 ? 128 : 256, 16, ::cuda::std::max(8 / store_size, 1) /* 64-bit instructions */};
+    }
+    // manually tuned fill on A100
+    if (arch >= ::cuda::arch_id::sm_80)
+    {
+      return vectorized_policy{256, 8, ::cuda::std::max(8 / store_size, 1) /* 64-bit instructions */};
+    }
+  }
 
-template <int PtxVersion, int StoreSize>
-struct tuning_vec
-{ // defaults from fill on RTX 5090, but can be changed
-  static constexpr int block_threads = 256;
-  static constexpr int vec_size = 4;
-  static constexpr int items_per_thread = 8;
-};
+  return vectorized_policy{256, 8, 4};
+}
 
-// manually tuned fill on A100
-template <int StoreSize>
-struct tuning_vec<800, StoreSize>
+template <size_t InputCount>
+struct arch_policies
 {
-  static constexpr int block_threads = 256;
-  static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
-  static constexpr int items_per_thread = 8;
-};
+  bool requires_stable_address;
+  bool dense_output;
+  ::cuda::std::array<iterator_info, InputCount> inputs;
+  iterator_info output;
 
-// manually tuned fill on H200
-template <int StoreSize>
-struct tuning_vec<900, StoreSize>
-{
-  static constexpr int block_threads = StoreSize > 4 ? 128 : 256;
-  static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
-  static constexpr int items_per_thread = 16;
+  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> transform_arch_policy
+  {
+    const bool no_input_streams = InputCount == 0;
+
+    bool all_inputs_contiguous = true;
+    bool all_input_values_trivially_reloc = true;
+    bool can_memcpy_contiguous_inputs = true;
+    bool all_value_types_have_power_of_two_size = ::cuda::is_power_of_two(::cuda::std::max(1, output.value_type_size));
+    for (const auto& input : inputs)
+    {
+      all_inputs_contiguous &= input.is_contiguous;
+      all_input_values_trivially_reloc &= input.value_type_is_trivially_relocatable;
+      // the vectorized kernel supports mixing contiguous and non-contiguous iterators
+      can_memcpy_contiguous_inputs &= !input.is_contiguous || input.value_type_is_trivially_relocatable;
+      all_value_types_have_power_of_two_size &= ::cuda::is_power_of_two(input.value_type_size);
+    }
+    const bool can_memcpy_all_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
+    const bool fallback_to_prefetch = requires_stable_address || !can_memcpy_contiguous_inputs
+                                   || !all_value_types_have_power_of_two_size || !dense_output;
+    const int min_bif = arch_to_min_bytes_in_flight(arch);
+
+    if (arch >= ::cuda::arch_id::sm_90) // handles sm_100 as well
+    {
+      const int async_block_size = arch < ::cuda::arch_id::sm_100 ? 256 : 128;
+      const int alignment = bulk_copy_alignment(arch);
+
+      const auto prefetch = prefetch_policy{256};
+      const auto vectorized =
+        tuned_vectorized_policy(arch, ::cuda::std::max(1, output.value_type_size), InputCount == 0);
+      const auto async = async_copy_policy{async_block_size, alignment};
+
+      // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
+      // forward compatible. If a user compiled for sm_xxx and we assume the available SMEM for that architecture, but
+      // then runs on the next architecture after that, which may have a smaller available SMEM, we get a crash.
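+      // For a concrete feel for the bound checked next (illustrative numbers only, not part of the heuristic):
+      // a single int input on sm_90 gives alignment = 128 and async_block_size = 256, so the smallest candidate
+      // tile holds 256 * 1 items, and bulk_copy_dyn_smem_for_tile_size charges 128 bytes of barrier/padding plus
+      // 128 + 4 * 256 = 1152 bytes for the input tile, i.e. 1280 bytes in total, far below max_smem_per_block.
+      // The check therefore only trips with many inputs or large value types.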
+ const bool exhaust_smem = + bulk_copy_dyn_smem_for_tile_size(inputs, async_block_size * async.min_items_per_thread, alignment) + > int{max_smem_per_block}; + + // if each tile size is a multiple of the bulk copy and maximum value type alignments, the alignment is retained + // if the base pointer is sufficiently aligned (the correct check would be if it's a multiple of all value types + // following the current tile). we would otherwise need to realign every SMEM tile individually, which is costly + // and complex, so let's fall back in this case. + int max_alignment = alignment; + for (const auto& input : inputs) + { + max_alignment = ::cuda::std::max({max_alignment, input.value_type_alignment}); + } + + bool tile_sizes_retain_alignment = true; + for (const auto& input : inputs) + { + tile_sizes_retain_alignment &= (input.value_type_size * async_block_size) % max_alignment == 0; + } + + // on Hopper, the vectorized kernel performs better for 1 and 2 byte values, except for BabelStream mul (1 input) + bool vector_kernel_is_faster = arch == ::cuda::arch_id::sm_90 && output.value_type_size < 4 && InputCount > 1; + for (const auto& input : inputs) + { + vector_kernel_is_faster &= input.value_type_size < 4; + } + + const bool enough_threads_for_peeling = async_block_size >= alignment; // head and tail bytes + const bool fallback_to_vectorized = exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling + || no_input_streams || !can_memcpy_all_inputs || vector_kernel_is_faster; + + const auto algorithm = + fallback_to_prefetch ? Algorithm::prefetch + : fallback_to_vectorized + ? Algorithm::vectorized + : Algorithm::ublkcp; + + return transform_arch_policy{ + min_bif, + algorithm, + prefetch, + vectorized, + async, + }; + } + else if (arch >= ::cuda::arch_id::sm_80) + { + const int block_threads = 256; + const auto prefetch = prefetch_policy{block_threads}; + const auto vectorized = + tuned_vectorized_policy(arch, ::cuda::std::max(1, output.value_type_size), InputCount == 0); + const auto async = async_copy_policy{block_threads, ldgsts_size_and_align}; + + // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not + // forward compatible. If a user compiled for sm_xxx and we assume the available SMEM for that architecture, but + // then runs on the next architecture after that, which may have a smaller available SMEM, we get a crash. + const bool exhaust_smem = + memcpy_async_dyn_smem_for_tile_size( + inputs, block_threads * async.min_items_per_thread, ldgsts_size_and_align) + > int{max_smem_per_block}; + const bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs; + + const auto algorithm = + fallback_to_prefetch ? Algorithm::prefetch + : fallback_to_vectorized + ? Algorithm::vectorized + : Algorithm::memcpy_async; + + return transform_arch_policy{ + min_bif, + algorithm, + prefetch, + vectorized, + async, + }; + } + + // fallback + return transform_arch_policy{ + min_bif, + fallback_to_prefetch ? 
Algorithm::prefetch : Algorithm::vectorized, + prefetch_policy{256}, + tuned_vectorized_policy(::cuda::arch_id::sm_60, ::cuda::std::max(1, output.value_type_size), InputCount == 0), + async_copy_policy{}, // never used + }; + } }; -// manually tuned fill on B200, same as H200 -template -struct tuning_vec<1000, StoreSize> : tuning_vec<900, StoreSize> -{}; - -// manually tuned fill on RTX 5090 -template -struct tuning_vec<1200, StoreSize> -{ - static constexpr int block_threads = 256; - static constexpr int vec_size = 4; - static constexpr int items_per_thread = 8; -}; +#if _CCCL_HAS_CONCEPTS() +static_assert(transform_policy_hub>); +#endif // _CCCL_HAS_CONCEPTS() +// stateless version which can be passed to kernels template -struct policy_hub +struct arch_policies_from_types { static_assert(sizeof(RandomAccessIteratorTupleIn) == 0, "Second parameter must be a tuple"); }; @@ -339,137 +498,20 @@ template -struct policy_hub, - RandomAccessIteratorOut> +struct arch_policies_from_types, + RandomAccessIteratorOut> { - static constexpr bool no_input_streams = sizeof...(RandomAccessIteratorsIn) == 0; - static constexpr bool all_inputs_contiguous = - (THRUST_NS_QUALIFIER::is_contiguous_iterator_v && ...); - static constexpr bool all_input_values_trivially_reloc = - (THRUST_NS_QUALIFIER::is_trivially_relocatable_v> && ...); - static constexpr bool can_memcpy_all_inputs = all_inputs_contiguous && all_input_values_trivially_reloc; - // the vectorized kernel supports mixing contiguous and non-contiguous iterators - static constexpr bool can_memcpy_contiguous_inputs = - ((!THRUST_NS_QUALIFIER::is_contiguous_iterator_v - || THRUST_NS_QUALIFIER::is_trivially_relocatable_v>) - && ...); - - static constexpr bool all_value_types_have_power_of_two_size = - (::cuda::is_power_of_two(sizeof(it_value_t)) && ...) - && ::cuda::is_power_of_two(size_of>); - - static constexpr bool fallback_to_prefetch = - RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_value_types_have_power_of_two_size || !DenseOutput; - - // TODO(bgruber): consider a separate kernel for just filling - - struct policy300 : ChainedPolicy<300, policy300, policy300> + [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id arch) const -> transform_arch_policy { - static constexpr int min_bif = arch_to_min_bytes_in_flight(300); - using prefetch_policy = prefetch_policy_t<256>; - using vectorized_policy = vectorized_policy_t< - tuning_vec<500, size_of>, sizeof(it_value_t)...>>; - using async_policy = async_copy_policy_t<256, 16>; // dummy policy, never used - static constexpr auto algorithm = fallback_to_prefetch ? Algorithm::prefetch : Algorithm::vectorized; - }; - - struct policy800 : ChainedPolicy<800, policy800, policy300> - { - private: - static constexpr int block_threads = 256; - - public: - static constexpr int min_bif = arch_to_min_bytes_in_flight(800); - using prefetch_policy = prefetch_policy_t; - using vectorized_policy = vectorized_policy_t< - tuning_vec<800, size_of>, sizeof(it_value_t)...>>; - using async_policy = async_copy_policy_t; - - private: - // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not - // forward compatible. If a user compiled for sm_xxx and we assume the available SMEM for that architecture, but - // then runs on the next architecture after that, which may have a smaller available SMEM, we get a crash. 
- static constexpr bool exhaust_smem = - memcpy_async_dyn_smem_for_tile_size( - make_sizes_alignments(), - block_threads* async_policy::min_items_per_thread, - ldgsts_size_and_align) - > int{max_smem_per_block}; - static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs; - - public: - static constexpr auto algorithm = - fallback_to_prefetch ? Algorithm::prefetch - : fallback_to_vectorized - ? Algorithm::vectorized - : Algorithm::memcpy_async; - }; - - template - struct bulk_copy_policy_base - { - private: - static constexpr int alignment = bulk_copy_alignment(PtxVersion); - - public: - static constexpr int min_bif = arch_to_min_bytes_in_flight(PtxVersion); - using prefetch_policy = prefetch_policy_t<256>; - using vectorized_policy = - vectorized_policy_t>, - sizeof(it_value_t)...>>; - using async_policy = async_copy_policy_t; - - private: - // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not - // forward compatible. If a user compiled for sm_xxx and we assume the available SMEM for that architecture, but - // then runs on the next architecture after that, which may have a smaller available SMEM, we get a crash. - static constexpr bool exhaust_smem = - bulk_copy_dyn_smem_for_tile_size( - make_sizes_alignments(), - AsyncBlockSize* async_policy::min_items_per_thread, - alignment) - > int{max_smem_per_block}; - - // on Hopper, the vectorized kernel performs better for 1 and 2 byte values - static constexpr bool use_vector_kernel_on_hopper = - ((size_of> < 4) && ...) && sizeof...(RandomAccessIteratorsIn) > 1 - && size_of> < 4; - - // if each tile size is a multiple of the bulk copy and maximum value type alignments, the alignment is retained if - // the base pointer is sufficiently aligned (the correct check would be if it's a multiple of all value types - // following the current tile). we would otherwise need to realign every SMEM tile individually, which is costly and - // complex, so let's fall back in this case. - static constexpr int max_alignment = - ::cuda::std::max({alignment, int{alignof(it_value_t)}...}); - static constexpr bool tile_sizes_retain_alignment = - (((int{sizeof(it_value_t)} * AsyncBlockSize) % max_alignment == 0) && ...); - static constexpr bool enough_threads_for_peeling = AsyncBlockSize >= alignment; // head and tail bytes - static constexpr bool fallback_to_vectorized = - exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams - || !can_memcpy_all_inputs || (PtxVersion == 900 && use_vector_kernel_on_hopper); - - public: - static constexpr auto algorithm = - fallback_to_prefetch ? Algorithm::prefetch - : fallback_to_vectorized - ? 
Algorithm::vectorized - : Algorithm::ublkcp; - }; - - struct policy900 - : bulk_copy_policy_base<256, 900> - , ChainedPolicy<900, policy900, policy800> - {}; - - struct policy1000 - : bulk_copy_policy_base<128, 1000> - , ChainedPolicy<1000, policy1000, policy900> - {}; - - using max_policy = policy1000; + constexpr auto policies = arch_policies{ + RequiresStableAddress, + DenseOutput, + {make_iterator_info()...}, + make_iterator_info()}; + return policies(arch); + } }; } // namespace detail::transform diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index cbccb3a0933..d1884a5a121 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -30,6 +30,10 @@ #include #include +#if _CCCL_HAS_CONCEPTS() +# include +#endif // _CCCL_HAS_CONCEPTS() + #if !_CCCL_COMPILER(NVRTC) # include // saves 146ms compile-time over (CCCL 3.1) # if defined(CUB_DEFINE_RUNTIME_POLICIES) @@ -831,6 +835,88 @@ private: } #endif // !_CCCL_COMPILER(NVRTC) }; + +namespace detail +{ +#if !defined(CUB_DEFINE_RUNTIME_POLICIES) && !_CCCL_COMPILER(NVRTC) + +template <::cuda::arch_id ArchId, typename ArchPolicies, typename FunctorT> +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t call_for_arch(ArchPolicies arch_policies, FunctorT&& f) +{ + static constexpr auto policy = arch_policies(ArchId); +# if _CCCL_STD_VER >= 2020 + // we instantiate f only for each distinct policy (!!) + return f(::cuda::std::integral_constant{}); +# else + return f([] { + return policy; + }); +# endif +} + +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t +dispatch_compiled_for_arches(ArchPolicies arch_policies, ::cuda::arch_id device_arch, FunctorT&& f) +{ + cudaError_t e = cudaErrorInvalidDeviceFunction; + (..., + (device_arch == ::cuda::arch_id{(CudaArches * ArchMult) / 10} + ? (e = call_for_arch<::cuda::arch_id{(CudaArches * ArchMult) / 10}>(arch_policies, f)) + : cudaSuccess)); + return e; +} + +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_all_arches( + ArchPolicies arch_policies, ::cuda::arch_id device_arch, FunctorT&& f, ::cuda::std::index_sequence) +{ + static constexpr auto all_arches = ::cuda::__all_arch_ids(); + cudaError_t e = cudaErrorInvalidDeviceFunction; + (..., (device_arch == all_arches[Is] ? 
(e = call_for_arch(arch_policies, f)) : cudaSuccess)); + return e; +} + +template +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t +dispatch_arch(ArchPolicies arch_policies, ::cuda::arch_id device_arch, F&& f) +{ +# ifdef __CUDA_ARCH_LIST__ + return dispatch_compiled_for_arches<1, __CUDA_ARCH_LIST__>(arch_policies, device_arch, ::cuda::std::forward(f)); +# elif defined(NV_TARGET_SM_INTEGER_LIST) + return dispatch_compiled_for_arches<10, NV_TARGET_SM_INTEGER_LIST>( + arch_policies, device_arch, ::cuda::std::forward(f)); +# else + // some compilers don't tell us what arches we are compiling for, so we use all of them + return dispatch_all_arches( + arch_policies, + device_arch, + ::cuda::std::forward(f), + ::cuda::std::make_index_sequence<::cuda::__all_arch_ids().size()>{}); +# endif +} +#else // !defined(CUB_DEFINE_RUNTIME_POLICIES) && !_CCCL_COMPILER(NVRTC) +_CCCL_EXEC_CHECK_DISABLE +template +_CCCL_API _CCCL_FORCEINLINE cudaError_t dispatch_arch(ArchPolicies arch_policies, ::cuda::arch_id device_arch, F&& f) +{ + return f([&] { + return arch_policies(device_arch); + }); +} +#endif // !defined(CUB_DEFINE_RUNTIME_POLICIES) && !_CCCL_COMPILER(NVRTC) + +#if _CCCL_HAS_CONCEPTS() +_CCCL_API consteval void __needs_a_constexpr_value(auto) {} + +// TODO(bgruber): bikeshed name before we make the tuning API public +template +concept policy_hub = requires(T hub, ::cuda::arch_id arch) { + { hub(arch) } -> _CCCL_CONCEPT_VSTD::same_as; + { __needs_a_constexpr_value(hub(arch)) }; +}; +#endif // _CCCL_HAS_CONCEPTS() +} // namespace detail + CUB_NAMESPACE_END #if _CCCL_CUDA_COMPILATION() && !_CCCL_COMPILER(NVRTC) diff --git a/thrust/thrust/system/cuda/detail/fill.h b/thrust/thrust/system/cuda/detail/fill.h index b18489b07cd..b41c0280b5b 100644 --- a/thrust/thrust/system/cuda/detail/fill.h +++ b/thrust/thrust/system/cuda/detail/fill.h @@ -73,13 +73,7 @@ fill_n(execution_policy& policy, OutputIterator first, Size count, cons cudaError_t status; THRUST_INDEX_TYPE_DISPATCH( status, - (CUB_NS_QUALIFIER::detail::transform::dispatch_t< - CUB_NS_QUALIFIER::detail::transform::requires_stable_address::no, - decltype(count_fixed), - ::cuda::std::tuple<>, - OutputIterator, - Predicate, - TransformOp>::dispatch), + (CUB_NS_QUALIFIER::detail::transform::dispatch), count, (::cuda::std::tuple<>{}, first, count_fixed, Predicate{}, TransformOp{value}, cuda_cub::stream(policy))); throw_on_error(status, "fill_n: failed inside CUB"); diff --git a/thrust/thrust/system/cuda/detail/generate.h b/thrust/thrust/system/cuda/detail/generate.h index 5f56a3d8352..f347565575a 100644 --- a/thrust/thrust/system/cuda/detail/generate.h +++ b/thrust/thrust/system/cuda/detail/generate.h @@ -34,13 +34,7 @@ generate_n(execution_policy& policy, OutputIt result, Size count, Gener cudaError_t status; THRUST_INDEX_TYPE_DISPATCH( status, - (CUB_NS_QUALIFIER::detail::transform::dispatch_t< - CUB_NS_QUALIFIER::detail::transform::requires_stable_address::no, - decltype(count_fixed), - ::cuda::std::tuple<>, - OutputIt, - Predicate, - Generator>::dispatch), + (CUB_NS_QUALIFIER::detail::transform::dispatch), count, (::cuda::std::tuple<>{}, result, count_fixed, Predicate{}, generator, cuda_cub::stream(policy))); throw_on_error(status, "generate_n: failed inside CUB"); diff --git a/thrust/thrust/system/cuda/detail/transform.h b/thrust/thrust/system/cuda/detail/transform.h index 8b1d10695c4..6a735ad4c72 100644 --- a/thrust/thrust/system/cuda/detail/transform.h +++ b/thrust/thrust/system/cuda/detail/transform.h @@ -175,12 +175,7 @@ OutputIt _CCCL_API 
_CCCL_FORCEINLINE cub_transform_many( cudaError_t status; THRUST_INDEX_TYPE_DISPATCH( status, - (cub::detail::transform::dispatch_t, - OutputIt, - Predicate, - TransformOp>::dispatch), + (cub::detail::transform::dispatch), num_items, (firsts, result, num_items_fixed, pred, transform_op, cuda_cub::stream(policy))); throw_on_error(status, "transform: failed inside CUB");
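Taken together, the pieces above compose as follows: a stateless policy hub maps a ::cuda::arch_id to a constexpr
transform_arch_policy, and cub::detail::dispatch_arch hands that policy to a functor as a compile-time constant,
instantiating the functor once per distinct policy rather than once per architecture. A minimal host-side sketch
(hypothetical names fill_policies and configure_for_device, not part of this patch; assumes a C++20 build without
CUB_DEFINE_RUNTIME_POLICIES and the signatures shown above):

  // models the cub::detail::policy_hub concept
  struct fill_policies
  {
    [[nodiscard]] constexpr auto operator()(::cuda::arch_id arch) const
      -> cub::detail::transform::transform_arch_policy
    {
      // delegate to the real heuristic: no inputs (pure fill), dense int output
      return cub::detail::transform::arch_policies<0>{
        /* requires_stable_address */ false,
        /* dense_output */ true,
        {},
        cub::detail::transform::make_iterator_info<int*>()}(arch);
    }
  };

  cudaError_t configure_for_device(::cuda::arch_id device_arch)
  {
    return cub::detail::dispatch_arch(fill_policies{}, device_arch, [](auto policy_getter) {
      constexpr auto policy = policy_getter(); // the selected policy, as a constant expression
      static_assert(policy.prefetch_policy.block_threads > 0);
      // a real caller would derive its launch configuration (block size, dynamic SMEM) from `policy` here
      return cudaSuccess;
    });
  }

Because dispatch keys on the policy value rather than on the architecture, two architectures that map to the same
transform_arch_policy share a single instantiation of the lambda, mirroring the "we instantiate f only for each
distinct policy" comment in call_for_arch.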