diff --git a/cmake/detray-compiler-options-cuda.cmake b/cmake/detray-compiler-options-cuda.cmake
index bc8a3c653..e426e4be1 100644
--- a/cmake/detray-compiler-options-cuda.cmake
+++ b/cmake/detray-compiler-options-cuda.cmake
@@ -19,13 +19,6 @@ if(PROJECT_IS_TOP_LEVEL)
         detray_add_flag( CMAKE_CUDA_FLAGS "-Xcompiler /Zc:__cplusplus" )
     endif()
 
-    # Set the CUDA architecture to build code for.
-    set(CMAKE_CUDA_ARCHITECTURES
-        "52"
-        CACHE STRING
-        "CUDA architectures to build device code for"
-    )
-
     if("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA")
         # Allow to use functions in device code that are constexpr, even if they are
         # not marked with __device__.
diff --git a/core/include/detray/geometry/shapes/concentric_cylinder2D.hpp b/core/include/detray/geometry/shapes/concentric_cylinder2D.hpp
index 729bceb6a..82ae039ea 100644
--- a/core/include/detray/geometry/shapes/concentric_cylinder2D.hpp
+++ b/core/include/detray/geometry/shapes/concentric_cylinder2D.hpp
@@ -112,7 +112,7 @@ class concentric_cylinder2D {
         const scalar_t tol = std::numeric_limits<scalar_t>::epsilon(),
         const scalar_t /*edge_tol*/ = 0.f) const {
 
-        return (bounds[e_lower_z] - tol <= loc_p[1] &&
+        return (bounds[e_lower_z] <= loc_p[1] + tol &&
                 loc_p[1] <= bounds[e_upper_z] + tol);
     }
     /// @}
diff --git a/core/include/detray/propagator/rk_stepper.hpp b/core/include/detray/propagator/rk_stepper.hpp
index 5c290e1bf..25c5f5ed7 100644
--- a/core/include/detray/propagator/rk_stepper.hpp
+++ b/core/include/detray/propagator/rk_stepper.hpp
@@ -179,7 +179,7 @@ class rk_stepper final
         scalar_type m_next_step_size{0.f};
 
         /// Magnetic field view
-        const magnetic_field_t m_magnetic_field;
+        magnetic_field_t m_magnetic_field;
     };
 
     /// Take a step, using an adaptive Runge-Kutta algorithm.
diff --git a/tests/benchmarks/cuda/CMakeLists.txt b/tests/benchmarks/cuda/CMakeLists.txt
index 1c3f8bbb6..6a31b7463 100644
--- a/tests/benchmarks/cuda/CMakeLists.txt
+++ b/tests/benchmarks/cuda/CMakeLists.txt
@@ -38,3 +38,13 @@ foreach(algebra ${algebra_plugins})
         PRIVATE "-march=native" "-ftree-vectorize"
     )
 endforeach()
+
+detray_add_executable(cuda_propagation
+    "propagation_new.cpp"
+    LINK_LIBRARIES detray::benchmark_cuda_array detray::core_array vecmem::cuda detray::test_common
+)
+
+target_compile_options(
+    detray_cuda_propagation
+    PRIVATE "-march=native" "-ftree-vectorize"
+)
diff --git a/tests/benchmarks/cuda/propagation_new.cpp b/tests/benchmarks/cuda/propagation_new.cpp
new file mode 100644
index 000000000..94771d6c9
--- /dev/null
+++ b/tests/benchmarks/cuda/propagation_new.cpp
@@ -0,0 +1,136 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+// Project include(s)
+#include "detray/navigation/navigator.hpp"
+#include "detray/propagator/actors.hpp"
+#include "detray/propagator/rk_stepper.hpp"
+#include "detray/tracks/tracks.hpp"
+
+// Detray benchmark include(s)
+#include "detray/benchmarks/device/cuda/propagator.hpp"
+#include "detray/benchmarks/propagation_benchmark_utils.hpp"
+#include "detray/benchmarks/types.hpp"
+
+// Detray test include(s)
+#include "detray/test/common/bfield.hpp"
+#include "detray/test/common/build_toy_detector.hpp"
+#include "detray/test/common/track_generators.hpp"
+
+// Vecmem include(s)
+#include <vecmem/memory/cuda/device_memory_resource.hpp>
+#include <vecmem/memory/cuda/host_memory_resource.hpp>
+#include <vecmem/memory/host_memory_resource.hpp>
+
+// System include(s)
+#include <chrono>
+#include <ctime>
+#include <iostream>
+#include <ratio>
+#include <string>
+
+using namespace detray;
+
+int main(int argc, char** argv) {
+
+    using metadata_t = benchmarks::toy_metadata;
+    using toy_detector_t = detector<metadata_t>;
+    using algebra_t = typename toy_detector_t::algebra_type;
+    using scalar = dscalar<algebra_t>;
+    using vector3 = dvector3D<algebra_t>;
+
+    using free_track_parameters_t = free_track_parameters<algebra_t>;
+    using uniform_gen_t =
+        detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
+    using track_generator_t =
+        random_track_generator<free_track_parameters_t, uniform_gen_t>;
+    using field_bknd_t = bfield::const_bknd_t<benchmarks::scalar>;
+
+    // vecmem::host_memory_resource host_mr;
+    vecmem::cuda::host_memory_resource host_mr;  //< pinned memory
+    vecmem::cuda::device_memory_resource dev_mr;
+
+    //
+    // Configuration
+    //
+
+    std::size_t n_tracks{262144u};
+    if (argc > 1) {
+        n_tracks = static_cast<std::size_t>(atoi(argv[1]));
+    }
+
+    // Constant magnetic field
+    vector3 B{0.f, 0.f, 2.f * unit<scalar>::T};
+
+    // Configure toy detector
+    toy_det_config<scalar> toy_cfg{};
+    toy_cfg.use_material_maps(false).n_brl_layers(4u).n_edc_layers(7u);
+
+    std::cout << toy_cfg << std::endl;
+
+    // Configure propagation
+    propagation::config prop_cfg{};
+    prop_cfg.navigation.search_window = {3u, 3u};
+
+    std::cout << prop_cfg << std::endl;
+
+    //
+    // Prepare data
+    //
+    // Generate track sample for strong scaling
+    track_generator_t::configuration trk_cfg{};
+    trk_cfg.n_tracks(n_tracks);
+    trk_cfg.seed(detail::random_numbers<scalar>::default_seed());
+
+    std::cout << trk_cfg << std::endl;
+
+    track_generator_t trk_gen{trk_cfg};
+
+    dvector<free_track_parameters_t> single_sample =
+        detray::benchmarks::generate_tracks(&host_mr, trk_gen, true);
+
+    const auto [toy_det, names] =
+        build_toy_detector<algebra_t>(host_mr, toy_cfg);
+
+    auto bfield = create_const_field<scalar>(B);
+
+    pointwise_material_interactor<algebra_t>::state interactor_state{};
+    parameter_resetter<algebra_t>::state resetter_state{};
+
+    auto actor_states =
+        detail::make_tuple<dtuple>(interactor_state, resetter_state);
+
+    //
+    // Register benchmarks
+    //
+    std::cout << "\n----------------------\n"
+              << "Propagation Test\n"
+              << "----------------------\n\n";
+
+    using navigator_t = navigator_type<metadata_t>;
+    using stepper_t = stepper_type<metadata_t, field_bknd_t>;
+    using actor_chain_t = default_chain<algebra_t>;
+
+    prop_cfg.stepping.do_covariance_transport = true;
+    cuda_propagation<navigator_t, stepper_t, actor_chain_t> propagator{
+        prop_cfg};
+
+    std::chrono::high_resolution_clock::time_point t1 =
+        std::chrono::high_resolution_clock::now();
+    propagator(&dev_mr, &toy_det, &bfield, &single_sample, &actor_states);
+    std::chrono::high_resolution_clock::time_point t2 =
+        std::chrono::high_resolution_clock::now();
+
+    const auto total_time =
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
+    const double total_time_ms{total_time.count() * 1000.};
+
+    // Assumption: 1 event = 3000 truth tracks + 2 seeds per track
+    std::cout << "It took: " << total_time_ms << "ms ("
+              << total_time_ms / (static_cast<double>(n_tracks) / 3000.)
+              << " ms/evt)" << std::endl;
+}
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt b/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt
index 5ed1e442c..787d8b41e 100644
--- a/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt
+++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt
@@ -30,6 +30,8 @@ foreach(algebra ${algebra_plugins})
         STATIC
         "propagation_benchmark.hpp"
         "propagation_benchmark.cu"
+        "propagator.hpp"
+        "propagator.cu"
     )
 
     add_library(
@@ -40,9 +42,15 @@ foreach(algebra ${algebra_plugins})
     target_link_libraries(
         detray_benchmark_cuda_${algebra}
         PUBLIC
+            CUDA::cudart
             vecmem::cuda
             detray::benchmarks
             detray::test_common
             detray::core_${algebra}
     )
+
+    set_property(
+        TARGET detray_benchmark_cuda_${algebra}
+        PROPERTY CUDA_ARCHITECTURES 75
+    )
 endforeach()
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp
index fa121705b..9b6e5f359 100644
--- a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp
+++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp
@@ -104,7 +104,7 @@ template <typename propagator_t>
 void release_actor_states(
     typename propagator_t::actor_chain_type::state_tuple *);
 
-/// Device Propagation becnhmark
+/// Device Propagation benchmark
 template <typename propagator_t, typename bfield_bknd_t,
           detray::benchmarks::propagation_opt kOPT =
               detray::benchmarks::propagation_opt::e_unsync>
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.cu b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.cu
new file mode 100644
index 000000000..413b5bb37
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.cu
@@ -0,0 +1,372 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2026 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+// Project include(s)
+#include "detray/benchmarks/device/cuda/propagator.hpp"
+#include "detray/definitions/detail/cuda_definitions.hpp"
+#include "detray/utils/logging.hpp"
+
+// CUDA include(s)
+#include <cooperative_groups.h>
+
+#include <cuda/pipeline>
+
+namespace detray {
+
+/// Run the stepper
+template <typename stepper_t, typename actor_chain_t>
+__device__ inline void run_actors(
+    typename stepper_t::state &stepping,
+    typename actor_chain_t::state_ref_tuple &actor_states) {
+
+    using stepper_state_t = typename stepper_t::state;
+    using actor_states_t = typename actor_chain_t::state_ref_tuple;
+
+    // auto block = cooperative_groups::this_thread_block();
+
+    // constexpr actor_chain_t run_actors{};
+}
+
+/// Run the stepper
+template <typename stepper_t, typename actor_chain_t>
+__device__ inline void take_step(
+    const stepping::config &cfg, typename stepper_t::state &stepping,
+    typename actor_chain_t::state_ref_tuple &actor_states) {
+
+    using stepper_state_t = typename stepper_t::state;
+    using actor_states_t = typename actor_chain_t::state_ref_tuple;
+
+    // auto block = cooperative_groups::this_thread_block();
+
+    // constexpr stepper_t stepper{};
+    constexpr actor_chain_t run_actors{};
+}
+
+/// Run the navigator
+template <typename navigator_t>
+__device__ inline void navigate(const navigation::config &cfg,
+                                typename navigator_t::state &navigation) {
+
+    using detector_t = typename navigator_t::detector_type;
+    using navigation_state_t = typename navigator_t::state;
+
+    // auto block = cooperative_groups::this_thread_block();
+}
+
+/// Specialize the warps to run stepping, navigation and actors independently
+template <typename navigator_t, typename stepper_t, typename actor_chain_t>
+__global__ void __launch_bounds__(256, 4) propagation_kernel(
+    const propagation::config cfg,
+    const typename navigator_t::detector_type *pinned_detector_ptr,
+    typename stepper_t::magnetic_field_type field_view,
+    vecmem::data::vector_view<typename stepper_t::free_track_parameters_type>
+        tracks_view,
+    // vecmem::data::vector_view<unsigned int> stepper_res_view,
+    // vecmem::data::vector_view<unsigned int> navigator_res_view,
+    typename actor_chain_t::state_tuple *pinned_actor_state_ptr) {
+
+    using detector_device_t = typename navigator_t::detector_type;
+    using algebra_t = typename detector_device_t::algebra_type;
+
+    using track_t = typename stepper_t::free_track_parameters_type;
+    using stepper_state_t = typename stepper_t::state;
+    using navigation_state_t = typename navigator_t::state;
+    using propagation_state_t = propagation_state<navigator_t, stepper_t>;
+
+    assert(blockDim.y == blockDim.z == 1);
+
+    auto block = cooperative_groups::this_thread_block();
+    const unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x;
+    const unsigned int bid = block.thread_rank();
+
+    // Setup some small amount of data in shared memory for actor states
+    __shared__ typename actor_chain_t::state_tuple actor_states_sh[2];
+
+    if (gid == 0) {
+        // Create the actor states on a fresh copy
+        actor_states_sh[0] = *pinned_actor_state_ptr;
+        actor_states_sh[1] = *pinned_actor_state_ptr;
+    }
+
+    block.sync();
+
+    /// One navigation and one stepping/acting thread work on 2 tracks (2:2)
+    const vecmem::device_vector<track_t> tracks(tracks_view);
+    if (gid >= tracks.size()) {
+        return;
+    }
+
+    // Create a pipeline.
+    assert(block.size() % 2u == 0);
+
+    constexpr auto scope{cuda::thread_scope_block};
+    constexpr int n_stages{2};
+    __shared__ cuda::pipeline_shared_state<scope, n_stages> shared_state;
+    auto propagation =
+        cuda::make_pipeline(block, &shared_state, block.size() / 2);
+
+    ///
+    /// Run propagation steps
+    ///
+
+    // The first half of the block performs navigation, the second stepping and
+    // actors calls
+    if (bid < block.size() / 2u) {
+
+        assert(2u * gid + 1u < tracks.size());
+
+        DETRAY_INFO_DEVICE(
+            "Thread %d (%d) in block %d is doing navigation (tracks: %d, %d)",
+            gid, bid, blockIdx.x, 2u * gid, 2u * gid + 1u);
+
+        std::array<navigation_state_t, 2u> navigation{
+            navigation_state_t{*pinned_detector_ptr},
+            navigation_state_t{*pinned_detector_ptr}};
+
+        // Initialize the navigation and fill the propagation pipeline
+        typename detector_device_t::geometry_context gctx{};
+        for (int i = 0; i < 2; ++i) {
+
+            constexpr navigator_t navigator{};
+            navigator.init(tracks.at(2 * gid + i), navigation[i],
+                           cfg.navigation, gctx);
+
+            assert(navigation.is_alive());
+
+            // Put navigation result in pipeline
+            __syncwarp();
+            propagation.producer_acquire();
+
+            DETRAY_INFO_DEVICE("Thread %d: Initial dist %f (stage %d)",
+                               block.thread_rank(), navigation[i](), i);
+
+            __syncwarp();
+            propagation.producer_commit();
+        }
+
+        // Propagation loop
+        int stage = 0;
+        for (std::size_t step = 0; step < 10; ++step) {
+
+            // Wait for the stepper/actor to release the pipeline
+            __syncwarp();
+            propagation.producer_acquire();
+
+            DETRAY_INFO_DEVICE("Thread %d: Dist %f (stage %d)",
+                               block.thread_rank(), navigation[stage](), stage);
+
+            navigate<navigator_t>(cfg.navigation, navigation[stage]);
+
+            // Publish navigator results
+            __syncwarp();
+            propagation.producer_commit();
+
+            // Flip stage and navigate the other track
+            stage = (stage + 1) % n_stages;
+        }
+
+        DETRAY_INFO_HOST_DEVICE(
+            "Thread %d, block %d (Nav): Navigation finished",
+            block.thread_rank(), blockIdx.x);
+    } else {
+        // Map the second batch of threads back onto the same data
+        const unsigned int lid{gid - blockDim.x / 2u};
+        assert(2u * lid + 1u < tracks.size());
+
+        DETRAY_INFO_DEVICE(
+            "Thread %d (%d) in block %d is doing stepping (tracks: %d, %d)",
+            gid, bid, blockIdx.x, 2u * lid, 2u * lid + 1u);
+
+        std::array<stepper_state_t, 2u> stepping{
+            stepper_state_t{tracks.at(2u * lid), field_view},
+            stepper_state_t{tracks.at(2u * lid + 1u), field_view}};
+
+        using state_refs_t = typename actor_chain_t::state_ref_tuple;
+        std::array<state_refs_t, 2u> actor_states{
+            actor_chain_t::setup_actor_states(actor_states_sh[0]),
+            actor_chain_t::setup_actor_states(actor_states_sh[1])};
+
+        // Propagation loop
+        int stage = 0;  //< which track to advance
+        std::array<int, 2u> prop_phase{
+            0};  //< whether to run actors or stepping
+        for (std::size_t step = 0; step < 10; ++step) {
+
+            // Wait for navigation to finish
+            __syncwarp();
+            propagation.consumer_wait();
+
+            if (prop_phase[stage] % 2 == 0) {
+                DETRAY_INFO_DEVICE("Thread %d: Run actors (stage %d)",
+                                   block.thread_rank(), stage);
+
+                run_actors<stepper_t, actor_chain_t>(stepping[stage],
+                                                     actor_states[stage]);
+
+                // Publish actor results
+            } else {
+                DETRAY_INFO_DEVICE("Thread %d: Stepsize %f (stage %d)",
+                                   block.thread_rank(),
+                                   stepping[stage].step_size(), stage);
+
+                take_step<stepper_t, actor_chain_t>(
+                    cfg.stepping, stepping[stage], actor_states[stage]);
+
+                // Publish stepper results
+            }
+
+            // Trigger navigation
+            __syncwarp();
+            propagation.consumer_release();
+
+            // Flip the propagation phase for this track (stepping <-> actors)
+            prop_phase[stage] = (prop_phase[stage] + 1) % 2;
+            // Flip stage and run on the other track
+            stage = (stage + 1) % n_stages;
+        }
+
+        DETRAY_INFO_HOST_DEVICE("Thread %d, block %d (Step): Stepping finished",
+                                block.thread_rank(), blockIdx.x);
+    }
+}
+
+template <typename device_detector_t>
+device_detector_t *setup_device_detector(
+    typename device_detector_t::view_type det_view) {
+
+    // Build a device detector type (the interal pointers and capacities refer
+    // to the already allocated vecmem device buffers)
+    device_detector_t device_det{det_view};
+
+    // Allocate global memory space for the device detector to be shared by
+    // kernels
+    device_detector_t *pinned_detector_ptr{nullptr};
+
+    DETRAY_CUDA_ERROR_CHECK(cudaHostAlloc((void **)&pinned_detector_ptr,
+                                          sizeof(device_detector_t),
+                                          cudaHostAllocPortable));
+
+    DETRAY_CUDA_ERROR_CHECK(cudaMemcpy(pinned_detector_ptr, &device_det,
+                                       sizeof(device_detector_t),
+                                       cudaMemcpyHostToDevice));
+
+    DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+
+    return pinned_detector_ptr;
+}
+
+template <typename device_detector_t>
+void release_device_detector(device_detector_t *pinned_detector_ptr) {
+    DETRAY_CUDA_ERROR_CHECK(cudaFreeHost(pinned_detector_ptr));
+}
+
+template <typename actor_chain_t>
+typename actor_chain_t::state_tuple *setup_actor_states(
+    typename actor_chain_t::state_tuple *input_actor_states) {
+
+    // Copy the actor state blueprint to the device
+    using actor_state_t = typename actor_chain_t::state_tuple;
+    actor_state_t *pinned_actor_state_ptr{nullptr};
+
+    DETRAY_CUDA_ERROR_CHECK(cudaHostAlloc((void **)&pinned_actor_state_ptr,
+                                          sizeof(actor_state_t),
+                                          cudaHostAllocPortable));
+    DETRAY_CUDA_ERROR_CHECK(
+        cudaMemcpy(pinned_actor_state_ptr, input_actor_states,
+                   sizeof(actor_state_t), cudaMemcpyHostToDevice));
+
+    DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+
+    return pinned_actor_state_ptr;
+}
+
+template <typename actor_chain_t>
+void release_actor_states(
+    typename actor_chain_t::state_tuple *pinned_actor_state_ptr) {
+    DETRAY_CUDA_ERROR_CHECK(cudaFreeHost(pinned_actor_state_ptr));
+}
+
+template <typename navigator_t, typename stepper_t, typename actor_chain_t>
+void run_propagation_kernel(
+    const propagation::config &cfg,
+    const typename navigator_t::detector_type *pinned_detector_ptr,
+    typename stepper_t::magnetic_field_type field_view,
+    vecmem::data::vector_view<typename stepper_t::free_track_parameters_type>
+        tracks_view,
+    vecmem::data::vector_view<unsigned int> stepper_res_view,
+    vecmem::data::vector_view<unsigned int> navigator_res_view,
+    typename actor_chain_t::state_tuple *pinned_actor_state_ptr) {
+
+    int thread_dim = math::min(256, static_cast<int>(tracks_view.size()));
+    // One block handles 256 tracks (2 per thread)
+    int block_dim = static_cast<int>((tracks_view.size() / 2 + thread_dim - 1) /
+                                     thread_dim);
+
+    DETRAY_INFO_HOST_DEVICE("# Tracks: %ld", tracks_view.size());
+    DETRAY_INFO_HOST_DEVICE("# threads per block: %d", thread_dim);
+    DETRAY_INFO_HOST_DEVICE("# blocks: %d", block_dim);
+    DETRAY_INFO_HOST_DEVICE("# threads: %d", thread_dim * block_dim);
+
+    // run the propagation loop
+    propagation_kernel<navigator_t, stepper_t, actor_chain_t>
+        <<<block_dim, thread_dim>>>(cfg, pinned_detector_ptr, field_view,
+                                    tracks_view, pinned_actor_state_ptr);
+
+    // cuda error check
+    DETRAY_CUDA_ERROR_CHECK(cudaGetLastError());
+    DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+}
+
+/// Macro declaring the template instantiations for the different detector types
+#define DECLARE_PROPAGATOR(METADATA, CHAIN, FIELD)                    \
+                                                                      \
+    template void run_propagation_kernel<                             \
+        navigator_type<METADATA>, stepper_type<METADATA, FIELD>,      \
+        CHAIN<detector<METADATA>::algebra_type>>(                     \
+        const propagation::config &,                                  \
+        const detector<METADATA, device_container_types> *,           \
+        covfie::field_view<FIELD>,                                    \
+        vecmem::data::vector_view<                                    \
+            free_track_parameters<detector<METADATA>::algebra_type>>, \
+        vecmem::data::vector_view<unsigned int>,                      \
+        vecmem::data::vector_view<unsigned int>,                      \
+        typename CHAIN<detector<METADATA>::algebra_type>::state_tuple *);
+
+#define DECLARE_ACTOR_CHAIN_SETUP(METADATA, CHAIN)                           \
+                                                                             \
+    template typename CHAIN<detector<METADATA>::algebra_type>::state_tuple * \
+    setup_actor_states<CHAIN<detector<METADATA>::algebra_type>>(             \
+        typename CHAIN<detector<METADATA>::algebra_type>::state_tuple *);    \
+                                                                             \
+    template void                                                            \
+    release_actor_states<CHAIN<detector<METADATA>::algebra_type>>(           \
+        typename CHAIN<detector<METADATA>::algebra_type>::state_tuple *);
+
+#define DECLARE_DETECTOR_ALLOCATION(METADATA)                                \
+                                                                             \
+    template detector<METADATA, device_container_types>                      \
+        *setup_device_detector<detector<METADATA, device_container_types>>(  \
+            typename detector<METADATA, device_container_types>::view_type); \
+                                                                             \
+    template void                                                            \
+    release_device_detector<detector<METADATA, device_container_types>>(     \
+        detector<METADATA, device_container_types> *);
+
+DECLARE_PROPAGATOR(benchmarks::default_metadata, empty_chain, const_field_t)
+DECLARE_PROPAGATOR(benchmarks::default_metadata, default_chain, const_field_t)
+
+DECLARE_PROPAGATOR(benchmarks::toy_metadata, empty_chain, const_field_t)
+DECLARE_PROPAGATOR(benchmarks::toy_metadata, default_chain, const_field_t)
+
+// Declare only once per algebra type
+DECLARE_ACTOR_CHAIN_SETUP(benchmarks::toy_metadata, empty_chain)
+DECLARE_ACTOR_CHAIN_SETUP(benchmarks::toy_metadata, default_chain)
+
+DECLARE_DETECTOR_ALLOCATION(benchmarks::default_metadata)
+DECLARE_DETECTOR_ALLOCATION(benchmarks::toy_metadata)
+
+}  // namespace detray
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.hpp b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.hpp
new file mode 100644
index 000000000..e94daa2ec
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.hpp
@@ -0,0 +1,247 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2025 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Project include(s)
+#include "detray/core/detector.hpp"
+#include "detray/definitions/algebra.hpp"
+#include "detray/navigation/caching_navigator.hpp"
+#include "detray/propagator/actors.hpp"
+#include "detray/propagator/propagation_config.hpp"
+#include "detray/propagator/propagator.hpp"
+#include "detray/propagator/rk_stepper.hpp"
+#include "detray/tracks/tracks.hpp"
+#include "detray/utils/logging.hpp"
+
+// Detray test include(s)
+#include "detray/benchmarks/types.hpp"
+#include "detray/test/common/bfield.hpp"
+
+// Vecmem include(s)
+#include <vecmem/memory/cuda/device_memory_resource.hpp>
+#include <vecmem/memory/host_memory_resource.hpp>
+#include <vecmem/memory/memory_resource.hpp>
+#include <vecmem/utils/cuda/async_copy.hpp>
+#include <vecmem/utils/cuda/copy.hpp>
+
+// System include(s)
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <ctime>
+#include <iostream>
+#include <ratio>
+#include <string>
+
+namespace detray {
+
+// Define propagator type
+template <concepts::algebra algebra_t>
+using empty_chain = actor_chain<>;
+
+template <concepts::algebra algebra_t>
+using default_chain = actor_chain<parameter_transporter<algebra_t>,
+                                  pointwise_material_interactor<algebra_t>,
+                                  parameter_resetter<algebra_t>>;
+
+using const_field_t = bfield::const_bknd_t<benchmarks::scalar>;
+
+template <typename metadata_t, typename bfield_t>
+using stepper_type = rk_stepper<covfie::field_view<bfield_t>,
+                                typename detector<metadata_t>::algebra_type>;
+
+template <typename metadata_t>
+using navigator_type =
+    caching_navigator<detector<metadata_t, device_container_types>>;
+
+/// Propagation that state aggregates a stepping and a navigation state. It
+/// also keeps references to the actor states.
+template <typename navigator_t, typename stepper_t>
+struct propagation_state {
+    typename stepper_t::state _stepping;
+    typename navigator_t::state _navigation;
+    typename navigator_t::detector_type::geometry_context _context{};
+};
+
+/// Launch the propagation kernel
+///
+/// @param cfg the propagation configuration
+/// @param field_data the magentic field view (maybe an empty field)
+/// @param tracks_data the track collection view
+/// @param navigation_cache_view the navigation cache vecemem view
+/// @param opt which propagation to run (sync vs. unsync)
+template <typename navigator_t, typename stepper_t, typename actor_chain_t>
+void run_propagation_kernel(
+    const propagation::config &, const typename navigator_t::detector_type *,
+    typename stepper_t::magnetic_field_type,
+    vecmem::data::vector_view<typename stepper_t::free_track_parameters_type>,
+    vecmem::data::vector_view<unsigned int>,
+    vecmem::data::vector_view<unsigned int>,
+    typename actor_chain_t::state_tuple *);
+
+/// Allocate space for the device detector opbject to be shared between kenels
+template <typename device_detector_t>
+device_detector_t *setup_device_detector(typename device_detector_t::view_type);
+
+/// Release device detector
+template <typename device_detector_t>
+void release_device_detector(device_detector_t *);
+
+/// Copy a blueprint actor state to device, which will be taken and set up
+/// correctly for every track in the propagation init kernel
+/// @returns the device
+template <typename actor_chain_t>
+typename actor_chain_t::state_tuple *setup_actor_states(
+    typename actor_chain_t::state_tuple *);
+
+/// Release the blueprint actor allocation after all actor states are set up
+template <typename actor_chain_t>
+void release_actor_states(typename actor_chain_t::state_tuple *);
+
+/// Device Propagation benchmark
+template <typename navigator_t, typename stepper_t, typename actor_chain_t>
+struct cuda_propagation {
+    /// Detector dependent types
+    using metadata_t = typename navigator_t::detector_type::metadata;
+    using detector_t = detector<metadata_t>;
+    using device_detector_t = detector<metadata_t, device_container_types>;
+    using bfield_view_t = typename stepper_t::magnetic_field_type;
+    using algebra_t = typename detector_t::algebra_type;
+    using scalar_t = dscalar<algebra_t>;
+    using vector3_t = dvector3D<algebra_t>;
+
+    using state = propagation_state<navigator_t, stepper_t>;
+
+    /// The propagation configuration
+    propagation::config m_cfg{};
+
+    /// Default construction
+    cuda_propagation() = default;
+
+    /// Construct from an externally provided configuration @param cfg
+    explicit cuda_propagation(const propagation::config &cfg) : m_cfg{cfg} {}
+
+    /// @return the propagation configuration
+    propagation::config &config() { return m_cfg; }
+
+    /// Prepare data and run propagation loop
+    template <typename bfield_bkn_t>
+    inline void operator()(
+        vecmem::memory_resource *dev_mr, const detector_t *det,
+        const bfield_bkn_t *field,
+        dvector<free_track_parameters<algebra_t>> *tracks,
+        typename actor_chain_t::state_tuple *input_actor_states) const {
+
+        assert(dev_mr != nullptr);
+        assert(tracks != nullptr);
+        assert(det != nullptr);
+        assert(field != nullptr);
+        assert(input_actor_states != nullptr);
+
+        // Helper object for performing memory copies (to CUDA devices)
+        vecmem::cuda::copy cuda_cpy;
+
+        // Copy the detector to device and get its view
+        auto det_buffer = detray::get_buffer(*det, *dev_mr, cuda_cpy);
+        auto det_view = detray::get_data(det_buffer);
+
+        bfield_view_t field_view(*field);
+
+        // Copy the track collection to device
+        auto track_buffer =
+            detray::get_buffer(vecmem::get_data(*tracks), *dev_mr, cuda_cpy);
+
+        // Launch the propagator test for GPU device
+        propagate(dev_mr, det_view, field_view, track_buffer,
+                  input_actor_states);
+    }
+
+    /// Run propagation loop
+    inline void operator()(
+        vecmem::memory_resource *dev_mr, const detector_t::view_type det_view,
+        const bfield_view_t field_view,
+        vecmem::data::vector_view<free_track_parameters<algebra_t>> tracks_view,
+        typename actor_chain_t::state_tuple *input_actor_states) const {
+
+        // Launch the propagator test for GPU device
+        propagate(dev_mr, det_view, field_view, tracks_view,
+                  input_actor_states);
+    }
+
+    private:
+    /// Prepare data and run propagation loop
+    inline void propagate(
+        vecmem::memory_resource *dev_mr, const detector_t::view_type det_view,
+        const bfield_view_t field_view,
+        vecmem::data::vector_view<free_track_parameters<algebra_t>> tracks_view,
+        typename actor_chain_t::state_tuple *input_actor_states) const {
+
+        assert(dev_mr != nullptr);
+        assert(tracks_view.size() != 0u);
+        assert(input_actor_states != nullptr);
+
+        // Helper object for performing memory copies (to CUDA devices)
+        vecmem::cuda::copy cuda_cpy;
+
+        const unsigned int n_tracks{
+            static_cast<unsigned int>(tracks_view.size())};
+
+        // Allocate memory for device detector object (not detector data!)
+        auto *pinned_detector_ptr =
+            setup_device_detector<device_detector_t>(det_view);
+
+        // Copy blueprint actor states to device
+        auto *pinned_actor_state_ptr =
+            setup_actor_states<actor_chain_t>(input_actor_states);
+
+        // Stepper results
+        vecmem::data::vector_buffer<unsigned int> stepper_res_buffer(n_tracks,
+                                                                     *dev_mr);
+        cuda_cpy.setup(stepper_res_buffer)->wait();
+        auto stepper_res_view = vecmem::get_data(stepper_res_buffer);
+
+        // Navigation results
+        vecmem::data::vector_buffer<unsigned int> navigation_res_buffer(
+            n_tracks, *dev_mr);
+        cuda_cpy.setup(navigation_res_buffer)->wait();
+        auto navigation_res_view = vecmem::get_data(navigation_res_buffer);
+
+        std::chrono::high_resolution_clock::time_point t1_prop =
+            std::chrono::high_resolution_clock::now();
+        // Check if all tracks finished
+        bool all_finished{false};
+        // Safety agianst infinite loops
+        int iterations{0};
+
+        while (!all_finished && iterations < 1) {
+            // Launch the propagation for GPU device
+            run_propagation_kernel<navigator_t, stepper_t, actor_chain_t>(
+                m_cfg, pinned_detector_ptr, field_view, tracks_view,
+                stepper_res_view, navigation_res_view, pinned_actor_state_ptr);
+
+            ++iterations;
+        }
+        std::chrono::high_resolution_clock::time_point t2_prop =
+            std::chrono::high_resolution_clock::now();
+
+        // Deallocate the propagation config and detector
+        release_device_detector(pinned_detector_ptr);
+        release_actor_states<actor_chain_t>(pinned_actor_state_ptr);
+
+        const auto prop_time =
+            std::chrono::duration_cast<std::chrono::duration<double>>(t2_prop -
+                                                                      t1_prop);
+        const double prop_ms{prop_time.count() * 1000.};
+
+        DETRAY_INFO_HOST("Propagation kernel took: "
+                         << prop_ms << "ms (" << prop_ms / (n_tracks / 3000.)
+                         << " ms/evt)\n");
+    }
+};
+
+}  // namespace detray
diff --git a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
index 36bde6f2f..86469a869 100644
--- a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
+++ b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
@@ -14,6 +14,9 @@
 #include "detray/propagator/propagator.hpp"
 #include "detray/tracks/tracks.hpp"
 
+// Detray benchmark include(s)
+#include "detray/benchmarks/benchmark_base.hpp"
+
 // Vecmem include(s)
 #include <vecmem/memory/memory_resource.hpp>