diff --git a/cmake/detray-compiler-options-cuda.cmake b/cmake/detray-compiler-options-cuda.cmake index bc8a3c653..e426e4be1 100644 --- a/cmake/detray-compiler-options-cuda.cmake +++ b/cmake/detray-compiler-options-cuda.cmake @@ -19,13 +19,6 @@ if(PROJECT_IS_TOP_LEVEL) detray_add_flag( CMAKE_CUDA_FLAGS "-Xcompiler /Zc:__cplusplus" ) endif() - # Set the CUDA architecture to build code for. - set(CMAKE_CUDA_ARCHITECTURES - "52" - CACHE STRING - "CUDA architectures to build device code for" - ) - if("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA") # Allow to use functions in device code that are constexpr, even if they are # not marked with __device__. diff --git a/core/include/detray/geometry/shapes/concentric_cylinder2D.hpp b/core/include/detray/geometry/shapes/concentric_cylinder2D.hpp index 729bceb6a..82ae039ea 100644 --- a/core/include/detray/geometry/shapes/concentric_cylinder2D.hpp +++ b/core/include/detray/geometry/shapes/concentric_cylinder2D.hpp @@ -112,7 +112,7 @@ class concentric_cylinder2D { const scalar_t tol = std::numeric_limits::epsilon(), const scalar_t /*edge_tol*/ = 0.f) const { - return (bounds[e_lower_z] - tol <= loc_p[1] && + return (bounds[e_lower_z] <= loc_p[1] + tol && loc_p[1] <= bounds[e_upper_z] + tol); } /// @} diff --git a/core/include/detray/propagator/rk_stepper.hpp b/core/include/detray/propagator/rk_stepper.hpp index 5c290e1bf..25c5f5ed7 100644 --- a/core/include/detray/propagator/rk_stepper.hpp +++ b/core/include/detray/propagator/rk_stepper.hpp @@ -179,7 +179,7 @@ class rk_stepper final scalar_type m_next_step_size{0.f}; /// Magnetic field view - const magnetic_field_t m_magnetic_field; + magnetic_field_t m_magnetic_field; }; /// Take a step, using an adaptive Runge-Kutta algorithm. diff --git a/tests/benchmarks/cuda/CMakeLists.txt b/tests/benchmarks/cuda/CMakeLists.txt index 1c3f8bbb6..6a31b7463 100644 --- a/tests/benchmarks/cuda/CMakeLists.txt +++ b/tests/benchmarks/cuda/CMakeLists.txt @@ -38,3 +38,13 @@ foreach(algebra ${algebra_plugins}) PRIVATE "-march=native" "-ftree-vectorize" ) endforeach() + +detray_add_executable(cuda_propagation + "propagation_new.cpp" + LINK_LIBRARIES detray::benchmark_cuda_array detray::core_array vecmem::cuda detray::test_common +) + +target_compile_options( + detray_cuda_propagation + PRIVATE "-march=native" "-ftree-vectorize" +) diff --git a/tests/benchmarks/cuda/propagation_new.cpp b/tests/benchmarks/cuda/propagation_new.cpp new file mode 100644 index 000000000..94771d6c9 --- /dev/null +++ b/tests/benchmarks/cuda/propagation_new.cpp @@ -0,0 +1,136 @@ +/** Detray library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// Project include(s) +#include "detray/navigation/navigator.hpp" +#include "detray/propagator/actors.hpp" +#include "detray/propagator/rk_stepper.hpp" +#include "detray/tracks/tracks.hpp" + +// Detray benchmark include(s) +#include "detray/benchmarks/device/cuda/propagator.hpp" +#include "detray/benchmarks/propagation_benchmark_utils.hpp" +#include "detray/benchmarks/types.hpp" + +// Detray test include(s) +#include "detray/test/common/bfield.hpp" +#include "detray/test/common/build_toy_detector.hpp" +#include "detray/test/common/track_generators.hpp" + +// Vecmem include(s) +#include +#include +#include + +// System include(s) +#include +#include +#include +#include +#include + +using namespace detray; + +int main(int argc, char** argv) { + + using metadata_t = benchmarks::toy_metadata; + using toy_detector_t = detector; + using algebra_t = typename toy_detector_t::algebra_type; + using scalar = dscalar; + using vector3 = dvector3D; + + using free_track_parameters_t = free_track_parameters; + using uniform_gen_t = + detail::random_numbers>; + using track_generator_t = + random_track_generator; + using field_bknd_t = bfield::const_bknd_t; + + // vecmem::host_memory_resource host_mr; + vecmem::cuda::host_memory_resource host_mr; //< pinned memory + vecmem::cuda::device_memory_resource dev_mr; + + // + // Configuration + // + + std::size_t n_tracks{262144u}; + if (argc > 1) { + n_tracks = static_cast(atoi(argv[1])); + } + + // Constant magnetic field + vector3 B{0.f, 0.f, 2.f * unit::T}; + + // Configure toy detector + toy_det_config toy_cfg{}; + toy_cfg.use_material_maps(false).n_brl_layers(4u).n_edc_layers(7u); + + std::cout << toy_cfg << std::endl; + + // Configure propagation + propagation::config prop_cfg{}; + prop_cfg.navigation.search_window = {3u, 3u}; + + std::cout << prop_cfg << std::endl; + + // + // Prepare data + // + // Generate track sample for strong scaling + track_generator_t::configuration trk_cfg{}; + trk_cfg.n_tracks(n_tracks); + trk_cfg.seed(detail::random_numbers::default_seed()); + + std::cout << trk_cfg << std::endl; + + track_generator_t trk_gen{trk_cfg}; + + dvector single_sample = + detray::benchmarks::generate_tracks(&host_mr, trk_gen, true); + + const auto [toy_det, names] = + build_toy_detector(host_mr, toy_cfg); + + auto bfield = create_const_field(B); + + pointwise_material_interactor::state interactor_state{}; + parameter_resetter::state resetter_state{}; + + auto actor_states = + detail::make_tuple(interactor_state, resetter_state); + + // + // Register benchmarks + // + std::cout << "\n----------------------\n" + << "Propagation Test\n" + << "----------------------\n\n"; + + using navigator_t = navigator_type; + using stepper_t = stepper_type; + using actor_chain_t = default_chain; + + prop_cfg.stepping.do_covariance_transport = true; + cuda_propagation propagator{ + prop_cfg}; + + std::chrono::high_resolution_clock::time_point t1 = + std::chrono::high_resolution_clock::now(); + propagator(&dev_mr, &toy_det, &bfield, &single_sample, &actor_states); + std::chrono::high_resolution_clock::time_point t2 = + std::chrono::high_resolution_clock::now(); + + const auto total_time = + std::chrono::duration_cast>(t2 - t1); + const double total_time_ms{total_time.count() * 1000.}; + + // Assumption: 1 event = 3000 truth tracks + 2 seeds per track + std::cout << "It took: " << total_time_ms << "ms (" + << total_time_ms / (static_cast(n_tracks) / 3000.) + << " ms/evt)" << std::endl; +} diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt b/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt index 5ed1e442c..787d8b41e 100644 --- a/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt +++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt @@ -30,6 +30,8 @@ foreach(algebra ${algebra_plugins}) STATIC "propagation_benchmark.hpp" "propagation_benchmark.cu" + "propagator.hpp" + "propagator.cu" ) add_library( @@ -40,9 +42,15 @@ foreach(algebra ${algebra_plugins}) target_link_libraries( detray_benchmark_cuda_${algebra} PUBLIC + CUDA::cudart vecmem::cuda detray::benchmarks detray::test_common detray::core_${algebra} ) + + set_property( + TARGET detray_benchmark_cuda_${algebra} + PROPERTY CUDA_ARCHITECTURES 75 + ) endforeach() diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp index fa121705b..9b6e5f359 100644 --- a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp +++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp @@ -104,7 +104,7 @@ template void release_actor_states( typename propagator_t::actor_chain_type::state_tuple *); -/// Device Propagation becnhmark +/// Device Propagation benchmark template diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.cu b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.cu new file mode 100644 index 000000000..413b5bb37 --- /dev/null +++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.cu @@ -0,0 +1,372 @@ +/** Detray library, part of the ACTS project (R&D line) + * + * (c) 2026 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// Project include(s) +#include "detray/benchmarks/device/cuda/propagator.hpp" +#include "detray/definitions/detail/cuda_definitions.hpp" +#include "detray/utils/logging.hpp" + +// CUDA include(s) +#include + +#include + +namespace detray { + +/// Run the stepper +template +__device__ inline void run_actors( + typename stepper_t::state &stepping, + typename actor_chain_t::state_ref_tuple &actor_states) { + + using stepper_state_t = typename stepper_t::state; + using actor_states_t = typename actor_chain_t::state_ref_tuple; + + // auto block = cooperative_groups::this_thread_block(); + + // constexpr actor_chain_t run_actors{}; +} + +/// Run the stepper +template +__device__ inline void take_step( + const stepping::config &cfg, typename stepper_t::state &stepping, + typename actor_chain_t::state_ref_tuple &actor_states) { + + using stepper_state_t = typename stepper_t::state; + using actor_states_t = typename actor_chain_t::state_ref_tuple; + + // auto block = cooperative_groups::this_thread_block(); + + // constexpr stepper_t stepper{}; + constexpr actor_chain_t run_actors{}; +} + +/// Run the navigator +template +__device__ inline void navigate(const navigation::config &cfg, + typename navigator_t::state &navigation) { + + using detector_t = typename navigator_t::detector_type; + using navigation_state_t = typename navigator_t::state; + + // auto block = cooperative_groups::this_thread_block(); +} + +/// Specialize the warps to run stepping, navigation and actors independently +template +__global__ void __launch_bounds__(256, 4) propagation_kernel( + const propagation::config cfg, + const typename navigator_t::detector_type *pinned_detector_ptr, + typename stepper_t::magnetic_field_type field_view, + vecmem::data::vector_view + tracks_view, + // vecmem::data::vector_view stepper_res_view, + // vecmem::data::vector_view navigator_res_view, + typename actor_chain_t::state_tuple *pinned_actor_state_ptr) { + + using detector_device_t = typename navigator_t::detector_type; + using algebra_t = typename detector_device_t::algebra_type; + + using track_t = typename stepper_t::free_track_parameters_type; + using stepper_state_t = typename stepper_t::state; + using navigation_state_t = typename navigator_t::state; + using propagation_state_t = propagation_state; + + assert(blockDim.y == blockDim.z == 1); + + auto block = cooperative_groups::this_thread_block(); + const unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x; + const unsigned int bid = block.thread_rank(); + + // Setup some small amount of data in shared memory for actor states + __shared__ typename actor_chain_t::state_tuple actor_states_sh[2]; + + if (gid == 0) { + // Create the actor states on a fresh copy + actor_states_sh[0] = *pinned_actor_state_ptr; + actor_states_sh[1] = *pinned_actor_state_ptr; + } + + block.sync(); + + /// One navigation and one stepping/acting thread work on 2 tracks (2:2) + const vecmem::device_vector tracks(tracks_view); + if (gid >= tracks.size()) { + return; + } + + // Create a pipeline. + assert(block.size() % 2u == 0); + + constexpr auto scope{cuda::thread_scope_block}; + constexpr int n_stages{2}; + __shared__ cuda::pipeline_shared_state shared_state; + auto propagation = + cuda::make_pipeline(block, &shared_state, block.size() / 2); + + /// + /// Run propagation steps + /// + + // The first half of the block performs navigation, the second stepping and + // actors calls + if (bid < block.size() / 2u) { + + assert(2u * gid + 1u < tracks.size()); + + DETRAY_INFO_DEVICE( + "Thread %d (%d) in block %d is doing navigation (tracks: %d, %d)", + gid, bid, blockIdx.x, 2u * gid, 2u * gid + 1u); + + std::array navigation{ + navigation_state_t{*pinned_detector_ptr}, + navigation_state_t{*pinned_detector_ptr}}; + + // Initialize the navigation and fill the propagation pipeline + typename detector_device_t::geometry_context gctx{}; + for (int i = 0; i < 2; ++i) { + + constexpr navigator_t navigator{}; + navigator.init(tracks.at(2 * gid + i), navigation[i], + cfg.navigation, gctx); + + assert(navigation.is_alive()); + + // Put navigation result in pipeline + __syncwarp(); + propagation.producer_acquire(); + + DETRAY_INFO_DEVICE("Thread %d: Initial dist %f (stage %d)", + block.thread_rank(), navigation[i](), i); + + __syncwarp(); + propagation.producer_commit(); + } + + // Propagation loop + int stage = 0; + for (std::size_t step = 0; step < 10; ++step) { + + // Wait for the stepper/actor to release the pipeline + __syncwarp(); + propagation.producer_acquire(); + + DETRAY_INFO_DEVICE("Thread %d: Dist %f (stage %d)", + block.thread_rank(), navigation[stage](), stage); + + navigate(cfg.navigation, navigation[stage]); + + // Publish navigator results + __syncwarp(); + propagation.producer_commit(); + + // Flip stage and navigate the other track + stage = (stage + 1) % n_stages; + } + + DETRAY_INFO_HOST_DEVICE( + "Thread %d, block %d (Nav): Navigation finished", + block.thread_rank(), blockIdx.x); + } else { + // Map the second batch of threads back onto the same data + const unsigned int lid{gid - blockDim.x / 2u}; + assert(2u * lid + 1u < tracks.size()); + + DETRAY_INFO_DEVICE( + "Thread %d (%d) in block %d is doing stepping (tracks: %d, %d)", + gid, bid, blockIdx.x, 2u * lid, 2u * lid + 1u); + + std::array stepping{ + stepper_state_t{tracks.at(2u * lid), field_view}, + stepper_state_t{tracks.at(2u * lid + 1u), field_view}}; + + using state_refs_t = typename actor_chain_t::state_ref_tuple; + std::array actor_states{ + actor_chain_t::setup_actor_states(actor_states_sh[0]), + actor_chain_t::setup_actor_states(actor_states_sh[1])}; + + // Propagation loop + int stage = 0; //< which track to advance + std::array prop_phase{ + 0}; //< whether to run actors or stepping + for (std::size_t step = 0; step < 10; ++step) { + + // Wait for navigation to finish + __syncwarp(); + propagation.consumer_wait(); + + if (prop_phase[stage] % 2 == 0) { + DETRAY_INFO_DEVICE("Thread %d: Run actors (stage %d)", + block.thread_rank(), stage); + + run_actors(stepping[stage], + actor_states[stage]); + + // Publish actor results + } else { + DETRAY_INFO_DEVICE("Thread %d: Stepsize %f (stage %d)", + block.thread_rank(), + stepping[stage].step_size(), stage); + + take_step( + cfg.stepping, stepping[stage], actor_states[stage]); + + // Publish stepper results + } + + // Trigger navigation + __syncwarp(); + propagation.consumer_release(); + + // Flip the propagation phase for this track (stepping <-> actors) + prop_phase[stage] = (prop_phase[stage] + 1) % 2; + // Flip stage and run on the other track + stage = (stage + 1) % n_stages; + } + + DETRAY_INFO_HOST_DEVICE("Thread %d, block %d (Step): Stepping finished", + block.thread_rank(), blockIdx.x); + } +} + +template +device_detector_t *setup_device_detector( + typename device_detector_t::view_type det_view) { + + // Build a device detector type (the interal pointers and capacities refer + // to the already allocated vecmem device buffers) + device_detector_t device_det{det_view}; + + // Allocate global memory space for the device detector to be shared by + // kernels + device_detector_t *pinned_detector_ptr{nullptr}; + + DETRAY_CUDA_ERROR_CHECK(cudaHostAlloc((void **)&pinned_detector_ptr, + sizeof(device_detector_t), + cudaHostAllocPortable)); + + DETRAY_CUDA_ERROR_CHECK(cudaMemcpy(pinned_detector_ptr, &device_det, + sizeof(device_detector_t), + cudaMemcpyHostToDevice)); + + DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize()); + + return pinned_detector_ptr; +} + +template +void release_device_detector(device_detector_t *pinned_detector_ptr) { + DETRAY_CUDA_ERROR_CHECK(cudaFreeHost(pinned_detector_ptr)); +} + +template +typename actor_chain_t::state_tuple *setup_actor_states( + typename actor_chain_t::state_tuple *input_actor_states) { + + // Copy the actor state blueprint to the device + using actor_state_t = typename actor_chain_t::state_tuple; + actor_state_t *pinned_actor_state_ptr{nullptr}; + + DETRAY_CUDA_ERROR_CHECK(cudaHostAlloc((void **)&pinned_actor_state_ptr, + sizeof(actor_state_t), + cudaHostAllocPortable)); + DETRAY_CUDA_ERROR_CHECK( + cudaMemcpy(pinned_actor_state_ptr, input_actor_states, + sizeof(actor_state_t), cudaMemcpyHostToDevice)); + + DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize()); + + return pinned_actor_state_ptr; +} + +template +void release_actor_states( + typename actor_chain_t::state_tuple *pinned_actor_state_ptr) { + DETRAY_CUDA_ERROR_CHECK(cudaFreeHost(pinned_actor_state_ptr)); +} + +template +void run_propagation_kernel( + const propagation::config &cfg, + const typename navigator_t::detector_type *pinned_detector_ptr, + typename stepper_t::magnetic_field_type field_view, + vecmem::data::vector_view + tracks_view, + vecmem::data::vector_view stepper_res_view, + vecmem::data::vector_view navigator_res_view, + typename actor_chain_t::state_tuple *pinned_actor_state_ptr) { + + int thread_dim = math::min(256, static_cast(tracks_view.size())); + // One block handles 256 tracks (2 per thread) + int block_dim = static_cast((tracks_view.size() / 2 + thread_dim - 1) / + thread_dim); + + DETRAY_INFO_HOST_DEVICE("# Tracks: %ld", tracks_view.size()); + DETRAY_INFO_HOST_DEVICE("# threads per block: %d", thread_dim); + DETRAY_INFO_HOST_DEVICE("# blocks: %d", block_dim); + DETRAY_INFO_HOST_DEVICE("# threads: %d", thread_dim * block_dim); + + // run the propagation loop + propagation_kernel + <<>>(cfg, pinned_detector_ptr, field_view, + tracks_view, pinned_actor_state_ptr); + + // cuda error check + DETRAY_CUDA_ERROR_CHECK(cudaGetLastError()); + DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize()); +} + +/// Macro declaring the template instantiations for the different detector types +#define DECLARE_PROPAGATOR(METADATA, CHAIN, FIELD) \ + \ + template void run_propagation_kernel< \ + navigator_type, stepper_type, \ + CHAIN::algebra_type>>( \ + const propagation::config &, \ + const detector *, \ + covfie::field_view, \ + vecmem::data::vector_view< \ + free_track_parameters::algebra_type>>, \ + vecmem::data::vector_view, \ + vecmem::data::vector_view, \ + typename CHAIN::algebra_type>::state_tuple *); + +#define DECLARE_ACTOR_CHAIN_SETUP(METADATA, CHAIN) \ + \ + template typename CHAIN::algebra_type>::state_tuple * \ + setup_actor_states::algebra_type>>( \ + typename CHAIN::algebra_type>::state_tuple *); \ + \ + template void \ + release_actor_states::algebra_type>>( \ + typename CHAIN::algebra_type>::state_tuple *); + +#define DECLARE_DETECTOR_ALLOCATION(METADATA) \ + \ + template detector \ + *setup_device_detector>( \ + typename detector::view_type); \ + \ + template void \ + release_device_detector>( \ + detector *); + +DECLARE_PROPAGATOR(benchmarks::default_metadata, empty_chain, const_field_t) +DECLARE_PROPAGATOR(benchmarks::default_metadata, default_chain, const_field_t) + +DECLARE_PROPAGATOR(benchmarks::toy_metadata, empty_chain, const_field_t) +DECLARE_PROPAGATOR(benchmarks::toy_metadata, default_chain, const_field_t) + +// Declare only once per algebra type +DECLARE_ACTOR_CHAIN_SETUP(benchmarks::toy_metadata, empty_chain) +DECLARE_ACTOR_CHAIN_SETUP(benchmarks::toy_metadata, default_chain) + +DECLARE_DETECTOR_ALLOCATION(benchmarks::default_metadata) +DECLARE_DETECTOR_ALLOCATION(benchmarks::toy_metadata) + +} // namespace detray diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.hpp b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.hpp new file mode 100644 index 000000000..e94daa2ec --- /dev/null +++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagator.hpp @@ -0,0 +1,247 @@ +/** Detray library, part of the ACTS project (R&D line) + * + * (c) 2025 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +// Project include(s) +#include "detray/core/detector.hpp" +#include "detray/definitions/algebra.hpp" +#include "detray/navigation/caching_navigator.hpp" +#include "detray/propagator/actors.hpp" +#include "detray/propagator/propagation_config.hpp" +#include "detray/propagator/propagator.hpp" +#include "detray/propagator/rk_stepper.hpp" +#include "detray/tracks/tracks.hpp" +#include "detray/utils/logging.hpp" + +// Detray test include(s) +#include "detray/benchmarks/types.hpp" +#include "detray/test/common/bfield.hpp" + +// Vecmem include(s) +#include +#include +#include +#include +#include + +// System include(s) +#include +#include +#include +#include +#include +#include +#include + +namespace detray { + +// Define propagator type +template +using empty_chain = actor_chain<>; + +template +using default_chain = actor_chain, + pointwise_material_interactor, + parameter_resetter>; + +using const_field_t = bfield::const_bknd_t; + +template +using stepper_type = rk_stepper, + typename detector::algebra_type>; + +template +using navigator_type = + caching_navigator>; + +/// Propagation that state aggregates a stepping and a navigation state. It +/// also keeps references to the actor states. +template +struct propagation_state { + typename stepper_t::state _stepping; + typename navigator_t::state _navigation; + typename navigator_t::detector_type::geometry_context _context{}; +}; + +/// Launch the propagation kernel +/// +/// @param cfg the propagation configuration +/// @param field_data the magentic field view (maybe an empty field) +/// @param tracks_data the track collection view +/// @param navigation_cache_view the navigation cache vecemem view +/// @param opt which propagation to run (sync vs. unsync) +template +void run_propagation_kernel( + const propagation::config &, const typename navigator_t::detector_type *, + typename stepper_t::magnetic_field_type, + vecmem::data::vector_view, + vecmem::data::vector_view, + vecmem::data::vector_view, + typename actor_chain_t::state_tuple *); + +/// Allocate space for the device detector opbject to be shared between kenels +template +device_detector_t *setup_device_detector(typename device_detector_t::view_type); + +/// Release device detector +template +void release_device_detector(device_detector_t *); + +/// Copy a blueprint actor state to device, which will be taken and set up +/// correctly for every track in the propagation init kernel +/// @returns the device +template +typename actor_chain_t::state_tuple *setup_actor_states( + typename actor_chain_t::state_tuple *); + +/// Release the blueprint actor allocation after all actor states are set up +template +void release_actor_states(typename actor_chain_t::state_tuple *); + +/// Device Propagation benchmark +template +struct cuda_propagation { + /// Detector dependent types + using metadata_t = typename navigator_t::detector_type::metadata; + using detector_t = detector; + using device_detector_t = detector; + using bfield_view_t = typename stepper_t::magnetic_field_type; + using algebra_t = typename detector_t::algebra_type; + using scalar_t = dscalar; + using vector3_t = dvector3D; + + using state = propagation_state; + + /// The propagation configuration + propagation::config m_cfg{}; + + /// Default construction + cuda_propagation() = default; + + /// Construct from an externally provided configuration @param cfg + explicit cuda_propagation(const propagation::config &cfg) : m_cfg{cfg} {} + + /// @return the propagation configuration + propagation::config &config() { return m_cfg; } + + /// Prepare data and run propagation loop + template + inline void operator()( + vecmem::memory_resource *dev_mr, const detector_t *det, + const bfield_bkn_t *field, + dvector> *tracks, + typename actor_chain_t::state_tuple *input_actor_states) const { + + assert(dev_mr != nullptr); + assert(tracks != nullptr); + assert(det != nullptr); + assert(field != nullptr); + assert(input_actor_states != nullptr); + + // Helper object for performing memory copies (to CUDA devices) + vecmem::cuda::copy cuda_cpy; + + // Copy the detector to device and get its view + auto det_buffer = detray::get_buffer(*det, *dev_mr, cuda_cpy); + auto det_view = detray::get_data(det_buffer); + + bfield_view_t field_view(*field); + + // Copy the track collection to device + auto track_buffer = + detray::get_buffer(vecmem::get_data(*tracks), *dev_mr, cuda_cpy); + + // Launch the propagator test for GPU device + propagate(dev_mr, det_view, field_view, track_buffer, + input_actor_states); + } + + /// Run propagation loop + inline void operator()( + vecmem::memory_resource *dev_mr, const detector_t::view_type det_view, + const bfield_view_t field_view, + vecmem::data::vector_view> tracks_view, + typename actor_chain_t::state_tuple *input_actor_states) const { + + // Launch the propagator test for GPU device + propagate(dev_mr, det_view, field_view, tracks_view, + input_actor_states); + } + + private: + /// Prepare data and run propagation loop + inline void propagate( + vecmem::memory_resource *dev_mr, const detector_t::view_type det_view, + const bfield_view_t field_view, + vecmem::data::vector_view> tracks_view, + typename actor_chain_t::state_tuple *input_actor_states) const { + + assert(dev_mr != nullptr); + assert(tracks_view.size() != 0u); + assert(input_actor_states != nullptr); + + // Helper object for performing memory copies (to CUDA devices) + vecmem::cuda::copy cuda_cpy; + + const unsigned int n_tracks{ + static_cast(tracks_view.size())}; + + // Allocate memory for device detector object (not detector data!) + auto *pinned_detector_ptr = + setup_device_detector(det_view); + + // Copy blueprint actor states to device + auto *pinned_actor_state_ptr = + setup_actor_states(input_actor_states); + + // Stepper results + vecmem::data::vector_buffer stepper_res_buffer(n_tracks, + *dev_mr); + cuda_cpy.setup(stepper_res_buffer)->wait(); + auto stepper_res_view = vecmem::get_data(stepper_res_buffer); + + // Navigation results + vecmem::data::vector_buffer navigation_res_buffer( + n_tracks, *dev_mr); + cuda_cpy.setup(navigation_res_buffer)->wait(); + auto navigation_res_view = vecmem::get_data(navigation_res_buffer); + + std::chrono::high_resolution_clock::time_point t1_prop = + std::chrono::high_resolution_clock::now(); + // Check if all tracks finished + bool all_finished{false}; + // Safety agianst infinite loops + int iterations{0}; + + while (!all_finished && iterations < 1) { + // Launch the propagation for GPU device + run_propagation_kernel( + m_cfg, pinned_detector_ptr, field_view, tracks_view, + stepper_res_view, navigation_res_view, pinned_actor_state_ptr); + + ++iterations; + } + std::chrono::high_resolution_clock::time_point t2_prop = + std::chrono::high_resolution_clock::now(); + + // Deallocate the propagation config and detector + release_device_detector(pinned_detector_ptr); + release_actor_states(pinned_actor_state_ptr); + + const auto prop_time = + std::chrono::duration_cast>(t2_prop - + t1_prop); + const double prop_ms{prop_time.count() * 1000.}; + + DETRAY_INFO_HOST("Propagation kernel took: " + << prop_ms << "ms (" << prop_ms / (n_tracks / 3000.) + << " ms/evt)\n"); + } +}; + +} // namespace detray diff --git a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp index 36bde6f2f..86469a869 100644 --- a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp +++ b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp @@ -14,6 +14,9 @@ #include "detray/propagator/propagator.hpp" #include "detray/tracks/tracks.hpp" +// Detray benchmark include(s) +#include "detray/benchmarks/benchmark_base.hpp" + // Vecmem include(s) #include