feat: taskparallel propagation (producer-consumer model) #1070

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft

niermann999 wants to merge 4 commits into acts-project:main from niermann999:feat-taskparallel-prop

cmake/detray-compiler-options-cuda.cmake

-Original file line number
+Diff line change
@@ Expand Up / @@ -19,13 +19,6 @@ if(PROJECT_IS_TOP_LEVEL) @@
             detray_add_flag( CMAKE_CUDA_FLAGS "-Xcompiler /Zc:__cplusplus" )
         endif()
-        # Set the CUDA architecture to build code for.
-        set(CMAKE_CUDA_ARCHITECTURES
-            "52"
-            CACHE STRING
-            "CUDA architectures to build device code for"
-        )
         if("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA")
             # Allow to use functions in device code that are constexpr, even if they are
             # not marked with __device__.
@@ Expand Down @@

core/include/detray/builders/homogeneous_material_generator.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -17,6 +17,7 @@ @@
     #include "detray/materials/predefined_materials.hpp"
     #include "detray/utils/log.hpp"
     #include "detray/utils/ranges.hpp"
+    #include "detray/utils/type_registry.hpp"
     // System include(s)
     #include <sstream>
@@ Expand Down @@

core/include/detray/geometry/shapes/concentric_cylinder2D.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -112,7 +112,7 @@ class concentric_cylinder2D { @@
             const scalar_t tol = std::numeric_limits<scalar_t>::epsilon(),
             const scalar_t /*edge_tol*/ = 0.f) const {
-            return (bounds[e_lower_z] - tol <= loc_p[1] &&
+            return (bounds[e_lower_z] <= loc_p[1] + tol &&
                     loc_p[1] <= bounds[e_upper_z] + tol);
         }
         /// @}
@@ Expand Down @@

core/include/detray/propagator/rk_stepper.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -179,7 +179,7 @@ class rk_stepper final @@
             scalar_type m_next_step_size{0.f};
             /// Magnetic field view
-            const magnetic_field_t m_magnetic_field;
+            magnetic_field_t m_magnetic_field;
         };
         /// Take a step, using an adaptive Runge-Kutta algorithm.
@@ Expand Down @@

tests/benchmarks/cuda/CMakeLists.txt

-Original file line number
+Diff line change
@@ Expand Up / @@ -38,3 +38,13 @@ foreach(algebra ${algebra_plugins}) @@
             PRIVATE "-march=native" "-ftree-vectorize"
         )
     endforeach()
+    # Stand-alone propagation executable built from "propagation_new.cpp".
+    # NOTE(review): unlike the per-algebra targets created in the loop above, this
+    # links the "array" algebra libraries unconditionally - presumably the array
+    # plugin has to be enabled for this target to configure; confirm.
+    detray_add_executable(cuda_propagation
+        "propagation_new.cpp"
+        LINK_LIBRARIES detray::benchmark_cuda_array detray::core_array vecmem::cuda detray::test_common
+    )
+    # NOTE(review): the target name below assumes that detray_add_executable
+    # prepends "detray_" to the given name - confirm against the helper.
+    target_compile_options(
+        detray_cuda_propagation
+        PRIVATE "-march=native" "-ftree-vectorize"
+    )

tests/benchmarks/cuda/propagation_new.cpp

-Original file line number
+Diff line change
@@ -0,0 +1,136 @@
+    /** Detray library, part of the ACTS project (R&D line)
+     *
+     * (c) 2024 CERN for the benefit of the ACTS project
+     *
+     * Mozilla Public License Version 2.0
+     */
+    // Project include(s)
+    #include "detray/navigation/navigator.hpp"
+    #include "detray/propagator/actors.hpp"
+    #include "detray/propagator/rk_stepper.hpp"
+    #include "detray/tracks/tracks.hpp"
+    // Detray benchmark include(s)
+    #include "detray/benchmarks/device/cuda/propagator.hpp"
+    #include "detray/benchmarks/propagation_benchmark_utils.hpp"
+    #include "detray/benchmarks/types.hpp"
+    // Detray test include(s)
+    #include "detray/test/common/bfield.hpp"
+    #include "detray/test/common/build_toy_detector.hpp"
+    #include "detray/test/common/track_generators.hpp"
+    // Vecmem include(s)
+    #include <vecmem/memory/cuda/device_memory_resource.hpp>
+    #include <vecmem/memory/cuda/host_memory_resource.hpp>
+    #include <vecmem/memory/host_memory_resource.hpp>
+    // System include(s)
+    #include <charconv>
+    #include <chrono>
+    #include <cstdlib>
+    #include <ctime>
+    #include <iostream>
+    #include <random>
+    #include <ratio>
+    #include <string>
+    #include <system_error>
+    using namespace detray;
+    /// Stand-alone CUDA propagation timing run on the toy detector.
+    ///
+    /// Generates a random track sample, builds the toy detector and a constant
+    /// magnetic field, invokes @c cuda_propagation once and prints the wall-clock
+    /// duration of that single call (total and per "event").
+    ///
+    /// @param argc number of command line arguments
+    /// @param argv optional @c argv[1]: number of tracks to propagate, given as
+    ///             a positive decimal integer (defaults to 262144)
+    ///
+    /// @returns @c EXIT_FAILURE if the track count cannot be parsed, otherwise 0
+    int main(int argc, char** argv) {
+        using metadata_t = benchmarks::toy_metadata;
+        using toy_detector_t = detector<metadata_t>;
+        using algebra_t = typename toy_detector_t::algebra_type;
+        using scalar = dscalar<algebra_t>;
+        using vector3 = dvector3D<algebra_t>;
+        using free_track_parameters_t = free_track_parameters<algebra_t>;
+        using uniform_gen_t =
+            detail::random_numbers<scalar, std::uniform_real_distribution<scalar>>;
+        using track_generator_t =
+            random_track_generator<free_track_parameters_t, uniform_gen_t>;
+        using field_bknd_t = bfield::const_bknd_t<benchmarks::scalar>;
+        // vecmem::host_memory_resource host_mr;
+        vecmem::cuda::host_memory_resource host_mr;  //< pinned memory
+        vecmem::cuda::device_memory_resource dev_mr;
+        //
+        // Configuration
+        //
+        std::size_t n_tracks{262144u};
+        if (argc > 1) {
+            // Parse strictly: 'atoi' would silently turn garbage into zero
+            // tracks and a negative number would wrap around to a huge track
+            // count when cast to 'std::size_t'
+            const std::string arg{argv[1]};
+            const char* const arg_end{arg.data() + arg.size()};
+            std::size_t parsed{0u};
+            const auto [parse_end, ec] =
+                std::from_chars(arg.data(), arg_end, parsed);
+            if (ec != std::errc{} || parse_end != arg_end || parsed == 0u) {
+                std::cerr << "ERROR: invalid number of tracks '" << arg
+                          << "' (expected a positive integer)" << std::endl;
+                return EXIT_FAILURE;
+            }
+            n_tracks = parsed;
+        }
+        // Constant magnetic field
+        vector3 B{0.f, 0.f, 2.f * unit<scalar>::T};
+        // Configure toy detector
+        toy_det_config<scalar> toy_cfg{};
+        toy_cfg.use_material_maps(false).n_brl_layers(4u).n_edc_layers(7u);
+        std::cout << toy_cfg << std::endl;
+        // Configure propagation: finish the configuration before printing it,
+        // so that the printout matches what is handed to the propagator
+        propagation::config prop_cfg{};
+        prop_cfg.navigation.search_window = {3u, 3u};
+        prop_cfg.stepping.do_covariance_transport = true;
+        std::cout << prop_cfg << std::endl;
+        //
+        // Prepare data
+        //
+        // Generate track sample for strong scaling
+        track_generator_t::configuration trk_cfg{};
+        trk_cfg.n_tracks(n_tracks);
+        trk_cfg.seed(detail::random_numbers<scalar>::default_seed());
+        std::cout << trk_cfg << std::endl;
+        track_generator_t trk_gen{trk_cfg};
+        dvector<free_track_parameters_t> single_sample =
+            detray::benchmarks::generate_tracks(&host_mr, trk_gen, true);
+        const auto [toy_det, names] =
+            build_toy_detector<algebra_t>(host_mr, toy_cfg);
+        auto bfield = create_const_field<scalar>(B);
+        pointwise_material_interactor<algebra_t>::state interactor_state{};
+        parameter_resetter<algebra_t>::state resetter_state{};
+        auto actor_states =
+            detail::make_tuple<dtuple>(interactor_state, resetter_state);
+        //
+        // Register benchmarks
+        //
+        std::cout << "\n----------------------\n"
+                  << "Propagation Test\n"
+                  << "----------------------\n\n";
+        using navigator_t = navigator_type<metadata_t>;
+        using stepper_t = stepper_type<metadata_t, field_bknd_t>;
+        using actor_chain_t = default_chain<algebra_t>;
+        cuda_propagation<navigator_t, stepper_t, actor_chain_t> propagator{
+            prop_cfg};
+        // Use the monotonic clock for interval measurements
+        // NOTE(review): this times the host-side call only; presumably the call
+        // includes the device transfers and synchronization - confirm in the
+        // 'cuda_propagation' implementation
+        const auto t1 = std::chrono::steady_clock::now();
+        propagator(&dev_mr, &toy_det, &bfield, &single_sample, &actor_states);
+        const auto t2 = std::chrono::steady_clock::now();
+        const std::chrono::duration<double, std::milli> total_time{t2 - t1};
+        const double total_time_ms{total_time.count()};
+        // Assumption: 1 event = 3000 truth tracks + 2 seeds per track
+        std::cout << "It took: " << total_time_ms << "ms ("
+                  << total_time_ms / (static_cast<double>(n_tracks) / 3000.)
+                  << " ms/evt)" << std::endl;
+        return EXIT_SUCCESS;
+    }

tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt

-Original file line number
+Diff line change
@@ Expand Up / @@ -30,6 +30,8 @@ foreach(algebra ${algebra_plugins}) @@
             STATIC
             "propagation_benchmark.hpp"
             "propagation_benchmark.cu"
+            "propagator.hpp"
+            "propagator.cu"
         )
         add_library(
@@ Expand All / @@ -40,9 +42,15 @@ foreach(algebra ${algebra_plugins}) @@
         target_link_libraries(
             detray_benchmark_cuda_${algebra}
             PUBLIC
+                CUDA::cudart
                 vecmem::cuda
                 detray::benchmarks
                 detray::test_common
                 detray::core_${algebra}
         )
+        set_property(
+            TARGET detray_benchmark_cuda_${algebra}
+            PROPERTY CUDA_ARCHITECTURES 75
+        )
     endforeach()

tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -104,7 +104,7 @@ template <typename propagator_t> @@
     void release_actor_states(
         typename propagator_t::actor_chain_type::state_tuple *);
-    /// Device Propagation becnhmark
+    /// Device Propagation benchmark
     template <typename propagator_t, typename bfield_bknd_t,
               detray::benchmarks::propagation_opt kOPT =
                   detray::benchmarks::propagation_opt::e_unsync>
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: taskparallel propagation (producer-consumer model) #1070

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

feat: taskparallel propagation (producer-consumer model) #1070

Are you sure you want to change the base?

Uh oh!

feat: taskparallel propagation (producer-consumer model) #1070

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!