cern-nextgen · m-fila · Apr 29, 2026 · Apr 9, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -109,6 +109,7 @@ endif()
 option(BUILD_CUDA "Build with CUDA support" OFF)
 if(BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)
+  enable_language(CUDA)
 endif()
 
 ## targets

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -70,26 +70,27 @@ if(BUILD_CAPY)
 endif()
 
 if(BUILD_CUDA AND BUILD_TBB)
+  add_library(nanospin nanospin.cu)
   add_executable(alien_reco alien_reco.cpp)
-  target_link_libraries(alien_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb)
+  target_link_libraries(alien_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb nanospin)
   add_executable(alien_delegate alien_delegate.cpp)
-  target_link_libraries(alien_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb)
+  target_link_libraries(alien_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb nanospin)
   add_executable(alien_event_poll alien_event_poll.cpp)
-  target_link_libraries(alien_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb)
+  target_link_libraries(alien_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb nanospin)
   if(BUILD_STDEXEC)
     add_executable(exec_reco_stdexec exec_reco.cpp)
-    target_link_libraries(exec_reco_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec)
+    target_link_libraries(exec_reco_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec nanospin)
     add_executable(exec_delegate_stdexec exec_delegate.cpp)
-    target_link_libraries(exec_delegate_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec)
+    target_link_libraries(exec_delegate_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec nanospin)
     add_executable(exec_event_poll_stdexec exec_event_poll.cpp)
-    target_link_libraries(exec_event_poll_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec)
+    target_link_libraries(exec_event_poll_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec nanospin)
   endif()
   if(BUILD_CAPY)
     add_executable(capy_reco capy_reco.cpp)
-    target_link_libraries(capy_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy)
+    target_link_libraries(capy_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy nanospin)
     add_executable(capy_delegate capy_delegate.cpp)
-    target_link_libraries(capy_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy)
+    target_link_libraries(capy_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy nanospin)
     add_executable(capy_event_poll capy_event_poll.cpp)
-    target_link_libraries(capy_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy)
+    target_link_libraries(capy_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy nanospin)
   endif()
 endif()
diff --git a/examples/alien_delegate.cpp b/examples/alien_delegate.cpp
@@ -16,6 +16,7 @@
 #include "CoroutineTests/threadpool.hpp"
 #include "alien_stream_await.hpp"  // StreamAwaitable
 #include "logging_utils.hpp"       // log, format_name
+#include "nanospin.hpp"            // launch_nanospin
 
 #define ERROR_CHECK_CUDA(EXP)                                              \
     do {                                                                   \
@@ -88,6 +89,7 @@ subtool::Task<DeviceBuffer<int>> clusterization(
                                              nClusters * sizeof(int), stream));
             ERROR_CHECK_CUDA(cudaMemsetAsync(
                 d_clusters, 1, nClusters / 2 * sizeof(int), stream));
+            launch_nanospin(1'000'000, stream);
         }));
 
     co_return DeviceBuffer<int>{d_clusters,
@@ -138,6 +140,7 @@ subtool::Task<DeviceBuffer<int>> seeding(
                 cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
             ERROR_CHECK_CUDA(
                 cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
+            launch_nanospin(1'000'000, stream);
         }));
 
     co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};

diff --git a/examples/alien_event_poll.cpp b/examples/alien_event_poll.cpp
@@ -16,6 +16,7 @@
 #include "CoroutineTests/alien/tool.hpp"
 #include "CoroutineTests/threadpool.hpp"
 #include "logging_utils.hpp"  // log, format_name
+#include "nanospin.hpp"       // launch_nanospin
 
 #define ERROR_CHECK_CUDA(EXP)                                              \
     do {                                                                   \
@@ -109,6 +110,7 @@ subtool::Task<DeviceBuffer<int>> clusterization(
                                              nClusters * sizeof(int), stream));
             ERROR_CHECK_CUDA(cudaMemsetAsync(
                 d_clusters, 1, nClusters / 2 * sizeof(int), stream));
+            launch_nanospin(1'000'000, stream);
         }));
 
     co_return DeviceBuffer<int>{d_clusters,
@@ -160,6 +162,7 @@ subtool::Task<DeviceBuffer<int>> seeding(
                 cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
             ERROR_CHECK_CUDA(
                 cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
+            launch_nanospin(1'000'000, stream);
         }));
 
     co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};

diff --git a/examples/alien_reco.cpp b/examples/alien_reco.cpp
@@ -10,6 +10,7 @@
 #include "CoroutineTests/alien/tool.hpp"
 #include "alien_stream_await.hpp"  // StreamAwaitable
 #include "logging_utils.hpp"       // log, format_name
+#include "nanospin.hpp"            // launch_nanospin
 
 #define ERROR_CHECK_CUDA(EXP)                                              \
     do {                                                                   \
@@ -64,6 +65,7 @@ subtool::Task<DeviceBuffer<int>> clusterization(DeviceBuffer<int> cells,
         cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
     ERROR_CHECK_CUDA(
         cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream));
+    launch_nanospin(1'000'000, stream);
 
     co_return DeviceBuffer<int>{d_clusters,
                                 static_cast<std::size_t>(nClusters)};
@@ -103,6 +105,7 @@ subtool::Task<DeviceBuffer<int>> seeding(DeviceBuffer<int> clusters,
     ERROR_CHECK_CUDA(cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
     ERROR_CHECK_CUDA(
         cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
+    launch_nanospin(1'000'000, stream);
 
     co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
 }

diff --git a/examples/capy_delegate.cpp b/examples/capy_delegate.cpp
@@ -12,6 +12,7 @@
 #include "capy_stream_await.hpp"         // StreamIoAwaitable
 #include "capy_task_arena_executor.hpp"  // TaskArenaExecutor
 #include "logging_utils.hpp"             // log, format_name
+#include "nanospin.hpp"                  // launch_nanospin
 #include "statuscode.hpp"                // StatusCodeImpl
 
 namespace tools {
@@ -84,6 +85,7 @@ boost::capy::task<DeviceBuffer<int>> clusterization(
             cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
         ERROR_CHECK_CUDA(cudaMemsetAsync(d_clusters, 1,
                                          nClusters / 2 * sizeof(int), stream));
+        launch_nanospin(1'000'000, stream);
     }));
 
     co_return DeviceBuffer<int>{d_clusters,
@@ -131,6 +133,7 @@ boost::capy::task<DeviceBuffer<int>> seeding(
             cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
         ERROR_CHECK_CUDA(
             cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
+        launch_nanospin(1'000'000, stream);
     }));
 
     co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};

diff --git a/examples/capy_event_poll.cpp b/examples/capy_event_poll.cpp
@@ -11,6 +11,7 @@
 
 #include "capy_task_arena_executor.hpp"  // TaskArenaExecutor
 #include "logging_utils.hpp"             // log, format_name
+#include "nanospin.hpp"                  // launch_nanospin
 #include "statuscode.hpp"                // StatusCodeImpl
 
 namespace tools {
@@ -105,6 +106,7 @@ boost::capy::task<DeviceBuffer<int>> clusterization(
             cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
         ERROR_CHECK_CUDA(cudaMemsetAsync(d_clusters, 1,
                                          nClusters / 2 * sizeof(int), stream));
+        launch_nanospin(1'000'000, stream);
     }));
 
     co_return DeviceBuffer<int>{d_clusters,
@@ -154,6 +156,7 @@ boost::capy::task<DeviceBuffer<int>> seeding(
             cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
         ERROR_CHECK_CUDA(
             cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
+        launch_nanospin(1'000'000, stream);
     }));
 
     co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};

diff --git a/examples/capy_reco.cpp b/examples/capy_reco.cpp
@@ -10,6 +10,7 @@
 #include "capy_stream_await.hpp"         // StreamIoAwaitable
 #include "capy_task_arena_executor.hpp"  // TaskArenaExecutor
 #include "logging_utils.hpp"             // log, format_name
+#include "nanospin.hpp"                  // launch_nanospin
 #include "statuscode.hpp"                // StatusCodeImpl
 
 namespace tools {
@@ -70,6 +71,7 @@ boost::capy::task<DeviceBuffer<int>> clusterization(DeviceBuffer<int> cells,
         cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
     ERROR_CHECK_CUDA(
         cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream));
+    launch_nanospin(1'000'000, stream);
 
     co_return DeviceBuffer<int>{d_clusters,
                                 static_cast<std::size_t>(nClusters)};
@@ -109,6 +111,7 @@ boost::capy::task<DeviceBuffer<int>> seeding(DeviceBuffer<int> clusters,
     ERROR_CHECK_CUDA(cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
     ERROR_CHECK_CUDA(
         cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
+    launch_nanospin(1'000'000, stream);
 
     co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
 }

diff --git a/examples/exec_delegate.cpp b/examples/exec_delegate.cpp
@@ -11,6 +11,7 @@
 #include "exec_stream_await_sender.hpp"   // stream_await_sender
 #include "exec_task_arena_scheduler.hpp"  // TaskArenaScheduler
 #include "logging_utils.hpp"              // log, format_name
+#include "nanospin.hpp"                   // launch_nanospin
 #include "statuscode.hpp"                 // StatusCodeImpl
 
 namespace tools {
@@ -83,6 +84,7 @@ exec::task<DeviceBuffer<int>> clusterization(
                                              nClusters * sizeof(int), stream));
             ERROR_CHECK_CUDA(cudaMemsetAsync(
                 d_clusters, 1, nClusters / 2 * sizeof(int), stream));
+            launch_nanospin(1'000'000, stream);
         });
     co_await stdexec::on(delegation_ctx.get_scheduler(),
                          std::move(allocate_clusters));
@@ -137,6 +139,7 @@ exec::task<DeviceBuffer<int>> seeding(
                 cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
             ERROR_CHECK_CUDA(
                 cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
+            launch_nanospin(1'000'000, stream);
         });
     co_await stdexec::on(delegation_ctx.get_scheduler(),
                          std::move(allocate_seeds));

diff --git a/examples/exec_event_poll.cpp b/examples/exec_event_poll.cpp
@@ -12,6 +12,7 @@
 
 #include "exec_task_arena_scheduler.hpp"  // TaskArenaScheduler
 #include "logging_utils.hpp"              // log, format_name
+#include "nanospin.hpp"                   // launch_nanospin
 #include "statuscode.hpp"                 // StatusCodeImpl
 
 namespace tools {
@@ -107,6 +108,7 @@ exec::task<DeviceBuffer<int>> clusterization(
                                              nClusters * sizeof(int), stream));
             ERROR_CHECK_CUDA(cudaMemsetAsync(
                 d_clusters, 1, nClusters / 2 * sizeof(int), stream));
+            launch_nanospin(1'000'000, stream);
         });
     co_await stdexec::on(delegation_ctx.get_scheduler(),
                          std::move(allocate_clusters));
@@ -162,6 +164,7 @@ exec::task<DeviceBuffer<int>> seeding(
                 cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
             ERROR_CHECK_CUDA(
                 cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
+            launch_nanospin(1'000'000, stream);
         });
     co_await stdexec::on(delegation_ctx.get_scheduler(),
                          std::move(allocate_seeds));

diff --git a/examples/exec_reco.cpp b/examples/exec_reco.cpp
@@ -10,6 +10,7 @@
 #include "exec_stream_await_sender.hpp"   // stream_await_sender
 #include "exec_task_arena_scheduler.hpp"  // TaskArenaScheduler
 #include "logging_utils.hpp"              // log, format_name
+#include "nanospin.hpp"                   // launch_nanospin
 #include "statuscode.hpp"                 // StatusCodeImpl
 
 namespace tools {
@@ -70,6 +71,7 @@ exec::task<DeviceBuffer<int>> clusterization(DeviceBuffer<int> cells,
         cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
     ERROR_CHECK_CUDA(
         cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream));
+    launch_nanospin(1'000'000, stream);
 
     co_return DeviceBuffer<int>{d_clusters,
                                 static_cast<std::size_t>(nClusters)};
@@ -109,6 +111,7 @@ exec::task<DeviceBuffer<int>> seeding(DeviceBuffer<int> clusters,
     ERROR_CHECK_CUDA(cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
     ERROR_CHECK_CUDA(
         cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
+    launch_nanospin(1'000'000, stream);
 
     co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
 }

diff --git a/examples/nanospin.cu b/examples/nanospin.cu
@@ -0,0 +1,14 @@
+#include "nanospin.hpp"
+
+static __global__ void nanospin(std::uint64_t ns) {
+    long long start = clock64();
+    auto cps = 1'400'000'000LL;  // Assuming 1.4 GHz clock rate
+    auto delta = (long long)((ns * cps) / 1'000'000'000ULL);
+    auto end = start + delta;
+    while (clock64() < end) {
+    }
+}
+
+void launch_nanospin(std::uint64_t ns, cudaStream_t stream) {
+    nanospin<<<1, 32, 0, stream>>>(ns);
+}
diff --git a/examples/nanospin.hpp b/examples/nanospin.hpp
@@ -0,0 +1,6 @@
+#include <cuda_runtime_api.h>
+
+#include <cstdint>
+
+/// Launch a GPU spin for the specified duration in ns on the given CUDA stream.
+void launch_nanospin(std::uint64_t ns, cudaStream_t stream);