diff --git a/CMakeLists.txt b/CMakeLists.txt index 352a309..eb576e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,6 +109,7 @@ endif() option(BUILD_CUDA "Build with CUDA support" OFF) if(BUILD_CUDA) find_package(CUDAToolkit REQUIRED) + enable_language(CUDA) endif() ## targets diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index aa3aee4..0a61b16 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -70,26 +70,27 @@ if(BUILD_CAPY) endif() if(BUILD_CUDA AND BUILD_TBB) + add_library(nanospin nanospin.cu) add_executable(alien_reco alien_reco.cpp) - target_link_libraries(alien_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb) + target_link_libraries(alien_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb nanospin) add_executable(alien_delegate alien_delegate.cpp) - target_link_libraries(alien_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb) + target_link_libraries(alien_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb nanospin) add_executable(alien_event_poll alien_event_poll.cpp) - target_link_libraries(alien_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb) + target_link_libraries(alien_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb nanospin) if(BUILD_STDEXEC) add_executable(exec_reco_stdexec exec_reco.cpp) - target_link_libraries(exec_reco_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec) + target_link_libraries(exec_reco_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec nanospin) add_executable(exec_delegate_stdexec exec_delegate.cpp) - target_link_libraries(exec_delegate_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec) + target_link_libraries(exec_delegate_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec nanospin) add_executable(exec_event_poll_stdexec exec_event_poll.cpp) - target_link_libraries(exec_event_poll_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec) + target_link_libraries(exec_event_poll_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec nanospin) endif() if(BUILD_CAPY) add_executable(capy_reco capy_reco.cpp) - target_link_libraries(capy_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy) + target_link_libraries(capy_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy nanospin) add_executable(capy_delegate capy_delegate.cpp) - target_link_libraries(capy_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy) + target_link_libraries(capy_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy nanospin) add_executable(capy_event_poll capy_event_poll.cpp) - target_link_libraries(capy_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy) + target_link_libraries(capy_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy nanospin) endif() endif() diff --git a/examples/alien_delegate.cpp b/examples/alien_delegate.cpp index e67d9af..38d94b3 100644 --- a/examples/alien_delegate.cpp +++ b/examples/alien_delegate.cpp @@ -16,6 +16,7 @@ #include "CoroutineTests/threadpool.hpp" #include "alien_stream_await.hpp" // StreamAwaitable #include "logging_utils.hpp" // log, format_name +#include "nanospin.hpp" // launch_nanospin #define ERROR_CHECK_CUDA(EXP) \ do { \ @@ -88,6 +89,7 @@ subtool::Task> clusterization( nClusters * sizeof(int), stream)); ERROR_CHECK_CUDA(cudaMemsetAsync( d_clusters, 1, nClusters / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); })); co_return DeviceBuffer{d_clusters, @@ -138,6 +140,7 @@ subtool::Task> seeding( cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); })); co_return DeviceBuffer{d_seeds, static_cast(nSeeds)}; diff --git a/examples/alien_event_poll.cpp b/examples/alien_event_poll.cpp index 19c3042..9cf4af6 100644 --- a/examples/alien_event_poll.cpp +++ b/examples/alien_event_poll.cpp @@ -16,6 +16,7 @@ #include "CoroutineTests/alien/tool.hpp" #include "CoroutineTests/threadpool.hpp" #include "logging_utils.hpp" // log, format_name +#include "nanospin.hpp" // launch_nanospin #define ERROR_CHECK_CUDA(EXP) \ do { \ @@ -109,6 +110,7 @@ subtool::Task> clusterization( nClusters * sizeof(int), stream)); ERROR_CHECK_CUDA(cudaMemsetAsync( d_clusters, 1, nClusters / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); })); co_return DeviceBuffer{d_clusters, @@ -160,6 +162,7 @@ subtool::Task> seeding( cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); })); co_return DeviceBuffer{d_seeds, static_cast(nSeeds)}; diff --git a/examples/alien_reco.cpp b/examples/alien_reco.cpp index 51ba323..9720768 100644 --- a/examples/alien_reco.cpp +++ b/examples/alien_reco.cpp @@ -10,6 +10,7 @@ #include "CoroutineTests/alien/tool.hpp" #include "alien_stream_await.hpp" // StreamAwaitable #include "logging_utils.hpp" // log, format_name +#include "nanospin.hpp" // launch_nanospin #define ERROR_CHECK_CUDA(EXP) \ do { \ @@ -64,6 +65,7 @@ subtool::Task> clusterization(DeviceBuffer cells, cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); co_return DeviceBuffer{d_clusters, static_cast(nClusters)}; @@ -103,6 +105,7 @@ subtool::Task> seeding(DeviceBuffer clusters, ERROR_CHECK_CUDA(cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); co_return DeviceBuffer{d_seeds, static_cast(nSeeds)}; } diff --git a/examples/capy_delegate.cpp b/examples/capy_delegate.cpp index 923b23a..3d12631 100644 --- a/examples/capy_delegate.cpp +++ b/examples/capy_delegate.cpp @@ -12,6 +12,7 @@ #include "capy_stream_await.hpp" // StreamIoAwaitable #include "capy_task_arena_executor.hpp" // TaskArenaExecutor #include "logging_utils.hpp" // log, format_name +#include "nanospin.hpp" // launch_nanospin #include "statuscode.hpp" // StatusCodeImpl namespace tools { @@ -84,6 +85,7 @@ boost::capy::task> clusterization( cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream)); ERROR_CHECK_CUDA(cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); })); co_return DeviceBuffer{d_clusters, @@ -131,6 +133,7 @@ boost::capy::task> seeding( cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); })); co_return DeviceBuffer{d_seeds, static_cast(nSeeds)}; diff --git a/examples/capy_event_poll.cpp b/examples/capy_event_poll.cpp index 3f48831..5b7d108 100644 --- a/examples/capy_event_poll.cpp +++ b/examples/capy_event_poll.cpp @@ -11,6 +11,7 @@ #include "capy_task_arena_executor.hpp" // TaskArenaExecutor #include "logging_utils.hpp" // log, format_name +#include "nanospin.hpp" // launch_nanospin #include "statuscode.hpp" // StatusCodeImpl namespace tools { @@ -105,6 +106,7 @@ boost::capy::task> clusterization( cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream)); ERROR_CHECK_CUDA(cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); })); co_return DeviceBuffer{d_clusters, @@ -154,6 +156,7 @@ boost::capy::task> seeding( cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); })); co_return DeviceBuffer{d_seeds, static_cast(nSeeds)}; diff --git a/examples/capy_reco.cpp b/examples/capy_reco.cpp index ae094a7..89d73e1 100644 --- a/examples/capy_reco.cpp +++ b/examples/capy_reco.cpp @@ -10,6 +10,7 @@ #include "capy_stream_await.hpp" // StreamIoAwaitable #include "capy_task_arena_executor.hpp" // TaskArenaExecutor #include "logging_utils.hpp" // log, format_name +#include "nanospin.hpp" // launch_nanospin #include "statuscode.hpp" // StatusCodeImpl namespace tools { @@ -70,6 +71,7 @@ boost::capy::task> clusterization(DeviceBuffer cells, cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); co_return DeviceBuffer{d_clusters, static_cast(nClusters)}; @@ -109,6 +111,7 @@ boost::capy::task> seeding(DeviceBuffer clusters, ERROR_CHECK_CUDA(cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); co_return DeviceBuffer{d_seeds, static_cast(nSeeds)}; } diff --git a/examples/exec_delegate.cpp b/examples/exec_delegate.cpp index cdcbedc..4f79245 100644 --- a/examples/exec_delegate.cpp +++ b/examples/exec_delegate.cpp @@ -11,6 +11,7 @@ #include "exec_stream_await_sender.hpp" // stream_await_sender #include "exec_task_arena_scheduler.hpp" // TaskArenaScheduler #include "logging_utils.hpp" // log, format_name +#include "nanospin.hpp" // launch_nanospin #include "statuscode.hpp" // StatusCodeImpl namespace tools { @@ -83,6 +84,7 @@ exec::task> clusterization( nClusters * sizeof(int), stream)); ERROR_CHECK_CUDA(cudaMemsetAsync( d_clusters, 1, nClusters / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); }); co_await stdexec::on(delegation_ctx.get_scheduler(), std::move(allocate_clusters)); @@ -137,6 +139,7 @@ exec::task> seeding( cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); }); co_await stdexec::on(delegation_ctx.get_scheduler(), std::move(allocate_seeds)); diff --git a/examples/exec_event_poll.cpp b/examples/exec_event_poll.cpp index 3b36596..f30c36e 100644 --- a/examples/exec_event_poll.cpp +++ b/examples/exec_event_poll.cpp @@ -12,6 +12,7 @@ #include "exec_task_arena_scheduler.hpp" // TaskArenaScheduler #include "logging_utils.hpp" // log, format_name +#include "nanospin.hpp" // launch_nanospin #include "statuscode.hpp" // StatusCodeImpl namespace tools { @@ -107,6 +108,7 @@ exec::task> clusterization( nClusters * sizeof(int), stream)); ERROR_CHECK_CUDA(cudaMemsetAsync( d_clusters, 1, nClusters / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); }); co_await stdexec::on(delegation_ctx.get_scheduler(), std::move(allocate_clusters)); @@ -162,6 +164,7 @@ exec::task> seeding( cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); }); co_await stdexec::on(delegation_ctx.get_scheduler(), std::move(allocate_seeds)); diff --git a/examples/exec_reco.cpp b/examples/exec_reco.cpp index dbc07d3..10dcc39 100644 --- a/examples/exec_reco.cpp +++ b/examples/exec_reco.cpp @@ -10,6 +10,7 @@ #include "exec_stream_await_sender.hpp" // stream_await_sender #include "exec_task_arena_scheduler.hpp" // TaskArenaScheduler #include "logging_utils.hpp" // log, format_name +#include "nanospin.hpp" // launch_nanospin #include "statuscode.hpp" // StatusCodeImpl namespace tools { @@ -70,6 +71,7 @@ exec::task> clusterization(DeviceBuffer cells, cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); co_return DeviceBuffer{d_clusters, static_cast(nClusters)}; @@ -109,6 +111,7 @@ exec::task> seeding(DeviceBuffer clusters, ERROR_CHECK_CUDA(cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream)); ERROR_CHECK_CUDA( cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream)); + launch_nanospin(1'000'000, stream); co_return DeviceBuffer{d_seeds, static_cast(nSeeds)}; } diff --git a/examples/nanospin.cu b/examples/nanospin.cu new file mode 100644 index 0000000..9e40c95 --- /dev/null +++ b/examples/nanospin.cu @@ -0,0 +1,14 @@ +#include "nanospin.hpp" + +static __global__ void nanospin(std::uint64_t ns) { + long long start = clock64(); + auto cps = 1'400'000'000LL; // Assuming 1.4 GHz clock rate + auto delta = (long long)((ns * cps) / 1'000'000'000ULL); + auto end = start + delta; + while (clock64() < end) { + } +} + +void launch_nanospin(std::uint64_t ns, cudaStream_t stream) { + nanospin<<<1, 32, 0, stream>>>(ns); +} diff --git a/examples/nanospin.hpp b/examples/nanospin.hpp new file mode 100644 index 0000000..4fd5fb2 --- /dev/null +++ b/examples/nanospin.hpp @@ -0,0 +1,6 @@ +#include + +#include + +/// Launch a GPU spin for the specified duration in ns on the given CUDA stream. +void launch_nanospin(std::uint64_t ns, cudaStream_t stream);