Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ endif()
option(BUILD_CUDA "Build with CUDA support" OFF)
if(BUILD_CUDA)
find_package(CUDAToolkit REQUIRED)
enable_language(CUDA)
endif()

## targets
Expand Down
19 changes: 10 additions & 9 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,26 +70,27 @@ if(BUILD_CAPY)
endif()

if(BUILD_CUDA AND BUILD_TBB)
add_library(nanospin nanospin.cu)
add_executable(alien_reco alien_reco.cpp)
target_link_libraries(alien_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb)
target_link_libraries(alien_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb nanospin)
add_executable(alien_delegate alien_delegate.cpp)
target_link_libraries(alien_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb)
target_link_libraries(alien_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb nanospin)
add_executable(alien_event_poll alien_event_poll.cpp)
target_link_libraries(alien_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb)
target_link_libraries(alien_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb nanospin)
if(BUILD_STDEXEC)
add_executable(exec_reco_stdexec exec_reco.cpp)
target_link_libraries(exec_reco_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec)
target_link_libraries(exec_reco_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec nanospin)
add_executable(exec_delegate_stdexec exec_delegate.cpp)
target_link_libraries(exec_delegate_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec)
target_link_libraries(exec_delegate_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec nanospin)
add_executable(exec_event_poll_stdexec exec_event_poll.cpp)
target_link_libraries(exec_event_poll_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec)
target_link_libraries(exec_event_poll_stdexec PRIVATE CoroutineTests CUDA::cudart TBB::tbb STDEXEC::stdexec nanospin)
endif()
if(BUILD_CAPY)
add_executable(capy_reco capy_reco.cpp)
target_link_libraries(capy_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy)
target_link_libraries(capy_reco PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy nanospin)
add_executable(capy_delegate capy_delegate.cpp)
target_link_libraries(capy_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy)
target_link_libraries(capy_delegate PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy nanospin)
add_executable(capy_event_poll capy_event_poll.cpp)
target_link_libraries(capy_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy)
target_link_libraries(capy_event_poll PRIVATE CoroutineTests CUDA::cudart TBB::tbb Boost::capy nanospin)
endif()
endif()
3 changes: 3 additions & 0 deletions examples/alien_delegate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "CoroutineTests/threadpool.hpp"
#include "alien_stream_await.hpp" // StreamAwaitable
#include "logging_utils.hpp" // log, format_name
#include "nanospin.hpp" // launch_nanospin

#define ERROR_CHECK_CUDA(EXP) \
do { \
Expand Down Expand Up @@ -88,6 +89,7 @@ subtool::Task<DeviceBuffer<int>> clusterization(
nClusters * sizeof(int), stream));
ERROR_CHECK_CUDA(cudaMemsetAsync(
d_clusters, 1, nClusters / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
}));

co_return DeviceBuffer<int>{d_clusters,
Expand Down Expand Up @@ -138,6 +140,7 @@ subtool::Task<DeviceBuffer<int>> seeding(
cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
}));

co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
Expand Down
3 changes: 3 additions & 0 deletions examples/alien_event_poll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "CoroutineTests/alien/tool.hpp"
#include "CoroutineTests/threadpool.hpp"
#include "logging_utils.hpp" // log, format_name
#include "nanospin.hpp" // launch_nanospin

#define ERROR_CHECK_CUDA(EXP) \
do { \
Expand Down Expand Up @@ -109,6 +110,7 @@ subtool::Task<DeviceBuffer<int>> clusterization(
nClusters * sizeof(int), stream));
ERROR_CHECK_CUDA(cudaMemsetAsync(
d_clusters, 1, nClusters / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
}));

co_return DeviceBuffer<int>{d_clusters,
Expand Down Expand Up @@ -160,6 +162,7 @@ subtool::Task<DeviceBuffer<int>> seeding(
cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
}));

co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
Expand Down
3 changes: 3 additions & 0 deletions examples/alien_reco.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "CoroutineTests/alien/tool.hpp"
#include "alien_stream_await.hpp" // StreamAwaitable
#include "logging_utils.hpp" // log, format_name
#include "nanospin.hpp" // launch_nanospin

#define ERROR_CHECK_CUDA(EXP) \
do { \
Expand Down Expand Up @@ -64,6 +65,7 @@ subtool::Task<DeviceBuffer<int>> clusterization(DeviceBuffer<int> cells,
cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);

co_return DeviceBuffer<int>{d_clusters,
static_cast<std::size_t>(nClusters)};
Expand Down Expand Up @@ -103,6 +105,7 @@ subtool::Task<DeviceBuffer<int>> seeding(DeviceBuffer<int> clusters,
ERROR_CHECK_CUDA(cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);

co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
}
Expand Down
3 changes: 3 additions & 0 deletions examples/capy_delegate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "capy_stream_await.hpp" // StreamIoAwaitable
#include "capy_task_arena_executor.hpp" // TaskArenaExecutor
#include "logging_utils.hpp" // log, format_name
#include "nanospin.hpp" // launch_nanospin
#include "statuscode.hpp" // StatusCodeImpl

namespace tools {
Expand Down Expand Up @@ -84,6 +85,7 @@ boost::capy::task<DeviceBuffer<int>> clusterization(
cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
ERROR_CHECK_CUDA(cudaMemsetAsync(d_clusters, 1,
nClusters / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
}));

co_return DeviceBuffer<int>{d_clusters,
Expand Down Expand Up @@ -131,6 +133,7 @@ boost::capy::task<DeviceBuffer<int>> seeding(
cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
}));

co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
Expand Down
3 changes: 3 additions & 0 deletions examples/capy_event_poll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "capy_task_arena_executor.hpp" // TaskArenaExecutor
#include "logging_utils.hpp" // log, format_name
#include "nanospin.hpp" // launch_nanospin
#include "statuscode.hpp" // StatusCodeImpl

namespace tools {
Expand Down Expand Up @@ -105,6 +106,7 @@ boost::capy::task<DeviceBuffer<int>> clusterization(
cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
ERROR_CHECK_CUDA(cudaMemsetAsync(d_clusters, 1,
nClusters / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
}));

co_return DeviceBuffer<int>{d_clusters,
Expand Down Expand Up @@ -154,6 +156,7 @@ boost::capy::task<DeviceBuffer<int>> seeding(
cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
}));

co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
Expand Down
3 changes: 3 additions & 0 deletions examples/capy_reco.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "capy_stream_await.hpp" // StreamIoAwaitable
#include "capy_task_arena_executor.hpp" // TaskArenaExecutor
#include "logging_utils.hpp" // log, format_name
#include "nanospin.hpp" // launch_nanospin
#include "statuscode.hpp" // StatusCodeImpl

namespace tools {
Expand Down Expand Up @@ -70,6 +71,7 @@ boost::capy::task<DeviceBuffer<int>> clusterization(DeviceBuffer<int> cells,
cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);

co_return DeviceBuffer<int>{d_clusters,
static_cast<std::size_t>(nClusters)};
Expand Down Expand Up @@ -109,6 +111,7 @@ boost::capy::task<DeviceBuffer<int>> seeding(DeviceBuffer<int> clusters,
ERROR_CHECK_CUDA(cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);

co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
}
Expand Down
3 changes: 3 additions & 0 deletions examples/exec_delegate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "exec_stream_await_sender.hpp" // stream_await_sender
#include "exec_task_arena_scheduler.hpp" // TaskArenaScheduler
#include "logging_utils.hpp" // log, format_name
#include "nanospin.hpp" // launch_nanospin
#include "statuscode.hpp" // StatusCodeImpl

namespace tools {
Expand Down Expand Up @@ -83,6 +84,7 @@ exec::task<DeviceBuffer<int>> clusterization(
nClusters * sizeof(int), stream));
ERROR_CHECK_CUDA(cudaMemsetAsync(
d_clusters, 1, nClusters / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
});
co_await stdexec::on(delegation_ctx.get_scheduler(),
std::move(allocate_clusters));
Expand Down Expand Up @@ -137,6 +139,7 @@ exec::task<DeviceBuffer<int>> seeding(
cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
});
co_await stdexec::on(delegation_ctx.get_scheduler(),
std::move(allocate_seeds));
Expand Down
3 changes: 3 additions & 0 deletions examples/exec_event_poll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "exec_task_arena_scheduler.hpp" // TaskArenaScheduler
#include "logging_utils.hpp" // log, format_name
#include "nanospin.hpp" // launch_nanospin
#include "statuscode.hpp" // StatusCodeImpl

namespace tools {
Expand Down Expand Up @@ -107,6 +108,7 @@ exec::task<DeviceBuffer<int>> clusterization(
nClusters * sizeof(int), stream));
ERROR_CHECK_CUDA(cudaMemsetAsync(
d_clusters, 1, nClusters / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
});
co_await stdexec::on(delegation_ctx.get_scheduler(),
std::move(allocate_clusters));
Expand Down Expand Up @@ -162,6 +164,7 @@ exec::task<DeviceBuffer<int>> seeding(
cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);
});
co_await stdexec::on(delegation_ctx.get_scheduler(),
std::move(allocate_seeds));
Expand Down
3 changes: 3 additions & 0 deletions examples/exec_reco.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "exec_stream_await_sender.hpp" // stream_await_sender
#include "exec_task_arena_scheduler.hpp" // TaskArenaScheduler
#include "logging_utils.hpp" // log, format_name
#include "nanospin.hpp" // launch_nanospin
#include "statuscode.hpp" // StatusCodeImpl

namespace tools {
Expand Down Expand Up @@ -70,6 +71,7 @@ exec::task<DeviceBuffer<int>> clusterization(DeviceBuffer<int> cells,
cudaMemsetAsync(d_clusters, 0, nClusters * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_clusters, 1, nClusters / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);

co_return DeviceBuffer<int>{d_clusters,
static_cast<std::size_t>(nClusters)};
Expand Down Expand Up @@ -109,6 +111,7 @@ exec::task<DeviceBuffer<int>> seeding(DeviceBuffer<int> clusters,
ERROR_CHECK_CUDA(cudaMemsetAsync(d_seeds, 0, nSeeds * sizeof(int), stream));
ERROR_CHECK_CUDA(
cudaMemsetAsync(d_seeds, 1, nSeeds / 2 * sizeof(int), stream));
launch_nanospin(1'000'000, stream);

co_return DeviceBuffer<int>{d_seeds, static_cast<std::size_t>(nSeeds)};
}
Expand Down
14 changes: 14 additions & 0 deletions examples/nanospin.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#include "nanospin.hpp"

static __global__ void nanospin(std::uint64_t ns) {
long long start = clock64();
auto cps = 1'400'000'000LL; // Assuming 1.4 GHz clock rate
auto delta = (long long)((ns * cps) / 1'000'000'000ULL);
auto end = start + delta;
while (clock64() < end) {
}
}

void launch_nanospin(std::uint64_t ns, cudaStream_t stream) {
nanospin<<<1, 32, 0, stream>>>(ns);
}
6 changes: 6 additions & 0 deletions examples/nanospin.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#include <cuda_runtime_api.h>

#include <cstdint>

/// Launch a GPU spin for the specified duration in ns on the given CUDA stream.
void launch_nanospin(std::uint64_t ns, cudaStream_t stream);