diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index 7449bb153c9f7..39f19fe463745 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -75,6 +75,19 @@ namespace cms::alpakatools {
     }
   }
 
+  /* ElementIndex
+   *
+   * an aggregate that contains the .global and .local indices of an element; returned by iterating over elements_in_block.
+   */
+
+  struct ElementIndex {
+    Idx global;
+    Idx local;
+  };
+
+  /* elements_with_stride
+   */
+
   template <typename TAcc,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   class elements_with_stride {
   public:
@@ -326,6 +339,212 @@ namespace cms::alpakatools {
     const Vec extent_;
   };
 
+  /* blocks_with_stride
+   *
+   * `blocks_with_stride(acc, size)` returns a range that spans the (virtual) block indices required to cover the given
+   * problem size.
+   *
+   * For example, if size is 1000 and the block size is 16, it will return the range from 0 to 62.
+   * If the work division has more than 63 blocks, only the first 63 will perform one iteration of the loop, and the
+   * others will exit immediately.
+   * If the work division has fewer than 63 blocks, some of the blocks will perform more than one iteration, in order
+   * to cover the whole problem space.
+   *
+   * All threads in a block see the same loop iterations, while threads in different blocks may see a different number
+   * of iterations.
+   */
+
+  template <typename TAcc,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  class blocks_with_stride {
+  public:
+    ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc)
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          extent_{stride_} {}
+
+    // extent is the total number of elements (not blocks)
+    ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc, Idx extent)
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u])} {}
+
+    class iterator {
+      friend class blocks_with_stride;
+
+      ALPAKA_FN_ACC inline iterator(Idx stride, Idx extent, Idx first)
+          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        // increment the first-element-in-block index by the grid stride
+        first_ += stride_;
+        if (first_ < extent_)
+          return *this;
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (first_ == other.first_); }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Idx stride_;
+      Idx extent_;
+      // modified by the pre/post-increment operator
+      Idx first_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(stride_, extent_, extent_); }
+
+  private:
+    const Idx first_;
+    const Idx stride_;
+    const Idx extent_;
+  };
+
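+  /* Usage sketch, not part of the API: a grid-stride loop over the "virtual" blocks
+   * needed to cover the problem size; `acc` and `size` are assumed to be the kernel
+   * accelerator and the problem size.
+   *
+   *   for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+   *     // process the elements of this block, e.g. with elements_in_block (below)
+   *   }
+   */
+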
+  /* elements_in_block
+   *
+   * `elements_in_block(acc, block, size)` returns a range that spans all the elements within the given block.
+   * Iterating over the range yields values of type ElementIndex, which contain both the .global and .local indices
+   * of the corresponding element.
+   *
+   * If the work division has only one element per thread, the loop will perform at most one iteration.
+   * If the work division has more than one element per thread, the loop will perform that number of iterations,
+   * or fewer if it reaches size.
+   */
+
+  template <typename TAcc,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  class elements_in_block {
+  public:
+    ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block)
+        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
+          local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
+                 alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]} {}
+
+    ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block, Idx extent)
+        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
+          local_{std::min(extent - first_,
+                          alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
+                              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])},
+          range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])} {}
+
+    class iterator {
+      friend class elements_in_block;
+
+      ALPAKA_FN_ACC inline iterator(Idx local, Idx first, Idx range) : index_{local}, first_{first}, range_{range} {}
+
+    public:
+      ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {
+          // increment the index along the elements processed by the current thread
+          ++index_;
+          if (index_ < range_)
+            return *this;
+        }
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        index_ = range_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (index_ == other.index_); }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // modified by the pre/post-increment operator
+      Idx index_;
+      // non-const to support iterator copy and assignment
+      Idx first_;
+      Idx range_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(local_, first_, range_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(range_, first_, range_); }
+
+  private:
+    const Idx first_;
+    const Idx local_;
+    const Idx range_;
+  };
+
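+  /* Usage sketch, not part of the API: iterate over the elements of one virtual block;
+   * `in` is a hypothetical input buffer and `buffer` a per-block shared memory buffer.
+   * The .global index spans the whole problem space, the .local index spans the block.
+   *
+   *   for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+   *     buffer[index.local] = in[index.global];
+   *   }
+   */
+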
+ */ + + template >> + ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) { + return alpaka::getIdx(acc) == Vec>::zeros(); + } + } // namespace cms::alpakatools #endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc index c35965fa8793b..300f139b0c6e3 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc @@ -13,8 +13,6 @@ // each test binary is built for a single Alpaka backend using namespace ALPAKA_ACCELERATOR_NAMESPACE; -static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel) "]"; - struct VectorAddKernel { template ALPAKA_FN_ACC void operator()( @@ -58,233 +56,316 @@ struct VectorAddKernel3D { } }; -TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) { - SECTION("VectorAddKernel") { - // get the list of devices on the current platform - auto const& devices = cms::alpakatools::devices(); - if (devices.empty()) { - std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) - << ", the test will be skipped.\n"; - return; - } - - // random number generator with a gaussian distribution - std::random_device rd{}; - std::default_random_engine rand{rd()}; - std::normal_distribution dist{0., 1.}; - - // tolerance - constexpr float epsilon = 0.000001; - - // buffer size - constexpr size_t size = 1024 * 1024; - - // allocate input and output host buffers in pinned memory accessible by the Platform devices - auto in1_h = cms::alpakatools::make_host_buffer(size); - auto in2_h = cms::alpakatools::make_host_buffer(size); - auto out_h = cms::alpakatools::make_host_buffer(size); +/* This is not an efficient approach; it is only a test of using dynamic shared memory, + * split block and element loops, and block-level synchronisation + */ - // fill the input buffers with random data, and the output buffer with zeros - for (size_t i = 0; i < size; ++i) { - in1_h[i] = dist(rand); - in2_h[i] = dist(rand); - out_h[i] = 0.; +struct VectorAddBlockKernel { + template + ALPAKA_FN_ACC void operator()( + TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const { + // block size + auto const blockSize = alpaka::getWorkDiv(acc)[0u]; + // get the dynamic shared memory buffer + T* buffer = alpaka::getDynSharedMem(acc); + // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space + // the inner loop is needed for backends that use more than one element per thread + for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) { + // only one thread per block: initialise the shared memory + if (cms::alpakatools::once_per_block(acc)) { + // not really necessary, just to show how to use "once_per_block" + for (Idx local = 0; local < blockSize; ++local) + buffer[local] = 0.; + } + // synchronise all threads in the block + alpaka::syncBlockThreads(acc); + // read the first set of data into shared memory + for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) { + buffer[index.local] = in1[index.global]; + } + // synchronise all threads in the block + alpaka::syncBlockThreads(acc); + // add the second set of data into shared memory + for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) { + buffer[index.local] += in2[index.global]; + } 
 }  // namespace cms::alpakatools
 
 #endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
index c35965fa8793b..300f139b0c6e3 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -13,8 +13,6 @@
 // each test binary is built for a single Alpaka backend
 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
 
-static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel) "]";
-
 struct VectorAddKernel {
   template <typename TAcc, typename T>
   ALPAKA_FN_ACC void operator()(
@@ -58,233 +56,335 @@
   }
 };
 
-TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) {
-  SECTION("VectorAddKernel") {
-    // get the list of devices on the current platform
-    auto const& devices = cms::alpakatools::devices<Platform>();
-    if (devices.empty()) {
-      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
-                << ", the test will be skipped.\n";
-      return;
-    }
-
-    // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0., 1.};
-
-    // tolerance
-    constexpr float epsilon = 0.000001;
-
-    // buffer size
-    constexpr size_t size = 1024 * 1024;
-
-    // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+/* This is not an efficient approach; it is only a test of using dynamic shared memory,
+ * split block and element loops, and block-level synchronisation
+ */
 
-    // fill the input buffers with random data, and the output buffer with zeros
-    for (size_t i = 0; i < size; ++i) {
-      in1_h[i] = dist(rand);
-      in2_h[i] = dist(rand);
-      out_h[i] = 0.;
+struct VectorAddBlockKernel {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // block size
+    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
+    // get the dynamic shared memory buffer
+    T* buffer = alpaka::getDynSharedMem<T>(acc);
+    // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
+    // the inner loop is needed for backends that use more than one element per thread
+    for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+      // only one thread per block: initialise the shared memory
+      if (cms::alpakatools::once_per_block(acc)) {
+        // not really necessary, just to show how to use "once_per_block"
+        for (Idx local = 0; local < blockSize; ++local)
+          buffer[local] = 0.;
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // read the first set of data into shared memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        buffer[index.local] = in1[index.global];
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // add the second set of data into shared memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        buffer[index.local] += in2[index.global];
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // store the results into global memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        out[index.global] = buffer[index.local];
+      }
     }
+  }
+};
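+
+/* Launch sketch, not part of this test: the kernel above is launched like any other
+ * kernel; alpaka sizes the per-block dynamic shared memory by calling the
+ * BlockSharedMemDynSizeBytes trait specialisation defined further below
+ * (`queue`, `div` and the device buffers are assumed to exist):
+ *
+ *   alpaka::exec<Acc1D>(queue, div, VectorAddBlockKernel{}, in1, in2, out, size);
+ */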
 
-    // run the test on each device
-    for (auto const& device : devices) {
-      std::cout << "Test 1D vector addition on " << alpaka::getName(device) << '\n';
-      auto queue = Queue(device);
-
-      // allocate input and output buffers on the device
-      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-
-      // copy the input data to the device; the size is known from the buffer objects
-      alpaka::memcpy(queue, in1_d, in1_h);
-      alpaka::memcpy(queue, in2_d, in2_h);
-
-      // fill the output buffer with zeros; the size is known from the buffer objects
-      alpaka::memset(queue, out_d, 0.);
-
-      // launch the 1-dimensional kernel with scalar size
-      auto div = cms::alpakatools::make_workdiv<Acc1D>(4, 4);
-      alpaka::exec<Acc1D>(queue, div, VectorAddKernel{}, in1_d.data(), in2_d.data(), out_d.data(), size);
-
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
-
-      // wait for all the operations to complete
-      alpaka::wait(queue);
+/* Run all operations in a single thread.
+ * Written in an inefficient way to test "once_per_grid".
+ */
 
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
+struct VectorAddKernelSerial {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // the operations are performed by a single thread
+    if (cms::alpakatools::once_per_grid(acc)) {
+      for (Idx index = 0; index < size; ++index) {
+        out[index] += in1[index];
+        out[index] += in2[index];
       }
+    }
+  }
+};
 
-      // reset the output buffer on the device to all zeros
-      alpaka::memset(queue, out_d, 0.);
-
-      // launch the 1-dimensional kernel with vector size
-      alpaka::exec<Acc1D>(queue, div, VectorAddKernel1D{}, in1_d.data(), in2_d.data(), out_d.data(), size);
-
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
-
-      // wait for all the operations to complete
-      alpaka::wait(queue);
+/* Run all operations in one thread per block.
+ * Written in an inefficient way to test "once_per_block".
+ */
 
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
+struct VectorAddKernelBlockSerial {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // block size
+    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
+    // the loop is used to repeat the "block" as many times as needed to cover the whole problem space
+    for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+      // the operations are performed by a single thread in each "logical" block
+      const auto first = blockSize * block;
+      const auto range = std::min<size_t>(first + blockSize, size);
+      if (cms::alpakatools::once_per_block(acc)) {
+        for (Idx index = first; index < range; ++index) {
+          out[index] += in1[index];
+          out[index] += in2[index];
+        }
       }
     }
   }
-}
+};
 
-TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel2D), s_tag) {
-  SECTION("VectorAddKernel2D") {
-    // get the list of devices on the current platform
-    auto const& devices = cms::alpakatools::devices<Platform>();
-    if (devices.empty()) {
-      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
-                << ", the test will be skipped.\n";
-      return;
+namespace alpaka::trait {
+  // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
+  // block shared dynamic memory for the VectorAddBlockKernel kernel
+  template <typename TAcc>
+  struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, TAcc> {
+    // the size in bytes of the shared memory allocated for a block
+    template <typename T>
+    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(VectorAddBlockKernel const& /* kernel */,
+                                                                        Vec1D threads,
+                                                                        Vec1D elements,
+                                                                        T const* __restrict__ /* in1 */,
+                                                                        T const* __restrict__ /* in2 */,
+                                                                        T* __restrict__ /* out */,
+                                                                        size_t size) {
+      return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
     }
+  };
+}  // namespace alpaka::trait
+
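+/* For example, with a work division of 32 threads per block, 4 elements per thread,
+ * and T = float, the trait above reserves 32 * 4 * sizeof(float) = 512 bytes of
+ * dynamic shared memory per block.
+ */
+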
+// test the 1-dimensional kernel on all devices
+template <typename TKernel>
+void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
+  // random number generator with a gaussian distribution
+  std::random_device rd{};
+  std::default_random_engine rand{rd()};
+  std::normal_distribution<float> dist{0., 1.};
+
+  // tolerance
+  constexpr float epsilon = 0.000001;
+
+  // buffer size
+  const size_t size = problem_size;
+
+  // allocate input and output host buffers in pinned memory accessible by the Platform devices
+  auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
+  auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
+  auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+
+  // fill the input buffers with random data, and the output buffer with zeros
+  for (size_t i = 0; i < size; ++i) {
+    in1_h[i] = dist(rand);
+    in2_h[i] = dist(rand);
+    out_h[i] = 0.;
+  }
 
-    // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0., 1.};
+  // run the test on each device
+  for (auto const& device : cms::alpakatools::devices<Platform>()) {
+    std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " over " << problem_size << " values with "
+              << grid_size << " blocks of " << block_size << " elements\n";
+    auto queue = Queue(device);
 
-    // tolerance
-    constexpr float epsilon = 0.000001;
+    // allocate input and output buffers on the device
+    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
 
-    // 3-dimensional and linearised buffer size
-    constexpr Vec2D ndsize = {16, 16};
-    constexpr size_t size = ndsize.prod();
+    // copy the input data to the device; the size is known from the buffer objects
+    alpaka::memcpy(queue, in1_d, in1_h);
+    alpaka::memcpy(queue, in2_d, in2_h);
 
-    // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::memset(queue, out_d, 0.);
 
-    // fill the input buffers with random data, and the output buffer with zeros
+    // launch the 1-dimensional kernel with scalar size
+    auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
+    alpaka::exec<Acc1D>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), size);
+
+    // copy the results from the device to the host
+    alpaka::memcpy(queue, out_h, out_d);
+
+    // wait for all the operations to complete
+    alpaka::wait(queue);
+
+    // check the results
     for (size_t i = 0; i < size; ++i) {
-      in1_h[i] = dist(rand);
-      in2_h[i] = dist(rand);
-      out_h[i] = 0.;
+      float sum = in1_h[i] + in2_h[i];
+      REQUIRE(out_h[i] < sum + epsilon);
+      REQUIRE(out_h[i] > sum - epsilon);
     }
+  }
+}
+
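+/* Note, for readers of this test: cms::alpakatools::make_workdiv<Acc1D>(blocks, elements)
+ * adapts the work division to the backend in use; on GPU-like backends each block runs
+ * `elements` threads with one element per thread, while on single-threaded CPU backends
+ * each block runs one thread that processes `elements` elements.
+ */
+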
+// test the N-dimensional kernels on all devices
+template <typename TDim, typename TKernel>
+void testVectorAddKernelND(Vec<TDim> problem_size, Vec<TDim> grid_size, Vec<TDim> block_size, TKernel kernel) {
+  // random number generator with a gaussian distribution
+  std::random_device rd{};
+  std::default_random_engine rand{rd()};
+  std::normal_distribution<float> dist{0., 1.};
+
+  // tolerance
+  constexpr float epsilon = 0.000001;
+
+  // linearised buffer size
+  const size_t size = problem_size.prod();
+
+  // allocate input and output host buffers in pinned memory accessible by the Platform devices
+  auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
+  auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
+  auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+
+  // fill the input buffers with random data, and the output buffer with zeros
+  for (size_t i = 0; i < size; ++i) {
+    in1_h[i] = dist(rand);
+    in2_h[i] = dist(rand);
+    out_h[i] = 0.;
+  }
 
-    // run the test on each device
-    for (auto const& device : devices) {
-      std::cout << "Test 2D vector addition on " << alpaka::getName(device) << '\n';
-      auto queue = Queue(device);
+  // run the test on each device
+  for (auto const& device : cms::alpakatools::devices<Platform>()) {
+    std::cout << "Test " << TDim::value << "D vector addition on " << alpaka::getName(device) << " over "
+              << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n";
+    auto queue = Queue(device);
 
-      // allocate input and output buffers on the device
-      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    // allocate input and output buffers on the device
+    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
 
-      // copy the input data to the device; the size is known from the buffer objects
-      alpaka::memcpy(queue, in1_d, in1_h);
-      alpaka::memcpy(queue, in2_d, in2_h);
+    // copy the input data to the device; the size is known from the buffer objects
+    alpaka::memcpy(queue, in1_d, in1_h);
+    alpaka::memcpy(queue, in2_d, in2_h);
 
-      // fill the output buffer with zeros; the size is known from the buffer objects
-      alpaka::memset(queue, out_d, 0.);
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::memset(queue, out_d, 0.);
 
-      // launch the 3-dimensional kernel
-      auto div = cms::alpakatools::make_workdiv<Acc2D>({4, 4}, {32, 32});
-      alpaka::exec<Acc2D>(queue, div, VectorAddKernel2D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);
+    // launch the N-dimensional kernel
+    using AccND = Acc<TDim>;
+    auto div = cms::alpakatools::make_workdiv<AccND>(grid_size, block_size);
+    alpaka::exec<AccND>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), problem_size);
 
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
+    // copy the results from the device to the host
+    alpaka::memcpy(queue, out_h, out_d);
 
-      // wait for all the operations to complete
-      alpaka::wait(queue);
+    // wait for all the operations to complete
+    alpaka::wait(queue);
 
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
-      }
+    // check the results
+    for (size_t i = 0; i < size; ++i) {
+      float sum = in1_h[i] + in2_h[i];
+      REQUIRE(out_h[i] < sum + epsilon);
+      REQUIRE(out_h[i] > sum - epsilon);
     }
   }
 }
 
-TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) {
-  SECTION("VectorAddKernel3D") {
+TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
+          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
+  SECTION("Alpaka N-dimensional kernels") {
     // get the list of devices on the current platform
     auto const& devices = cms::alpakatools::devices<Platform>();
     if (devices.empty()) {
-      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
-                << ", the test will be skipped.\n";
-      return;
+      INFO("No devices available on the platform " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE));
+      REQUIRE(not devices.empty());
     }
 
-    // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0., 1.};
-
-    // tolerance
-    constexpr float epsilon = 0.000001;
-
-    // 3-dimensional and linearised buffer size
-    constexpr Vec3D ndsize = {50, 125, 16};
-    constexpr size_t size = ndsize.prod();
-
-    // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
-
-    // fill the input buffers with random data, and the output buffer with zeros
-    for (size_t i = 0; i < size; ++i) {
-      in1_h[i] = dist(rand);
-      in2_h[i] = dist(rand);
-      out_h[i] = 0.;
-    }
-
-    // run the test on each device
-    for (auto const& device : devices) {
-      std::cout << "Test 3D vector addition on " << alpaka::getName(device) << '\n';
-      auto queue = Queue(device);
-
-      // allocate input and output buffers on the device
-      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-
-      // copy the input data to the device; the size is known from the buffer objects
-      alpaka::memcpy(queue, in1_d, in1_h);
-      alpaka::memcpy(queue, in2_d, in2_h);
-
-      // fill the output buffer with zeros; the size is known from the buffer objects
-      alpaka::memset(queue, out_d, 0.);
-
-      // launch the 3-dimensional kernel
-      auto div = cms::alpakatools::make_workdiv<Acc3D>({5, 5, 1}, {4, 4, 4});
-      alpaka::exec<Acc3D>(queue, div, VectorAddKernel3D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);
-
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
-
-      // wait for all the operations to complete
-      alpaka::wait(queue);
-
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
-      }
-    }
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddKernel{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddKernel{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector addition with small block size\n";
+    testVectorAddKernelND<Dim1D>({10000}, {32}, {32}, VectorAddKernel1D{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector addition with large block size\n";
+    testVectorAddKernelND<Dim1D>({100}, {1}, {1024}, VectorAddKernel1D{});
+
+    // launch the 2-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 2D vector addition with small block size\n";
+    testVectorAddKernelND<Dim2D>({400, 250}, {4, 4}, {16, 16}, VectorAddKernel2D{});
+
+    // launch the 2-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 2D vector addition with large block size\n";
+    testVectorAddKernelND<Dim2D>({20, 20}, {1, 1}, {32, 32}, VectorAddKernel2D{});
+
+    // launch the 3-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 3D vector addition with small block size\n";
+    testVectorAddKernelND<Dim3D>({50, 125, 16}, {5, 5, 1}, {4, 4, 4}, VectorAddKernel3D{});
+
+    // launch the 3-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 3D vector addition with large block size\n";
+    testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector block-level addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddBlockKernel{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector single-threaded serial addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddKernelSerial{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector single-threaded serial addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddKernelSerial{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector block-level serial addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddKernelBlockSerial{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{});
   }
 }