diff --git a/src/alpaka/AlpakaCore/CachingDeviceAllocator.h b/src/alpaka/AlpakaCore/CachingDeviceAllocator.h new file mode 100644 index 000000000..9d1407ac9 --- /dev/null +++ b/src/alpaka/AlpakaCore/CachingDeviceAllocator.h @@ -0,0 +1,444 @@ +#ifndef HeterogenousCore_AlpakaUtilities_src_CachingDeviceAllocator_h +#define HeterogenousCore_AlpakaUtilities_src_CachingDeviceAllocator_h + +/****************************************************************************** + * Simple caching allocator for device memory allocations. The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + +#include +#include +#include +#include + +#include "AlpakaCore/alpakaMemoryHelper.h" +#include "AlpakaCore/deviceAllocatorStatus.h" + +/// cms::alpakatools::allocator namespace +namespace cms::alpakatools::allocator { + + /** + * \addtogroup UtilMgmt + * @{ + */ + + /****************************************************************************** + * CachingDeviceAllocator (host use) + ******************************************************************************/ + + /** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and is capable of managing cached + * device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. + * + * \par + * For example, the default-constructed CachingDeviceAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ + struct CachingDeviceAllocator { + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int)-1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t)-1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor { + ::ALPAKA_ACCELERATOR_NAMESPACE::AlpakaDeviceBuf buf; // Device buffer + size_t bytes; // Size of allocation in bytes + size_t bytesRequested; // CMS: requested allocation size (for monitoring only) + unsigned int bin; // Bin enumeration + + // Constructor (suitable for searching maps for a block, given a device and bytes) + BlockDescriptor(unsigned int block_bin, + size_t block_bytes, + size_t bytes_requested, + const ::ALPAKA_ACCELERATOR_NAMESPACE::Device& device) + : buf{cms::alpakatools::allocDeviceBuf(device, 0u)}, + bytes{block_bytes}, + bytesRequested{bytes_requested}, // CMS + bin{block_bin} {} + + // Constructor (suitable for searching maps for a specific block, given a device buffer) + BlockDescriptor(::ALPAKA_ACCELERATOR_NAMESPACE::AlpakaDeviceBuf buffer) + : buf{std::move(buffer)}, + bytes{0}, + bytesRequested{0}, // CMS + bin{INVALID_BIN} {} + }; + + struct BlockHashByBytes { + size_t operator()(const BlockDescriptor& descriptor) const { + size_t h1 = std::hash{}(getIdxOfDev(alpaka::getDev(descriptor.buf))); + size_t h2 = std::hash{}(descriptor.bytes); + return h1 ^ (h2 << 1); + } + }; + + struct BlockEqualByBytes { + bool operator()(const BlockDescriptor& a, const BlockDescriptor& b) const { + return (getIdxOfDev(alpaka::getDev(a.buf)) == getIdxOfDev(alpaka::getDev(b.buf)) && a.bytes == b.bytes); + } + }; + + struct BlockHashByPtr { + size_t operator()(const BlockDescriptor& descriptor) const { + size_t h1 = std::hash{}(getIdxOfDev(alpaka::getDev(descriptor.buf))); + size_t h2 = std::hash{}(alpaka::getPtrNative(descriptor.buf)); + return h1 ^ (h2 << 1); + } + }; + + struct BlockEqualByPtr { + bool operator()(const BlockDescriptor& a, const BlockDescriptor& b) const { + return (getIdxOfDev(alpaka::getDev(a.buf)) == getIdxOfDev(alpaka::getDev(b.buf)) && + alpaka::getPtrNative(a.buf) == alpaka::getPtrNative(b.buf)); + } + }; + + // CMS: Moved TotalBytes to deviceAllocatorStatus.h + + /// Set type for cached blocks (hashed by size) + using CachedBlocks = std::unordered_multiset; + + /// Set type for live blocks (hashed by ptr) + using BusyBlocks = std::unordered_multiset; + + // CMS: Moved DeviceCachedBytes to deviceAllocatorStatus.h + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static constexpr unsigned int IntPow(unsigned int base, unsigned int exp) { + unsigned int retval = 1; + while (exp > 0) { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + /** + * Round up to the nearest power-of + */ + std::pair NearestPowerOf(unsigned int base, size_t value) { + unsigned int power = 0; + size_t rounded_bytes = 1; + + if (value * base < value) { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + } else { + while (rounded_bytes < value) { + rounded_bytes *= base; + power++; + } + } + + return {power, rounded_bytes}; + } + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + // CMS: use std::mutex instead of cub::Mutex, declare mutable + mutable std::mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + bool debug; /// Whether or not to print (de)allocation events to stdout + + DeviceCachedBytes cached_bytes; /// Map of device to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + debug(debug) {} + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingDeviceAllocator(/*bool skip_cleanup = false, */ bool debug = false) + : bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + debug(debug) {} + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + void SetMaxCachedBytes(size_t max_cached_bytes) { + // Lock + std::unique_lock mutex_locker(mutex); + + if (debug) { + // CMS: use raw printf + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); + } + + this->max_cached_bytes = max_cached_bytes; + + mutex_locker.unlock(); + } + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse. + */ + auto DeviceAllocate(size_t bytes, ///< [in] Minimum no. of bytes for the allocation + const ::ALPAKA_ACCELERATOR_NAMESPACE::Device& + device) ///< [in] The device to be associated with this allocation + { + std::unique_lock mutex_locker(mutex, std::defer_lock); + int device_idx = getIdxOfDev(device); + + // Create a block descriptor for the requested allocation + bool found = false; + auto [bin, bin_bytes] = NearestPowerOf(bin_growth, bytes); + BlockDescriptor search_key{bin, bin_bytes, bytes, device}; + + if (search_key.bin > max_bin) { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } else { + // Search for a suitable cached allocation: lock + mutex_locker.lock(); + + if (search_key.bin < min_bin) { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Find a cached block on the same device in the same bin + auto block_itr = cached_blocks.find(search_key); + if (block_itr != cached_blocks.end()) { + // Reuse existing cache block. Insert into live blocks. + found = true; + search_key = *block_itr; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes[device_idx].free -= search_key.bytes; + cached_bytes[device_idx].live += search_key.bytes; + cached_bytes[device_idx].liveRequested += search_key.bytesRequested; // CMS + + if (debug) { + // CMS: improved debug message + // CMS: use raw printf + printf("\tDevice %d reused cached block at %p (%lld bytes).\n", + device_idx, + alpaka::getPtrNative(search_key.buf), + (long long)search_key.bytes); + } + + cached_blocks.erase(block_itr); + } + // Done searching: unlock + mutex_locker.unlock(); + } + + // Allocate the block if necessary + if (!found) { + search_key.buf = alpaka::allocBuf( + device, static_cast(search_key.bytes)); +#if CUDA_VERSION >= 11020 + alpaka::prepareForAsyncCopy(search_key.buf); +#endif + + // Insert into live blocks + mutex_locker.lock(); + live_blocks.insert(search_key); + cached_bytes[device_idx].live += search_key.bytes; + cached_bytes[device_idx].liveRequested += search_key.bytesRequested; // CMS + mutex_locker.unlock(); + + if (debug) { + // CMS: improved debug message + // CMS: use raw printf + printf("\tDevice %d allocated new device block at %p (%lld bytes).\n", + device_idx, + alpaka::getPtrNative(search_key.buf), + (long long)search_key.bytes); + } + } + + if (debug) { + // CMS: use raw printf + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes[device_idx].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device_idx].live); + } + + return search_key.buf; + } + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + */ + void DeviceFree(const ::ALPAKA_ACCELERATOR_NAMESPACE::AlpakaDeviceBuf& buf) { + // Lock + std::unique_lock mutex_locker(mutex); + + bool recached = false; + int device_idx = getIdxOfDev(alpaka::getDev(buf)); + // Find corresponding block descriptor + BlockDescriptor search_key{buf}; + auto block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device_idx].live -= search_key.bytes; + cached_bytes[device_idx].liveRequested -= search_key.bytesRequested; // CMS + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes[device_idx].free + search_key.bytes <= max_cached_bytes)) { + recached = true; + // Insert returned allocation into free blocks + cached_blocks.insert(search_key); + cached_bytes[device_idx].free += search_key.bytes; + + if (debug) { + // CMS: improved debug message + // CMS: use raw printf + printf( + "\tDevice %d returned %lld bytes at %p.\n\t\t %lld available " + "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + device_idx, + (long long)search_key.bytes, + alpaka::getPtrNative(search_key.buf), + (long long)cached_blocks.size(), + (long long)cached_bytes[device_idx].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device_idx].live); + } + } + } + + // Unlock + mutex_locker.unlock(); + + if (!recached and debug) { + // CMS: improved debug message + printf( + "\tDevice %d freed %lld bytes at %p.\n\t\t %lld available " + "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device_idx, + (long long)search_key.bytes, + alpaka::getPtrNative(search_key.buf), + (long long)cached_blocks.size(), + (long long)cached_bytes[device_idx].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device_idx].live); + } + } + + // CMS: give access to cache allocation status + DeviceCachedBytes CacheStatus() const { + std::unique_lock mutex_locker(mutex); + return cached_bytes; + } + }; + + /** @} */ // end group UtilMgmt + +} // namespace cms::alpakatools::allocator + +#endif diff --git a/src/alpaka/AlpakaCore/CachingHostAllocator.h b/src/alpaka/AlpakaCore/CachingHostAllocator.h new file mode 100644 index 000000000..17d63f3a5 --- /dev/null +++ b/src/alpaka/AlpakaCore/CachingHostAllocator.h @@ -0,0 +1,398 @@ +#ifndef HeterogenousCore_AlpakaUtilities_src_CachingHostAllocator_h +#define HeterogenousCore_AlpakaUtilities_src_CachingHostAllocator_h + +/****************************************************************************** + * Simple caching allocator for pinned host memory allocations. The allocator is + * thread-safe. + ******************************************************************************/ + +#include +#include +#include +#include + +#include "AlpakaCore/alpakaMemoryHelper.h" +#include "AlpakaCore/deviceAllocatorStatus.h" + +/// cms::alpaka::allocator namespace +namespace cms::alpakatools::allocator { + + /** + * \addtogroup UtilMgmt + * @{ + */ + + /****************************************************************************** + * CachingHostAllocator (host use) + ******************************************************************************/ + + /** + * \brief A simple caching allocator pinned host memory allocations. + * + * \par Overview + * The allocator is thread-safe. It behaves as follows: + * + * \par + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused host allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations will exceed + * \p max_cached_bytes, allocations are simply freed when they are + * deallocated instead of being returned to their bin-cache. + * + * \par + * For example, the default-constructed CachingHostAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes + * + */ + + struct CachingHostAllocator { + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int)-1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t)-1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for pinned host memory allocations + */ + struct BlockDescriptor { + alpaka_common::AlpakaHostBuf buf; // Host buffer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + + // Constructor (suitable for searching maps for a specific block, given a host buffer) + BlockDescriptor(alpaka_common::AlpakaHostBuf buffer) + : buf{std::move(buffer)}, bytes{0}, bin{INVALID_BIN} {} + + // Constructor (suitable for searching maps for a block, given the bytes) + BlockDescriptor(unsigned int block_bin, size_t block_bytes) + : buf{allocHostBuf(0u)}, bytes{block_bytes}, bin{block_bin} {} + }; + + struct BlockHashByBytes { + size_t operator()(const BlockDescriptor& descriptor) const { return std::hash{}(descriptor.bytes); } + }; + + struct BlockEqualByBytes { + bool operator()(const BlockDescriptor& a, const BlockDescriptor& b) const { return (a.bytes == b.bytes); } + }; + + struct BlockHashByPtr { + size_t operator()(const BlockDescriptor& descriptor) const { + return std::hash{}(alpaka::getPtrNative(descriptor.buf)); + } + }; + + struct BlockEqualByPtr { + bool operator()(const BlockDescriptor& a, const BlockDescriptor& b) const { + return (alpaka::getPtrNative(a.buf) == alpaka::getPtrNative(b.buf)); + } + }; + + /// Set type for cached blocks (hashed by size) + using CachedBlocks = std::unordered_multiset; + + /// Set type for live blocks (hashed by ptr) + using BusyBlocks = std::unordered_multiset; + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow(unsigned int base, unsigned int exp) { + unsigned int retval = 1; + while (exp > 0) { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + /** + * Round up to the nearest power-of + */ + std::pair NearestPowerOf(unsigned int base, size_t value) { + unsigned int power = 0; + size_t rounded_bytes = 1; + + if (value * base < value) { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + } else { + while (rounded_bytes < value) { + rounded_bytes *= base; + power++; + } + } + + return {power, rounded_bytes}; + } + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + std::mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes + + bool debug; /// Whether or not to print (de)allocation events to stdout + + TotalBytes cached_bytes; /// Aggregate cached bytes + CachedBlocks cached_blocks; /// Set of cached pinned host allocations available for reuse + BusyBlocks live_blocks; /// Set of live pinned host allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingHostAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes (default is no limit) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + debug(debug) {} + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes + */ + CachingHostAllocator(bool debug = false) + : bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + debug(debug) {} + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + void SetMaxCachedBytes(size_t max_cached_bytes) { + // Lock + std::unique_lock mutex_locker(mutex); + + if (debug) { + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, + (long long)max_cached_bytes); + } + + this->max_cached_bytes = max_cached_bytes; + + // Unlock (redundant, kept for style uniformity) + mutex_locker.unlock(); + } + + /** + * \brief Provides a suitable allocation of pinned host memory for the given size. + * + * Once freed, the allocation becomes available immediately for reuse. + */ + auto HostAllocate(size_t bytes ///< [in] Minimum no. of bytes for the allocation + ) { + std::unique_lock mutex_locker(mutex, std::defer_lock); + + // Create a block descriptor for the requested allocation + bool found = false; + auto [bin, bin_bytes] = NearestPowerOf(bin_growth, bytes); + BlockDescriptor search_key{bin, bin_bytes}; + + if (search_key.bin > max_bin) { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } else { + // Search for a suitable cached allocation: lock + mutex_locker.lock(); + + if (search_key.bin < min_bin) { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Find a cached block in the same bin + auto block_itr = cached_blocks.find(search_key); + if (block_itr != cached_blocks.end()) { + // Reuse existing cache block. Insert into live blocks. + found = true; + search_key = *block_itr; + + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes.free -= search_key.bytes; + cached_bytes.live += search_key.bytes; + + if (debug) { + printf("\tHost reused cached block at %p (%lld bytes).\n", + alpaka::getPtrNative(search_key.buf), + (long long)search_key.bytes); + } + + cached_blocks.erase(block_itr); + } + + // Done searching: unlock + mutex_locker.unlock(); + } + + // Allocate the block if necessary + if (!found) { + // TODO: eventually support allocation flags + search_key.buf = allocHostBuf(static_cast(search_key.bytes)); +#if CUDA_VERSION >= 11020 + alpaka::prepareForAsyncCopy(search_key.buf); +#endif + + // Insert into live blocks + mutex_locker.lock(); + live_blocks.insert(search_key); + cached_bytes.live += search_key.bytes; + mutex_locker.unlock(); + + if (debug) { + printf("\tHost allocated new host block at %p (%lld bytes).\n", + alpaka::getPtrNative(search_key.buf), + (long long)search_key.bytes); + } + } + + if (debug) { + printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); + } + + return search_key.buf; + } + + /** + * \brief Frees a live allocation of pinned host memory, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse. + */ + void HostFree(const alpaka_common::AlpakaHostBuf& buf) { + // Lock + std::unique_lock mutex_locker(mutex); + + bool recached = false; + // Find corresponding block descriptor + BlockDescriptor search_key{buf}; + auto block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes.live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) { + recached = true; + // Insert returned allocation into free blocks + cached_blocks.insert(search_key); + cached_bytes.free += search_key.bytes; + + if (debug) { + printf( + "\tHost returned %lld bytes.\n\t\t %lld " + "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + (long long)search_key.bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); + } + } + } + + // Unlock + mutex_locker.unlock(); + + if (!recached and debug) { + printf( + "\tHost freed %lld bytes.\n\t\t %lld available " + "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long)search_key.bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes.free, + (long long)live_blocks.size(), + (long long)cached_bytes.live); + } + } + }; + + /** @} */ // end group UtilMgmt + +} // namespace cms::alpakatools::allocator + +#endif diff --git a/src/alpaka/AlpakaCore/ESProduct.h b/src/alpaka/AlpakaCore/ESProduct.h index c478cabbb..af50fdc5b 100644 --- a/src/alpaka/AlpakaCore/ESProduct.h +++ b/src/alpaka/AlpakaCore/ESProduct.h @@ -20,7 +20,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { ESProduct() : gpuDataPerDevice_(::ALPAKA_ACCELERATOR_NAMESPACE::devices.size()) { for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { - gpuDataPerDevice_[i].m_event = ::cms::alpakatools::getEventCache().get(::ALPAKA_ACCELERATOR_NAMESPACE::devices[i]); + gpuDataPerDevice_[i].m_event = + ::cms::alpakatools::getEventCache().get(::ALPAKA_ACCELERATOR_NAMESPACE::devices[i]); } } diff --git a/src/alpaka/AlpakaCore/alpaka/alpakaDevAcc.cc b/src/alpaka/AlpakaCore/alpaka/alpakaDevAcc.cc index 626761a8d..1de7da53e 100644 --- a/src/alpaka/AlpakaCore/alpaka/alpakaDevAcc.cc +++ b/src/alpaka/AlpakaCore/alpaka/alpakaDevAcc.cc @@ -2,8 +2,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { - static - std::vector enumerate() { + static std::vector enumerate() { std::vector devices; uint32_t n = alpaka::getDevCount(); devices.reserve(n); @@ -14,4 +13,4 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } const std::vector devices = enumerate(); -} +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpakaMemoryHelper.h b/src/alpaka/AlpakaCore/alpakaMemoryHelper.h index 9219b11a8..069146b78 100644 --- a/src/alpaka/AlpakaCore/alpakaMemoryHelper.h +++ b/src/alpaka/AlpakaCore/alpakaMemoryHelper.h @@ -34,6 +34,11 @@ namespace cms::alpakatools { return alpaka::ViewPlainPtr(data, device, extent); } + template + inline size_t nbytesFromExtent(const Extent& extent) { + return (sizeof(TData) * extent); + } + } // namespace cms::alpakatools #endif // AlpakaCore_alpakaMemoryHelper_h diff --git a/src/alpaka/AlpakaCore/deviceAllocatorStatus.h b/src/alpaka/AlpakaCore/deviceAllocatorStatus.h new file mode 100644 index 000000000..41b82b524 --- /dev/null +++ b/src/alpaka/AlpakaCore/deviceAllocatorStatus.h @@ -0,0 +1,29 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_deviceAllocatorStatus_h +#define HeterogeneousCore_AlpakaUtilities_deviceAllocatorStatus_h + +#include + +#include "AlpakaCore/alpakaConfig.h" + +namespace cms { + namespace alpakatools { + namespace allocator { + struct TotalBytes { + size_t free; + size_t live; + size_t liveRequested; // CMS: monitor also requested amount + TotalBytes() { free = live = liveRequested = 0; } + }; + + inline int getIdxOfDev(const ::ALPAKA_ACCELERATOR_NAMESPACE::Device& device) { + static const auto devices{alpaka::getDevs<::ALPAKA_ACCELERATOR_NAMESPACE::Platform>()}; + return (std::find(devices.begin(), devices.end(), device) - devices.begin()); + } + + // Map device index to the number of bytes cached by it + using DeviceCachedBytes = std::map; + } // namespace allocator + } // namespace alpakatools +} // namespace cms + +#endif diff --git a/src/alpaka/AlpakaCore/device_unique_ptr.h b/src/alpaka/AlpakaCore/device_unique_ptr.h new file mode 100644 index 000000000..6c7501db9 --- /dev/null +++ b/src/alpaka/AlpakaCore/device_unique_ptr.h @@ -0,0 +1,72 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_interface_device_unique_ptr_h +#define HeterogeneousCore_AlpakaUtilities_interface_device_unique_ptr_h + +#include +#include + +#include "AlpakaCore/host_unique_ptr.h" + +namespace cms { + namespace alpakatools { + namespace device { + namespace impl { + template + class DeviceDeleter { + public: + DeviceDeleter(::ALPAKA_ACCELERATOR_NAMESPACE::AlpakaDeviceBuf buffer) : buf{std::move(buffer)} {} + + void operator()(void* d_ptr) { + if constexpr (allocator::policy == allocator::Policy::Caching) { + if (d_ptr) { + allocator::getCachingDeviceAllocator().DeviceFree(buf); + } + } + } + + private: + ::ALPAKA_ACCELERATOR_NAMESPACE::AlpakaDeviceBuf buf; + }; + } // namespace impl + template + using unique_ptr = +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + std::unique_ptr>>; +#else + host::unique_ptr; +#endif + } // namespace device + + template + auto make_device_unique(const alpaka_common::Extent& extent) { + const auto& device = ::ALPAKA_ACCELERATOR_NAMESPACE::devices[0]; +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + if constexpr (allocator::policy == allocator::Policy::Caching) { + const alpaka_common::Extent nbytes = alpakatools::nbytesFromExtent(extent); + if (nbytes > maxAllocationSize) { + throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) + + " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); + } + auto buf = allocator::getCachingDeviceAllocator().DeviceAllocate(nbytes, device); + void* d_ptr = alpaka::getPtrNative(buf); + return typename device::unique_ptr{reinterpret_cast(d_ptr), + device::impl::DeviceDeleter{buf}}; + } else { + auto buf = alpaka::allocBuf(device, extent); +#if CUDA_VERSION >= 11020 + if constexpr (allocator::policy == allocator::Policy::Asynchronous) { + alpaka::prepareForAsyncCopy(buf); + } +#endif + TData* d_ptr = alpaka::getPtrNative(buf); + return typename device::unique_ptr{d_ptr, device::impl::DeviceDeleter{buf}}; + } +#else + return make_host_unique(extent); +#endif + } + } // namespace alpakatools +} // namespace cms + +#endif diff --git a/src/alpaka/AlpakaCore/getCachingDeviceAllocator.h b/src/alpaka/AlpakaCore/getCachingDeviceAllocator.h new file mode 100644 index 000000000..f19acc269 --- /dev/null +++ b/src/alpaka/AlpakaCore/getCachingDeviceAllocator.h @@ -0,0 +1,72 @@ +#ifndef HeterogeneousCore_AlpakaCore_src_getCachingDeviceAllocator +#define HeterogeneousCore_AlpakaCore_src_getCachingDeviceAllocator + +#include +#include + +#include "AlpakaCore/CachingDeviceAllocator.h" + +namespace cms::alpakatools::allocator { + // Use caching or not + enum class Policy { Synchronous = 0, Asynchronous = 1, Caching = 2 }; +#ifndef ALPAKA_DISABLE_CACHING_ALLOCATOR + constexpr Policy policy = Policy::Caching; +#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR + constexpr Policy policy = Policy::Asynchronous; +#else + constexpr Policy policy = Policy::Synchronous; +#endif + // Growth factor (bin_growth in CachingDeviceAllocator + constexpr unsigned int binGrowth = 2; + // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in CacingDeviceAllocator + constexpr unsigned int minBin = 8; + // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in CachingDeviceAllocator). Note that unlike in allocator, allocations larger than binGrowth^maxBin are set to fail. + constexpr unsigned int maxBin = 30; + // Total storage for the allocator. 0 means no limit. + constexpr size_t maxCachedBytes = 0; + // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. + constexpr double maxCachedFraction = 0.8; + constexpr bool debug = false; + + inline size_t minCachedBytes() { + size_t ret = std::numeric_limits::max(); + const auto devices{alpaka::getDevs<::ALPAKA_ACCELERATOR_NAMESPACE::Platform>()}; + for (const auto& device : devices) { + const size_t freeMemory{alpaka::getFreeMemBytes(device)}; + ret = std::min(ret, static_cast(maxCachedFraction * freeMemory)); + } + if (maxCachedBytes > 0) { + ret = std::min(ret, maxCachedBytes); + } + return ret; + } + + inline CachingDeviceAllocator& getCachingDeviceAllocator() { + if (debug) { + std::cout << "CachingDeviceAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + + // the public interface is thread safe + static CachingDeviceAllocator allocator{binGrowth, minBin, maxBin, minCachedBytes(), debug}; + return allocator; + } +} // namespace cms::alpakatools::allocator + +#endif diff --git a/src/alpaka/AlpakaCore/getCachingHostAllocator.h b/src/alpaka/AlpakaCore/getCachingHostAllocator.h new file mode 100644 index 000000000..db075b4d6 --- /dev/null +++ b/src/alpaka/AlpakaCore/getCachingHostAllocator.h @@ -0,0 +1,39 @@ +#ifndef HeterogeneousCore_AlpakaCore_src_getCachingHostAllocator +#define HeterogeneousCore_AlpakaCore_src_getCachingHostAllocator + +#include +#include + +#include "CachingHostAllocator.h" +#include "getCachingDeviceAllocator.h" + +namespace cms::alpakatools::allocator { + inline CachingHostAllocator& getCachingHostAllocator() { + if (debug) { + std::cout << "CachingHostAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + std::cout << " " << std::setw(9) << binSize << " B\n"; + } + } + std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + } + + // the public interface is thread safe + static CachingHostAllocator allocator{binGrowth, minBin, maxBin, minCachedBytes(), debug}; + return allocator; + } +} // namespace cms::alpakatools::allocator + +#endif diff --git a/src/alpaka/AlpakaCore/host_unique_ptr.h b/src/alpaka/AlpakaCore/host_unique_ptr.h new file mode 100644 index 000000000..ec7030dfc --- /dev/null +++ b/src/alpaka/AlpakaCore/host_unique_ptr.h @@ -0,0 +1,67 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_interface_host_unique_ptr_h +#define HeterogeneousCore_AlpakaUtilities_interface_host_unique_ptr_h + +#include +#include + +#include "AlpakaCore/getCachingHostAllocator.h" + +namespace cms { + namespace alpakatools { + namespace host { + namespace impl { + template + class HostDeleter { + public: + HostDeleter(alpaka_common::AlpakaHostBuf buffer) : buf{std::move(buffer)} {} + + void operator()(void* d_ptr) { + if constexpr (allocator::policy == allocator::Policy::Caching) { + if (d_ptr) { + allocator::getCachingHostAllocator().HostFree(buf); + } + } + } + + private: + alpaka_common::AlpakaHostBuf buf; + }; + } // namespace impl + + template + using unique_ptr = std::unique_ptr< + TData, + impl::HostDeleter>>; + } // namespace host + + inline constexpr size_t maxAllocationSize = + allocator::CachingDeviceAllocator::IntPow(allocator::binGrowth, allocator::maxBin); + + // Allocate pinned host memory + template + typename host::unique_ptr make_host_unique(const alpaka_common::Extent& extent) { + if constexpr (allocator::policy == allocator::Policy::Caching) { + const alpaka_common::Extent nbytes = alpakatools::nbytesFromExtent(extent); + if (nbytes > maxAllocationSize) { + throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) + + " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); + } + auto buf = allocator::getCachingHostAllocator().HostAllocate(nbytes); + void* d_ptr = alpaka::getPtrNative(buf); + return + typename host::unique_ptr{reinterpret_cast(d_ptr), host::impl::HostDeleter{buf}}; + } else { + auto buf = allocHostBuf(extent); +#if CUDA_VERSION >= 11020 + if constexpr (allocator::policy == allocator::Policy::Asynchronous) { + alpaka::prepareForAsyncCopy(buf); + } +#endif + TData* d_ptr = alpaka::getPtrNative(buf); + return typename host::unique_ptr{d_ptr, host::impl::HostDeleter{buf}}; + } + } + } // namespace alpakatools +} // namespace cms + +#endif diff --git a/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h b/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h index 3012f4399..c8960b003 100644 --- a/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h @@ -1,7 +1,7 @@ #ifndef AlpakaDataFormats_BeamSpot_interface_BeamSpotAlpaka_h #define AlpakaDataFormats_BeamSpot_interface_BeamSpotAlpaka_h -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" #include "DataFormats/BeamSpotPOD.h" #include @@ -13,19 +13,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { BeamSpotAlpaka() = default; BeamSpotAlpaka(BeamSpotPOD const* data, Queue& queue) - : data_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)} { + : data_d{::cms::alpakatools::make_device_unique(1u)} { auto data_h{cms::alpakatools::createHostView(data, 1u)}; + auto data_d_view{cms::alpakatools::createDeviceView(alpaka::getDev(queue), data_d.get(), 1u)}; - alpaka::memcpy(queue, data_d, data_h, 1u); - // alpaka::wait(queue); + alpaka::memcpy(queue, data_d_view, data_h, 1u); } - //TODO ANTONIO - - const BeamSpotPOD* data() const { return alpaka::getPtrNative(data_d); } + const BeamSpotPOD* data() const { return data_d.get(); } private: - AlpakaDeviceBuf data_d; + ::cms::alpakatools::device::unique_ptr data_d; }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaDataFormats/PixelTrackAlpaka.h b/src/alpaka/AlpakaDataFormats/PixelTrackAlpaka.h index c6e0f36f0..64ff4e05c 100644 --- a/src/alpaka/AlpakaDataFormats/PixelTrackAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/PixelTrackAlpaka.h @@ -1,10 +1,12 @@ -#ifndef CUDADataFormatsTrackTrackHeterogeneous_H -#define CUDADataFormatsTrackTrackHeterogeneous_H +#ifndef AlpakaDataFormatsTrackTrackHeterogeneous_H +#define AlpakaDataFormatsTrackTrackHeterogeneous_H #include "AlpakaCore/HistoContainer.h" #include "AlpakaCore/alpakaCommon.h" #include "AlpakaDataFormats/TrajectoryStateSoA.h" +#include "AlpakaCore/device_unique_ptr.h" + namespace trackQuality { enum Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; } @@ -69,8 +71,8 @@ namespace pixelTrack { } // namespace pixelTrack namespace ALPAKA_ACCELERATOR_NAMESPACE { - using PixelTrackAlpaka = AlpakaDeviceBuf; - using PixelTrackHost = AlpakaHostBuf; + using PixelTrackAlpaka = cms::alpakatools::device::unique_ptr; + using PixelTrackHost = cms::alpakatools::host::unique_ptr; // NB: ANOTHER OPTION IS TO CREATE A HeterogeneousSoA class, // with a AlpakaDeviceBuf as a data member diff --git a/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h b/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h index 9c13818ca..97c14a3e9 100644 --- a/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h @@ -1,18 +1,18 @@ -#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h -#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h +#ifndef AlpakaDataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h +#define AlpakaDataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelClustersAlpaka { public: SiPixelClustersAlpaka() = default; - explicit SiPixelClustersAlpaka(Device const &device, size_t maxClusters) - : moduleStart_d{cms::alpakatools::allocDeviceBuf(device, maxClusters + 1)}, - clusInModule_d{cms::alpakatools::allocDeviceBuf(device, maxClusters)}, - moduleId_d{cms::alpakatools::allocDeviceBuf(device, maxClusters)}, - clusModuleStart_d{cms::alpakatools::allocDeviceBuf(device, maxClusters + 1)} {} + explicit SiPixelClustersAlpaka(size_t maxClusters) + : moduleStart_d{cms::alpakatools::make_device_unique(maxClusters + 1)}, + clusInModule_d{cms::alpakatools::make_device_unique(maxClusters)}, + moduleId_d{cms::alpakatools::make_device_unique(maxClusters)}, + clusModuleStart_d{cms::alpakatools::make_device_unique(maxClusters + 1)} {} ~SiPixelClustersAlpaka() = default; SiPixelClustersAlpaka(const SiPixelClustersAlpaka &) = delete; @@ -24,20 +24,20 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t nClusters() const { return nClusters_h; } - uint32_t *moduleStart() { return alpaka::getPtrNative(moduleStart_d); } - uint32_t *clusInModule() { return alpaka::getPtrNative(clusInModule_d); } - uint32_t *moduleId() { return alpaka::getPtrNative(moduleId_d); } - uint32_t *clusModuleStart() { return alpaka::getPtrNative(clusModuleStart_d); } + uint32_t *moduleStart() { return moduleStart_d.get(); } + uint32_t *clusInModule() { return clusInModule_d.get(); } + uint32_t *moduleId() { return moduleId_d.get(); } + uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } - uint32_t const *moduleStart() const { return alpaka::getPtrNative(moduleStart_d); } - uint32_t const *clusInModule() const { return alpaka::getPtrNative(clusInModule_d); } - uint32_t const *moduleId() const { return alpaka::getPtrNative(moduleId_d); } - uint32_t const *clusModuleStart() const { return alpaka::getPtrNative(clusModuleStart_d); } + uint32_t const *moduleStart() const { return moduleStart_d.get(); } + uint32_t const *clusInModule() const { return clusInModule_d.get(); } + uint32_t const *moduleId() const { return moduleId_d.get(); } + uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } - uint32_t const *c_moduleStart() const { return alpaka::getPtrNative(moduleStart_d); } - uint32_t const *c_clusInModule() const { return alpaka::getPtrNative(clusInModule_d); } - uint32_t const *c_moduleId() const { return alpaka::getPtrNative(moduleId_d); } - uint32_t const *c_clusModuleStart() const { return alpaka::getPtrNative(clusModuleStart_d); } + uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } + uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } + uint32_t const *c_moduleId() const { return moduleId_d.get(); } + uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } class DeviceConstView { public: @@ -61,12 +61,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } private: - AlpakaDeviceBuf moduleStart_d; // index of the first pixel of each module - AlpakaDeviceBuf clusInModule_d; // number of clusters found in each module - AlpakaDeviceBuf moduleId_d; // module id of each module + cms::alpakatools::device::unique_ptr moduleStart_d; // index of the first pixel of each module + cms::alpakatools::device::unique_ptr clusInModule_d; // number of clusters found in each module + cms::alpakatools::device::unique_ptr moduleId_d; // module id of each module // originally from rechits - AlpakaDeviceBuf clusModuleStart_d; // index of the first cluster of each module + cms::alpakatools::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module uint32_t nClusters_h = 0; }; diff --git a/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h b/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h index bba2cd2a5..fa1b4da31 100644 --- a/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h @@ -1,27 +1,33 @@ -#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h -#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h +#ifndef AlpakaDataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h +#define AlpakaDataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h #include "DataFormats/PixelErrors.h" #include "AlpakaCore/SimpleVector.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" +#include "AlpakaCore/host_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelDigiErrorsAlpaka { public: SiPixelDigiErrorsAlpaka() = default; - explicit SiPixelDigiErrorsAlpaka(Device const& device, size_t maxFedWords, PixelFormatterErrors errors, Queue& queue) - : data_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, - error_d{cms::alpakatools::allocDeviceBuf<::cms::alpakatools::SimpleVector>(device, 1u)}, - error_h{::cms::alpakatools::allocHostBuf<::cms::alpakatools::SimpleVector>(1u)}, + explicit SiPixelDigiErrorsAlpaka(size_t maxFedWords, PixelFormatterErrors errors, Queue& queue) + : data_d{cms::alpakatools::make_device_unique(maxFedWords)}, + error_d{cms::alpakatools::make_device_unique>(1u)}, + error_h{cms::alpakatools::make_host_unique>(1u)}, formatterErrors_h{std::move(errors)} { - auto perror_h = alpaka::getPtrNative(error_h); - perror_h->construct(maxFedWords, alpaka::getPtrNative(data_d)); + auto perror_h = error_h.get(); + perror_h->construct(maxFedWords, data_d.get()); ALPAKA_ASSERT_OFFLOAD(perror_h->empty()); ALPAKA_ASSERT_OFFLOAD(perror_h->capacity() == static_cast(maxFedWords)); - alpaka::memcpy(queue, error_d, error_h, 1u); + // TO DO: nothing really async in here for now... Pass the queue in constructor argument instead, and don't wait anymore! + auto error_h_view = + cms::alpakatools::createHostView>(error_h.get(), 1u); + auto error_d_view = cms::alpakatools::createDeviceView>( + alpaka::getDev(queue), error_d.get(), 1u); + alpaka::memcpy(queue, error_d_view, error_h_view, 1u); } ~SiPixelDigiErrorsAlpaka() = default; @@ -32,9 +38,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } - ::cms::alpakatools::SimpleVector* error() { return alpaka::getPtrNative(error_d); } - ::cms::alpakatools::SimpleVector const* error() const { return alpaka::getPtrNative(error_d); } - ::cms::alpakatools::SimpleVector const* c_error() const { return alpaka::getPtrNative(error_d); } + cms::alpakatools::SimpleVector* error() { return error_d.get(); } + cms::alpakatools::SimpleVector const* error() const { return error_d.get(); } + cms::alpakatools::SimpleVector const* c_error() const { return error_d.get(); } #ifdef TODO using HostDataError = @@ -45,9 +51,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif private: - AlpakaDeviceBuf data_d; - AlpakaDeviceBuf<::cms::alpakatools::SimpleVector> error_d; - AlpakaHostBuf<::cms::alpakatools::SimpleVector> error_h; + cms::alpakatools::device::unique_ptr data_d; + cms::alpakatools::device::unique_ptr> error_d; + cms::alpakatools::host::unique_ptr> error_h; PixelFormatterErrors formatterErrors_h; }; diff --git a/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h b/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h index bc8c2192c..8450885f0 100644 --- a/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h @@ -1,21 +1,23 @@ -#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h -#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h +#ifndef AlpakaDataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h +#define AlpakaDataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" +#include "AlpakaCore/host_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelDigisAlpaka { public: SiPixelDigisAlpaka() = default; - explicit SiPixelDigisAlpaka(Device const &device, size_t maxFedWords) - : xx_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, - yy_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, - adc_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, - moduleInd_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, - clus_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, - pdigi_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, - rawIdArr_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)} {} + explicit SiPixelDigisAlpaka(size_t maxFedWords) + : maxFedWords_{maxFedWords}, + xx_d{cms::alpakatools::make_device_unique(maxFedWords)}, + yy_d{cms::alpakatools::make_device_unique(maxFedWords)}, + adc_d{cms::alpakatools::make_device_unique(maxFedWords)}, + moduleInd_d{cms::alpakatools::make_device_unique(maxFedWords)}, + clus_d{cms::alpakatools::make_device_unique(maxFedWords)}, + pdigi_d{cms::alpakatools::make_device_unique(maxFedWords)}, + rawIdArr_d{cms::alpakatools::make_device_unique(maxFedWords)} {} ~SiPixelDigisAlpaka() = default; SiPixelDigisAlpaka(const SiPixelDigisAlpaka &) = delete; @@ -31,34 +33,36 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t nModules() const { return nModules_h; } uint32_t nDigis() const { return nDigis_h; } - uint16_t *xx() { return alpaka::getPtrNative(xx_d); } - uint16_t *yy() { return alpaka::getPtrNative(yy_d); } - uint16_t *adc() { return alpaka::getPtrNative(adc_d); } - uint16_t *moduleInd() { return alpaka::getPtrNative(moduleInd_d); } - int32_t *clus() { return alpaka::getPtrNative(clus_d); } - uint32_t *pdigi() { return alpaka::getPtrNative(pdigi_d); } - uint32_t *rawIdArr() { return alpaka::getPtrNative(rawIdArr_d); } - - uint16_t const *xx() const { return alpaka::getPtrNative(xx_d); } - uint16_t const *yy() const { return alpaka::getPtrNative(yy_d); } - uint16_t const *adc() const { return alpaka::getPtrNative(adc_d); } - uint16_t const *moduleInd() const { return alpaka::getPtrNative(moduleInd_d); } - int32_t const *clus() const { return alpaka::getPtrNative(clus_d); } - uint32_t const *pdigi() const { return alpaka::getPtrNative(pdigi_d); } - uint32_t const *rawIdArr() const { return alpaka::getPtrNative(rawIdArr_d); } - - uint16_t const *c_xx() const { return alpaka::getPtrNative(xx_d); } - uint16_t const *c_yy() const { return alpaka::getPtrNative(yy_d); } - uint16_t const *c_adc() const { return alpaka::getPtrNative(adc_d); } - uint16_t const *c_moduleInd() const { return alpaka::getPtrNative(moduleInd_d); } - int32_t const *c_clus() const { return alpaka::getPtrNative(clus_d); } - uint32_t const *c_pdigi() const { return alpaka::getPtrNative(pdigi_d); } - uint32_t const *c_rawIdArr() const { return alpaka::getPtrNative(rawIdArr_d); } + uint16_t *xx() { return xx_d.get(); } + uint16_t *yy() { return yy_d.get(); } + uint16_t *adc() { return adc_d.get(); } + uint16_t *moduleInd() { return moduleInd_d.get(); } + int32_t *clus() { return clus_d.get(); } + uint32_t *pdigi() { return pdigi_d.get(); } + uint32_t *rawIdArr() { return rawIdArr_d.get(); } + + uint16_t const *xx() const { return xx_d.get(); } + uint16_t const *yy() const { return yy_d.get(); } + uint16_t const *adc() const { return adc_d.get(); } + uint16_t const *moduleInd() const { return moduleInd_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *pdigi() const { return pdigi_d.get(); } + uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } + + uint16_t const *c_xx() const { return xx_d.get(); } + uint16_t const *c_yy() const { return yy_d.get(); } + uint16_t const *c_adc() const { return adc_d.get(); } + uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_pdigi() const { return pdigi_d.get(); } + uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } // TO DO: nothing async in here for now... Pass the queue as argument instead, and don't wait anymore! auto adcToHostAsync(Queue &queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nDigis()); - alpaka::memcpy(queue, ret, adc_d, nDigis()); + auto ret = cms::alpakatools::make_host_unique(nDigis()); + auto ret_view = cms::alpakatools::createHostView(ret.get(), nDigis()); + auto adc_d_view = cms::alpakatools::createDeviceView(alpaka::getDev(queue), adc_d.get(), maxFedWords_); + alpaka::memcpy(queue, ret_view, adc_d_view, nDigis()); return ret; } @@ -90,17 +94,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const DeviceConstView view() const { return DeviceConstView{c_xx(), c_yy(), c_adc(), c_moduleInd(), c_clus()}; } private: + size_t maxFedWords_; // These are consumed by downstream device code - AlpakaDeviceBuf xx_d; // local coordinates of each pixel - AlpakaDeviceBuf yy_d; // - AlpakaDeviceBuf adc_d; // ADC of each pixel - AlpakaDeviceBuf moduleInd_d; // module id of each pixel - AlpakaDeviceBuf clus_d; // cluster id of each pixel + cms::alpakatools::device::unique_ptr xx_d; // local coordinates of each pixel + cms::alpakatools::device::unique_ptr yy_d; // + cms::alpakatools::device::unique_ptr adc_d; // ADC of each pixel + cms::alpakatools::device::unique_ptr moduleInd_d; // module id of each pixel + cms::alpakatools::device::unique_ptr clus_d; // cluster id of each pixel // These are for CPU output; should we (eventually) place them to a // separate product? - AlpakaDeviceBuf pdigi_d; - AlpakaDeviceBuf rawIdArr_d; + cms::alpakatools::device::unique_ptr pdigi_d; + cms::alpakatools::device::unique_ptr rawIdArr_d; uint32_t nModules_h = 0; uint32_t nDigis_h = 0; diff --git a/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h b/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h index c9b372c96..d5d2f89f4 100644 --- a/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h @@ -1,8 +1,9 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h +#ifndef AlpakaDataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h +#define AlpakaDataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h #include "AlpakaDataFormats/TrackingRecHit2DSOAView.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" +#include "AlpakaCore/host_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -20,25 +21,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // NON-OWNING DEVICE POINTERS: m_hitsModuleStart(hitsModuleStart), // OWNING DEVICE POINTERS: - m_xl{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_yl{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_xerr{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_yerr{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_xg{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_yg{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_zg{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_rg{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_iphi{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_charge{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_xsize{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_ysize{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_detInd{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_averageGeometry{ - cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, - m_hitsLayerStart{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, - m_hist{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, - // SoA view: - m_view{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, + m_xl{cms::alpakatools::make_device_unique(nHits)}, + m_yl{cms::alpakatools::make_device_unique(nHits)}, + m_xerr{cms::alpakatools::make_device_unique(nHits)}, + m_yerr{cms::alpakatools::make_device_unique(nHits)}, + m_xg{cms::alpakatools::make_device_unique(nHits)}, + m_yg{cms::alpakatools::make_device_unique(nHits)}, + m_zg{cms::alpakatools::make_device_unique(nHits)}, + m_rg{cms::alpakatools::make_device_unique(nHits)}, + m_iphi{cms::alpakatools::make_device_unique(nHits)}, + m_charge{cms::alpakatools::make_device_unique(nHits)}, + m_xsize{cms::alpakatools::make_device_unique(nHits)}, + m_ysize{cms::alpakatools::make_device_unique(nHits)}, + m_detInd{cms::alpakatools::make_device_unique(nHits)}, + m_averageGeometry{cms::alpakatools::make_device_unique(1u)}, + m_hitsLayerStart{cms::alpakatools::make_device_unique(nHits)}, + m_hist{cms::alpakatools::make_device_unique(1u)}, + // SOA view: + m_view{cms::alpakatools::make_device_unique(1u)}, m_view_h{::cms::alpakatools::allocHostBuf(1u)} { // the hits are actually accessed in order only in building // if ordering is relevant they may have to be stored phi-ordered by layer or so @@ -54,24 +54,30 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // Raw pointer to data already owned in the eventSetup by PixelCPEFast object: view.m_cpeParams = cpeParams; // Raw pointers to data owned here in TrackingRecHit2DAlpaka object: - view.m_xl = alpaka::getPtrNative(m_xl); - view.m_yl = alpaka::getPtrNative(m_yl); - view.m_xerr = alpaka::getPtrNative(m_xerr); - view.m_yerr = alpaka::getPtrNative(m_yerr); - view.m_xg = alpaka::getPtrNative(m_xg); - view.m_yg = alpaka::getPtrNative(m_yg); - view.m_zg = alpaka::getPtrNative(m_zg); - view.m_rg = alpaka::getPtrNative(m_rg); - view.m_iphi = alpaka::getPtrNative(m_iphi); - view.m_charge = alpaka::getPtrNative(m_charge); - view.m_xsize = alpaka::getPtrNative(m_xsize); - view.m_ysize = alpaka::getPtrNative(m_ysize); - view.m_detInd = alpaka::getPtrNative(m_detInd); - view.m_averageGeometry = alpaka::getPtrNative(m_averageGeometry); - view.m_hitsLayerStart = alpaka::getPtrNative(m_hitsLayerStart); - view.m_hist = alpaka::getPtrNative(m_hist); - // Copy the SoA view to the device - alpaka::memcpy(queue, m_view, m_view_h, 1u); +#define SET(name) view.name = name.get() + SET(m_xl); + SET(m_yl); + SET(m_xerr); + SET(m_yerr); + SET(m_xg); + SET(m_yg); + SET(m_zg); + SET(m_rg); + SET(m_iphi); + SET(m_charge); + SET(m_xsize); + SET(m_ysize); + SET(m_detInd); + SET(m_averageGeometry); + SET(m_hitsLayerStart); + SET(m_hist); +#undef SET + + // SoA view on device: + auto view_h{cms::alpakatools::createHostView(&view, 1u)}; + auto view_m_view{ + cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_view.get(), 1u)}; + alpaka::memcpy(queue, view_m_view, m_view_h, 1u); } ~TrackingRecHit2DAlpaka() = default; @@ -81,71 +87,93 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TrackingRecHit2DAlpaka(TrackingRecHit2DAlpaka&&) = default; TrackingRecHit2DAlpaka& operator=(TrackingRecHit2DAlpaka&&) = default; - TrackingRecHit2DSOAView* view() { return alpaka::getPtrNative(m_view); } - TrackingRecHit2DSOAView const* view() const { return alpaka::getPtrNative(m_view); } + TrackingRecHit2DSOAView* view() { return m_view.get(); } + TrackingRecHit2DSOAView const* view() const { return m_view.get(); } auto nHits() const { return m_nHits; } auto hitsModuleStart() const { return m_hitsModuleStart; } - auto hitsLayerStart() { return alpaka::getPtrNative(m_hitsLayerStart); } - auto const* c_hitsLayerStart() const { return alpaka::getPtrNative(m_hitsLayerStart); } - auto phiBinner() { return alpaka::getPtrNative(m_hist); } - auto iphi() { return alpaka::getPtrNative(m_iphi); } - auto const* c_iphi() const { return alpaka::getPtrNative(m_iphi); } + auto hitsLayerStart() { return m_hitsLayerStart.get(); } + auto const* c_hitsLayerStart() const { return m_hitsLayerStart.get(); } + auto phiBinner() { return m_hist.get(); } + auto iphi() { return m_iphi.get(); } + auto const* c_iphi() const { return m_iphi.get(); } auto xlToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_xl, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_xl = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_xl.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_xl, nHits()); return ret; } auto ylToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_yl, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_yl = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_yl.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_yl, nHits()); return ret; } auto xerrToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_xerr, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_xerr = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_xerr.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_xerr, nHits()); return ret; } auto yerrToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_yerr, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_yerr = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_yerr.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_yerr, nHits()); return ret; } auto xgToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_xg, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_xg = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_xg.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_xg, nHits()); return ret; } auto ygToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_yg, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_yg = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_yg.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_yg, nHits()); return ret; } auto zgToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_zg, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_zg = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_zg.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_zg, nHits()); return ret; } auto rgToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_rg, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_rg = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_rg.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_rg, nHits()); return ret; } auto chargeToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_charge, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_charge = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_charge.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_charge, nHits()); return ret; } auto xsizeToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_xsize, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_xsize = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_xsize.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_xsize, nHits()); return ret; } auto ysizeToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::allocHostBuf(nHits()); - alpaka::memcpy(queue, ret, m_ysize, nHits()); + auto ret = cms::alpakatools::make_host_unique(nHits()); + auto view_ret = cms::alpakatools::createHostView(ret.get(), nHits()); + auto view_m_ysize = cms::alpakatools::createDeviceView(alpaka::getDev(queue), m_ysize.get(), nHits()); + alpaka::memcpy(queue, view_ret, view_m_ysize, nHits()); return ret; } #ifdef TODO @@ -153,17 +181,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cms::cuda::host::unique_ptr detIndexToHostAsync(cudaStream_t stream) const; cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; #endif - auto const* xl() const { return alpaka::getPtrNative(m_xl); } - auto const* yl() const { return alpaka::getPtrNative(m_yl); } - auto const* xerr() const { return alpaka::getPtrNative(m_xerr); } - auto const* yerr() const { return alpaka::getPtrNative(m_yerr); } - auto const* xg() const { return alpaka::getPtrNative(m_xg); } - auto const* yg() const { return alpaka::getPtrNative(m_yg); } - auto const* zg() const { return alpaka::getPtrNative(m_zg); } - auto const* rg() const { return alpaka::getPtrNative(m_rg); } - auto const* charge() const { return alpaka::getPtrNative(m_charge); } - auto const* xsize() const { return alpaka::getPtrNative(m_xsize); } - auto const* ysize() const { return alpaka::getPtrNative(m_ysize); } + auto const* xl() const { return m_xl.get(); } + auto const* yl() const { return m_yl.get(); } + auto const* xerr() const { return m_xerr.get(); } + auto const* yerr() const { return m_yerr.get(); } + auto const* xg() const { return m_xg.get(); } + auto const* yg() const { return m_yg.get(); } + auto const* zg() const { return m_zg.get(); } + auto const* rg() const { return m_rg.get(); } + auto const* charge() const { return m_charge.get(); } + auto const* xsize() const { return m_xsize.get(); } + auto const* ysize() const { return m_ysize.get(); } private: uint32_t m_nHits; @@ -174,35 +202,34 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // OWNING DEVICE POINTERS // local coord - AlpakaDeviceBuf m_xl; - AlpakaDeviceBuf m_yl; - AlpakaDeviceBuf m_xerr; - AlpakaDeviceBuf m_yerr; + ::cms::alpakatools::device::unique_ptr m_xl; + ::cms::alpakatools::device::unique_ptr m_yl; + ::cms::alpakatools::device::unique_ptr m_xerr; + ::cms::alpakatools::device::unique_ptr m_yerr; // global coord - AlpakaDeviceBuf m_xg; - AlpakaDeviceBuf m_yg; - AlpakaDeviceBuf m_zg; - AlpakaDeviceBuf m_rg; - AlpakaDeviceBuf m_iphi; + ::cms::alpakatools::device::unique_ptr m_xg; + ::cms::alpakatools::device::unique_ptr m_yg; + ::cms::alpakatools::device::unique_ptr m_zg; + ::cms::alpakatools::device::unique_ptr m_rg; + ::cms::alpakatools::device::unique_ptr m_iphi; // cluster properties - AlpakaDeviceBuf m_charge; - AlpakaDeviceBuf m_xsize; - AlpakaDeviceBuf m_ysize; - AlpakaDeviceBuf m_detInd; + ::cms::alpakatools::device::unique_ptr m_charge; + ::cms::alpakatools::device::unique_ptr m_xsize; + ::cms::alpakatools::device::unique_ptr m_ysize; + ::cms::alpakatools::device::unique_ptr m_detInd; - AlpakaDeviceBuf m_averageGeometry; + ::cms::alpakatools::device::unique_ptr m_averageGeometry; // needed as kernel params... - AlpakaDeviceBuf m_hitsLayerStart; - AlpakaDeviceBuf m_hist; + ::cms::alpakatools::device::unique_ptr m_hitsLayerStart; + ::cms::alpakatools::device::unique_ptr m_hist; // This is a SoA view which itself gathers non-owning pointers to the data owned above (in TrackingRecHit2DAlpaka instance). // This is used to access and modify data on GPU in a SoA format (TrackingRecHit2DSOAView), // while the data itself is owned here in the TrackingRecHit2DAlpaka instance. - AlpakaDeviceBuf m_view; - // Keep a host copy of the device view alive during the asynchronous copy + ::cms::alpakatools::device::unique_ptr m_view; AlpakaHostBuf m_view_h; }; diff --git a/src/alpaka/AlpakaDataFormats/ZVertexAlpaka.h b/src/alpaka/AlpakaDataFormats/ZVertexAlpaka.h index d9cb43762..22c17a9f2 100644 --- a/src/alpaka/AlpakaDataFormats/ZVertexAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/ZVertexAlpaka.h @@ -1,13 +1,13 @@ -#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H -#define CUDADataFormatsVertexZVertexHeterogeneous_H +#ifndef AlpakaDataFormatsVertexZVertexHeterogeneous_H +#define AlpakaDataFormatsVertexZVertexHeterogeneous_H #include "AlpakaDataFormats/ZVertexSoA.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { - using ZVertexAlpaka = AlpakaDeviceBuf; - using ZVertexHost = AlpakaHostBuf; + using ZVertexAlpaka = ::cms::alpakatools::device::unique_ptr; + using ZVertexHost = ::cms::alpakatools::host::unique_ptr; // NB: ANOTHER OPTION IS TO CREATE A HeterogeneousSoA class, // with a AlpakaDeviceBuf as a data member diff --git a/src/alpaka/CondFormats/PixelCPEFast.h b/src/alpaka/CondFormats/PixelCPEFast.h index 4b910f1b1..4cf7250b3 100644 --- a/src/alpaka/CondFormats/PixelCPEFast.h +++ b/src/alpaka/CondFormats/PixelCPEFast.h @@ -8,16 +8,17 @@ #include "AlpakaCore/alpakaCommon.h" #include "AlpakaCore/ESProduct.h" #include "AlpakaCore/alpakaMemoryHelper.h" +#include "AlpakaCore/device_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { class PixelCPEFast { public: - PixelCPEFast(AlpakaDeviceBuf commonParams, - AlpakaDeviceBuf detParams, - AlpakaDeviceBuf layerGeometry, - AlpakaDeviceBuf averageGeometry, - AlpakaDeviceBuf params) + PixelCPEFast(cms::alpakatools::device::unique_ptr commonParams, + cms::alpakatools::device::unique_ptr detParams, + cms::alpakatools::device::unique_ptr layerGeometry, + cms::alpakatools::device::unique_ptr averageGeometry, + cms::alpakatools::device::unique_ptr params) : m_commonParams(std::move(commonParams)), m_detParams(std::move(detParams)), m_layerGeometry(std::move(layerGeometry)), @@ -26,80 +27,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ~PixelCPEFast() = default; - pixelCPEforGPU::ParamsOnGPU const *params() const { return alpaka::getPtrNative(m_params); } - -#ifdef TODO - template - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct getGPUData() const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct gpuData; - return gpuData; - } - - // The return value can only be used safely in kernels launched on - // the same cudaStream, or after cudaStreamSynchronize. - const pixelCPEforGPU::ParamsOnGPU getGPUProductAsync(Queue queue) const { - auto gpuData = getGPUData(); - - auto const& data = gpuData_.dataForDeviceAsync(queue, [this](GPUData &data, Queue queue) { - using namespace ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE; - // and now copy to device... - auto cParams = allocDeviceBuf(1u); - data.h_paramsOnGPU.m_commonParams = alpaka::getPtrNative(cParams); - - uint32_t size_detParams = alpaka::extent::getExtentVec(this->m_detParams)[0u]; - auto detParams = allocDeviceBuf(size_detParams); - data.h_paramsOnGPU.m_detParams = alpaka::getPtrNative(detParams); - - auto avgGeom = allocDeviceBuf(1u); - data.h_paramsOnGPU.m_averageGeometry = alpaka::getPtrNative(avgGeom); - - auto layerGeom = allocDeviceBuf(1u); - data.h_paramsOnGPU.m_layerGeometry = alpaka::getPtrNative(layerGeom); - - auto parGPU = allocDeviceBuf(1u); - data.d_paramsOnGPU = alpaka::getPtrNative(parGPU); - - alpaka::prepareForAsyncCopy(cParams); - alpaka::prepareForAsyncCopy(detParams); - alpaka::prepareForAsyncCopy(avgGeom); - alpaka::prepareForAsyncCopy(layerGeom); - alpaka::prepareForAsyncCopy(parGPU); - - alpaka::memcpy(queue, parGPU, data.h_paramsOnGPU, 1u); - alpaka::memcpy(queue, data.h_paramsOnGPU.m_commonParams, this->m_commonParams, 1u); - alpaka::memcpy(queue, data.h_paramsOnGPU.m_averageGeometry, this->m_averageGeometry, 1u); - alpaka::memcpy(queue, data.h_paramsOnGPU.m_layerGeometry, this->m_layerGeometry, 1u); - alpaka::memcpy(queue, data.h_paramsOnGPU.m_detParams, alpaka::getPtrNative(this->m_detParams), size_detParams); - }); -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - return *data.d_paramsOnGPU; -#else - return data.h_paramsOnGPU; -#endif - } -#endif // TODO + pixelCPEforGPU::ParamsOnGPU const* params() const { return m_params.get(); } private: - AlpakaDeviceBuf m_commonParams; - AlpakaDeviceBuf m_detParams; - AlpakaDeviceBuf m_layerGeometry; - AlpakaDeviceBuf m_averageGeometry; - AlpakaDeviceBuf m_params; - -#ifdef TODO - struct GPUData { - // not needed if not used on CPU... - pixelCPEforGPU::ParamsOnGPU h_paramsOnGPU; - pixelCPEforGPU::ParamsOnGPU *d_paramsOnGPU = nullptr; // copy of the above on the Device - ~GPUData() { - if (d_paramsOnGPU != nullptr) { - //cudafree - } - } - }; - - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct gpuData_; -#endif // TODO + cms::alpakatools::device::unique_ptr m_commonParams; + cms::alpakatools::device::unique_ptr m_detParams; + cms::alpakatools::device::unique_ptr m_layerGeometry; + cms::alpakatools::device::unique_ptr m_averageGeometry; + cms::alpakatools::device::unique_ptr m_params; }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/CondFormats/SiPixelFedCablingMapGPUWrapper.h b/src/alpaka/CondFormats/SiPixelFedCablingMapGPUWrapper.h index 4033ef5a9..4a840cd46 100644 --- a/src/alpaka/CondFormats/SiPixelFedCablingMapGPUWrapper.h +++ b/src/alpaka/CondFormats/SiPixelFedCablingMapGPUWrapper.h @@ -3,24 +3,24 @@ #include "CondFormats/SiPixelFedCablingMapGPU.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelFedCablingMapGPUWrapper { public: - using CablingMapDeviceBuf = AlpakaDeviceBuf; + using CablingMapDevicePtr = cms::alpakatools::device::unique_ptr; - explicit SiPixelFedCablingMapGPUWrapper(CablingMapDeviceBuf cablingMap, bool quality) + explicit SiPixelFedCablingMapGPUWrapper(CablingMapDevicePtr cablingMap, bool quality) : cablingMapDevice_{std::move(cablingMap)}, hasQuality_{quality} {} ~SiPixelFedCablingMapGPUWrapper() = default; bool hasQuality() const { return hasQuality_; } - const SiPixelFedCablingMapGPU* cablingMap() const { return alpaka::getPtrNative(cablingMapDevice_); } + const SiPixelFedCablingMapGPU* cablingMap() const { return cablingMapDevice_.get(); } private: - CablingMapDeviceBuf cablingMapDevice_; + CablingMapDevicePtr cablingMapDevice_; bool hasQuality_; }; diff --git a/src/alpaka/CondFormats/SiPixelGainForHLTonGPU.h b/src/alpaka/CondFormats/SiPixelGainForHLTonGPU.h index afd3f4482..bf205185f 100644 --- a/src/alpaka/CondFormats/SiPixelGainForHLTonGPU.h +++ b/src/alpaka/CondFormats/SiPixelGainForHLTonGPU.h @@ -1,7 +1,7 @@ #ifndef CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h #define CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -29,9 +29,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { using Range = std::pair; using RangeAndCols = std::pair; - SiPixelGainForHLTonGPU(AlpakaDeviceBuf ped, - AlpakaDeviceBuf rc, - AlpakaDeviceBuf f) + SiPixelGainForHLTonGPU(cms::alpakatools::device::unique_ptr ped, + cms::alpakatools::device::unique_ptr rc, + cms::alpakatools::device::unique_ptr f) : v_pedestals_(std::move(ped)), rangeAndCols_(std::move(rc)), fields_(std::move(f)){}; ALPAKA_FN_INLINE ALPAKA_FN_ACC static std::pair getPedAndGain(const DecodingStructure* v_pedestals, @@ -72,14 +72,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { return ped * fields->pedPrecision + fields->minPed_; } - ALPAKA_FN_HOST const DecodingStructure* getVpedestals() const { return alpaka::getPtrNative(v_pedestals_); } - ALPAKA_FN_HOST const RangeAndCols* getRangeAndCols() const { return alpaka::getPtrNative(rangeAndCols_); } - ALPAKA_FN_HOST const Fields* getFields() const { return alpaka::getPtrNative(fields_); } + ALPAKA_FN_HOST const DecodingStructure* getVpedestals() const { return v_pedestals_.get(); } + ALPAKA_FN_HOST const RangeAndCols* getRangeAndCols() const { return rangeAndCols_.get(); } + ALPAKA_FN_HOST const Fields* getFields() const { return fields_.get(); } private: - AlpakaDeviceBuf v_pedestals_; - AlpakaDeviceBuf rangeAndCols_; - AlpakaDeviceBuf fields_; + cms::alpakatools::device::unique_ptr v_pedestals_; + cms::alpakatools::device::unique_ptr rangeAndCols_; + cms::alpakatools::device::unique_ptr fields_; }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc index d3fada0b0..e5d7f0f55 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc @@ -1,6 +1,6 @@ #include "BrokenLineFitOnGPU.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -18,14 +18,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { Vec1D::all(numberOfBlocks / 4), Vec1D::all(blockSize)); // Fit internals - auto hitsGPU_ = cms::alpakatools::allocDeviceBuf( - alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hitsGPU_ = cms::alpakatools::make_device_unique(maxNumberOfConcurrentFits_ * + sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = cms::alpakatools::allocDeviceBuf( - alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hits_geGPU_ = cms::alpakatools::make_device_unique(maxNumberOfConcurrentFits_ * + sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = cms::alpakatools::allocDeviceBuf( - alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + auto fast_fit_resultsGPU_ = cms::alpakatools::make_device_unique(maxNumberOfConcurrentFits_ * + sizeof(Rfit::Vector4d) / sizeof(double)); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets @@ -35,9 +35,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tuples_d, tupleMultiplicity_d, hv, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), 3, offset)); @@ -47,9 +47,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, bField_, outputSoa_d, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), 3, offset)); @@ -60,9 +60,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tuples_d, tupleMultiplicity_d, hv, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), 4, offset)); @@ -72,9 +72,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, bField_, outputSoa_d, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), 4, offset)); @@ -86,9 +86,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tuples_d, tupleMultiplicity_d, hv, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), 5, offset)); @@ -98,9 +98,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, bField_, outputSoa_d, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), 5, offset)); alpaka::wait(queue); @@ -112,9 +112,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tuples_d, tupleMultiplicity_d, hv, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), 5, offset)); @@ -124,9 +124,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, bField_, outputSoa_d, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), 5, offset)); alpaka::wait(queue); diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.cc b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.cc index f542cf940..2968ecc93 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.cc @@ -48,23 +48,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const Vec2D blks(numberOfBlocks, 1u); const Vec2D thrs(blockSize, stride); const WorkDiv2D kernelConnectWorkDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(blks, thrs); - alpaka::enqueue(queue, - alpaka::createTaskKernel( - kernelConnectWorkDiv, - kernel_connect(), - alpaka::getPtrNative(device_hitTuple_apc_), - alpaka::getPtrNative(device_hitToTuple_apc_), // needed only to be reset, ready for next kernel - hh.view(), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_), - alpaka::getPtrNative(device_theCellNeighbors_), - alpaka::getPtrNative(device_isOuterHitOfCell_), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_)); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(kernelConnectWorkDiv, + kernel_connect(), + device_hitTuple_apc_.get(), + device_hitToTuple_apc_.get(), // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_.get(), + device_theCellNeighbors_.get(), + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_)); if (nhits > 1 && m_params.earlyFishbone_) { const uint32_t nthTot = 128; @@ -79,9 +79,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::createTaskKernel(fishboneWorkDiv, gpuPixelDoublets::fishbone(), hh.view(), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_), - alpaka::getPtrNative(device_isOuterHitOfCell_), + device_theCells_.get(), + device_nCells_.get(), + device_isOuterHitOfCell_.get(), nhits, false)); alpaka::wait(queue); @@ -95,21 +95,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::createTaskKernel(workDiv1D, kernel_find_ntuplets(), hh.view(), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_), - alpaka::getPtrNative(device_theCellTracks_), + device_theCells_.get(), + device_nCells_.get(), + device_theCellTracks_.get(), tuples_d, - alpaka::getPtrNative(device_hitTuple_apc_), + device_hitTuple_apc_.get(), quality_d, m_params.minHitsPerNtuplet_)); if (m_params.doStats_) { alpaka::enqueue(queue, - alpaka::createTaskKernel(workDiv1D, - kernel_mark_used(), - hh.view(), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_))); + alpaka::createTaskKernel( + workDiv1D, kernel_mark_used(), hh.view(), device_theCells_.get(), device_nCells_.get())); } #ifdef GPU_DEBUG @@ -120,10 +117,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv1D, ::cms::alpakatools::finalizeBulk(), alpaka::getPtrNative(device_hitTuple_apc_), tuples_d)); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workDiv1D, cms::alpakatools::finalizeBulk(), device_hitTuple_apc_.get(), tuples_d)); // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; @@ -132,8 +128,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_earlyDuplicateRemover(), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_), + device_theCells_.get(), + device_nCells_.get(), tuples_d, quality_d)); @@ -142,20 +138,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); alpaka::enqueue(queue, - alpaka::createTaskKernel(workDiv1D, - kernel_countMultiplicity(), - tuples_d, - quality_d, - alpaka::getPtrNative(device_tupleMultiplicity_))); + alpaka::createTaskKernel( + workDiv1D, kernel_countMultiplicity(), tuples_d, quality_d, device_tupleMultiplicity_.get())); - ::cms::alpakatools::launchFinalize(alpaka::getPtrNative(device_tupleMultiplicity_), queue); + cms::alpakatools::launchFinalize(device_tupleMultiplicity_.get(), queue); workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv1D, kernel_fillMultiplicity(), tuples_d, quality_d, alpaka::getPtrNative(device_tupleMultiplicity_))); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workDiv1D, kernel_fillMultiplicity(), tuples_d, quality_d, device_tupleMultiplicity_.get())); if (nhits > 1 && m_params.lateFishbone_) { const uint32_t nthTot = 128; @@ -170,9 +162,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::createTaskKernel(workDiv2D, gpuPixelDoublets::fishbone(), hh.view(), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_), - alpaka::getPtrNative(device_isOuterHitOfCell_), + device_theCells_.get(), + device_nCells_.get(), + device_isOuterHitOfCell_.get(), nhits, true)); alpaka::wait(queue); @@ -186,16 +178,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::createTaskKernel(workDiv1D, kernel_checkOverflows(), tuples_d, - alpaka::getPtrNative(device_tupleMultiplicity_), - alpaka::getPtrNative(device_hitTuple_apc_), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_), - alpaka::getPtrNative(device_theCellNeighbors_), - alpaka::getPtrNative(device_theCellTracks_), - alpaka::getPtrNative(device_isOuterHitOfCell_), + device_tupleMultiplicity_.get(), + device_hitTuple_apc_.get(), + device_theCells_.get(), + device_nCells_.get(), + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + device_isOuterHitOfCell_.get(), nhits, m_params.maxNumberOfDoublets_, - alpaka::getPtrNative(counters_))); + counters_.get())); alpaka::wait(queue); } #ifdef GPU_DEBUG @@ -217,23 +209,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::wait(queue); #endif - ALPAKA_ASSERT_OFFLOAD(alpaka::getPtrNative(device_isOuterHitOfCell_)); + ALPAKA_ASSERT_OFFLOAD(device_isOuterHitOfCell_.get()); { int threadsPerBlock = 128; - // at least one block! + // at least one block!device_nCells_ int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; const WorkDiv1D workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( Vec1D::all(blocks), Vec1D::all(threadsPerBlock)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, gpuPixelDoublets::initDoublets(), - alpaka::getPtrNative(device_isOuterHitOfCell_), + device_isOuterHitOfCell_.get(), nhits, - alpaka::getPtrNative(device_theCellNeighbors_), - alpaka::getPtrNative(device_theCellNeighborsContainer_), - alpaka::getPtrNative(device_theCellTracks_), - alpaka::getPtrNative(device_theCellTracksContainer_))); + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_.get(), + device_theCellTracks_.get(), + device_theCellTracksContainer_.get())); alpaka::wait(queue); } @@ -262,12 +254,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv2D, gpuPixelDoublets::getDoubletsFromHisto(), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_), - alpaka::getPtrNative(device_theCellNeighbors_), - alpaka::getPtrNative(device_theCellTracks_), + device_theCells_.get(), + device_nCells_.get(), + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), hh.view(), - alpaka::getPtrNative(device_isOuterHitOfCell_), + device_isOuterHitOfCell_.get(), nActualPairs, m_params.idealConditions_, m_params.doClusterCut_, @@ -301,12 +293,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - alpaka::enqueue(queue, - alpaka::createTaskKernel(workDiv1D, - kernel_fishboneCleaner(), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_), - quality_d)); + alpaka::enqueue( + queue, + alpaka::createTaskKernel( + workDiv1D, kernel_fishboneCleaner(), device_theCells_.get(), device_nCells_.get(), quality_d)); alpaka::wait(queue); } @@ -314,32 +304,27 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - alpaka::enqueue(queue, - alpaka::createTaskKernel(workDiv1D, - kernel_fastDuplicateRemover(), - alpaka::getPtrNative(device_theCells_), - alpaka::getPtrNative(device_nCells_), - tuples_d, - tracks_d)); + alpaka::enqueue( + queue, + alpaka::createTaskKernel( + workDiv1D, kernel_fastDuplicateRemover(), device_theCells_.get(), device_nCells_.get(), tuples_d, tracks_d)); if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { // fill hit->track "map" numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv1D, kernel_countHitInTracks(), tuples_d, quality_d, alpaka::getPtrNative(device_hitToTuple_))); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workDiv1D, kernel_countHitInTracks(), tuples_d, quality_d, device_hitToTuple_.get())); - ::cms::alpakatools::launchFinalize(alpaka::getPtrNative(device_hitToTuple_), queue); + cms::alpakatools::launchFinalize(device_hitToTuple_.get(), queue); workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv1D, kernel_fillHitInTracks(), tuples_d, quality_d, alpaka::getPtrNative(device_hitToTuple_))); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workDiv1D, kernel_fillHitInTracks(), tuples_d, quality_d, device_hitToTuple_.get())); alpaka::wait(queue); } if (m_params.minHitsPerNtuplet_ < 4) { @@ -347,14 +332,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - alpaka::enqueue(queue, - alpaka::createTaskKernel(workDiv1D, - kernel_tripletCleaner(), - hh.view(), - tuples_d, - tracks_d, - quality_d, - alpaka::getPtrNative(device_hitToTuple_))); + alpaka::enqueue( + queue, + alpaka::createTaskKernel( + workDiv1D, kernel_tripletCleaner(), hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get())); alpaka::wait(queue); } @@ -364,17 +345,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); alpaka::enqueue(queue, - alpaka::createTaskKernel(workDiv1D, - kernel_doStatsForHitInTracks(), - alpaka::getPtrNative(device_hitToTuple_), - alpaka::getPtrNative(counters_))); + alpaka::createTaskKernel( + workDiv1D, kernel_doStatsForHitInTracks(), device_hitToTuple_.get(), counters_.get())); numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - alpaka::enqueue(queue, - alpaka::createTaskKernel( - workDiv1D, kernel_doStatsForTracks(), tuples_d, quality_d, alpaka::getPtrNative(counters_))); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workDiv1D, kernel_doStatsForTracks(), tuples_d, quality_d, counters_.get())); alpaka::wait(queue); } #ifdef GPU_DEBUG @@ -392,18 +371,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tuples_d, tracks_d, quality_d, - alpaka::getPtrNative(device_hitToTuple_), + device_hitToTuple_.get(), 100, iev)); #endif } void CAHitNtupletGeneratorKernels::printCounters(Queue &queue) { - const WorkDiv1D workDiv1D = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(1u), Vec1D::all(1u)); - alpaka::enqueue( - queue, alpaka::createTaskKernel(workDiv1D, kernel_printCounters(), alpaka::getPtrNative(counters_))); - alpaka::wait(queue); + // const WorkDiv1D workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(1u), Vec1D::all(1u)); + // alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_printCounters(), counters_.get())); + // alpaka::wait(queue); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h index b23dac1c5..59fb0db7b 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h @@ -1,6 +1,7 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h +#include "AlpakaCore/device_unique_ptr.h" #include "AlpakaDataFormats/PixelTrackAlpaka.h" #include "GPUCACell.h" @@ -160,43 +161,41 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - counters_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, + counters_{cms::alpakatools::make_device_unique(1u)}, - device_hitToTuple_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, - device_tupleMultiplicity_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, + device_hitToTuple_{cms::alpakatools::make_device_unique(1u)}, + device_tupleMultiplicity_{cms::alpakatools::make_device_unique(1u)}, - device_theCells_{ - cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), params.maxNumberOfDoublets_)}, + device_theCells_{cms::alpakatools::make_device_unique(params.maxNumberOfDoublets_)}, // in principle we can use "nhits" to heuristically dimension the workspace... device_isOuterHitOfCell_{ - cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), std::max(1U, nhits))}, - - device_theCellNeighbors_{ - cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, - device_theCellTracks_{ - cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, - - //cellStorage_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))}, - device_theCellNeighborsContainer_{cms::alpakatools::allocDeviceBuf( - alpaka::getDev(queue), CAConstants::maxNumOfActiveDoublets())}, - device_theCellTracksContainer_{cms::alpakatools::allocDeviceBuf( - alpaka::getDev(queue), CAConstants::maxNumOfActiveDoublets())}, - - //device_storage_{cms::alpakatools::allocDeviceBuf<::ALPAKA_ACCELERATOR_NAMESPACE::cmscuda::AtomicPairCounter::c_type>(alpaka::getDev(queue), 3u)}, - //device_hitTuple_apc_ = (::cms::alpakatools::AtomicPairCounter*)device_storage_.get()}, - //device_hitToTuple_apc_ = (::cms::alpakatools::AtomicPairCounter*)device_storage_.get() + 1; + cms::alpakatools::make_device_unique(std::max(1U, nhits))}, + + device_theCellNeighbors_{cms::alpakatools::make_device_unique(1u)}, + device_theCellTracks_{cms::alpakatools::make_device_unique(1u)}, + + //cellStorage_{cms::alpakatools::allocDeviceBuf(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))}, + device_theCellNeighborsContainer_{ + cms::alpakatools::make_device_unique(CAConstants::maxNumOfActiveDoublets())}, + device_theCellTracksContainer_{ + cms::alpakatools::make_device_unique(CAConstants::maxNumOfActiveDoublets())}, + + //device_storage_{cms::alpakatools::allocDeviceBuf(3u)}, + //device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get()}, + //device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; //device_nCells_ = (uint32_t*)(device_storage_.get() + 2)}, - device_hitTuple_apc_{ - cms::alpakatools::allocDeviceBuf<::cms::alpakatools::AtomicPairCounter>(alpaka::getDev(queue), 1u)}, - device_hitToTuple_apc_{ - cms::alpakatools::allocDeviceBuf<::cms::alpakatools::AtomicPairCounter>(alpaka::getDev(queue), 1u)}, - device_nCells_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)} { - alpaka::memset(queue, counters_, 0, 1u); + device_hitTuple_apc_{cms::alpakatools::make_device_unique(1u)}, + device_hitToTuple_apc_{cms::alpakatools::make_device_unique(1u)}, + device_nCells_{cms::alpakatools::make_device_unique(1u)} { + auto counters_view = cms::alpakatools::createDeviceView(alpaka::getDev(queue), counters_.get(), 1u); + alpaka::memset(queue, counters_view, 0, 1u); - alpaka::memset(queue, device_nCells_, 0, 1u); + auto device_nCells_view = + cms::alpakatools::createDeviceView(alpaka::getDev(queue), device_nCells_.get(), 1u); + alpaka::memset(queue, device_nCells_view, 0, 1u); - launchZero(alpaka::getPtrNative(device_tupleMultiplicity_), queue); - launchZero(alpaka::getPtrNative(device_hitToTuple_), queue); + launchZero(device_tupleMultiplicity_.get(), queue); + launchZero(device_hitToTuple_.get(), queue); // we may wish to keep it in the edm... alpaka::wait(queue); @@ -204,7 +203,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ~CAHitNtupletGeneratorKernels() = default; - TupleMultiplicity const* tupleMultiplicity() const { return alpaka::getPtrNative(device_tupleMultiplicity_); } + TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, Queue& queue); @@ -222,33 +221,38 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // params Params const& m_params; - AlpakaDeviceBuf counters_; // NB: Counters: In legacy, sum of the stats of all events. + cms::alpakatools::device::unique_ptr counters_; // NB: Counters: In legacy, sum of the stats of all events. // Here instead, these stats are per event. // Does not matter much, as the stats are desactivated by default anyway, and are for debug only // (stats are not stored eventually, no interference with any result). // For debug, better to be able to see info per event that just a sum. // workspace - AlpakaDeviceBuf device_hitToTuple_; - AlpakaDeviceBuf device_tupleMultiplicity_; + cms::alpakatools::device::unique_ptr device_hitToTuple_; + cms::alpakatools::device::unique_ptr device_tupleMultiplicity_; - AlpakaDeviceBuf device_theCells_; // NB: In legacy, was allocated inside buildDoublets. - AlpakaDeviceBuf + cms::alpakatools::device::unique_ptr + device_theCells_; // NB: In legacy, was allocated inside buildDoublets. + cms::alpakatools::device::unique_ptr device_isOuterHitOfCell_; // NB: In legacy, was allocated inside buildDoublets. - AlpakaDeviceBuf device_theCellNeighbors_; - AlpakaDeviceBuf device_theCellTracks_; + cms::alpakatools::device::unique_ptr device_theCellNeighbors_; + cms::alpakatools::device::unique_ptr device_theCellTracks_; // AlpakaDeviceBuf cellStorage_; // NB: In legacy, was allocated inside buildDoublets. // NB: Here, data from cellstorage_ (legacy) directly owned by the following: - AlpakaDeviceBuf device_theCellNeighborsContainer_; // Was non-owning in legacy! - AlpakaDeviceBuf device_theCellTracksContainer_; // Was non-owning in legacy! + cms::alpakatools::device::unique_ptr + device_theCellNeighborsContainer_; // Was non-owning in legacy! + cms::alpakatools::device::unique_ptr + device_theCellTracksContainer_; // Was non-owning in legacy! // AlpakaDeviceBuf<::cms::alpakatools::AtomicPairCounter::c_type> device_storage_; // NB: In legacy // NB: Here, data from device_storage_ (legacy) directly owned by the following: - AlpakaDeviceBuf<::cms::alpakatools::AtomicPairCounter> device_hitTuple_apc_; // Was non-owning in legacy! - AlpakaDeviceBuf<::cms::alpakatools::AtomicPairCounter> device_hitToTuple_apc_; // Was non-owning in legacy! - AlpakaDeviceBuf device_nCells_; // Was non-owning in legacy! + cms::alpakatools::device::unique_ptr + device_hitTuple_apc_; // Was non-owning in legacy! + cms::alpakatools::device::unique_ptr + device_hitToTuple_apc_; // Was non-owning in legacy! + cms::alpakatools::device::unique_ptr device_nCells_; // Was non-owning in legacy! }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc index fdf2f4c8d..8cb499b84 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc @@ -7,6 +7,8 @@ #include #include +#include "AlpakaCore/device_unique_ptr.h" + #include "Framework/Event.h" #include "CAHitNtupletGeneratorOnGPU.h" @@ -91,8 +93,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { PixelTrackAlpaka CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DAlpaka const& hits_d, float bfield, Queue& queue) const { - PixelTrackAlpaka tracks{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; - auto* soa = alpaka::getPtrNative(tracks); + PixelTrackAlpaka tracks{cms::alpakatools::make_device_unique(1u)}; + auto* soa = tracks.get(); CAHitNtupletGeneratorKernels kernels(m_params, hits_d.nHits(), queue); kernels.buildDoublets(hits_d, queue); diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc index dc5208a0b..e142caa49 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc @@ -1,4 +1,4 @@ -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/host_unique_ptr.h" #include "AlpakaDataFormats/PixelTrackAlpaka.h" #include "Framework/EventSetup.h" @@ -75,9 +75,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData = iEvent.get(tokenAlpaka_); - auto outputData = ::cms::alpakatools::allocHostBuf(1u); + auto outputData = ::cms::alpakatools::make_host_unique(1u); ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; - alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); + auto const inputDataView = + ::cms::alpakatools::createDeviceView(alpaka::getDev(ctx.stream()), inputData.get(), 1u); + auto outputDataView = ::cms::alpakatools::createHostView(outputData.get(), 1u); + alpaka::memcpy(ctx.stream(), outputDataView, inputDataView, 1u); // DO NOT make a copy (actually TWO....) ctx.emplace(iEvent, tokenSOA_, std::move(outputData)); diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc b/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc index 34b39b2b8..5492e127b 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc @@ -1,6 +1,6 @@ #include "RiemannFitOnGPU.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -18,21 +18,20 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { Vec1D::all(numberOfBlocks / 4), Vec1D::all(blockSize)); // Fit internals - auto hitsGPU_ = cms::alpakatools::allocDeviceBuf( - alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hitsGPU_ = cms::alpakatools::make_device_unique(maxNumberOfConcurrentFits_ * + sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = cms::alpakatools::allocDeviceBuf( - alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hits_geGPU_ = cms::alpakatools::make_device_unique(maxNumberOfConcurrentFits_ * + sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = cms::alpakatools::allocDeviceBuf( - alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + auto fast_fit_resultsGPU_ = cms::alpakatools::make_device_unique(maxNumberOfConcurrentFits_ * + sizeof(Rfit::Vector4d) / sizeof(double)); //auto circle_fit_resultsGPU_holder = //cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); //Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - //auto circle_fit_resultsGPU_holder = cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); - auto circle_fit_resultsGPU_ = - cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), maxNumberOfConcurrentFits_); + //auto circle_fit_resultsGPU_holder = cms::alpakatools::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); + auto circle_fit_resultsGPU_ = cms::alpakatools::make_device_unique(maxNumberOfConcurrentFits_); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets @@ -43,9 +42,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, 3, hv, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), offset)); alpaka::enqueue(queue, @@ -54,10 +53,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, 3, bField_, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), - alpaka::getPtrNative(circle_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_.get(), offset)); alpaka::enqueue(queue, @@ -67,10 +66,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { 3, bField_, outputSoa_d, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), - alpaka::getPtrNative(circle_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_.get(), offset)); // quads @@ -81,9 +80,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, 4, hv, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), offset)); alpaka::enqueue(queue, @@ -92,10 +91,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, 4, bField_, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), - alpaka::getPtrNative(circle_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_.get(), offset)); alpaka::enqueue(queue, @@ -105,10 +104,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { 4, bField_, outputSoa_d, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), - alpaka::getPtrNative(circle_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_.get(), offset)); if (fit5as4_) { @@ -120,9 +119,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, 5, hv, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), offset)); alpaka::enqueue(queue, @@ -131,10 +130,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, 5, bField_, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), - alpaka::getPtrNative(circle_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_.get(), offset)); alpaka::enqueue(queue, @@ -144,10 +143,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { 5, bField_, outputSoa_d, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), - alpaka::getPtrNative(circle_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_.get(), offset)); alpaka::wait(queue); } else { @@ -159,9 +158,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, 5, hv, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), offset)); alpaka::enqueue(queue, @@ -170,10 +169,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { tupleMultiplicity_d, 5, bField_, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), - alpaka::getPtrNative(circle_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_.get(), offset)); alpaka::enqueue(queue, @@ -183,10 +182,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { 5, bField_, outputSoa_d, - alpaka::getPtrNative(hitsGPU_), - alpaka::getPtrNative(hits_geGPU_), - alpaka::getPtrNative(fast_fit_resultsGPU_), - alpaka::getPtrNative(circle_fit_resultsGPU_), + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_.get(), offset)); alpaka::wait(queue); } diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc index 0f77fbc6f..02b571b7b 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc @@ -48,8 +48,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { {} void PixelVertexProducerAlpaka::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - auto const& tracksBuf = iEvent.get(tokenTrack_); - auto const tracks = alpaka::getPtrNative(tracksBuf); + auto const& tracksPtr = iEvent.get(tokenTrack_); + auto const tracks = tracksPtr.get(); ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; ctx.emplace(iEvent, tokenVertex_, m_gpuAlgo.makeAsync(tracks, m_ptMin, ctx.stream())); diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc index fe6cd2b17..4e7255f9b 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc @@ -1,6 +1,7 @@ #include "AlpakaCore/ScopedContext.h" #include "AlpakaCore/alpakaCommon.h" #include "AlpakaCore/alpakaMemoryHelper.h" +#include "AlpakaCore/host_unique_ptr.h" #include "AlpakaDataFormats/ZVertexAlpaka.h" #include "Framework/EDProducer.h" #include "Framework/Event.h" @@ -57,9 +58,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { void PixelVertexSoAFromAlpaka::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData = iEvent.get(tokenAlpaka_); - auto outputData = ::cms::alpakatools::allocHostBuf(1u); + auto outputData = ::cms::alpakatools::make_host_unique(1u); ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; - alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); + auto const inputDataView = + ::cms::alpakatools::createDeviceView(alpaka::getDev(ctx.stream()), inputData.get(), 1u); + auto outputDataView = ::cms::alpakatools::createHostView(outputData.get(), 1u); + alpaka::memcpy(ctx.stream(), outputDataView, inputDataView, 1u); // No copies.... ctx.emplace(iEvent, tokenSOA_, std::move(outputData)); diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc index 600756040..0ab97ec62 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc @@ -1,4 +1,4 @@ -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" #include "gpuVertexFinder.h" #include "gpuClusterTracksByDensity.h" @@ -107,12 +107,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // std::cout << "producing Vertices on GPU" << std::endl; ALPAKA_ASSERT_OFFLOAD(tksoa); - ZVertexAlpaka vertices{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; - auto* soa = alpaka::getPtrNative(vertices); + ZVertexAlpaka vertices{::cms::alpakatools::make_device_unique(1u)}; + auto* soa = vertices.get(); ALPAKA_ASSERT_OFFLOAD(soa); - auto ws_dBuf{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; - auto ws_d = alpaka::getPtrNative(ws_dBuf); + auto ws_d = ws_dPtr.get(); auto nvFinalVerticesView = cms::alpakatools::createDeviceView(alpaka::getDev(queue), &soa->nvFinal, 1u); alpaka::memset(queue, nvFinalVerticesView, 0, 1u); diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.h index 909b2bf49..b877b26ef 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.h @@ -2,7 +2,7 @@ #ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h #define RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" #include "AlpakaDataFormats/ZVertexAlpaka.h" #include "AlpakaDataFormats/PixelTrackAlpaka.h" @@ -43,7 +43,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float ierrmax, // max error to be "seed" float ichi2max // max normalized distance to cluster ) - : oneKernel_(oneKernel && !(useDBSCAN || useIterative)), + : ws_dPtr(::cms::alpakatools::make_device_unique(1u)), + oneKernel_(oneKernel && !(useDBSCAN || useIterative)), useDensity_(useDensity), useDBSCAN_(useDBSCAN), useIterative_(useIterative), @@ -57,6 +58,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ZVertexAlpaka makeAsync(TkSoA const* tksoa, float ptMin, Queue& queue) const; private: + ::cms::alpakatools::device::unique_ptr ws_dPtr; + const bool oneKernel_; const bool useDensity_; const bool useDBSCAN_; diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc index 4c1bb9c6e..2eecbe749 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc @@ -4,7 +4,7 @@ #include "Framework/EventSetup.h" #include "Framework/ESPluginFactory.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" #include #include @@ -33,19 +33,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { Queue queue(devices[0]); auto cablingMap_h{cms::alpakatools::createHostView(&obj, 1u)}; - auto cablingMap_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; - alpaka::prepareForAsyncCopy(cablingMap_d); - alpaka::memcpy(queue, cablingMap_d, cablingMap_h, 1u); + auto cablingMap_d{cms::alpakatools::make_device_unique(1u)}; + auto cablingMap_d_view = + cms::alpakatools::createDeviceView(alpaka::getDev(queue), cablingMap_d.get(), 1u); + alpaka::memcpy(queue, cablingMap_d_view, cablingMap_h, 1u); eventSetup.put(std::make_unique(std::move(cablingMap_d), true)); auto modToUnp_h{cms::alpakatools::createHostView(modToUnpDefault.data(), modToUnpDefSize)}; - auto modToUnp_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), modToUnpDefSize)}; - alpaka::prepareForAsyncCopy(modToUnp_d); - alpaka::memcpy(queue, modToUnp_d, modToUnp_h, modToUnpDefSize); + auto modToUnp_d{cms::alpakatools::make_device_unique(modToUnpDefSize)}; + auto modToUnp_d_view = + cms::alpakatools::createDeviceView(alpaka::getDev(queue), modToUnp_d.get(), modToUnpDefSize); + alpaka::memcpy(queue, modToUnp_d_view, modToUnp_h, modToUnpDefSize); alpaka::wait(queue); - eventSetup.put(std::make_unique>(std::move(modToUnp_d))); + eventSetup.put(std::make_unique>(std::move(modToUnp_d))); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc index dd12bca26..31cd934c1 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc @@ -3,7 +3,7 @@ #include "Framework/EventSetup.h" #include "Framework/ESPluginFactory.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" #include #include @@ -48,22 +48,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const uint32_t numDecodingStructures = gainData.size() / sizeof(SiPixelGainForHLTonGPU_DecodingStructure); auto ped_h{cms::alpakatools::createHostView( reinterpret_cast(gainData.data()), numDecodingStructures)}; - auto ped_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), - numDecodingStructures)}; - alpaka::prepareForAsyncCopy(ped_d); - alpaka::memcpy(queue, ped_d, ped_h, numDecodingStructures); + auto ped_d{cms::alpakatools::make_device_unique(numDecodingStructures)}; + auto ped_d_view = cms::alpakatools::createDeviceView( + alpaka::getDev(queue), ped_d.get(), numDecodingStructures); + alpaka::memcpy(queue, ped_d_view, ped_h, numDecodingStructures); auto rangeAndCols_h{ cms::alpakatools::createHostView(gain.rangeAndCols, 2000u)}; - auto rangeAndCols_d{ - cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 2000u)}; - alpaka::prepareForAsyncCopy(rangeAndCols_d); - alpaka::memcpy(queue, rangeAndCols_d, rangeAndCols_h, 2000u); + auto rangeAndCols_d{cms::alpakatools::make_device_unique(2000u)}; + auto rangeAndCols_d_view = cms::alpakatools::createDeviceView( + alpaka::getDev(queue), rangeAndCols_d.get(), 2000u); + alpaka::memcpy(queue, rangeAndCols_d_view, rangeAndCols_h, 2000u); auto fields_h{cms::alpakatools::createHostView(&gain.fields_, 1u)}; - auto fields_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; - alpaka::prepareForAsyncCopy(fields_d); - alpaka::memcpy(queue, fields_d, fields_h, 1u); + auto fields_d{cms::alpakatools::make_device_unique(1u)}; + auto fields_d_view = + cms::alpakatools::createDeviceView(alpaka::getDev(queue), fields_d.get(), 1u); + alpaka::memcpy(queue, fields_d_view, fields_h, 1u); alpaka::wait(queue); diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc index b6fb2c74f..b8ec96847 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc @@ -17,7 +17,7 @@ #include "../ErrorChecker.h" #include "SiPixelRawToClusterGPUKernel.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" #include #include @@ -71,7 +71,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // get the GPU product already here so that the async transfer can begin const auto* gpuMap = hgpuMap.cablingMap(); - const unsigned char* gpuModulesToUnpack = alpaka::getPtrNative(iSetup.get>()); + const unsigned char* gpuModulesToUnpack = iSetup.get<::cms::alpakatools::device::unique_ptr>().get(); const auto* gpuGains = &(iSetup.get()); auto const& fedIds_ = iSetup.get().fedIds(); diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc index 36b6b5884..f2623873d 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc @@ -18,6 +18,8 @@ #include // Alpaka includes +#include "AlpakaCore/host_unique_ptr.h" +#include "AlpakaCore/device_unique_ptr.h" #include "AlpakaCore/prefixScan.h" // CMSSW includes @@ -32,15 +34,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { namespace pixelgpudetails { SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() - : word_{::cms::alpakatools::allocHostBuf(MAX_FED_WORDS)}, - fedId_{::cms::alpakatools::allocHostBuf(MAX_FED_WORDS)} {} + : word_{cms::alpakatools::make_host_unique(MAX_FED_WORDS)}, + fedId_{cms::alpakatools::make_host_unique(MAX_FED_WORDS)} {} void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const uint32_t *src, unsigned int length) { - std::memcpy(alpaka::getPtrNative(word_) + wordCounterGPU, src, sizeof(uint32_t) * length); - std::memset(alpaka::getPtrNative(fedId_) + wordCounterGPU / 2, fedId - 1200, length / 2); + std::memcpy(word() + wordCounterGPU, src, sizeof(uint32_t) * length); + std::memset(this->fedId() + wordCounterGPU / 2, fedId - 1200, length / 2); } //////////////////// @@ -563,12 +565,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::cout << "decoding " << wordCounter << " digis. Max is " << pixelgpudetails::MAX_FED_WORDS << std::endl; #endif - digis_d = SiPixelDigisAlpaka(alpaka::getDev(queue), pixelgpudetails::MAX_FED_WORDS); + digis_d = SiPixelDigisAlpaka(pixelgpudetails::MAX_FED_WORDS); if (includeErrors) { - digiErrors_d = - SiPixelDigiErrorsAlpaka(alpaka::getDev(queue), pixelgpudetails::MAX_FED_WORDS, std::move(errors), queue); + digiErrors_d = SiPixelDigiErrorsAlpaka(pixelgpudetails::MAX_FED_WORDS, std::move(errors), queue); } - clusters_d = SiPixelClustersAlpaka(alpaka::getDev(queue), gpuClustering::MaxNumModules); + clusters_d = SiPixelClustersAlpaka(gpuClustering::MaxNumModules); if (wordCounter) // protect in case of empty event.... { @@ -585,14 +586,20 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(0 == wordCounter % 2); // wordCounter is the total no of words in each event to be trasfered on device - auto word_d = cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), wordCounter); + auto word_d = cms::alpakatools::make_device_unique(wordCounter); // NB: IMPORTANT: fedId_d: In legacy, wordCounter elements are allocated. // However, only the first half of elements end up eventually used: // hence, here, only wordCounter/2 elements are allocated. - auto fedId_d = cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), wordCounter / 2); + auto fedId_d = cms::alpakatools::make_device_unique(wordCounter / 2); - alpaka::memcpy(queue, word_d, wordFed.word(), wordCounter); - alpaka::memcpy(queue, fedId_d, wordFed.fedId(), wordCounter / 2); + auto word_d_view = + cms::alpakatools::createDeviceView(alpaka::getDev(queue), word_d.get(), wordCounter); + auto fedId_d_view = + cms::alpakatools::createDeviceView(alpaka::getDev(queue), fedId_d.get(), wordCounter / 2); + auto word_view = cms::alpakatools::createHostView(wordFed.word(), MAX_FED_WORDS); + auto fedId_view = cms::alpakatools::createHostView(wordFed.fedId(), MAX_FED_WORDS); + alpaka::memcpy(queue, word_d_view, word_view, wordCounter); + alpaka::memcpy(queue, fedId_d_view, fedId_view, wordCounter / 2); // Launch rawToDigi kernel alpaka::enqueue(queue, @@ -601,8 +608,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cablingMap, modToUnp, wordCounter, - alpaka::getPtrNative(word_d), - alpaka::getPtrNative(fedId_d), + word_d.get(), + fedId_d.get(), digis_d->xx(), digis_d->yy(), digis_d->adc(), @@ -675,7 +682,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto moduleStartFirstElement = cms::alpakatools::createDeviceView(alpaka::getDev(queue), clusters_d->moduleStart(), 1u); - alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement, 1u); + auto nModules_Clusters_h_view = cms::alpakatools::createHostView(nModules_Clusters_h.get(), 2u); + alpaka::memcpy(queue, nModules_Clusters_h_view, moduleStartFirstElement, 1u); const WorkDiv1D &workDivMaxNumModules = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(MaxNumModules), Vec1D::all(256)); @@ -737,13 +745,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const auto clusModuleStartLastElement = AlpakaDeviceSubView(clusModuleStartView, 1u, gpuClustering::MaxNumModules); // slice on host - auto nModules_Clusters_1_h{::cms::alpakatools::allocHostBuf(1u)}; - auto p_nModules_Clusters_1_h = alpaka::getPtrNative(nModules_Clusters_1_h); + auto nModules_Clusters_1_h{cms::alpakatools::make_host_unique(1u)}; + auto p_nModules_Clusters_1_h = nModules_Clusters_1_h.get(); + auto nModules_Clusters_1_h_view = cms::alpakatools::createHostView(p_nModules_Clusters_1_h, 1u); - alpaka::memcpy(queue, nModules_Clusters_1_h, clusModuleStartLastElement, 1u); + alpaka::memcpy(queue, nModules_Clusters_1_h_view, clusModuleStartLastElement, 1u); // Wait for memory transfer to host to complete before looking at host data! alpaka::wait(queue); - auto p_nModules_Clusters_h = alpaka::getPtrNative(nModules_Clusters_h); + auto p_nModules_Clusters_h = nModules_Clusters_h.get(); p_nModules_Clusters_h[1] = p_nModules_Clusters_1_h[0]; } // end clusterizer scope } diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h index 04130fe91..ff4faede9 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h @@ -7,6 +7,8 @@ #include "AlpakaDataFormats/SiPixelClustersAlpaka.h" #include "AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h" #include "AlpakaDataFormats/SiPixelDigisAlpaka.h" +#include "AlpakaCore/host_unique_ptr.h" + #include "AlpakaDataFormats/gpuClusteringConstants.h" #include "CondFormats/SiPixelFedCablingMapGPU.h" #include "CondFormats/SiPixelGainForHLTonGPU.h" @@ -158,16 +160,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { void initializeWordFed(int fedId, unsigned int wordCounterGPU, const uint32_t* src, unsigned int length); - auto word() const { return word_; } - auto fedId() const { return fedId_; } + auto word() const { return word_.get(); } + auto fedId() const { return fedId_.get(); } private: - AlpakaHostBuf word_; - AlpakaHostBuf fedId_; + ::cms::alpakatools::host::unique_ptr word_; + ::cms::alpakatools::host::unique_ptr fedId_; }; - SiPixelRawToClusterGPUKernel() : nModules_Clusters_h{::cms::alpakatools::allocHostBuf(2u)} {} - + SiPixelRawToClusterGPUKernel() : nModules_Clusters_h{::cms::alpakatools::make_host_unique(2u)} {}; + // digis_d{SiPixelDigisAlpaka(0u)}, + // clusters_d{SiPixelClustersAlpaka(0u)}, + // digiErrors_d{SiPixelDigiErrorsAlpaka(0u, PixelFormatterErrors())} {}; ~SiPixelRawToClusterGPUKernel() = default; SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete; @@ -189,7 +193,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { Queue& queue); std::pair getResults() { - auto pnModules_Clusters_h = alpaka::getPtrNative(nModules_Clusters_h); + auto pnModules_Clusters_h = nModules_Clusters_h.get(); digis_d->setNModulesDigis(pnModules_Clusters_h[0], nDigis); clusters_d->setNClusters(pnModules_Clusters_h[1]); return std::make_pair(std::move(*digis_d), std::move(*clusters_d)); @@ -201,7 +205,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t nDigis = 0; // Data to be put in the event - AlpakaHostBuf nModules_Clusters_h; + ::cms::alpakatools::host::unique_ptr nModules_Clusters_h; std::optional digis_d; std::optional clusters_d; std::optional digiErrors_d; diff --git a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc index 67d407eae..0b620a6a9 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc @@ -3,7 +3,7 @@ #include "Framework/EventSetup.h" #include "Framework/ESPluginFactory.h" -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/device_unique_ptr.h" #include #include @@ -23,15 +23,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::ifstream in((data_ + "/cpefast.bin").c_str(), std::ios::binary); in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); - // TODO FIXME use the correct device - Queue queue(devices[0]); + Queue queue(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0]); pixelCPEforGPU::CommonParams commonParams; in.read(reinterpret_cast(&commonParams), sizeof(pixelCPEforGPU::CommonParams)); auto commonParams_h{cms::alpakatools::createHostView(&commonParams, 1u)}; - auto commonParams_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; - alpaka::prepareForAsyncCopy(commonParams_d); - alpaka::memcpy(queue, commonParams_d, commonParams_h, 1u); + auto commonParams_d{cms::alpakatools::make_device_unique(1u)}; + auto commonParams_d_view{cms::alpakatools::createDeviceView( + alpaka::getDev(queue), commonParams_d.get(), 1u)}; + alpaka::memcpy(queue, commonParams_d_view, commonParams_h, 1u); unsigned int ndetParams; in.read(reinterpret_cast(&ndetParams), sizeof(unsigned int)); @@ -40,34 +40,37 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::vector detParams(ndetParams); in.read(reinterpret_cast(detParams.data()), ndetParams * sizeof(pixelCPEforGPU::DetParams)); auto detParams_h{cms::alpakatools::createHostView(detParams.data(), ndetParams)}; - auto detParams_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), ndetParams)}; - alpaka::prepareForAsyncCopy(detParams_d); - alpaka::memcpy(queue, detParams_d, detParams_h, ndetParams); + auto detParams_d{cms::alpakatools::make_device_unique(ndetParams)}; + auto detParams_d_view{cms::alpakatools::createDeviceView( + alpaka::getDev(queue), detParams_d.get(), ndetParams)}; + alpaka::memcpy(queue, detParams_d_view, detParams_h, ndetParams); pixelCPEforGPU::AverageGeometry averageGeometry; in.read(reinterpret_cast(&averageGeometry), sizeof(pixelCPEforGPU::AverageGeometry)); auto averageGeometry_h{cms::alpakatools::createHostView(&averageGeometry, 1u)}; - auto averageGeometry_d{ - cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; - alpaka::prepareForAsyncCopy(averageGeometry_d); - alpaka::memcpy(queue, averageGeometry_d, averageGeometry_h, 1u); + auto averageGeometry_d{cms::alpakatools::make_device_unique(1u)}; + auto averageGeometry_d_view{cms::alpakatools::createDeviceView( + alpaka::getDev(queue), averageGeometry_d.get(), 1u)}; + alpaka::memcpy(queue, averageGeometry_d_view, averageGeometry_h, 1u); pixelCPEforGPU::LayerGeometry layerGeometry; in.read(reinterpret_cast(&layerGeometry), sizeof(pixelCPEforGPU::LayerGeometry)); auto layerGeometry_h{cms::alpakatools::createHostView(&layerGeometry, 1u)}; - auto layerGeometry_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; - alpaka::prepareForAsyncCopy(layerGeometry_d); - alpaka::memcpy(queue, layerGeometry_d, layerGeometry_h, 1u); + auto layerGeometry_d{cms::alpakatools::make_device_unique(1u)}; + auto layerGeometry_d_view{cms::alpakatools::createDeviceView( + alpaka::getDev(queue), layerGeometry_d.get(), 1u)}; + alpaka::memcpy(queue, layerGeometry_d_view, layerGeometry_h, 1u); pixelCPEforGPU::ParamsOnGPU params; - params.m_commonParams = alpaka::getPtrNative(commonParams_d); - params.m_detParams = alpaka::getPtrNative(detParams_d); - params.m_layerGeometry = alpaka::getPtrNative(layerGeometry_d); - params.m_averageGeometry = alpaka::getPtrNative(averageGeometry_d); + params.m_commonParams = commonParams_d.get(); + params.m_detParams = detParams_d.get(); + params.m_layerGeometry = layerGeometry_d.get(); + params.m_averageGeometry = averageGeometry_d.get(); auto params_h{cms::alpakatools::createHostView(¶ms, 1u)}; - auto params_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; - alpaka::prepareForAsyncCopy(params_d); - alpaka::memcpy(queue, params_d, params_h, 1u); + auto params_d{cms::alpakatools::make_device_unique(1u)}; + auto params_d_view{ + cms::alpakatools::createDeviceView(alpaka::getDev(queue), params_d.get(), 1u)}; + alpaka::memcpy(queue, params_d_view, params_h, 1u); alpaka::wait(queue); diff --git a/src/alpaka/plugin-Validation/alpaka/CountValidator.cc b/src/alpaka/plugin-Validation/alpaka/CountValidator.cc index 7d715061f..c0be191c3 100644 --- a/src/alpaka/plugin-Validation/alpaka/CountValidator.cc +++ b/src/alpaka/plugin-Validation/alpaka/CountValidator.cc @@ -92,8 +92,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { { auto const& count = iEvent.get(trackCountToken_); - auto const& tracksBuf = iEvent.get(trackToken_); - auto const tracks = alpaka::getPtrNative(tracksBuf); + auto const& tracksPtr = iEvent.get(trackToken_); + auto const tracks = tracksPtr.get(); int nTracks = 0; for (int i = 0; i < tracks->stride(); ++i) { @@ -116,8 +116,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { { auto const& count = iEvent.get(vertexCountToken_); - auto const& verticesBuf = iEvent.get(vertexToken_); - auto const vertices = alpaka::getPtrNative(verticesBuf); + auto const& verticesPtr = iEvent.get(vertexToken_); + auto const vertices = verticesPtr.get(); auto diff = std::abs(int(vertices->nvFinal) - int(count.nVertices())); if (diff != 0) { diff --git a/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc b/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc index 29ef61b38..2899e77bd 100644 --- a/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc +++ b/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc @@ -1,4 +1,4 @@ -#include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/host_unique_ptr.h" #include "AlpakaDataFormats/gpuClusteringConstants.h" #include "AlpakaDataFormats/PixelTrackAlpaka.h" #include "AlpakaDataFormats/SiPixelClustersAlpaka.h" @@ -120,39 +120,39 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const nHits = hits.nHits(); #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - // TODO FIXME use the correct device - Queue queue(devices[0]); - auto const h_adcBuf = digis.adcToHostAsync(queue); - auto const h_adc = alpaka::getPtrNative(h_adcBuf); + Queue queue(ALPAKA_ACCELERATOR_NAMESPACE::devices[0]); + auto const h_adcPtr = digis.adcToHostAsync(queue); + auto const h_adc = h_adcPtr.get(); auto const d_clusInModuleView = cms::alpakatools::createDeviceView( alpaka::getDev(queue), clusters.clusInModule(), gpuClustering::MaxNumModules); - auto h_clusInModuleBuf{::cms::alpakatools::allocHostBuf(gpuClustering::MaxNumModules)}; - alpaka::memcpy(queue, h_clusInModuleBuf, d_clusInModuleView, gpuClustering::MaxNumModules); - auto h_clusInModule = alpaka::getPtrNative(h_clusInModuleBuf); - - auto const h_lxBuf = hits.xlToHostAsync(queue); - auto const h_lx = alpaka::getPtrNative(h_lxBuf); - auto const h_lyBuf = hits.ylToHostAsync(queue); - auto const h_ly = alpaka::getPtrNative(h_lyBuf); - auto const h_lexBuf = hits.xerrToHostAsync(queue); - auto const h_lex = alpaka::getPtrNative(h_lexBuf); - auto const h_leyBuf = hits.yerrToHostAsync(queue); - auto const h_ley = alpaka::getPtrNative(h_leyBuf); - auto const h_gxBuf = hits.xgToHostAsync(queue); - auto const h_gx = alpaka::getPtrNative(h_gxBuf); - auto const h_gyBuf = hits.ygToHostAsync(queue); - auto const h_gy = alpaka::getPtrNative(h_gyBuf); - auto const h_gzBuf = hits.zgToHostAsync(queue); - auto const h_gz = alpaka::getPtrNative(h_gzBuf); - auto const h_grBuf = hits.rgToHostAsync(queue); - auto const h_gr = alpaka::getPtrNative(h_grBuf); - auto const h_chargeBuf = hits.chargeToHostAsync(queue); - auto const h_charge = alpaka::getPtrNative(h_chargeBuf); - auto const h_sizexBuf = hits.xsizeToHostAsync(queue); - auto const h_sizex = alpaka::getPtrNative(h_sizexBuf); - auto const h_sizeyBuf = hits.ysizeToHostAsync(queue); - auto const h_sizey = alpaka::getPtrNative(h_sizeyBuf); + auto h_clusInModulePtr{cms::alpakatools::make_host_unique(gpuClustering::MaxNumModules)}; + auto h_clusInModule{h_clusInModulePtr.get()}; + auto h_clusInModuleView{cms::alpakatools::createHostView(h_clusInModule, gpuClustering::MaxNumModules)}; + alpaka::memcpy(queue, h_clusInModuleView, d_clusInModuleView, gpuClustering::MaxNumModules); + + auto const h_lxPtr = hits.xlToHostAsync(queue); + auto const h_lx = h_lxPtr.get(); + auto const h_lyPtr = hits.ylToHostAsync(queue); + auto const h_ly = h_lyPtr.get(); + auto const h_lexPtr = hits.xerrToHostAsync(queue); + auto const h_lex = h_lexPtr.get(); + auto const h_leyPtr = hits.yerrToHostAsync(queue); + auto const h_ley = h_leyPtr.get(); + auto const h_gxPtr = hits.xgToHostAsync(queue); + auto const h_gx = h_gxPtr.get(); + auto const h_gyPtr = hits.ygToHostAsync(queue); + auto const h_gy = h_gyPtr.get(); + auto const h_gzPtr = hits.zgToHostAsync(queue); + auto const h_gz = h_gzPtr.get(); + auto const h_grPtr = hits.rgToHostAsync(queue); + auto const h_gr = h_grPtr.get(); + auto const h_chargePtr = hits.chargeToHostAsync(queue); + auto const h_charge = h_chargePtr.get(); + auto const h_sizexPtr = hits.xsizeToHostAsync(queue); + auto const h_sizex = h_sizexPtr.get(); + auto const h_sizeyPtr = hits.ysizeToHostAsync(queue); + auto const h_sizey = h_sizeyPtr.get(); alpaka::wait(queue); #else @@ -200,8 +200,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } { - auto const& tracksBuf = iEvent.get(trackToken_); - auto const tracks = alpaka::getPtrNative(tracksBuf); + auto const& tracksPtr = iEvent.get(trackToken_); + auto const tracks = tracksPtr.get(); int nTracks = 0; for (int i = 0; i < tracks->stride(); ++i) { @@ -224,8 +224,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } { - auto const& verticesBuf = iEvent.get(vertexToken_); - auto const vertices = alpaka::getPtrNative(verticesBuf); + auto const& verticesPtr = iEvent.get(vertexToken_); + auto const vertices = verticesPtr.get(); histos["vertex_n"].fill(vertices->nvFinal); for (uint32_t i = 0; i < vertices->nvFinal; ++i) {