diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
index 6d8527935e334..6271b1cc0941b 100644
--- a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
+++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
@@ -13,6 +13,7 @@
 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"
 namespace edm {
   class StreamContext;
 }
@@ -27,6 +28,7 @@ class CUDAMonitoringService {
   void postModuleConstruction(edm::ModuleDescription const& desc);
   void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
+  void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
   void postEvent(edm::StreamContext const& sc);
 
 private:
@@ -46,6 +48,9 @@ CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, ed
   if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
     registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
   }
+  if (config.getUntrackedParameter<bool>("memoryPerModule")) {
+    registry.watchPostModuleEvent(this, &CUDAMonitoringService::postModuleEvent);
+  }
   if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
     registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
   }
@@ -58,6 +63,8 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& des
       ->setComment("Print memory information for each device after the construction of each module");
   desc.addUntracked<bool>("memoryBeginStream", true)
       ->setComment("Print memory information for each device after the beginStream() of each module");
+  desc.addUntracked<bool>("memoryPerModule", true)
+      ->setComment("Print memory information for each device after each module processes an event");
   desc.addUntracked<bool>("memoryPerEvent", true)
       ->setComment("Print memory information for each device after each event");
@@ -71,15 +78,23 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& des
 namespace {
   template <typename T>
   void dumpUsedMemory(T& log, int num) {
+    auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();
     int old = 0;
     cudaCheck(cudaGetDevice(&old));
+    constexpr auto mbytes = 1 << 20;
     for (int i = 0; i < num; ++i) {
       size_t freeMemory, totalMemory;
       cudaCheck(cudaSetDevice(i));
       cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
       log << "\n"
-          << i << ": " << (totalMemory - freeMemory) / (1 << 20) << " MB used / " << totalMemory / (1 << 20)
-          << " MB total";
+          << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
+      auto found = cachingDeviceAllocatorStatus.find(i);
+      if (found != cachingDeviceAllocatorStatus.end()) {
+        auto const& cached = found->second;
+        log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
+            << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
+            << (cached.live + cached.free) / mbytes << " MB total cached";
+      }
     }
     cudaCheck(cudaSetDevice(old));
   }
@@ -98,6 +113,13 @@ void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm
   dumpUsedMemory(log, numberOfDevices_);
 }
 
+void CUDAMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
+  auto log = edm::LogPrint("CUDAMonitoringService");
+  log << "CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
+      << mcc.moduleDescription()->moduleName() << ")";
+  dumpUsedMemory(log, numberOfDevices_);
+}
+
 void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
   auto log = edm::LogPrint("CUDAMonitoringService");
   log << "CUDA device memory after event";
"CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " (" + << mcc.moduleDescription()->moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) { auto log = edm::LogPrint("CUDAMonitoringService"); log << "CUDA device memory after event"; diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh index bde3e26382976..b78fe644422f7 100755 --- a/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh @@ -1,10 +1,78 @@ #! /bin/bash -e +function help() { + cat <<@EOF +Usage: + cmsCudaRebuild.sh [-g|-G] [-v] [-h] + +Check out and build all packages that contain CUDA code in .cu files. + +Options: + -g Compile with debugging symbols, passing + "-g -rdynamic" to the host compiler, and + "-g -lineinfo" to CUDA compiler + + -G Compile with debugging symbols and enable asserts on the GPU, passing + "-g -rdynamic -DGPU_DEBUG" to the host compiler, and + "-g -lineinfo -DGPU_DEBUG" to the CUDA compiler. + + -h Show this help, and exit. + + -v Make scram be verbose. + +@EOF +} + + +DEBUG=0 +VERBOSE=0 + +while [ "$*" ]; do + case "$1" in + -h) + help + exit 0 + ;; + -g) + DEBUG=1 + shift + ;; + -G) + DEBUG=2 + shift + ;; + -v) + VERBOSE=$((VERBOSE + 1)) + shift + ;; + *) + help + exit 1 + ;; + esac +done + # move to the .../src directory cd $CMSSW_BASE/src/ # check out all packages containing .cu files git ls-files --full-name | grep '.*\.cu$' | cut -d/ -f-2 | sort -u | xargs git cms-addpkg +# set additional compilation flags +if (( DEBUG == 1 )); then + export USER_CXXFLAGS="-g -rdynamic $USER_CXXFLAGS" + export USER_CUDA_FLAGS="-g -lineinfo $USER_CUDA_FLAGS" +elif (( DEBUG == 2 )); then + export USER_CXXFLAGS="-g -rdynamic -DGPU_DEBUG $USER_CXXFLAGS" + export USER_CUDA_FLAGS="-g -lineinfo -DGPU_DEBUG $USER_CUDA_FLAGS" +fi + +if (( VERBOSE > 0 )); then + SCRAM_VERBOSE="-v" +fi + +# clean all built packages +scram b clean + # rebuild all checked out packages -scram b -j +scram b $SCRAM_VERBOSE -j diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh index f3335f4cd409f..c130bf77688d5 100755 --- a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh @@ -6,7 +6,8 @@ DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u) CAPS=$(echo $DOTS | sed -e's#\.*##g') # remove existing capabilities -sed -i $TOOL -e'\##d' +sed -i $TOOL -e"s#-gencode arch=compute_..,code=sm_.. 
diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
index f3335f4cd409f..c130bf77688d5 100755
--- a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
+++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
@@ -6,7 +6,8 @@
 DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u)
 CAPS=$(echo $DOTS | sed -e's#\.*##g')
 
 # remove existing capabilities
-sed -i $TOOL -e'\##d'
+sed -i $TOOL -e"s#-gencode arch=compute_..,code=sm_.. *##g"
+sed -i $TOOL -e"\##d"
 
 # add support for the capabilities found on this machine
 for CAP in $CAPS; do
diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
index 102a1d7bf2d86..c7a84573d4538 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
@@ -3,13 +3,16 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 #include <type_traits>
 
 namespace cms {
   namespace cuda {
+
     // Single element
+
     template <typename T>
     inline void copyAsync(device::unique_ptr<T>& dst, const host::unique_ptr<T>& src, cudaStream_t stream) {
       // Shouldn't compile for array types because of sizeof(T), but
@@ -19,6 +22,15 @@ namespace cms {
       cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
     }
 
+    template <typename T>
+    inline void copyAsync(device::unique_ptr<T>& dst, const host::noncached::unique_ptr<T>& src, cudaStream_t stream) {
+      // Shouldn't compile for array types because of sizeof(T), but
+      // let's add an assert with a more helpful message
+      static_assert(std::is_array<T>::value == false,
+                    "For array types, use the other overload with the size parameter");
+      cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
+    }
+
     template <typename T>
     inline void copyAsync(host::unique_ptr<T>& dst, const device::unique_ptr<T>& src, cudaStream_t stream) {
       static_assert(std::is_array<T>::value == false,
@@ -27,6 +39,7 @@ namespace cms {
     }
 
     // Multiple elements
+
     template <typename T>
     inline void copyAsync(device::unique_ptr<T[]>& dst,
                           const host::unique_ptr<T[]>& src,
@@ -35,6 +48,14 @@ namespace cms {
       cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
     }
 
+    template <typename T>
+    inline void copyAsync(device::unique_ptr<T[]>& dst,
+                          const host::noncached::unique_ptr<T[]>& src,
+                          size_t nelements,
+                          cudaStream_t stream) {
+      cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
+    }
+
     template <typename T>
     inline void copyAsync(host::unique_ptr<T[]>& dst,
                           const device::unique_ptr<T[]>& src,
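Usage sketch for the overloads added above (illustrative only, not part of this patch). It assumes the existing cms::cuda::make_device_unique and cms::cuda::make_host_noncached_unique helpers and the cudaHostAllocWriteCombined flag; check the actual helper signatures in the corresponding headers before relying on them.

// Illustrative sketch only -- not part of this patch.
#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"

void uploadExample(cudaStream_t stream) {
  constexpr size_t n = 1024;
  // non-cached (write-combined) host buffer; the flag and helper signatures are assumptions
  auto hostBuffer = cms::cuda::make_host_noncached_unique<float[]>(n, cudaHostAllocWriteCombined);
  auto deviceBuffer = cms::cuda::make_device_unique<float[]>(n, stream);
  for (size_t i = 0; i < n; ++i) {
    hostBuffer[i] = static_cast<float>(i);
  }
  // uses the array overload added above: host::noncached::unique_ptr<T[]> -> device::unique_ptr<T[]>
  cms::cuda::copyAsync(deviceBuffer, hostBuffer, n, stream);
}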
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
index 593821fe805ed..f9b4b2f8a4c16 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
@@ -27,6 +27,13 @@ namespace cms {
     extern thread_local dim3 blockIdx;
     extern thread_local dim3 gridDim;
 
+    template <typename T1, typename T2>
+    T1 atomicCAS(T1* address, T1 compare, T2 val) {
+      T1 old = *address;
+      *address = old == compare ? val : old;
+      return old;
+    }
+
     template <typename T1, typename T2>
     T1 atomicInc(T1* a, T2 b) {
       auto ret = *a;
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h b/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h
deleted file mode 100644
index 89f131edd941e..0000000000000
--- a/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef HeterogeneousCore_CUDAUtilities_cuda_cxx17_h
-#define HeterogeneousCore_CUDAUtilities_cuda_cxx17_h
-
-#include
-
-// CUDA does not support C++17 yet, so we define here some of the missing library functions
-#if __cplusplus <= 201402L
-
-namespace std {
-
-  // from https://en.cppreference.com/w/cpp/iterator/size
-  template <class C>
-  constexpr auto size(const C& c) -> decltype(c.size()) {
-    return c.size();
-  }
-
-  template <class T, std::size_t N>
-  constexpr std::size_t size(const T (&array)[N]) noexcept {
-    return N;
-  }
-
-  // from https://en.cppreference.com/w/cpp/iterator/empty
-  template <class C>
-  constexpr auto empty(const C& c) -> decltype(c.empty()) {
-    return c.empty();
-  }
-
-  template <class T, std::size_t N>
-  constexpr bool empty(const T (&array)[N]) noexcept {
-    return false;
-  }
-
-  template <class E>
-  constexpr bool empty(std::initializer_list<E> il) noexcept {
-    return il.size() == 0;
-  }
-
-  // from https://en.cppreference.com/w/cpp/iterator/data
-  template <class C>
-  constexpr auto data(C& c) -> decltype(c.data()) {
-    return c.data();
-  }
-
-  template <class C>
-  constexpr auto data(const C& c) -> decltype(c.data()) {
-    return c.data();
-  }
-
-  template <class T, std::size_t N>
-  constexpr T* data(T (&array)[N]) noexcept {
-    return array;
-  }
-
-  template <class E>
-  constexpr const E* data(std::initializer_list<E> il) noexcept {
-    return il.begin();
-  }
-
-}  // namespace std
-
-#endif
-
-#endif  // HeterogeneousCore_CUDAUtilities_cuda_cxx17_h
diff --git a/HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h b/HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h
new file mode 100644
index 0000000000000..92f9f87e890ac
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h
@@ -0,0 +1,23 @@
+#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
+#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
+
+#include <map>
+
+namespace cms {
+  namespace cuda {
+    namespace allocator {
+      struct TotalBytes {
+        size_t free;
+        size_t live;
+        size_t liveRequested;  // CMS: monitor also requested amount
+        TotalBytes() { free = live = liveRequested = 0; }
+      };
+      /// Map type of device ordinals to the number of bytes cached by each device
+      using GpuCachedBytes = std::map<int, TotalBytes>;
+    }  // namespace allocator
+
+    allocator::GpuCachedBytes deviceAllocatorStatus();
+  }  // namespace cuda
+}  // namespace cms
+
+#endif
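A minimal sketch of querying the new cms::cuda::deviceAllocatorStatus() API from host code (illustrative only, not part of this patch); it only touches the GpuCachedBytes map and the TotalBytes counters declared above.

// Illustrative sketch only -- not part of this patch.
#include <iostream>

#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"

void printAllocatorStatus() {
  // GpuCachedBytes maps a device ordinal to its TotalBytes counters
  for (auto const& [device, bytes] : cms::cuda::deviceAllocatorStatus()) {
    std::cout << "device " << device << ": live " << bytes.live << " B (requested " << bytes.liveRequested
              << " B), free " << bytes.free << " B cached" << std::endl;
  }
}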
diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
index 6d4d5f4e7cc5e..33dc6a18ffa2a 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -41,9 +41,9 @@ namespace cms {
   namespace cuda {
 
     // limited to 32*32 elements....
-    template <typename T>
-    __host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci,
-                                                             T* __restrict__ co,
+    template <typename VT, typename T>
+    __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci,
+                                                             VT* co,
                                                              uint32_t size,
                                                              T* ws
 #ifndef __CUDA_ARCH__
@@ -138,7 +138,9 @@ namespace cms {
     // in principle not limited....
     template <typename T>
-    __global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) {
+    __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
+      volatile T const* ci = ici;
+      volatile T* co = ico;
       __shared__ T ws[32];
 #ifdef __CUDA_ARCH__
       assert(sizeof(T) * gridDim.x <= dynamic_smem_size());  // size of psum below
 #endif
@@ -152,6 +154,7 @@ namespace cms {
       // count blocks that finished
       __shared__ bool isLastBlockDone;
       if (0 == threadIdx.x) {
+        __threadfence();
        auto value = atomicAdd(pc, 1);  // block counter
        isLastBlockDone = (value == (int(gridDim.x) - 1));
      }
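A rough launch sketch for multiBlockPrefixScan (illustrative only, not part of this patch): the block counter pc must start at zero and the dynamic shared memory must hold one T per block, matching the assert shown above. The assumption that each block scans blockDim.x elements reflects the current kernel implementation and should be double-checked against it.

// Illustrative sketch only -- not part of this patch; the launch geometry is an assumption.
#include <cstdint>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"

void runMultiBlockScan(int32_t const* d_in, int32_t* d_out, int32_t size, cudaStream_t stream) {
  const int nthreads = 1024;                             // within the 32*32 blockPrefixScan limit
  const int nblocks = (size + nthreads - 1) / nthreads;  // assumes one element per thread
  int32_t* d_counter = nullptr;                          // block counter used with atomicAdd in the kernel
  cudaCheck(cudaMalloc(&d_counter, sizeof(int32_t)));
  cudaCheck(cudaMemsetAsync(d_counter, 0, sizeof(int32_t), stream));
  // dynamic shared memory: one partial sum per block, as required by the assert in the kernel
  cms::cuda::multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
      d_in, d_out, size, d_counter);
  cudaCheck(cudaGetLastError());
  cudaCheck(cudaStreamSynchronize(stream));
  cudaCheck(cudaFree(d_counter));
}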
      BlockDescriptor search_key(device);
+      search_key.bytesRequested = bytes;  // CMS
       search_key.associated_stream = active_stream;
       NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
@@ -381,6 +387,7 @@
         // Remove from free blocks
         cached_bytes[device].free -= search_key.bytes;
         cached_bytes[device].live += search_key.bytes;
+        cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS
 
         if (debug)
           // CMS: improved debug message
@@ -490,6 +497,7 @@
       mutex_locker.lock();
       live_blocks.insert(search_key);
       cached_bytes[device].live += search_key.bytes;
+      cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS
       mutex_locker.unlock();
 
       if (debug)
@@ -569,6 +577,7 @@
         search_key = *block_itr;
         live_blocks.erase(block_itr);
         cached_bytes[device].live -= search_key.bytes;
+        cached_bytes[device].liveRequested -= search_key.bytesRequested;  // CMS
 
         // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
         if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) {
@@ -715,6 +724,12 @@
       return error;
     }
 
+    // CMS: give access to cache allocation status
+    GpuCachedBytes CacheStatus() const {
+      std::unique_lock<std::mutex> mutex_locker(mutex);
+      return cached_bytes;
+    }
+
     /**
      * \brief Destructor
      */
diff --git a/HeterogeneousCore/CUDAUtilities/src/deviceAllocatorStatus.cc b/HeterogeneousCore/CUDAUtilities/src/deviceAllocatorStatus.cc
new file mode 100644
index 0000000000000..75111bd8fa548
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/deviceAllocatorStatus.cc
@@ -0,0 +1,7 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"
+
+#include "getCachingDeviceAllocator.h"
+
+namespace cms::cuda {
+  allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); }
+}  // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
index 8158f414b07d4..3770dbac574d9 100644
--- a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
+++ b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
@@ -13,11 +13,11 @@ namespace cms::cuda::allocator {
   // Use caching or not
   constexpr bool useCaching = true;
   // Growth factor (bin_growth in cub::CachingDeviceAllocator)
-  constexpr unsigned int binGrowth = 8;
+  constexpr unsigned int binGrowth = 2;
   // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator)
-  constexpr unsigned int minBin = 1;
+  constexpr unsigned int minBin = 8;
   // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
-  constexpr unsigned int maxBin = 10;
+  constexpr unsigned int maxBin = 30;
   // Total storage for the allocator. 0 means no limit.
   constexpr size_t maxCachedBytes = 0;
   // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smaller of the two limits is used.