From 2c35bb2799b7f59a65842a2570a4ce438a9a7c43 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 5 Sep 2018 11:15:04 +0200 Subject: [PATCH 01/19] Add infrastructure around cub CachingDeviceAllocator for device memory allocations --- .../Common/interface/device_unique_ptr.h | 16 +++++ HeterogeneousCore/CUDAServices/BuildFile.xml | 2 + .../CUDAServices/interface/CUDAService.h | 49 +++++++++++++ .../CUDAServices/src/CUDAService.cc | 68 +++++++++++++++++++ .../CUDAServices/test/testCUDAService.cpp | 35 ++++++++++ 5 files changed, 170 insertions(+) create mode 100644 CUDADataFormats/Common/interface/device_unique_ptr.h diff --git a/CUDADataFormats/Common/interface/device_unique_ptr.h b/CUDADataFormats/Common/interface/device_unique_ptr.h new file mode 100644 index 0000000000000..1282c52125fa6 --- /dev/null +++ b/CUDADataFormats/Common/interface/device_unique_ptr.h @@ -0,0 +1,16 @@ +#ifndef CUDADataFormats_Common_interface_device_unique_ptr_h +#define CUDADataFormats_Common_interface_device_unique_ptr_h + +#include +#include + +namespace edm { + namespace cuda { + namespace device { + template + using unique_ptr = std::unique_ptr>; + } + } +} + +#endif diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml index c7232e16d910b..61572b96fb26e 100644 --- a/HeterogeneousCore/CUDAServices/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/BuildFile.xml @@ -2,8 +2,10 @@ + + diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index e359a5018813a..75bd950e74ead 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -4,14 +4,27 @@ #include #include +#include + #include "FWCore/Utilities/interface/StreamID.h" +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" + namespace edm { class ParameterSet; class ActivityRegistry; class ConfigurationDescriptions; } 
+namespace cudaserviceimpl { + template + struct make_unique_selector { using non_array = edm::cuda::device::unique_ptr; }; + template + struct make_unique_selector { using unbounded_array = edm::cuda::device::unique_ptr; }; + template + struct make_unique_selector { struct bounded_array {}; }; +} + /** * TODO: * - CUDA stream management? @@ -47,7 +60,43 @@ class CUDAService { // Get the current device int getCurrentDevice() const; + // Allocate device memory + template + typename cudaserviceimpl::make_unique_selector::non_array + make_unique(cuda::stream_t<>& stream) { + int dev = getCurrentDevice(); + void *mem = allocate(dev, sizeof(T), stream); + return typename cudaserviceimpl::make_unique_selector::non_array(reinterpret_cast(mem), + [this, dev](void *ptr) { + this->free(dev, ptr); + }); + } + + template + typename cudaserviceimpl::make_unique_selector::unbounded_array + make_unique(size_t n, cuda::stream_t<>& stream) { + int dev = getCurrentDevice(); + using element_type = typename std::remove_extent::type; + void *mem = allocate(dev, n*sizeof(element_type), stream); + return typename cudaserviceimpl::make_unique_selector::unbounded_array(reinterpret_cast(mem), + [this, dev](void *ptr) { + this->free(dev, ptr); + }); + } + + template + typename cudaserviceimpl::make_unique_selector::bounded_array + make_unique(Args&&...) 
= delete; + + // Free device memory (to be called from unique_ptr) + void free(int device, void *ptr); + private: + // PIMPL to hide details of allocator + struct Allocator; + std::unique_ptr allocator_; + void *allocate(int dev, size_t nbytes, cuda::stream_t<>& stream); + int numberOfDevices_ = 0; unsigned int numberOfStreamsTotal_ = 0; std::vector> computeCapabilities_; diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 84aebd58648f3..7757d584e1bfc 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -1,9 +1,12 @@ #include #include +#include #include #include +#include + #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" @@ -245,6 +248,40 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << '\n'; } + // create allocator + auto const& allocator = config.getUntrackedParameter("allocator"); + auto binGrowth = allocator.getUntrackedParameter("binGrowth"); + auto minBin = allocator.getUntrackedParameter("minBin"); + auto maxBin = allocator.getUntrackedParameter("maxBin"); + size_t maxCachedBytes = allocator.getUntrackedParameter("maxCachedBytes"); + auto maxCachedFraction = allocator.getUntrackedParameter("maxCachedFraction"); + + size_t minCachedBytes = std::numeric_limits::max(); + int currentDevice; + cudaCheck(cudaGetDevice(¤tDevice)); + for(int i=0; i(maxCachedFraction*freeMemory)); + } + if(maxCachedBytes > 0) { + minCachedBytes = std::min(minCachedBytes, maxCachedBytes); + } + log << "cub::CachingDeviceAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins\n"; + for(auto bin = minBin; bin <= maxBin; ++bin) { + log << " " << 
cub::CachingDeviceAllocator::IntPow(binGrowth, bin) << " B\n"; + } + log << " maximum amount of cached memory " << (minCachedBytes>>20) << " MB\n"; + + allocator_ = std::make_unique(cub::CachingDeviceAllocator::IntPow(binGrowth, maxBin), + binGrowth, minBin, maxBin, minCachedBytes); + log << "\n"; + log << "CUDAService fully initialized"; enabled_ = true; } @@ -275,6 +312,14 @@ void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions limits.addUntracked("cudaLimitDevRuntimePendingLaunchCount", -1)->setComment("Maximum number of outstanding device runtime launches that can be made from the current device."); desc.addUntracked("limits", limits)->setComment("See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps the default value."); + edm::ParameterSetDescription allocator; + allocator.addUntracked("binGrowth", 8)->setComment("Growth factor (bin_growth in cub::CachingDeviceAllocator"); + allocator.addUntracked("minBin", 5)->setComment("Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator"); + allocator.addUntracked("maxBin", 9)->setComment("Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail."); + allocator.addUntracked("maxCachedBytes", 0)->setComment("Total storage for the allocator. 0 means no limit."); + allocator.addUntracked("maxCachedFraction", 0.8)->setComment("Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. 
If maxCachedBytes is non-zero, the smallest of them is taken."); + desc.addUntracked("allocator", allocator)->setComment("See the documentation of cub::CachingDeviceAllocator for more details."); + descriptions.add("CUDAService", desc); } @@ -312,3 +357,26 @@ void CUDAService::setCurrentDevice(int device) const { int CUDAService::getCurrentDevice() const { return cuda::device::current::get().id(); } + + +// allocator +struct CUDAService::Allocator { + template + Allocator(size_t max, Args&&... args): maxAllocation(max), allocator(std::forward(args)...) {} + size_t maxAllocation; + cub::CachingDeviceAllocator allocator; +}; + +void *CUDAService::allocate(int dev, size_t nbytes, cuda::stream_t<>& stream) { + if(nbytes > allocator_->maxAllocation) { + throw std::runtime_error("Tried to allocate "+std::to_string(nbytes)+" bytes, but the allocator maximum is "+std::to_string(allocator_->maxAllocation)); + } + + void *ptr = nullptr; + cuda::throw_if_error(allocator_->allocator.DeviceAllocate(dev, &ptr, nbytes, stream.id())); + return ptr; +} + +void CUDAService::free(int device, void *ptr) { + allocator_->allocator.DeviceFree(device, ptr); +} diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index dc1b01c2db9fe..c423001b988de 100644 --- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -5,6 +5,7 @@ #include #include +#include #include "catch.hpp" @@ -169,6 +170,40 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { } + SECTION("Allocator") { + edm::ParameterSet ps; + ps.addUntrackedParameter("enabled", true); + edm::ParameterSet alloc; + alloc.addUntrackedParameter("minBin", 1U); + alloc.addUntrackedParameter("maxBin", 3U); + ps.addUntrackedParameter("allocator", alloc); + auto cs = makeCUDAService(ps, ar); + cs.setCurrentDevice(0); + auto current_device = cuda::device::current::get(); + auto cudaStream = 
current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + SECTION("Destructor") { + auto ptr = cs.make_unique(cudaStream); + REQUIRE(ptr.get() != nullptr); + cudaStream.synchronize(); + } + + SECTION("Reset") { + auto ptr = cs.make_unique(5, cudaStream); + REQUIRE(ptr.get() != nullptr); + cudaStream.synchronize(); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); + } + + SECTION("Allocating too much") { + auto ptr = cs.make_unique(512, cudaStream); + ptr.reset(); + REQUIRE_THROWS(ptr = cs.make_unique(513, cudaStream)); + } + } + //Fake the end-of-job signal. ar.postEndJobSignal_(); } From 2c515f99c954104f7e3a5dcf8ad49c4218eee689 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 21 Sep 2018 23:17:58 +0200 Subject: [PATCH 02/19] Migrate raw2cluster data products to use the cub allocator --- CUDADataFormats/SiPixelCluster/BuildFile.xml | 8 ++ .../interface/SiPixelClustersCUDA.h | 57 +++++++++++++ .../SiPixelCluster/src/SiPixelClustersCUDA.cc | 14 ++++ CUDADataFormats/SiPixelDigi/BuildFile.xml | 7 ++ .../SiPixelDigi/interface/SiPixelDigisCUDA.h | 50 ++++++++++++ .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 13 +++ .../SiPixelClusterizer/plugins/BuildFile.xml | 2 + .../plugins/SiPixelRawToClusterGPUKernel.cu | 80 +++++++------------ .../plugins/SiPixelRawToClusterGPUKernel.h | 27 ++----- .../SiPixelRawToClusterHeterogeneous.cc | 1 - .../siPixelRawToClusterHeterogeneousProduct.h | 37 +++++---- .../SiPixelRecHits/plugins/PixelRecHits.cu | 18 ++--- .../plugins/ClusterSLOnGPU.cu | 13 +-- 13 files changed, 224 insertions(+), 103 deletions(-) create mode 100644 CUDADataFormats/SiPixelCluster/BuildFile.xml create mode 100644 CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h create mode 100644 CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc create mode 100644 CUDADataFormats/SiPixelDigi/BuildFile.xml create mode 100644 CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h create mode 100644 
CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml new file mode 100644 index 0000000000000..21c527e7b2f0d --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h new file mode 100644 index 0000000000000..07e39932a1de0 --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -0,0 +1,57 @@ +#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h +#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h + +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" + +#include + +class SiPixelClustersCUDA { +public: + SiPixelClustersCUDA() = default; + explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream); + ~SiPixelClustersCUDA() = default; + + SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete; + SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete; + SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default; + SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default; + + uint32_t * __restrict__ moduleStart() { return moduleStart_d.get(); } + int32_t * __restrict__ clus() { return clus_d.get(); } + uint32_t * __restrict__ clusInModule() { return clusInModule_d.get(); } + uint32_t * __restrict__ moduleId() { return moduleId_d.get(); } + uint32_t * __restrict__ clusModuleStart() { return clusModuleStart_d.get(); } + + uint32_t const * __restrict__ moduleStart() const { return moduleStart_d.get(); } + int32_t const * __restrict__ clus() const { return clus_d.get(); } + uint32_t const * __restrict__ clusInModule() const { return clusInModule_d.get(); } + uint32_t const * __restrict__ moduleId() const { return moduleId_d.get(); } + uint32_t const * __restrict__ 
clusModuleStart() const { return clusModuleStart_d.get(); } + + uint32_t const * __restrict__ c_moduleStart() const { return moduleStart_d.get(); } + int32_t const * __restrict__ c_clus() const { return clus_d.get(); } + uint32_t const * __restrict__ c_clusInModule() const { return clusInModule_d.get(); } + uint32_t const * __restrict__ c_moduleId() const { return moduleId_d.get(); } + uint32_t const * __restrict__ c_clusModuleStart() const { return clusModuleStart_d.get(); } + + struct DeviceConstView { + uint32_t const *moduleStart; + int32_t const *clus; + uint32_t const *clusInModule; + uint32_t const *moduleId; + uint32_t const *clusModuleStart; + }; + + DeviceConstView view() const { return DeviceConstView{moduleStart_d.get(), clus_d.get(), clusInModule_d.get(), moduleId_d.get(), clusModuleStart_d.get()}; } + +private: + edm::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module + edm::cuda::device::unique_ptr clus_d; // cluster id of each pixel + edm::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module + edm::cuda::device::unique_ptr moduleId_d; // module id of each module + + // originally from rechits + edm::cuda::device::unique_ptr clusModuleStart_d; +}; + +#endif diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc new file mode 100644 index 0000000000000..55a84c6e09275 --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -0,0 +1,14 @@ +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) { + edm::Service cs; + + moduleStart_d = cs->make_unique(nelements+1, stream); + clus_d = cs->make_unique< int32_t[]>(feds, stream); + clusInModule_d = 
cs->make_unique(nelements, stream); + moduleId_d = cs->make_unique(nelements, stream); + clusModuleStart_d = cs->make_unique(nelements+1, stream); +} diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml new file mode 100644 index 0000000000000..259aa9f08d054 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h new file mode 100644 index 0000000000000..bd0c2eb5329c2 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -0,0 +1,50 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h + +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" + +#include + +class SiPixelDigisCUDA { +public: + SiPixelDigisCUDA() = default; + explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream); + ~SiPixelDigisCUDA() = default; + + SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete; + SiPixelDigisCUDA& operator=(const SiPixelDigisCUDA&) = delete; + SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default; + SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default; + + uint16_t * __restrict__ xx() { return xx_d.get(); } + uint16_t * __restrict__ yy() { return yy_d.get(); } + uint16_t * __restrict__ adc() { return adc_d.get(); } + uint16_t * __restrict__ moduleInd() { return moduleInd_d.get(); } + + uint16_t const * __restrict__ xx() const { return xx_d.get(); } + uint16_t const * __restrict__ yy() const { return yy_d.get(); } + uint16_t const * __restrict__ adc() const { return adc_d.get(); } + uint16_t const * __restrict__ moduleInd() const { return moduleInd_d.get(); } + + uint16_t const * __restrict__ c_xx() const { return xx_d.get(); } + uint16_t const * __restrict__ c_yy() const { return yy_d.get(); } + uint16_t const * __restrict__ c_adc() const { 
return adc_d.get(); } + uint16_t const * __restrict__ c_moduleInd() const { return moduleInd_d.get(); } + + struct DeviceConstView { + uint16_t const * xx; + uint16_t const * yy; + uint16_t const * adc; + uint16_t const * moduleInd; + }; + + DeviceConstView view() const { return DeviceConstView{xx_d.get(), yy_d.get(), adc_d.get(), moduleInd_d.get()}; } + +private: + edm::cuda::device::unique_ptr xx_d; // local coordinates of each pixel + edm::cuda::device::unique_ptr yy_d; // + edm::cuda::device::unique_ptr adc_d; // ADC of each pixel + edm::cuda::device::unique_ptr moduleInd_d; // module id of each pixel +}; + +#endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc new file mode 100644 index 0000000000000..a2999c97e5f47 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -0,0 +1,13 @@ +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) { + edm::Service cs; + + xx_d = cs->make_unique(nelements, stream); + yy_d = cs->make_unique(nelements, stream); + adc_d = cs->make_unique(nelements, stream); + moduleInd_d = cs->make_unique(nelements, stream); +} diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml index 9db4a46f367b3..40a489f763397 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml @@ -7,6 +7,8 @@ + + diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 7bd6eac473cc7..409c8f5d3ec75 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ 
b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -50,7 +50,6 @@ namespace pixelgpudetails { constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; constexpr uint32_t MAX_WORD08_SIZE = MAX_FED_WORDS * sizeof(uint8_t); constexpr uint32_t MAX_WORD32_SIZE = MAX_FED_WORDS * sizeof(uint32_t); - constexpr uint32_t MAX_WORD16_SIZE = MAX_FED_WORDS * sizeof(uint16_t); constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize; SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { @@ -72,23 +71,13 @@ namespace pixelgpudetails { cudaCheck(cudaMalloc((void**) & word_d, MAX_WORD32_SIZE)); cudaCheck(cudaMalloc((void**) & fedId_d, MAX_WORD08_SIZE)); cudaCheck(cudaMalloc((void**) & pdigi_d, MAX_WORD32_SIZE)); // to store thepacked digi - cudaCheck(cudaMalloc((void**) & xx_d, MAX_WORD16_SIZE)); // to store the x and y coordinate - cudaCheck(cudaMalloc((void**) & yy_d, MAX_WORD16_SIZE)); - cudaCheck(cudaMalloc((void**) & adc_d, MAX_WORD16_SIZE)); - cudaCheck(cudaMalloc((void**) & moduleInd_d, MAX_WORD16_SIZE)); cudaCheck(cudaMalloc((void**) & rawIdArr_d, MAX_WORD32_SIZE)); cudaCheck(cudaMalloc((void**) & error_d, vsize)); cudaCheck(cudaMalloc((void**) & data_d, MAX_ERROR_SIZE)); cudaCheck(cudaMemset(data_d, 0x00, MAX_ERROR_SIZE)); - // for the clusterizer - cudaCheck(cudaMalloc((void**) & clus_d, MAX_WORD32_SIZE)); // cluser index in module - using namespace gpuClustering; - cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); - cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) )); - cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) )); new (error_h) GPU::SimpleVector(MAX_FED_WORDS, data_h); new (error_h_tmp) GPU::SimpleVector(MAX_FED_WORDS, data_d); @@ -101,14 +90,7 @@ namespace pixelgpudetails { cudaCheck(cudaMallocHost(&nModulesActive, sizeof(uint32_t))); cudaCheck(cudaMallocHost(&nClusters, 
sizeof(uint32_t))); - cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct))); - gpuProduct = getProduct(); - assert(xx_d==gpuProduct.xx_d); - - cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id())); - // originally from rechits - cudaCheck(cudaMalloc((void**) & clusModuleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); uint32_t *tmp = nullptr; cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize)); @@ -133,24 +115,12 @@ namespace pixelgpudetails { cudaCheck(cudaFree(word_d)); cudaCheck(cudaFree(fedId_d)); cudaCheck(cudaFree(pdigi_d)); - cudaCheck(cudaFree(xx_d)); - cudaCheck(cudaFree(yy_d)); - cudaCheck(cudaFree(adc_d)); - cudaCheck(cudaFree(moduleInd_d)); cudaCheck(cudaFree(rawIdArr_d)); cudaCheck(cudaFree(error_d)); cudaCheck(cudaFree(data_d)); - // these are for the clusterizer - cudaCheck(cudaFree(moduleStart_d)); - cudaCheck(cudaFree(clus_d)); - cudaCheck(cudaFree(clusInModule_d)); - cudaCheck(cudaFree(moduleId_d)); - cudaCheck(cudaFree(gpuProduct_d)); - // originally from rechits cudaCheck(cudaFree(tempScanStorage_d)); - cudaCheck(cudaFree(clusModuleStart_d)); } void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { @@ -620,6 +590,10 @@ namespace pixelgpudetails { { nDigis = wordCounter; + constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; + digis_d = SiPixelDigisCUDA(MAX_FED_WORDS, stream); + clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream); + const int threadsPerBlock = 512; const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all @@ -636,10 +610,10 @@ namespace pixelgpudetails { wordCounter, word_d, fedId_d, - xx_d, yy_d, adc_d, + digis_d.xx(), digis_d.yy(), digis_d.adc(), pdigi_d, rawIdArr_d, - 
moduleInd_d, + digis_d.moduleInd(), error_d, useQualityInfo, includeErrors, @@ -677,15 +651,15 @@ namespace pixelgpudetails { int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; gpuCalibPixel::calibDigis<<>>( - moduleInd_d, - xx_d, yy_d, adc_d, + digis_d.moduleInd(), + digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(), gains, wordCounter); cudaCheck(cudaGetLastError()); // calibrated adc if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(adc_h, digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); } #ifdef GPU_DEBUG @@ -694,13 +668,13 @@ namespace pixelgpudetails { << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id())); - countModules<<>>(moduleInd_d, moduleStart_d, clus_d, wordCounter); + countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync(nModulesActive, moduleStart_d, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(nModulesActive, clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); threadsPerBlock = 256; blocks = MaxNumModules; @@ -708,23 +682,23 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.clusInModule(), 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( - moduleInd_d, - xx_d, yy_d, - moduleStart_d, - clusInModule_d, moduleId_d, - clus_d, + digis_d.c_moduleInd(), + 
digis_d.c_xx(), digis_d.c_yy(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), clusters_d.moduleId(), + clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // apply charge cut clusterChargeCut<<>>( - moduleInd_d, - adc_d, - moduleStart_d, - clusInModule_d, moduleId_d, - clus_d, + digis_d.moduleInd(), + digis_d.c_adc(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), clusters_d.c_moduleId(), + clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -736,18 +710,18 @@ namespace pixelgpudetails { // synchronization/ExternalWork // // Set first the first element to 0 - cudaCheck(cudaMemsetAsync(clusModuleStart_d, 0, sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id())); // Then use inclusive_scan to get the partial sum to the rest cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize, - clusInModule_d, &clusModuleStart_d[1], gpuClustering::MaxNumModules, + clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, stream.id())); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(nClusters, clusModuleStart_d+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(nClusters, clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); // clusters if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(clus_h, clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); } } // end clusterizer scope } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index ca8bd73106c2c..87682f728bc4f 100644 --- 
a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -176,15 +176,13 @@ namespace pixelgpudetails { bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug, cuda::stream_t<>& stream); - auto getProduct() { + siPixelRawToClusterHeterogeneousProduct::GPUProduct getProduct() { error_h->set_data(data_h); - return siPixelRawToClusterHeterogeneousProduct::GPUProduct{ + return siPixelRawToClusterHeterogeneousProduct::GPUProduct( pdigi_h, rawIdArr_h, clus_h, adc_h, error_h, - gpuProduct_d, - xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d, - clusModuleStart_d, + std::move(digis_d), std::move(clusters_d), nDigis, *nModulesActive, *nClusters - }; + ); } private: @@ -192,10 +190,6 @@ namespace pixelgpudetails { unsigned int *word = nullptr; // to hold input for rawtodigi unsigned char *fedId_h = nullptr; // to hold fed index for each word - // output - GPUProduct gpuProduct; - GPUProduct * gpuProduct_d; - // FIXME cleanup all these are in the gpuProduct above... 
uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output @@ -212,23 +206,16 @@ namespace pixelgpudetails { uint32_t * word_d; uint8_t * fedId_d; uint32_t * pdigi_d; - uint16_t * xx_d; - uint16_t * yy_d; - uint16_t * adc_d; - uint16_t * moduleInd_d; uint32_t * rawIdArr_d; GPU::SimpleVector * error_d; error_obj * data_d; - // these are for the clusterizer (to be moved) - uint32_t * moduleStart_d; - int32_t * clus_d; - uint32_t * clusInModule_d; - uint32_t * moduleId_d; + // Data to be put in the event + SiPixelDigisCUDA digis_d; + SiPixelClustersCUDA clusters_d; // originally in rechit, moved here - uint32_t *clusModuleStart_d = nullptr; void *tempScanStorage_d = nullptr; size_t tempScanStorageSize = 0; }; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 384a4732b32e1..bb920f3b45bb4 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -555,7 +555,6 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { auto output = std::make_unique(gpuAlgo_->getProduct()); - assert(output->me_d); if(enableConversion_) { convertGPUtoCPU(ev.event(), *output); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h index e75afd3481b25..3298249c194b8 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h @@ -1,6 +1,8 @@ #ifndef 
EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h #define EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" @@ -25,6 +27,25 @@ namespace siPixelRawToClusterHeterogeneousProduct { // FIXME split in two struct GPUProduct { + GPUProduct() = default; + GPUProduct(const GPUProduct&) = delete; + GPUProduct& operator=(const GPUProduct&) = delete; + GPUProduct(GPUProduct&&) = default; + GPUProduct& operator=(GPUProduct&&) = default; + + GPUProduct(uint32_t const *pdigi, + uint32_t const *rawIdArr, + int32_t const *clus, + uint16_t const *adc, + GPU::SimpleVector const * error, + SiPixelDigisCUDA&& digis, + SiPixelClustersCUDA&& clusters, + uint32_t ndig, uint32_t nmod, uint32_t nclus): + pdigi_h(pdigi), rawIdArr_h(rawIdArr), clus_h(clus), adc_h(adc), error_h(error), + digis_d(std::move(digis)), clusters_d(std::move(clusters)), + nDigis(ndig), nModules(nmod), nClusters(nclus) + {} + // Needed for digi and cluster CPU output uint32_t const * pdigi_h = nullptr; uint32_t const * rawIdArr_h = nullptr; @@ -32,20 +53,8 @@ namespace siPixelRawToClusterHeterogeneousProduct { uint16_t const * adc_h = nullptr; GPU::SimpleVector const * error_h = nullptr; - GPUProduct const * me_d = nullptr; - - // Needed for GPU rechits - uint16_t const * xx_d; - uint16_t const * yy_d; - uint16_t const * adc_d; - uint16_t const * moduleInd_d; - uint32_t const * moduleStart_d; - int32_t const * clus_d; - uint32_t const * clusInModule_d; - uint32_t const * moduleId_d; - - // originally from rechits - uint32_t const * clusModuleStart_d; + SiPixelDigisCUDA digis_d; + SiPixelClustersCUDA clusters_d; uint32_t nDigis; uint32_t nModules; 
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index c63466f157a1b..333898eaa601a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -127,10 +127,10 @@ namespace pixelgpudetails { pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, cuda::stream_t<>& stream) { - cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); - gpu_.hitsModuleStart_d = input.clusModuleStart_d; - gpu_.cpeParams = cpeParams; // copy it for use in clients - cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); + gpu_.hitsModuleStart_d = input.clusters_d.clusModuleStart(); + gpu_.cpeParams = cpeParams; // copy it for use in clients + cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); int threadsPerBlock = 256; int blocks = input.nModules; // active modules (with digis) @@ -141,11 +141,11 @@ namespace pixelgpudetails { gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, - input.moduleInd_d, - input.xx_d, input.yy_d, input.adc_d, - input.moduleStart_d, - input.clusInModule_d, input.moduleId_d, - input.clus_d, + input.digis_d.moduleInd(), + input.digis_d.xx(), input.digis_d.yy(), input.digis_d.adc(), + input.clusters_d.moduleStart(), + input.clusters_d.clusInModule(), input.clusters_d.moduleId(), + input.clusters_d.clus(), input.nDigis, gpu_.hitsModuleStart_d, gpu_.charge_d, diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu index 83a7e63e628a0..42e9ec8789d67 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -3,6 +3,8 @@ #include #include 
"HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -12,14 +14,13 @@ using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; __global__ -void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) +void simLink(SiPixelDigisCUDA::DeviceConstView dd, uint32_t ndigis, SiPixelClustersCUDA::DeviceConstView cc, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) { assert(slp == slp->me_d); constexpr int32_t invTK = 0; // std::numeric_limits::max(); constexpr uint16_t InvId = 9999; // must be > MaxNumModules - auto const & dd = *ddp; auto const & hh = *hhp; auto const & sl = *slp; auto i = blockIdx.x * blockDim.x + threadIdx.x; @@ -27,14 +28,14 @@ void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLO if (i >= ndigis) return; - auto id = dd.moduleInd_d[i]; + auto id = dd.moduleInd[i]; if (InvId == id) return; assert(id < 2000); - auto ch = pixelgpudetails::pixelToChannel(dd.xx_d[i], dd.yy_d[i]); + auto ch = pixelgpudetails::pixelToChannel(dd.xx[i], dd.yy[i]); auto first = hh.hitsModuleStart_d[id]; - auto cl = first + dd.clus_d[i]; + auto cl = first + cc.clus[i]; assert(cl < 2000 * blockDim.x); const std::array me{{id, ch, 0, 0}}; @@ -176,7 +177,7 @@ namespace clusterSLOnGPU { blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock; assert(sl.me_d); - simLink<<>>(dd.me_d, ndigis, hh.gpu_d, sl.me_d, n); + simLink<<>>(dd.digis_d.view(), ndigis, dd.clusters_d.view(), hh.gpu_d, sl.me_d, n); cudaCheck(cudaGetLastError()); if (doDump) { From 
e30b0f2371ad8729b3879365c574f428109f8c21 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 21 Sep 2018 00:03:21 +0200 Subject: [PATCH 03/19] Migrate raw2cluster temporary buffers to use the cub allocator --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 151 +++++++++--------- .../plugins/SiPixelRawToClusterGPUKernel.h | 13 -- 2 files changed, 73 insertions(+), 91 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 409c8f5d3ec75..54cde7461f9d7 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -31,6 +31,8 @@ #include // CMSSW includes +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" @@ -48,8 +50,6 @@ namespace pixelgpudetails { // number of words for all the FEDs constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - constexpr uint32_t MAX_WORD08_SIZE = MAX_FED_WORDS * sizeof(uint8_t); - constexpr uint32_t MAX_WORD32_SIZE = MAX_FED_WORDS * sizeof(uint32_t); constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize; SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { @@ -68,32 +68,15 @@ namespace pixelgpudetails { cudaCheck(cudaMallocHost(&error_h_tmp, vsize)); cudaCheck(cudaMallocHost(&data_h, MAX_ERROR_SIZE)); - cudaCheck(cudaMalloc((void**) & word_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & fedId_d, MAX_WORD08_SIZE)); - cudaCheck(cudaMalloc((void**) & pdigi_d, MAX_WORD32_SIZE)); // to store thepacked digi - - cudaCheck(cudaMalloc((void**) & 
rawIdArr_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & error_d, vsize)); - cudaCheck(cudaMalloc((void**) & data_d, MAX_ERROR_SIZE)); - cudaCheck(cudaMemset(data_d, 0x00, MAX_ERROR_SIZE)); - using namespace gpuClustering; new (error_h) GPU::SimpleVector(MAX_FED_WORDS, data_h); - new (error_h_tmp) GPU::SimpleVector(MAX_FED_WORDS, data_d); assert(error_h->size() == 0); assert(error_h->capacity() == static_cast(MAX_FED_WORDS)); - assert(error_h_tmp->size() == 0); - assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); // Need these in pinned memory to be truly asynchronous cudaCheck(cudaMallocHost(&nModulesActive, sizeof(uint32_t))); cudaCheck(cudaMallocHost(&nClusters, sizeof(uint32_t))); - - // originally from rechits - uint32_t *tmp = nullptr; - cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); - cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize)); } SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { @@ -109,18 +92,6 @@ namespace pixelgpudetails { cudaCheck(cudaFreeHost(data_h)); cudaCheck(cudaFreeHost(nModulesActive)); cudaCheck(cudaFreeHost(nClusters)); - - // free device memory used for RawToDigi on GPU - // free the GPU memory - cudaCheck(cudaFree(word_d)); - cudaCheck(cudaFree(fedId_d)); - cudaCheck(cudaFree(pdigi_d)); - cudaCheck(cudaFree(rawIdArr_d)); - cudaCheck(cudaFree(error_d)); - cudaCheck(cudaFree(data_d)); - - // originally from rechits - cudaCheck(cudaFree(tempScanStorage_d)); } void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { @@ -594,52 +565,69 @@ namespace pixelgpudetails { digis_d = SiPixelDigisCUDA(MAX_FED_WORDS, stream); clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream); - const int threadsPerBlock = 512; - const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all - - assert(0 == wordCounter%2); - // 
wordCounter is the total no of words in each event to be trasfered on device - cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(&fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(error_d, error_h_tmp, vsize, cudaMemcpyDefault, stream.id())); - - // Launch rawToDigi kernel - RawToDigi_kernel<<>>( - cablingMap, - modToUnp, - wordCounter, - word_d, - fedId_d, - digis_d.xx(), digis_d.yy(), digis_d.adc(), - pdigi_d, - rawIdArr_d, - digis_d.moduleInd(), - error_d, - useQualityInfo, - includeErrors, - debug); - cudaCheck(cudaGetLastError()); - - // copy data to host variable - if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - if (includeErrors) { - cudaCheck(cudaMemcpyAsync(error_h, error_d, vsize, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(data_h, data_d, MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); - // If we want to transfer only the minimal amount of data, we - // need a synchronization point. A single ExternalWork (of - // SiPixelRawToClusterHeterogeneous) does not help because it is - // already used to synchronize the data movement. So we'd need - // two ExternalWorks (or explicit use of TBB tasks). The - // prototype of #100 would allow this easily (as there would be - // two ExternalWorks). 
- // - //error_h->set_data(data_h); - //cudaCheck(cudaStreamSynchronize(stream.id())); - //int size = error_h->size(); - //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id())); + edm::Service cs; + + { + const int threadsPerBlock = 512; + const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all + + assert(0 == wordCounter%2); + // wordCounter is the total no of words in each event to be trasfered on device + auto word_d = cs->make_unique(wordCounter, stream); + auto fedId_d = cs->make_unique(wordCounter, stream); + + auto error_d = cs->make_unique>(stream); + auto data_d = cs->make_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id())); + new (error_h_tmp) GPU::SimpleVector(MAX_FED_WORDS, data_d.get()); + assert(error_h_tmp->size() == 0); + assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); + + cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp, vsize, cudaMemcpyDefault, stream.id())); + + auto pdigi_d = cs->make_unique(wordCounter, stream); + auto rawIdArr_d = cs->make_unique(wordCounter, stream); + + // Launch rawToDigi kernel + RawToDigi_kernel<<>>( + cablingMap, + modToUnp, + wordCounter, + word_d.get(), + fedId_d.get(), + digis_d.xx(), digis_d.yy(), digis_d.adc(), + pdigi_d.get(), + rawIdArr_d.get(), + digis_d.moduleInd(), + error_d.get(), + useQualityInfo, + includeErrors, + debug); + cudaCheck(cudaGetLastError()); + + // copy data to host variable + if(transferToCPU) { + cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, 
stream.id())); + + if (includeErrors) { + cudaCheck(cudaMemcpyAsync(error_h, error_d.get(), vsize, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(data_h, data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); + // If we want to transfer only the minimal amount of data, we + // need a synchronization point. A single ExternalWork (of + // SiPixelRawToClusterHeterogeneous) does not help because it is + // already used to synchronize the data movement. So we'd need + // two ExternalWorks (or explicit use of TBB tasks). The + // prototype of #100 would allow this easily (as there would be + // two ExternalWorks). + // + //error_h->set_data(data_h); + //cudaCheck(cudaStreamSynchronize(stream.id())); + //int size = error_h->size(); + //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id())); + } } } // End of Raw2Digi and passing data for cluserisation @@ -709,10 +697,17 @@ namespace pixelgpudetails { // available in the rechit producer without additional points of // synchronization/ExternalWork // + // Temporary storage + size_t tempScanStorageSize = 0; + { + uint32_t *tmp = nullptr; + cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); + } + auto tempScanStorage_d = cs->make_unique(tempScanStorageSize, stream); // Set first the first element to 0 cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id())); // Then use inclusive_scan to get the partial sum to the rest - cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize, + cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d.get(), tempScanStorageSize, clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, stream.id())); // last element holds the number of all clusters diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h 
b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 87682f728bc4f..510898f751e2c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -202,22 +202,9 @@ namespace pixelgpudetails { uint32_t *nModulesActive = nullptr; uint32_t *nClusters = nullptr; - // scratch memory buffers - uint32_t * word_d; - uint8_t * fedId_d; - uint32_t * pdigi_d; - uint32_t * rawIdArr_d; - - GPU::SimpleVector * error_d; - error_obj * data_d; - // Data to be put in the event SiPixelDigisCUDA digis_d; SiPixelClustersCUDA clusters_d; - - // originally in rechit, moved here - void *tempScanStorage_d = nullptr; - size_t tempScanStorageSize = 0; }; // configuration and memory buffers alocated on the GPU From 1bb5d59eed2ab9b7625b823dc37248015dacc601 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 25 Sep 2018 22:16:58 +0200 Subject: [PATCH 04/19] Really release the cached memory --- HeterogeneousCore/CUDAServices/src/CUDAService.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 7757d584e1bfc..967aa503e6c08 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -288,6 +288,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& CUDAService::~CUDAService() { if (enabled_) { + // Explicitly destruct the allocator before the device resets below + allocator_.reset(); + for (int i = 0; i < numberOfDevices_; ++i) { cudaCheck(cudaSetDevice(i)); cudaCheck(cudaDeviceSynchronize()); From 8d0d33079c3522ad398743c30bb5cae4e42a0a3b Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 19 Oct 2018 16:59:59 +0200 Subject: [PATCH 05/19] Add CachingHostAllocator --- .../CUDAServices/src/CachingHostAllocator.h | 635 ++++++++++++++++++ 1 file 
changed, 635 insertions(+) create mode 100644 HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h diff --git a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h new file mode 100644 index 0000000000000..52cf20eefd47d --- /dev/null +++ b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h @@ -0,0 +1,635 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * Modified to cache pinned host allocations by Matti Kortelainen + */ + +/****************************************************************************** + * Simple caching allocator for pinned host memory allocations. The allocator is + * thread-safe. + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/****************************************************************************** + * CachingHostAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for pinned host memory allocations. + * + * \par Overview + * The allocator is thread-safe. It behaves as follows: + * + * I presume the CUDA stream-safeness is not useful as to read/write + * from/to the pinned host memory one needs to synchronize anyway. The + * difference wrt. device memory is that in the CPU all operations to + * the device memory are scheduled via the CUDA stream, while for the + * host memory one can perform operations directly. + * + * \par + * - Allocations are categorized and cached by bin size.
A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused host allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations will exceed + * \p max_cached_bytes, allocations are simply freed when they are + * deallocated instead of being returned to their bin-cache. + * + * \par + * For example, the default-constructed CachingHostAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes + * + */ +struct CachingHostAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for pinned host memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Host 
pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer) + BlockDescriptor(void *d_ptr) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN), + device(INVALID_DEVICE_ORDINAL), + associated_stream(0), + ready_event(0) + {} + + // Constructor (suitable for searching maps for a range of suitable blocks) + BlockDescriptor() : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN), + device(INVALID_DEVICE_ORDINAL), + associated_stream(0), + ready_event(0) + {} + + // Comparison functor for comparing host pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.d_ptr < b.d_ptr); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.bytes < b.bytes); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } 
+ base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + cub::Mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + TotalBytes cached_bytes; /// Aggregate cached bytes + CachedBlocks cached_blocks; /// Set of cached pinned host allocations available for reuse + BusyBlocks live_blocks; /// Set of live pinned host allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. 
+ */ + CachingHostAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stdout output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes + */ + CachingHostAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number of bytes this allocator is allowed to cache + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). 
+ */ + void SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.Lock(); + + if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.Unlock(); + } + + + /** + * \brief Provides a suitable allocation of pinned host memory for the given size. + * + * Once freed, the allocation becomes available immediately for reuse. + */ + cudaError_t HostAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (CubDebug(error = cudaGetDevice(&device))) return error; + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key; + search_key.device = device; + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. 
+ search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.Lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use for transfers, only consider cached blocks that are from an idle stream + if(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) { + // Reuse existing cache block. Insert into live blocks. + found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes.free -= search_key.bytes; + cached_bytes.live += search_key.bytes; + + if (debug) _CubLog("\tHost reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.Unlock(); + } + + // Allocate the block if necessary + if (!found) + { + // Attempt to allocate + // TODO: eventually support allocation flags + if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == cudaErrorMemoryAllocation) + { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) _CubLog("\tHost failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + (long long) search_key.bytes, (long long) search_key.associated_stream); + + error = cudaSuccess; // 
Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.Lock(); + + // Iterate the range of free blocks + CachedBlocks::iterator block_itr = cached_blocks.begin(); + + while ((block_itr != cached_blocks.end())) + { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free pinned host memory. + if (CubDebug(error = cudaFreeHost(block_itr->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes.free -= block_itr->bytes; + + if (debug) _CubLog("\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.Unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault))) return error; + } + + // Create ready event + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.Lock(); + live_blocks.insert(search_key); + cached_bytes.live += search_key.bytes; + mutex.Unlock(); + + if (debug) _CubLog("\tHost allocated new device block at %p (%lld bytes associated with stream %lld).\n", + search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + } + + // Copy host pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) 
cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + + return error; + } + + + /** + * \brief Frees a live allocation of pinned host memory, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse. + */ + cudaError_t HostFree( + void* d_ptr) + { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + // Lock + mutex.Lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes.live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes.free += search_key.bytes; + + if (debug) _CubLog("\tHost returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. 
(%lld bytes)\n", + (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + } + } + + // Unlock + mutex.Unlock(); + + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (entrypoint_device != search_key.device) { + if (CubDebug(error = cudaSetDevice(search_key.device))) return error; + } + + if (recached) { + // Insert the ready event in the associated stream (must have current device set properly) + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + } + else + { + // Free the allocation from the runtime and cleanup the event. + if (CubDebug(error = cudaFreeHost(d_ptr))) return error; + if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + + if (debug) _CubLog("\tHost freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Frees all cached pinned host allocations + */ + cudaError_t FreeAllCached() + { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.Lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = 
cudaGetDevice(&entrypoint_device))) break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) + { + if (CubDebug(error = cudaSetDevice(begin->device))) break; + current_device = begin->device; + } + + // Free host memory + if (CubDebug(error = cudaFreeHost(begin->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes.free -= begin->bytes; + + if (debug) _CubLog("\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + + cached_blocks.erase(begin); + } + + mutex.Unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Destructor + */ + ~CachingHostAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) From 20f125d7b3265d974f37cba2a6828e7e7dc814eb Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 19 Oct 2018 19:37:13 +0200 Subject: [PATCH 06/19] Rename device unique_ptrs to prepare for host unique_ptrs --- .../SiPixelCluster/src/SiPixelClustersCUDA.cc | 10 ++--- .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 8 ++-- .../CUDAServices/interface/CUDAService.h | 42 +++++++++---------- .../CUDAServices/src/CUDAService.cc | 4 +- .../CUDAServices/test/testCUDAService.cpp | 10 ++--- .../plugins/SiPixelRawToClusterGPUKernel.cu | 14 +++---- 6 files changed, 44 insertions(+), 44 deletions(-) diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc index 
55a84c6e09275..20d66f9025a5f 100644 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -6,9 +6,9 @@ SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) { edm::Service cs; - moduleStart_d = cs->make_unique(nelements+1, stream); - clus_d = cs->make_unique< int32_t[]>(feds, stream); - clusInModule_d = cs->make_unique(nelements, stream); - moduleId_d = cs->make_unique(nelements, stream); - clusModuleStart_d = cs->make_unique(nelements+1, stream); + moduleStart_d = cs->make_device_unique(nelements+1, stream); + clus_d = cs->make_device_unique< int32_t[]>(feds, stream); + clusInModule_d = cs->make_device_unique(nelements, stream); + moduleId_d = cs->make_device_unique(nelements, stream); + clusModuleStart_d = cs->make_device_unique(nelements+1, stream); } diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index a2999c97e5f47..87613c6447b24 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -6,8 +6,8 @@ SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) { edm::Service cs; - xx_d = cs->make_unique(nelements, stream); - yy_d = cs->make_unique(nelements, stream); - adc_d = cs->make_unique(nelements, stream); - moduleInd_d = cs->make_unique(nelements, stream); + xx_d = cs->make_device_unique(nelements, stream); + yy_d = cs->make_device_unique(nelements, stream); + adc_d = cs->make_device_unique(nelements, stream); + moduleInd_d = cs->make_device_unique(nelements, stream); } diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index 75bd950e74ead..a41766850fbd6 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -18,11 +18,11 @@ 
namespace edm { namespace cudaserviceimpl { template - struct make_unique_selector { using non_array = edm::cuda::device::unique_ptr; }; + struct make_device_unique_selector { using non_array = edm::cuda::device::unique_ptr; }; template - struct make_unique_selector { using unbounded_array = edm::cuda::device::unique_ptr; }; + struct make_device_unique_selector { using unbounded_array = edm::cuda::device::unique_ptr; }; template - struct make_unique_selector { struct bounded_array {}; }; + struct make_device_unique_selector { struct bounded_array {}; }; } /** @@ -62,40 +62,40 @@ class CUDAService { // Allocate device memory template - typename cudaserviceimpl::make_unique_selector::non_array - make_unique(cuda::stream_t<>& stream) { + typename cudaserviceimpl::make_device_unique_selector::non_array + make_device_unique(cuda::stream_t<>& stream) { int dev = getCurrentDevice(); - void *mem = allocate(dev, sizeof(T), stream); - return typename cudaserviceimpl::make_unique_selector::non_array(reinterpret_cast(mem), - [this, dev](void *ptr) { - this->free(dev, ptr); - }); + void *mem = allocate_device(dev, sizeof(T), stream); + return typename cudaserviceimpl::make_device_unique_selector::non_array(reinterpret_cast(mem), + [this, dev](void *ptr) { + this->free_device(dev, ptr); + }); } template - typename cudaserviceimpl::make_unique_selector::unbounded_array - make_unique(size_t n, cuda::stream_t<>& stream) { + typename cudaserviceimpl::make_device_unique_selector::unbounded_array + make_device_unique(size_t n, cuda::stream_t<>& stream) { int dev = getCurrentDevice(); using element_type = typename std::remove_extent::type; - void *mem = allocate(dev, n*sizeof(element_type), stream); - return typename cudaserviceimpl::make_unique_selector::unbounded_array(reinterpret_cast(mem), - [this, dev](void *ptr) { - this->free(dev, ptr); - }); + void *mem = allocate_device(dev, n*sizeof(element_type), stream); + return typename 
cudaserviceimpl::make_device_unique_selector::unbounded_array(reinterpret_cast(mem), + [this, dev](void *ptr) { + this->free_device(dev, ptr); + }); } template - typename cudaserviceimpl::make_unique_selector::bounded_array - make_unique(Args&&...) = delete; + typename cudaserviceimpl::make_device_unique_selector::bounded_array + make_device_unique(Args&&...) = delete; // Free device memory (to be called from unique_ptr) - void free(int device, void *ptr); + void free_device(int device, void *ptr); private: // PIMPL to hide details of allocator struct Allocator; std::unique_ptr allocator_; - void *allocate(int dev, size_t nbytes, cuda::stream_t<>& stream); + void *allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream); int numberOfDevices_ = 0; unsigned int numberOfStreamsTotal_ = 0; diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 967aa503e6c08..20fc33ed6b44f 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -370,7 +370,7 @@ struct CUDAService::Allocator { cub::CachingDeviceAllocator allocator; }; -void *CUDAService::allocate(int dev, size_t nbytes, cuda::stream_t<>& stream) { +void *CUDAService::allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream) { if(nbytes > allocator_->maxAllocation) { throw std::runtime_error("Tried to allocate "+std::to_string(nbytes)+" bytes, but the allocator maximum is "+std::to_string(allocator_->maxAllocation)); } @@ -380,6 +380,6 @@ void *CUDAService::allocate(int dev, size_t nbytes, cuda::stream_t<>& stream) { return ptr; } -void CUDAService::free(int device, void *ptr) { +void CUDAService::free_device(int device, void *ptr) { allocator_->allocator.DeviceFree(device, ptr); } diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index c423001b988de..8fcc9a62d6369 100644 --- 
a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -170,7 +170,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { } - SECTION("Allocator") { + SECTION("Device allocator") { edm::ParameterSet ps; ps.addUntrackedParameter("enabled", true); edm::ParameterSet alloc; @@ -183,13 +183,13 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); SECTION("Destructor") { - auto ptr = cs.make_unique(cudaStream); + auto ptr = cs.make_device_unique(cudaStream); REQUIRE(ptr.get() != nullptr); cudaStream.synchronize(); } SECTION("Reset") { - auto ptr = cs.make_unique(5, cudaStream); + auto ptr = cs.make_device_unique(5, cudaStream); REQUIRE(ptr.get() != nullptr); cudaStream.synchronize(); @@ -198,9 +198,9 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { } SECTION("Allocating too much") { - auto ptr = cs.make_unique(512, cudaStream); + auto ptr = cs.make_device_unique(512, cudaStream); ptr.reset(); - REQUIRE_THROWS(ptr = cs.make_unique(513, cudaStream)); + REQUIRE_THROWS(ptr = cs.make_device_unique(513, cudaStream)); } } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 54cde7461f9d7..b668ca1a0f86c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -573,11 +573,11 @@ namespace pixelgpudetails { assert(0 == wordCounter%2); // wordCounter is the total no of words in each event to be trasfered on device - auto word_d = cs->make_unique(wordCounter, stream); - auto fedId_d = cs->make_unique(wordCounter, stream); + auto word_d = cs->make_device_unique(wordCounter, stream); + auto fedId_d = cs->make_device_unique(wordCounter, stream); - auto error_d = 
cs->make_unique>(stream); - auto data_d = cs->make_unique(MAX_FED_WORDS, stream); + auto error_d = cs->make_device_unique>(stream); + auto data_d = cs->make_device_unique(MAX_FED_WORDS, stream); cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id())); new (error_h_tmp) GPU::SimpleVector(MAX_FED_WORDS, data_d.get()); assert(error_h_tmp->size() == 0); @@ -587,8 +587,8 @@ namespace pixelgpudetails { cudaCheck(cudaMemcpyAsync(&fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp, vsize, cudaMemcpyDefault, stream.id())); - auto pdigi_d = cs->make_unique(wordCounter, stream); - auto rawIdArr_d = cs->make_unique(wordCounter, stream); + auto pdigi_d = cs->make_device_unique(wordCounter, stream); + auto rawIdArr_d = cs->make_device_unique(wordCounter, stream); // Launch rawToDigi kernel RawToDigi_kernel<<>>( @@ -703,7 +703,7 @@ namespace pixelgpudetails { uint32_t *tmp = nullptr; cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); } - auto tempScanStorage_d = cs->make_unique(tempScanStorageSize, stream); + auto tempScanStorage_d = cs->make_device_unique(tempScanStorageSize, stream); // Set first the first element to 0 cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id())); // Then use inclusive_scan to get the partial sum to the rest From 21f5bcdd2263071117ba22151ed1983b1a9edffe Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 19 Oct 2018 20:55:26 +0200 Subject: [PATCH 07/19] Use CachingHostAllocator in CUDAService --- .../Common/interface/host_unique_ptr.h | 16 ++++++++ .../CUDAServices/interface/CUDAService.h | 38 +++++++++++++++++++ .../CUDAServices/src/CUDAService.cc | 25 ++++++++++-- .../CUDAServices/test/testCUDAService.cpp | 33 ++++++++++++++++ 4 files changed, 108 insertions(+), 4 deletions(-) create mode 100644 CUDADataFormats/Common/interface/host_unique_ptr.h 
diff --git a/CUDADataFormats/Common/interface/host_unique_ptr.h b/CUDADataFormats/Common/interface/host_unique_ptr.h new file mode 100644 index 0000000000000..c945d9b0aa027 --- /dev/null +++ b/CUDADataFormats/Common/interface/host_unique_ptr.h @@ -0,0 +1,16 @@ +#ifndef CUDADataFormats_Common_interface_host_unique_ptr_h +#define CUDADataFormats_Common_interface_host_unique_ptr_h + +#include +#include + +namespace edm { + namespace cuda { + namespace host { + template + using unique_ptr = std::unique_ptr>; + } + } +} + +#endif diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index a41766850fbd6..feeb557042548 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -9,6 +9,7 @@ #include "FWCore/Utilities/interface/StreamID.h" #include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" namespace edm { class ParameterSet; @@ -23,6 +24,13 @@ namespace cudaserviceimpl { struct make_device_unique_selector { using unbounded_array = edm::cuda::device::unique_ptr; }; template struct make_device_unique_selector { struct bounded_array {}; }; + + template + struct make_host_unique_selector { using non_array = edm::cuda::host::unique_ptr; }; + template + struct make_host_unique_selector { using unbounded_array = edm::cuda::host::unique_ptr; }; + template + struct make_host_unique_selector { struct bounded_array {}; }; } /** @@ -87,15 +95,45 @@ class CUDAService { template typename cudaserviceimpl::make_device_unique_selector::bounded_array make_device_unique(Args&&...) 
= delete; + + // Allocate pinned host memory + template + typename cudaserviceimpl::make_host_unique_selector::non_array + make_host_unique(cuda::stream_t<>& stream) { + void *mem = allocate_host(sizeof(T), stream); + return typename cudaserviceimpl::make_host_unique_selector::non_array(reinterpret_cast(mem), + [this](void *ptr) { + this->free_host(ptr); + }); + } + + template + typename cudaserviceimpl::make_host_unique_selector::unbounded_array + make_host_unique(size_t n, cuda::stream_t<>& stream) { + using element_type = typename std::remove_extent::type; + void *mem = allocate_host(n*sizeof(element_type), stream); + return typename cudaserviceimpl::make_host_unique_selector::unbounded_array(reinterpret_cast(mem), + [this](void *ptr) { + this->free_host(ptr); + }); + } + + template + typename cudaserviceimpl::make_host_unique_selector::bounded_array + make_host_unique(Args&&...) = delete; // Free device memory (to be called from unique_ptr) void free_device(int device, void *ptr); + // Free pinned host memory (to be called from unique_ptr) + void free_host(void *ptr); + private: // PIMPL to hide details of allocator struct Allocator; std::unique_ptr allocator_; void *allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream); + void *allocate_host(size_t nbytes, cuda::stream_t<>& stream); int numberOfDevices_ = 0; unsigned int numberOfStreamsTotal_ = 0; diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 20fc33ed6b44f..ea45d591f12c2 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -14,6 +14,8 @@ #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CachingHostAllocator.h" + void setCudaLimit(cudaLimit limit, const char* name, size_t request) { // read the current device int device; @@ -365,9 +367,10 @@ int 
CUDAService::getCurrentDevice() const { // allocator struct CUDAService::Allocator { template - Allocator(size_t max, Args&&... args): maxAllocation(max), allocator(std::forward(args)...) {} + Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward(args)...) {} size_t maxAllocation; - cub::CachingDeviceAllocator allocator; + cub::CachingDeviceAllocator deviceAllocator; + cub::CachingHostAllocator hostAllocator; }; void *CUDAService::allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream) { @@ -376,10 +379,24 @@ void *CUDAService::allocate_device(int dev, size_t nbytes, cuda::stream_t<>& str } void *ptr = nullptr; - cuda::throw_if_error(allocator_->allocator.DeviceAllocate(dev, &ptr, nbytes, stream.id())); + cuda::throw_if_error(allocator_->deviceAllocator.DeviceAllocate(dev, &ptr, nbytes, stream.id())); return ptr; } void CUDAService::free_device(int device, void *ptr) { - allocator_->allocator.DeviceFree(device, ptr); + allocator_->deviceAllocator.DeviceFree(device, ptr); +} + +void *CUDAService::allocate_host(size_t nbytes, cuda::stream_t<>& stream) { + if(nbytes > allocator_->maxAllocation) { + throw std::runtime_error("Tried to allocate "+std::to_string(nbytes)+" bytes, but the allocator maximum is "+std::to_string(allocator_->maxAllocation)); + } + + void *ptr = nullptr; + cuda::throw_if_error(allocator_->hostAllocator.HostAllocate(&ptr, nbytes, stream.id())); + return ptr; +} + +void CUDAService::free_host(void *ptr) { + allocator_->hostAllocator.HostFree(ptr); } diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index 8fcc9a62d6369..d0a1afcc8203f 100644 --- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -204,6 +204,39 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { } } + + SECTION("Host allocator") { + edm::ParameterSet ps; + 
ps.addUntrackedParameter("enabled", true); + edm::ParameterSet alloc; + alloc.addUntrackedParameter("minBin", 1U); + alloc.addUntrackedParameter("maxBin", 3U); + ps.addUntrackedParameter("allocator", alloc); + auto cs = makeCUDAService(ps, ar); + cs.setCurrentDevice(0); + auto current_device = cuda::device::current::get(); + auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + SECTION("Destructor") { + auto ptr = cs.make_host_unique(cudaStream); + REQUIRE(ptr.get() != nullptr); + } + + SECTION("Reset") { + auto ptr = cs.make_host_unique(5, cudaStream); + REQUIRE(ptr.get() != nullptr); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); + } + + SECTION("Allocating too much") { + auto ptr = cs.make_host_unique(512, cudaStream); + ptr.reset(); + REQUIRE_THROWS(ptr = cs.make_host_unique(513, cudaStream)); + } + } + //Fake the end-of-job signal. ar.postEndJobSignal_(); } From e0087ce3aa3a23acfdde968932f045b5192e2786 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 25 Oct 2018 23:58:42 +0200 Subject: [PATCH 08/19] Use unique_host_ptr for GPU->CPU buffers --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 67 +++++++------------ .../plugins/SiPixelRawToClusterGPUKernel.h | 44 ++++++++---- .../SiPixelRawToClusterHeterogeneous.cc | 35 +++++----- .../siPixelRawToClusterHeterogeneousProduct.h | 15 +---- 4 files changed, 74 insertions(+), 87 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index b668ca1a0f86c..e3c057b3a1c04 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -56,42 +56,12 @@ namespace pixelgpudetails { cudaCheck(cudaMallocHost(&word, MAX_FED_WORDS * sizeof(unsigned int))); cudaCheck(cudaMallocHost(&fedId_h, MAX_FED_WORDS * sizeof(unsigned char))); 
- - // to store the output of RawToDigi - cudaCheck(cudaMallocHost(&pdigi_h, MAX_FED_WORDS * sizeof(uint32_t))); - cudaCheck(cudaMallocHost(&rawIdArr_h, MAX_FED_WORDS * sizeof(uint32_t))); - - cudaCheck(cudaMallocHost(&adc_h, MAX_FED_WORDS * sizeof(uint16_t))); - cudaCheck(cudaMallocHost(&clus_h, MAX_FED_WORDS * sizeof(int32_t))); - - cudaCheck(cudaMallocHost(&error_h, vsize)); - cudaCheck(cudaMallocHost(&error_h_tmp, vsize)); - cudaCheck(cudaMallocHost(&data_h, MAX_ERROR_SIZE)); - - using namespace gpuClustering; - - new (error_h) GPU::SimpleVector(MAX_FED_WORDS, data_h); - assert(error_h->size() == 0); - assert(error_h->capacity() == static_cast(MAX_FED_WORDS)); - - // Need these in pinned memory to be truly asynchronous - cudaCheck(cudaMallocHost(&nModulesActive, sizeof(uint32_t))); - cudaCheck(cudaMallocHost(&nClusters, sizeof(uint32_t))); } SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { // free the host memory cudaCheck(cudaFreeHost(word)); cudaCheck(cudaFreeHost(fedId_h)); - cudaCheck(cudaFreeHost(pdigi_h)); - cudaCheck(cudaFreeHost(rawIdArr_h)); - cudaCheck(cudaFreeHost(adc_h)); - cudaCheck(cudaFreeHost(clus_h)); - cudaCheck(cudaFreeHost(error_h)); - cudaCheck(cudaFreeHost(error_h_tmp)); - cudaCheck(cudaFreeHost(data_h)); - cudaCheck(cudaFreeHost(nModulesActive)); - cudaCheck(cudaFreeHost(nClusters)); } void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { @@ -566,6 +536,7 @@ namespace pixelgpudetails { clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream); edm::Service cs; + digis_clusters_h.nModules_Clusters = cs->make_host_unique(2, stream); { const int threadsPerBlock = 512; @@ -579,13 +550,14 @@ namespace pixelgpudetails { auto error_d = cs->make_device_unique>(stream); auto data_d = cs->make_device_unique(MAX_FED_WORDS, stream); cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id())); - new 
(error_h_tmp) GPU::SimpleVector(MAX_FED_WORDS, data_d.get()); + auto error_h_tmp = cs->make_host_unique>(stream); + new (error_h_tmp.get()) GPU::SimpleVector(MAX_FED_WORDS, data_d.get()); // should make_host_unique() call the constructor as well? note that even if std::make_unique does that, we can't do that in make_device_unique assert(error_h_tmp->size() == 0); assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(&fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp, vsize, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp.get(), vsize, cudaMemcpyDefault, stream.id())); auto pdigi_d = cs->make_device_unique(wordCounter, stream); auto rawIdArr_d = cs->make_device_unique(wordCounter, stream); @@ -609,12 +581,20 @@ namespace pixelgpudetails { // copy data to host variable if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + digis_clusters_h.pdigi = cs->make_host_unique(MAX_FED_WORDS, stream); + digis_clusters_h.rawIdArr = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.pdigi.get(), pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.rawIdArr.get(), rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); if (includeErrors) { - cudaCheck(cudaMemcpyAsync(error_h, error_d.get(), vsize, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(data_h, data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); + digis_clusters_h.data = 
cs->make_host_unique(MAX_FED_WORDS, stream); + digis_clusters_h.error = cs->make_host_unique>(stream); + new (digis_clusters_h.error.get()) GPU::SimpleVector(MAX_FED_WORDS, digis_clusters_h.data.get()); + assert(digis_clusters_h.error->size() == 0); + assert(digis_clusters_h.error->capacity() == static_cast(MAX_FED_WORDS)); + + cudaCheck(cudaMemcpyAsync(digis_clusters_h.error.get(), error_d.get(), vsize, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); // If we want to transfer only the minimal amount of data, we // need a synchronization point. A single ExternalWork (of // SiPixelRawToClusterHeterogeneous) does not help because it is @@ -623,10 +603,9 @@ namespace pixelgpudetails { // prototype of #100 would allow this easily (as there would be // two ExternalWorks). // - //error_h->set_data(data_h); //cudaCheck(cudaStreamSynchronize(stream.id())); - //int size = error_h->size(); - //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id())); + //int size = digis_clusters_h.error->size(); + //cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), size*esize, cudaMemcpyDefault, stream.id())); } } } @@ -647,7 +626,8 @@ namespace pixelgpudetails { // calibrated adc if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(adc_h, digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + digis_clusters_h.adc = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.adc.get(), digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); } #ifdef GPU_DEBUG @@ -662,7 +642,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync(nModulesActive, clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + 
cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); threadsPerBlock = 256; blocks = MaxNumModules; @@ -711,12 +691,13 @@ namespace pixelgpudetails { clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, stream.id())); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(nClusters, clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); // clusters if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(clus_h, clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + digis_clusters_h.clus = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.clus.get(), clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); } } // end clusterizer scope } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 510898f751e2c..6dd9c3118e1f2 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -5,6 +5,7 @@ #include #include "cuda/api_wrappers.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "siPixelRawToClusterHeterogeneousProduct.h" @@ -159,6 +160,26 @@ namespace pixelgpudetails { using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; + struct CPUData { + CPUData() = default; + ~CPUData() = default; + + CPUData(const CPUData&) 
= delete; + CPUData& operator=(const CPUData&) = delete; + CPUData(CPUData&&) = default; + CPUData& operator=(CPUData&&) = default; + + edm::cuda::host::unique_ptr nModules_Clusters; // These should really be part of the GPU product + + edm::cuda::host::unique_ptr data; + edm::cuda::host::unique_ptr> error; + + edm::cuda::host::unique_ptr pdigi; + edm::cuda::host::unique_ptr rawIdArr; + edm::cuda::host::unique_ptr adc; + edm::cuda::host::unique_ptr clus; + }; + SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream); ~SiPixelRawToClusterGPUKernel(); @@ -177,30 +198,27 @@ namespace pixelgpudetails { cuda::stream_t<>& stream); siPixelRawToClusterHeterogeneousProduct::GPUProduct getProduct() { - error_h->set_data(data_h); return siPixelRawToClusterHeterogeneousProduct::GPUProduct( - pdigi_h, rawIdArr_h, clus_h, adc_h, error_h, std::move(digis_d), std::move(clusters_d), - nDigis, *nModulesActive, *nClusters + nDigis, + digis_clusters_h.nModules_Clusters[0], + digis_clusters_h.nModules_Clusters[1] ); } + CPUData&& getCPUData() { + return std::move(digis_clusters_h); + } + private: // input unsigned int *word = nullptr; // to hold input for rawtodigi unsigned char *fedId_h = nullptr; // to hold fed index for each word - // FIXME cleanup all these are in the gpuProduct above... 
- - uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output - uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output - pixelgpudetails::error_obj *data_h = nullptr; - GPU::SimpleVector *error_h = nullptr; - GPU::SimpleVector *error_h_tmp = nullptr; - uint32_t nDigis = 0; - uint32_t *nModulesActive = nullptr; - uint32_t *nClusters = nullptr; + + // CPU data + CPUData digis_clusters_h; // Data to be put in the event SiPixelDigisCUDA digis_d; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index bb920f3b45bb4..9c7d87470b96d 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -113,7 +113,7 @@ class SiPixelRawToClusterHeterogeneous: public HeterogeneousEDProducer& cudaStream) override; void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void convertGPUtoCPU(edm::Event& ev, const GPUProduct& gpu) const; + void convertGPUtoCPU(edm::Event& ev, const GPUProduct& gpu, pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData) const; // Commonalities const FEDRawDataCollection *initialize(const edm::Event& ev, const edm::EventSetup& es); @@ -557,14 +557,15 @@ void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& e auto output = std::make_unique(gpuAlgo_->getProduct()); if(enableConversion_) { - convertGPUtoCPU(ev.event(), *output); + convertGPUtoCPU(ev.event(), *output, gpuAlgo_->getCPUData()); } ev.put(std::move(output), heterogeneous::DisableTransfer{}); } void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, - const 
SiPixelRawToClusterHeterogeneous::GPUProduct& gpu) const { + const SiPixelRawToClusterHeterogeneous::GPUProduct& gpu, + pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData digis_clusters_h) const { // TODO: add the transfers here as well? auto collection = std::make_unique>(); @@ -576,8 +577,8 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, edm::DetSet * detDigis=nullptr; for (uint32_t i = 0; i < gpu.nDigis; i++) { - if (gpu.pdigi_h[i]==0) continue; - detDigis = &collection->find_or_insert(gpu.rawIdArr_h[i]); + if (digis_clusters_h.pdigi[i]==0) continue; + detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations break; } @@ -609,28 +610,28 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, }; for (uint32_t i = 0; i < gpu.nDigis; i++) { - if (gpu.pdigi_h[i]==0) continue; - if (gpu.clus_h[i]>9000) continue; // not in cluster - assert(gpu.rawIdArr_h[i] > 109999); - if ( (*detDigis).detId() != gpu.rawIdArr_h[i]) + if (digis_clusters_h.pdigi[i]==0) continue; + if (digis_clusters_h.clus[i]>9000) continue; // not in cluster + assert(digis_clusters_h.rawIdArr[i] > 109999); + if ( (*detDigis).detId() != digis_clusters_h.rawIdArr[i]) { fillClusters((*detDigis).detId()); assert(nclus==-1); - detDigis = &collection->find_or_insert(gpu.rawIdArr_h[i]); + detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations else { std::cout << "Problem det present twice in input! 
" << (*detDigis).detId() << std::endl; } } - (*detDigis).data.emplace_back(gpu.pdigi_h[i]); + (*detDigis).data.emplace_back(digis_clusters_h.pdigi[i]); auto const & dig = (*detDigis).data.back(); // fill clusters - assert(gpu.clus_h[i]>=0); - assert(gpu.clus_h[i]<1024); - nclus = std::max(gpu.clus_h[i],nclus); + assert(digis_clusters_h.clus[i]>=0); + assert(digis_clusters_h.clus[i]<1024); + nclus = std::max(digis_clusters_h.clus[i],nclus); auto row = dig.row(); auto col = dig.column(); SiPixelCluster::PixelPos pix(row,col); - aclusters[gpu.clus_h[i]].add(pix,gpu.adc_h[i]); + aclusters[digis_clusters_h.clus[i]].add(pix, digis_clusters_h.adc[i]); } // fill final clusters @@ -641,9 +642,9 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto errors = errors_; // make a copy PixelDataFormatter::DetErrors nodeterrors; - auto size = gpu.error_h->size(); + auto size = digis_clusters_h.error->size(); for (auto i = 0; i < size; i++) { - pixelgpudetails::error_obj err = (*gpu.error_h)[i]; + pixelgpudetails::error_obj err = (*digis_clusters_h.error)[i]; if (err.errorType != 0) { SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); errors[err.rawId].push_back(error); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h index 3298249c194b8..0cab299582410 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h @@ -33,26 +33,13 @@ namespace siPixelRawToClusterHeterogeneousProduct { GPUProduct(GPUProduct&&) = default; GPUProduct& operator=(GPUProduct&&) = default; - GPUProduct(uint32_t const *pdigi, - uint32_t const *rawIdArr, - int32_t const *clus, - uint16_t const *adc, - GPU::SimpleVector const * error, - SiPixelDigisCUDA&& digis, + GPUProduct(SiPixelDigisCUDA&& digis, 
SiPixelClustersCUDA&& clusters, uint32_t ndig, uint32_t nmod, uint32_t nclus): - pdigi_h(pdigi), rawIdArr_h(rawIdArr), clus_h(clus), adc_h(adc), error_h(error), digis_d(std::move(digis)), clusters_d(std::move(clusters)), nDigis(ndig), nModules(nmod), nClusters(nclus) {} - // Needed for digi and cluster CPU output - uint32_t const * pdigi_h = nullptr; - uint32_t const * rawIdArr_h = nullptr; - int32_t const * clus_h = nullptr; - uint16_t const * adc_h = nullptr; - GPU::SimpleVector const * error_h = nullptr; - SiPixelDigisCUDA digis_d; SiPixelClustersCUDA clusters_d; From e7eac3f9ec16fd5a1196b2f043b82cd6ab7afb54 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 26 Oct 2018 20:00:34 +0200 Subject: [PATCH 09/19] Use unique_host_ptr for CPU->GPU transfers --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 27 ++++++++++--------- .../plugins/SiPixelRawToClusterGPUKernel.h | 22 ++++++++++----- .../SiPixelRawToClusterHeterogeneous.cc | 4 ++- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index e3c057b3a1c04..6546bee1a8676 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -52,21 +52,23 @@ namespace pixelgpudetails { constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize; - SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { + SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) { + edm::Service cs; + word_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); + fedId_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); + } - cudaCheck(cudaMallocHost(&word, MAX_FED_WORDS * sizeof(unsigned int))); - 
cudaCheck(cudaMallocHost(&fedId_h, MAX_FED_WORDS * sizeof(unsigned char))); + void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { + std::memcpy(word_.get()+wordCounterGPU, src, sizeof(cms_uint32_t)*length); + std::memset(fedId_.get()+wordCounterGPU/2, fedId - 1200, length/2); } - SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { - // free the host memory - cudaCheck(cudaFreeHost(word)); - cudaCheck(cudaFreeHost(fedId_h)); + //////////////////// + + SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { } - void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { - std::memcpy(word+wordCounterGPU, src, sizeof(cms_uint32_t)*length); - std::memset(fedId_h+wordCounterGPU/2, fedId - 1200, length/2); + SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { } //////////////////// @@ -524,6 +526,7 @@ namespace pixelgpudetails { const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, + const WordFedAppender& wordFed, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug, @@ -555,8 +558,8 @@ namespace pixelgpudetails { assert(error_h_tmp->size() == 0); assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); - cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(&fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(fedId_d.get(), wordFed.fedId(), wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, 
stream.id())); cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp.get(), vsize, cudaMemcpyDefault, stream.id())); auto pdigi_d = cs->make_device_unique(wordCounter, stream); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 6dd9c3118e1f2..cb0f45c24a798 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -180,6 +180,21 @@ namespace pixelgpudetails { edm::cuda::host::unique_ptr clus; }; + class WordFedAppender { + public: + WordFedAppender(cuda::stream_t<>& cudaStream); + ~WordFedAppender() = default; + + void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); + + const unsigned int *word() const { return word_.get(); } + const unsigned char *fedId() const { return fedId_.get(); } + + private: + edm::cuda::host::unique_ptr word_; + edm::cuda::host::unique_ptr fedId_; + }; + SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream); ~SiPixelRawToClusterGPUKernel(); @@ -189,10 +204,9 @@ namespace pixelgpudetails { SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; - void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); - void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, + const WordFedAppender& wordFed, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug, cuda::stream_t<>& stream); @@ -211,10 +225,6 @@ namespace pixelgpudetails { } private: - // input - unsigned int *word = nullptr; // to hold input for rawtodigi - 
unsigned char *fedId_h = nullptr; // to hold fed index for each word - uint32_t nDigis = 0; // CPU data diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 9c7d87470b96d..c9c3bbfcdc5a6 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -494,6 +494,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() ErrorChecker errorcheck; + auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(cudaStream); for (auto aFed = fedIds.begin(); aFed != fedIds.end(); ++aFed) { int fedId = *aFed; @@ -543,12 +544,13 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); assert(0 == (ew-bw)%2); - gpuAlgo_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); + wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); wordCounterGPU+=(ew-bw); } // end of for loop gpuAlgo_->makeClustersAsync(gpuMap, gpuModulesToUnpack_->get(), hgains->getGPUProductAsync(cudaStream), + wordFedAppender, wordCounterGPU, fedCounter, convertADCtoElectrons, useQuality, includeErrors, enableTransfer_, debug, cudaStream); } From a2c63af4f0fdffd7eadd2cffecb509b04c2095da Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 26 Oct 2018 20:19:43 +0200 Subject: [PATCH 10/19] Cleanup --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 8 ----- .../plugins/SiPixelRawToClusterGPUKernel.h | 4 +-- .../SiPixelRawToClusterHeterogeneous.cc | 35 ++++++++----------- 3 files changed, 16 insertions(+), 31 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu 
b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 6546bee1a8676..dc768ce8f643d 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -65,14 +65,6 @@ namespace pixelgpudetails { //////////////////// - SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { - } - - SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { - } - - //////////////////// - __device__ uint32_t getLink(uint32_t ww) { return ((ww >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask); } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index cb0f45c24a798..a2d9cdda92573 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -195,8 +195,8 @@ namespace pixelgpudetails { edm::cuda::host::unique_ptr fedId_; }; - SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream); - ~SiPixelRawToClusterGPUKernel(); + SiPixelRawToClusterGPUKernel() = default; + ~SiPixelRawToClusterGPUKernel() = default; SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index c9c3bbfcdc5a6..e5e910c581a93 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -110,10 +110,9 @@ class SiPixelRawToClusterHeterogeneous: public HeterogeneousEDProducer& cudaStream) override; void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& 
cudaStream) override; void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void convertGPUtoCPU(edm::Event& ev, const GPUProduct& gpu, pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData) const; + void convertGPUtoCPU(edm::Event& ev, unsigned int nDigis, pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData) const; // Commonalities const FEDRawDataCollection *initialize(const edm::Event& ev, const edm::EventSetup& es); @@ -149,8 +148,8 @@ std::unique_ptr regions_; SiPixelGainCalibrationForHLTService theSiPixelGainCalibration_; // GPU algo - std::unique_ptr gpuAlgo_; - std::unique_ptr gpuModulesToUnpack_; + pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; + SiPixelFedCablingMapGPUWrapper::ModulesToUnpack gpuModulesToUnpack_; PixelDataFormatter::Errors errors_; bool enableTransfer_; @@ -455,22 +454,16 @@ void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, c } // ----------------------------------------------------------------------------- -void SiPixelRawToClusterHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) { - // Allocate GPU resources here - gpuAlgo_ = std::make_unique(cudaStream); - gpuModulesToUnpack_ = std::make_unique(); -} - void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { const auto buffers = initialize(ev.event(), es); if (regions_) { std::set modules = *(regions_->modulesToUnpack()); - gpuModulesToUnpack_->fillAsync(*cablingMap_, modules, cudaStream); + gpuModulesToUnpack_.fillAsync(*cablingMap_, modules, cudaStream); } else if(recordWatcherUpdatedSinceLastTransfer_) { // If regions_ are disabled, it is enough to fill and transfer only if cablingMap has changed - gpuModulesToUnpack_->fillAsync(*cablingMap_, std::set(), cudaStream); + gpuModulesToUnpack_.fillAsync(*cablingMap_, std::set(), cudaStream); 
recordWatcherUpdatedSinceLastTransfer_ = false; } @@ -549,24 +542,24 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv } // end of for loop - gpuAlgo_->makeClustersAsync(gpuMap, gpuModulesToUnpack_->get(), hgains->getGPUProductAsync(cudaStream), - wordFedAppender, - wordCounterGPU, fedCounter, convertADCtoElectrons, - useQuality, includeErrors, enableTransfer_, debug, cudaStream); + gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack_.get(), hgains->getGPUProductAsync(cudaStream), + wordFedAppender, + wordCounterGPU, fedCounter, convertADCtoElectrons, + useQuality, includeErrors, enableTransfer_, debug, cudaStream); } void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { - auto output = std::make_unique(gpuAlgo_->getProduct()); + auto output = std::make_unique(gpuAlgo_.getProduct()); if(enableConversion_) { - convertGPUtoCPU(ev.event(), *output, gpuAlgo_->getCPUData()); + convertGPUtoCPU(ev.event(), output->nDigis, gpuAlgo_.getCPUData()); } ev.put(std::move(output), heterogeneous::DisableTransfer{}); } void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, - const SiPixelRawToClusterHeterogeneous::GPUProduct& gpu, + unsigned int nDigis, pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData digis_clusters_h) const { // TODO: add the transfers here as well? 
@@ -578,7 +571,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto outputClusters = std::make_unique(); edm::DetSet * detDigis=nullptr; - for (uint32_t i = 0; i < gpu.nDigis; i++) { + for (uint32_t i = 0; i < nDigis; i++) { if (digis_clusters_h.pdigi[i]==0) continue; detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations @@ -611,7 +604,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, if ( spc.empty() ) spc.abort(); }; - for (uint32_t i = 0; i < gpu.nDigis; i++) { + for (uint32_t i = 0; i < nDigis; i++) { if (digis_clusters_h.pdigi[i]==0) continue; if (digis_clusters_h.clus[i]>9000) continue; // not in cluster assert(digis_clusters_h.rawIdArr[i] > 109999); From a6b549a88bd5a838a08dbc0d8afac8bbac65d1fb Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 26 Oct 2018 20:45:06 +0200 Subject: [PATCH 11/19] Decrease allocator minBin to 1 --- HeterogeneousCore/CUDAServices/src/CUDAService.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index ea45d591f12c2..f7714fe2d795f 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -319,7 +319,7 @@ void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions edm::ParameterSetDescription allocator; allocator.addUntracked("binGrowth", 8)->setComment("Growth factor (bin_growth in cub::CachingDeviceAllocator"); - allocator.addUntracked("minBin", 5)->setComment("Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator"); + allocator.addUntracked("minBin", 1)->setComment("Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator"); allocator.addUntracked("maxBin", 9)->setComment("Largest bin, 
corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail."); allocator.addUntracked("maxCachedBytes", 0)->setComment("Total storage for the allocator. 0 means no limit."); allocator.addUntracked("maxCachedFraction", 0.8)->setComment("Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken."); From 6fd9438166c94f9655ee58d499c46ee544628b98 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 1 Nov 2018 16:53:23 +0100 Subject: [PATCH 12/19] Going "back" to "GPU struct of pointers to GPU" --- .../interface/SiPixelClustersCUDA.h | 38 ++++++++++--------- .../SiPixelCluster/src/SiPixelClustersCUDA.cc | 10 +++++ .../SiPixelDigi/interface/SiPixelDigisCUDA.h | 36 +++++++++--------- .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 11 ++++++ .../plugins/ClusterSLOnGPU.cu | 8 ++-- 5 files changed, 64 insertions(+), 39 deletions(-) diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index 07e39932a1de0..af9c2419c7f8b 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -16,23 +16,23 @@ class SiPixelClustersCUDA { SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default; SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default; - uint32_t * __restrict__ moduleStart() { return moduleStart_d.get(); } - int32_t * __restrict__ clus() { return clus_d.get(); } - uint32_t * __restrict__ clusInModule() { return clusInModule_d.get(); } - uint32_t * __restrict__ moduleId() { return moduleId_d.get(); } - uint32_t * __restrict__ clusModuleStart() { return clusModuleStart_d.get(); } - - uint32_t const * __restrict__ moduleStart() const { return 
moduleStart_d.get(); } - int32_t const * __restrict__ clus() const { return clus_d.get(); } - uint32_t const * __restrict__ clusInModule() const { return clusInModule_d.get(); } - uint32_t const * __restrict__ moduleId() const { return moduleId_d.get(); } - uint32_t const * __restrict__ clusModuleStart() const { return clusModuleStart_d.get(); } - - uint32_t const * __restrict__ c_moduleStart() const { return moduleStart_d.get(); } - int32_t const * __restrict__ c_clus() const { return clus_d.get(); } - uint32_t const * __restrict__ c_clusInModule() const { return clusInModule_d.get(); } - uint32_t const * __restrict__ c_moduleId() const { return moduleId_d.get(); } - uint32_t const * __restrict__ c_clusModuleStart() const { return clusModuleStart_d.get(); } + uint32_t *moduleStart() { return moduleStart_d.get(); } + int32_t *clus() { return clus_d.get(); } + uint32_t *clusInModule() { return clusInModule_d.get(); } + uint32_t *moduleId() { return moduleId_d.get(); } + uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } + + uint32_t const *moduleStart() const { return moduleStart_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *clusInModule() const { return clusInModule_d.get(); } + uint32_t const *moduleId() const { return moduleId_d.get(); } + uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } + + uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } + uint32_t const *c_moduleId() const { return moduleId_d.get(); } + uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } struct DeviceConstView { uint32_t const *moduleStart; @@ -42,7 +42,7 @@ class SiPixelClustersCUDA { uint32_t const *clusModuleStart; }; - DeviceConstView view() const { return DeviceConstView{moduleStart_d.get(), clus_d.get(), clusInModule_d.get(), 
moduleId_d.get(), clusModuleStart_d.get()}; } + DeviceConstView *view() const { return view_d.get(); } private: edm::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module @@ -52,6 +52,8 @@ class SiPixelClustersCUDA { // originally from rechits edm::cuda::device::unique_ptr clusModuleStart_d; + + edm::cuda::device::unique_ptr view_d; // "me" pointer }; #endif diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc index 20d66f9025a5f..90e6ec2f5dbaf 100644 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -11,4 +11,14 @@ SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::st clusInModule_d = cs->make_device_unique(nelements, stream); moduleId_d = cs->make_device_unique(nelements, stream); clusModuleStart_d = cs->make_device_unique(nelements+1, stream); + + auto view = cs->make_host_unique(stream); + view->moduleStart = moduleStart_d.get(); + view->clus = clus_d.get(); + view->clusInModule = clusInModule_d.get(); + view->moduleId = moduleId_d.get(); + view->clusModuleStart = clusModuleStart_d.get(); + + view_d = cs->make_device_unique(stream); + cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()); } diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index bd0c2eb5329c2..b36d2fa89c288 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -2,6 +2,7 @@ #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h #include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "FWCore/Utilities/interface/propagate_const.h" #include @@ -16,35 +17,36 @@ class SiPixelDigisCUDA { SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default; SiPixelDigisCUDA& 
operator=(SiPixelDigisCUDA&&) = default; - uint16_t * __restrict__ xx() { return xx_d.get(); } - uint16_t * __restrict__ yy() { return yy_d.get(); } - uint16_t * __restrict__ adc() { return adc_d.get(); } - uint16_t * __restrict__ moduleInd() { return moduleInd_d.get(); } + uint16_t * xx() { return xx_d.get(); } + uint16_t * yy() { return yy_d.get(); } + uint16_t * adc() { return adc_d.get(); } + uint16_t * moduleInd() { return moduleInd_d.get(); } - uint16_t const * __restrict__ xx() const { return xx_d.get(); } - uint16_t const * __restrict__ yy() const { return yy_d.get(); } - uint16_t const * __restrict__ adc() const { return adc_d.get(); } - uint16_t const * __restrict__ moduleInd() const { return moduleInd_d.get(); } + uint16_t const *xx() const { return xx_d.get(); } + uint16_t const *yy() const { return yy_d.get(); } + uint16_t const *adc() const { return adc_d.get(); } + uint16_t const *moduleInd() const { return moduleInd_d.get(); } - uint16_t const * __restrict__ c_xx() const { return xx_d.get(); } - uint16_t const * __restrict__ c_yy() const { return yy_d.get(); } - uint16_t const * __restrict__ c_adc() const { return adc_d.get(); } - uint16_t const * __restrict__ c_moduleInd() const { return moduleInd_d.get(); } + uint16_t const *c_xx() const { return xx_d.get(); } + uint16_t const *c_yy() const { return yy_d.get(); } + uint16_t const *c_adc() const { return adc_d.get(); } + uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } struct DeviceConstView { - uint16_t const * xx; - uint16_t const * yy; - uint16_t const * adc; - uint16_t const * moduleInd; + uint16_t const *xx; + uint16_t const *yy; + uint16_t const *adc; + uint16_t const *moduleInd; }; - DeviceConstView view() const { return DeviceConstView{xx_d.get(), yy_d.get(), adc_d.get(), moduleInd_d.get()}; } + const DeviceConstView *view() const { return view_d.get(); } private: edm::cuda::device::unique_ptr xx_d; // local coordinates of each pixel edm::cuda::device::unique_ptr yy_d; // 
edm::cuda::device::unique_ptr adc_d; // ADC of each pixel edm::cuda::device::unique_ptr moduleInd_d; // module id of each pixel + edm::cuda::device::unique_ptr view_d; // "me" pointer }; #endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index 87613c6447b24..c42adafc8f85e 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -3,6 +3,8 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include + SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) { edm::Service cs; @@ -10,4 +12,13 @@ SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) { yy_d = cs->make_device_unique(nelements, stream); adc_d = cs->make_device_unique(nelements, stream); moduleInd_d = cs->make_device_unique(nelements, stream); + + auto view = cs->make_host_unique(stream); + view->xx = xx_d.get(); + view->yy = yy_d.get(); + view->adc = adc_d.get(); + view->moduleInd = moduleInd_d.get(); + + view_d = cs->make_device_unique(stream); + cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()); } diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu index 42e9ec8789d67..f1281aeaa3f3a 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -14,7 +14,7 @@ using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; __global__ -void simLink(SiPixelDigisCUDA::DeviceConstView dd, uint32_t ndigis, SiPixelClustersCUDA::DeviceConstView cc, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) +void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const 
SiPixelClustersCUDA::DeviceConstView *cc, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) { assert(slp == slp->me_d); @@ -28,14 +28,14 @@ void simLink(SiPixelDigisCUDA::DeviceConstView dd, uint32_t ndigis, SiPixelClust if (i >= ndigis) return; - auto id = dd.moduleInd[i]; + auto id = dd->moduleInd[i]; if (InvId == id) return; assert(id < 2000); - auto ch = pixelgpudetails::pixelToChannel(dd.xx[i], dd.yy[i]); + auto ch = pixelgpudetails::pixelToChannel(dd->xx[i], dd->yy[i]); auto first = hh.hitsModuleStart_d[id]; - auto cl = first + cc.clus[i]; + auto cl = first + cc->clus[i]; assert(cl < 2000 * blockDim.x); const std::array me{{id, ch, 0, 0}}; From 8c371ba683d635bd7f073dffa21d70770cbd8f57 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 1 Nov 2018 17:17:35 +0100 Subject: [PATCH 13/19] Using __ldg --- .../interface/SiPixelClustersCUDA.h | 26 ++++++++++++++----- .../SiPixelCluster/src/SiPixelClustersCUDA.cc | 10 +++---- .../SiPixelDigi/interface/SiPixelDigisCUDA.h | 23 ++++++++++++---- .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 8 +++--- .../plugins/ClusterSLOnGPU.cu | 6 ++--- 5 files changed, 50 insertions(+), 23 deletions(-) diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index af9c2419c7f8b..22d9ff9d103ba 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -34,12 +34,26 @@ class SiPixelClustersCUDA { uint32_t const *c_moduleId() const { return moduleId_d.get(); } uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } - struct DeviceConstView { - uint32_t const *moduleStart; - int32_t const *clus; - uint32_t const *clusInModule; - uint32_t const *moduleId; - uint32_t const *clusModuleStart; + class DeviceConstView { + public: + DeviceConstView() = default; + +#ifdef __CUDACC__ + __device__ __forceinline__ uint32_t 
moduleStart(int i) const { return __ldg(moduleStart_+i); } + __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); } + __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); } + __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); } + __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); } +#endif + + friend SiPixelClustersCUDA; + + private: + uint32_t const *moduleStart_ = nullptr; + int32_t const *clus_ = nullptr; + uint32_t const *clusInModule_ = nullptr; + uint32_t const *moduleId_ = nullptr; + uint32_t const *clusModuleStart_ = nullptr; }; DeviceConstView *view() const { return view_d.get(); } diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc index 90e6ec2f5dbaf..7363c2fd364af 100644 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -13,11 +13,11 @@ SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::st clusModuleStart_d = cs->make_device_unique(nelements+1, stream); auto view = cs->make_host_unique(stream); - view->moduleStart = moduleStart_d.get(); - view->clus = clus_d.get(); - view->clusInModule = clusInModule_d.get(); - view->moduleId = moduleId_d.get(); - view->clusModuleStart = clusModuleStart_d.get(); + view->moduleStart_ = moduleStart_d.get(); + view->clus_ = clus_d.get(); + view->clusInModule_ = clusInModule_d.get(); + view->moduleId_ = moduleId_d.get(); + view->clusModuleStart_ = clusModuleStart_d.get(); view_d = cs->make_device_unique(stream); cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()); diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index b36d2fa89c288..25e8b54a743c2 100644 --- 
a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -32,11 +32,24 @@ class SiPixelDigisCUDA { uint16_t const *c_adc() const { return adc_d.get(); } uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } - struct DeviceConstView { - uint16_t const *xx; - uint16_t const *yy; - uint16_t const *adc; - uint16_t const *moduleInd; + class DeviceConstView { + public: + DeviceConstView() = default; + +#ifdef __CUDACC__ + __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_+i); } + __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); } + __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); } + __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); } +#endif + + friend class SiPixelDigisCUDA; + + private: + uint16_t const *xx_ = nullptr; + uint16_t const *yy_ = nullptr; + uint16_t const *adc_ = nullptr; + uint16_t const *moduleInd_ = nullptr; }; const DeviceConstView *view() const { return view_d.get(); } diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index c42adafc8f85e..5ba2e920e9b04 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -14,10 +14,10 @@ SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) { moduleInd_d = cs->make_device_unique(nelements, stream); auto view = cs->make_host_unique(stream); - view->xx = xx_d.get(); - view->yy = yy_d.get(); - view->adc = adc_d.get(); - view->moduleInd = moduleInd_d.get(); + view->xx_ = xx_d.get(); + view->yy_ = yy_d.get(); + view->adc_ = adc_d.get(); + view->moduleInd_ = moduleInd_d.get(); view_d = cs->make_device_unique(stream); cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()); diff --git 
a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu index f1281aeaa3f3a..b402daef07a05 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -28,14 +28,14 @@ void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const if (i >= ndigis) return; - auto id = dd->moduleInd[i]; + auto id = dd->moduleInd(i); if (InvId == id) return; assert(id < 2000); - auto ch = pixelgpudetails::pixelToChannel(dd->xx[i], dd->yy[i]); + auto ch = pixelgpudetails::pixelToChannel(dd->xx(i), dd->yy(i)); auto first = hh.hitsModuleStart_d[id]; - auto cl = first + cc->clus[i]; + auto cl = first + cc->clus(i); assert(cl < 2000 * blockDim.x); const std::array me{{id, ch, 0, 0}}; From fc83c38a64b33052667a3b1257efa448db8e0f7a Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 9 Nov 2018 21:03:21 +0100 Subject: [PATCH 14/19] Add a configuration option to CUDAService to enable debug prints in the allocators --- HeterogeneousCore/CUDAServices/src/CUDAService.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index f7714fe2d795f..dc1612e3dbc94 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -257,6 +257,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& auto maxBin = allocator.getUntrackedParameter("maxBin"); size_t maxCachedBytes = allocator.getUntrackedParameter("maxCachedBytes"); auto maxCachedFraction = allocator.getUntrackedParameter("maxCachedFraction"); + auto debug = allocator.getUntrackedParameter("debug"); size_t minCachedBytes = std::numeric_limits::max(); int currentDevice; @@ -281,7 +282,10 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log 
<< " maximum amount of cached memory " << (minCachedBytes>>20) << " MB\n"; allocator_ = std::make_unique(cub::CachingDeviceAllocator::IntPow(binGrowth, maxBin), - binGrowth, minBin, maxBin, minCachedBytes); + binGrowth, minBin, maxBin, minCachedBytes, + false, // do not skip cleanup + debug + ); log << "\n"; log << "CUDAService fully initialized"; @@ -323,6 +327,7 @@ void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions allocator.addUntracked("maxBin", 9)->setComment("Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail."); allocator.addUntracked("maxCachedBytes", 0)->setComment("Total storage for the allocator. 0 means no limit."); allocator.addUntracked("maxCachedFraction", 0.8)->setComment("Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. 
If maxCachedBytes is non-zero, the smallest of them is taken."); + allocator.addUntracked("debug", false)->setComment("Enable debug prints"); desc.addUntracked("allocator", allocator)->setComment("See the documentation of cub::CachingDeviceAllocator for more details."); descriptions.add("CUDAService", desc); From f9b53aaa352b9843fa21472b373d855ac07d20e2 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 9 Nov 2018 21:03:34 +0100 Subject: [PATCH 15/19] Fix printout in CachingHostAllocator --- HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h index 52cf20eefd47d..97be93c79131a 100644 --- a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h +++ b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h @@ -476,7 +476,7 @@ struct CachingHostAllocator cached_bytes.live += search_key.bytes; mutex.Unlock(); - if (debug) _CubLog("\tHost allocated new device block at %p (%lld bytes associated with stream %lld).\n", + if (debug) _CubLog("\tHost allocated new host block at %p (%lld bytes associated with stream %lld).\n", search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); } From 031f07fd2d6a3106aa0acc59a07c01bec99bfe92 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 9 Nov 2018 21:56:21 +0100 Subject: [PATCH 16/19] Add possibility to preallocate device and host buffers --- .../CUDAServices/scripts/cudaPreallocate.py | 38 ++++++++++++++++ .../CUDAServices/src/CUDAService.cc | 43 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100755 HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py diff --git a/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py new file mode 100755 index 0000000000000..331ddd30f73bd --- /dev/null +++ 
b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +from __future__ import print_function +import re +import sys +import argparse + +def main(opts): + device = [] + host = [] + + device_re = re.compile("Device.*allocated new device block.*\((?P<bytes>\d+) bytes") + host_re = re.compile("Host.*allocated new host block.*\((?P<bytes>\d+) bytes") + + f = open(opts.file) + for line in f: + m = device_re.search(line) + if m: + device.append(m.group("bytes")) + continue + m = host_re.search(line) + if m: + host.append(m.group("bytes")) + f.close() + + print("process.CUDAService.allocator.devicePreallocate = cms.untracked.vuint32(%s)" % ",".join(device)) + print("process.CUDAService.allocator.hostPreallocate = cms.untracked.vuint32(%s)" % ",".join(host)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""Extract CUDAService preallocation parameters from a log file. + +To use, run the job once with "process.CUDAService.allocator.debug = +True" and direct the output to a file.
Then run this script by passing +the file as an argument, and copy the output of this script back to +the configuration file.""") + parser.add_argument("file", type=str, help="Log file to parse") + opts = parser.parse_args() + main(opts) diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index dc1612e3dbc94..c2327c0e5626a 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -80,7 +80,40 @@ unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) { } } +namespace { + template typename UniquePtr, + typename Allocate> + void preallocate(Allocate allocate, const std::vector& bufferSizes) { + auto current_device = cuda::device::current::get(); + auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + std::vector > buffers; + buffers.reserve(bufferSizes.size()); + for(auto size: bufferSizes) { + buffers.push_back(allocate(size, stream)); + } + } + + void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector& bufferSizes) { + int device; + cudaCheck(cudaGetDevice(&device)); + for(int i=0; i([&](size_t size, cuda::stream_t<>& stream) { + return cs.make_device_unique(size, stream); + }, bufferSizes); + } + cudaCheck(cudaSetDevice(device)); + } + void hostPreallocate(CUDAService& cs, const std::vector& bufferSizes) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { + return cs.make_host_unique(size, stream); + }, bufferSizes); + } +} + +/// Constructor CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& iRegistry) { bool configEnabled = config.getUntrackedParameter("enabled"); if (not configEnabled) { @@ -290,6 +323,10 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << "CUDAService fully initialized"; enabled_ = true; + + // Preallocate buffers if asked to + devicePreallocate(*this, 
numberOfDevices_, allocator.getUntrackedParameter >("devicePreallocate")); + hostPreallocate(*this, allocator.getUntrackedParameter >("hostPreallocate")); } CUDAService::~CUDAService() { @@ -328,6 +365,8 @@ void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions allocator.addUntracked("maxCachedBytes", 0)->setComment("Total storage for the allocator. 0 means no limit."); allocator.addUntracked("maxCachedFraction", 0.8)->setComment("Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken."); allocator.addUntracked("debug", false)->setComment("Enable debug prints"); + allocator.addUntracked >("devicePreallocate", std::vector{})->setComment("Preallocates buffers of given bytes on all devices"); + allocator.addUntracked >("hostPreallocate", std::vector{})->setComment("Preallocates buffers of given bytes on the host"); desc.addUntracked("allocator", allocator)->setComment("See the documentation of cub::CachingDeviceAllocator for more details."); descriptions.add("CUDAService", desc); @@ -373,6 +412,10 @@ int CUDAService::getCurrentDevice() const { struct CUDAService::Allocator { template Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward(args)...) 
{} + + void devicePreallocate(int numberOfDevices, const std::vector& bytes); + void hostPreallocate(int numberOfDevices, const std::vector& bytes); + size_t maxAllocation; cub::CachingDeviceAllocator deviceAllocator; cub::CachingHostAllocator hostAllocator; From 15c15ab4fc982bb210542fcd99ef68db40391481 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 26 Nov 2018 23:39:09 +0100 Subject: [PATCH 17/19] Fix memory problem with SiPixelFedCablingMapGPUWrapper::ModulesToUnpack --- .../interface/SiPixelFedCablingMapGPUWrapper.h | 8 +++++--- .../plugins/SiPixelRawToClusterHeterogeneous.cc | 8 ++++---- .../src/SiPixelFedCablingMapGPUWrapper.cc | 10 ++++++---- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h index e4a07e55bd3c9..50e1fb51949e3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h +++ b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h @@ -1,6 +1,8 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" @@ -33,7 +35,7 @@ class SiPixelFedCablingMapGPUWrapper { // operations on the device memory have completed. 
class ModulesToUnpack { public: - ModulesToUnpack(); + ModulesToUnpack(cuda::stream_t<>& cudaStream); ~ModulesToUnpack() = default; void fillAsync(SiPixelFedCablingMap const& cablingMap, std::set const& modules, cuda::stream_t<>& cudaStream); @@ -41,8 +43,8 @@ class SiPixelFedCablingMapGPUWrapper { const unsigned char *get() const { return modToUnpDevice.get(); } private: - cuda::memory::device::unique_ptr modToUnpDevice; - std::vector> modToUnpHost; + edm::cuda::device::unique_ptr modToUnpDevice; + edm::cuda::host::unique_ptr modToUnpHost; }; private: diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index e5e910c581a93..0c8139b8cba49 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -149,7 +149,6 @@ std::unique_ptr regions_; // GPU algo pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; - SiPixelFedCablingMapGPUWrapper::ModulesToUnpack gpuModulesToUnpack_; PixelDataFormatter::Errors errors_; bool enableTransfer_; @@ -457,13 +456,14 @@ void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, c void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { const auto buffers = initialize(ev.event(), es); + auto gpuModulesToUnpack = SiPixelFedCablingMapGPUWrapper::ModulesToUnpack(cudaStream); if (regions_) { std::set modules = *(regions_->modulesToUnpack()); - gpuModulesToUnpack_.fillAsync(*cablingMap_, modules, cudaStream); + gpuModulesToUnpack.fillAsync(*cablingMap_, modules, cudaStream); } else if(recordWatcherUpdatedSinceLastTransfer_) { // If regions_ are disabled, it is enough to fill and transfer only if cablingMap has changed - gpuModulesToUnpack_.fillAsync(*cablingMap_, std::set(), cudaStream); 
+ gpuModulesToUnpack.fillAsync(*cablingMap_, std::set(), cudaStream); recordWatcherUpdatedSinceLastTransfer_ = false; } @@ -542,7 +542,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv } // end of for loop - gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack_.get(), hgains->getGPUProductAsync(cudaStream), + gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack.get(), hgains->getGPUProductAsync(cudaStream), wordFedAppender, wordCounterGPU, fedCounter, convertADCtoElectrons, useQuality, includeErrors, enableTransfer_, debug, cudaStream); diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index df2f488fe488c..1f872f3cb7464 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -14,6 +14,7 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "Geometry/CommonDetUnit/interface/GeomDetType.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" @@ -123,10 +124,11 @@ const SiPixelFedCablingMapGPU *SiPixelFedCablingMapGPUWrapper::getGPUProductAsyn return data.cablingMapDevice; } -SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::ModulesToUnpack(): - modToUnpDevice(cuda::memory::device::make_unique(cuda::device::current::get().id(), pixelgpudetails::MAX_SIZE)), - modToUnpHost(pixelgpudetails::MAX_SIZE) +SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::ModulesToUnpack(cuda::stream_t<>& cudaStream) { + edm::Service cs; + modToUnpDevice = cs->make_device_unique(pixelgpudetails::MAX_SIZE, cudaStream); + modToUnpHost = cs->make_host_unique(pixelgpudetails::MAX_SIZE, 
cudaStream); } void SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::fillAsync(SiPixelFedCablingMap const& cablingMap, std::set const& modules, cuda::stream_t<>& cudaStream) { @@ -154,7 +156,7 @@ void SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::fillAsync(SiPixelFedCablin } } - cuda::memory::async::copy(modToUnpDevice.get(), modToUnpHost.data(), modToUnpHost.size() * sizeof(unsigned char), cudaStream.id()); + cuda::memory::async::copy(modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaStream.id()); } From 91cb38a5e895f5158cd320f84ecb48cc700aefee Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 27 Nov 2018 13:14:21 +0100 Subject: [PATCH 18/19] Wrap the exception in a try-catch block to let GDB break on it Throw and catch the "CUDAError" exception instead of creating it, to let GDB `catch throw` and break on it while debugging. --- HeterogeneousCore/CUDACore/src/GPUCuda.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc index 958abf619b2ce..154c3edf80411 100644 --- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc +++ b/HeterogeneousCore/CUDACore/src/GPUCuda.cc @@ -74,15 +74,19 @@ namespace heterogeneous { waitingTaskHolder, // copy needed for the catch block locationSetter = iEvent.locationSetter() ](cuda::stream::id_t streamId, cuda::status_t status) mutable { - if(status == cudaSuccess) { + if (status == cudaSuccess) { locationSetter(HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, deviceId)); LogTrace("GPUCuda") << " GPU kernel finished (in callback) device " << deviceId << " CUDA stream " << streamId; waitingTaskHolder.doneWaiting(nullptr); - } - else { - auto error = cudaGetErrorName(status); - auto message = cudaGetErrorString(status); - waitingTaskHolder.doneWaiting(std::make_exception_ptr(cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId 
<< " error " << error << ": " << message)); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId << " error " << error << ": " << message; + } catch(...) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } } }); } catch(...) { From ae51c9ae68e7c67299451ef1fb8b8f3a5844a4d0 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 27 Nov 2018 13:18:08 +0100 Subject: [PATCH 19/19] Improve allocator log information Improve the report about the cub::CachingDeviceAllocator configuration. Restore the previous CUDA device after reading the available memory. --- .../CUDAServices/src/CUDAService.cc | 36 ++++++++++++------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index c2327c0e5626a..0defe163c8fb4 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -285,34 +285,44 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& // create allocator auto const& allocator = config.getUntrackedParameter("allocator"); - auto binGrowth = allocator.getUntrackedParameter("binGrowth"); - auto minBin = allocator.getUntrackedParameter("minBin"); - auto maxBin = allocator.getUntrackedParameter("maxBin"); - size_t maxCachedBytes = allocator.getUntrackedParameter("maxCachedBytes"); + auto binGrowth = allocator.getUntrackedParameter("binGrowth"); + auto minBin = allocator.getUntrackedParameter("minBin"); + auto maxBin = allocator.getUntrackedParameter("maxBin"); + size_t maxCachedBytes = allocator.getUntrackedParameter("maxCachedBytes"); auto maxCachedFraction = allocator.getUntrackedParameter("maxCachedFraction"); auto debug = 
allocator.getUntrackedParameter("debug"); size_t minCachedBytes = std::numeric_limits::max(); int currentDevice; cudaCheck(cudaGetDevice(¤tDevice)); - for(int i=0; i(maxCachedFraction*freeMemory)); + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + minCachedBytes = std::min(minCachedBytes, static_cast(maxCachedFraction * freeMemory)); } - if(maxCachedBytes > 0) { + cudaCheck(cudaSetDevice(currentDevice)); + if (maxCachedBytes > 0) { minCachedBytes = std::min(minCachedBytes, maxCachedBytes); } log << "cub::CachingDeviceAllocator settings\n" << " bin growth " << binGrowth << "\n" << " min bin " << minBin << "\n" << " max bin " << maxBin << "\n" - << " resulting bins\n"; - for(auto bin = minBin; bin <= maxBin; ++bin) { - log << " " << cub::CachingDeviceAllocator::IntPow(binGrowth, bin) << " B\n"; + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = cub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1<<30) and binSize % (1<<30) == 0) { + log << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1<<20) and binSize % (1<<20) == 0) { + log << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1<<10) and binSize % (1<<10) == 0) { + log << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + log << " " << std::setw(9) << binSize << " B\n"; + } } - log << " maximum amount of cached memory " << (minCachedBytes>>20) << " MB\n"; + log << " maximum amount of cached memory: " << (minCachedBytes >> 20) << " MB\n"; allocator_ = std::make_unique(cub::CachingDeviceAllocator::IntPow(binGrowth, maxBin), binGrowth, minBin, maxBin, minCachedBytes,