diff --git a/CUDADataFormats/Common/interface/device_unique_ptr.h b/CUDADataFormats/Common/interface/device_unique_ptr.h new file mode 100644 index 0000000000000..1282c52125fa6 --- /dev/null +++ b/CUDADataFormats/Common/interface/device_unique_ptr.h @@ -0,0 +1,16 @@
+#ifndef CUDADataFormats_Common_interface_device_unique_ptr_h
+#define CUDADataFormats_Common_interface_device_unique_ptr_h
+
+#include <functional>
+#include <memory>
+
+namespace edm {
+  namespace cuda {
+    namespace device {
+      template <typename T>
+      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
+    }
+  }
+}
+
+#endif
diff --git a/CUDADataFormats/Common/interface/host_unique_ptr.h b/CUDADataFormats/Common/interface/host_unique_ptr.h new file mode 100644 index 0000000000000..c945d9b0aa027 --- /dev/null +++ b/CUDADataFormats/Common/interface/host_unique_ptr.h @@ -0,0 +1,16 @@
+#ifndef CUDADataFormats_Common_interface_host_unique_ptr_h
+#define CUDADataFormats_Common_interface_host_unique_ptr_h
+
+#include <functional>
+#include <memory>
+
+namespace edm {
+  namespace cuda {
+    namespace host {
+      template <typename T>
+      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
+    }
+  }
+}
+
+#endif
diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml new file mode 100644 index 0000000000000..21c527e7b2f0d --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + +
diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h new file mode 100644 index 0000000000000..22d9ff9d103ba --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -0,0 +1,73 @@
+#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
+#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
+
+#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
+
+#include <cuda/api_wrappers.h>
+
+class SiPixelClustersCUDA {
+public:
+  SiPixelClustersCUDA() = default;
+  explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream);
+  ~SiPixelClustersCUDA() = default;
+
+  SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete;
+  SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete;
+  SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default;
+  SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default;
+
+  uint32_t *moduleStart() { return moduleStart_d.get(); }
+  int32_t *clus() { return clus_d.get(); }
+  uint32_t *clusInModule() { return clusInModule_d.get(); }
+  uint32_t *moduleId() { return moduleId_d.get(); }
+  uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }
+
+  uint32_t const *moduleStart() const { return moduleStart_d.get(); }
+  int32_t const *clus() const { return clus_d.get(); }
+  uint32_t const *clusInModule() const { return clusInModule_d.get(); }
+  uint32_t const *moduleId() const { return moduleId_d.get(); }
+  uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }
+
+  uint32_t const *c_moduleStart() const { return moduleStart_d.get(); }
+  int32_t const *c_clus() const { return clus_d.get(); }
+  uint32_t const *c_clusInModule() const { return clusInModule_d.get(); }
+  uint32_t const *c_moduleId() const { return moduleId_d.get(); }
+  uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); }
+
+  class DeviceConstView {
+  public:
+    DeviceConstView() = default;
+
+#ifdef __CUDACC__
+    __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); }
+    __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); }
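+    // note: __ldg() reads through the read-only data cache; these accessors assume the SoA buffers are not modified by the same kernel that reads them through this view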
+    __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); }
+    __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); }
+    __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); }
+#endif
+
+    friend SiPixelClustersCUDA;
+
+  private:
+    uint32_t const *moduleStart_ = nullptr;
+    int32_t const *clus_ = nullptr;
+    uint32_t const *clusInModule_ = nullptr;
+    uint32_t const *moduleId_ = nullptr;
+    uint32_t const *clusModuleStart_ = nullptr;
+  };
+
+  DeviceConstView *view() const { return view_d.get(); }
+
+private:
+  edm::cuda::device::unique_ptr<uint32_t[]> moduleStart_d;   // index of the first pixel of each module
+  edm::cuda::device::unique_ptr<int32_t[]> clus_d;           // cluster id of each pixel
+  edm::cuda::device::unique_ptr<uint32_t[]> clusInModule_d;  // number of clusters found in each module
+  edm::cuda::device::unique_ptr<uint32_t[]> moduleId_d;      // module id of each module
+
+  // originally from rechits
+  edm::cuda::device::unique_ptr<uint32_t[]> clusModuleStart_d;
+
+  edm::cuda::device::unique_ptr<DeviceConstView> view_d;     // "me" pointer
+};
+
+#endif
diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc new file mode 100644 index 0000000000000..7363c2fd364af --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -0,0 +1,24 @@
+#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) {
+  edm::Service<CUDAService> cs;
+
+  moduleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
+  clus_d = cs->make_device_unique< int32_t[]>(feds, stream);
+  clusInModule_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
+  moduleId_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
+  clusModuleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
+
+  auto view = cs->make_host_unique<DeviceConstView>(stream);
+  view->moduleStart_ = moduleStart_d.get();
+  view->clus_ = clus_d.get();
+  view->clusInModule_ = clusInModule_d.get();
+  view->moduleId_ = moduleId_d.get();
+  view->clusModuleStart_ = clusModuleStart_d.get();
+
+  view_d = cs->make_device_unique<DeviceConstView>(stream);
+  cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
+}
diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml new file mode 100644 index 0000000000000..259aa9f08d054 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + +
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h new file mode 100644 index 0000000000000..25e8b54a743c2 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -0,0 +1,65 @@
+#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
+#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
+
+#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
+#include "FWCore/Utilities/interface/propagate_const.h"
+
+#include <cuda/api_wrappers.h>
+
+class SiPixelDigisCUDA {
+public:
+  SiPixelDigisCUDA() = default;
+  explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream);
+  ~SiPixelDigisCUDA() = default;
+
+  SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete;
+  SiPixelDigisCUDA& operator=(const SiPixelDigisCUDA&) = delete;
+  SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default;
+  SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default;
+
+  uint16_t * xx() { return xx_d.get(); }
+  uint16_t * yy() { return yy_d.get(); }
+  uint16_t * adc() { return adc_d.get(); }
+  uint16_t * moduleInd() { return moduleInd_d.get(); }
+
+  uint16_t const *xx() const { return xx_d.get(); }
+  uint16_t const *yy() const { return yy_d.get(); }
+  uint16_t const *adc() const { return adc_d.get(); }
+  uint16_t const *moduleInd() const { return moduleInd_d.get(); }
+
+  uint16_t const *c_xx() const { return xx_d.get(); }
+  uint16_t const *c_yy() const { return yy_d.get(); }
+  uint16_t const *c_adc() const { return adc_d.get(); }
+  uint16_t const *c_moduleInd() const { return moduleInd_d.get(); }
+
+  class DeviceConstView {
+  public:
+    DeviceConstView() = default;
+
+#ifdef __CUDACC__
+    __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_+i); }
+    __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); }
+    __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); }
+    __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); }
+#endif
+
+    friend class SiPixelDigisCUDA;
+
+  private:
+    uint16_t const *xx_ = nullptr;
+    uint16_t const *yy_ = nullptr;
+    uint16_t const *adc_ = nullptr;
+    uint16_t const *moduleInd_ = nullptr;
+  };
+
+  const DeviceConstView *view() const { return view_d.get(); }
+
+private:
+  edm::cuda::device::unique_ptr<uint16_t[]> xx_d;        // local coordinates of each pixel
+  edm::cuda::device::unique_ptr<uint16_t[]> yy_d;        //
+  edm::cuda::device::unique_ptr<uint16_t[]> adc_d;       // ADC of each pixel
+  edm::cuda::device::unique_ptr<uint16_t[]> moduleInd_d; // module id of each pixel
+  edm::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
+};
+
+#endif
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc new file mode 100644 index 0000000000000..5ba2e920e9b04 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -0,0 +1,24 @@
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include
+
+SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) {
+  edm::Service<CUDAService> cs;
+
+  xx_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
+  yy_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
+  adc_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
+  moduleInd_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
+
+  auto view = cs->make_host_unique<DeviceConstView>(stream);
+  view->xx_ = xx_d.get();
+  view->yy_ = yy_d.get();
+  view->adc_ = adc_d.get();
+  view->moduleInd_ = moduleInd_d.get();
+
+  view_d = cs->make_device_unique<DeviceConstView>(stream);
+  cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
+}
diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc index 958abf619b2ce..154c3edf80411 100644 --- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc +++ b/HeterogeneousCore/CUDACore/src/GPUCuda.cc @@ -74,15 +74,19 @@ namespace heterogeneous {
                     waitingTaskHolder, // copy needed for the catch block
                     locationSetter = iEvent.locationSetter()
                    ](cuda::stream::id_t streamId, cuda::status_t status) mutable {
-      if(status == cudaSuccess) {
+      if (status == cudaSuccess) {
         locationSetter(HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, deviceId));
         LogTrace("GPUCuda") << " GPU kernel finished (in callback) device " << deviceId << " CUDA stream " << streamId;
         waitingTaskHolder.doneWaiting(nullptr);
-      }
-      else {
-        auto error = cudaGetErrorName(status);
-        auto message = cudaGetErrorString(status);
-        waitingTaskHolder.doneWaiting(std::make_exception_ptr(cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId << " error " << error << ": " << message));
+      } else {
+        // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+        try {
+          auto error = cudaGetErrorName(status);
+          auto message = cudaGetErrorString(status);
+          throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId << " error " << error << ": " << message;
+        } catch(...) {
+          waitingTaskHolder.doneWaiting(std::current_exception());
+        }
       }
     });
   } catch(...) {
diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml index c7232e16d910b..61572b96fb26e 100644 --- a/HeterogeneousCore/CUDAServices/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/BuildFile.xml @@ -2,8 +2,10 @@ + +
diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index e359a5018813a..feeb557042548 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -4,14 +4,35 @@
 #include
 #include
+#include <memory>
+
 #include "FWCore/Utilities/interface/StreamID.h"
+#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
+#include "CUDADataFormats/Common/interface/host_unique_ptr.h"
+
 namespace edm {
   class ParameterSet;
   class ActivityRegistry;
   class ConfigurationDescriptions;
 }
+
+namespace cudaserviceimpl {
+  template <typename T>
+  struct make_device_unique_selector { using non_array = edm::cuda::device::unique_ptr<T>; };
+  template <typename T>
+  struct make_device_unique_selector<T[]> { using unbounded_array = edm::cuda::device::unique_ptr<T[]>; };
+  template <typename T, size_t N>
+  struct make_device_unique_selector<T[N]> { struct bounded_array {}; };
+
+  template <typename T>
+  struct make_host_unique_selector { using non_array = edm::cuda::host::unique_ptr<T>; };
+  template <typename T>
+  struct make_host_unique_selector<T[]> { using unbounded_array = edm::cuda::host::unique_ptr<T[]>; };
+  template <typename T, size_t N>
+  struct make_host_unique_selector<T[N]> { struct bounded_array {}; };
+}
+
 /**
  * TODO:
  * - CUDA stream management?
@@ -47,7 +68,73 @@ class CUDAService {
   // Get the current device
   int getCurrentDevice() const;
 
+  // Allocate device memory
+  template <typename T>
+  typename cudaserviceimpl::make_device_unique_selector<T>::non_array
+  make_device_unique(cuda::stream_t<>& stream) {
+    int dev = getCurrentDevice();
+    void *mem = allocate_device(dev, sizeof(T), stream);
+    return typename cudaserviceimpl::make_device_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
+                                                                               [this, dev](void *ptr) {
+                                                                                 this->free_device(dev, ptr);
+                                                                               });
+  }
+
+  template <typename T>
+  typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array
+  make_device_unique(size_t n, cuda::stream_t<>& stream) {
+    int dev = getCurrentDevice();
+    using element_type = typename std::remove_extent<T>::type;
+    void *mem = allocate_device(dev, n*sizeof(element_type), stream);
+    return typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
+                                                                                     [this, dev](void *ptr) {
+                                                                                       this->free_device(dev, ptr);
+                                                                                     });
+  }
+
+  template <typename T, typename ...Args>
+  typename cudaserviceimpl::make_device_unique_selector<T>::bounded_array
+  make_device_unique(Args&&...) = delete;
+
+  // Allocate pinned host memory
+  template <typename T>
+  typename cudaserviceimpl::make_host_unique_selector<T>::non_array
+  make_host_unique(cuda::stream_t<>& stream) {
+    void *mem = allocate_host(sizeof(T), stream);
+    return typename cudaserviceimpl::make_host_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
+                                                                             [this](void *ptr) {
+                                                                               this->free_host(ptr);
+                                                                             });
+  }
+
+  template <typename T>
+  typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array
+  make_host_unique(size_t n, cuda::stream_t<>& stream) {
+    using element_type = typename std::remove_extent<T>::type;
+    void *mem = allocate_host(n*sizeof(element_type), stream);
+    return typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
+                                                                                   [this](void *ptr) {
+                                                                                     this->free_host(ptr);
+                                                                                   });
+  }
+
+  template <typename T, typename ...Args>
+  typename cudaserviceimpl::make_host_unique_selector<T>::bounded_array
+  make_host_unique(Args&&...) = delete;
+
+  // Free device memory (to be called from unique_ptr)
+  void free_device(int device, void *ptr);
+
+  // Free pinned host memory (to be called from unique_ptr)
+  void free_host(void *ptr);
+
 private:
+  // PIMPL to hide details of allocator
+  struct Allocator;
+  std::unique_ptr<Allocator> allocator_;
+  void *allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream);
+  void *allocate_host(size_t nbytes, cuda::stream_t<>& stream);
+
   int numberOfDevices_ = 0;
   unsigned int numberOfStreamsTotal_ = 0;
   std::vector<std::pair<int, int>> computeCapabilities_;
diff --git a/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py new file mode 100755 index 0000000000000..331ddd30f73bd --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py @@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import re
+import sys
+import argparse
+
+def main(opts):
+    device = []
+    host = []
+
+    device_re = re.compile("Device.*allocated new device block.*\((?P<bytes>\d+) bytes")
+    host_re = re.compile("Host.*allocated new host block.*\((?P<bytes>\d+) bytes")
+
+    f = open(opts.file)
+    for line in f:
+        m = device_re.search(line)
+        if m:
+            device.append(m.group("bytes"))
+            continue
+        m = host_re.search(line)
+        if m:
+            host.append(m.group("bytes"))
+    f.close()
+
+    print("process.CUDAService.allocator.devicePreallocate = cms.untracked.vuint32(%s)" % ",".join(device))
+    print("process.CUDAService.allocator.hostPreallocate = cms.untracked.vuint32(%s)" % ",".join(host))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="""Extract CUDAService preallocation parameters from a log file.
+
+To use, run the job once with "process.CUDAService.allocator.debug =
+True" and direct the output to a file.
Then run this script by passing +the file as an argument, and copy the output of this script back to +the configuration file.""") + parser.add_argument("file", type=str, help="Log file to parse") + opts = parser.parse_args() + main(opts) diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 84aebd58648f3..0defe163c8fb4 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -1,9 +1,12 @@ #include #include +#include #include #include +#include + #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" @@ -11,6 +14,8 @@ #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CachingHostAllocator.h" + void setCudaLimit(cudaLimit limit, const char* name, size_t request) { // read the current device int device; @@ -75,7 +80,40 @@ unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) { } } +namespace { + template typename UniquePtr, + typename Allocate> + void preallocate(Allocate allocate, const std::vector& bufferSizes) { + auto current_device = cuda::device::current::get(); + auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + std::vector > buffers; + buffers.reserve(bufferSizes.size()); + for(auto size: bufferSizes) { + buffers.push_back(allocate(size, stream)); + } + } + + void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector& bufferSizes) { + int device; + cudaCheck(cudaGetDevice(&device)); + for(int i=0; i([&](size_t size, cuda::stream_t<>& stream) { + return cs.make_device_unique(size, stream); + }, bufferSizes); + } + cudaCheck(cudaSetDevice(device)); + } + + void hostPreallocate(CUDAService& cs, const std::vector& bufferSizes) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { + return cs.make_host_unique(size, stream); + }, bufferSizes); + } +} +/// Constructor CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& iRegistry) { bool configEnabled = config.getUntrackedParameter("enabled"); if (not configEnabled) { @@ -245,12 +283,67 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << '\n'; } + // create allocator + auto const& allocator = config.getUntrackedParameter("allocator"); + auto binGrowth = allocator.getUntrackedParameter("binGrowth"); + auto minBin = allocator.getUntrackedParameter("minBin"); + auto maxBin = allocator.getUntrackedParameter("maxBin"); + size_t maxCachedBytes = allocator.getUntrackedParameter("maxCachedBytes"); + auto maxCachedFraction = allocator.getUntrackedParameter("maxCachedFraction"); + auto debug = allocator.getUntrackedParameter("debug"); + + size_t minCachedBytes = std::numeric_limits::max(); + int currentDevice; + cudaCheck(cudaGetDevice(¤tDevice)); + for (int i = 0; i < numberOfDevices_; ++i) { + size_t freeMemory, totalMemory; + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + minCachedBytes = std::min(minCachedBytes, static_cast(maxCachedFraction * freeMemory)); + } + cudaCheck(cudaSetDevice(currentDevice)); + if (maxCachedBytes > 0) { + minCachedBytes = std::min(minCachedBytes, maxCachedBytes); + } + log << "cub::CachingDeviceAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin 
" << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = cub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1<<30) and binSize % (1<<30) == 0) { + log << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1<<20) and binSize % (1<<20) == 0) { + log << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1<<10) and binSize % (1<<10) == 0) { + log << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + log << " " << std::setw(9) << binSize << " B\n"; + } + } + log << " maximum amount of cached memory: " << (minCachedBytes >> 20) << " MB\n"; + + allocator_ = std::make_unique(cub::CachingDeviceAllocator::IntPow(binGrowth, maxBin), + binGrowth, minBin, maxBin, minCachedBytes, + false, // do not skip cleanup + debug + ); + log << "\n"; + log << "CUDAService fully initialized"; enabled_ = true; + + // Preallocate buffers if asked to + devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter >("devicePreallocate")); + hostPreallocate(*this, allocator.getUntrackedParameter >("hostPreallocate")); } CUDAService::~CUDAService() { if (enabled_) { + // Explicitly destruct the allocator before the device resets below + allocator_.reset(); + for (int i = 0; i < numberOfDevices_; ++i) { cudaCheck(cudaSetDevice(i)); cudaCheck(cudaDeviceSynchronize()); @@ -275,6 +368,17 @@ void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions limits.addUntracked("cudaLimitDevRuntimePendingLaunchCount", -1)->setComment("Maximum number of outstanding device runtime launches that can be made from the current device."); desc.addUntracked("limits", limits)->setComment("See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps the default value."); + edm::ParameterSetDescription allocator; + allocator.addUntracked("binGrowth", 8)->setComment("Growth factor (bin_growth in cub::CachingDeviceAllocator"); + allocator.addUntracked("minBin", 1)->setComment("Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator"); + allocator.addUntracked("maxBin", 9)->setComment("Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail."); + allocator.addUntracked("maxCachedBytes", 0)->setComment("Total storage for the allocator. 0 means no limit."); + allocator.addUntracked("maxCachedFraction", 0.8)->setComment("Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. 
If maxCachedBytes is non-zero, the smallest of them is taken."); + allocator.addUntracked("debug", false)->setComment("Enable debug prints"); + allocator.addUntracked >("devicePreallocate", std::vector{})->setComment("Preallocates buffers of given bytes on all devices"); + allocator.addUntracked >("hostPreallocate", std::vector{})->setComment("Preallocates buffers of given bytes on the host"); + desc.addUntracked("allocator", allocator)->setComment("See the documentation of cub::CachingDeviceAllocator for more details."); + descriptions.add("CUDAService", desc); } @@ -312,3 +416,45 @@ void CUDAService::setCurrentDevice(int device) const { int CUDAService::getCurrentDevice() const { return cuda::device::current::get().id(); } + + +// allocator +struct CUDAService::Allocator { + template + Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward(args)...) {} + + void devicePreallocate(int numberOfDevices, const std::vector& bytes); + void hostPreallocate(int numberOfDevices, const std::vector& bytes); + + size_t maxAllocation; + cub::CachingDeviceAllocator deviceAllocator; + cub::CachingHostAllocator hostAllocator; +}; + +void *CUDAService::allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream) { + if(nbytes > allocator_->maxAllocation) { + throw std::runtime_error("Tried to allocate "+std::to_string(nbytes)+" bytes, but the allocator maximum is "+std::to_string(allocator_->maxAllocation)); + } + + void *ptr = nullptr; + cuda::throw_if_error(allocator_->deviceAllocator.DeviceAllocate(dev, &ptr, nbytes, stream.id())); + return ptr; +} + +void CUDAService::free_device(int device, void *ptr) { + allocator_->deviceAllocator.DeviceFree(device, ptr); +} + +void *CUDAService::allocate_host(size_t nbytes, cuda::stream_t<>& stream) { + if(nbytes > allocator_->maxAllocation) { + throw std::runtime_error("Tried to allocate "+std::to_string(nbytes)+" bytes, but the allocator maximum is "+std::to_string(allocator_->maxAllocation)); + } + + void *ptr = nullptr; + cuda::throw_if_error(allocator_->hostAllocator.HostAllocate(&ptr, nbytes, stream.id())); + return ptr; +} + +void CUDAService::free_host(void *ptr) { + allocator_->hostAllocator.HostFree(ptr); +} diff --git a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h new file mode 100644 index 0000000000000..97be93c79131a --- /dev/null +++ b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h @@ -0,0 +1,635 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * Modified to cache pinned host allocations by Matti Kortelainen + */ + +/****************************************************************************** + * Simple caching allocator for pinned host memory allocations. The allocator is + * thread-safe. + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/****************************************************************************** + * CachingHostAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator pinned host memory allocations. + * + * \par Overview + * The allocator is thread-safe. It behaves as follows: + * + * I presume the CUDA stream-safeness is not useful as to read/write + * from/to the pinned host memory one needs to synchronize anyway. The + * difference wrt. device memory is that in the CPU all operations to + * the device memory are scheduled via the CUDA stream, while for the + * host memory one can perform operations directly. + * + * \par + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused host allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations will exceed + * \p max_cached_bytes, allocations are simply freed when they are + * deallocated instead of being returned to their bin-cache. 
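+ * - A single pool of cached blocks is shared by all devices: reuse is keyed only on the size bin, and the device ordinal stored in a block is used only to select the correct device when recording or destroying its ready event.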
+ * + * \par + * For example, the default-constructed CachingHostAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes + * + */ +struct CachingHostAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for pinned host memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Host pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer) + BlockDescriptor(void *d_ptr) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN), + device(INVALID_DEVICE_ORDINAL), + associated_stream(0), + ready_event(0) + {} + + // Constructor (suitable for searching maps for a range of suitable blocks) + BlockDescriptor() : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN), + device(INVALID_DEVICE_ORDINAL), + associated_stream(0), + ready_event(0) + {} + + // Comparison functor for comparing host pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.d_ptr < b.d_ptr); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.bytes < b.bytes); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; 
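+ // note: size_t(0) - 1 wraps around to SIZE_MAX, i.e. an over-large request saturates to the largest representable size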
+ return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + cub::Mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + TotalBytes cached_bytes; /// Aggregate cached bytes + CachedBlocks cached_blocks; /// Set of cached pinned host allocations available for reuse + BusyBlocks live_blocks; /// Set of live pinned host allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingHostAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes + */ + CachingHostAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). 
+ */ + void SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.Lock(); + + if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.Unlock(); + } + + + /** + * \brief Provides a suitable allocation of pinned host memory for the given size. + * + * Once freed, the allocation becomes available immediately for reuse. + */ + cudaError_t HostAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (CubDebug(error = cudaGetDevice(&device))) return error; + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key; + search_key.device = device; + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.Lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use for transfers, only consider cached blocks that are from an idle stream + if(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) { + // Reuse existing cache block. Insert into live blocks. 
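+ // (cudaEventQuery() returns cudaSuccess only once the ready_event recorded in HostFree() has completed, i.e. the stream that last used this pinned block has finished with it)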
+ found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes.free -= search_key.bytes; + cached_bytes.live += search_key.bytes; + + if (debug) _CubLog("\tHost reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.Unlock(); + } + + // Allocate the block if necessary + if (!found) + { + // Attempt to allocate + // TODO: eventually support allocation flags + if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == cudaErrorMemoryAllocation) + { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) _CubLog("\tHost failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + (long long) search_key.bytes, (long long) search_key.associated_stream); + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.Lock(); + + // Iterate the range of free blocks + CachedBlocks::iterator block_itr = cached_blocks.begin(); + + while ((block_itr != cached_blocks.end())) + { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free pinned host memory. + if (CubDebug(error = cudaFreeHost(block_itr->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes.free -= block_itr->bytes; + + if (debug) _CubLog("\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.Unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault))) return error; + } + + // Create ready event + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.Lock(); + live_blocks.insert(search_key); + cached_bytes.live += search_key.bytes; + mutex.Unlock(); + + if (debug) _CubLog("\tHost allocated new host block at %p (%lld bytes associated with stream %lld).\n", + search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + } + + // Copy host pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + + return error; + } + + + /** + * \brief Frees a live allocation of pinned host memory, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse. 
+ */ + cudaError_t HostFree( + void* d_ptr) + { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + // Lock + mutex.Lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes.live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes.free += search_key.bytes; + + if (debug) _CubLog("\tHost returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + } + } + + // Unlock + mutex.Unlock(); + + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (entrypoint_device != search_key.device) { + if (CubDebug(error = cudaSetDevice(search_key.device))) return error; + } + + if (recached) { + // Insert the ready event in the associated stream (must have current device set properly) + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + } + else + { + // Free the allocation from the runtime and cleanup the event. + if (CubDebug(error = cudaFreeHost(d_ptr))) return error; + if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + + if (debug) _CubLog("\tHost freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Frees all cached pinned host allocations + */ + cudaError_t FreeAllCached() + { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.Lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) + { + if (CubDebug(error = cudaSetDevice(begin->device))) break; + current_device = begin->device; + } + + // Free host memory + if (CubDebug(error = cudaFreeHost(begin->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes.free -= begin->bytes; + + if (debug) _CubLog("\tHost freed %lld bytes.\n\t\t %lld available blocks cached 
(%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + + cached_blocks.erase(begin); + } + + mutex.Unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Destructor + */ + ~CachingHostAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index dc1b01c2db9fe..d0a1afcc8203f 100644 --- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -5,6 +5,7 @@ #include #include +#include #include "catch.hpp" @@ -169,6 +170,73 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { } + SECTION("Device allocator") { + edm::ParameterSet ps; + ps.addUntrackedParameter("enabled", true); + edm::ParameterSet alloc; + alloc.addUntrackedParameter("minBin", 1U); + alloc.addUntrackedParameter("maxBin", 3U); + ps.addUntrackedParameter("allocator", alloc); + auto cs = makeCUDAService(ps, ar); + cs.setCurrentDevice(0); + auto current_device = cuda::device::current::get(); + auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + SECTION("Destructor") { + auto ptr = cs.make_device_unique(cudaStream); + REQUIRE(ptr.get() != nullptr); + cudaStream.synchronize(); + } + + SECTION("Reset") { + auto ptr = cs.make_device_unique(5, cudaStream); + REQUIRE(ptr.get() != nullptr); + cudaStream.synchronize(); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); + } + + SECTION("Allocating too much") { + auto ptr = cs.make_device_unique(512, cudaStream); + ptr.reset(); + REQUIRE_THROWS(ptr = cs.make_device_unique(513, cudaStream)); + } + } + + + SECTION("Host allocator") { + edm::ParameterSet ps; + ps.addUntrackedParameter("enabled", true); + edm::ParameterSet alloc; + alloc.addUntrackedParameter("minBin", 1U); + alloc.addUntrackedParameter("maxBin", 3U); + ps.addUntrackedParameter("allocator", alloc); + auto cs = makeCUDAService(ps, ar); + cs.setCurrentDevice(0); + auto current_device = cuda::device::current::get(); + auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + SECTION("Destructor") { + auto ptr = cs.make_host_unique(cudaStream); + REQUIRE(ptr.get() != nullptr); + } + + SECTION("Reset") { + auto ptr = cs.make_host_unique(5, cudaStream); + REQUIRE(ptr.get() != nullptr); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); + } + + SECTION("Allocating too much") { + auto ptr = cs.make_host_unique(512, cudaStream); + ptr.reset(); + REQUIRE_THROWS(ptr = cs.make_host_unique(513, cudaStream)); + } + } + //Fake the end-of-job signal. 
ar.postEndJobSignal_(); } diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h index e4a07e55bd3c9..50e1fb51949e3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h +++ b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h @@ -1,6 +1,8 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" @@ -33,7 +35,7 @@ class SiPixelFedCablingMapGPUWrapper { // operations on the device memory have completed. class ModulesToUnpack { public: - ModulesToUnpack(); + ModulesToUnpack(cuda::stream_t<>& cudaStream); ~ModulesToUnpack() = default; void fillAsync(SiPixelFedCablingMap const& cablingMap, std::set const& modules, cuda::stream_t<>& cudaStream); @@ -41,8 +43,8 @@ class SiPixelFedCablingMapGPUWrapper { const unsigned char *get() const { return modToUnpDevice.get(); } private: - cuda::memory::device::unique_ptr modToUnpDevice; - std::vector> modToUnpHost; + edm::cuda::device::unique_ptr modToUnpDevice; + edm::cuda::host::unique_ptr modToUnpHost; }; private: diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml index 9db4a46f367b3..40a489f763397 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml @@ -7,6 +7,8 @@ + + diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 7bd6eac473cc7..dc768ce8f643d 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -31,6 +31,8 @@ #include // CMSSW includes +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" @@ -48,114 +50,17 @@ namespace pixelgpudetails { // number of words for all the FEDs constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - constexpr uint32_t MAX_WORD08_SIZE = MAX_FED_WORDS * sizeof(uint8_t); - constexpr uint32_t MAX_WORD32_SIZE = MAX_FED_WORDS * sizeof(uint32_t); - constexpr uint32_t MAX_WORD16_SIZE = MAX_FED_WORDS * sizeof(uint16_t); constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize; - SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { - - cudaCheck(cudaMallocHost(&word, MAX_FED_WORDS * sizeof(unsigned int))); - cudaCheck(cudaMallocHost(&fedId_h, MAX_FED_WORDS * sizeof(unsigned char))); - - // to store the output of RawToDigi - cudaCheck(cudaMallocHost(&pdigi_h, MAX_FED_WORDS * sizeof(uint32_t))); - cudaCheck(cudaMallocHost(&rawIdArr_h, MAX_FED_WORDS * sizeof(uint32_t))); - - 
cudaCheck(cudaMallocHost(&adc_h, MAX_FED_WORDS * sizeof(uint16_t))); - cudaCheck(cudaMallocHost(&clus_h, MAX_FED_WORDS * sizeof(int32_t))); - - cudaCheck(cudaMallocHost(&error_h, vsize)); - cudaCheck(cudaMallocHost(&error_h_tmp, vsize)); - cudaCheck(cudaMallocHost(&data_h, MAX_ERROR_SIZE)); - - cudaCheck(cudaMalloc((void**) & word_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & fedId_d, MAX_WORD08_SIZE)); - cudaCheck(cudaMalloc((void**) & pdigi_d, MAX_WORD32_SIZE)); // to store thepacked digi - cudaCheck(cudaMalloc((void**) & xx_d, MAX_WORD16_SIZE)); // to store the x and y coordinate - cudaCheck(cudaMalloc((void**) & yy_d, MAX_WORD16_SIZE)); - cudaCheck(cudaMalloc((void**) & adc_d, MAX_WORD16_SIZE)); - - cudaCheck(cudaMalloc((void**) & moduleInd_d, MAX_WORD16_SIZE)); - cudaCheck(cudaMalloc((void**) & rawIdArr_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & error_d, vsize)); - cudaCheck(cudaMalloc((void**) & data_d, MAX_ERROR_SIZE)); - cudaCheck(cudaMemset(data_d, 0x00, MAX_ERROR_SIZE)); - - // for the clusterizer - cudaCheck(cudaMalloc((void**) & clus_d, MAX_WORD32_SIZE)); // cluser index in module - - using namespace gpuClustering; - cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); - cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) )); - cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) )); - - new (error_h) GPU::SimpleVector(MAX_FED_WORDS, data_h); - new (error_h_tmp) GPU::SimpleVector(MAX_FED_WORDS, data_d); - assert(error_h->size() == 0); - assert(error_h->capacity() == static_cast(MAX_FED_WORDS)); - assert(error_h_tmp->size() == 0); - assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); - - // Need these in pinned memory to be truly asynchronous - cudaCheck(cudaMallocHost(&nModulesActive, sizeof(uint32_t))); - cudaCheck(cudaMallocHost(&nClusters, sizeof(uint32_t))); - - cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct))); - gpuProduct = getProduct(); - assert(xx_d==gpuProduct.xx_d); - - cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id())); - - // originally from rechits - cudaCheck(cudaMalloc((void**) & clusModuleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); - uint32_t *tmp = nullptr; - cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); - cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize)); + SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) { + edm::Service cs; + word_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); + fedId_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); } - SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { - // free the host memory - cudaCheck(cudaFreeHost(word)); - cudaCheck(cudaFreeHost(fedId_h)); - cudaCheck(cudaFreeHost(pdigi_h)); - cudaCheck(cudaFreeHost(rawIdArr_h)); - cudaCheck(cudaFreeHost(adc_h)); - cudaCheck(cudaFreeHost(clus_h)); - cudaCheck(cudaFreeHost(error_h)); - cudaCheck(cudaFreeHost(error_h_tmp)); - cudaCheck(cudaFreeHost(data_h)); - cudaCheck(cudaFreeHost(nModulesActive)); - cudaCheck(cudaFreeHost(nClusters)); - - // free device memory used for RawToDigi on GPU - // free the GPU memory - cudaCheck(cudaFree(word_d)); - cudaCheck(cudaFree(fedId_d)); - cudaCheck(cudaFree(pdigi_d)); - cudaCheck(cudaFree(xx_d)); - cudaCheck(cudaFree(yy_d)); - cudaCheck(cudaFree(adc_d)); - cudaCheck(cudaFree(moduleInd_d)); - cudaCheck(cudaFree(rawIdArr_d)); - 
cudaCheck(cudaFree(error_d)); - cudaCheck(cudaFree(data_d)); - - // these are for the clusterizer - cudaCheck(cudaFree(moduleStart_d)); - cudaCheck(cudaFree(clus_d)); - cudaCheck(cudaFree(clusInModule_d)); - cudaCheck(cudaFree(moduleId_d)); - cudaCheck(cudaFree(gpuProduct_d)); - - // originally from rechits - cudaCheck(cudaFree(tempScanStorage_d)); - cudaCheck(cudaFree(clusModuleStart_d)); - } - - void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { - std::memcpy(word+wordCounterGPU, src, sizeof(cms_uint32_t)*length); - std::memset(fedId_h+wordCounterGPU/2, fedId - 1200, length/2); + void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { + std::memcpy(word_.get()+wordCounterGPU, src, sizeof(cms_uint32_t)*length); + std::memset(fedId_.get()+wordCounterGPU/2, fedId - 1200, length/2); } //////////////////// @@ -613,6 +518,7 @@ namespace pixelgpudetails { const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, + const WordFedAppender& wordFed, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug, @@ -620,52 +526,82 @@ namespace pixelgpudetails { { nDigis = wordCounter; - const int threadsPerBlock = 512; - const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all - - assert(0 == wordCounter%2); - // wordCounter is the total no of words in each event to be trasfered on device - cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(&fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(error_d, error_h_tmp, vsize, cudaMemcpyDefault, stream.id())); - - // Launch rawToDigi kernel - RawToDigi_kernel<<>>( - cablingMap, - modToUnp, - wordCounter, - word_d, - fedId_d, - xx_d, yy_d, adc_d, - pdigi_d, - rawIdArr_d, - moduleInd_d, - error_d, - useQualityInfo, - includeErrors, - debug); - cudaCheck(cudaGetLastError()); - - // copy data to host variable - if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - if (includeErrors) { - cudaCheck(cudaMemcpyAsync(error_h, error_d, vsize, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(data_h, data_d, MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); - // If we want to transfer only the minimal amount of data, we - // need a synchronization point. A single ExternalWork (of - // SiPixelRawToClusterHeterogeneous) does not help because it is - // already used to synchronize the data movement. So we'd need - // two ExternalWorks (or explicit use of TBB tasks). The - // prototype of #100 would allow this easily (as there would be - // two ExternalWorks). 
- // - //error_h->set_data(data_h); - //cudaCheck(cudaStreamSynchronize(stream.id())); - //int size = error_h->size(); - //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id())); + constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; + digis_d = SiPixelDigisCUDA(MAX_FED_WORDS, stream); + clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream); + + edm::Service cs; + digis_clusters_h.nModules_Clusters = cs->make_host_unique(2, stream); + + { + const int threadsPerBlock = 512; + const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all + + assert(0 == wordCounter%2); + // wordCounter is the total number of words in each event to be transferred to the device + auto word_d = cs->make_device_unique(wordCounter, stream); + auto fedId_d = cs->make_device_unique(wordCounter, stream); + + auto error_d = cs->make_device_unique>(stream); + auto data_d = cs->make_device_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id())); + auto error_h_tmp = cs->make_host_unique>(stream); + new (error_h_tmp.get()) GPU::SimpleVector(MAX_FED_WORDS, data_d.get()); // should make_host_unique() call the constructor as well? Note that even if std::make_unique does that, we can't do that in make_device_unique + assert(error_h_tmp->size() == 0); + assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); + + cudaCheck(cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(fedId_d.get(), wordFed.fedId(), wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp.get(), vsize, cudaMemcpyDefault, stream.id())); + + auto pdigi_d = cs->make_device_unique(wordCounter, stream); + auto rawIdArr_d = cs->make_device_unique(wordCounter, stream); + + // Launch rawToDigi kernel + RawToDigi_kernel<<>>( + cablingMap, + modToUnp, + wordCounter, + word_d.get(), + fedId_d.get(), + digis_d.xx(), digis_d.yy(), digis_d.adc(), + pdigi_d.get(), + rawIdArr_d.get(), + digis_d.moduleInd(), + error_d.get(), + useQualityInfo, + includeErrors, + debug); + cudaCheck(cudaGetLastError()); + + // copy data to host variable + if(transferToCPU) { + digis_clusters_h.pdigi = cs->make_host_unique(MAX_FED_WORDS, stream); + digis_clusters_h.rawIdArr = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.pdigi.get(), pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.rawIdArr.get(), rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + + if (includeErrors) { + digis_clusters_h.data = cs->make_host_unique(MAX_FED_WORDS, stream); + digis_clusters_h.error = cs->make_host_unique>(stream); + new (digis_clusters_h.error.get()) GPU::SimpleVector(MAX_FED_WORDS, digis_clusters_h.data.get()); + assert(digis_clusters_h.error->size() == 0); + assert(digis_clusters_h.error->capacity() == static_cast(MAX_FED_WORDS)); + + cudaCheck(cudaMemcpyAsync(digis_clusters_h.error.get(), error_d.get(), vsize, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); + // If we want to transfer only the minimal amount of data, we + // need a synchronization point. 
A single ExternalWork (of + // SiPixelRawToClusterHeterogeneous) does not help because it is + // already used to synchronize the data movement. So we'd need + // two ExternalWorks (or explicit use of TBB tasks). The + // prototype of #100 would allow this easily (as there would be + // two ExternalWorks). + // + //cudaCheck(cudaStreamSynchronize(stream.id())); + //int size = digis_clusters_h.error->size(); + //cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), size*esize, cudaMemcpyDefault, stream.id())); + } } } // End of Raw2Digi and passing data for cluserisation @@ -677,15 +613,16 @@ namespace pixelgpudetails { int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; gpuCalibPixel::calibDigis<<>>( - moduleInd_d, - xx_d, yy_d, adc_d, + digis_d.moduleInd(), + digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(), gains, wordCounter); cudaCheck(cudaGetLastError()); // calibrated adc if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + digis_clusters_h.adc = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.adc.get(), digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); } #ifdef GPU_DEBUG @@ -694,13 +631,13 @@ namespace pixelgpudetails { << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id())); - countModules<<>>(moduleInd_d, moduleStart_d, clus_d, wordCounter); + countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync(nModulesActive, moduleStart_d, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); threadsPerBlock = 256; blocks = MaxNumModules; @@ -708,23 +645,23 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.clusInModule(), 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( - moduleInd_d, - xx_d, yy_d, - moduleStart_d, - clusInModule_d, moduleId_d, - clus_d, + digis_d.c_moduleInd(), + digis_d.c_xx(), digis_d.c_yy(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), clusters_d.moduleId(), + clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // apply charge cut clusterChargeCut<<>>( - moduleInd_d, - adc_d, - moduleStart_d, - clusInModule_d, moduleId_d, - clus_d, + digis_d.moduleInd(), + digis_d.c_adc(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), clusters_d.c_moduleId(), + clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -735,19 +672,27 @@ namespace pixelgpudetails { // available in the rechit producer without additional points of // synchronization/ExternalWork // + // Temporary storage + size_t tempScanStorageSize = 0; + { + uint32_t *tmp = nullptr; + cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); + } + auto tempScanStorage_d = cs->make_device_unique(tempScanStorageSize, stream); // Set 
first the first element to 0 - cudaCheck(cudaMemsetAsync(clusModuleStart_d, 0, sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id())); // Then use inclusive_scan to get the partial sum to the rest - cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize, - clusInModule_d, &clusModuleStart_d[1], gpuClustering::MaxNumModules, + cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d.get(), tempScanStorageSize, + clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, stream.id())); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(nClusters, clusModuleStart_d+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); // clusters if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + digis_clusters_h.clus = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.clus.get(), clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); } } // end clusterizer scope } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index ca8bd73106c2c..a2d9cdda92573 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -5,6 +5,7 @@ #include #include "cuda/api_wrappers.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "siPixelRawToClusterHeterogeneousProduct.h" @@ -159,8 +160,43 @@ namespace pixelgpudetails { using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; - SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream); - ~SiPixelRawToClusterGPUKernel(); + struct CPUData { + CPUData() = default; + ~CPUData() = default; + + CPUData(const CPUData&) = delete; + CPUData& operator=(const CPUData&) = delete; + CPUData(CPUData&&) = default; + CPUData& operator=(CPUData&&) = default; + + edm::cuda::host::unique_ptr nModules_Clusters; // These should really be part of the GPU product + + edm::cuda::host::unique_ptr data; + edm::cuda::host::unique_ptr> error; + + edm::cuda::host::unique_ptr pdigi; + edm::cuda::host::unique_ptr rawIdArr; + edm::cuda::host::unique_ptr adc; + edm::cuda::host::unique_ptr clus; + }; + + class WordFedAppender { + public: + WordFedAppender(cuda::stream_t<>& cudaStream); + ~WordFedAppender() = default; + + void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); + + const unsigned int *word() const { return word_.get(); } + const unsigned char *fedId() const { return fedId_.get(); } + + private: + edm::cuda::host::unique_ptr word_; + edm::cuda::host::unique_ptr fedId_; + }; + + SiPixelRawToClusterGPUKernel() = default; + ~SiPixelRawToClusterGPUKernel() = default; SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete; @@ -168,69 +204,35 @@ namespace pixelgpudetails { SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = 
delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; - void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); - void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, + const WordFedAppender& wordFed, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug, cuda::stream_t<>& stream); - auto getProduct() { - error_h->set_data(data_h); - return siPixelRawToClusterHeterogeneousProduct::GPUProduct{ - pdigi_h, rawIdArr_h, clus_h, adc_h, error_h, - gpuProduct_d, - xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d, - clusModuleStart_d, - nDigis, *nModulesActive, *nClusters - }; + siPixelRawToClusterHeterogeneousProduct::GPUProduct getProduct() { + return siPixelRawToClusterHeterogeneousProduct::GPUProduct( + std::move(digis_d), std::move(clusters_d), + nDigis, + digis_clusters_h.nModules_Clusters[0], + digis_clusters_h.nModules_Clusters[1] + ); } - private: - // input - unsigned int *word = nullptr; // to hold input for rawtodigi - unsigned char *fedId_h = nullptr; // to hold fed index for each word - - // output - GPUProduct gpuProduct; - GPUProduct * gpuProduct_d; - - // FIXME cleanup all these are in the gpuProduct above... - - uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output - uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output - pixelgpudetails::error_obj *data_h = nullptr; - GPU::SimpleVector *error_h = nullptr; - GPU::SimpleVector *error_h_tmp = nullptr; + CPUData&& getCPUData() { + return std::move(digis_clusters_h); + } + private: uint32_t nDigis = 0; - uint32_t *nModulesActive = nullptr; - uint32_t *nClusters = nullptr; - - // scratch memory buffers - uint32_t * word_d; - uint8_t * fedId_d; - uint32_t * pdigi_d; - uint16_t * xx_d; - uint16_t * yy_d; - uint16_t * adc_d; - uint16_t * moduleInd_d; - uint32_t * rawIdArr_d; - GPU::SimpleVector * error_d; - error_obj * data_d; - - // these are for the clusterizer (to be moved) - uint32_t * moduleStart_d; - int32_t * clus_d; - uint32_t * clusInModule_d; - uint32_t * moduleId_d; + // CPU data + CPUData digis_clusters_h; - // originally in rechit, moved here - uint32_t *clusModuleStart_d = nullptr; - void *tempScanStorage_d = nullptr; - size_t tempScanStorageSize = 0; + // Data to be put in the event + SiPixelDigisCUDA digis_d; + SiPixelClustersCUDA clusters_d; }; // configuration and memory buffers alocated on the GPU diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 384a4732b32e1..0c8139b8cba49 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -110,10 +110,9 @@ class SiPixelRawToClusterHeterogeneous: public HeterogeneousEDProducer& cudaStream) override; void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void convertGPUtoCPU(edm::Event& ev, const GPUProduct& gpu) const; + void convertGPUtoCPU(edm::Event& ev, unsigned int nDigis, 
pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData) const; // Commonalities const FEDRawDataCollection *initialize(const edm::Event& ev, const edm::EventSetup& es); @@ -149,8 +148,7 @@ std::unique_ptr regions_; SiPixelGainCalibrationForHLTService theSiPixelGainCalibration_; // GPU algo - std::unique_ptr gpuAlgo_; - std::unique_ptr gpuModulesToUnpack_; + pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; PixelDataFormatter::Errors errors_; bool enableTransfer_; @@ -455,22 +453,17 @@ void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, c } // ----------------------------------------------------------------------------- -void SiPixelRawToClusterHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) { - // Allocate GPU resources here - gpuAlgo_ = std::make_unique(cudaStream); - gpuModulesToUnpack_ = std::make_unique(); -} - void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { const auto buffers = initialize(ev.event(), es); + auto gpuModulesToUnpack = SiPixelFedCablingMapGPUWrapper::ModulesToUnpack(cudaStream); if (regions_) { std::set modules = *(regions_->modulesToUnpack()); - gpuModulesToUnpack_->fillAsync(*cablingMap_, modules, cudaStream); + gpuModulesToUnpack.fillAsync(*cablingMap_, modules, cudaStream); } else if(recordWatcherUpdatedSinceLastTransfer_) { // If regions_ are disabled, it is enough to fill and transfer only if cablingMap has changed - gpuModulesToUnpack_->fillAsync(*cablingMap_, std::set(), cudaStream); + gpuModulesToUnpack.fillAsync(*cablingMap_, std::set(), cudaStream); recordWatcherUpdatedSinceLastTransfer_ = false; } @@ -494,6 +487,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() ErrorChecker errorcheck; + auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(cudaStream); for (auto aFed = fedIds.begin(); aFed != fedIds.end(); ++aFed) { int fedId = *aFed; @@ -543,29 +537,30 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); assert(0 == (ew-bw)%2); - gpuAlgo_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); + wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); wordCounterGPU+=(ew-bw); } // end of for loop - gpuAlgo_->makeClustersAsync(gpuMap, gpuModulesToUnpack_->get(), hgains->getGPUProductAsync(cudaStream), - wordCounterGPU, fedCounter, convertADCtoElectrons, - useQuality, includeErrors, enableTransfer_, debug, cudaStream); + gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack.get(), hgains->getGPUProductAsync(cudaStream), + wordFedAppender, + wordCounterGPU, fedCounter, convertADCtoElectrons, + useQuality, includeErrors, enableTransfer_, debug, cudaStream); } void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { - auto output = std::make_unique(gpuAlgo_->getProduct()); - assert(output->me_d); + auto output = std::make_unique(gpuAlgo_.getProduct()); if(enableConversion_) { - convertGPUtoCPU(ev.event(), *output); + convertGPUtoCPU(ev.event(), output->nDigis, gpuAlgo_.getCPUData()); } ev.put(std::move(output), heterogeneous::DisableTransfer{}); } void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, - const 
SiPixelRawToClusterHeterogeneous::GPUProduct& gpu) const { + unsigned int nDigis, + pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData digis_clusters_h) const { // TODO: add the transfers here as well? auto collection = std::make_unique>(); @@ -576,9 +571,9 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto outputClusters = std::make_unique(); edm::DetSet * detDigis=nullptr; - for (uint32_t i = 0; i < gpu.nDigis; i++) { - if (gpu.pdigi_h[i]==0) continue; - detDigis = &collection->find_or_insert(gpu.rawIdArr_h[i]); + for (uint32_t i = 0; i < nDigis; i++) { + if (digis_clusters_h.pdigi[i]==0) continue; + detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations break; } @@ -609,29 +604,29 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, if ( spc.empty() ) spc.abort(); }; - for (uint32_t i = 0; i < gpu.nDigis; i++) { - if (gpu.pdigi_h[i]==0) continue; - if (gpu.clus_h[i]>9000) continue; // not in cluster - assert(gpu.rawIdArr_h[i] > 109999); - if ( (*detDigis).detId() != gpu.rawIdArr_h[i]) + for (uint32_t i = 0; i < nDigis; i++) { + if (digis_clusters_h.pdigi[i]==0) continue; + if (digis_clusters_h.clus[i]>9000) continue; // not in cluster + assert(digis_clusters_h.rawIdArr[i] > 109999); + if ( (*detDigis).detId() != digis_clusters_h.rawIdArr[i]) { fillClusters((*detDigis).detId()); assert(nclus==-1); - detDigis = &collection->find_or_insert(gpu.rawIdArr_h[i]); + detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations else { std::cout << "Problem det present twice in input! " << (*detDigis).detId() << std::endl; } } - (*detDigis).data.emplace_back(gpu.pdigi_h[i]); + (*detDigis).data.emplace_back(digis_clusters_h.pdigi[i]); auto const & dig = (*detDigis).data.back(); // fill clusters - assert(gpu.clus_h[i]>=0); - assert(gpu.clus_h[i]<1024); - nclus = std::max(gpu.clus_h[i],nclus); + assert(digis_clusters_h.clus[i]>=0); + assert(digis_clusters_h.clus[i]<1024); + nclus = std::max(digis_clusters_h.clus[i],nclus); auto row = dig.row(); auto col = dig.column(); SiPixelCluster::PixelPos pix(row,col); - aclusters[gpu.clus_h[i]].add(pix,gpu.adc_h[i]); + aclusters[digis_clusters_h.clus[i]].add(pix, digis_clusters_h.adc[i]); } // fill final clusters @@ -642,9 +637,9 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto errors = errors_; // make a copy PixelDataFormatter::DetErrors nodeterrors; - auto size = gpu.error_h->size(); + auto size = digis_clusters_h.error->size(); for (auto i = 0; i < size; i++) { - pixelgpudetails::error_obj err = (*gpu.error_h)[i]; + pixelgpudetails::error_obj err = (*digis_clusters_h.error)[i]; if (err.errorType != 0) { SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); errors[err.rawId].push_back(error); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h index e75afd3481b25..0cab299582410 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h @@ -1,6 +1,8 @@ #ifndef EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h #define 
EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" @@ -25,27 +27,21 @@ namespace siPixelRawToClusterHeterogeneousProduct { // FIXME split in two struct GPUProduct { - // Needed for digi and cluster CPU output - uint32_t const * pdigi_h = nullptr; - uint32_t const * rawIdArr_h = nullptr; - int32_t const * clus_h = nullptr; - uint16_t const * adc_h = nullptr; - GPU::SimpleVector const * error_h = nullptr; - - GPUProduct const * me_d = nullptr; - - // Needed for GPU rechits - uint16_t const * xx_d; - uint16_t const * yy_d; - uint16_t const * adc_d; - uint16_t const * moduleInd_d; - uint32_t const * moduleStart_d; - int32_t const * clus_d; - uint32_t const * clusInModule_d; - uint32_t const * moduleId_d; - - // originally from rechits - uint32_t const * clusModuleStart_d; + GPUProduct() = default; + GPUProduct(const GPUProduct&) = delete; + GPUProduct& operator=(const GPUProduct&) = delete; + GPUProduct(GPUProduct&&) = default; + GPUProduct& operator=(GPUProduct&&) = default; + + GPUProduct(SiPixelDigisCUDA&& digis, + SiPixelClustersCUDA&& clusters, + uint32_t ndig, uint32_t nmod, uint32_t nclus): + digis_d(std::move(digis)), clusters_d(std::move(clusters)), + nDigis(ndig), nModules(nmod), nClusters(nclus) + {} + + SiPixelDigisCUDA digis_d; + SiPixelClustersCUDA clusters_d; uint32_t nDigis; uint32_t nModules; diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index df2f488fe488c..1f872f3cb7464 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -14,6 +14,7 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "Geometry/CommonDetUnit/interface/GeomDetType.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" @@ -123,10 +124,11 @@ const SiPixelFedCablingMapGPU *SiPixelFedCablingMapGPUWrapper::getGPUProductAsyn return data.cablingMapDevice; } -SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::ModulesToUnpack(): - modToUnpDevice(cuda::memory::device::make_unique(cuda::device::current::get().id(), pixelgpudetails::MAX_SIZE)), - modToUnpHost(pixelgpudetails::MAX_SIZE) +SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::ModulesToUnpack(cuda::stream_t<>& cudaStream) { + edm::Service cs; + modToUnpDevice = cs->make_device_unique(pixelgpudetails::MAX_SIZE, cudaStream); + modToUnpHost = cs->make_host_unique(pixelgpudetails::MAX_SIZE, cudaStream); } void SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::fillAsync(SiPixelFedCablingMap const& cablingMap, std::set const& modules, cuda::stream_t<>& cudaStream) { @@ -154,7 +156,7 @@ void SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::fillAsync(SiPixelFedCablin } } - cuda::memory::async::copy(modToUnpDevice.get(), modToUnpHost.data(), modToUnpHost.size() * sizeof(unsigned char), cudaStream.id()); + 
cuda::memory::async::copy(modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaStream.id()); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index c63466f157a1b..333898eaa601a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -127,10 +127,10 @@ namespace pixelgpudetails { pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, cuda::stream_t<>& stream) { - cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); - gpu_.hitsModuleStart_d = input.clusModuleStart_d; - gpu_.cpeParams = cpeParams; // copy it for use in clients - cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); + gpu_.hitsModuleStart_d = input.clusters_d.clusModuleStart(); + gpu_.cpeParams = cpeParams; // copy it for use in clients + cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); int threadsPerBlock = 256; int blocks = input.nModules; // active modules (with digis) @@ -141,11 +141,11 @@ namespace pixelgpudetails { gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, - input.moduleInd_d, - input.xx_d, input.yy_d, input.adc_d, - input.moduleStart_d, - input.clusInModule_d, input.moduleId_d, - input.clus_d, + input.digis_d.moduleInd(), + input.digis_d.xx(), input.digis_d.yy(), input.digis_d.adc(), + input.clusters_d.moduleStart(), + input.clusters_d.clusInModule(), input.clusters_d.moduleId(), + input.clusters_d.clus(), input.nDigis, gpu_.hitsModuleStart_d, gpu_.charge_d, diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu index 83a7e63e628a0..b402daef07a05 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -3,6 +3,8 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -12,14 +14,13 @@ using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; __global__ -void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) +void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const SiPixelClustersCUDA::DeviceConstView *cc, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) { assert(slp == slp->me_d); constexpr int32_t invTK = 0; // std::numeric_limits::max(); constexpr uint16_t InvId = 9999; // must be > MaxNumModules - auto const & dd = *ddp; auto const & hh = *hhp; auto const & sl = *slp; auto i = blockIdx.x * blockDim.x + threadIdx.x; @@ -27,14 +28,14 @@ void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLO if (i >= ndigis) return; - auto id = dd.moduleInd_d[i]; + auto id = dd->moduleInd(i); if (InvId == id) return; assert(id < 2000); - auto ch = 
pixelgpudetails::pixelToChannel(dd.xx_d[i], dd.yy_d[i]); + auto ch = pixelgpudetails::pixelToChannel(dd->xx(i), dd->yy(i)); auto first = hh.hitsModuleStart_d[id]; - auto cl = first + dd.clus_d[i]; + auto cl = first + cc->clus(i); assert(cl < 2000 * blockDim.x); const std::array me{{id, ch, 0, 0}}; @@ -176,7 +177,7 @@ namespace clusterSLOnGPU { blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock; assert(sl.me_d); - simLink<<>>(dd.me_d, ndigis, hh.gpu_d, sl.me_d, n); + simLink<<>>(dd.digis_d.view(), ndigis, dd.clusters_d.view(), hh.gpu_d, sl.me_d, n); cudaCheck(cudaGetLastError()); if (doDump) {
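The cluster bookkeeping in makeClustersAsync uses the standard two-phase CUB pattern: cub::DeviceScan::InclusiveSum is first called with a null temporary-storage pointer purely to learn how many scratch bytes it needs, that much device memory is allocated, and then the scan is run for real. Because clusModuleStart[0] is zeroed and the inclusive sum of clusInModule is written into elements [1..MaxNumModules], the per-module cluster counts become per-module offsets and the last element ends up holding the total number of clusters; for example counts {3, 0, 5, 2} yield offsets {0, 3, 3, 8, 10}. A standalone sketch of the same pattern (array contents, names and sizes are illustrative):

#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int kModules = 4;
  const uint32_t countsHost[kModules] = {3, 0, 5, 2};      // clusters found per module
  uint32_t *counts = nullptr, *offsets = nullptr;          // offsets has kModules+1 entries
  cudaMalloc((void**)&counts, kModules * sizeof(uint32_t));
  cudaMalloc((void**)&offsets, (kModules + 1) * sizeof(uint32_t));
  cudaMemcpy(counts, countsHost, sizeof(countsHost), cudaMemcpyHostToDevice);
  cudaMemset(offsets, 0, sizeof(uint32_t));                // offsets[0] = 0, like clusModuleStart

  // Phase 1: query how much temporary storage the scan needs.
  void *tmp = nullptr;
  size_t tmpBytes = 0;
  cub::DeviceScan::InclusiveSum(nullptr, tmpBytes, counts, offsets + 1, kModules);
  cudaMalloc(&tmp, tmpBytes);
  // Phase 2: run the scan into offsets[1..kModules].
  cub::DeviceScan::InclusiveSum(tmp, tmpBytes, counts, offsets + 1, kModules);

  uint32_t result[kModules + 1];
  cudaMemcpy(result, offsets, sizeof(result), cudaMemcpyDeviceToHost);
  printf("offsets: %u %u %u %u %u\n", result[0], result[1], result[2], result[3], result[4]);
  // Expected: 0 3 3 8 10 -- the last entry is the total number of clusters.

  cudaFree(tmp); cudaFree(offsets); cudaFree(counts);
  return 0;
}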
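The question raised in makeClustersAsync about whether make_host_unique() should also run constructors points at a recurring pattern in this patch: the allocator returns raw, uninitialized pinned memory, so objects such as the GPU::SimpleVector error container are built in place with placement new and then copied byte-wise to the device with cudaMemcpyAsync. A reduced sketch of that pattern with a hypothetical, trivially copyable VecHeader standing in for the real class (neither the CMSSW allocator nor GPU::SimpleVector is reproduced here):

#include <cuda_runtime.h>
#include <cstdint>
#include <new>

struct VecHeader {
  int size;
  int capacity;
  uint32_t *data;                   // device pointer owned elsewhere
  VecHeader(int cap, uint32_t *d) : size(0), capacity(cap), data(d) {}
};

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  uint32_t *payload_d = nullptr;
  cudaMalloc((void**)&payload_d, 1024 * sizeof(uint32_t));

  // Raw pinned bytes, as a caching allocator would return them: no constructor has run yet.
  void *raw_h = nullptr;
  cudaMallocHost(&raw_h, sizeof(VecHeader));
  VecHeader *header_h = new (raw_h) VecHeader(1024, payload_d);  // placement new does the construction

  // Ship the header to the device; kernels can then fill and read the vector through it.
  VecHeader *header_d = nullptr;
  cudaMalloc((void**)&header_d, sizeof(VecHeader));
  cudaMemcpyAsync(header_d, header_h, sizeof(VecHeader), cudaMemcpyDefault, stream);
  cudaStreamSynchronize(stream);

  header_h->~VecHeader();           // placement-new'd objects are destroyed explicitly
  cudaFreeHost(raw_h);
  cudaFree(header_d);
  cudaFree(payload_d);
  cudaStreamDestroy(stream);
  return 0;
}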