diff --git a/CUDADataFormats/SiStripCluster/BuildFile.xml b/CUDADataFormats/SiStripCluster/BuildFile.xml
new file mode 100644
index 0000000000000..5e401d215c4eb
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/BuildFile.xml
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h b/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
new file mode 100644
index 0000000000000..f64b8a533d513
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
@@ -0,0 +1,59 @@
+#ifndef CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h
+#define CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h
+
+#include "DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+
+#include
+
+class SiStripClustersCUDADevice : public SiStripClustersSOABase {
+public:
+ SiStripClustersCUDADevice() = default;
+ explicit SiStripClustersCUDADevice(uint32_t maxClusters, uint32_t maxStripsPerCluster, cudaStream_t stream);
+ ~SiStripClustersCUDADevice() override = default;
+
+ SiStripClustersCUDADevice(const SiStripClustersCUDADevice &) = delete;
+ SiStripClustersCUDADevice &operator=(const SiStripClustersCUDADevice &) = delete;
+ SiStripClustersCUDADevice(SiStripClustersCUDADevice &&) = default;
+ SiStripClustersCUDADevice &operator=(SiStripClustersCUDADevice &&) = default;
+
+ struct DeviceView {
+ uint32_t *clusterIndex_;
+ uint32_t *clusterSize_;
+ uint8_t *clusterADCs_;
+ stripgpu::detId_t *clusterDetId_;
+ stripgpu::stripId_t *firstStrip_;
+ bool *trueCluster_;
+ float *barycenter_;
+ float *charge_;
+ uint32_t nClusters_;
+ uint32_t maxClusterSize_;
+ };
+
+ DeviceView *view() const { return view_d.get(); }
+ uint32_t nClusters() const { return nClusters_; }
+ uint32_t *nClustersPtr() { return &nClusters_; }
+ uint32_t maxClusterSize() const { return maxClusterSize_; }
+ uint32_t *maxClusterSizePtr() { return &maxClusterSize_; }
+
+private:
+ cms::cuda::device::unique_ptr view_d; // "me" pointer
+ uint32_t nClusters_;
+ uint32_t maxClusterSize_;
+};
+
+class SiStripClustersCUDAHost : public SiStripClustersSOABase {
+public:
+ SiStripClustersCUDAHost() = default;
+ explicit SiStripClustersCUDAHost(const SiStripClustersCUDADevice &clusters_d, cudaStream_t stream);
+ ~SiStripClustersCUDAHost() override = default;
+
+ SiStripClustersCUDAHost(const SiStripClustersCUDAHost &) = delete;
+ SiStripClustersCUDAHost &operator=(const SiStripClustersCUDAHost &) = delete;
+ SiStripClustersCUDAHost(SiStripClustersCUDAHost &&) = default;
+ SiStripClustersCUDAHost &operator=(SiStripClustersCUDAHost &&) = default;
+};
+
+#endif
diff --git a/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc b/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
new file mode 100644
index 0000000000000..220456760476a
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
@@ -0,0 +1,59 @@
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+SiStripClustersCUDADevice::SiStripClustersCUDADevice(uint32_t maxClusters,
+ uint32_t maxStripsPerCluster,
+ cudaStream_t stream) {
+ maxClusterSize_ = maxStripsPerCluster;
+
+ clusterIndex_ = cms::cuda::make_device_unique(maxClusters, stream);
+ clusterSize_ = cms::cuda::make_device_unique(maxClusters, stream);
+ clusterADCs_ = cms::cuda::make_device_unique(maxClusters * maxStripsPerCluster, stream);
+ clusterDetId_ = cms::cuda::make_device_unique(maxClusters, stream);
+ firstStrip_ = cms::cuda::make_device_unique(maxClusters, stream);
+ trueCluster_ = cms::cuda::make_device_unique(maxClusters, stream);
+ barycenter_ = cms::cuda::make_device_unique(maxClusters, stream);
+ charge_ = cms::cuda::make_device_unique(maxClusters, stream);
+
+ auto view = cms::cuda::make_host_unique(stream);
+ view->clusterIndex_ = clusterIndex_.get();
+ view->clusterSize_ = clusterSize_.get();
+ view->clusterADCs_ = clusterADCs_.get();
+ view->clusterDetId_ = clusterDetId_.get();
+ view->firstStrip_ = firstStrip_.get();
+ view->trueCluster_ = trueCluster_.get();
+ view->barycenter_ = barycenter_.get();
+ view->charge_ = charge_.get();
+ view->maxClusterSize_ = maxStripsPerCluster;
+
+ view_d = cms::cuda::make_device_unique(stream);
+ cms::cuda::copyAsync(view_d, view, stream);
+#ifdef GPU_CHECK
+ cudaCheck(cudaStreamSynchronize(stream));
+#endif
+}
+
+SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice& clusters_d, cudaStream_t stream) {
+ nClusters_ = clusters_d.nClusters();
+ maxClusterSize_ = clusters_d.maxClusterSize();
+ clusterIndex_ = cms::cuda::make_host_unique(nClusters_, stream);
+ clusterSize_ = cms::cuda::make_host_unique(nClusters_, stream);
+ clusterADCs_ = cms::cuda::make_host_unique(nClusters_ * maxClusterSize_, stream);
+ clusterDetId_ = cms::cuda::make_host_unique(nClusters_, stream);
+ firstStrip_ = cms::cuda::make_host_unique(nClusters_, stream);
+ trueCluster_ = cms::cuda::make_host_unique(nClusters_, stream);
+ barycenter_ = cms::cuda::make_host_unique(nClusters_, stream);
+ charge_ = cms::cuda::make_host_unique(nClusters_, stream);
+
+ cms::cuda::copyAsync(clusterIndex_, clusters_d.clusterIndex(), nClusters_, stream);
+ cms::cuda::copyAsync(clusterSize_, clusters_d.clusterSize(), nClusters_, stream);
+ cms::cuda::copyAsync(clusterADCs_, clusters_d.clusterADCs(), nClusters_ * maxClusterSize_, stream);
+ cms::cuda::copyAsync(clusterDetId_, clusters_d.clusterDetId(), nClusters_, stream);
+ cms::cuda::copyAsync(firstStrip_, clusters_d.firstStrip(), nClusters_, stream);
+ cms::cuda::copyAsync(trueCluster_, clusters_d.trueCluster(), nClusters_, stream);
+ cms::cuda::copyAsync(barycenter_, clusters_d.barycenter(), nClusters_, stream);
+ cms::cuda::copyAsync(charge_, clusters_d.charge(), nClusters_, stream);
+#ifdef GPU_CHECK
+ cudaCheck(cudaStreamSynchronize(stream));
+#endif
+}
diff --git a/CUDADataFormats/SiStripCluster/src/classes.h b/CUDADataFormats/SiStripCluster/src/classes.h
new file mode 100644
index 0000000000000..b38f397dee067
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats_SiStripCluster_classes_h
+#define CUDADataFormats_SiStripCluster_classes_h
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif
diff --git a/CUDADataFormats/SiStripCluster/src/classes_def.xml b/CUDADataFormats/SiStripCluster/src/classes_def.xml
new file mode 100644
index 0000000000000..3c2f3ab27c620
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/src/classes_def.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/CalibFormats/SiStripObjects/BuildFile.xml b/CalibFormats/SiStripObjects/BuildFile.xml
index 325f0aa1bcb9f..83c3901a34f13 100644
--- a/CalibFormats/SiStripObjects/BuildFile.xml
+++ b/CalibFormats/SiStripObjects/BuildFile.xml
@@ -2,6 +2,9 @@
+
+
+
diff --git a/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h b/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h
new file mode 100644
index 0000000000000..94f0080f88019
--- /dev/null
+++ b/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h
@@ -0,0 +1,137 @@
+#ifndef CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
+#define CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
+
+#include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
+#include "DataFormats/SiStripCommon/interface/ConstantsForHardwareSystems.h"
+
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
+class SiStripQuality;
+class SiStripGain;
+class SiStripNoises;
+
+namespace stripgpu {
+ __host__ __device__ inline fedId_t fedIndex(fedId_t fed) { return fed - sistrip::FED_ID_MIN; }
+ __host__ __device__ inline std::uint32_t stripIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+ return fedIndex(fed) * sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH + channel * sistrip::STRIPS_PER_FEDCH +
+ (strip % sistrip::STRIPS_PER_FEDCH);
+ }
+ __host__ __device__ inline std::uint32_t apvIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+ return fedIndex(fed) * sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED + sistrip::APVS_PER_CHAN * channel +
+ (strip % sistrip::STRIPS_PER_FEDCH) / sistrip::STRIPS_PER_APV;
+ }
+ __host__ __device__ inline std::uint32_t channelIndex(fedId_t fed, fedCh_t channel) {
+ return fedIndex(fed) * sistrip::FEDCH_PER_FED + channel;
+ }
+
+ class SiStripClusterizerConditionsGPU {
+ public:
+ class DetToFed {
+ public:
+ DetToFed(detId_t detid, apvPair_t ipair, fedId_t fedid, fedCh_t fedch)
+ : detid_(detid), ipair_(ipair), fedid_(fedid), fedch_(fedch) {}
+ detId_t detID() const { return detid_; }
+ apvPair_t pair() const { return ipair_; }
+ fedId_t fedID() const { return fedid_; }
+ fedCh_t fedCh() const { return fedch_; }
+
+ private:
+ detId_t detid_;
+ apvPair_t ipair_;
+ fedId_t fedid_;
+ fedCh_t fedch_;
+ };
+ using DetToFeds = std::vector;
+
+ static constexpr std::uint16_t badBit = 1 << 15;
+
+ class Data {
+ public:
+ struct DeviceView {
+ __device__ inline detId_t detID(fedId_t fed, fedCh_t channel) const {
+ return detID_[channelIndex(fed, channel)];
+ }
+
+ __device__ inline apvPair_t iPair(fedId_t fed, fedCh_t channel) const {
+ return iPair_[channelIndex(fed, channel)];
+ }
+
+ __device__ inline float invthick(fedId_t fed, fedCh_t channel) const {
+ return invthick_[channelIndex(fed, channel)];
+ }
+
+ __device__ inline float noise(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+ // noise is stored as 9 bits with a fixed point scale factor of 0.1
+ return 0.1f * (noise_[stripIndex(fed, channel, strip)] & ~badBit);
+ }
+
+ __device__ inline float gain(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+ return gain_[apvIndex(fed, channel, strip)];
+ }
+
+ __device__ inline bool bad(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+ return badBit == (noise_[stripIndex(fed, channel, strip)] & badBit);
+ }
+ const std::uint16_t* noise_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH];
+ const float* invthick_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+ const detId_t* detID_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+ const apvPair_t* iPair_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+ const float* gain_; //[sistrip::NUMBER_OF_FEDS*sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED];
+ };
+
+ const DeviceView* deviceView() const { return deviceView_.get(); }
+
+ cms::cuda::device::unique_ptr deviceView_;
+ cms::cuda::host::unique_ptr hostView_;
+
+ cms::cuda::device::unique_ptr
+ noise_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH];
+ cms::cuda::device::unique_ptr invthick_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+ cms::cuda::device::unique_ptr detID_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+ cms::cuda::device::unique_ptr iPair_; //[sistrip::NUMBER_OF_FEDS*sistrip::FEDCH_PER_FED];
+ cms::cuda::device::unique_ptr
+ gain_; //[sistrip::NUMBER_OF_FEDS*sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED];
+ };
+
+ SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
+ const SiStripGain* gains,
+ const SiStripNoises& noises);
+ ~SiStripClusterizerConditionsGPU() = default;
+
+ // Function to return the actual payload on the memory of the current device
+ Data const& getGPUProductAsync(cudaStream_t stream) const;
+
+ const DetToFeds& detToFeds() const { return detToFeds_; }
+
+ private:
+ void setStrip(fedId_t fed, fedCh_t channel, stripId_t strip, std::uint16_t noise, float gain, bool bad) {
+ gain_[apvIndex(fed, channel, strip)] = gain;
+ noise_[stripIndex(fed, channel, strip)] = noise;
+ if (bad) {
+ noise_[stripIndex(fed, channel, strip)] |= badBit;
+ }
+ }
+
+ void setInvThickness(fedId_t fed, fedCh_t channel, float invthick) {
+ invthick_[channelIndex(fed, channel)] = invthick;
+ }
+
+ // Holds the data in pinned CPU memory
+ std::vector> noise_;
+ std::vector> invthick_;
+ std::vector> detID_;
+ std::vector> iPair_;
+ std::vector> gain_;
+
+ // Helper that takes care of complexity of transferring the data to
+ // multiple devices
+ cms::cuda::ESProduct gpuData_;
+ DetToFeds detToFeds_;
+ };
+} // namespace stripgpu
+
+#endif
diff --git a/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc b/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
index ddf7a0ffb914f..05530484f14c4 100644
--- a/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
+++ b/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
@@ -23,3 +23,6 @@ TYPELOOKUP_DATA_REG(SiStripQuality);
#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditions.h"
TYPELOOKUP_DATA_REG(SiStripClusterizerConditions);
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+TYPELOOKUP_DATA_REG(stripgpu::SiStripClusterizerConditionsGPU);
diff --git a/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc b/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc
new file mode 100644
index 0000000000000..33d0889ff5550
--- /dev/null
+++ b/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc
@@ -0,0 +1,100 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+#include "CondFormats/SiStripObjects/interface/SiStripNoises.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripGain.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripDetCabling.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripQuality.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+
+#include "DataFormats/SiStripCluster/interface/SiStripClusterTools.h"
+
+namespace stripgpu {
+ SiStripClusterizerConditionsGPU::SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
+ const SiStripGain* gains,
+ const SiStripNoises& noises)
+
+ : noise_(sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH),
+ invthick_(sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED),
+ detID_(sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED),
+ iPair_(sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED),
+ gain_(sistrip::NUMBER_OF_FEDS * sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED) {
+ // connected: map>
+ // map of KEY=detid DATA=vector of apvs, maximum 6 APVs per detector module :
+ const auto& connected = quality.cabling()->connected();
+ // detCabling: map
+ // map of KEY=detid DATA=vector
+ const auto& detCabling = quality.cabling()->getDetCabling();
+
+ for (const auto& conn : connected) {
+ const auto det = conn.first;
+ if (!quality.IsModuleBad(det)) {
+ const auto detConn_it = detCabling.find(det);
+
+ if (detCabling.end() != detConn_it) {
+ for (const auto& chan : (*detConn_it).second) {
+ if (chan && chan->fedId() && chan->isConnected()) {
+ const auto detID = chan->detId();
+ const auto fedID = chan->fedId();
+ const auto fedCh = chan->fedCh();
+ const auto iPair = chan->apvPairNumber();
+
+ detToFeds_.emplace_back(detID, iPair, fedID, fedCh);
+
+ detID_[channelIndex(fedID, fedCh)] = detID;
+ iPair_[channelIndex(fedID, fedCh)] = iPair;
+ setInvThickness(fedID, fedCh, siStripClusterTools::sensorThicknessInverse(detID));
+
+ auto offset = 256 * iPair;
+
+ for (auto strip = 0; strip < 256; ++strip) {
+ const auto gainRange = gains->getRange(det);
+
+ const auto detstrip = strip + offset;
+ const std::uint16_t noise = SiStripNoises::getRawNoise(detstrip, noises.getRange(det));
+ const auto gain = SiStripGain::getStripGain(detstrip, gainRange);
+ const auto bad = quality.IsStripBad(quality.getRange(det), detstrip);
+
+ // gain is actually stored per-APV, not per-strip
+ setStrip(fedID, fedCh, detstrip, noise, gain, bad);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ std::sort(detToFeds_.begin(), detToFeds_.end(), [](const DetToFed& a, const DetToFed& b) {
+ return a.detID() < b.detID() || (a.detID() == b.detID() && a.pair() < b.pair());
+ });
+ }
+
+ SiStripClusterizerConditionsGPU::Data const& SiStripClusterizerConditionsGPU::getGPUProductAsync(
+ cudaStream_t stream) const {
+ auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](Data& data, cudaStream_t stream) {
+ data.noise_ = cms::cuda::make_device_unique(noise_.size(), stream);
+ data.invthick_ = cms::cuda::make_device_unique(invthick_.size(), stream);
+ data.detID_ = cms::cuda::make_device_unique(detID_.size(), stream);
+ data.iPair_ = cms::cuda::make_device_unique(iPair_.size(), stream);
+ data.gain_ = cms::cuda::make_device_unique(gain_.size(), stream);
+
+ cms::cuda::copyAsync(data.noise_, noise_, stream);
+ cms::cuda::copyAsync(data.invthick_, invthick_, stream);
+ cms::cuda::copyAsync(data.detID_, detID_, stream);
+ cms::cuda::copyAsync(data.iPair_, iPair_, stream);
+ cms::cuda::copyAsync(data.gain_, gain_, stream);
+
+ data.hostView_ = cms::cuda::make_host_unique(stream);
+ data.hostView_->noise_ = data.noise_.get();
+ data.hostView_->invthick_ = data.invthick_.get();
+ data.hostView_->detID_ = data.detID_.get();
+ data.hostView_->iPair_ = data.iPair_.get();
+ data.hostView_->gain_ = data.gain_.get();
+
+ data.deviceView_ = cms::cuda::make_device_unique(stream);
+ cms::cuda::copyAsync(data.deviceView_, data.hostView_, stream);
+ });
+
+ return data;
+ }
+} // namespace stripgpu
diff --git a/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h b/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h
new file mode 100644
index 0000000000000..e6c262d456289
--- /dev/null
+++ b/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h
@@ -0,0 +1,27 @@
+#ifndef DataFormats_SiStripCluster_interface_SiStripClustersSOA_h
+#define DataFormats_SiStripCluster_interface_SiStripClustersSOA_h
+
+#include "DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h"
+
+#include
+
+namespace detail {
+ namespace impl {
+ template
+ using unique_ptr_default_deleter = typename std::unique_ptr;
+ }
+} // namespace detail
+
+class SiStripClustersSOA : public SiStripClustersSOABase {
+public:
+ SiStripClustersSOA() = default;
+ explicit SiStripClustersSOA(uint32_t maxClusters, uint32_t maxStripsPerCluster);
+ ~SiStripClustersSOA() override = default;
+
+ SiStripClustersSOA(const SiStripClustersSOA &) = delete;
+ SiStripClustersSOA &operator=(const SiStripClustersSOA &) = delete;
+ SiStripClustersSOA(SiStripClustersSOA &&) = default;
+ SiStripClustersSOA &operator=(SiStripClustersSOA &&) = default;
+};
+
+#endif
diff --git a/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h b/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h
new file mode 100644
index 0000000000000..036ab7c3dd3e5
--- /dev/null
+++ b/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h
@@ -0,0 +1,59 @@
+#ifndef DataFormats_SiStripCluster_interface_SiStripClustersSOABase_
+#define DataFormats_SiStripCluster_interface_SiStripClustersSOABase_
+
+#include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
+
+#include
+#include
+
+template class T>
+class SiStripClustersSOABase {
+public:
+ //static constexpr uint32_t kClusterMaxStrips = 16;
+
+ SiStripClustersSOABase() = default;
+ //explicit SiStripClustersSOABase(uint32_t maxClusters, uint32_t maxStripsPerCluster);
+ virtual ~SiStripClustersSOABase() = default;
+
+ SiStripClustersSOABase(const SiStripClustersSOABase&) = delete;
+ SiStripClustersSOABase& operator=(const SiStripClustersSOABase&) = delete;
+ SiStripClustersSOABase(SiStripClustersSOABase&&) = default;
+ SiStripClustersSOABase& operator=(SiStripClustersSOABase&&) = default;
+
+ void setNClusters(uint32_t nClusters) { nClusters_ = nClusters; }
+ uint32_t nClusters() const { return nClusters_; }
+
+ void setMaxClusterSize(uint32_t maxClusterSize) { maxClusterSize_ = maxClusterSize; }
+ uint32_t maxClusterSize() const { return maxClusterSize_; }
+
+ const auto& clusterIndex() const { return clusterIndex_; }
+ const auto& clusterSize() const { return clusterSize_; }
+ const auto& clusterADCs() const { return clusterADCs_; }
+ const auto& clusterDetId() const { return clusterDetId_; }
+ const auto& firstStrip() const { return firstStrip_; }
+ const auto& trueCluster() const { return trueCluster_; }
+ const auto& barycenter() const { return barycenter_; }
+ const auto& charge() const { return charge_; }
+
+ auto& clusterIndex() { return clusterIndex_; }
+ auto& clusterSize() { return clusterSize_; }
+ auto& clusterADCs() { return clusterADCs_; }
+ auto& clusterDetId() { return clusterDetId_; }
+ auto& firstStrip() { return firstStrip_; }
+ auto& trueCluster() { return trueCluster_; }
+ auto& barycenter() { return barycenter_; }
+ auto& charge() { return charge_; }
+
+protected:
+ T clusterIndex_;
+ T clusterSize_;
+ T clusterADCs_;
+ T clusterDetId_;
+ T firstStrip_;
+ T trueCluster_;
+ T barycenter_;
+ T charge_;
+ uint32_t nClusters_;
+ uint32_t maxClusterSize_;
+};
+#endif
diff --git a/DataFormats/SiStripCluster/interface/SiStripTypes.h b/DataFormats/SiStripCluster/interface/SiStripTypes.h
new file mode 100644
index 0000000000000..0b13041f47b1c
--- /dev/null
+++ b/DataFormats/SiStripCluster/interface/SiStripTypes.h
@@ -0,0 +1,19 @@
+#ifndef DataFormats_SiStripCluster_interface_SiStripTypes_h
+#define DataFormats_SiStripCluster_interface_SiStripTypes_h
+
+#include
+#include
+
+namespace stripgpu {
+ using detId_t = std::uint32_t;
+ using fedId_t = std::uint16_t;
+ using fedCh_t = std::uint8_t;
+ using apvPair_t = std::uint16_t;
+ using stripId_t = std::uint16_t;
+
+ static constexpr detId_t invalidDet = std::numeric_limits::max();
+ static constexpr fedId_t invalidFed = std::numeric_limits::max();
+ static constexpr stripId_t invalidStrip = std::numeric_limits::max();
+} // namespace stripgpu
+
+#endif
diff --git a/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc b/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc
new file mode 100644
index 0000000000000..d43d8509b5920
--- /dev/null
+++ b/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc
@@ -0,0 +1,13 @@
+#include "DataFormats/SiStripCluster/interface/SiStripClustersSOA.h"
+
+SiStripClustersSOA::SiStripClustersSOA(uint32_t maxClusters, uint32_t maxStripsPerCluster) {
+ clusterIndex_ = std::make_unique(maxClusters);
+ clusterSize_ = std::make_unique(maxClusters);
+ clusterADCs_ = std::make_unique(maxClusters * maxStripsPerCluster);
+ clusterDetId_ = std::make_unique(maxClusters);
+ firstStrip_ = std::make_unique(maxClusters);
+ trueCluster_ = std::make_unique(maxClusters);
+ barycenter_ = std::make_unique(maxClusters);
+ charge_ = std::make_unique(maxClusters);
+ maxClusterSize_ = maxStripsPerCluster;
+}
diff --git a/DataFormats/SiStripCluster/src/classes.h b/DataFormats/SiStripCluster/src/classes.h
index 57f45748f7f2e..00819e76d08a4 100644
--- a/DataFormats/SiStripCluster/src/classes.h
+++ b/DataFormats/SiStripCluster/src/classes.h
@@ -5,6 +5,7 @@
#include "DataFormats/Common/interface/DetSetVectorNew.h"
#include "DataFormats/Common/interface/Ref.h"
#include "DataFormats/SiStripCluster/interface/SiStripCluster.h"
+#include "DataFormats/SiStripCluster/interface/SiStripClustersSOA.h"
#include "DataFormats/SiStripCluster/interface/SiStripApproximateCluster.h"
#include "DataFormats/Common/interface/ContainerMask.h"
diff --git a/DataFormats/SiStripCluster/src/classes_def.xml b/DataFormats/SiStripCluster/src/classes_def.xml
index ee5d79f1fd266..8c8c0a49d911a 100755
--- a/DataFormats/SiStripCluster/src/classes_def.xml
+++ b/DataFormats/SiStripCluster/src/classes_def.xml
@@ -46,4 +46,9 @@
+
+
+
+
+
diff --git a/RecoLocalTracker/SiStripClusterizer/BuildFile.xml b/RecoLocalTracker/SiStripClusterizer/BuildFile.xml
index 205bdbd9de298..e7fdcd6cbf34a 100644
--- a/RecoLocalTracker/SiStripClusterizer/BuildFile.xml
+++ b/RecoLocalTracker/SiStripClusterizer/BuildFile.xml
@@ -9,7 +9,6 @@
-
diff --git a/RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h b/RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h
index e949dc668e02f..58d399042a019 100644
--- a/RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h
+++ b/RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h
@@ -3,7 +3,8 @@
namespace edm {
class ParameterSet;
-}
+ class ParameterSetDescription;
+} // namespace edm
class StripClusterizerAlgorithm;
#include
@@ -12,5 +13,6 @@ class StripClusterizerAlgorithm;
class StripClusterizerAlgorithmFactory {
public:
static std::unique_ptr create(edm::ConsumesCollector&&, const edm::ParameterSet&);
+ static void fillDescriptions(edm::ParameterSetDescription& clusterizer);
};
#endif
diff --git a/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h b/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h
index 6c9964dd2a43f..2c4551ef40550 100644
--- a/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h
+++ b/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h
@@ -35,6 +35,7 @@ class ThreeThresholdAlgorithm final : public StripClusterizerAlgorithm {
unsigned,
unsigned,
unsigned,
+ unsigned,
bool removeApvShots,
float minGoodCharge);
@@ -58,6 +59,7 @@ class ThreeThresholdAlgorithm final : public StripClusterizerAlgorithm {
float ChannelThreshold, SeedThreshold, ClusterThresholdSquared;
uint8_t MaxSequentialHoles, MaxSequentialBad, MaxAdjacentBad;
+ unsigned MaxClusterSize;
bool RemoveApvShots;
float minGoodCharge;
};
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiStripClusterizer/plugins/BuildFile.xml
index 0343f425c91f1..27ae390132063 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/BuildFile.xml
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/BuildFile.xml
@@ -1,6 +1,10 @@
-
+
+
+
+
+
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.cc b/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.cc
new file mode 100644
index 0000000000000..5b721778284d4
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.cc
@@ -0,0 +1,60 @@
+#include
+
+#include
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+#include "ChannelLocsGPU.h"
+
+ChannelLocs::ChannelLocs(size_t size, cudaStream_t stream) : ChannelLocsBase(size) {
+ if (size > 0) {
+ input_ = cms::cuda::make_host_unique(size, stream);
+ inoff_ = cms::cuda::make_host_unique(size, stream);
+ offset_ = cms::cuda::make_host_unique(size, stream);
+ length_ = cms::cuda::make_host_unique(size, stream);
+ fedID_ = cms::cuda::make_host_unique(size, stream);
+ fedCh_ = cms::cuda::make_host_unique(size, stream);
+ detID_ = cms::cuda::make_host_unique(size, stream);
+ }
+}
+
+void ChannelLocsView::fill(const ChannelLocsGPU& c) {
+ input_ = c.input();
+ inoff_ = c.inoff();
+ offset_ = c.offset();
+ length_ = c.length();
+ fedID_ = c.fedID();
+ fedCh_ = c.fedCh();
+ detID_ = c.detID();
+ size_ = c.size();
+}
+
+ChannelLocsGPU::ChannelLocsGPU(size_t size, cudaStream_t stream) : ChannelLocsBase(size) {
+ if (size > 0) {
+ input_ = cms::cuda::make_device_unique(size, stream);
+ inoff_ = cms::cuda::make_device_unique(size, stream);
+ offset_ = cms::cuda::make_device_unique(size, stream);
+ length_ = cms::cuda::make_device_unique(size, stream);
+ fedID_ = cms::cuda::make_device_unique(size, stream);
+ fedCh_ = cms::cuda::make_device_unique(size, stream);
+ detID_ = cms::cuda::make_device_unique(size, stream);
+
+ auto channelLocsView = cms::cuda::make_host_unique(stream);
+ channelLocsView->fill(*this);
+ channelLocsViewGPU_ = cms::cuda::make_device_unique(stream);
+ cms::cuda::copyAsync(channelLocsViewGPU_, channelLocsView, stream);
+ }
+}
+
+void ChannelLocsGPU::setVals(const ChannelLocs* c,
+ cms::cuda::host::unique_ptr inputGPU,
+ cudaStream_t stream) {
+ assert(c->size() == size_);
+ cms::cuda::copyAsync(input_, inputGPU, size_, stream);
+ cms::cuda::copyAsync(inoff_, c->inoff_, size_, stream);
+ cms::cuda::copyAsync(offset_, c->offset_, size_, stream);
+ cms::cuda::copyAsync(length_, c->length_, size_, stream);
+ cms::cuda::copyAsync(fedID_, c->fedID_, size_, stream);
+ cms::cuda::copyAsync(fedCh_, c->fedCh_, size_, stream);
+ cms::cuda::copyAsync(detID_, c->detID_, size_, stream);
+}
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.h b/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.h
new file mode 100644
index 0000000000000..343d5a536f035
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.h
@@ -0,0 +1,138 @@
+#ifndef RecoLocalTracker_SiStripClusterizer_plugins_ChannelLocsGPU_h
+#define RecoLocalTracker_SiStripClusterizer_plugins_ChannelLocsGPU_h
+
+#include
+#include
+
+#include
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
+
+class ChannelLocsGPU;
+
+template class T>
+class ChannelLocsBase {
+public:
+ ChannelLocsBase(size_t size) : size_(size) {}
+ virtual ~ChannelLocsBase() = default;
+
+ ChannelLocsBase(ChannelLocsBase&& arg)
+ : input_(std::move(arg.input_)),
+ inoff_(std::move(arg.inoff_)),
+ offset_(std::move(arg.offset_)),
+ length_(std::move(arg.length_)),
+ fedID_(std::move(arg.fedID_)),
+ fedCh_(std::move(arg.fedCh_)),
+ detID_(std::move(arg.detID_)),
+ size_(arg.size_) {}
+
+ void setChannelLoc(uint32_t index,
+ const uint8_t* input,
+ size_t inoff,
+ size_t offset,
+ uint16_t length,
+ stripgpu::fedId_t fedID,
+ stripgpu::fedCh_t fedCh,
+ stripgpu::detId_t detID) {
+ input_[index] = input;
+ inoff_[index] = inoff;
+ offset_[index] = offset;
+ length_[index] = length;
+ fedID_[index] = fedID;
+ fedCh_[index] = fedCh;
+ detID_[index] = detID;
+ }
+
+ size_t size() const { return size_; }
+
+ const uint8_t* input(uint32_t index) const { return input_[index]; }
+ size_t inoff(uint32_t index) const { return inoff_[index]; }
+ size_t offset(uint32_t index) const { return offset_[index]; }
+ uint16_t length(uint32_t index) const { return length_[index]; }
+ stripgpu::fedId_t fedID(uint32_t index) const { return fedID_[index]; }
+ stripgpu::fedCh_t fedCh(uint32_t index) const { return fedCh_[index]; }
+ stripgpu::detId_t detID(uint32_t index) const { return detID_[index]; }
+
+ const uint8_t* const* input() const { return input_.get(); }
+ size_t* inoff() const { return inoff_.get(); }
+ size_t* offset() const { return offset_.get(); }
+ uint16_t* length() const { return length_.get(); }
+ stripgpu::fedId_t* fedID() const { return fedID_.get(); }
+ stripgpu::fedCh_t* fedCh() const { return fedCh_.get(); }
+ stripgpu::detId_t* detID() const { return detID_.get(); }
+
+protected:
+ T input_; // input raw data for channel
+ T inoff_; // offset in input raw data
+ T offset_; // global offset in alldata
+ T length_; // length of channel data
+ T fedID_;
+ T fedCh_;
+ T detID_;
+ size_t size_ = 0;
+};
+
+class ChannelLocs : public ChannelLocsBase {
+ friend class ChannelLocsGPU;
+
+public:
+ ChannelLocs(size_t size, cudaStream_t stream);
+ ChannelLocs(ChannelLocs&& arg) : ChannelLocsBase(std::move(arg)) {}
+
+ ChannelLocs(ChannelLocs&) = delete;
+ ChannelLocs(const ChannelLocs&) = delete;
+ ChannelLocs& operator=(const ChannelLocs&) = delete;
+ ChannelLocs& operator=(ChannelLocs&&) = delete;
+
+ ~ChannelLocs() override = default;
+};
+
+class ChannelLocsView {
+public:
+ void fill(const ChannelLocsGPU& c);
+
+ __device__ size_t size() const { return size_; }
+
+ __device__ const uint8_t* input(uint32_t index) const { return input_[index]; }
+ __device__ size_t inoff(uint32_t index) const { return inoff_[index]; }
+ __device__ size_t offset(uint32_t index) const { return offset_[index]; }
+ __device__ uint16_t length(uint32_t index) const { return length_[index]; }
+ __device__ stripgpu::fedId_t fedID(uint32_t index) const { return fedID_[index]; }
+ __device__ stripgpu::fedCh_t fedCh(uint32_t index) const { return fedCh_[index]; }
+ __device__ stripgpu::detId_t detID(uint32_t index) const { return detID_[index]; }
+
+private:
+ const uint8_t* const* input_; // input raw data for channel
+ size_t* inoff_; // offset in input raw data
+ size_t* offset_; // global offset in alldata
+ uint16_t* length_; // length of channel data
+ stripgpu::fedId_t* fedID_;
+ stripgpu::fedCh_t* fedCh_;
+ stripgpu::detId_t* detID_;
+ size_t size_;
+};
+
+class ChannelLocsGPU : public ChannelLocsBase {
+public:
+ //using Base = ChannelLocsBase;
+ ChannelLocsGPU(size_t size, cudaStream_t stream);
+ ChannelLocsGPU(ChannelLocsGPU&& arg)
+ : ChannelLocsBase(std::move(arg)), channelLocsViewGPU_(std::move(arg.channelLocsViewGPU_)) {}
+
+ ChannelLocsGPU(ChannelLocsGPU&) = delete;
+ ChannelLocsGPU(const ChannelLocsGPU&) = delete;
+ ChannelLocsGPU& operator=(const ChannelLocsGPU&) = delete;
+ ChannelLocsGPU& operator=(ChannelLocsGPU&&) = delete;
+
+ ~ChannelLocsGPU() override = default;
+
+ void setVals(const ChannelLocs* c, cms::cuda::host::unique_ptr inputGPU, cudaStream_t stream);
+ const ChannelLocsView* channelLocsView() const { return channelLocsViewGPU_.get(); }
+
+private:
+ cms::cuda::device::unique_ptr channelLocsViewGPU_;
+};
+
+#endif
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc
index ce456617dbc22..4e6c929014985 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc
@@ -17,6 +17,8 @@
#include "FWCore/Framework/interface/stream/EDProducer.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/Utilities/interface/InputTag.h"
#include "FWCore/Framework/interface/Event.h"
#include "FWCore/Framework/interface/EventSetup.h"
@@ -173,9 +175,8 @@ class SiStripClusterizerFromRaw final : public edm::stream::EDProducer<> {
conf.getParameter("Clusterizer"))),
rawAlgos_(SiStripRawProcessingFactory::create(conf.getParameter("Algorithms"),
consumesCollector())),
- doAPVEmulatorCheck_(conf.existsAs("DoAPVEmulatorCheck") ? conf.getParameter("DoAPVEmulatorCheck")
- : true),
- legacy_(conf.existsAs("LegacyUnpacker") ? conf.getParameter("LegacyUnpacker") : false),
+ doAPVEmulatorCheck_(conf.getParameter("DoAPVEmulatorCheck")),
+ legacy_(conf.getParameter("LegacyUnpacker")),
hybridZeroSuppressed_(conf.getParameter("HybridZeroSuppressed")) {
productToken_ = consumes(conf.getParameter("ProductLabel"));
produces >();
@@ -211,6 +212,8 @@ class SiStripClusterizerFromRaw final : public edm::stream::EDProducer<> {
ev.put(std::move(output));
}
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
private:
void initialize(const edm::EventSetup& es);
@@ -231,6 +234,27 @@ class SiStripClusterizerFromRaw final : public edm::stream::EDProducer<> {
bool hybridZeroSuppressed_;
};
+void SiStripClusterizerFromRaw::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("ProductLabel", edm::InputTag("rawDataCollector"));
+ desc.add("ConditionsLabel", "");
+ desc.add("onDemand", true);
+ desc.add("DoAPVEmulatorCheck", true);
+ desc.add("LegacyUnpacker", false);
+ desc.add("HybridZeroSuppressed", false);
+
+ edm::ParameterSetDescription clusterizer;
+ StripClusterizerAlgorithmFactory::fillDescriptions(clusterizer);
+ desc.add("Clusterizer", clusterizer);
+
+ edm::ParameterSetDescription algorithms;
+ SiStripRawProcessingFactory::fillDescriptions(algorithms);
+ desc.add("Algorithms", algorithms);
+
+ descriptions.addWithDefaultLabel(desc);
+}
+
#include "FWCore/Framework/interface/MakerMacros.h"
DEFINE_FWK_MODULE(SiStripClusterizerFromRaw);
@@ -250,7 +274,6 @@ void SiStripClusterizerFromRaw::run(const FEDRawDataCollection& rawColl, edmNew:
if (record.empty())
record.abort();
-
} // end loop over dets
}
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc
new file mode 100644
index 0000000000000..2d7b4c83a4a4a
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc
@@ -0,0 +1,181 @@
+/*
+ */
+#include "RecoLocalTracker/Records/interface/SiStripClusterizerConditionsRcd.h"
+
+#include "RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h"
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "EventFilter/SiStripRawToDigi/interface/SiStripFEDBuffer.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditions.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Utilities/interface/Likely.h"
+
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "SiStripRawToClusterGPUKernel.h"
+#include "ChannelLocsGPU.h"
+
+//#include
+#include
+#include
+
+namespace {
+ std::unique_ptr fillBuffer(int fedId, const FEDRawData& rawData) {
+ std::unique_ptr buffer;
+
+ // Check on FEDRawData pointer
+ const auto st_buffer = sistrip::preconstructCheckFEDBuffer(rawData);
+ if UNLIKELY (sistrip::FEDBufferStatusCode::SUCCESS != st_buffer) {
+ LogDebug(sistrip::mlRawToCluster_) << "[ClustersFromRawProducer::" << __func__ << "]" << st_buffer
+ << " for FED ID " << fedId;
+ return buffer;
+ }
+ buffer = std::make_unique(rawData);
+ const auto st_chan = buffer->findChannels();
+ if UNLIKELY (sistrip::FEDBufferStatusCode::SUCCESS != st_chan) {
+ LogDebug(sistrip::mlRawToCluster_) << "Exception caught when creating FEDBuffer object for FED " << fedId << ": "
+ << st_chan;
+ buffer.reset();
+ return buffer;
+ }
+ if UNLIKELY (!buffer->doChecks(false)) {
+ LogDebug(sistrip::mlRawToCluster_) << "Exception caught when creating FEDBuffer object for FED " << fedId
+ << ": FED Buffer check fails";
+ buffer.reset();
+ return buffer;
+ }
+
+ return buffer;
+ }
+} // namespace
+
+class SiStripClusterizerFromRawGPU final : public edm::stream::EDProducer {
+public:
+ explicit SiStripClusterizerFromRawGPU(const edm::ParameterSet& conf)
+ : buffers_(sistrip::FED_ID_MAX),
+ raw_(sistrip::FED_ID_MAX),
+ gpuAlgo_(conf.getParameter("Clusterizer")),
+ inputToken_(consumes(conf.getParameter("ProductLabel"))),
+ outputToken_(produces>()),
+ conditionsToken_(esConsumes(edm::ESInputTag{"", conf.getParameter("ConditionsLabel")})),
+ cpuConditionsToken_(esConsumes(edm::ESInputTag{"", conf.getParameter("ConditionsLabel")})) {}
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+ void acquire(edm::Event const& ev,
+ edm::EventSetup const& es,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder) override {
+ const auto& conditions = es.getData(conditionsToken_); //these need to be GPU conditions
+ const auto& cpuConditions = es.getData(cpuConditionsToken_); //CPU conditions
+
+ // Sets the current device and creates a CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{ev.streamID(), std::move(waitingTaskHolder), ctxState_};
+
+ // get raw data
+ auto const& rawData = ev.get(inputToken_);
+ run(rawData, cpuConditions);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(raw_, buffers_, conditions, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+ }
+
+ void produce(edm::Event& ev, const edm::EventSetup& es) override {
+ cms::cuda::ScopedContextProduce ctx{ctxState_};
+
+ // Now getResult() returns data in GPU memory that is passed to the
+ // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
+ // OutputData to cms::cuda::Product. cms::cuda::Product stores also
+ // the current device and the CUDA stream since those will be needed
+ // in the consumer side.
+ ctx.emplace(ev, outputToken_, gpuAlgo_.getResults(ctx.stream()));
+
+ for (auto& buf : buffers_)
+ buf.reset(nullptr);
+ }
+
+private:
+ void run(const FEDRawDataCollection& rawColl, const SiStripClusterizerConditions& conditions);
+ void fill(uint32_t idet, const FEDRawDataCollection& rawColl, const SiStripClusterizerConditions& conditions);
+
+private:
+ std::vector> buffers_;
+ std::vector raw_;
+ cms::cuda::ContextState ctxState_;
+
+ stripgpu::SiStripRawToClusterGPUKernel gpuAlgo_;
+
+ edm::EDGetTokenT inputToken_;
+ edm::EDPutTokenT> outputToken_;
+ edm::ESGetToken conditionsToken_;
+ edm::ESGetToken cpuConditionsToken_;
+};
+
+void SiStripClusterizerFromRawGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("ProductLabel", edm::InputTag("rawDataCollector"));
+ desc.add("ConditionsLabel", "");
+
+ edm::ParameterSetDescription clusterizer;
+ StripClusterizerAlgorithmFactory::fillDescriptions(clusterizer);
+ desc.add("Clusterizer", clusterizer);
+
+ descriptions.addWithDefaultLabel(desc);
+}
+
+void SiStripClusterizerFromRawGPU::run(const FEDRawDataCollection& rawColl,
+ const SiStripClusterizerConditions& conditions) {
+ // loop over good det in cabling
+ for (auto idet : conditions.allDetIds()) {
+ fill(idet, rawColl, conditions);
+ } // end loop over dets
+}
+
+void SiStripClusterizerFromRawGPU::fill(uint32_t idet,
+ const FEDRawDataCollection& rawColl,
+ const SiStripClusterizerConditions& conditions) {
+ auto const& det = conditions.findDetId(idet);
+ if (!det.valid())
+ return;
+
+ // Loop over apv-pairs of det
+ for (auto const conn : conditions.currentConnection(det)) {
+ if UNLIKELY (!conn)
+ continue;
+
+ const uint16_t fedId = conn->fedId();
+
+ // If fed id is null or connection is invalid continue
+ if UNLIKELY (!fedId || !conn->isConnected()) {
+ continue;
+ }
+
+ // If Fed hasnt already been initialised, extract data and initialise
+ sistrip::FEDBuffer* buffer = buffers_[fedId].get();
+ if (!buffer) {
+ const FEDRawData& rawData = rawColl.FEDData(fedId);
+ raw_[fedId] = &rawData;
+ buffers_[fedId] = fillBuffer(fedId, rawData);
+ }
+ } // end loop over conn
+}
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiStripClusterizerFromRawGPU);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc
new file mode 100644
index 0000000000000..67f4f3152dd9c
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc
@@ -0,0 +1,64 @@
+/**\class SiStripClusterizerConditionsGPUESProducer
+ *
+ * Create a GPU cache object for fast access to conditions needed by the SiStrip clusterizer
+ *
+ * @see SiStripClusterizerConditions
+ */
+#include
+
+#include "FWCore/Framework/interface/ModuleFactory.h"
+#include "FWCore/Framework/interface/ESProducer.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+
+#include "RecoLocalTracker/Records/interface/SiStripClusterizerConditionsRcd.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripGain.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripDetCabling.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+
+using namespace stripgpu;
+
+class SiStripClusterizerConditionsGPUESProducer : public edm::ESProducer {
+public:
+ SiStripClusterizerConditionsGPUESProducer(const edm::ParameterSet&);
+ ~SiStripClusterizerConditionsGPUESProducer() override {}
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+ using ReturnType = std::unique_ptr;
+ ReturnType produce(const SiStripClusterizerConditionsRcd&);
+
+private:
+ edm::ESGetToken gainToken_;
+ edm::ESGetToken noisesToken_;
+ edm::ESGetToken qualityToken_;
+};
+
+SiStripClusterizerConditionsGPUESProducer::SiStripClusterizerConditionsGPUESProducer(const edm::ParameterSet& iConfig) {
+ auto cc = setWhatProduced(this, iConfig.getParameter("Label"));
+
+ gainToken_ = cc.consumesFrom();
+ noisesToken_ = cc.consumesFrom();
+ qualityToken_ = cc.consumesFrom(
+ edm::ESInputTag{"", iConfig.getParameter("QualityLabel")});
+}
+
+void SiStripClusterizerConditionsGPUESProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+ desc.add("QualityLabel", "");
+ desc.add("Label", "");
+ descriptions.add("SiStripClusterizerConditionsGPUESProducer", desc);
+}
+
+SiStripClusterizerConditionsGPUESProducer::ReturnType SiStripClusterizerConditionsGPUESProducer::produce(
+ const SiStripClusterizerConditionsRcd& iRecord) {
+ auto gainsH = iRecord.getTransientHandle(gainToken_);
+ const auto& noises = iRecord.get(noisesToken_);
+ const auto& quality = iRecord.get(qualityToken_);
+
+ return std::make_unique(quality, gainsH.product(), noises);
+}
+
+DEFINE_FWK_EVENTSETUP_MODULE(SiStripClusterizerConditionsGPUESProducer);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc
new file mode 100644
index 0000000000000..8b891382085c7
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc
@@ -0,0 +1,83 @@
+/*
+ */
+#include "DataFormats/SiStripCluster/interface/SiStripCluster.h"
+#include "DataFormats/Common/interface/DetSetVectorNew.h"
+
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+
+#include
+
+class SiStripClustersFromSOA final : public edm::stream::EDProducer<> {
+public:
+ explicit SiStripClustersFromSOA(const edm::ParameterSet& conf)
+ : inputToken_(consumes(conf.getParameter("ProductLabel"))),
+ outputToken_(produces>()) {}
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("ProductLabel", edm::InputTag("siStripClustersSOAtoHost"));
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ void produce(edm::Event& ev, const edm::EventSetup& es) override {
+ const auto& clust_data = ev.get(inputToken_);
+
+ const int nSeedStripsNC = clust_data.nClusters();
+ const auto clusterSize = clust_data.clusterSize().get();
+ const auto clusterADCs = clust_data.clusterADCs().get();
+ const auto detIDs = clust_data.clusterDetId().get();
+ const auto stripIDs = clust_data.firstStrip().get();
+ const auto trueCluster = clust_data.trueCluster().get();
+
+ const unsigned int initSeedStripsSize = 15000;
+
+ using out_t = edmNew::DetSetVector;
+ auto output{std::make_unique(edmNew::DetSetVector())};
+ output->reserve(initSeedStripsSize, nSeedStripsNC);
+
+ std::vector adcs;
+
+ for (int i = 0; i < nSeedStripsNC;) {
+ const auto detid = detIDs[i];
+ out_t::FastFiller record(*output, detid);
+
+ while (i < nSeedStripsNC && detIDs[i] == detid) {
+ if (trueCluster[i]) {
+ const auto size = clusterSize[i];
+ const auto firstStrip = stripIDs[i];
+
+ adcs.clear();
+ adcs.reserve(size);
+
+ for (uint32_t j = 0; j < size; ++j) {
+ adcs.push_back(clusterADCs[i + j * nSeedStripsNC]);
+ }
+ record.push_back(SiStripCluster(firstStrip, std::move(adcs)));
+ }
+ i++;
+ }
+ }
+
+ output->shrink_to_fit();
+ ev.put(std::move(output));
+ }
+
+private:
+ edm::EDGetTokenT inputToken_;
+ edm::EDPutTokenT> outputToken_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiStripClustersFromSOA);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc
new file mode 100644
index 0000000000000..a51bb1216d0e0
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc
@@ -0,0 +1,76 @@
+/*
+ */
+#include "DataFormats/SiStripCluster/interface/SiStripCluster.h"
+
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+
+#include
+
+class SiStripSOAtoHost {
+public:
+ SiStripSOAtoHost() = default;
+ void makeAsync(const SiStripClustersCUDADevice& clusters_d, cudaStream_t stream) {
+ hostView_ = std::make_unique(clusters_d, stream);
+ }
+ std::unique_ptr getResults() { return std::move(hostView_); }
+
+private:
+ std::unique_ptr hostView_;
+};
+
+class SiStripClustersSOAtoHost final : public edm::stream::EDProducer {
+public:
+ explicit SiStripClustersSOAtoHost(const edm::ParameterSet& conf)
+ : inputToken_(
+ consumes>(conf.getParameter("ProductLabel"))),
+ outputToken_(produces()) {}
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("ProductLabel", edm::InputTag("siStripClusterizerFromRawGPU"));
+ descriptions.addWithDefaultLabel(desc);
+ }
+
+private:
+ void acquire(edm::Event const& ev,
+ edm::EventSetup const& es,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder) override {
+ const auto& wrapper = ev.get(inputToken_);
+
+ // Sets the current device and creates a CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{wrapper, std::move(waitingTaskHolder)};
+
+ const auto& input = ctx.get(wrapper);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(input, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+ }
+
+ void produce(edm::Event& ev, const edm::EventSetup& es) override { ev.put(gpuAlgo_.getResults()); }
+
+private:
+ SiStripSOAtoHost gpuAlgo_;
+
+ edm::EDGetTokenT> inputToken_;
+ edm::EDPutTokenT outputToken_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiStripClustersSOAtoHost);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc
new file mode 100644
index 0000000000000..a8b9aa04a00b3
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc
@@ -0,0 +1,185 @@
+#include "EventFilter/SiStripRawToDigi/interface/SiStripFEDBuffer.h"
+#include "DataFormats/Common/interface/DetSetVectorNew.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "RecoLocalTracker/SiStripClusterizer/interface/ClusterChargeCut.h"
+
+#include "SiStripRawToClusterGPUKernel.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+#include "ChannelLocsGPU.h"
+#include "StripDataView.h"
+
+namespace stripgpu {
+ StripDataGPU::StripDataGPU(size_t size, cudaStream_t stream) {
+ alldataGPU_ = cms::cuda::make_device_unique(size, stream);
+ channelGPU_ = cms::cuda::make_device_unique(size, stream);
+ stripIdGPU_ = cms::cuda::make_device_unique(size, stream);
+ }
+
+ SiStripRawToClusterGPUKernel::SiStripRawToClusterGPUKernel(const edm::ParameterSet& conf)
+ : fedIndex_(sistrip::NUMBER_OF_FEDS, stripgpu::invalidFed),
+ channelThreshold_(conf.getParameter("ChannelThreshold")),
+ seedThreshold_(conf.getParameter("SeedThreshold")),
+ clusterThresholdSquared_(std::pow(conf.getParameter("ClusterThreshold"), 2.0f)),
+ maxSequentialHoles_(conf.getParameter("MaxSequentialHoles")),
+ maxSequentialBad_(conf.getParameter("MaxSequentialBad")),
+ maxAdjacentBad_(conf.getParameter("MaxAdjacentBad")),
+ maxClusterSize_(conf.getParameter("MaxClusterSize")),
+ minGoodCharge_(clusterChargeCut(conf)) {
+ fedRawDataOffsets_.reserve(sistrip::NUMBER_OF_FEDS);
+ }
+
+ void SiStripRawToClusterGPUKernel::makeAsync(const std::vector& rawdata,
+ const std::vector>& buffers,
+ const SiStripClusterizerConditionsGPU& conditions,
+ cudaStream_t stream) {
+ size_t totalSize{0};
+ for (const auto& buff : buffers) {
+ if (buff != nullptr) {
+ totalSize += buff->bufferSize();
+ }
+ }
+
+ auto fedRawDataHost = cms::cuda::make_host_unique(totalSize, stream);
+ auto fedRawDataGPU = cms::cuda::make_device_unique(totalSize, stream);
+
+ size_t off = 0;
+ fedRawDataOffsets_.clear();
+ fedIndex_.clear();
+ fedIndex_.resize(sistrip::NUMBER_OF_FEDS, stripgpu::invalidFed);
+
+ sistrip::FEDReadoutMode mode = sistrip::READOUT_MODE_INVALID;
+
+ for (size_t fedi = 0; fedi < buffers.size(); ++fedi) {
+ auto& buff = buffers[fedi];
+ if (buff != nullptr) {
+ const auto raw = rawdata[fedi];
+ memcpy(fedRawDataHost.get() + off, raw->data(), raw->size());
+ fedIndex_[stripgpu::fedIndex(fedi)] = fedRawDataOffsets_.size();
+ fedRawDataOffsets_.push_back(off);
+ off += raw->size();
+ if (fedRawDataOffsets_.size() == 1) {
+ mode = buff->readoutMode();
+ } else {
+ if (buff->readoutMode() != mode) {
+ throw cms::Exception("[SiStripRawToClusterGPUKernel] inconsistent readout mode ")
+ << buff->readoutMode() << " != " << mode;
+ }
+ }
+ }
+ }
+ // send rawdata to GPU
+ cms::cuda::copyAsync(fedRawDataGPU, fedRawDataHost, totalSize, stream);
+
+ const auto& detmap = conditions.detToFeds();
+ if ((mode != sistrip::READOUT_MODE_ZERO_SUPPRESSED) && (mode != sistrip::READOUT_MODE_ZERO_SUPPRESSED_LITE10)) {
+ throw cms::Exception("[SiStripRawToClusterGPUKernel] unsupported readout mode ") << mode;
+ }
+ const uint16_t headerlen = mode == sistrip::READOUT_MODE_ZERO_SUPPRESSED ? 7 : 2;
+ size_t offset = 0;
+ auto chanlocs = std::make_unique(detmap.size(), stream);
+ auto inputGPU = cms::cuda::make_host_unique(chanlocs->size(), stream);
+
+ // iterate over the detector in DetID/APVPair order
+ // mapping out where the data are
+ for (size_t i = 0; i < detmap.size(); ++i) {
+ const auto& detp = detmap[i];
+ const auto fedId = detp.fedID();
+ const auto fedCh = detp.fedCh();
+ const auto fedi = fedIndex_[stripgpu::fedIndex(fedId)];
+
+ if (fedi != invalidFed) {
+ const auto buffer = buffers[fedId].get();
+ const auto& channel = buffer->channel(detp.fedCh());
+
+ auto len = channel.length();
+ auto off = channel.offset();
+
+ assert(len >= headerlen || len == 0);
+
+ if (len >= headerlen) {
+ len -= headerlen;
+ off += headerlen;
+ }
+
+ chanlocs->setChannelLoc(i, channel.data(), off, offset, len, fedId, fedCh, detp.detID());
+ inputGPU[i] = fedRawDataGPU.get() + fedRawDataOffsets_[fedi] + (channel.data() - rawdata[fedId]->data());
+ offset += len;
+
+ } else {
+ chanlocs->setChannelLoc(i, nullptr, 0, 0, 0, invalidFed, 0, invalidDet);
+ inputGPU[i] = nullptr;
+ }
+ }
+
+ const auto n_strips = offset;
+
+ sst_data_d_ = cms::cuda::make_host_unique(stream);
+ sst_data_d_->nStrips = n_strips;
+
+ chanlocsGPU_ = std::make_unique(detmap.size(), stream);
+ chanlocsGPU_->setVals(chanlocs.get(), std::move(inputGPU), stream);
+
+ stripdata_ = std::make_unique(n_strips, stream);
+
+ const auto& condGPU = conditions.getGPUProductAsync(stream);
+
+ unpackChannelsGPU(condGPU.deviceView(), stream);
+#ifdef GPU_CHECK
+ cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+#ifdef EDM_ML_DEBUG
+ auto outdata = cms::cuda::make_host_unique(n_strips, stream);
+ cms::cuda::copyAsync(outdata, stripdata_->alldataGPU_, n_strips, stream);
+ cudaCheck(cudaStreamSynchronize(stream));
+
+ constexpr int xor3bits = 7;
+ for (size_t i = 0; i < chanlocs->size(); ++i) {
+ const auto data = chanlocs->input(i);
+ const auto len = chanlocs->length(i);
+
+ if (data != nullptr && len > 0) {
+ auto aoff = chanlocs->offset(i);
+ auto choff = chanlocs->inoff(i);
+ const auto end = choff + len;
+
+ while (choff < end) {
+ const auto stripIndex = data[choff++ ^ xor3bits];
+ const auto groupLength = data[choff++ ^ xor3bits];
+ aoff += 2;
+ for (auto k = 0; k < groupLength; ++k, ++choff, ++aoff) {
+ if (data[choff ^ xor3bits] != outdata[aoff]) {
+ LogDebug("SiStripRawToClusterGPUKernel")
+ << "Strip mismatch " << stripIndex << " i:k " << i << ":" << k << " "
+ << (uint32_t)data[choff ^ xor3bits] << " != " << (uint32_t)outdata[aoff] << std::endl;
+ }
+ }
+ }
+ }
+ }
+ outdata.reset(nullptr);
+#endif
+
+ fedRawDataGPU.reset();
+ allocateSSTDataGPU(n_strips, stream);
+ setSeedStripsNCIndexGPU(condGPU.deviceView(), stream);
+
+ clusters_d_ = SiStripClustersCUDADevice(kMaxSeedStrips, maxClusterSize_, stream);
+ findClusterGPU(condGPU.deviceView(), stream);
+
+ stripdata_.reset();
+ }
+
+ SiStripClustersCUDADevice SiStripRawToClusterGPUKernel::getResults(cudaStream_t stream) {
+ reset();
+
+ return std::move(clusters_d_);
+ }
+
+ void SiStripRawToClusterGPUKernel::reset() {
+ chanlocsGPU_.reset();
+ sst_data_d_.reset();
+ }
+} // namespace stripgpu
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu
new file mode 100644
index 0000000000000..c88d5a60ec86b
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu
@@ -0,0 +1,581 @@
+#include
+
+#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+
+#include "ChannelLocsGPU.h"
+#include "SiStripRawToClusterGPUKernel.h"
+#include "StripDataView.h"
+
+//#define GPU_DEBUG
+#if defined(EDM_ML_DEBUG) || defined(GPU_DEBUG)
+#define GPU_CHECK
+#include
+#endif
+
+using namespace stripgpu;
+using ConditionsDeviceView = SiStripClusterizerConditionsGPU::Data::DeviceView;
+
+__global__ static void unpackChannels(const ChannelLocsView *chanlocs,
+ const ConditionsDeviceView *conditions,
+ uint8_t *alldata,
+ uint16_t *channel,
+ stripId_t *stripId) {
+ const int tid = threadIdx.x;
+ const int bid = blockIdx.x;
+ const int nthreads = blockDim.x;
+
+ const auto first = nthreads * bid + tid;
+ const auto stride = blockDim.x * gridDim.x;
+ for (auto chan = first; chan < chanlocs->size(); chan += stride) {
+ const auto fedid = chanlocs->fedID(chan);
+ const auto fedch = chanlocs->fedCh(chan);
+ const auto ipair = conditions->iPair(fedid, fedch);
+ const auto ipoff = sistrip::STRIPS_PER_FEDCH * ipair;
+
+ const auto data = chanlocs->input(chan);
+ const auto len = chanlocs->length(chan);
+
+ if (data != nullptr && len > 0) {
+ auto aoff = chanlocs->offset(chan);
+ auto choff = chanlocs->inoff(chan);
+ const auto end = choff + len;
+
+ while (choff < end) {
+ auto stripIndex = data[(choff++) ^ 7] + ipoff;
+ const auto groupLength = data[(choff++) ^ 7];
+
+ for (auto i = 0; i < 2; ++i) {
+ stripId[aoff] = invalidStrip;
+ alldata[aoff++] = 0;
+ }
+
+ for (auto i = 0; i < groupLength; ++i) {
+ stripId[aoff] = stripIndex++;
+ channel[aoff] = chan;
+ alldata[aoff++] = data[(choff++) ^ 7];
+ }
+ }
+ } // choff < end
+ } // data != nullptr && len > 0
+} // chan < chanlocs->size()
+
+__global__ static void setSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
+ const int nStrips = sst_data_d->nStrips;
+ const auto __restrict__ chanlocs = sst_data_d->chanlocs;
+ const uint8_t *__restrict__ adc = sst_data_d->adc;
+ const uint16_t *__restrict__ channels = sst_data_d->channel;
+ const uint16_t *__restrict__ stripId = sst_data_d->stripId;
+ int *__restrict__ seedStripsMask = sst_data_d->seedStripsMask;
+ int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
+ const float seedThreshold = sst_data_d->seedThreshold;
+
+ const int tid = threadIdx.x;
+ const int bid = blockIdx.x;
+ const int nthreads = blockDim.x;
+ const int first = nthreads * bid + tid;
+ const int stride = blockDim.x * gridDim.x;
+
+ for (int i = first; i < nStrips; i += stride) {
+ seedStripsMask[i] = 0;
+ seedStripsNCMask[i] = 0;
+ const stripId_t strip = stripId[i];
+ if (strip != invalidStrip) {
+ const auto chan = channels[i];
+ const fedId_t fed = chanlocs->fedID(chan);
+ const fedCh_t channel = chanlocs->fedCh(chan);
+ const float noise_i = conditions->noise(fed, channel, strip);
+ const uint8_t adc_i = adc[i];
+
+ seedStripsMask[i] = (adc_i >= static_cast(noise_i * seedThreshold)) ? 1 : 0;
+ seedStripsNCMask[i] = seedStripsMask[i];
+ }
+ }
+}
+
+__global__ static void setNCSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
+ const int nStrips = sst_data_d->nStrips;
+ const auto __restrict__ chanlocs = sst_data_d->chanlocs;
+ const uint16_t *__restrict__ channels = sst_data_d->channel;
+ const uint16_t *__restrict__ stripId = sst_data_d->stripId;
+ const int *__restrict__ seedStripsMask = sst_data_d->seedStripsMask;
+ int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
+
+ const int tid = threadIdx.x;
+ const int bid = blockIdx.x;
+ const int nthreads = blockDim.x;
+ const int first = nthreads * bid + tid;
+ const int stride = blockDim.x * gridDim.x;
+
+ for (int i = first; i < nStrips; i += stride) {
+ if (i > 0) {
+ const auto detid = chanlocs->detID(channels[i]);
+ const auto detid1 = chanlocs->detID(channels[i - 1]);
+
+ if (seedStripsMask[i] && seedStripsMask[i - 1] && (stripId[i] - stripId[i - 1]) == 1 && (detid == detid1))
+ seedStripsNCMask[i] = 0;
+ }
+ }
+}
+
+__global__ static void setStripIndexGPU(StripDataView *sst_data_d) {
+ const int nStrips = sst_data_d->nStrips;
+ const int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
+ const int *__restrict__ prefixSeedStripsNCMask = sst_data_d->prefixSeedStripsNCMask;
+ int *__restrict__ seedStripsNCIndex = sst_data_d->seedStripsNCIndex;
+
+ const int tid = threadIdx.x;
+ const int bid = blockIdx.x;
+ const int nthreads = blockDim.x;
+ const int first = nthreads * bid + tid;
+ const int stride = blockDim.x * gridDim.x;
+
+ for (int i = first; i < nStrips; i += stride) {
+ if (seedStripsNCMask[i] == 1) {
+ const int index = prefixSeedStripsNCMask[i];
+ seedStripsNCIndex[index] = i;
+ }
+ }
+}
+
+__global__ static void findLeftRightBoundaryGPU(const StripDataView *sst_data_d,
+ const ConditionsDeviceView *conditions,
+ SiStripClustersCUDADevice::DeviceView *clust_data_d) {
+ const int nStrips = sst_data_d->nStrips;
+ const int *__restrict__ seedStripsNCIndex = sst_data_d->seedStripsNCIndex;
+ const auto __restrict__ chanlocs = sst_data_d->chanlocs;
+ const uint16_t *__restrict__ stripId = sst_data_d->stripId;
+ const uint16_t *__restrict__ channels = sst_data_d->channel;
+ const uint8_t *__restrict__ adc = sst_data_d->adc;
+ const int nSeedStripsNC = std::min(kMaxSeedStrips, *(sst_data_d->prefixSeedStripsNCMask + nStrips - 1));
+ const uint8_t maxSequentialHoles = sst_data_d->maxSequentialHoles;
+ const float channelThreshold = sst_data_d->channelThreshold;
+ const float clusterThresholdSquared = sst_data_d->clusterThresholdSquared;
+ const int clusterSizeLimit = sst_data_d->clusterSizeLimit;
+
+ auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
+ auto __restrict__ clusterSize = clust_data_d->clusterSize_;
+ auto __restrict__ clusterDetId = clust_data_d->clusterDetId_;
+ auto __restrict__ firstStrip = clust_data_d->firstStrip_;
+ auto __restrict__ trueCluster = clust_data_d->trueCluster_;
+
+ const int tid = threadIdx.x;
+ const int bid = blockIdx.x;
+ const int nthreads = blockDim.x;
+ const int first = nthreads * bid + tid;
+ const int stride = blockDim.x * gridDim.x;
+
+ for (int i = first; i < nSeedStripsNC; i += stride) {
+ const auto index = seedStripsNCIndex[i];
+ const auto chan = channels[index];
+ const auto fed = chanlocs->fedID(chan);
+ const auto channel = chanlocs->fedCh(chan);
+ const auto det = chanlocs->detID(chan);
+ const auto strip = stripId[index];
+ const auto noise_i = conditions->noise(fed, channel, strip);
+
+ auto noiseSquared_i = noise_i * noise_i;
+ float adcSum_i = static_cast(adc[index]);
+ auto testIndex = index - 1;
+ auto size = 1;
+
+ auto addtocluster = [&](int &indexLR) {
+ const auto testchan = channels[testIndex];
+ const auto testFed = chanlocs->fedID(testchan);
+ const auto testChannel = chanlocs->fedCh(testchan);
+ const auto testStrip = stripId[testIndex];
+ const auto testNoise = conditions->noise(testFed, testChannel, testStrip);
+ const auto testADC = adc[testIndex];
+
+ if (testADC >= static_cast(testNoise * channelThreshold)) {
+ ++size;
+ indexLR = testIndex;
+ noiseSquared_i += testNoise * testNoise;
+ adcSum_i += static_cast(testADC);
+ }
+ };
+
+ // find left boundary
+ auto indexLeft = index;
+
+ if (testIndex >= 0 && stripId[testIndex] == invalidStrip) {
+ testIndex -= 2;
+ }
+
+ if (testIndex >= 0) {
+ const auto testchan = channels[testIndex];
+ const auto testDet = chanlocs->detID(testchan);
+ auto rangeLeft = stripId[indexLeft] - stripId[testIndex] - 1;
+ auto sameDetLeft = det == testDet;
+
+ while (sameDetLeft && rangeLeft >= 0 && rangeLeft <= maxSequentialHoles && size < clusterSizeLimit + 1) {
+ addtocluster(indexLeft);
+ --testIndex;
+ if (testIndex >= 0 && stripId[testIndex] == invalidStrip) {
+ testIndex -= 2;
+ }
+ if (testIndex >= 0) {
+ rangeLeft = stripId[indexLeft] - stripId[testIndex] - 1;
+ const auto newchan = channels[testIndex];
+ const auto newdet = chanlocs->detID(newchan);
+ sameDetLeft = det == newdet;
+ } else {
+ sameDetLeft = false;
+ }
+ } // while loop
+ } // testIndex >= 0
+
+ // find right boundary
+ auto indexRight = index;
+ testIndex = index + 1;
+
+ if (testIndex < nStrips && stripId[testIndex] == invalidStrip) {
+ testIndex += 2;
+ }
+
+ if (testIndex < nStrips) {
+ const auto testchan = channels[testIndex];
+ const auto testDet = chanlocs->detID(testchan);
+ auto rangeRight = stripId[testIndex] - stripId[indexRight] - 1;
+ auto sameDetRight = det == testDet;
+
+ while (sameDetRight && rangeRight >= 0 && rangeRight <= maxSequentialHoles && size < clusterSizeLimit + 1) {
+ addtocluster(indexRight);
+ ++testIndex;
+ if (testIndex < nStrips && stripId[testIndex] == invalidStrip) {
+ testIndex += 2;
+ }
+ if (testIndex < nStrips) {
+ rangeRight = stripId[testIndex] - stripId[indexRight] - 1;
+ const auto newchan = channels[testIndex];
+ const auto newdet = chanlocs->detID(newchan);
+ sameDetRight = det == newdet;
+ } else {
+ sameDetRight = false;
+ }
+ } // while loop
+ } // testIndex < nStrips
+ clusterIndexLeft[i] = indexLeft;
+ clusterSize[i] = indexRight - indexLeft + 1;
+ clusterDetId[i] = det;
+ firstStrip[i] = stripId[indexLeft];
+ trueCluster[i] =
+ (noiseSquared_i * clusterThresholdSquared <= adcSum_i * adcSum_i) and (clusterSize[i] <= clusterSizeLimit);
+ } // i < nSeedStripsNC
+ if (first == 0) {
+ clust_data_d->nClusters_ = nSeedStripsNC;
+ }
+}
+
+__global__ static void checkClusterConditionGPU(StripDataView *sst_data_d,
+ const ConditionsDeviceView *conditions,
+ SiStripClustersCUDADevice::DeviceView *clust_data_d) {
+ const uint16_t *__restrict__ stripId = sst_data_d->stripId;
+ const auto __restrict__ chanlocs = sst_data_d->chanlocs;
+ const uint16_t *__restrict__ channels = sst_data_d->channel;
+ const uint8_t *__restrict__ adc = sst_data_d->adc;
+ const float minGoodCharge = sst_data_d->minGoodCharge; //1620.0;
+ const auto nSeedStripsNC = clust_data_d->nClusters_;
+ const auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
+
+ auto __restrict__ clusterSize = clust_data_d->clusterSize_;
+ auto __restrict__ clusterADCs = clust_data_d->clusterADCs_;
+ auto __restrict__ trueCluster = clust_data_d->trueCluster_;
+ auto __restrict__ barycenter = clust_data_d->barycenter_;
+ auto __restrict__ charge = clust_data_d->charge_;
+
+ constexpr uint16_t stripIndexMask = 0x7FFF;
+
+ const int tid = threadIdx.x;
+ const int bid = blockIdx.x;
+ const int nthreads = blockDim.x;
+ const int first = nthreads * bid + tid;
+ const int stride = blockDim.x * gridDim.x;
+
+ for (int i = first; i < nSeedStripsNC; i += stride) {
+ if (trueCluster[i]) {
+ const int left = clusterIndexLeft[i];
+ const int size = clusterSize[i];
+
+ if (i > 0 && clusterIndexLeft[i - 1] == left) {
+ trueCluster[i] = 0; // ignore duplicates
+ } else {
+ float adcSum = 0.0f;
+ int sumx = 0;
+ int suma = 0;
+
+ auto j = 0;
+ for (int k = 0; k < size; k++) {
+ const auto index = left + k;
+ const auto chan = channels[index];
+ const auto fed = chanlocs->fedID(chan);
+ const auto channel = chanlocs->fedCh(chan);
+ const auto strip = stripId[index];
+#ifdef GPU_CHECK
+ if (fed == invalidFed) {
+ printf("Invalid fed index %d\n", index);
+ }
+#endif
+ if (strip != invalidStrip) {
+ const float gain_j = conditions->gain(fed, channel, strip);
+
+ uint8_t adc_j = adc[index];
+ const int charge = static_cast(static_cast(adc_j) / gain_j + 0.5f);
+
+ constexpr uint8_t adc_low_saturation = 254;
+ constexpr uint8_t adc_high_saturation = 255;
+ constexpr int charge_low_saturation = 253;
+ constexpr int charge_high_saturation = 1022;
+ if (adc_j < adc_low_saturation) {
+ adc_j =
+ (charge > charge_high_saturation ? adc_high_saturation
+ : (charge > charge_low_saturation ? adc_low_saturation : charge));
+ }
+ clusterADCs[j * nSeedStripsNC + i] = adc_j;
+
+ adcSum += static_cast(adc_j);
+ sumx += j * adc_j;
+ suma += adc_j;
+ j++;
+ }
+ } // loop over cluster strips
+ charge[i] = adcSum;
+ const auto chan = channels[left];
+ const fedId_t fed = chanlocs->fedID(chan);
+ const fedCh_t channel = chanlocs->fedCh(chan);
+ trueCluster[i] = (adcSum * conditions->invthick(fed, channel)) > minGoodCharge;
+ const auto bary_i = static_cast(sumx) / static_cast(suma);
+ barycenter[i] = static_cast(stripId[left] & stripIndexMask) + bary_i + 0.5f;
+ clusterSize[i] = j;
+ } // not a duplicate cluster
+ } // trueCluster[i] is true
+ } // i < nSeedStripsNC
+}
+
+namespace stripgpu {
+ void SiStripRawToClusterGPUKernel::unpackChannelsGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
+ constexpr int nthreads = 128;
+ const auto channels = chanlocsGPU_->size();
+ const auto nblocks = (channels + nthreads - 1) / nthreads;
+
+ unpackChannels<<>>(chanlocsGPU_->channelLocsView(),
+ conditions,
+ stripdata_->alldataGPU_.get(),
+ stripdata_->channelGPU_.get(),
+ stripdata_->stripIdGPU_.get());
+ }
+
+ void SiStripRawToClusterGPUKernel::allocateSSTDataGPU(int max_strips, cudaStream_t stream) {
+ stripdata_->seedStripsMask_ = cms::cuda::make_device_unique(2 * max_strips, stream);
+ stripdata_->prefixSeedStripsNCMask_ = cms::cuda::make_device_unique(2 * max_strips, stream);
+
+ sst_data_d_->chanlocs = chanlocsGPU_->channelLocsView();
+ sst_data_d_->stripId = stripdata_->stripIdGPU_.get();
+ sst_data_d_->channel = stripdata_->channelGPU_.get();
+ sst_data_d_->adc = stripdata_->alldataGPU_.get();
+ sst_data_d_->seedStripsMask = stripdata_->seedStripsMask_.get();
+ sst_data_d_->prefixSeedStripsNCMask = stripdata_->prefixSeedStripsNCMask_.get();
+
+ sst_data_d_->seedStripsNCMask = sst_data_d_->seedStripsMask + max_strips;
+ sst_data_d_->seedStripsNCIndex = sst_data_d_->prefixSeedStripsNCMask + max_strips;
+
+ sst_data_d_->channelThreshold = channelThreshold_;
+ sst_data_d_->seedThreshold = seedThreshold_;
+ sst_data_d_->clusterThresholdSquared = clusterThresholdSquared_;
+ sst_data_d_->maxSequentialHoles = maxSequentialHoles_;
+ sst_data_d_->maxSequentialBad = maxSequentialBad_;
+ sst_data_d_->maxAdjacentBad = maxAdjacentBad_;
+ sst_data_d_->minGoodCharge = minGoodCharge_;
+ sst_data_d_->clusterSizeLimit = maxClusterSize_;
+
+ pt_sst_data_d_ = cms::cuda::make_device_unique(stream);
+ cms::cuda::copyAsync(pt_sst_data_d_, sst_data_d_, stream);
+#ifdef GPU_CHECK
+ cudaCheck(cudaStreamSynchronize(stream));
+#endif
+ }
+
+ void SiStripRawToClusterGPUKernel::findClusterGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
+ const int nthreads = 128;
+ const int nStrips = sst_data_d_->nStrips;
+ const int nSeeds = std::min(kMaxSeedStrips, nStrips);
+ const int nblocks = (nSeeds + nthreads - 1) / nthreads;
+
+#ifdef GPU_DEBUG
+ auto cpu_index = cms::cuda::make_host_unique(nStrips, stream);
+ auto cpu_strip = cms::cuda::make_host_unique(nStrips, stream);
+ auto cpu_adc = cms::cuda::make_host_unique(nStrips, stream);
+
+ cudaCheck(cudaMemcpyAsync(
+ cpu_strip.get(), sst_data_d_->stripId, nStrips * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
+ cudaCheck(
+ cudaMemcpyAsync(cpu_adc.get(), sst_data_d_->adc, nStrips * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
+ cudaCheck(cudaMemcpyAsync(
+ cpu_index.get(), sst_data_d_->seedStripsNCIndex, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
+ cudaCheck(cudaStreamSynchronize(stream));
+
+ for (int i = 0; i < nStrips; i++) {
+ std::cout << " cpu_strip " << cpu_strip[i] << " cpu_adc " << (unsigned int)cpu_adc[i] << " cpu index "
+ << cpu_index[i] << std::endl;
+ }
+#endif
+
+ auto clust_data_d = clusters_d_.view();
+ findLeftRightBoundaryGPU<<>>(pt_sst_data_d_.get(), conditions, clust_data_d);
+ cudaCheck(cudaGetLastError());
+#ifdef GPU_CHECK
+ cudaDeviceSynchronize();
+ cudaCheck(cudaGetLastError());
+#endif
+
+ cudaCheck(cudaMemcpyAsync(clusters_d_.nClustersPtr(),
+ &(clust_data_d->nClusters_),
+ sizeof(clust_data_d->nClusters_),
+ cudaMemcpyDeviceToHost,
+ stream));
+
+ checkClusterConditionGPU<<>>(pt_sst_data_d_.get(), conditions, clust_data_d);
+ cudaCheck(cudaGetLastError());
+
+#ifdef GPU_CHECK
+ cudaDeviceSynchronize();
+ cudaCheck(cudaGetLastError());
+#endif
+
+#ifdef GPU_DEBUG
+ cudaStreamSynchronize(stream);
+ auto clust_data = std::make_unique(clusters_d_, stream);
+ cudaStreamSynchronize(stream);
+
+ const auto clusterIndexLeft = clust_data->clusterIndex().get();
+ const auto clusterSize = clust_data->clusterSize().get();
+ const auto trueCluster = clust_data->trueCluster().get();
+ const auto clusterADCs = clust_data->clusterADCs().get();
+ const auto detids = clust_data->clusterDetId().get();
+ const auto charge = clust_data->charge().get();
+
+ const auto nSeedStripsNC = clusters_d_.nClusters();
+ std::cout << "findClusterGPU nSeedStripsNC=" << nSeedStripsNC << std::endl;
+
+ for (auto i = 0U; i < nSeedStripsNC; i++) {
+ if (trueCluster[i]) {
+ int left = clusterIndexLeft[i];
+ uint32_t size = clusterSize[i];
+ const auto detid = detids[i];
+ std::cout << "i=" << i << " detId " << detid << " left " << left << " size " << size << " charge " << charge[i]
+ << ": ";
+ size = std::min(size, maxClusterSize_);
+ for (uint32_t j = 0; j < size; j++) {
+ std::cout << (unsigned int)clusterADCs[j * nSeedStripsNC + i] << " ";
+ }
+ std::cout << std::endl;
+ }
+ }
+#endif
+ }
+
+ void SiStripRawToClusterGPUKernel::setSeedStripsNCIndexGPU(const ConditionsDeviceView *conditions,
+ cudaStream_t stream) {
+#ifdef GPU_DEBUG
+ int nStrips = sst_data_d_->nStrips;
+ auto cpu_strip = cms::cuda::make_host_unique(nStrips, stream);
+ auto cpu_adc = cms::cuda::make_host_unique(nStrips, stream);
+
+ cudaCheck(cudaMemcpyAsync(
+ cpu_strip.get(), sst_data_d_->stripId, nStrips * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
+ cudaCheck(
+ cudaMemcpyAsync(cpu_adc.get(), sst_data_d_->adc, nStrips * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
+ cudaCheck(cudaStreamSynchronize(stream));
+
+ for (int i = 0; i < nStrips; i++) {
+ std::cout << " cpu_strip " << cpu_strip[i] << " cpu_adc " << (unsigned int)cpu_adc[i] << std::endl;
+ }
+#endif
+
+ int nthreads = 256;
+ int nblocks = (sst_data_d_->nStrips + nthreads - 1) / nthreads;
+
+ //mark seed strips
+ setSeedStripsGPU<<>>(pt_sst_data_d_.get(), conditions);
+ cudaCheck(cudaGetLastError());
+#ifdef GPU_CHECK
+ cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+ //mark only non-consecutive seed strips (mask out consecutive seed strips)
+ setNCSeedStripsGPU<<>>(pt_sst_data_d_.get(), conditions);
+ cudaCheck(cudaGetLastError());
+#ifdef GPU_CHECK
+ cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+ std::size_t temp_storage_bytes = 0;
+ cub::DeviceScan::ExclusiveSum(nullptr,
+ temp_storage_bytes,
+ sst_data_d_->seedStripsNCMask,
+ sst_data_d_->prefixSeedStripsNCMask,
+ sst_data_d_->nStrips,
+ stream);
+#ifdef GPU_DEBUG
+ std::cout << "temp_storage_bytes=" << temp_storage_bytes << std::endl;
+#endif
+#ifdef GPU_CHECK
+ cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+ {
+ auto d_temp_storage = cms::cuda::make_device_unique(temp_storage_bytes, stream);
+ cub::DeviceScan::ExclusiveSum(d_temp_storage.get(),
+ temp_storage_bytes,
+ sst_data_d_->seedStripsNCMask,
+ sst_data_d_->prefixSeedStripsNCMask,
+ sst_data_d_->nStrips,
+ stream);
+ }
+#ifdef GPU_CHECK
+ cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+ setStripIndexGPU<<>>(pt_sst_data_d_.get());
+ cudaCheck(cudaGetLastError());
+#ifdef GPU_CHECK
+ cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+#ifdef GPU_DEBUG
+ auto cpu_mask = cms::cuda::make_host_unique(nStrips, stream);
+ auto cpu_prefix = cms::cuda::make_host_unique(nStrips, stream);
+ auto cpu_index = cms::cuda::make_host_unique(nStrips, stream);
+
+ cudaCheck(cudaMemcpyAsync(&(sst_data_d_->nSeedStripsNC),
+ sst_data_d_->prefixSeedStripsNCMask + sst_data_d_->nStrips - 1,
+ sizeof(int),
+ cudaMemcpyDeviceToHost,
+ stream));
+ cudaCheck(cudaMemcpyAsync(
+ cpu_mask.get(), sst_data_d_->seedStripsNCMask, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
+ cudaCheck(cudaMemcpyAsync(
+ cpu_prefix.get(), sst_data_d_->prefixSeedStripsNCMask, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
+ cudaCheck(cudaMemcpyAsync(
+ cpu_index.get(), sst_data_d_->seedStripsNCIndex, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
+ cudaCheck(cudaStreamSynchronize(stream));
+
+ const int nSeedStripsNC = std::min(kMaxSeedStrips, sst_data_d_->nSeedStripsNC);
+ std::cout << "nStrips=" << nStrips << " nSeedStripsNC=" << sst_data_d_->nSeedStripsNC << std::endl;
+ for (int i = 0; i < nStrips; i++) {
+ std::cout << " i " << i << " mask " << cpu_mask[i] << " prefix " << cpu_prefix[i] << " index " << cpu_index[i]
+ << std::endl;
+ }
+#endif
+ }
+} // namespace stripgpu
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h
new file mode 100644
index 0000000000000..a9c88b340b623
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h
@@ -0,0 +1,77 @@
+#ifndef RecoLocalTracker_SiStripClusterizer_plugins_SiStripRawToClusterGPUKernel_h
+#define RecoLocalTracker_SiStripClusterizer_plugins_SiStripRawToClusterGPUKernel_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+//#include "clusterGPU.cuh"
+
+#include
+
+#include
+#include
+
+class ChannelLocs;
+class ChannelLocsGPU;
+class FEDRawData;
+
+namespace sistrip {
+ class FEDBuffer;
+}
+namespace edm {
+ class ParameterSet;
+}
+
+namespace stripgpu {
+ struct StripDataView;
+
+ class StripDataGPU {
+ public:
+ StripDataGPU(size_t size, cudaStream_t stream);
+
+ cms::cuda::device::unique_ptr alldataGPU_;
+ cms::cuda::device::unique_ptr channelGPU_;
+ cms::cuda::device::unique_ptr stripIdGPU_;
+ cms::cuda::device::unique_ptr seedStripsMask_;
+ cms::cuda::device::unique_ptr prefixSeedStripsNCMask_;
+ };
+
+ class SiStripRawToClusterGPUKernel {
+ public:
+ SiStripRawToClusterGPUKernel(const edm::ParameterSet& conf);
+ void makeAsync(const std::vector& rawdata,
+ const std::vector>& buffers,
+ const SiStripClusterizerConditionsGPU& conditions,
+ cudaStream_t stream);
+ void copyAsync(cudaStream_t stream);
+ SiStripClustersCUDADevice getResults(cudaStream_t stream);
+
+ private:
+ using ConditionsDeviceView = SiStripClusterizerConditionsGPU::Data::DeviceView;
+
+ void reset();
+ void unpackChannelsGPU(const ConditionsDeviceView* conditions, cudaStream_t stream);
+ void allocateSSTDataGPU(int max_strips, cudaStream_t stream);
+ void freeSSTDataGPU(cudaStream_t stream);
+
+ void setSeedStripsNCIndexGPU(const ConditionsDeviceView* conditions, cudaStream_t stream);
+ void findClusterGPU(const ConditionsDeviceView* conditions, cudaStream_t stream);
+
+ std::vector fedIndex_;
+ std::vector fedRawDataOffsets_;
+
+ std::unique_ptr stripdata_;
+ std::unique_ptr chanlocsGPU_;
+
+ cms::cuda::host::unique_ptr sst_data_d_;
+ cms::cuda::device::unique_ptr pt_sst_data_d_;
+
+ SiStripClustersCUDADevice clusters_d_;
+ float channelThreshold_, seedThreshold_, clusterThresholdSquared_;
+ uint8_t maxSequentialHoles_, maxSequentialBad_, maxAdjacentBad_;
+ uint32_t maxClusterSize_;
+ float minGoodCharge_;
+ };
+} // namespace stripgpu
+#endif
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.h b/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.h
new file mode 100644
index 0000000000000..785b714376e2a
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.h
@@ -0,0 +1,28 @@
+#ifndef RecoLocalTracker_SiStripClusterizer_plugins_StripDataView_h
+#define RecoLocalTracker_SiStripClusterizer_plugins_StripDataView_h
+
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+#include "FWCore/Utilities/interface/HostDeviceConstant.h"
+
+#include
+
+class ChannelLocsView;
+
+namespace stripgpu {
+ HOST_DEVICE_CONSTANT auto kMaxSeedStrips = 200000;
+
+ struct StripDataView {
+ const ChannelLocsView *chanlocs;
+ uint8_t *adc;
+ uint16_t *channel;
+ stripId_t *stripId;
+ int *seedStripsNCIndex, *seedStripsMask, *seedStripsNCMask, *prefixSeedStripsNCMask;
+ int nSeedStripsNC;
+ int nStrips;
+ float channelThreshold, seedThreshold, clusterThresholdSquared;
+ uint8_t maxSequentialHoles, maxSequentialBad, maxAdjacentBad;
+ float minGoodCharge;
+ int clusterSizeLimit;
+ };
+} // namespace stripgpu
+#endif
diff --git a/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py b/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py
index 2f0a141beb688..b187755f84727 100644
--- a/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py
+++ b/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py
@@ -11,6 +11,7 @@
MaxSequentialHoles = cms.uint32(0),
MaxSequentialBad = cms.uint32(1),
MaxAdjacentBad = cms.uint32(0),
+ MaxClusterSize = cms.uint32(768),
RemoveApvShots = cms.bool(True),
clusterChargeCut = cms.PSet(refToPSet_ = cms.string('SiStripClusterChargeCutNone')),
ConditionsLabel = cms.string("")
diff --git a/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py b/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py
index 602b63215018a..0e912f7ba5660 100644
--- a/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py
+++ b/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py
@@ -1,16 +1,56 @@
import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
# from RecoLocalTracker.SiStripClusterizer.SiStripClusterizer_RealData_cfi import *
from RecoLocalTracker.SiStripClusterizer.DefaultClusterizer_cff import *
from RecoLocalTracker.SiStripZeroSuppression.DefaultAlgorithms_cff import *
-siStripClusters = cms.EDProducer("SiStripClusterizerFromRaw",
- onDemand = cms.bool(True),
- Clusterizer = DefaultClusterizer,
- Algorithms = DefaultAlgorithms,
- DoAPVEmulatorCheck = cms.bool(False),
- HybridZeroSuppressed = cms.bool(False),
- ProductLabel = cms.InputTag('rawDataCollector')
- )
+from RecoLocalTracker.SiStripClusterizer.siStripClusterizerFromRawGPU_cfi import siStripClusterizerFromRawGPU
+from RecoLocalTracker.SiStripClusterizer.siStripClustersSOAtoHost_cfi import siStripClustersSOAtoHost
+from RecoLocalTracker.SiStripClusterizer.siStripClustersFromSOA_cfi import siStripClustersFromSOA
+from RecoLocalTracker.SiStripClusterizer.SiStripClusterizerConditionsGPUESProducer_cfi import SiStripClusterizerConditionsGPUESProducer
+_siStripClusterizerFromRaw = cms.EDProducer("SiStripClusterizerFromRaw",
+ onDemand = cms.bool(True),
+ Clusterizer = DefaultClusterizer,
+ Algorithms = DefaultAlgorithms,
+ DoAPVEmulatorCheck = cms.bool(False),
+ HybridZeroSuppressed = cms.bool(False),
+ ProductLabel = cms.InputTag('rawDataCollector'))
+
+_siStripClusterizerFromRaw.Clusterizer.MaxClusterSize = cms.uint32(16)
+
+siStripClusterizerFromRawGPU.Clusterizer = DefaultClusterizer
+
+siStripClusters = SwitchProducerCUDA(
+ cpu = _siStripClusterizerFromRaw.clone(),
+)
+
+siStripClustersTask = cms.Task(
+ siStripClusters,
+)
+
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+gpu.toModify(siStripClusters,
+ cuda = siStripClustersFromSOA,
+)
+
+siStripClustersTaskCUDA = cms.Task()
+
+gpu.toReplaceWith(siStripClustersTaskCUDA, cms.Task(
+ # conditions used *only* by the modules running on GPU
+ SiStripClusterizerConditionsGPUESProducer,
+ # reconstruct the strip clusters on the gpu
+ siStripClusterizerFromRawGPU,
+ # copy clusters from GPU to pinned host memory
+ siStripClustersSOAtoHost,
+))
+
+gpu.toReplaceWith(siStripClustersTask, cms.Task(
+ # CUDA specific
+ siStripClustersTaskCUDA,
+ # switch producer to legacy format
+ siStripClusters,
+))
diff --git a/RecoLocalTracker/SiStripClusterizer/python/customizeStripClustersFromRaw.py b/RecoLocalTracker/SiStripClusterizer/python/customizeStripClustersFromRaw.py
new file mode 100644
index 0000000000000..db7d02855713f
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/python/customizeStripClustersFromRaw.py
@@ -0,0 +1,24 @@
+import FWCore.ParameterSet.Config as cms
+
+# replace the standard SiStripClusterizer with the switch producer
+# meant primarily for testing
+def customizeStripClustersFromRaw(process):
+ if hasattr(process, 'striptrackerlocalrecoTask'):
+ process.striptrackerlocalrecoTask.remove(process.siStripClusters)
+ process.load("RecoLocalTracker.SiStripClusterizer.SiStripClusterizerOnDemand_cfi")
+ # CPU should emulate the full detector clusterizer
+ process.siStripClusters.cpu.onDemand = cms.bool(False)
+ process.striptrackerlocalrecoTask.add(process.siStripClustersTask)
+
+ return process
+
+def customizeHLTStripClustersFromRaw(process):
+ if hasattr(process, 'hltSiStripRawToClustersFacility'):
+ import RecoLocalTracker.SiStripClusterizer.SiStripClusterizerOnDemand_cfi as SiStripClusterizerOnDemand_cfi
+
+ process.load("RecoLocalTracker.SiStripClusterizer.SiStripClusterizerOnDemand_cfi")
+ process.hltSiStripRawToClustersFacility = SiStripClusterizerOnDemand_cfi.siStripClusters.clone()
+ process.HLTDoLocalStripSequence.replace(process.hltSiStripRawToClustersFacility,
+ cms.Sequence(process.hltSiStripRawToClustersFacility, process.siStripClustersTaskCUDA))
+
+ return process
diff --git a/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc b/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc
index b8e38e97d4379..034283a910054 100644
--- a/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc
+++ b/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc
@@ -1,6 +1,7 @@
#include "RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithm.h"
#include "RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h"
#include "RecoLocalTracker/SiStripClusterizer/interface/ClusterChargeCut.h"
@@ -10,17 +11,19 @@ std::unique_ptr StripClusterizerAlgorithmFactory::cre
std::string algorithm = conf.getParameter("Algorithm");
if (algorithm == "ThreeThresholdAlgorithm") {
- return std::unique_ptr(
- new ThreeThresholdAlgorithm(iC.esConsumes(
- edm::ESInputTag{"", conf.getParameter("ConditionsLabel")}),
- conf.getParameter("ChannelThreshold"),
- conf.getParameter("SeedThreshold"),
- conf.getParameter