diff --git a/CUDADataFormats/SiStripCluster/BuildFile.xml b/CUDADataFormats/SiStripCluster/BuildFile.xml
new file mode 100644
index 0000000000000..5e401d215c4eb
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/BuildFile.xml
@@ -0,0 +1,10 @@
+<use name="DataFormats/Common"/>
+<use name="CUDADataFormats/Common"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="cuda"/>
+<use name="rootcore"/>
+
+<export>
+    <lib name="1"/>
+</export>
+
diff --git a/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h b/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
new file mode 100644
index 0000000000000..f64b8a533d513
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
@@ -0,0 +1,59 @@
+#ifndef CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h
+#define CUDADataFormats_SiStripCluster_interface_SiStripClustersCUDA_h
+
+#include "DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+
+#include <cuda_runtime.h>
+
+// Cluster SoA in device memory, plus a device-resident DeviceView that bundles the raw column pointers.
+class SiStripClustersCUDADevice : public SiStripClustersSOABase<cms::cuda::device::unique_ptr> {
+public:
+  SiStripClustersCUDADevice() = default;
+  explicit SiStripClustersCUDADevice(uint32_t maxClusters, uint32_t maxStripsPerCluster, cudaStream_t stream);
+  ~SiStripClustersCUDADevice() override = default;
+  SiStripClustersCUDADevice(const SiStripClustersCUDADevice &) = delete;  // move-only
+  SiStripClustersCUDADevice &operator=(const SiStripClustersCUDADevice &) = delete;
+  SiStripClustersCUDADevice(SiStripClustersCUDADevice &&) = default;
+  SiStripClustersCUDADevice &operator=(SiStripClustersCUDADevice &&) = default;
+
+  struct DeviceView {  // non-owning pointers to the columns, plus the two counters, for use on the device
+    uint32_t *clusterIndex_;
+    uint32_t *clusterSize_;
+    uint8_t *clusterADCs_;
+    stripgpu::detId_t *clusterDetId_;
+    stripgpu::stripId_t *firstStrip_;
+    bool *trueCluster_;
+    float *barycenter_;
+    float *charge_;
+    uint32_t nClusters_;
+    uint32_t maxClusterSize_;
+  };
+
+  DeviceView *view() const { return view_d.get(); }
+  uint32_t nClusters() const { return nClusters_; }
+  uint32_t *nClustersPtr() { return &nClusters_; }  // writable address of the host-side counter below
+  uint32_t maxClusterSize() const { return maxClusterSize_; }
+  uint32_t *maxClusterSizePtr() { return &maxClusterSize_; }
+
+private:
+  cms::cuda::device::unique_ptr<DeviceView> view_d;  // "me" pointer
+  uint32_t nClusters_ = 0;  // NOTE: hides the base-class member that setNClusters() writes; zero until filled
+  uint32_t maxClusterSize_ = 0;  // NOTE: hides the base-class member that setMaxClusterSize() writes
+};
+
+// Host-memory counterpart of SiStripClustersCUDADevice; its converting constructor copies every column over.
+class SiStripClustersCUDAHost : public SiStripClustersSOABase<cms::cuda::host::unique_ptr> {
+public:
+  SiStripClustersCUDAHost() = default;
+  ~SiStripClustersCUDAHost() override = default;
+  explicit SiStripClustersCUDAHost(const SiStripClustersCUDADevice &clusters_d, cudaStream_t stream);
+  SiStripClustersCUDAHost(SiStripClustersCUDAHost &&) = default;  // movable ...
+  SiStripClustersCUDAHost &operator=(SiStripClustersCUDAHost &&) = default;
+  SiStripClustersCUDAHost(const SiStripClustersCUDAHost &) = delete;  // ... but not copyable
+  SiStripClustersCUDAHost &operator=(const SiStripClustersCUDAHost &) = delete;
+};
+
+#endif
diff --git a/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc b/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
new file mode 100644
index 0000000000000..220456760476a
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
@@ -0,0 +1,59 @@
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+// Allocates the device columns for up to maxClusters clusters and uploads a DeviceView that points at them.
+SiStripClustersCUDADevice::SiStripClustersCUDADevice(uint32_t maxClusters,
+                                                     uint32_t maxStripsPerCluster,
+                                                     cudaStream_t stream) {
+  nClusters_ = 0;  // nothing has been filled yet
+  maxClusterSize_ = maxStripsPerCluster;
+  clusterIndex_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
+  clusterSize_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
+  // Widen before multiplying so that a large maxClusters * maxStripsPerCluster cannot wrap around in 32 bits.
+  clusterADCs_ = cms::cuda::make_device_unique<uint8_t[]>(static_cast<size_t>(maxClusters) * maxStripsPerCluster, stream);
+  clusterDetId_ = cms::cuda::make_device_unique<stripgpu::detId_t[]>(maxClusters, stream);
+  firstStrip_ = cms::cuda::make_device_unique<stripgpu::stripId_t[]>(maxClusters, stream);
+  trueCluster_ = cms::cuda::make_device_unique<bool[]>(maxClusters, stream);
+  barycenter_ = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
+  charge_ = cms::cuda::make_device_unique<float[]>(maxClusters, stream);
+  auto view = cms::cuda::make_host_unique<DeviceView>(stream);  // host staging copy of the view
+  view->clusterIndex_ = clusterIndex_.get();
+  view->clusterSize_ = clusterSize_.get();
+  view->clusterADCs_ = clusterADCs_.get();
+  view->clusterDetId_ = clusterDetId_.get();
+  view->firstStrip_ = firstStrip_.get();
+  view->trueCluster_ = trueCluster_.get();
+  view->barycenter_ = barycenter_.get();
+  view->charge_ = charge_.get();
+  view->nClusters_ = 0;  // give the device copy a defined starting value instead of uninitialised host bytes
+  view->maxClusterSize_ = maxStripsPerCluster;
+  view_d = cms::cuda::make_device_unique<DeviceView>(stream);
+  cms::cuda::copyAsync(view_d, view, stream);  // NOTE(review): `view` is released at scope exit; confirm allocator makes this safe
+#ifdef GPU_CHECK
+  cudaCheck(cudaStreamSynchronize(stream));
+#endif
+}
+
+// Copies every column of clusters_d into freshly allocated host buffers; the copies are queued on `stream`.
+SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice& clusters_d, cudaStream_t stream) {
+  nClusters_ = clusters_d.nClusters();  // read on the host right here, so the counter must already be valid
+  maxClusterSize_ = clusters_d.maxClusterSize();
+  clusterIndex_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
+  clusterSize_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
+  // The ADC block is nClusters_ x maxClusterSize_; widen before multiplying so the product cannot wrap in 32 bits.
+  clusterADCs_ = cms::cuda::make_host_unique<uint8_t[]>(static_cast<size_t>(nClusters_) * maxClusterSize_, stream);
+  clusterDetId_ = cms::cuda::make_host_unique<stripgpu::detId_t[]>(nClusters_, stream);
+  firstStrip_ = cms::cuda::make_host_unique<stripgpu::stripId_t[]>(nClusters_, stream);
+  trueCluster_ = cms::cuda::make_host_unique<bool[]>(nClusters_, stream);
+  barycenter_ = cms::cuda::make_host_unique<float[]>(nClusters_, stream);
+  charge_ = cms::cuda::make_host_unique<float[]>(nClusters_, stream);
+  cms::cuda::copyAsync(clusterIndex_, clusters_d.clusterIndex(), nClusters_, stream);
+  cms::cuda::copyAsync(clusterSize_, clusters_d.clusterSize(), nClusters_, stream);
+  cms::cuda::copyAsync(clusterADCs_, clusters_d.clusterADCs(), static_cast<size_t>(nClusters_) * maxClusterSize_, stream);
+  cms::cuda::copyAsync(clusterDetId_, clusters_d.clusterDetId(), nClusters_, stream);
+  cms::cuda::copyAsync(firstStrip_, clusters_d.firstStrip(), nClusters_, stream);
+  cms::cuda::copyAsync(trueCluster_, clusters_d.trueCluster(), nClusters_, stream);
+  cms::cuda::copyAsync(barycenter_, clusters_d.barycenter(), nClusters_, stream);
+  cms::cuda::copyAsync(charge_, clusters_d.charge(), nClusters_, stream);
+#ifdef GPU_CHECK
+  cudaCheck(cudaStreamSynchronize(stream));
+#endif
+}
diff --git a/CUDADataFormats/SiStripCluster/src/classes.h b/CUDADataFormats/SiStripCluster/src/classes.h
new file mode 100644
index 0000000000000..b38f397dee067
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats_SiStripCluster_classes_h
+#define CUDADataFormats_SiStripCluster_classes_h
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif
diff --git a/CUDADataFormats/SiStripCluster/src/classes_def.xml b/CUDADataFormats/SiStripCluster/src/classes_def.xml
new file mode 100644
index 0000000000000..3c2f3ab27c620
--- /dev/null
+++ b/CUDADataFormats/SiStripCluster/src/classes_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+  <class name="cms::cuda::Product<SiStripClustersCUDADevice>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<SiStripClustersCUDADevice>>" persistent="false"/>
+  <class name="SiStripClustersCUDAHost" persistent="false"/>
+  <class name="edm::Wrapper<SiStripClustersCUDAHost>" persistent="false"/>
+</lcgdict>
diff --git a/CalibFormats/SiStripObjects/BuildFile.xml b/CalibFormats/SiStripObjects/BuildFile.xml
index 325f0aa1bcb9f..83c3901a34f13 100644
--- a/CalibFormats/SiStripObjects/BuildFile.xml
+++ b/CalibFormats/SiStripObjects/BuildFile.xml
@@ -2,6 +2,9 @@
 <use name="DataFormats/SiStripCommon"/>
 <use name="CondFormats/SiStripObjects"/>
 <use name="DataFormats/TrackerCommon"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="cuda"/>
 <export>
   <lib name="1"/>
 </export>
diff --git a/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h b/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h
new file mode 100644
index 0000000000000..94f0080f88019
--- /dev/null
+++ b/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h
@@ -0,0 +1,137 @@
+#ifndef CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
+#define CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
+
+#include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
+#include "DataFormats/SiStripCommon/interface/ConstantsForHardwareSystems.h"
+
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
+class SiStripQuality;
+class SiStripGain;
+class SiStripNoises;
+
+namespace stripgpu {
+  __host__ __device__ inline fedId_t fedIndex(fedId_t fed) { return static_cast<fedId_t>(fed - sistrip::FED_ID_MIN); }
+  __host__ __device__ inline std::uint32_t stripIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+    const auto channelBase = (fedIndex(fed) * sistrip::FEDCH_PER_FED + channel) * sistrip::STRIPS_PER_FEDCH;
+    return channelBase + strip % sistrip::STRIPS_PER_FEDCH;  // strip is reduced to its position within the channel
+  }
+  __host__ __device__ inline std::uint32_t apvIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+    const auto fedBase = fedIndex(fed) * sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED;  // first APV slot of this FED
+    return fedBase + sistrip::APVS_PER_CHAN * channel + (strip % sistrip::STRIPS_PER_FEDCH) / sistrip::STRIPS_PER_APV;
+  }
+  __host__ __device__ inline std::uint32_t channelIndex(fedId_t fed, fedCh_t channel) {
+    return channel + sistrip::FEDCH_PER_FED * fedIndex(fed);  // flat [fed][channel] slot
+  }
+
+  class SiStripClusterizerConditionsGPU {  // host-side strip conditions tables plus a GPU copy via getGPUProductAsync()
+  public:
+    class DetToFed {  // one cabling entry: pairs (detector id, APV pair) with (FED id, FED channel)
+    public:
+      DetToFed(detId_t detid, apvPair_t ipair, fedId_t fedid, fedCh_t fedch)
+          : detid_(detid), ipair_(ipair), fedid_(fedid), fedch_(fedch) {}
+      detId_t detID() const { return detid_; }
+      apvPair_t pair() const { return ipair_; }
+      fedId_t fedID() const { return fedid_; }
+      fedCh_t fedCh() const { return fedch_; }
+
+    private:
+      detId_t detid_;
+      apvPair_t ipair_;
+      fedId_t fedid_;
+      fedCh_t fedch_;
+    };
+    using DetToFeds = std::vector<DetToFed>;
+
+    static constexpr std::uint16_t badBit = 1 << 15;  // top bit of a stored noise word flags a bad strip
+
+    class Data {  // GPU payload: owning device buffers plus the DeviceView that points into them
+    public:
+      struct DeviceView {  // raw pointers for device code; lookups are keyed by (FED id, FED channel[, strip])
+        __device__ inline detId_t detID(fedId_t fed, fedCh_t channel) const {
+          return detID_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline apvPair_t iPair(fedId_t fed, fedCh_t channel) const {
+          return iPair_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline float invthick(fedId_t fed, fedCh_t channel) const {
+          return invthick_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline float noise(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          // noise is stored as 9 bits with a fixed point scale factor of 0.1
+          return 0.1f * (noise_[stripIndex(fed, channel, strip)] & ~badBit);  // the bad-strip flag is masked off first
+        }
+
+        __device__ inline float gain(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return gain_[apvIndex(fed, channel, strip)];  // one gain value per APV, shared by all of its strips
+        }
+
+        __device__ inline bool bad(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return badBit == (noise_[stripIndex(fed, channel, strip)] & badBit);  // true when the flag bit is set
+        }
+        const std::uint16_t* noise_;  // [sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH]
+        const float* invthick_;       // [sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED]
+        const detId_t* detID_;        // [sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED]
+        const apvPair_t* iPair_;      // [sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED]
+        const float* gain_;           // [sistrip::NUMBER_OF_FEDS * sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED]
+      };
+
+      const DeviceView* deviceView() const { return deviceView_.get(); }  // pointer to the view held in deviceView_
+
+      cms::cuda::device::unique_ptr<DeviceView> deviceView_;  // device copy of the view
+      cms::cuda::host::unique_ptr<DeviceView> hostView_;      // host copy that the device view is uploaded from
+
+      cms::cuda::device::unique_ptr<std::uint16_t[]>
+          noise_;  // [sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH]
+      cms::cuda::device::unique_ptr<float[]> invthick_;   // [sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED]
+      cms::cuda::device::unique_ptr<detId_t[]> detID_;    // [sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED]
+      cms::cuda::device::unique_ptr<apvPair_t[]> iPair_;  // [sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED]
+      cms::cuda::device::unique_ptr<float[]>
+          gain_;  // [sistrip::NUMBER_OF_FEDS * sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED]
+    };
+
+    SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
+                                    const SiStripGain* gains,
+                                    const SiStripNoises& noises);
+    ~SiStripClusterizerConditionsGPU() = default;
+
+    // Function to return the actual payload on the memory of the current device
+    Data const& getGPUProductAsync(cudaStream_t stream) const;
+
+    const DetToFeds& detToFeds() const { return detToFeds_; }  // entries are sorted by (detID, pair) in the constructor
+
+  private:
+    void setStrip(fedId_t fed, fedCh_t channel, stripId_t strip, std::uint16_t noise, float gain, bool bad) {
+      gain_[apvIndex(fed, channel, strip)] = gain;  // every strip of an APV writes the same per-APV slot
+      noise_[stripIndex(fed, channel, strip)] = noise;
+      if (bad) {
+        noise_[stripIndex(fed, channel, strip)] |= badBit;  // the quality flag is folded into the noise word
+      }
+    }
+
+    void setInvThickness(fedId_t fed, fedCh_t channel, float invthick) {
+      invthick_[channelIndex(fed, channel)] = invthick;  // one value per (FED, channel)
+    }
+
+    // Holds the data in pinned CPU memory
+    std::vector<std::uint16_t, cms::cuda::HostAllocator<std::uint16_t>> noise_;
+    std::vector<float, cms::cuda::HostAllocator<float>> invthick_;
+    std::vector<detId_t, cms::cuda::HostAllocator<detId_t>> detID_;
+    std::vector<apvPair_t, cms::cuda::HostAllocator<apvPair_t>> iPair_;
+    std::vector<float, cms::cuda::HostAllocator<float>> gain_;
+
+    // Helper that takes care of complexity of transferring the data to
+    // multiple devices
+    cms::cuda::ESProduct<Data> gpuData_;
+    DetToFeds detToFeds_;
+  };
+}  // namespace stripgpu
+
+#endif
diff --git a/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc b/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
index ddf7a0ffb914f..05530484f14c4 100644
--- a/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
+++ b/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
@@ -23,3 +23,6 @@ TYPELOOKUP_DATA_REG(SiStripQuality);
 
 #include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditions.h"
 TYPELOOKUP_DATA_REG(SiStripClusterizerConditions);
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+TYPELOOKUP_DATA_REG(stripgpu::SiStripClusterizerConditionsGPU);
diff --git a/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc b/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc
new file mode 100644
index 0000000000000..33d0889ff5550
--- /dev/null
+++ b/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc
@@ -0,0 +1,100 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+#include "CondFormats/SiStripObjects/interface/SiStripNoises.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripGain.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripDetCabling.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripQuality.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+
+#include "DataFormats/SiStripCluster/interface/SiStripClusterTools.h"
+
+namespace stripgpu {
+  // Fills the host-side tables (keyed by FED id / FED channel / strip) from the cabling, noise, gain and quality.
+  SiStripClusterizerConditionsGPU::SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
+                                                                   const SiStripGain* gains,
+                                                                   const SiStripNoises& noises)
+      : noise_(sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED * sistrip::STRIPS_PER_FEDCH),
+        invthick_(sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED),
+        detID_(sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED),
+        iPair_(sistrip::NUMBER_OF_FEDS * sistrip::FEDCH_PER_FED),
+        gain_(sistrip::NUMBER_OF_FEDS * sistrip::APVS_PER_FEDCH * sistrip::FEDCH_PER_FED) {
+    // connected: map<DetID, std::vector<int>>
+    // map of KEY=detid DATA=vector of apvs, maximum 6 APVs per detector module :
+    const auto& connected = quality.cabling()->connected();
+    // detCabling: map<DetID, std::vector<const FedChannelConnection *>
+    // map of KEY=detid DATA=vector<FedChannelConnection>
+    const auto& detCabling = quality.cabling()->getDetCabling();
+
+    for (const auto& conn : connected) {
+      const auto det = conn.first;
+      if (!quality.IsModuleBad(det)) {
+        const auto detConn_it = detCabling.find(det);
+
+        if (detCabling.end() != detConn_it) {
+          for (const auto& chan : (*detConn_it).second) {
+            if (chan && chan->fedId() && chan->isConnected()) {
+              const auto detID = chan->detId();
+              const auto fedID = chan->fedId();
+              const auto fedCh = chan->fedCh();
+              const auto iPair = chan->apvPairNumber();
+
+              detToFeds_.emplace_back(detID, iPair, fedID, fedCh);
+
+              detID_[channelIndex(fedID, fedCh)] = detID;
+              iPair_[channelIndex(fedID, fedCh)] = iPair;
+              setInvThickness(fedID, fedCh, siStripClusterTools::sensorThicknessInverse(detID));
+
+              // The per-module ranges do not depend on the strip: look them up once per channel, not once per strip.
+              const auto gainRange = gains->getRange(det);  // gains is dereferenced unconditionally: must be non-null
+              const auto noiseRange = noises.getRange(det);
+              const auto qualityRange = quality.getRange(det);
+              const auto offset = sistrip::STRIPS_PER_FEDCH * iPair;  // first module strip of this APV pair
+              for (auto strip = 0; strip < sistrip::STRIPS_PER_FEDCH; ++strip) {
+                const auto detstrip = strip + offset;
+                const std::uint16_t noise = SiStripNoises::getRawNoise(detstrip, noiseRange);
+                const auto gain = SiStripGain::getStripGain(detstrip, gainRange);
+                const auto bad = quality.IsStripBad(qualityRange, detstrip);
+                // gain is actually stored per-APV, not per-strip
+                setStrip(fedID, fedCh, detstrip, noise, gain, bad);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    std::sort(detToFeds_.begin(), detToFeds_.end(), [](const DetToFed& a, const DetToFed& b) {
+      return a.detID() < b.detID() || (a.detID() == b.detID() && a.pair() < b.pair());
+    });
+  }
+
+  // Returns gpuData_'s payload for the current device; the lambda allocates the device arrays and queues the uploads.
+  SiStripClusterizerConditionsGPU::Data const& SiStripClusterizerConditionsGPU::getGPUProductAsync(
+      cudaStream_t stream) const {
+    return gpuData_.dataForCurrentDeviceAsync(stream, [this](Data& payload, cudaStream_t queue) {
+      payload.noise_ = cms::cuda::make_device_unique<std::uint16_t[]>(noise_.size(), queue);
+      payload.invthick_ = cms::cuda::make_device_unique<float[]>(invthick_.size(), queue);
+      payload.detID_ = cms::cuda::make_device_unique<detId_t[]>(detID_.size(), queue);
+      payload.iPair_ = cms::cuda::make_device_unique<apvPair_t[]>(iPair_.size(), queue);
+      payload.gain_ = cms::cuda::make_device_unique<float[]>(gain_.size(), queue);
+
+      cms::cuda::copyAsync(payload.noise_, noise_, queue);
+      cms::cuda::copyAsync(payload.invthick_, invthick_, queue);
+      cms::cuda::copyAsync(payload.detID_, detID_, queue);
+      cms::cuda::copyAsync(payload.iPair_, iPair_, queue);
+      cms::cuda::copyAsync(payload.gain_, gain_, queue);
+
+      // The host-side view is stored in the payload as well, next to the device copy made from it.
+      payload.hostView_ = cms::cuda::make_host_unique<Data::DeviceView>(queue);
+      payload.hostView_->noise_ = payload.noise_.get();
+      payload.hostView_->invthick_ = payload.invthick_.get();
+      payload.hostView_->detID_ = payload.detID_.get();
+      payload.hostView_->iPair_ = payload.iPair_.get();
+      payload.hostView_->gain_ = payload.gain_.get();
+
+      payload.deviceView_ = cms::cuda::make_device_unique<Data::DeviceView>(queue);
+      cms::cuda::copyAsync(payload.deviceView_, payload.hostView_, queue);
+    });
+  }
+}  // namespace stripgpu
diff --git a/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h b/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h
new file mode 100644
index 0000000000000..e6c262d456289
--- /dev/null
+++ b/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h
@@ -0,0 +1,27 @@
+#ifndef DataFormats_SiStripCluster_interface_SiStripClustersSOA_h
+#define DataFormats_SiStripCluster_interface_SiStripClustersSOA_h
+
+#include "DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h"
+
+#include <memory>
+
+namespace detail {
+  namespace impl {  // one-parameter alias so std::unique_ptr matches a `template <typename> class` parameter
+    template <typename Element>
+    using unique_ptr_default_deleter = std::unique_ptr<Element>;
+  }  // namespace impl
+}  // namespace detail
+
+// Cluster SoA whose columns are plain std::unique_ptr arrays (via detail::impl::unique_ptr_default_deleter).
+class SiStripClustersSOA : public SiStripClustersSOABase<detail::impl::unique_ptr_default_deleter> {
+public:
+  SiStripClustersSOA() = default;
+  ~SiStripClustersSOA() override = default;
+  explicit SiStripClustersSOA(uint32_t maxClusters, uint32_t maxStripsPerCluster);
+  SiStripClustersSOA(SiStripClustersSOA &&) = default;  // movable ...
+  SiStripClustersSOA &operator=(SiStripClustersSOA &&) = default;
+  SiStripClustersSOA(const SiStripClustersSOA &) = delete;  // ... but not copyable
+  SiStripClustersSOA &operator=(const SiStripClustersSOA &) = delete;
+};
+
+#endif
diff --git a/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h b/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h
new file mode 100644
index 0000000000000..036ab7c3dd3e5
--- /dev/null
+++ b/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h
@@ -0,0 +1,59 @@
+#ifndef DataFormats_SiStripCluster_interface_SiStripClustersSOABase_
+#define DataFormats_SiStripCluster_interface_SiStripClustersSOABase_
+
+#include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
+
+#include <cstdint>
+#include <limits>
+
+// Structure-of-arrays storage for strip clusters: one array per cluster attribute plus two counters.
+// T is the owning-pointer template (a unique_ptr flavour) used for every column.
+template <template <typename> class T>
+class SiStripClustersSOABase {
+public:
+  SiStripClustersSOABase() = default;  // columns stay empty and both counters are zero
+  // The columns are not allocated here; derived classes allocate them in their own constructors.
+  virtual ~SiStripClustersSOABase() = default;
+
+  SiStripClustersSOABase(const SiStripClustersSOABase&) = delete;
+  SiStripClustersSOABase& operator=(const SiStripClustersSOABase&) = delete;
+  SiStripClustersSOABase(SiStripClustersSOABase&&) = default;
+  SiStripClustersSOABase& operator=(SiStripClustersSOABase&&) = default;
+
+  void setNClusters(uint32_t nClusters) { nClusters_ = nClusters; }
+  uint32_t nClusters() const { return nClusters_; }
+
+  void setMaxClusterSize(uint32_t maxClusterSize) { maxClusterSize_ = maxClusterSize; }
+  uint32_t maxClusterSize() const { return maxClusterSize_; }
+
+  const auto& clusterIndex() const { return clusterIndex_; }
+  const auto& clusterSize() const { return clusterSize_; }
+  const auto& clusterADCs() const { return clusterADCs_; }
+  const auto& clusterDetId() const { return clusterDetId_; }
+  const auto& firstStrip() const { return firstStrip_; }
+  const auto& trueCluster() const { return trueCluster_; }
+  const auto& barycenter() const { return barycenter_; }
+  const auto& charge() const { return charge_; }
+
+  auto& clusterIndex() { return clusterIndex_; }
+  auto& clusterSize() { return clusterSize_; }
+  auto& clusterADCs() { return clusterADCs_; }
+  auto& clusterDetId() { return clusterDetId_; }
+  auto& firstStrip() { return firstStrip_; }
+  auto& trueCluster() { return trueCluster_; }
+  auto& barycenter() { return barycenter_; }
+  auto& charge() { return charge_; }
+
+protected:
+  T<uint32_t[]> clusterIndex_;
+  T<uint32_t[]> clusterSize_;
+  T<uint8_t[]> clusterADCs_;
+  T<stripgpu::detId_t[]> clusterDetId_;
+  T<stripgpu::stripId_t[]> firstStrip_;
+  T<bool[]> trueCluster_;
+  T<float[]> barycenter_;
+  T<float[]> charge_;
+  uint32_t nClusters_ = 0;       // zero-initialised so nClusters() is defined before setNClusters() is called
+  uint32_t maxClusterSize_ = 0;  // zero-initialised for the same reason
+};
+#endif
diff --git a/DataFormats/SiStripCluster/interface/SiStripTypes.h b/DataFormats/SiStripCluster/interface/SiStripTypes.h
new file mode 100644
index 0000000000000..0b13041f47b1c
--- /dev/null
+++ b/DataFormats/SiStripCluster/interface/SiStripTypes.h
@@ -0,0 +1,19 @@
+#ifndef DataFormats_SiStripCluster_interface_SiStripTypes_h
+#define DataFormats_SiStripCluster_interface_SiStripTypes_h
+
+#include <cstdint>
+#include <limits>
+
+namespace stripgpu {
+  using detId_t = std::uint32_t;    // detector id
+  using fedId_t = std::uint16_t;    // FED id
+  using fedCh_t = std::uint8_t;     // FED channel number
+  using apvPair_t = std::uint16_t;  // APV-pair number
+  using stripId_t = std::uint16_t;  // strip number
+  // Sentinels: each is the largest value its id type can hold (presumably never a real id -- TODO confirm).
+  static constexpr detId_t invalidDet = std::numeric_limits<detId_t>::max();
+  static constexpr fedId_t invalidFed = std::numeric_limits<fedId_t>::max();
+  static constexpr stripId_t invalidStrip = std::numeric_limits<stripId_t>::max();
+}  // namespace stripgpu
+
+#endif
diff --git a/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc b/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc
new file mode 100644
index 0000000000000..d43d8509b5920
--- /dev/null
+++ b/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc
@@ -0,0 +1,13 @@
+#include "DataFormats/SiStripCluster/interface/SiStripClustersSOA.h"
+
+SiStripClustersSOA::SiStripClustersSOA(uint32_t maxClusters, uint32_t maxStripsPerCluster) {  // allocates every column
+  clusterIndex_ = std::make_unique<uint32_t[]>(maxClusters);
+  clusterSize_ = std::make_unique<uint32_t[]>(maxClusters);
+  clusterADCs_ = std::make_unique<uint8_t[]>(static_cast<size_t>(maxClusters) * maxStripsPerCluster);  // widened: no 32-bit wrap
+  clusterDetId_ = std::make_unique<stripgpu::detId_t[]>(maxClusters);
+  firstStrip_ = std::make_unique<stripgpu::stripId_t[]>(maxClusters);
+  trueCluster_ = std::make_unique<bool[]>(maxClusters);
+  barycenter_ = std::make_unique<float[]>(maxClusters);
+  charge_ = std::make_unique<float[]>(maxClusters);
+  maxClusterSize_ = maxStripsPerCluster;  // nClusters_ is not set here; callers use setNClusters()
+}
diff --git a/DataFormats/SiStripCluster/src/classes.h b/DataFormats/SiStripCluster/src/classes.h
index 57f45748f7f2e..00819e76d08a4 100644
--- a/DataFormats/SiStripCluster/src/classes.h
+++ b/DataFormats/SiStripCluster/src/classes.h
@@ -5,6 +5,7 @@
 #include "DataFormats/Common/interface/DetSetVectorNew.h"
 #include "DataFormats/Common/interface/Ref.h"
 #include "DataFormats/SiStripCluster/interface/SiStripCluster.h"
+#include "DataFormats/SiStripCluster/interface/SiStripClustersSOA.h"
 #include "DataFormats/SiStripCluster/interface/SiStripApproximateCluster.h"
 #include "DataFormats/Common/interface/ContainerMask.h"
 
diff --git a/DataFormats/SiStripCluster/src/classes_def.xml b/DataFormats/SiStripCluster/src/classes_def.xml
index ee5d79f1fd266..8c8c0a49d911a 100755
--- a/DataFormats/SiStripCluster/src/classes_def.xml
+++ b/DataFormats/SiStripCluster/src/classes_def.xml
@@ -46,4 +46,9 @@
  <class name="edmNew::DetSetVector<edm::Ref<edmNew::DetSetVector<SiStripApproximateCluster>,SiStripApproximateCluster,edmNew::DetSetVector<SiStripApproximateCluster>::FindForDetSetVector> >" />
  <class name="edm::Wrapper<edmNew::DetSetVector<edm::Ref<edmNew::DetSetVector<SiStripApproximateCluster>,SiStripApproximateCluster,edmNew::DetSetVector<SiStripApproximateCluster>::FindForDetSetVector> > >" />
 
+ <class name="SiStripClustersSOA" ClassVersion="3">
+  <version ClassVersion="3" checksum="2739562998"/>
+ </class>
+ <class name="edm::Wrapper<SiStripClustersSOA>"/>
+
 </lcgdict>
diff --git a/RecoLocalTracker/SiStripClusterizer/BuildFile.xml b/RecoLocalTracker/SiStripClusterizer/BuildFile.xml
index 205bdbd9de298..e7fdcd6cbf34a 100644
--- a/RecoLocalTracker/SiStripClusterizer/BuildFile.xml
+++ b/RecoLocalTracker/SiStripClusterizer/BuildFile.xml
@@ -9,7 +9,6 @@
 <use name="CalibFormats/SiStripObjects"/>
 <use name="CalibTracker/Records"/>
 <use name="RecoLocalTracker/Records"/>
-<use name="root"/>
 <export>
   <lib name="1"/>
 </export>
diff --git a/RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h b/RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h
index e949dc668e02f..58d399042a019 100644
--- a/RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h
+++ b/RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h
@@ -3,7 +3,8 @@
 
 namespace edm {
   class ParameterSet;
-}
+  class ParameterSetDescription;
+}  // namespace edm
 class StripClusterizerAlgorithm;
 #include <memory>
 
@@ -12,5 +13,6 @@ class StripClusterizerAlgorithm;
 class StripClusterizerAlgorithmFactory {
 public:
   static std::unique_ptr<StripClusterizerAlgorithm> create(edm::ConsumesCollector&&, const edm::ParameterSet&);
+  static void fillDescriptions(edm::ParameterSetDescription& clusterizer);
 };
 #endif
diff --git a/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h b/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h
index 6c9964dd2a43f..2c4551ef40550 100644
--- a/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h
+++ b/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h
@@ -35,6 +35,7 @@ class ThreeThresholdAlgorithm final : public StripClusterizerAlgorithm {
                           unsigned,
                           unsigned,
                           unsigned,
+                          unsigned,
                           bool removeApvShots,
                           float minGoodCharge);
 
@@ -58,6 +59,7 @@ class ThreeThresholdAlgorithm final : public StripClusterizerAlgorithm {
 
   float ChannelThreshold, SeedThreshold, ClusterThresholdSquared;
   uint8_t MaxSequentialHoles, MaxSequentialBad, MaxAdjacentBad;
+  unsigned MaxClusterSize;
   bool RemoveApvShots;
   float minGoodCharge;
 };
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiStripClusterizer/plugins/BuildFile.xml
index 0343f425c91f1..27ae390132063 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/BuildFile.xml
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/BuildFile.xml
@@ -1,6 +1,10 @@
-<library name="RecoLocalTrackerSiStripClusterizerPlugins" file="*.cc">
+<library name="RecoLocalTrackerSiStripClusterizerPlugins" file="*.cc *.cu">
   <use name="EventFilter/SiStripRawToDigi"/>
   <use name="RecoLocalTracker/SiStripClusterizer"/>
   <use name="RecoLocalTracker/SiStripZeroSuppression"/>
+  <use name="HeterogeneousCore/CUDACore"/>
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+  <use name="CUDADataFormats/SiStripCluster"/>
+  <use name="cuda"/>
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.cc b/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.cc
new file mode 100644
index 0000000000000..5b721778284d4
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.cc
@@ -0,0 +1,60 @@
+#include <cassert>
+
+#include <cuda_runtime.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+#include "ChannelLocsGPU.h"
+
+ChannelLocs::ChannelLocs(size_t size, cudaStream_t stream) : ChannelLocsBase(size) {
+  if (size == 0)  // empty channel list: every array member is left null
+    return;
+  input_ = cms::cuda::make_host_unique<const uint8_t*[]>(size, stream);
+  inoff_ = cms::cuda::make_host_unique<size_t[]>(size, stream);
+  offset_ = cms::cuda::make_host_unique<size_t[]>(size, stream);
+  length_ = cms::cuda::make_host_unique<uint16_t[]>(size, stream);
+  fedID_ = cms::cuda::make_host_unique<stripgpu::fedId_t[]>(size, stream);
+  fedCh_ = cms::cuda::make_host_unique<stripgpu::fedCh_t[]>(size, stream);
+  detID_ = cms::cuda::make_host_unique<stripgpu::detId_t[]>(size, stream);
+}
+
+void ChannelLocsView::fill(const ChannelLocsGPU& c) {  // capture c's size and its raw array pointers
+  size_ = c.size();
+  detID_ = c.detID();
+  fedCh_ = c.fedCh();
+  fedID_ = c.fedID();
+  length_ = c.length();
+  offset_ = c.offset();
+  inoff_ = c.inoff();
+  input_ = c.input();
+}
+
+ChannelLocsGPU::ChannelLocsGPU(size_t size, cudaStream_t stream) : ChannelLocsBase(size) {
+  if (size == 0)  // no channels: neither the arrays nor the device-side view are created
+    return;
+  input_ = cms::cuda::make_device_unique<const uint8_t*[]>(size, stream);
+  inoff_ = cms::cuda::make_device_unique<size_t[]>(size, stream);
+  offset_ = cms::cuda::make_device_unique<size_t[]>(size, stream);
+  length_ = cms::cuda::make_device_unique<uint16_t[]>(size, stream);
+  fedID_ = cms::cuda::make_device_unique<stripgpu::fedId_t[]>(size, stream);
+  fedCh_ = cms::cuda::make_device_unique<stripgpu::fedCh_t[]>(size, stream);
+  detID_ = cms::cuda::make_device_unique<stripgpu::detId_t[]>(size, stream);
+  // Record the array pointers in a host-side view, then queue its copy into the device-side view.
+  auto hostView = cms::cuda::make_host_unique<ChannelLocsView>(stream);
+  hostView->fill(*this);
+  channelLocsViewGPU_ = cms::cuda::make_device_unique<ChannelLocsView>(stream);
+  cms::cuda::copyAsync(channelLocsViewGPU_, hostView, stream);
+}
+
+void ChannelLocsGPU::setVals(const ChannelLocs* c,
+                             cms::cuda::host::unique_ptr<const uint8_t*[]> inputGPU, cudaStream_t stream) {
+  assert(c->size() == size_);
+  const auto nChannels = size_;  // element count of every copy queued below
+  cms::cuda::copyAsync(input_, inputGPU, nChannels, stream);
+  cms::cuda::copyAsync(inoff_, c->inoff_, nChannels, stream);
+  cms::cuda::copyAsync(offset_, c->offset_, nChannels, stream);
+  cms::cuda::copyAsync(length_, c->length_, nChannels, stream);
+  cms::cuda::copyAsync(fedID_, c->fedID_, nChannels, stream);
+  cms::cuda::copyAsync(fedCh_, c->fedCh_, nChannels, stream);
+  cms::cuda::copyAsync(detID_, c->detID_, nChannels, stream);
+}
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.h b/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.h
new file mode 100644
index 0000000000000..343d5a536f035
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ChannelLocsGPU.h
@@ -0,0 +1,138 @@
+#ifndef RecoLocalTracker_SiStripClusterizer_plugins_ChannelLocsGPU_h
+#define RecoLocalTracker_SiStripClusterizer_plugins_ChannelLocsGPU_h
+
+#include <memory>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
+
+class ChannelLocsGPU;
+
+template <template <typename> class T>
+class ChannelLocsBase {  // one entry per channel in each array; T<X[]> is the owning pointer type
+public:
+  ChannelLocsBase(size_t size) : size_(size) {}  // records the size only, allocates nothing
+  virtual ~ChannelLocsBase() = default;
+  // Move: take over every array and zero the source's size so it does not advertise entries it lost
+  ChannelLocsBase(ChannelLocsBase&& arg)
+      : input_(std::move(arg.input_)),
+        inoff_(std::move(arg.inoff_)),
+        offset_(std::move(arg.offset_)),
+        length_(std::move(arg.length_)),
+        fedID_(std::move(arg.fedID_)),
+        fedCh_(std::move(arg.fedCh_)),
+        detID_(std::move(arg.detID_)),
+        size_(arg.size_) { arg.size_ = 0; }
+  // Stores all fields of channel `index`; the index is not range-checked
+  void setChannelLoc(uint32_t index,
+                     const uint8_t* input,
+                     size_t inoff,
+                     size_t offset,
+                     uint16_t length,
+                     stripgpu::fedId_t fedID,
+                     stripgpu::fedCh_t fedCh,
+                     stripgpu::detId_t detID) {
+    input_[index] = input;
+    inoff_[index] = inoff;
+    offset_[index] = offset;
+    length_[index] = length;
+    fedID_[index] = fedID;
+    fedCh_[index] = fedCh;
+    detID_[index] = detID;
+  }
+
+  size_t size() const { return size_; }
+  // Value of one field for channel `index` (not range-checked)
+  const uint8_t* input(uint32_t index) const { return input_[index]; }
+  size_t inoff(uint32_t index) const { return inoff_[index]; }
+  size_t offset(uint32_t index) const { return offset_[index]; }
+  uint16_t length(uint32_t index) const { return length_[index]; }
+  stripgpu::fedId_t fedID(uint32_t index) const { return fedID_[index]; }
+  stripgpu::fedCh_t fedCh(uint32_t index) const { return fedCh_[index]; }
+  stripgpu::detId_t detID(uint32_t index) const { return detID_[index]; }
+  // Raw pointers to the underlying arrays, as returned by get() on each owning pointer
+  const uint8_t* const* input() const { return input_.get(); }
+  size_t* inoff() const { return inoff_.get(); }
+  size_t* offset() const { return offset_.get(); }
+  uint16_t* length() const { return length_.get(); }
+  stripgpu::fedId_t* fedID() const { return fedID_.get(); }
+  stripgpu::fedCh_t* fedCh() const { return fedCh_.get(); }
+  stripgpu::detId_t* detID() const { return detID_.get(); }
+
+protected:
+  T<const uint8_t*[]> input_;  // input raw data for channel
+  T<size_t[]> inoff_;          // offset in input raw data
+  T<size_t[]> offset_;         // global offset in alldata
+  T<uint16_t[]> length_;       // length of channel data
+  T<stripgpu::fedId_t[]> fedID_;
+  T<stripgpu::fedCh_t[]> fedCh_;
+  T<stripgpu::detId_t[]> detID_;
+  size_t size_ = 0;
+};
+
+class ChannelLocs : public ChannelLocsBase<cms::cuda::host::unique_ptr> {  // arrays held via host unique_ptr
+  friend class ChannelLocsGPU;  // ChannelLocsGPU::setVals reads the protected arrays directly
+
+public:
+  ChannelLocs(size_t size, cudaStream_t stream);  // allocates `size` entries per array; nothing if size == 0
+  ChannelLocs(ChannelLocs&& arg) : ChannelLocsBase(std::move(arg)) {}  // move construction is the only transfer
+
+  ChannelLocs(ChannelLocs&) = delete;  // copying and every form of assignment are disabled
+  ChannelLocs(const ChannelLocs&) = delete;
+  ChannelLocs& operator=(const ChannelLocs&) = delete;
+  ChannelLocs& operator=(ChannelLocs&&) = delete;
+
+  ~ChannelLocs() override = default;
+};
+
+class ChannelLocsView {  // non-owning, pointer-only description of a ChannelLocsGPU's arrays
+public:
+  void fill(const ChannelLocsGPU& c);  // host function: copies c's size and raw array pointers into this view
+
+  __device__ size_t size() const { return size_; }
+
+  __device__ const uint8_t* input(uint32_t index) const { return input_[index]; }  // no range check on index
+  __device__ size_t inoff(uint32_t index) const { return inoff_[index]; }
+  __device__ size_t offset(uint32_t index) const { return offset_[index]; }
+  __device__ uint16_t length(uint32_t index) const { return length_[index]; }
+  __device__ stripgpu::fedId_t fedID(uint32_t index) const { return fedID_[index]; }
+  __device__ stripgpu::fedCh_t fedCh(uint32_t index) const { return fedCh_[index]; }
+  __device__ stripgpu::detId_t detID(uint32_t index) const { return detID_[index]; }
+
+private:
+  const uint8_t* const* input_;  // input raw data for channel
+  size_t* inoff_;                // offset in input raw data
+  size_t* offset_;               // global offset in alldata
+  uint16_t* length_;             // length of channel data
+  stripgpu::fedId_t* fedID_;
+  stripgpu::fedCh_t* fedCh_;
+  stripgpu::detId_t* detID_;
+  size_t size_;  // number of channels, as reported by the ChannelLocsGPU passed to fill()
+};
+
+class ChannelLocsGPU : public ChannelLocsBase<cms::cuda::device::unique_ptr> {  // arrays held via device unique_ptr
+public:
+  // When size > 0, allocates the arrays plus a ChannelLocsView that is filled and copied to the device
+  ChannelLocsGPU(size_t size, cudaStream_t stream);
+  ChannelLocsGPU(ChannelLocsGPU&& arg)
+      : ChannelLocsBase(std::move(arg)), channelLocsViewGPU_(std::move(arg.channelLocsViewGPU_)) {}
+
+  ChannelLocsGPU(ChannelLocsGPU&) = delete;  // copying and every form of assignment are disabled
+  ChannelLocsGPU(const ChannelLocsGPU&) = delete;
+  ChannelLocsGPU& operator=(const ChannelLocsGPU&) = delete;
+  ChannelLocsGPU& operator=(ChannelLocsGPU&&) = delete;
+
+  ~ChannelLocsGPU() override = default;
+
+  void setVals(const ChannelLocs* c, cms::cuda::host::unique_ptr<const uint8_t*[]> inputGPU, cudaStream_t stream);
+  const ChannelLocsView* channelLocsView() const { return channelLocsViewGPU_.get(); }  // null if size was 0
+
+private:
+  cms::cuda::device::unique_ptr<ChannelLocsView> channelLocsViewGPU_;  // set only by the constructor, when size > 0
+};
+
+#endif
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc
index ce456617dbc22..4e6c929014985 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc
@@ -17,6 +17,8 @@
 
 #include "FWCore/Framework/interface/stream/EDProducer.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/Utilities/interface/InputTag.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
@@ -173,9 +175,8 @@ class SiStripClusterizerFromRaw final : public edm::stream::EDProducer<> {
                                                               conf.getParameter<edm::ParameterSet>("Clusterizer"))),
         rawAlgos_(SiStripRawProcessingFactory::create(conf.getParameter<edm::ParameterSet>("Algorithms"),
                                                       consumesCollector())),
-        doAPVEmulatorCheck_(conf.existsAs<bool>("DoAPVEmulatorCheck") ? conf.getParameter<bool>("DoAPVEmulatorCheck")
-                                                                      : true),
-        legacy_(conf.existsAs<bool>("LegacyUnpacker") ? conf.getParameter<bool>("LegacyUnpacker") : false),
+        doAPVEmulatorCheck_(conf.getParameter<bool>("DoAPVEmulatorCheck")),
+        legacy_(conf.getParameter<bool>("LegacyUnpacker")),
         hybridZeroSuppressed_(conf.getParameter<bool>("HybridZeroSuppressed")) {
     productToken_ = consumes<FEDRawDataCollection>(conf.getParameter<edm::InputTag>("ProductLabel"));
     produces<edmNew::DetSetVector<SiStripCluster> >();
@@ -211,6 +212,8 @@ class SiStripClusterizerFromRaw final : public edm::stream::EDProducer<> {
     ev.put(std::move(output));
   }
 
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
 private:
   void initialize(const edm::EventSetup& es);
 
@@ -231,6 +234,27 @@ class SiStripClusterizerFromRaw final : public edm::stream::EDProducer<> {
   bool hybridZeroSuppressed_;
 };
 
+void SiStripClusterizerFromRaw::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  // Top-level parameters and their default values
+  desc.add<edm::InputTag>("ProductLabel", edm::InputTag("rawDataCollector"));
+  desc.add<std::string>("ConditionsLabel", "");
+  desc.add<bool>("onDemand", true);
+  desc.add<bool>("DoAPVEmulatorCheck", true);
+  desc.add<bool>("LegacyUnpacker", false);
+  desc.add<bool>("HybridZeroSuppressed", false);
+  // Nested parameter set filled in by the clusterizer algorithm factory
+  edm::ParameterSetDescription clusterizerDesc;
+  StripClusterizerAlgorithmFactory::fillDescriptions(clusterizerDesc);
+  desc.add<edm::ParameterSetDescription>("Clusterizer", clusterizerDesc);
+  // Nested parameter set filled in by the raw-processing factory
+  edm::ParameterSetDescription algorithmsDesc;
+  SiStripRawProcessingFactory::fillDescriptions(algorithmsDesc);
+  desc.add<edm::ParameterSetDescription>("Algorithms", algorithmsDesc);
+
+  descriptions.addWithDefaultLabel(desc);
+}
+
 #include "FWCore/Framework/interface/MakerMacros.h"
 DEFINE_FWK_MODULE(SiStripClusterizerFromRaw);
 
@@ -250,7 +274,6 @@ void SiStripClusterizerFromRaw::run(const FEDRawDataCollection& rawColl, edmNew:
 
     if (record.empty())
       record.abort();
-
   }  // end loop over dets
 }
 
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc
new file mode 100644
index 0000000000000..2d7b4c83a4a4a
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc
@@ -0,0 +1,181 @@
+/*
+ */
+#include "RecoLocalTracker/Records/interface/SiStripClusterizerConditionsRcd.h"
+
+#include "RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h"
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "EventFilter/SiStripRawToDigi/interface/SiStripFEDBuffer.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditions.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Utilities/interface/Likely.h"
+
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "SiStripRawToClusterGPUKernel.h"
+#include "ChannelLocsGPU.h"
+
+//#include <sstream>
+#include <memory>
+#include <mutex>
+
+namespace {
+  // Wraps one FED's raw payload in a sistrip::FEDBuffer; returns null if any check below fails.
+  std::unique_ptr<sistrip::FEDBuffer> fillBuffer(int fedId, const FEDRawData& rawData) {
+    // 1) check the raw data itself, before any FEDBuffer is built on top of it
+    const auto preStatus = sistrip::preconstructCheckFEDBuffer(rawData);
+    if UNLIKELY (preStatus != sistrip::FEDBufferStatusCode::SUCCESS) {
+      LogDebug(sistrip::mlRawToCluster_) << "[ClustersFromRawProducer::" << __func__ << "]" << preStatus
+                                         << " for FED ID " << fedId;
+      return nullptr;
+    }
+
+    // 2) build the buffer and locate its channels
+    auto fedBuffer = std::make_unique<sistrip::FEDBuffer>(rawData);
+    const auto channelStatus = fedBuffer->findChannels();
+    if UNLIKELY (channelStatus != sistrip::FEDBufferStatusCode::SUCCESS) {
+      LogDebug(sistrip::mlRawToCluster_) << "Exception caught when creating FEDBuffer object for FED " << fedId << ": "
+                                         << channelStatus;
+      return nullptr;
+    }
+
+    // 3) buffer-level checks
+    if UNLIKELY (!fedBuffer->doChecks(false)) {
+      LogDebug(sistrip::mlRawToCluster_) << "Exception caught when creating FEDBuffer object for FED " << fedId
+                                         << ": FED Buffer check fails";
+      return nullptr;
+    }
+    return fedBuffer;
+  }
+}  // namespace
+
+class SiStripClusterizerFromRawGPU final : public edm::stream::EDProducer<edm::ExternalWork> {
+  // acquire() unpacks the FED buffers on the host and queues the GPU work; produce() publishes the result.
+public:
+  explicit SiStripClusterizerFromRawGPU(const edm::ParameterSet& conf)
+      : buffers_(sistrip::FED_ID_MAX + 1),  // slots are indexed by FED id, so the largest id needs slot [FED_ID_MAX]
+        raw_(sistrip::FED_ID_MAX + 1),
+        gpuAlgo_(conf.getParameter<edm::ParameterSet>("Clusterizer")),
+        inputToken_(consumes(conf.getParameter<edm::InputTag>("ProductLabel"))),
+        outputToken_(produces<cms::cuda::Product<SiStripClustersCUDADevice>>()),
+        conditionsToken_(esConsumes(edm::ESInputTag{"", conf.getParameter<std::string>("ConditionsLabel")})),
+        cpuConditionsToken_(esConsumes(edm::ESInputTag{"", conf.getParameter<std::string>("ConditionsLabel")})) {}
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void acquire(edm::Event const& ev,
+               edm::EventSetup const& es,
+               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override {
+    const auto& conditions = es.getData(conditionsToken_);        //these need to be GPU conditions
+    const auto& cpuConditions = es.getData(cpuConditionsToken_);  //CPU conditions
+
+    // Sets the current device and creates a CUDA stream
+    cms::cuda::ScopedContextAcquire ctx{ev.streamID(), std::move(waitingTaskHolder), ctxState_};
+
+    // Fill raw_ and buffers_ on the host for every FED referenced by the CPU conditions
+    auto const& rawData = ev.get(inputToken_);
+    run(rawData, cpuConditions);
+
+    // Queue the asynchronous transfers and kernels on the CUDA stream returned by ctx.stream()
+    gpuAlgo_.makeAsync(raw_, buffers_, conditions, ctx.stream());
+
+    // Destructor of ctx queues a callback to the CUDA stream notifying
+    // waitingTaskHolder when the queued asynchronous work has finished
+  }
+
+  void produce(edm::Event& ev, const edm::EventSetup& es) override {
+    cms::cuda::ScopedContextProduce ctx{ctxState_};
+
+    // gpuAlgo_.getResults() hands back the clusters held in GPU memory. ctx.emplace() wraps them in a
+    // cms::cuda::Product<SiStripClustersCUDADevice>, which also stores the current device and the
+    // CUDA stream since those will be needed in the consumer side.
+    ctx.emplace(ev, outputToken_, gpuAlgo_.getResults(ctx.stream()));
+
+    // Drop the per-event host state: release the FED buffers and forget the pointers into this event's raw data
+    for (auto& buf : buffers_)
+      buf.reset(nullptr);
+    raw_.assign(raw_.size(), nullptr);
+  }
+
+  // Host-side helpers called from acquire()
+  void run(const FEDRawDataCollection& rawColl, const SiStripClusterizerConditions& conditions);
+  void fill(uint32_t idet, const FEDRawDataCollection& rawColl, const SiStripClusterizerConditions& conditions);
+
+  // Per-event state; buffers_ and raw_ hold one slot per FED id
+  std::vector<std::unique_ptr<sistrip::FEDBuffer>> buffers_;
+  std::vector<const FEDRawData*> raw_;
+  cms::cuda::ContextState ctxState_;
+
+  stripgpu::SiStripRawToClusterGPUKernel gpuAlgo_;
+
+  edm::EDGetTokenT<FEDRawDataCollection> inputToken_;
+  edm::EDPutTokenT<cms::cuda::Product<SiStripClustersCUDADevice>> outputToken_;
+  edm::ESGetToken<stripgpu::SiStripClusterizerConditionsGPU, SiStripClusterizerConditionsRcd> conditionsToken_;
+  edm::ESGetToken<SiStripClusterizerConditions, SiStripClusterizerConditionsRcd> cpuConditionsToken_;
+};
+
+void SiStripClusterizerFromRawGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  // Top-level parameters and their default values
+  desc.add<edm::InputTag>("ProductLabel", edm::InputTag("rawDataCollector"));
+  desc.add<std::string>("ConditionsLabel", "");
+  // Nested parameter set filled in by the clusterizer algorithm factory
+  edm::ParameterSetDescription clusterizerDesc;
+  StripClusterizerAlgorithmFactory::fillDescriptions(clusterizerDesc);
+  desc.add<edm::ParameterSetDescription>("Clusterizer", clusterizerDesc);
+
+  descriptions.addWithDefaultLabel(desc);
+}
+
+void SiStripClusterizerFromRawGPU::run(const FEDRawDataCollection& rawColl,
+                                       const SiStripClusterizerConditions& conditions) {
+  // Call fill() once for every detector id listed by the conditions.
+  const auto& detIds = conditions.allDetIds();
+  for (const auto detId : detIds)
+    fill(detId, rawColl, conditions);
+}
+
+void SiStripClusterizerFromRawGPU::fill(uint32_t idet,
+                                        const FEDRawDataCollection& rawColl,
+                                        const SiStripClusterizerConditions& conditions) {
+  // Make sure raw_ and buffers_ hold an entry for every connected FED that reads out detector `idet`.
+  auto const& det = conditions.findDetId(idet);
+  if (!det.valid())
+    return;
+  // Loop over apv-pairs of det
+  for (auto const conn : conditions.currentConnection(det)) {
+    if UNLIKELY (!conn)
+      continue;
+
+    const uint16_t fedId = conn->fedId();
+
+    // Skip a null FED id, an id past the end of the per-FED arrays (would index out of bounds), or an unconnected link
+    if UNLIKELY (!fedId || fedId >= buffers_.size() || !conn->isConnected()) {
+      continue;
+    }
+
+    // Unpack a FED the first time one of its connections is seen. A FED whose payload failed its
+    // checks keeps a null slot, so it is unpacked again for each of its remaining connections.
+    if (!buffers_[fedId]) {
+      const FEDRawData& rawData = rawColl.FEDData(fedId);
+      raw_[fedId] = &rawData;
+      buffers_[fedId] = fillBuffer(fedId, rawData);
+    }
+  }  // end loop over conn
+}
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiStripClusterizerFromRawGPU);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc
new file mode 100644
index 0000000000000..67f4f3152dd9c
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc
@@ -0,0 +1,64 @@
+/**\class SiStripClusterizerConditionsGPUESProducer
+ *
+ * Create a GPU cache object for fast access to conditions needed by the SiStrip clusterizer
+ *
+ * @see SiStripClusterizerConditions
+ */
+#include <memory>
+
+#include "FWCore/Framework/interface/ModuleFactory.h"
+#include "FWCore/Framework/interface/ESProducer.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+
+#include "RecoLocalTracker/Records/interface/SiStripClusterizerConditionsRcd.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripGain.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripDetCabling.h"
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+
+using namespace stripgpu;
+
+class SiStripClusterizerConditionsGPUESProducer : public edm::ESProducer {
+public:
+  using ReturnType = std::unique_ptr<SiStripClusterizerConditionsGPU>;
+  SiStripClusterizerConditionsGPUESProducer(const edm::ParameterSet&);
+  ~SiStripClusterizerConditionsGPUESProducer() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+  // Builds the conditions object from the quality, gain and noise inputs read through the tokens below
+  ReturnType produce(const SiStripClusterizerConditionsRcd&);
+
+private:
+  edm::ESGetToken<SiStripGain, SiStripGainRcd> gainToken_;
+  edm::ESGetToken<SiStripNoises, SiStripNoisesRcd> noisesToken_;
+  edm::ESGetToken<SiStripQuality, SiStripQualityRcd> qualityToken_;
+};
+
+SiStripClusterizerConditionsGPUESProducer::SiStripClusterizerConditionsGPUESProducer(const edm::ParameterSet& iConfig) {
+  // Declare the product under the configured "Label", then the three inputs consumed from other records.
+  auto collector = setWhatProduced(this, iConfig.getParameter<std::string>("Label"));
+  const edm::ESInputTag qualityTag{"", iConfig.getParameter<std::string>("QualityLabel")};
+  gainToken_ = collector.consumesFrom<SiStripGain, SiStripGainRcd>();
+  noisesToken_ = collector.consumesFrom<SiStripNoises, SiStripNoisesRcd>();
+  qualityToken_ = collector.consumesFrom<SiStripQuality, SiStripQualityRcd>(qualityTag);
+}
+
+void SiStripClusterizerConditionsGPUESProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psd;  // both labels default to the empty string
+  psd.add<std::string>("QualityLabel", std::string{});
+  psd.add<std::string>("Label", std::string{});
+  descriptions.add("SiStripClusterizerConditionsGPUESProducer", psd);
+}
+
+SiStripClusterizerConditionsGPUESProducer::ReturnType SiStripClusterizerConditionsGPUESProducer::produce(
+    const SiStripClusterizerConditionsRcd& iRecord) {
+  // Fetch the inputs in the order gain, noises, quality, then build the conditions object from them.
+  auto gainHandle = iRecord.getTransientHandle(gainToken_);
+  const auto& noiseCond = iRecord.get(noisesToken_);
+  const auto& qualityCond = iRecord.get(qualityToken_);
+  return std::make_unique<SiStripClusterizerConditionsGPU>(qualityCond, gainHandle.product(), noiseCond);
+}
+
+DEFINE_FWK_EVENTSETUP_MODULE(SiStripClusterizerConditionsGPUESProducer);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc
new file mode 100644
index 0000000000000..8b891382085c7
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc
@@ -0,0 +1,83 @@
+/*
+ */
+#include "DataFormats/SiStripCluster/interface/SiStripCluster.h"
+#include "DataFormats/Common/interface/DetSetVectorNew.h"
+
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+
+#include <memory>
+
+class SiStripClustersFromSOA final : public edm::stream::EDProducer<> {
+public:
+  explicit SiStripClustersFromSOA(const edm::ParameterSet& conf)
+      : inputToken_(consumes<SiStripClustersCUDAHost>(conf.getParameter<edm::InputTag>("ProductLabel"))),
+        outputToken_(produces<edmNew::DetSetVector<SiStripCluster>>()) {}
+
+  // The only parameter is the label of the SiStripClustersCUDAHost product to convert.
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<edm::InputTag>("ProductLabel", edm::InputTag("siStripClustersSOAtoHost"));
+    descriptions.addWithDefaultLabel(desc);
+  }
+
+private:
+  // Converts the structure-of-arrays input into a DetSetVector: consecutive entries sharing a detId
+  // go into one set, and only entries flagged in trueCluster() become SiStripClusters.
+  void produce(edm::Event& ev, const edm::EventSetup& es) override {
+    using Clusters = edmNew::DetSetVector<SiStripCluster>;
+    const auto& soa = ev.get(inputToken_);
+
+    const int nEntries = soa.nClusters();
+    const auto sizes = soa.clusterSize().get();
+    const auto adcData = soa.clusterADCs().get();
+    const auto detIds = soa.clusterDetId().get();
+    const auto firstStrips = soa.firstStrip().get();
+    const auto isTrueCluster = soa.trueCluster().get();
+
+    const unsigned int initialReserve = 15000;
+    auto clusters = std::make_unique<Clusters>();
+    clusters->reserve(initialReserve, nEntries);
+
+    int entry = 0;
+    while (entry < nEntries) {
+      const auto currentDet = detIds[entry];
+      Clusters::FastFiller filler(*clusters, currentDet);
+
+      // consume the run of consecutive entries that belong to currentDet
+      for (; entry < nEntries && detIds[entry] == currentDet; ++entry) {
+        if (!isTrueCluster[entry])
+          continue;
+
+        const auto nStrips = sizes[entry];
+        const auto firstStrip = firstStrips[entry];
+
+        // ADC number k of this entry is read from adcData[entry + k * nEntries]
+        std::vector<uint8_t> adcs;
+        adcs.reserve(nStrips);
+        for (uint32_t k = 0; k < nStrips; ++k)
+          adcs.push_back(adcData[entry + k * nEntries]);
+
+        filler.push_back(SiStripCluster(firstStrip, std::move(adcs)));
+      }
+    }
+
+    clusters->shrink_to_fit();
+    ev.put(std::move(clusters));
+  }
+
+  edm::EDGetTokenT<SiStripClustersCUDAHost> inputToken_;
+  edm::EDPutTokenT<edmNew::DetSetVector<SiStripCluster>> outputToken_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiStripClustersFromSOA);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc
new file mode 100644
index 0000000000000..a51bb1216d0e0
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc
@@ -0,0 +1,76 @@
+/*
+ */
+#include "DataFormats/SiStripCluster/interface/SiStripCluster.h"
+
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+
+#include <memory>
+
+class SiStripSOAtoHost {  // builds a SiStripClustersCUDAHost from device clusters and hands it over once
+public:
+  SiStripSOAtoHost() = default;
+  void makeAsync(const SiStripClustersCUDADevice& clusters_d, cudaStream_t stream) {
+    hostClusters_ = std::make_unique<SiStripClustersCUDAHost>(clusters_d, stream);
+  }
+  // Transfers ownership of the object built by the last makeAsync(); the member is left null afterwards
+  std::unique_ptr<SiStripClustersCUDAHost> getResults() { return std::move(hostClusters_); }
+private:
+  std::unique_ptr<SiStripClustersCUDAHost> hostClusters_;
+};
+
+class SiStripClustersSOAtoHost final : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit SiStripClustersSOAtoHost(const edm::ParameterSet& conf)
+      : inputToken_(
+            consumes<cms::cuda::Product<SiStripClustersCUDADevice>>(conf.getParameter<edm::InputTag>("ProductLabel"))),
+        outputToken_(produces<SiStripClustersCUDAHost>()) {}
+
+  // The only parameter is the label of the device-side cluster product to convert.
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<edm::InputTag>("ProductLabel", edm::InputTag("siStripClusterizerFromRawGPU"));
+    descriptions.addWithDefaultLabel(desc);
+  }
+
+private:
+  // First step of the ExternalWork cycle: start building the host object from the device product.
+  void acquire(edm::Event const& ev,
+               edm::EventSetup const& es,
+               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override {
+    const auto& deviceProduct = ev.get(inputToken_);
+
+    // Sets the current device and creates a CUDA stream
+    cms::cuda::ScopedContextAcquire ctx{deviceProduct, std::move(waitingTaskHolder)};
+    const auto& deviceClusters = ctx.get(deviceProduct);
+
+    // Queue the work on the CUDA stream returned by ctx.stream()
+    gpuAlgo_.makeAsync(deviceClusters, ctx.stream());
+
+    // Destructor of ctx queues a callback to the CUDA stream notifying
+    // waitingTaskHolder when the queued asynchronous work has finished
+  }
+
+  // Second step: put the object prepared in acquire() into the event.
+  void produce(edm::Event& ev, const edm::EventSetup& es) override { ev.put(gpuAlgo_.getResults()); }
+
+private:
+  SiStripSOAtoHost gpuAlgo_;
+
+  edm::EDGetTokenT<cms::cuda::Product<SiStripClustersCUDADevice>> inputToken_;
+  edm::EDPutTokenT<SiStripClustersCUDAHost> outputToken_;
+};
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiStripClustersSOAtoHost);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc
new file mode 100644
index 0000000000000..a8b9aa04a00b3
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc
@@ -0,0 +1,185 @@
+#include "EventFilter/SiStripRawToDigi/interface/SiStripFEDBuffer.h"
+#include "DataFormats/Common/interface/DetSetVectorNew.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "RecoLocalTracker/SiStripClusterizer/interface/ClusterChargeCut.h"
+
+#include "SiStripRawToClusterGPUKernel.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+#include "ChannelLocsGPU.h"
+#include "StripDataView.h"
+
+namespace stripgpu {
+  // Allocates the three per-strip device arrays, each holding `size` elements,
+  // through cms::cuda::make_device_unique on `stream`. The seed-mask buffers are
+  // left empty here.
+  StripDataGPU::StripDataGPU(size_t size, cudaStream_t stream)
+      : alldataGPU_(cms::cuda::make_device_unique<uint8_t[]>(size, stream)),
+        channelGPU_(cms::cuda::make_device_unique<uint16_t[]>(size, stream)),
+        stripIdGPU_(cms::cuda::make_device_unique<stripgpu::stripId_t[]>(size, stream)) {}
+
+  // Reads the clusterizer thresholds and limits from the configuration.
+  //
+  // Parameters read from `conf`: ChannelThreshold, SeedThreshold, ClusterThreshold
+  // (stored squared), MaxSequentialHoles, MaxSequentialBad, MaxAdjacentBad and
+  // MaxClusterSize; the minimum good charge is obtained from clusterChargeCut(conf).
+  // The FED lookup table starts with every entry set to stripgpu::invalidFed.
+  SiStripRawToClusterGPUKernel::SiStripRawToClusterGPUKernel(const edm::ParameterSet& conf)
+      : fedIndex_(sistrip::NUMBER_OF_FEDS, stripgpu::invalidFed),
+        channelThreshold_(conf.getParameter<double>("ChannelThreshold")),
+        seedThreshold_(conf.getParameter<double>("SeedThreshold")),
+        clusterThresholdSquared_(std::pow(conf.getParameter<double>("ClusterThreshold"), 2.0f)),
+        maxSequentialHoles_(conf.getParameter<unsigned>("MaxSequentialHoles")),
+        maxSequentialBad_(conf.getParameter<unsigned>("MaxSequentialBad")),
+        maxAdjacentBad_(conf.getParameter<unsigned>("MaxAdjacentBad")),
+        maxClusterSize_(conf.getParameter<unsigned>("MaxClusterSize")),
+        minGoodCharge_(clusterChargeCut(conf)) {
+    // Reserve once so the per-event push_back calls in makeAsync() do not reallocate.
+    fedRawDataOffsets_.reserve(sistrip::NUMBER_OF_FEDS);
+  }
+
+  // Queues the full raw-to-cluster chain for one event on `stream`.
+  //
+  // Steps, in order:
+  //  1. concatenate the raw data of every FED that has a buffer into one host
+  //     staging buffer and copy it to the device;
+  //  2. walk the detector map (`conditions.detToFeds()`) and record, for every
+  //     entry, where its channel payload sits in the input and where its strips
+  //     go in the unpacked output;
+  //  3. launch the unpacking, seed-finding and cluster-finding steps.
+  //
+  // `rawdata` and `buffers` are indexed alike; entries whose buffer is null are skipped.
+  // Throws cms::Exception if the FEDs do not all share one readout mode, or if
+  // that mode is neither READOUT_MODE_ZERO_SUPPRESSED nor READOUT_MODE_ZERO_SUPPRESSED_LITE10.
+  // The resulting clusters are left in clusters_d_ (see getResults()).
+  void SiStripRawToClusterGPUKernel::makeAsync(const std::vector<const FEDRawData*>& rawdata,
+                                               const std::vector<std::unique_ptr<sistrip::FEDBuffer>>& buffers,
+                                               const SiStripClusterizerConditionsGPU& conditions,
+                                               cudaStream_t stream) {
+    // Total number of bytes needed to hold all non-null FED buffers back to back.
+    size_t totalSize{0};
+    for (const auto& buff : buffers) {
+      if (buff != nullptr) {
+        totalSize += buff->bufferSize();
+      }
+    }
+
+    auto fedRawDataHost = cms::cuda::make_host_unique<uint8_t[]>(totalSize, stream);
+    auto fedRawDataGPU = cms::cuda::make_device_unique<uint8_t[]>(totalSize, stream);
+
+    // Reset the per-event bookkeeping: fedIndex_ maps a FED to its slot in
+    // fedRawDataOffsets_, which stores the byte offset of that FED in the staging buffer.
+    size_t off = 0;
+    fedRawDataOffsets_.clear();
+    fedIndex_.clear();
+    fedIndex_.resize(sistrip::NUMBER_OF_FEDS, stripgpu::invalidFed);
+
+    sistrip::FEDReadoutMode mode = sistrip::READOUT_MODE_INVALID;
+
+    for (size_t fedi = 0; fedi < buffers.size(); ++fedi) {
+      auto& buff = buffers[fedi];
+      if (buff != nullptr) {
+        const auto raw = rawdata[fedi];
+        memcpy(fedRawDataHost.get() + off, raw->data(), raw->size());
+        fedIndex_[stripgpu::fedIndex(fedi)] = fedRawDataOffsets_.size();
+        fedRawDataOffsets_.push_back(off);
+        off += raw->size();
+        // The first FED found defines the readout mode; all others must agree with it.
+        if (fedRawDataOffsets_.size() == 1) {
+          mode = buff->readoutMode();
+        } else {
+          if (buff->readoutMode() != mode) {
+            throw cms::Exception("[SiStripRawToClusterGPUKernel] inconsistent readout mode ")
+                << buff->readoutMode() << " != " << mode;
+          }
+        }
+      }
+    }
+    // send rawdata to GPU
+    cms::cuda::copyAsync(fedRawDataGPU, fedRawDataHost, totalSize, stream);
+
+    const auto& detmap = conditions.detToFeds();
+    if ((mode != sistrip::READOUT_MODE_ZERO_SUPPRESSED) && (mode != sistrip::READOUT_MODE_ZERO_SUPPRESSED_LITE10)) {
+      throw cms::Exception("[SiStripRawToClusterGPUKernel] unsupported readout mode ") << mode;
+    }
+    // Number of leading bytes skipped in every channel payload; it depends on the readout mode.
+    const uint16_t headerlen = mode == sistrip::READOUT_MODE_ZERO_SUPPRESSED ? 7 : 2;
+    // Running position in the unpacked output; after the loop it is the total strip count.
+    size_t offset = 0;
+    auto chanlocs = std::make_unique<ChannelLocs>(detmap.size(), stream);
+    auto inputGPU = cms::cuda::make_host_unique<const uint8_t*[]>(chanlocs->size(), stream);
+
+    // iterate over the detector in DetID/APVPair order
+    // mapping out where the data are
+    for (size_t i = 0; i < detmap.size(); ++i) {
+      const auto& detp = detmap[i];
+      const auto fedId = detp.fedID();
+      const auto fedCh = detp.fedCh();
+      const auto fedi = fedIndex_[stripgpu::fedIndex(fedId)];
+
+      if (fedi != invalidFed) {
+        const auto buffer = buffers[fedId].get();
+        const auto& channel = buffer->channel(detp.fedCh());
+
+        auto len = channel.length();
+        auto off = channel.offset();
+
+        assert(len >= headerlen || len == 0);
+
+        // Skip the channel header so that only the payload is unpacked.
+        if (len >= headerlen) {
+          len -= headerlen;
+          off += headerlen;
+        }
+
+        chanlocs->setChannelLoc(i, channel.data(), off, offset, len, fedId, fedCh, detp.detID());
+        // Translate the host pointer of this channel into the matching address in the device copy.
+        inputGPU[i] = fedRawDataGPU.get() + fedRawDataOffsets_[fedi] + (channel.data() - rawdata[fedId]->data());
+        offset += len;
+
+      } else {
+        // No data for this FED in the event: mark the entry as empty.
+        chanlocs->setChannelLoc(i, nullptr, 0, 0, 0, invalidFed, 0, invalidDet);
+        inputGPU[i] = nullptr;
+      }
+    }
+
+    const auto n_strips = offset;
+
+    // Host-side copy of the view handed to the kernels; it is completed and
+    // copied to the device in allocateSSTDataGPU().
+    sst_data_d_ = cms::cuda::make_host_unique<StripDataView>(stream);
+    sst_data_d_->nStrips = n_strips;
+
+    chanlocsGPU_ = std::make_unique<ChannelLocsGPU>(detmap.size(), stream);
+    chanlocsGPU_->setVals(chanlocs.get(), std::move(inputGPU), stream);
+
+    stripdata_ = std::make_unique<StripDataGPU>(n_strips, stream);
+
+    const auto& condGPU = conditions.getGPUProductAsync(stream);
+
+    unpackChannelsGPU(condGPU.deviceView(), stream);
+#ifdef GPU_CHECK
+    cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+#ifdef EDM_ML_DEBUG
+    // Debug cross-check: redo the unpacking on the host and compare it with the device output.
+    auto outdata = cms::cuda::make_host_unique<uint8_t[]>(n_strips, stream);
+    cms::cuda::copyAsync(outdata, stripdata_->alldataGPU_, n_strips, stream);
+    cudaCheck(cudaStreamSynchronize(stream));
+
+    constexpr int xor3bits = 7;
+    for (size_t i = 0; i < chanlocs->size(); ++i) {
+      const auto data = chanlocs->input(i);
+      const auto len = chanlocs->length(i);
+
+      if (data != nullptr && len > 0) {
+        auto aoff = chanlocs->offset(i);
+        auto choff = chanlocs->inoff(i);
+        const auto end = choff + len;
+
+        while (choff < end) {
+          const auto stripIndex = data[choff++ ^ xor3bits];
+          const auto groupLength = data[choff++ ^ xor3bits];
+          // The kernel writes two placeholder entries in front of every group.
+          aoff += 2;
+          for (auto k = 0; k < groupLength; ++k, ++choff, ++aoff) {
+            if (data[choff ^ xor3bits] != outdata[aoff]) {
+              LogDebug("SiStripRawToClusterGPUKernel")
+                  << "Strip mismatch " << stripIndex << " i:k " << i << ":" << k << " "
+                  << (uint32_t)data[choff ^ xor3bits] << " != " << (uint32_t)outdata[aoff] << std::endl;
+            }
+          }
+        }
+      }
+    }
+    outdata.reset(nullptr);
+#endif
+
+    // The raw data are not referenced by the steps queued below.
+    fedRawDataGPU.reset();
+    allocateSSTDataGPU(n_strips, stream);
+    setSeedStripsNCIndexGPU(condGPU.deviceView(), stream);
+
+    clusters_d_ = SiStripClustersCUDADevice(kMaxSeedStrips, maxClusterSize_, stream);
+    findClusterGPU(condGPU.deviceView(), stream);
+
+    stripdata_.reset();
+  }
+
+  // Hands the clusters built by makeAsync() over to the caller. The per-event
+  // helper buffers are released first; `stream` is not used.
+  SiStripClustersCUDADevice SiStripRawToClusterGPUKernel::getResults(cudaStream_t stream) {
+    this->reset();
+
+    SiStripClustersCUDADevice clusters(std::move(clusters_d_));
+    return clusters;
+  }
+
+  // Releases the channel-location table and the host copy of the strip data view.
+  void SiStripRawToClusterGPUKernel::reset() {
+    chanlocsGPU_ = nullptr;
+    sst_data_d_ = nullptr;
+  }
+}  // namespace stripgpu
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu
new file mode 100644
index 0000000000000..c88d5a60ec86b
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu
@@ -0,0 +1,581 @@
+#include <cub/cub.cuh>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+
+#include "ChannelLocsGPU.h"
+#include "SiStripRawToClusterGPUKernel.h"
+#include "StripDataView.h"
+
+//#define GPU_DEBUG
+#if defined(EDM_ML_DEBUG) || defined(GPU_DEBUG)
+#define GPU_CHECK
+#include <cstdio>
+#endif
+
+using namespace stripgpu;
+using ConditionsDeviceView = SiStripClusterizerConditionsGPU::Data::DeviceView;
+
+// Unpacks the zero-suppressed payload of every channel into flat per-strip arrays.
+//
+// Launch layout: 1D grid of 1D blocks; channels are distributed over all
+// threads and any grid size is valid. No shared memory is used.
+//
+// For each group found in a channel payload (one byte first-strip index, one
+// byte group length, then `groupLength` ADC bytes) the output receives two
+// placeholder entries (stripId = invalidStrip, ADC = 0) followed by one entry
+// per strip. Input bytes are addressed with the index XOR-ed with 7.
+//
+//   chanlocs   - per-channel input pointer, input offset, length and output offset
+//   conditions - provides the APV-pair index used to offset the strip number
+//   alldata    - output ADC values
+//   channel    - output index of the channel each entry belongs to
+//   stripId    - output strip number, or invalidStrip for placeholder entries
+__global__ static void unpackChannels(const ChannelLocsView *chanlocs,
+                                      const ConditionsDeviceView *conditions,
+                                      uint8_t *alldata,
+                                      uint16_t *channel,
+                                      stripId_t *stripId) {
+  const int tid = threadIdx.x;
+  const int bid = blockIdx.x;
+  const int nthreads = blockDim.x;
+
+  const auto first = nthreads * bid + tid;
+  const auto stride = blockDim.x * gridDim.x;
+  for (auto chan = first; chan < chanlocs->size(); chan += stride) {
+    const auto fedid = chanlocs->fedID(chan);
+    const auto fedch = chanlocs->fedCh(chan);
+    const auto ipair = conditions->iPair(fedid, fedch);
+    const auto ipoff = sistrip::STRIPS_PER_FEDCH * ipair;
+
+    const auto data = chanlocs->input(chan);
+    const auto len = chanlocs->length(chan);
+
+    if (data != nullptr && len > 0) {
+      auto aoff = chanlocs->offset(chan);
+      auto choff = chanlocs->inoff(chan);
+      const auto end = choff + len;
+
+      while (choff < end) {
+        auto stripIndex = data[(choff++) ^ 7] + ipoff;
+        const auto groupLength = data[(choff++) ^ 7];
+
+        // Two placeholder entries separate consecutive groups. Their channel is
+        // written as well: later kernels read channel[] for every entry, and an
+        // unwritten slot would be used as an index into the channel table.
+        for (auto i = 0; i < 2; ++i) {
+          stripId[aoff] = invalidStrip;
+          channel[aoff] = chan;
+          alldata[aoff++] = 0;
+        }
+
+        for (auto i = 0; i < groupLength; ++i) {
+          stripId[aoff] = stripIndex++;
+          channel[aoff] = chan;
+          alldata[aoff++] = data[(choff++) ^ 7];
+        }
+      }  // choff < end
+    }    // data != nullptr && len > 0
+  }      // chan < chanlocs->size()
+}
+
+// Flags every strip whose ADC reaches the seed threshold.
+//
+// Launch layout: 1D grid of 1D blocks; the loop covers all nStrips entries for
+// any grid size. For each entry both seedStripsMask and seedStripsNCMask are
+// set to 1 when the entry is a valid strip with
+// adc >= uint8_t(noise * seedThreshold), and to 0 otherwise.
+__global__ static void setSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
+  const auto __restrict__ chanlocs = sst_data_d->chanlocs;
+  const uint8_t *__restrict__ adc = sst_data_d->adc;
+  const uint16_t *__restrict__ channels = sst_data_d->channel;
+  const uint16_t *__restrict__ stripId = sst_data_d->stripId;
+  int *__restrict__ seedStripsMask = sst_data_d->seedStripsMask;
+  int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
+  const float seedThreshold = sst_data_d->seedThreshold;
+  const int nStrips = sst_data_d->nStrips;
+
+  const int stride = blockDim.x * gridDim.x;
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nStrips; i += stride) {
+    int isSeed = 0;
+
+    const stripId_t strip = stripId[i];
+    if (strip != invalidStrip) {
+      const auto chan = channels[i];
+      const fedId_t fed = chanlocs->fedID(chan);
+      const fedCh_t channel = chanlocs->fedCh(chan);
+      const float noise = conditions->noise(fed, channel, strip);
+      const uint8_t adcValue = adc[i];
+
+      if (adcValue >= static_cast<uint8_t>(noise * seedThreshold)) {
+        isSeed = 1;
+      }
+    }
+
+    // Placeholder entries keep both masks at zero.
+    seedStripsMask[i] = isSeed;
+    seedStripsNCMask[i] = isSeed;
+  }
+}
+
+// Clears seedStripsNCMask for every seed strip that directly follows another
+// seed strip (strip number larger by exactly one) on the same detector, so
+// that only the first strip of a run of consecutive seeds stays marked.
+//
+// Launch layout: 1D grid of 1D blocks; the loop covers all nStrips entries for
+// any grid size. Expects seedStripsMask / seedStripsNCMask to have been filled
+// by setSeedStripsGPU. `conditions` is not used.
+__global__ static void setNCSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
+  const int nStrips = sst_data_d->nStrips;
+  const auto __restrict__ chanlocs = sst_data_d->chanlocs;
+  const uint16_t *__restrict__ channels = sst_data_d->channel;
+  const uint16_t *__restrict__ stripId = sst_data_d->stripId;
+  const int *__restrict__ seedStripsMask = sst_data_d->seedStripsMask;
+  int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
+
+  const int tid = threadIdx.x;
+  const int bid = blockIdx.x;
+  const int nthreads = blockDim.x;
+  const int first = nthreads * bid + tid;
+  const int stride = blockDim.x * gridDim.x;
+
+  for (int i = first; i < nStrips; i += stride) {
+    // Test the masks first: setSeedStripsGPU only sets them for valid strips,
+    // so the channel entries are read only where the unpacker has written them
+    // (placeholder entries may hold an unwritten channel index).
+    if (i > 0 && seedStripsMask[i] && seedStripsMask[i - 1] && (stripId[i] - stripId[i - 1]) == 1) {
+      const auto detid = chanlocs->detID(channels[i]);
+      const auto detid1 = chanlocs->detID(channels[i - 1]);
+
+      if (detid == detid1) {
+        seedStripsNCMask[i] = 0;
+      }
+    }
+  }
+}
+
+// Builds the compact list of non-consecutive seed strips: for every entry i
+// with seedStripsNCMask[i] == 1, stores i at position prefixSeedStripsNCMask[i]
+// of seedStripsNCIndex.
+//
+// Launch layout: 1D grid of 1D blocks; the loop covers all nStrips entries for
+// any grid size.
+__global__ static void setStripIndexGPU(StripDataView *sst_data_d) {
+  const int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
+  const int *__restrict__ prefixSeedStripsNCMask = sst_data_d->prefixSeedStripsNCMask;
+  int *__restrict__ seedStripsNCIndex = sst_data_d->seedStripsNCIndex;
+  const int nStrips = sst_data_d->nStrips;
+
+  const int stride = blockDim.x * gridDim.x;
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nStrips; i += stride) {
+    if (seedStripsNCMask[i] != 1) {
+      continue;
+    }
+    // The prefix sum gives the slot of this seed in the compact list.
+    seedStripsNCIndex[prefixSeedStripsNCMask[i]] = i;
+  }
+}
+
+// Grows a cluster candidate to the left and to the right of every
+// non-consecutive seed strip.
+//
+// Launch layout: 1D grid of 1D blocks; the loop covers all seeds for any grid
+// size. For seed i the kernel writes clusterIndex_[i] (left-most entry),
+// clusterSize_[i] (number of entries spanned, placeholders included),
+// clusterDetId_[i], firstStrip_[i] and trueCluster_[i]; the thread with global
+// index 0 also stores the number of seeds in nClusters_.
+//
+// A neighbouring strip is inspected while it is on the same detector, at most
+// maxSequentialHoles strips away from the current boundary, and the cluster
+// holds no more than clusterSizeLimit strips; it is added when its ADC reaches
+// uint8_t(noise * channelThreshold). Placeholder entries (invalidStrip) are
+// stepped over two at a time.
+__global__ static void findLeftRightBoundaryGPU(const StripDataView *sst_data_d,
+                                                const ConditionsDeviceView *conditions,
+                                                SiStripClustersCUDADevice::DeviceView *clust_data_d) {
+  const int nStrips = sst_data_d->nStrips;
+  const int *__restrict__ seedStripsNCIndex = sst_data_d->seedStripsNCIndex;
+  const auto __restrict__ chanlocs = sst_data_d->chanlocs;
+  const uint16_t *__restrict__ stripId = sst_data_d->stripId;
+  const uint16_t *__restrict__ channels = sst_data_d->channel;
+  const uint8_t *__restrict__ adc = sst_data_d->adc;
+  // prefixSeedStripsNCMask is an exclusive prefix sum: its last element counts
+  // the seeds among all entries but the last, so the mask of the last entry has
+  // to be added to obtain the total. With no strips at all there are no seeds
+  // (and no last element to read).
+  const int nSeedsTotal =
+      nStrips > 0 ? sst_data_d->prefixSeedStripsNCMask[nStrips - 1] + sst_data_d->seedStripsNCMask[nStrips - 1] : 0;
+  const int nSeedStripsNC = std::min(kMaxSeedStrips, nSeedsTotal);
+  const uint8_t maxSequentialHoles = sst_data_d->maxSequentialHoles;
+  const float channelThreshold = sst_data_d->channelThreshold;
+  const float clusterThresholdSquared = sst_data_d->clusterThresholdSquared;
+  const int clusterSizeLimit = sst_data_d->clusterSizeLimit;
+
+  auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
+  auto __restrict__ clusterSize = clust_data_d->clusterSize_;
+  auto __restrict__ clusterDetId = clust_data_d->clusterDetId_;
+  auto __restrict__ firstStrip = clust_data_d->firstStrip_;
+  auto __restrict__ trueCluster = clust_data_d->trueCluster_;
+
+  const int tid = threadIdx.x;
+  const int bid = blockIdx.x;
+  const int nthreads = blockDim.x;
+  const int first = nthreads * bid + tid;
+  const int stride = blockDim.x * gridDim.x;
+
+  for (int i = first; i < nSeedStripsNC; i += stride) {
+    const auto index = seedStripsNCIndex[i];
+    const auto chan = channels[index];
+    const auto fed = chanlocs->fedID(chan);
+    const auto channel = chanlocs->fedCh(chan);
+    const auto det = chanlocs->detID(chan);
+    const auto strip = stripId[index];
+    const auto noise_i = conditions->noise(fed, channel, strip);
+
+    // Running sums over the accepted strips, starting with the seed itself.
+    auto noiseSquared_i = noise_i * noise_i;
+    float adcSum_i = static_cast<float>(adc[index]);
+    auto testIndex = index - 1;
+    auto size = 1;
+
+    // Adds the strip at testIndex to the cluster if it passes the channel
+    // threshold, moving the given boundary (left or right) onto it.
+    auto addtocluster = [&](int &indexLR) {
+      const auto testchan = channels[testIndex];
+      const auto testFed = chanlocs->fedID(testchan);
+      const auto testChannel = chanlocs->fedCh(testchan);
+      const auto testStrip = stripId[testIndex];
+      const auto testNoise = conditions->noise(testFed, testChannel, testStrip);
+      const auto testADC = adc[testIndex];
+
+      if (testADC >= static_cast<uint8_t>(testNoise * channelThreshold)) {
+        ++size;
+        indexLR = testIndex;
+        noiseSquared_i += testNoise * testNoise;
+        adcSum_i += static_cast<float>(testADC);
+      }
+    };
+
+    // find left boundary
+    auto indexLeft = index;
+
+    // Step over the two placeholder entries in front of a group.
+    if (testIndex >= 0 && stripId[testIndex] == invalidStrip) {
+      testIndex -= 2;
+    }
+
+    if (testIndex >= 0) {
+      const auto testchan = channels[testIndex];
+      const auto testDet = chanlocs->detID(testchan);
+      auto rangeLeft = stripId[indexLeft] - stripId[testIndex] - 1;
+      auto sameDetLeft = det == testDet;
+
+      while (sameDetLeft && rangeLeft >= 0 && rangeLeft <= maxSequentialHoles && size < clusterSizeLimit + 1) {
+        addtocluster(indexLeft);
+        --testIndex;
+        if (testIndex >= 0 && stripId[testIndex] == invalidStrip) {
+          testIndex -= 2;
+        }
+        if (testIndex >= 0) {
+          rangeLeft = stripId[indexLeft] - stripId[testIndex] - 1;
+          const auto newchan = channels[testIndex];
+          const auto newdet = chanlocs->detID(newchan);
+          sameDetLeft = det == newdet;
+        } else {
+          sameDetLeft = false;
+        }
+      }  // while loop
+    }    // testIndex >= 0
+
+    // find right boundary
+    auto indexRight = index;
+    testIndex = index + 1;
+
+    if (testIndex < nStrips && stripId[testIndex] == invalidStrip) {
+      testIndex += 2;
+    }
+
+    if (testIndex < nStrips) {
+      const auto testchan = channels[testIndex];
+      const auto testDet = chanlocs->detID(testchan);
+      auto rangeRight = stripId[testIndex] - stripId[indexRight] - 1;
+      auto sameDetRight = det == testDet;
+
+      while (sameDetRight && rangeRight >= 0 && rangeRight <= maxSequentialHoles && size < clusterSizeLimit + 1) {
+        addtocluster(indexRight);
+        ++testIndex;
+        if (testIndex < nStrips && stripId[testIndex] == invalidStrip) {
+          testIndex += 2;
+        }
+        if (testIndex < nStrips) {
+          rangeRight = stripId[testIndex] - stripId[indexRight] - 1;
+          const auto newchan = channels[testIndex];
+          const auto newdet = chanlocs->detID(newchan);
+          sameDetRight = det == newdet;
+        } else {
+          sameDetRight = false;
+        }
+      }  // while loop
+    }    // testIndex < nStrips
+    clusterIndexLeft[i] = indexLeft;
+    clusterSize[i] = indexRight - indexLeft + 1;
+    clusterDetId[i] = det;
+    firstStrip[i] = stripId[indexLeft];
+    // Keep the candidate if (sum of ADC)^2 >= clusterThreshold^2 * (sum of noise^2)
+    // and it does not exceed the size limit.
+    trueCluster[i] =
+        (noiseSquared_i * clusterThresholdSquared <= adcSum_i * adcSum_i) and (clusterSize[i] <= clusterSizeLimit);
+  }  // i < nSeedStripsNC
+  if (first == 0) {
+    clust_data_d->nClusters_ = nSeedStripsNC;
+  }
+}
+
+// Finalises the cluster candidates produced by findLeftRightBoundaryGPU.
+//
+// Launch layout: 1D grid of 1D blocks; the loop covers all nClusters_
+// candidates for any grid size. For every candidate still flagged in
+// trueCluster_:
+//  - a candidate with the same left boundary as its predecessor is dropped as a duplicate;
+//  - otherwise the gain-corrected ADCs of its valid strips are stored in
+//    clusterADCs_ (strip j of cluster i at j * nClusters_ + i), and charge_,
+//    barycenter_ and clusterSize_ (now the number of valid strips) are filled;
+//  - trueCluster_ is set according to the minGoodCharge cut.
+__global__ static void checkClusterConditionGPU(StripDataView *sst_data_d,
+                                                const ConditionsDeviceView *conditions,
+                                                SiStripClustersCUDADevice::DeviceView *clust_data_d) {
+  const uint16_t *__restrict__ stripId = sst_data_d->stripId;
+  const auto __restrict__ chanlocs = sst_data_d->chanlocs;
+  const uint16_t *__restrict__ channels = sst_data_d->channel;
+  const uint8_t *__restrict__ adc = sst_data_d->adc;
+  const float minGoodCharge = sst_data_d->minGoodCharge;  //1620.0;
+  const auto nSeedStripsNC = clust_data_d->nClusters_;
+  const auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
+
+  auto __restrict__ clusterSize = clust_data_d->clusterSize_;
+  auto __restrict__ clusterADCs = clust_data_d->clusterADCs_;
+  auto __restrict__ trueCluster = clust_data_d->trueCluster_;
+  auto __restrict__ barycenter = clust_data_d->barycenter_;
+  auto __restrict__ charge = clust_data_d->charge_;
+
+  constexpr uint16_t stripIndexMask = 0x7FFF;
+
+  const int tid = threadIdx.x;
+  const int bid = blockIdx.x;
+  const int nthreads = blockDim.x;
+  const int first = nthreads * bid + tid;
+  const int stride = blockDim.x * gridDim.x;
+
+  for (int i = first; i < nSeedStripsNC; i += stride) {
+    if (trueCluster[i]) {
+      const int left = clusterIndexLeft[i];
+      const int size = clusterSize[i];
+
+      if (i > 0 && clusterIndexLeft[i - 1] == left) {
+        trueCluster[i] = 0;  // ignore duplicates
+      } else {
+        float adcSum = 0.0f;
+        int sumx = 0;
+        int suma = 0;
+
+        // j counts the valid strips stored so far; k also runs over placeholder entries.
+        auto j = 0;
+        for (int k = 0; k < size; k++) {
+          const auto index = left + k;
+          const auto strip = stripId[index];
+          // Look up the channel only for valid strips: the unpacker does not
+          // write a channel index for placeholder entries, so reading it there
+          // would index the channel table with an undefined value.
+          if (strip != invalidStrip) {
+            const auto chan = channels[index];
+            const auto fed = chanlocs->fedID(chan);
+            const auto channel = chanlocs->fedCh(chan);
+#ifdef GPU_CHECK
+            if (fed == invalidFed) {
+              printf("Invalid fed index %d\n", index);
+            }
+#endif
+            const float gain_j = conditions->gain(fed, channel, strip);
+
+            uint8_t adc_j = adc[index];
+            // Gain-corrected ADC, rounded to the nearest integer.
+            const int corrected = static_cast<int>(static_cast<float>(adc_j) / gain_j + 0.5f);
+
+            constexpr uint8_t adc_low_saturation = 254;
+            constexpr uint8_t adc_high_saturation = 255;
+            constexpr int charge_low_saturation = 253;
+            constexpr int charge_high_saturation = 1022;
+            // Raw values of 254 and 255 are kept as they are; other values are
+            // replaced by the corrected one, clamped to 254 / 255 when it
+            // exceeds the low / high saturation limits.
+            if (adc_j < adc_low_saturation) {
+              adc_j = (corrected > charge_high_saturation
+                           ? adc_high_saturation
+                           : (corrected > charge_low_saturation ? adc_low_saturation : corrected));
+            }
+            clusterADCs[j * nSeedStripsNC + i] = adc_j;
+
+            adcSum += static_cast<float>(adc_j);
+            sumx += j * adc_j;
+            suma += adc_j;
+            j++;
+          }
+        }  // loop over cluster strips
+        charge[i] = adcSum;
+        const auto chan = channels[left];
+        const fedId_t fed = chanlocs->fedID(chan);
+        const fedCh_t channel = chanlocs->fedCh(chan);
+        trueCluster[i] = (adcSum * conditions->invthick(fed, channel)) > minGoodCharge;
+        // ADC-weighted mean position, relative to the first strip of the cluster.
+        const auto bary_i = static_cast<float>(sumx) / static_cast<float>(suma);
+        barycenter[i] = static_cast<float>(stripId[left] & stripIndexMask) + bary_i + 0.5f;
+        clusterSize[i] = j;
+      }  // not a duplicate cluster
+    }    // trueCluster[i] is true
+  }      // i < nSeedStripsNC
+}
+
+namespace stripgpu {
+  // Launches the unpackChannels kernel on `stream`, with 128 threads per block
+  // and enough blocks to give every channel of chanlocsGPU_ its own thread.
+  // The unpacked strips are written to the buffers owned by stripdata_.
+  void SiStripRawToClusterGPUKernel::unpackChannelsGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
+    constexpr int nthreads = 128;
+    const auto channels = chanlocsGPU_->size();
+    if (channels == 0) {
+      // Nothing to unpack; a launch with an empty grid would be rejected.
+      return;
+    }
+    const auto nblocks = (channels + nthreads - 1) / nthreads;
+
+    unpackChannels<<<nblocks, nthreads, 0, stream>>>(chanlocsGPU_->channelLocsView(),
+                                                     conditions,
+                                                     stripdata_->alldataGPU_.get(),
+                                                     stripdata_->channelGPU_.get(),
+                                                     stripdata_->stripIdGPU_.get());
+    // Report launch failures here, as done for every other kernel launch in this file.
+    cudaCheck(cudaGetLastError());
+  }
+
+  // Allocates the seed-mask buffers, completes the host copy of the strip data
+  // view (buffer pointers and clusterizer parameters) and queues its copy to
+  // the device object pt_sst_data_d_ on `stream`.
+  void SiStripRawToClusterGPUKernel::allocateSSTDataGPU(int max_strips, cudaStream_t stream) {
+    auto &strips = *stripdata_;
+    auto &view = *sst_data_d_;
+
+    // Each allocation holds two arrays of max_strips ints laid out back to back.
+    strips.seedStripsMask_ = cms::cuda::make_device_unique<int[]>(2 * max_strips, stream);
+    strips.prefixSeedStripsNCMask_ = cms::cuda::make_device_unique<int[]>(2 * max_strips, stream);
+
+    // Device buffers
+    view.chanlocs = chanlocsGPU_->channelLocsView();
+    view.stripId = strips.stripIdGPU_.get();
+    view.channel = strips.channelGPU_.get();
+    view.adc = strips.alldataGPU_.get();
+    view.seedStripsMask = strips.seedStripsMask_.get();
+    view.prefixSeedStripsNCMask = strips.prefixSeedStripsNCMask_.get();
+
+    // Second halves of the two allocations above
+    view.seedStripsNCMask = view.seedStripsMask + max_strips;
+    view.seedStripsNCIndex = view.prefixSeedStripsNCMask + max_strips;
+
+    // Clusterizer parameters
+    view.channelThreshold = channelThreshold_;
+    view.seedThreshold = seedThreshold_;
+    view.clusterThresholdSquared = clusterThresholdSquared_;
+    view.maxSequentialHoles = maxSequentialHoles_;
+    view.maxSequentialBad = maxSequentialBad_;
+    view.maxAdjacentBad = maxAdjacentBad_;
+    view.minGoodCharge = minGoodCharge_;
+    view.clusterSizeLimit = maxClusterSize_;
+
+    pt_sst_data_d_ = cms::cuda::make_device_unique<StripDataView>(stream);
+    cms::cuda::copyAsync(pt_sst_data_d_, sst_data_d_, stream);
+#ifdef GPU_CHECK
+    cudaCheck(cudaStreamSynchronize(stream));
+#endif
+  }
+
+  // Queues the two cluster-finding kernels on `stream`: findLeftRightBoundaryGPU
+  // followed by checkClusterConditionGPU, both with 128 threads per block. In
+  // between, the number of clusters written by the first kernel is copied
+  // asynchronously to the address returned by clusters_d_.nClustersPtr().
+  //
+  // With GPU_CHECK defined the stream is synchronised after each kernel so that
+  // execution errors are reported at the kernel that caused them; with
+  // GPU_DEBUG defined the inputs and the resulting clusters are also printed.
+  void SiStripRawToClusterGPUKernel::findClusterGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
+    const int nthreads = 128;
+    const int nStrips = sst_data_d_->nStrips;
+    const int nSeeds = std::min(kMaxSeedStrips, nStrips);
+    const int nblocks = (nSeeds + nthreads - 1) / nthreads;
+
+#ifdef GPU_DEBUG
+    auto cpu_index = cms::cuda::make_host_unique<int[]>(nStrips, stream);
+    auto cpu_strip = cms::cuda::make_host_unique<uint16_t[]>(nStrips, stream);
+    auto cpu_adc = cms::cuda::make_host_unique<uint8_t[]>(nStrips, stream);
+
+    cudaCheck(cudaMemcpyAsync(
+        cpu_strip.get(), sst_data_d_->stripId, nStrips * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(
+        cudaMemcpyAsync(cpu_adc.get(), sst_data_d_->adc, nStrips * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(
+        cpu_index.get(), sst_data_d_->seedStripsNCIndex, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaStreamSynchronize(stream));
+
+    for (int i = 0; i < nStrips; i++) {
+      std::cout << " cpu_strip " << cpu_strip[i] << " cpu_adc " << (unsigned int)cpu_adc[i] << " cpu index "
+                << cpu_index[i] << std::endl;
+    }
+#endif
+
+    auto clust_data_d = clusters_d_.view();
+    findLeftRightBoundaryGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions, clust_data_d);
+    cudaCheck(cudaGetLastError());
+#ifdef GPU_CHECK
+    // Checked synchronisation of this stream only, as in the other debug paths of this file.
+    cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+    // Only the address of the device-side counter is formed here; the view is not dereferenced on the host.
+    cudaCheck(cudaMemcpyAsync(clusters_d_.nClustersPtr(),
+                              &(clust_data_d->nClusters_),
+                              sizeof(clust_data_d->nClusters_),
+                              cudaMemcpyDeviceToHost,
+                              stream));
+
+    checkClusterConditionGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions, clust_data_d);
+    cudaCheck(cudaGetLastError());
+
+#ifdef GPU_CHECK
+    cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+#ifdef GPU_DEBUG
+    cudaCheck(cudaStreamSynchronize(stream));
+    auto clust_data = std::make_unique<SiStripClustersCUDAHost>(clusters_d_, stream);
+    cudaCheck(cudaStreamSynchronize(stream));
+
+    const auto clusterIndexLeft = clust_data->clusterIndex().get();
+    const auto clusterSize = clust_data->clusterSize().get();
+    const auto trueCluster = clust_data->trueCluster().get();
+    const auto clusterADCs = clust_data->clusterADCs().get();
+    const auto detids = clust_data->clusterDetId().get();
+    const auto charge = clust_data->charge().get();
+
+    const auto nSeedStripsNC = clusters_d_.nClusters();
+    std::cout << "findClusterGPU nSeedStripsNC=" << nSeedStripsNC << std::endl;
+
+    for (auto i = 0U; i < nSeedStripsNC; i++) {
+      if (trueCluster[i]) {
+        int left = clusterIndexLeft[i];
+        uint32_t size = clusterSize[i];
+        const auto detid = detids[i];
+        std::cout << "i=" << i << " detId " << detid << " left " << left << " size " << size << " charge " << charge[i]
+                  << ": ";
+        size = std::min(size, maxClusterSize_);
+        for (uint32_t j = 0; j < size; j++) {
+          std::cout << (unsigned int)clusterADCs[j * nSeedStripsNC + i] << " ";
+        }
+        std::cout << std::endl;
+      }
+    }
+#endif
+  }
+
+  // Queues the seed-finding steps on `stream`, with 256 threads per block:
+  //  1. setSeedStripsGPU marks the seed strips;
+  //  2. setNCSeedStripsGPU keeps only the first strip of each run of consecutive seeds;
+  //  3. cub::DeviceScan::ExclusiveSum computes the exclusive prefix sum of that mask;
+  //  4. setStripIndexGPU turns mask and prefix sum into a compact list of seed indices.
+  //
+  // Every launch and both CUB calls are checked. With GPU_CHECK defined the
+  // stream is synchronised after each step; with GPU_DEBUG defined the inputs
+  // and results are also printed.
+  void SiStripRawToClusterGPUKernel::setSeedStripsNCIndexGPU(const ConditionsDeviceView *conditions,
+                                                             cudaStream_t stream) {
+#ifdef GPU_DEBUG
+    int nStrips = sst_data_d_->nStrips;
+    auto cpu_strip = cms::cuda::make_host_unique<uint16_t[]>(nStrips, stream);
+    auto cpu_adc = cms::cuda::make_host_unique<uint8_t[]>(nStrips, stream);
+
+    cudaCheck(cudaMemcpyAsync(
+        cpu_strip.get(), sst_data_d_->stripId, nStrips * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(
+        cudaMemcpyAsync(cpu_adc.get(), sst_data_d_->adc, nStrips * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaStreamSynchronize(stream));
+
+    for (int i = 0; i < nStrips; i++) {
+      std::cout << " cpu_strip " << cpu_strip[i] << " cpu_adc " << (unsigned int)cpu_adc[i] << std::endl;
+    }
+#endif
+
+    constexpr int nthreads = 256;
+    const int nblocks = (sst_data_d_->nStrips + nthreads - 1) / nthreads;
+
+    //mark seed strips
+    setSeedStripsGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions);
+    cudaCheck(cudaGetLastError());
+#ifdef GPU_CHECK
+    cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+    //mark only non-consecutive seed strips (mask out consecutive seed strips)
+    setNCSeedStripsGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions);
+    cudaCheck(cudaGetLastError());
+#ifdef GPU_CHECK
+    cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+    // First call with a null workspace: only queries the required temporary storage size.
+    std::size_t temp_storage_bytes = 0;
+    cudaCheck(cub::DeviceScan::ExclusiveSum(nullptr,
+                                            temp_storage_bytes,
+                                            sst_data_d_->seedStripsNCMask,
+                                            sst_data_d_->prefixSeedStripsNCMask,
+                                            sst_data_d_->nStrips,
+                                            stream));
+#ifdef GPU_DEBUG
+    std::cout << "temp_storage_bytes=" << temp_storage_bytes << std::endl;
+#endif
+#ifdef GPU_CHECK
+    cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+    {
+      // Second call with the workspace: queues the actual scan on the stream.
+      auto d_temp_storage = cms::cuda::make_device_unique<uint8_t[]>(temp_storage_bytes, stream);
+      cudaCheck(cub::DeviceScan::ExclusiveSum(d_temp_storage.get(),
+                                              temp_storage_bytes,
+                                              sst_data_d_->seedStripsNCMask,
+                                              sst_data_d_->prefixSeedStripsNCMask,
+                                              sst_data_d_->nStrips,
+                                              stream));
+    }
+#ifdef GPU_CHECK
+    cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+    setStripIndexGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get());
+    cudaCheck(cudaGetLastError());
+#ifdef GPU_CHECK
+    cudaCheck(cudaStreamSynchronize(stream));
+#endif
+
+#ifdef GPU_DEBUG
+    auto cpu_mask = cms::cuda::make_host_unique<int[]>(nStrips, stream);
+    auto cpu_prefix = cms::cuda::make_host_unique<int[]>(nStrips, stream);
+    auto cpu_index = cms::cuda::make_host_unique<int[]>(nStrips, stream);
+
+    cudaCheck(cudaMemcpyAsync(&(sst_data_d_->nSeedStripsNC),
+                              sst_data_d_->prefixSeedStripsNCMask + sst_data_d_->nStrips - 1,
+                              sizeof(int),
+                              cudaMemcpyDeviceToHost,
+                              stream));
+    cudaCheck(cudaMemcpyAsync(
+        cpu_mask.get(), sst_data_d_->seedStripsNCMask, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(
+        cpu_prefix.get(), sst_data_d_->prefixSeedStripsNCMask, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(
+        cpu_index.get(), sst_data_d_->seedStripsNCIndex, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaStreamSynchronize(stream));
+
+    std::cout << "nStrips=" << nStrips << " nSeedStripsNC=" << sst_data_d_->nSeedStripsNC << std::endl;
+    for (int i = 0; i < nStrips; i++) {
+      std::cout << " i " << i << " mask " << cpu_mask[i] << " prefix " << cpu_prefix[i] << " index " << cpu_index[i]
+                << std::endl;
+    }
+#endif
+  }
+}  // namespace stripgpu
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h
new file mode 100644
index 0000000000000..a9c88b340b623
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h
@@ -0,0 +1,88 @@
+#ifndef RecoLocalTracker_SiStripClusterizer_plugins_SiStripRawToClusterGPUKernel_h
+#define RecoLocalTracker_SiStripClusterizer_plugins_SiStripRawToClusterGPUKernel_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+
+#include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
+
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+class ChannelLocs;
+class ChannelLocsGPU;
+class FEDRawData;
+
+namespace sistrip {
+  class FEDBuffer;
+}
+namespace edm {
+  class ParameterSet;
+}
+
+namespace stripgpu {
+  struct StripDataView;
+
+  // Device-memory buffers (cms::cuda::device::unique_ptr) grouped in one object.
+  // NOTE(review): 'size' is presumably the element count of each buffer; confirm in the constructor definition.
+  class StripDataGPU {
+  public:
+    StripDataGPU(size_t size, cudaStream_t stream);
+
+    cms::cuda::device::unique_ptr<uint8_t[]> alldataGPU_;
+    cms::cuda::device::unique_ptr<uint16_t[]> channelGPU_;
+    cms::cuda::device::unique_ptr<stripgpu::stripId_t[]> stripIdGPU_;
+    cms::cuda::device::unique_ptr<int[]> seedStripsMask_;
+    cms::cuda::device::unique_ptr<int[]> prefixSeedStripsNCMask_;
+  };
+
+  // Host-side entry points of the SiStrip raw-to-cluster GPU code, together with the buffers and
+  // clusterizer settings they share. Every entry point takes the CUDA stream it should work on.
+  class SiStripRawToClusterGPUKernel {
+  public:
+    // explicit: a ParameterSet must not convert silently into a kernel object
+    explicit SiStripRawToClusterGPUKernel(const edm::ParameterSet& conf);
+    void makeAsync(const std::vector<const FEDRawData*>& rawdata,
+                   const std::vector<std::unique_ptr<sistrip::FEDBuffer>>& buffers,
+                   const SiStripClusterizerConditionsGPU& conditions,
+                   cudaStream_t stream);
+    void copyAsync(cudaStream_t stream);
+    // Hands the clusters back by value; SiStripClustersCUDADevice is move-only.
+    SiStripClustersCUDADevice getResults(cudaStream_t stream);
+
+  private:
+    using ConditionsDeviceView = SiStripClusterizerConditionsGPU::Data::DeviceView;
+
+    void reset();
+    void unpackChannelsGPU(const ConditionsDeviceView* conditions, cudaStream_t stream);
+    void allocateSSTDataGPU(int max_strips, cudaStream_t stream);
+    void freeSSTDataGPU(cudaStream_t stream);
+
+    void setSeedStripsNCIndexGPU(const ConditionsDeviceView* conditions, cudaStream_t stream);
+    void findClusterGPU(const ConditionsDeviceView* conditions, cudaStream_t stream);
+
+    std::vector<stripgpu::fedId_t> fedIndex_;
+    std::vector<size_t> fedRawDataOffsets_;
+
+    std::unique_ptr<StripDataGPU> stripdata_;
+    std::unique_ptr<ChannelLocsGPU> chanlocsGPU_;
+
+    // sst_data_d_ lives in host memory and is dereferenced by host code; pt_sst_data_d_ lives in
+    // device memory and is the pointer handed to the kernels.
+    // NOTE(review): presumably pt_sst_data_d_ is a device copy of *sst_data_d_ -- confirm in the .cu file.
+    cms::cuda::host::unique_ptr<StripDataView> sst_data_d_;
+    cms::cuda::device::unique_ptr<StripDataView> pt_sst_data_d_;
+
+    SiStripClustersCUDADevice clusters_d_;
+    float channelThreshold_, seedThreshold_, clusterThresholdSquared_;
+    uint8_t maxSequentialHoles_, maxSequentialBad_, maxAdjacentBad_;
+    uint32_t maxClusterSize_;
+    float minGoodCharge_;
+  };
+}  // namespace stripgpu
+#endif
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.h b/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.h
new file mode 100644
index 0000000000000..785b714376e2a
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.h
@@ -0,0 +1,39 @@
+#ifndef RecoLocalTracker_SiStripClusterizer_plugins_StripDataView_h
+#define RecoLocalTracker_SiStripClusterizer_plugins_StripDataView_h
+
+#include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
+#include "FWCore/Utilities/interface/HostDeviceConstant.h"
+
+#include <cstdint>
+
+class ChannelLocsView;
+
+namespace stripgpu {
+  // Cap that host code applies to nSeedStripsNC: std::min(kMaxSeedStrips, nSeedStripsNC).
+  HOST_DEVICE_CONSTANT auto kMaxSeedStrips = 200000;
+
+  // Flat bundle of raw pointers and scalar clusterizer settings; a pointer to a device-resident
+  // instance is what the clusterizer kernels receive. The struct itself owns nothing.
+  struct StripDataView {
+    const ChannelLocsView *chanlocs;
+    uint8_t *adc;
+    uint16_t *channel;
+    stripId_t *stripId;
+    // The debug dump copies nStrips ints from each of the NC mask, NC prefix and NC index arrays.
+    int *seedStripsNCIndex;
+    int *seedStripsMask;
+    int *seedStripsNCMask;
+    int *prefixSeedStripsNCMask;
+    int nSeedStripsNC;  // host code reads it back from prefixSeedStripsNCMask[nStrips - 1]
+    int nStrips;
+    float channelThreshold;
+    float seedThreshold;
+    float clusterThresholdSquared;
+    uint8_t maxSequentialHoles;
+    uint8_t maxSequentialBad;
+    uint8_t maxAdjacentBad;
+    float minGoodCharge;
+    int clusterSizeLimit;
+  };
+}  // namespace stripgpu
+#endif
diff --git a/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py b/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py
index 2f0a141beb688..b187755f84727 100644
--- a/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py
+++ b/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py
@@ -11,6 +11,7 @@
     MaxSequentialHoles = cms.uint32(0),
     MaxSequentialBad = cms.uint32(1),
     MaxAdjacentBad = cms.uint32(0),
+    MaxClusterSize = cms.uint32(768),
     RemoveApvShots     = cms.bool(True),
     clusterChargeCut = cms.PSet(refToPSet_ = cms.string('SiStripClusterChargeCutNone')),
     ConditionsLabel = cms.string("")
diff --git a/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py b/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py
index 602b63215018a..0e912f7ba5660 100644
--- a/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py
+++ b/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py
@@ -1,16 +1,56 @@
 import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
 
 # from RecoLocalTracker.SiStripClusterizer.SiStripClusterizer_RealData_cfi import *
 
 from RecoLocalTracker.SiStripClusterizer.DefaultClusterizer_cff import *
 from RecoLocalTracker.SiStripZeroSuppression.DefaultAlgorithms_cff import *
-siStripClusters = cms.EDProducer("SiStripClusterizerFromRaw",
-                                                onDemand = cms.bool(True),
-                                                Clusterizer = DefaultClusterizer,
-                                                Algorithms = DefaultAlgorithms,
-                                                DoAPVEmulatorCheck = cms.bool(False),
-                                                HybridZeroSuppressed = cms.bool(False),
-                                                ProductLabel = cms.InputTag('rawDataCollector')
-                                                )
 
+from RecoLocalTracker.SiStripClusterizer.siStripClusterizerFromRawGPU_cfi import siStripClusterizerFromRawGPU
+from RecoLocalTracker.SiStripClusterizer.siStripClustersSOAtoHost_cfi import siStripClustersSOAtoHost
+from RecoLocalTracker.SiStripClusterizer.siStripClustersFromSOA_cfi import siStripClustersFromSOA
+from RecoLocalTracker.SiStripClusterizer.SiStripClusterizerConditionsGPUESProducer_cfi import SiStripClusterizerConditionsGPUESProducer
 
+_siStripClusterizerFromRaw = cms.EDProducer("SiStripClusterizerFromRaw",  # CPU clusterizer, used as the 'cpu' case below
+                                            onDemand = cms.bool(True),
+                                            Clusterizer = DefaultClusterizer,
+                                            Algorithms = DefaultAlgorithms,
+                                            DoAPVEmulatorCheck = cms.bool(False),
+                                            HybridZeroSuppressed = cms.bool(False),
+                                            ProductLabel = cms.InputTag('rawDataCollector'))
+
+_siStripClusterizerFromRaw.Clusterizer.MaxClusterSize = cms.uint32(16)  # NOTE(review): Clusterizer was given DefaultClusterizer itself, not a clone -- confirm whether this also changes the shared PSet
+
+siStripClusterizerFromRawGPU.Clusterizer = DefaultClusterizer  # GPU producer receives the same DefaultClusterizer object
+
+siStripClusters = SwitchProducerCUDA(  # only the cpu case here; the cuda case is added through the gpu modifier below
+    cpu = _siStripClusterizerFromRaw.clone(),
+)
+
+siStripClustersTask = cms.Task(  # default content: just the switch producer
+    siStripClusters,
+)
+
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+gpu.toModify(siStripClusters,  # with the gpu modifier, add the cuda case
+    cuda = siStripClustersFromSOA,
+)
+
+siStripClustersTaskCUDA = cms.Task()  # starts empty; replaced just below when the gpu modifier applies
+
+gpu.toReplaceWith(siStripClustersTaskCUDA, cms.Task(
+    # conditions used *only* by the modules running on GPU
+    SiStripClusterizerConditionsGPUESProducer,
+    # reconstruct the strip clusters on the gpu
+    siStripClusterizerFromRawGPU,
+    # copy clusters from GPU to pinned host memory
+    siStripClustersSOAtoHost,
+))
+
+gpu.toReplaceWith(siStripClustersTask, cms.Task(  # with the gpu modifier, the task also pulls in the CUDA task
+    # CUDA specific
+    siStripClustersTaskCUDA,
+    # switch producer to legacy format
+    siStripClusters,
+))
diff --git a/RecoLocalTracker/SiStripClusterizer/python/customizeStripClustersFromRaw.py b/RecoLocalTracker/SiStripClusterizer/python/customizeStripClustersFromRaw.py
new file mode 100644
index 0000000000000..db7d02855713f
--- /dev/null
+++ b/RecoLocalTracker/SiStripClusterizer/python/customizeStripClustersFromRaw.py
@@ -0,0 +1,24 @@
+import FWCore.ParameterSet.Config as cms
+
+# replace the standard SiStripClusterizer with the switch producer
+# meant primarily for testing
+def customizeStripClustersFromRaw(process):
+    if hasattr(process, 'striptrackerlocalrecoTask'):
+        process.striptrackerlocalrecoTask.remove(process.siStripClusters)
+        process.load("RecoLocalTracker.SiStripClusterizer.SiStripClusterizerOnDemand_cfi")
+        # CPU should emulate the full detector clusterizer
+        process.siStripClusters.cpu.onDemand = cms.bool(False)
+        process.striptrackerlocalrecoTask.add(process.siStripClustersTask)
+
+    return process
+
+def customizeHLTStripClustersFromRaw(process):
+    if hasattr(process, 'hltSiStripRawToClustersFacility'):
+        import RecoLocalTracker.SiStripClusterizer.SiStripClusterizerOnDemand_cfi as SiStripClusterizerOnDemand_cfi
+
+        process.load("RecoLocalTracker.SiStripClusterizer.SiStripClusterizerOnDemand_cfi")
+        process.hltSiStripRawToClustersFacility = SiStripClusterizerOnDemand_cfi.siStripClusters.clone()
+        process.HLTDoLocalStripSequence.replace(process.hltSiStripRawToClustersFacility,
+                                   cms.Sequence(process.hltSiStripRawToClustersFacility, process.siStripClustersTaskCUDA))
+
+    return process
diff --git a/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc b/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc
index b8e38e97d4379..034283a910054 100644
--- a/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc
+++ b/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc
@@ -1,6 +1,7 @@
 #include "RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithmFactory.h"
 
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "RecoLocalTracker/SiStripClusterizer/interface/StripClusterizerAlgorithm.h"
 #include "RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h"
 #include "RecoLocalTracker/SiStripClusterizer/interface/ClusterChargeCut.h"
@@ -10,17 +11,19 @@ std::unique_ptr<StripClusterizerAlgorithm> StripClusterizerAlgorithmFactory::cre
   std::string algorithm = conf.getParameter<std::string>("Algorithm");
 
   if (algorithm == "ThreeThresholdAlgorithm") {
-    return std::unique_ptr<StripClusterizerAlgorithm>(
-        new ThreeThresholdAlgorithm(iC.esConsumes<SiStripClusterizerConditions, SiStripClusterizerConditionsRcd>(
-                                        edm::ESInputTag{"", conf.getParameter<std::string>("ConditionsLabel")}),
-                                    conf.getParameter<double>("ChannelThreshold"),
-                                    conf.getParameter<double>("SeedThreshold"),
-                                    conf.getParameter<double>("ClusterThreshold"),
-                                    conf.getParameter<unsigned>("MaxSequentialHoles"),
-                                    conf.getParameter<unsigned>("MaxSequentialBad"),
-                                    conf.getParameter<unsigned>("MaxAdjacentBad"),
-                                    conf.getParameter<bool>("RemoveApvShots"),
-                                    clusterChargeCut(conf)));
+    return std::unique_ptr<StripClusterizerAlgorithm>(new ThreeThresholdAlgorithm(
+        iC.esConsumes<SiStripClusterizerConditions, SiStripClusterizerConditionsRcd>(
+            edm::ESInputTag{"", conf.getParameter<std::string>("ConditionsLabel")}),
+        conf.getParameter<double>("ChannelThreshold"),
+        conf.getParameter<double>("SeedThreshold"),
+        conf.getParameter<double>("ClusterThreshold"),
+        conf.getParameter<unsigned>("MaxSequentialHoles"),
+        conf.getParameter<unsigned>("MaxSequentialBad"),
+        conf.getParameter<unsigned>("MaxAdjacentBad"),
+        // existsAs test should be removed once MaxClusterSize is in the HLT config
+        conf.existsAs<unsigned>("MaxClusterSize") ? conf.getParameter<unsigned>("MaxClusterSize") : 3U * 256U,
+        conf.getParameter<bool>("RemoveApvShots"),
+        clusterChargeCut(conf)));
   }
 
   if (algorithm == "OldThreeThresholdAlgorithm") {
@@ -30,3 +33,20 @@ std::unique_ptr<StripClusterizerAlgorithm> StripClusterizerAlgorithmFactory::cre
   throw cms::Exception("[StripClusterizerAlgorithmFactory] Unregistered Algorithm")
       << algorithm << " is not a registered StripClusterizerAlgorithm";
 }
+
+// Registers the clusterizer parameters and their default values in 'clusterizer'.
+// MaxClusterSize stays optional, mirroring the existsAs() fallback to 3 * 256 in create().
+void StripClusterizerAlgorithmFactory::fillDescriptions(edm::ParameterSetDescription& clusterizer) {
+  clusterizer.add<std::string>("Algorithm", "ThreeThresholdAlgorithm");
+  clusterizer.add<std::string>("ConditionsLabel", "");
+  clusterizer.add<double>("ChannelThreshold", 2.0);
+  clusterizer.add<double>("SeedThreshold", 3.0);
+  clusterizer.add<double>("ClusterThreshold", 5.0);
+  clusterizer.add<unsigned int>("MaxSequentialHoles", 0U);
+  clusterizer.add<unsigned int>("MaxSequentialBad", 1U);
+  clusterizer.add<unsigned int>("MaxAdjacentBad", 0U);
+  clusterizer.addOptional<unsigned int>("MaxClusterSize", 3U * 256U);  // eventually should be add()
+  clusterizer.add<bool>("RemoveApvShots", true);
+  clusterizer.add<bool>("setDetId", true);
+  clusterizer.add("clusterChargeCut", getConfigurationDescription4CCC(CCC::kNone));
+}
diff --git a/RecoLocalTracker/SiStripClusterizer/src/ThreeThresholdAlgorithm.cc b/RecoLocalTracker/SiStripClusterizer/src/ThreeThresholdAlgorithm.cc
index 7bc7fc495830e..c2cf5b8962896 100644
--- a/RecoLocalTracker/SiStripClusterizer/src/ThreeThresholdAlgorithm.cc
+++ b/RecoLocalTracker/SiStripClusterizer/src/ThreeThresholdAlgorithm.cc
@@ -15,6 +15,7 @@ ThreeThresholdAlgorithm::ThreeThresholdAlgorithm(
     unsigned holes,
     unsigned bad,
     unsigned adj,
+    unsigned maxClusterSize,
     bool removeApvShots,
     float minGoodCharge)
     : StripClusterizerAlgorithm(conditionsToken),
@@ -24,6 +25,7 @@ ThreeThresholdAlgorithm::ThreeThresholdAlgorithm(
       MaxSequentialHoles(holes),
       MaxSequentialBad(bad),
       MaxAdjacentBad(adj),
+      MaxClusterSize(maxClusterSize),
       RemoveApvShots(removeApvShots),
       minGoodCharge(minGoodCharge) {}
 
@@ -80,7 +82,8 @@ inline void ThreeThresholdAlgorithm::addToCandidate(State& state, uint16_t strip
   while (++state.lastStrip < strip)
     state.ADCs.push_back(0);  // pad holes
 
-  state.ADCs.push_back(adc);
+  if (state.ADCs.size() <= MaxClusterSize)
+    state.ADCs.push_back(adc);
   state.noiseSquared += Noise * Noise;
 }
 
@@ -98,7 +101,7 @@ inline void ThreeThresholdAlgorithm::endCandidate(State& state, T& out) const {
 }
 
 inline bool ThreeThresholdAlgorithm::candidateAccepted(State const& state) const {
-  return (!state.candidateLacksSeed &&
+  return (!state.candidateLacksSeed && state.ADCs.size() <= MaxClusterSize &&
           state.noiseSquared * ClusterThresholdSquared <=
               std::pow(float(std::accumulate(state.ADCs.begin(), state.ADCs.end(), int(0))), 2.f));
 }
diff --git a/RecoLocalTracker/SiStripClusterizer/test/BuildFile.xml b/RecoLocalTracker/SiStripClusterizer/test/BuildFile.xml
index b48376dc570b8..6486482ae8f01 100644
--- a/RecoLocalTracker/SiStripClusterizer/test/BuildFile.xml
+++ b/RecoLocalTracker/SiStripClusterizer/test/BuildFile.xml
@@ -4,5 +4,6 @@
   <use name="RecoLocalTracker/SiStripClusterizer"/>
   <use name="SimTracker/TrackerHitAssociation"/>
   <use name="CalibTracker/SiStripCommon"/>
+  <use name="root"/>
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoLocalTracker/SiStripZeroSuppression/interface/SiStripAPVRestorer.h b/RecoLocalTracker/SiStripZeroSuppression/interface/SiStripAPVRestorer.h
index b9281f0d78884..fe370cfa167fb 100644
--- a/RecoLocalTracker/SiStripZeroSuppression/interface/SiStripAPVRestorer.h
+++ b/RecoLocalTracker/SiStripZeroSuppression/interface/SiStripAPVRestorer.h
@@ -22,6 +22,11 @@
 #include <vector>
 #include <cstdint>
 
+namespace edm {
+  class ParameterSet;
+  class ParameterSetDescription;
+}  // namespace edm
+
 class SiStripAPVRestorer {
   friend class SiStripRawProcessingFactory;
 
@@ -82,6 +87,8 @@ class SiStripAPVRestorer {
   void createCMMapRealPed(const edm::DetSetVector<SiStripRawDigi>& input);
   void createCMMapCMstored(const edm::DetSetVector<SiStripProcessedRawDigi>& input);
 
+  static void fillDescriptions(edm::ParameterSetDescription& desc);
+
 private:  // members
   edm::EDGetTokenT<edm::DetSetVector<SiStripRawDigi>> siStripRawDigiToken_;
   edm::EDGetTokenT<edm::DetSetVector<SiStripProcessedRawDigi>> siStripProcessedRawDigiToken_;
diff --git a/RecoLocalTracker/SiStripZeroSuppression/interface/SiStripRawProcessingFactory.h b/RecoLocalTracker/SiStripZeroSuppression/interface/SiStripRawProcessingFactory.h
index abe31a61f0eb9..cad1c417d7a53 100644
--- a/RecoLocalTracker/SiStripZeroSuppression/interface/SiStripRawProcessingFactory.h
+++ b/RecoLocalTracker/SiStripZeroSuppression/interface/SiStripRawProcessingFactory.h
@@ -3,6 +3,7 @@
 
 namespace edm {
   class ParameterSet;
+  class ParameterSetDescription;
   class ConsumesCollector;
 }  // namespace edm
 class SiStripRawProcessingAlgorithms;
@@ -22,5 +23,6 @@ class SiStripRawProcessingFactory {
   static std::unique_ptr<SiStripCommonModeNoiseSubtractor> create_SubtractorCMN(const edm::ParameterSet&,
                                                                                 edm::ConsumesCollector);
   static std::unique_ptr<SiStripAPVRestorer> create_Restorer(const edm::ParameterSet&, edm::ConsumesCollector);
+  static void fillDescriptions(edm::ParameterSetDescription& algorithms);
 };
 #endif
diff --git a/RecoLocalTracker/SiStripZeroSuppression/src/SiStripAPVRestorer.cc b/RecoLocalTracker/SiStripZeroSuppression/src/SiStripAPVRestorer.cc
index e7fb9e0c24978..1ddc724552edf 100644
--- a/RecoLocalTracker/SiStripZeroSuppression/src/SiStripAPVRestorer.cc
+++ b/RecoLocalTracker/SiStripZeroSuppression/src/SiStripAPVRestorer.cc
@@ -1,5 +1,6 @@
 #include "RecoLocalTracker/SiStripZeroSuppression/interface/SiStripAPVRestorer.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/Framework/interface/ConsumesCollector.h"
 
@@ -1236,3 +1237,35 @@ void SiStripAPVRestorer::derivativeFollowerRestore(uint16_t apvN, uint16_t first
     }
   }
 }
+
+// Appends the SiStripAPVRestorer parameters, with their default values, to 'desc'.
+// Value types are spelled out so that each parameter's type is visible at the call site.
+void SiStripAPVRestorer::fillDescriptions(edm::ParameterSetDescription& desc) {
+  desc.add<bool>("ForceNoRestore", false);
+  desc.add<std::string>("APVInspectMode", "BaselineFollower");
+  desc.add<std::string>("APVRestoreMode", "");
+  desc.add<bool>("useRealMeanCM", false);
+  desc.add<int>("MeanCM", 0);
+  desc.add<unsigned int>("DeltaCMThreshold", 20U);
+  desc.add<double>("Fraction", 0.2);
+  desc.add<unsigned int>("Deviation", 25U);
+  desc.add<double>("restoreThreshold", 0.5);
+  desc.add<unsigned int>("nSaturatedStrip", 2U);
+  desc.add<unsigned int>("nSigmaNoiseDerTh", 4U);
+  desc.add<unsigned int>("consecThreshold", 5U);
+  desc.add<unsigned int>("nSmooth", 9U);
+  desc.add<unsigned int>("distortionThreshold", 20U);
+  desc.add<bool>("ApplyBaselineCleaner", true);
+  desc.add<unsigned int>("CleaningSequence", 1U);
+  desc.add<int>("slopeX", 3);
+  desc.add<int>("slopeY", 4);
+  desc.add<unsigned int>("hitStripThreshold", 40U);
+  desc.add<unsigned int>("minStripsToFit", 4U);
+  desc.add<bool>("ApplyBaselineRejection", true);
+  desc.add<double>("filteredBaselineMax", 6.0);
+  desc.add<double>("filteredBaselineDerivativeSumSquare", 30.0);
+  desc.add<int>("discontinuityThreshold", 12);
+  desc.add<int>("lastGradient", 10);
+  desc.add<int>("sizeWindow", 1);
+  desc.add<int>("widthCluster", 64);
+}
diff --git a/RecoLocalTracker/SiStripZeroSuppression/src/SiStripRawProcessingFactory.cc b/RecoLocalTracker/SiStripZeroSuppression/src/SiStripRawProcessingFactory.cc
index 0a09812b9c932..c0f2fa38e4402 100644
--- a/RecoLocalTracker/SiStripZeroSuppression/src/SiStripRawProcessingFactory.cc
+++ b/RecoLocalTracker/SiStripZeroSuppression/src/SiStripRawProcessingFactory.cc
@@ -4,6 +4,7 @@
 #include "FWCore/Framework/interface/ConsumesCollector.h"
 
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "RecoLocalTracker/SiStripZeroSuppression/interface/SiStripRawProcessingAlgorithms.h"
 #include "RecoLocalTracker/SiStripZeroSuppression/interface/SiStripPedestalsSubtractor.h"
 #include "RecoLocalTracker/SiStripZeroSuppression/interface/SiStripFedZeroSuppression.h"
@@ -87,3 +88,19 @@ std::unique_ptr<SiStripAPVRestorer> SiStripRawProcessingFactory::create_Restorer
     return std::unique_ptr<SiStripAPVRestorer>(new SiStripAPVRestorer(conf, iC));
   }
 }
+
+// Registers the raw-processing algorithm parameters and their defaults in 'algorithms', then lets
+// SiStripAPVRestorer::fillDescriptions append the restorer parameters to the same description.
+void SiStripRawProcessingFactory::fillDescriptions(edm::ParameterSetDescription& algorithms) {
+  algorithms.add<std::string>("CommonModeNoiseSubtractionMode", "Median");
+  algorithms.add<bool>("useCMMeanMap", false);
+  algorithms.add<bool>("TruncateInSuppressor", true);
+  algorithms.add<bool>("doAPVRestore", false);
+  algorithms.add<unsigned int>("SiStripFedZeroSuppressionMode", 4U);
+  algorithms.add<bool>("PedestalSubtractionFedMode", true);
+  algorithms.add<bool>("Use10bitsTruncation", false);
+  algorithms.add<double>("Percentile", 25.0);
+  algorithms.add<double>("CutToAvoidSignal", 2.0);
+  algorithms.add<int>("Iterations", 3);
+  SiStripAPVRestorer::fillDescriptions(algorithms);
+}