diff --git a/CUDADataFormats/EcalDigi/BuildFile.xml b/CUDADataFormats/EcalDigi/BuildFile.xml
new file mode 100644
index 0000000000000..b7d25b0872646
--- /dev/null
+++ b/CUDADataFormats/EcalDigi/BuildFile.xml
@@ -0,0 +1,8 @@
+<use name="CUDADataFormats/Common"/>
+<use name="CUDADataFormats/CaloCommon"/>
+<use name="DataFormats/Common"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
+<export>
+  <lib   name="1"/>
+</export>
diff --git a/CUDADataFormats/EcalDigi/interface/DigisCollection.h b/CUDADataFormats/EcalDigi/interface/DigisCollection.h
new file mode 100644
index 0000000000000..f471dbfb9fac8
--- /dev/null
+++ b/CUDADataFormats/EcalDigi/interface/DigisCollection.h
@@ -0,0 +1,24 @@
+#ifndef CUDADataFormats_EcalDigi_interface_DigisCollection_h
+#define CUDADataFormats_EcalDigi_interface_DigisCollection_h
+
+#include "CUDADataFormats/CaloCommon/interface/Common.h"
+
+namespace ecal {
+
+  template <typename StoragePolicy>
+  struct DigisCollection : public ::calo::common::AddSize<typename StoragePolicy::TagType> {
+    DigisCollection() = default;  // the concrete storage of ids/data is selected by StoragePolicy
+    DigisCollection(DigisCollection const &) = default;
+    DigisCollection &operator=(DigisCollection const &) = default;
+
+    DigisCollection(DigisCollection &&) = default;
+    DigisCollection &operator=(DigisCollection &&) = default;
+
+    // one id per digi and the flattened 16-bit words of all digis; the per-digi stride of `data` is statically known
+    typename StoragePolicy::template StorageSelector<uint32_t>::type ids;
+    typename StoragePolicy::template StorageSelector<uint16_t>::type data;
+  };
+
+}  // namespace ecal
+
+#endif  // CUDADataFormats_EcalDigi_interface_DigisCollection_h
diff --git a/CUDADataFormats/EcalDigi/src/classes.h b/CUDADataFormats/EcalDigi/src/classes.h
new file mode 100644
index 0000000000000..cd60b775e229b
--- /dev/null
+++ b/CUDADataFormats/EcalDigi/src/classes.h
@@ -0,0 +1,3 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
diff --git a/CUDADataFormats/EcalDigi/src/classes_def.xml b/CUDADataFormats/EcalDigi/src/classes_def.xml
new file mode 100644
index 0000000000000..6a3adfe4b41c5
--- /dev/null
+++ b/CUDADataFormats/EcalDigi/src/classes_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+    <class name="cms::cuda::Product<ecal::DigisCollection<calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<ecal::DigisCollection<calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<ecal::DigisCollection<calo::common::ViewStoragePolicy>>>" persistent="false"/>
+    <class name="edm::Wrapper<cms::cuda::Product<ecal::DigisCollection<calo::common::DevStoragePolicy>>>" persistent="false"/>
+</lcgdict>
diff --git a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
new file mode 100644
index 0000000000000..a684d9a23f1c6
--- /dev/null
+++ b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
@@ -0,0 +1,10 @@
+<use name="cuda"/>
+<use name="CUDADataFormats/Common"/>
+<use name="CUDADataFormats/CaloCommon"/>
+<use name="DataFormats/Common"/>
+<use name="DataFormats/EcalDigi"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
+<export>
+  <lib   name="1"/>
+</export>
diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h
new file mode 100644
index 0000000000000..731b8b801407f
--- /dev/null
+++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h
@@ -0,0 +1,45 @@
+#ifndef CUDADataFormats_EcalRecHitSoA_interface_EcalRecHit_h
+#define CUDADataFormats_EcalRecHitSoA_interface_EcalRecHit_h
+
+#include <array>
+#include <vector>
+
+#include "CUDADataFormats/CaloCommon/interface/Common.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
+namespace ecal {
+
+  template <typename StoragePolicy>
+  struct RecHit : public ::calo::common::AddSize<typename StoragePolicy::TagType> {
+    RecHit() = default;  // the concrete storage of every field is selected by StoragePolicy
+    RecHit(const RecHit&) = default;
+    RecHit& operator=(const RecHit&) = default;
+
+    RecHit(RecHit&&) = default;
+    RecHit& operator=(RecHit&&) = default;
+
+    typename StoragePolicy::template StorageSelector<reco::StorageScalarType>::type energy;
+    typename StoragePolicy::template StorageSelector<reco::StorageScalarType>::type time;
+    // TODO: chi2 is also packed into "extra" below; consider dropping this separate copy
+    typename StoragePolicy::template StorageSelector<reco::StorageScalarType>::type chi2;
+    typename StoragePolicy::template StorageSelector<uint32_t>::type
+        extra;  // packed uint32_t for timeError, chi2, energyError
+    typename StoragePolicy::template StorageSelector<uint32_t>::type
+        flagBits;  // store rechit condition (see Flags enum) in a bit-wise way
+    typename StoragePolicy::template StorageSelector<uint32_t>::type did;
+
+    template <typename U = typename StoragePolicy::TagType>
+    typename std::enable_if<std::is_same<U, ::calo::common::tags::Vec>::value, void>::type resize(size_t size) {
+      energy.resize(size);  // resize() only exists when the policy tag is tags::Vec; all fields get `size` entries
+      time.resize(size);
+      chi2.resize(size);
+      extra.resize(size);
+      flagBits.resize(size);
+      did.resize(size);
+    }
+  };
+
+}  // namespace ecal
+
+#endif  // CUDADataFormats_EcalRecHitSoA_interface_EcalRecHit_h
diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h
new file mode 100644
index 0000000000000..78c909b029dc1
--- /dev/null
+++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h
@@ -0,0 +1,46 @@
+#ifndef CUDADataFormats_EcalRecHitSoA_interface_EcalUncalibratedRecHit_h
+#define CUDADataFormats_EcalRecHitSoA_interface_EcalUncalibratedRecHit_h
+
+#include <array>
+#include <vector>
+
+#include "CUDADataFormats/CaloCommon/interface/Common.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+
+namespace ecal {
+
+  template <typename StoragePolicy>
+  struct UncalibratedRecHit : public ::calo::common::AddSize<typename StoragePolicy::TagType> {
+    UncalibratedRecHit() = default;  // the concrete storage of every field is selected by StoragePolicy
+    UncalibratedRecHit(const UncalibratedRecHit&) = default;
+    UncalibratedRecHit& operator=(const UncalibratedRecHit&) = default;
+
+    UncalibratedRecHit(UncalibratedRecHit&&) = default;
+    UncalibratedRecHit& operator=(UncalibratedRecHit&&) = default;
+
+    typename StoragePolicy::template StorageSelector<reco::ComputationScalarType>::type amplitudesAll;
+    typename StoragePolicy::template StorageSelector<reco::StorageScalarType>::type amplitude;
+    typename StoragePolicy::template StorageSelector<reco::StorageScalarType>::type chi2;
+    typename StoragePolicy::template StorageSelector<reco::StorageScalarType>::type pedestal;
+    typename StoragePolicy::template StorageSelector<reco::StorageScalarType>::type jitter;
+    typename StoragePolicy::template StorageSelector<reco::StorageScalarType>::type jitterError;
+    typename StoragePolicy::template StorageSelector<uint32_t>::type did;
+    typename StoragePolicy::template StorageSelector<uint32_t>::type flags;
+
+    template <typename U = typename StoragePolicy::TagType>
+    typename std::enable_if<std::is_same<U, ::calo::common::tags::Vec>::value, void>::type resize(size_t size) {
+      amplitudesAll.resize(size * EcalDataFrame::MAXSAMPLES);  // MAXSAMPLES entries per hit, flattened
+      amplitude.resize(size);                                  // every other field holds one entry per hit
+      pedestal.resize(size);
+      chi2.resize(size);
+      did.resize(size);
+      flags.resize(size);
+      jitter.resize(size);
+      jitterError.resize(size);
+    }
+  };
+
+}  // namespace ecal
+
+#endif  // CUDADataFormats_EcalRecHitSoA_interface_EcalUncalibratedRecHit_h
diff --git a/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h b/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h
new file mode 100644
index 0000000000000..87c4252a5e949
--- /dev/null
+++ b/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h
@@ -0,0 +1,13 @@
+#ifndef CUDADataFormats_EcalRecHitSoA_interface_RecoTypes_h
+#define CUDADataFormats_EcalRecHitSoA_interface_RecoTypes_h
+
+namespace ecal {
+  namespace reco {
+
+    using ComputationScalarType = float;  // element type of UncalibratedRecHit::amplitudesAll
+    using StorageScalarType = float;      // element type of the other floating-point rechit fields
+
+  }  // namespace reco
+}  // namespace ecal
+
+#endif  // CUDADataFormats_EcalRecHitSoA_interface_RecoTypes_h
diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes.h b/CUDADataFormats/EcalRecHitSoA/src/classes.h
new file mode 100644
index 0000000000000..ef95da461e3ba
--- /dev/null
+++ b/CUDADataFormats/EcalRecHitSoA/src/classes.h
@@ -0,0 +1,4 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/Common/interface/Wrapper.h"
diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
new file mode 100644
index 0000000000000..27e315b2c2822
--- /dev/null
+++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
@@ -0,0 +1,20 @@
+<lcgdict>
+    <class name="cms::cuda::Product<ecal::UncalibratedRecHit<calo::common::ViewStoragePolicy>>" persistent="false"/>
+    <class name="cms::cuda::Product<ecal::UncalibratedRecHit<calo::common::DevStoragePolicy>>" persistent="false"/>
+    <class name="edm::Wrapper<cms::cuda::Product<ecal::UncalibratedRecHit<calo::common::ViewStoragePolicy>>>" persistent="false"/>
+    <class name="edm::Wrapper<cms::cuda::Product<ecal::UncalibratedRecHit<calo::common::DevStoragePolicy>>>" persistent="false"/>
+                
+    <class name="cms::cuda::Product<ecal::RecHit<calo::common::ViewStoragePolicy>>" persistent="false"/>
+    <class name="cms::cuda::Product<ecal::RecHit<calo::common::DevStoragePolicy>>" persistent="false"/>
+    <class name="edm::Wrapper<cms::cuda::Product<ecal::RecHit<calo::common::ViewStoragePolicy>>>" persistent="false"/>
+    <class name="edm::Wrapper<cms::cuda::Product<ecal::RecHit<calo::common::DevStoragePolicy>>>" persistent="false"/>
+        
+    <class name="ecal::UncalibratedRecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" persistent="false"/>
+    <class name="edm::Wrapper<ecal::UncalibratedRecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" persistent="false"/>
+    <class name="ecal::RecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" persistent="false"/>
+    <class name="edm::Wrapper<ecal::RecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" persistent="false"/>
+    <class name="ecal::UncalibratedRecHit<calo::common::VecStoragePolicy<std::allocator>>" persistent="false"/>
+    <class name="edm::Wrapper<ecal::UncalibratedRecHit<calo::common::VecStoragePolicy<std::allocator>>>" persistent="false"/>
+    <class name="ecal::RecHit<calo::common::VecStoragePolicy<std::allocator>>" persistent="false"/>
+    <class name="edm::Wrapper<ecal::RecHit<calo::common::VecStoragePolicy<std::allocator>>>" persistent="false"/>
+</lcgdict>
diff --git a/Configuration/StandardSequences/python/RawToDigi_cff.py b/Configuration/StandardSequences/python/RawToDigi_cff.py
index dd3bf675faf0d..102e8b1132f71 100644
--- a/Configuration/StandardSequences/python/RawToDigi_cff.py
+++ b/Configuration/StandardSequences/python/RawToDigi_cff.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from Configuration.ProcessModifiers.gpu_cff import gpu
 
 # This object is used to selectively make changes for different running
 # scenarios. In this case it makes changes for Run 2.
@@ -74,7 +75,7 @@
 
 scalersRawToDigi.scalersInputTag = 'rawDataCollector'
 siPixelDigis.cpu.InputLabel = 'rawDataCollector'
-ecalDigis.InputLabel = 'rawDataCollector'
+(~gpu).toModify(ecalDigis, InputLabel='rawDataCollector')
 ecalPreshowerDigis.sourceTag = 'rawDataCollector'
 hcalDigis.InputLabel = 'rawDataCollector'
 muonCSCDigis.InputObjects = 'rawDataCollector'
diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml
index 88f864737813e..2ac1b25233910 100644
--- a/EventFilter/EcalRawToDigi/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/BuildFile.xml
@@ -1,14 +1,18 @@
-<use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
 <use name="boost"/>
+<use name="cuda"/>
+<use name="CUDADataFormats/EcalDigi" />
 <use name="CondFormats/EcalObjects"/>
 <use name="DataFormats/EcalDetId"/>
 <use name="DataFormats/EcalDigi"/>
 <use name="DataFormats/EcalRawData"/>
 <use name="DataFormats/EcalRecHit"/>
 <use name="DataFormats/FEDRawData"/>
+<use name="FWCore/Framework"/>
 <use name="FWCore/MessageLogger"/>
+<use name="FWCore/ParameterSet"/>
 <use name="Geometry/EcalMapping"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="Utilities/StorageFactory"/>
 <export>
   <lib name="1"/>
diff --git a/EventFilter/EcalRawToDigi/bin/BuildFile.xml b/EventFilter/EcalRawToDigi/bin/BuildFile.xml
new file mode 100644
index 0000000000000..792fe438d8799
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/bin/BuildFile.xml
@@ -0,0 +1,7 @@
+<bin name="makeEcalRaw2DigiGpuValidationPlots" file="makeEcalRaw2DigiGpuValidationPlots.cpp">
+    <use name="root"/>
+    <use name="rootgraphics"/>
+    <use name="DataFormats/Common"/>
+    <use name="DataFormats/EcalDigi"/>
+    <use name="DataFormats/EcalDetId"/>
+</bin>
diff --git a/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp
new file mode 100644
index 0000000000000..609c277e19288
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp
@@ -0,0 +1,194 @@
+#include <algorithm>
+#include <cstdint>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <TCanvas.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TTree.h>
+#include <TPaveStats.h>
+
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+
+namespace {
+
+  // number of time samples stored for each digi
+  constexpr uint32_t kSamplesPerDigi = 10;
+
+  // Compare the GPU digis of one event with the CPU ones.
+  // For each GPU digi, the CPU digi with the same id is searched for; the ADC counts and gain ids
+  // of all their samples are then filled into the 1D (GPU, CPU) and 2D (GPU vs CPU) histograms.
+  // A GPU digi without a CPU counterpart is reported on stderr and skipped.
+  template <typename Collection>
+  void fillDigiHistograms(Collection const& gpuDigis,
+                          Collection const& cpuDigis,
+                          TH1D* hADCGPU,
+                          TH1D* hADCCPU,
+                          TH1D* hGainGPU,
+                          TH1D* hGainCPU,
+                          TH2D* hADCGPUvsCPU,
+                          TH2D* hGainGPUvsCPU) {
+    auto const& idsGPU = gpuDigis.ids();
+    auto const& dataGPU = gpuDigis.data();
+    auto const& idsCPU = cpuDigis.ids();
+    auto const& dataCPU = cpuDigis.data();
+
+    for (uint32_t igpu = 0; igpu < gpuDigis.size(); ++igpu) {
+      auto const& idgpu = idsGPU[igpu];
+      auto const iter2idcpu = std::find(idsCPU.begin(), idsCPU.end(), idgpu);
+      // never dereference end(): without a CPU counterpart there is nothing to compare to
+      if (iter2idcpu == idsCPU.end()) {
+        std::cerr << "*** digi with id " << idgpu << " is present on GPU but missing on CPU" << std::endl;
+        continue;
+      }
+
+      auto const icpu = iter2idcpu - idsCPU.begin();
+      for (uint32_t s = 0u; s < kSamplesPerDigi; s++) {
+        EcalMGPASample sampleGPU{dataGPU[igpu * kSamplesPerDigi + s]};
+        EcalMGPASample sampleCPU{dataCPU[icpu * kSamplesPerDigi + s]};
+
+        hADCGPU->Fill(sampleGPU.adc());
+        hGainGPU->Fill(sampleGPU.gainId());
+        hADCCPU->Fill(sampleCPU.adc());
+        hGainCPU->Fill(sampleCPU.gainId());
+        hADCGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc());
+        hGainGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId());
+      }
+    }
+  }
+
+  // Draw the CPU (black) and GPU (blue) histograms overlaid on the current pad, in log scale,
+  // and shift the GPU statistics box down by its own height.
+  void drawCPUvsGPU(TH1D* hCPU, TH1D* hGPU) {
+    gPad->SetLogy();
+    hCPU->SetLineColor(kBlack);
+    hCPU->SetLineWidth(1.);
+    hCPU->Draw("");
+    hGPU->SetLineColor(kBlue);
+    hGPU->SetLineWidth(1.);
+    hGPU->Draw("sames");
+    gPad->Update();
+    // nothing to move if no statistics box is attached to the histogram
+    auto stats = dynamic_cast<TPaveStats*>(hGPU->FindObject("stats"));
+    if (stats == nullptr)
+      return;
+    auto const y2 = stats->GetY2NDC();
+    auto const y1 = stats->GetY1NDC();
+    stats->SetY2NDC(y1);
+    stats->SetY1NDC(y1 - (y2 - y1));
+  }
+
+}  // namespace
+
+// Fill and plot histograms comparing the ECAL digis unpacked on GPU with the legacy CPU ones.
+// Usage: <exe> <path to input file> <path to output file>
+// Returns 0 on success, 1 if the arguments or the input file are not usable.
+int main(int argc, char* argv[]) {
+  if (argc < 3) {
+    std::cout << "run with: ./<exe> <path to input file> <path to output file>\n";
+    return 1;
+  }
+
+  // branches to use
+  edm::Wrapper<EBDigiCollection>*wgpuEB = nullptr, *wcpuEB = nullptr;
+  edm::Wrapper<EEDigiCollection>*wgpuEE = nullptr, *wcpuEE = nullptr;
+
+  std::string inFileName{argv[1]};
+  std::string outFileName{argv[2]};
+
+  // prep output
+  TFile rfout{outFileName.c_str(), "recreate"};
+
+  int const nbins = 400;
+  float const last = 4096.;
+  auto hADCEBGPU = new TH1D("hADCEBGPU", "hADCEBGPU", nbins, 0, last);
+  auto hADCEBCPU = new TH1D("hADCEBCPU", "hADCEBCPU", nbins, 0, last);
+  auto hADCEEGPU = new TH1D("hADCEEGPU", "hADCEEGPU", nbins, 0, last);
+  auto hADCEECPU = new TH1D("hADCEECPU", "hADCEECPU", nbins, 0, last);
+
+  auto hGainEBGPU = new TH1D("hGainEBGPU", "hGainEBGPU", 4, 0, 4);
+  auto hGainEBCPU = new TH1D("hGainEBCPU", "hGainEBCPU", 4, 0, 4);
+  auto hGainEEGPU = new TH1D("hGainEEGPU", "hGainEEGPU", 4, 0, 4);
+  auto hGainEECPU = new TH1D("hGainEECPU", "hGainEECPU", 4, 0, 4);
+
+  auto hADCEBGPUvsCPU = new TH2D("hADCEBGPUvsCPU", "hADCEBGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  auto hADCEEGPUvsCPU = new TH2D("hADCEEGPUvsCPU", "hADCEEGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  auto hGainEBGPUvsCPU = new TH2D("hGainEBGPUvsCPU", "hGainEBGPUvsCPU", 4, 0, 4, 4, 0, 4);
+  auto hGainEEGPUvsCPU = new TH2D("hGainEEGPUvsCPU", "hGainEEGPUvsCPU", 4, 0, 4, 4, 0, 4);
+
+  // prep input
+  TFile rfin{inFileName.c_str()};
+  if (rfin.IsZombie()) {
+    std::cerr << "*** could not open input file " << inFileName << std::endl;
+    return 1;
+  }
+  auto rt = dynamic_cast<TTree*>(rfin.Get("Events"));
+  if (rt == nullptr) {
+    std::cerr << "*** no \"Events\" tree found in " << inFileName << std::endl;
+    return 1;
+  }
+  rt->SetBranchAddress("EBDigiCollection_ecalCPUDigisProducer_ebDigis_RECO.", &wgpuEB);
+  rt->SetBranchAddress("EEDigiCollection_ecalCPUDigisProducer_eeDigis_RECO.", &wgpuEE);
+  rt->SetBranchAddress("EBDigiCollection_ecalDigis_ebDigis_RECO.", &wcpuEB);
+  rt->SetBranchAddress("EEDigiCollection_ecalDigis_eeDigis_RECO.", &wcpuEE);
+
+  // accumulate
+  auto const nentries = rt->GetEntries();
+  std::cout << ">>> nentries = " << nentries << std::endl;
+  for (int ie = 0; ie < nentries; ++ie) {
+    rt->GetEntry(ie);
+
+    // the wrappers stay null if the corresponding branches could not be read
+    if (wgpuEB == nullptr or wcpuEB == nullptr or wgpuEE == nullptr or wcpuEE == nullptr) {
+      std::cerr << "*** could not read the digi collections for entry " << ie << std::endl;
+      return 1;
+    }
+
+    auto const& gpuEB = wgpuEB->bareProduct();
+    auto const& cpuEB = wcpuEB->bareProduct();
+    auto const& gpuEE = wgpuEE->bareProduct();
+    auto const& cpuEE = wcpuEE->bareProduct();
+
+    if (gpuEB.size() != cpuEB.size() or gpuEE.size() != cpuEE.size()) {
+      std::cerr << "*** mismatch in ndigis: "
+                << "ie = " << ie << "  ngpuebs = " << gpuEB.size() << "  ncpuebs = " << cpuEB.size()
+                << "  ngpuees = " << gpuEE.size() << "  ncpuees = " << cpuEE.size() << std::endl;
+    }
+
+    // the loops run over the GPU digis; each one is matched to the CPU digi with the same id
+    fillDigiHistograms(gpuEB, cpuEB, hADCEBGPU, hADCEBCPU, hGainEBGPU, hGainEBCPU, hADCEBGPUvsCPU, hGainEBGPUvsCPU);
+    fillDigiHistograms(gpuEE, cpuEE, hADCEEGPU, hADCEECPU, hGainEEGPU, hGainEECPU, hADCEEGPUvsCPU, hGainEEGPUvsCPU);
+  }
+
+  {
+    TCanvas c{"plots", "plots", 4200, 6200};
+    c.Divide(2, 4);
+    c.cd(1);
+    drawCPUvsGPU(hADCEBCPU, hADCEBGPU);
+    c.cd(2);
+    drawCPUvsGPU(hADCEECPU, hADCEEGPU);
+    c.cd(3);
+    drawCPUvsGPU(hGainEBCPU, hGainEBGPU);
+    c.cd(4);
+    drawCPUvsGPU(hGainEECPU, hGainEEGPU);
+    c.cd(5);
+    hADCEBGPUvsCPU->Draw("colz");
+    c.cd(6);
+    hADCEEGPUvsCPU->Draw("colz");
+    c.cd(7);
+    hGainEBGPUvsCPU->Draw("colz");
+    c.cd(8);
+    hGainEEGPUvsCPU->Draw("colz");
+    c.SaveAs("plots.pdf");
+  }
+
+  rfin.Close();
+  rfout.Write();
+  rfout.Close();
+  return 0;
+}
diff --git a/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h b/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h
index fa6e9f5d5a161..38a9ebdf18cb8 100644
--- a/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h
+++ b/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h
@@ -1,14 +1,11 @@
-#ifndef EcalRegionCabling_H
-#define EcalRegionCabling_H
+#ifndef EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h
+#define EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h
 
-#include "Geometry/EcalMapping/interface/EcalElectronicsMapping.h"
-#include "Geometry/EcalMapping/interface/ESElectronicsMapper.h"
-
-#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "DataFormats/FEDRawData/interface/FEDNumbering.h"
-
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "Geometry/EcalMapping/interface/ESElectronicsMapper.h"
+#include "Geometry/EcalMapping/interface/EcalElectronicsMapping.h"
 
 class EcalRegionCabling {
 public:
@@ -73,4 +70,4 @@ class EcalRegionCabling {
   const ESElectronicsMapper* es_mapping_;
 };
 
-#endif
+#endif  // EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h
diff --git a/EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h b/EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h
new file mode 100644
index 0000000000000..abedcf5a2d479
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h
@@ -0,0 +1,91 @@
+#ifndef EventFilter_EcalRawToDigi_interface_ElectronicsIdGPU_h
+#define EventFilter_EcalRawToDigi_interface_ElectronicsIdGPU_h
+
+#include <cstdint>
+
+#include "DataFormats/EcalDetId/interface/EcalSubdetector.h"
+
+namespace ecal {
+  namespace raw {
+
+    /** \brief Ecal readout channel identification
+    [31:20] Unused (so far)
+    [19:13]  DCC id
+    [12:6]   tower
+    [5:3]    strip
+    [2:0]    xtal
+    Index starts from 1
+ */
+
+    class ElectronicsIdGPU {
+    public:
+      /** Default constructor -- invalid value */
+      constexpr ElectronicsIdGPU() : id_{0xFFFFFFFFu} {}
+      /** from raw */
+      constexpr ElectronicsIdGPU(uint32_t id) : id_{id} {}
+      /** Constructor from dcc, tower, strip, xtal; each field is masked to its bit width before packing **/
+      constexpr ElectronicsIdGPU(uint8_t const dccid, uint8_t const towerid, uint8_t const stripid, uint8_t const xtalid)
+          : id_{static_cast<uint32_t>((xtalid & 0x7) | ((stripid & 0x7) << 3) | ((towerid & 0x7F) << 6) |
+                                      ((dccid & 0x7F) << 13))} {}
+
+      constexpr uint32_t operator()() const { return id_; }  // same value as rawId(); const so it works on const ids
+      constexpr uint32_t rawId() const { return id_; }
+
+      /// get the DCC (Ecal Local DCC value not global one) id
+      constexpr uint8_t dccId() const { return (id_ >> 13) & 0x7F; }
+      /// get the tower id
+      constexpr uint8_t towerId() const { return (id_ >> 6) & 0x7F; }
+      /// get the strip id
+      constexpr uint8_t stripId() const { return (id_ >> 3) & 0x7; }
+      /// get the channel id
+      constexpr uint8_t xtalId() const { return (id_ & 0x7); }
+
+      /// get the subdet
+      //EcalSubdetector subdet() const;
+
+      /// get a fast, compact, unique index for linear lookups (maximum value = 4194303)
+      constexpr uint32_t linearIndex() const { return id_ & 0x3FFFFF; }
+
+      /// so far for EndCap only :
+      //int channelId() const;  // xtal id between 1 and 25
+
+      static constexpr int kTowersInPhi = 4;     // see EBDetId
+      static constexpr int kCrystalsInPhi = 20;  // see EBDetId
+
+      static constexpr uint8_t MAX_DCCID = 54;  //To be updated with correct and final number
+      static constexpr uint8_t MIN_DCCID = 1;
+      static constexpr uint8_t MAX_TOWERID = 70;
+      static constexpr uint8_t MIN_TOWERID = 1;
+      static constexpr uint8_t MAX_STRIPID = 5;
+      static constexpr uint8_t MIN_STRIPID = 1;
+      static constexpr uint8_t MAX_CHANNELID = 25;
+      static constexpr uint8_t MIN_CHANNELID = 1;
+      static constexpr uint8_t MAX_XTALID = 5;
+      static constexpr uint8_t MIN_XTALID = 1;
+
+      static constexpr int MIN_DCCID_EEM = 1;
+      static constexpr int MAX_DCCID_EEM = 9;
+      static constexpr int MIN_DCCID_EBM = 10;
+      static constexpr int MAX_DCCID_EBM = 27;
+      static constexpr int MIN_DCCID_EBP = 28;
+      static constexpr int MAX_DCCID_EBP = 45;
+      static constexpr int MIN_DCCID_EEP = 46;
+      static constexpr int MAX_DCCID_EEP = 54;
+
+      static constexpr int DCCID_PHI0_EBM = 10;
+      static constexpr int DCCID_PHI0_EBP = 28;
+
+      static constexpr int kDCCChannelBoundary = 17;
+      static constexpr int DCC_EBM = 10;  // id of the DCC in EB- which contains phi=0 deg.
+      static constexpr int DCC_EBP = 28;  // id of the DCC in EB+ which contains phi=0 deg.
+      static constexpr int DCC_EEM = 1;   // id of the DCC in EE- which contains phi=0 deg.
+      static constexpr int DCC_EEP = 46;  // id of the DCC in EE+ which contains phi=0 deg.
+
+    private:
+      uint32_t id_;  // packed id, see the bit layout in the class description
+    };
+
+  }  // namespace raw
+}  // namespace ecal
+
+#endif  // EventFilter_EcalRawToDigi_interface_ElectronicsIdGPU_h
diff --git a/EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h b/EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h
new file mode 100644
index 0000000000000..004821afe3ed8
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h
@@ -0,0 +1,47 @@
+#ifndef EventFilter_EcalRawToDigi_interface_ElectronicsMappingGPU_h
+#define EventFilter_EcalRawToDigi_interface_ElectronicsMappingGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalMappingElectronics.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+namespace ecal {
+  namespace raw {
+
+    class ElectronicsMappingGPU {
+    public:
+      struct Product {
+        ~Product();         // defined out of line
+        uint32_t* eid2did;  // NOTE(review): presumably the device copy of eid2did_ -- confirm in the implementation
+      };
+
+#ifndef __CUDACC__
+
+      // construct from the mapping conditions (NOTE(review): presumably fills eid2did_ -- implementation not shown)
+      ElectronicsMappingGPU(EcalMappingElectronics const&);
+
+      // deallocation of the Product is done through ~Product
+      ~ElectronicsMappingGPU() = default;
+
+      // get device pointers
+      Product const& getProduct(cudaStream_t) const;
+
+      // fixed name string associated with this class
+      static std::string name() { return std::string{"ecalElectronicsMappingGPU"}; }
+
+    private:
+      // in the future, we need to arrange so to avoid this copy on the host
+      // store eb first then ee
+      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> eid2did_;
+
+      cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+    };
+
+  }  // namespace raw
+}  // namespace ecal
+
+#endif  // EventFilter_EcalRawToDigi_interface_ElectronicsMappingGPU_h
diff --git a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
index e55f1bcaab660..c3c2bd988e2c3 100644
--- a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
@@ -1,10 +1,14 @@
-<use name="EventFilter/EcalRawToDigi"/>
+<use name="cuda"/>
 <use name="root"/>
+<use name="CUDADataFormats/EcalDigi" />
 <use name="CondFormats/DataRecord"/>
-<use name="FWCore/Utilities"/>
 <use name="DataFormats/Common"/>
 <use name="DataFormats/Scalers"/>
+<use name="EventFilter/EcalRawToDigi"/>
 <use name="FWCore/Framework"/>
-<library file="*.cc" name="EventFilterEcalRawToDigiPlugins">
+<use name="FWCore/Utilities"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<library file="*.cc *.cu" name="EventFilterEcalRawToDigiPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h
new file mode 100644
index 0000000000000..a6429121adc82
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h
@@ -0,0 +1,70 @@
+#ifndef EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h
+#define EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h
+
+#include <vector>
+
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "EventFilter/EcalRawToDigi/interface/DCCRawDataDefinitions.h"
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+namespace ecal {
+  namespace raw {
+
+    constexpr auto empty_event_size = EMPTYEVENTSIZE;
+    constexpr uint32_t nfeds_max = 54;  // NOTE(review): presumably the number of ECAL FEDs (cf. MAX_DCCID) -- confirm
+    constexpr uint32_t nbytes_per_fed_max = 10 * 1024;  // 10 KiB
+
+    struct InputDataCPU {
+      cms::cuda::host::unique_ptr<unsigned char[]> data;
+      cms::cuda::host::unique_ptr<uint32_t[]> offsets;
+      cms::cuda::host::unique_ptr<int[]> feds;
+    };
+
+    struct ConfigurationParameters {
+      uint32_t maxChannelsEE;  // used by OutputDataGPU::allocate to size the EE buffers
+      uint32_t maxChannelsEB;  // used by OutputDataGPU::allocate to size the EB buffers
+    };
+
+    struct OutputDataCPU {
+      // [0] - eb, [1] - ee
+      cms::cuda::host::unique_ptr<uint32_t[]> nchannels;
+    };
+
+    struct OutputDataGPU {
+      DigisCollection<::calo::common::DevStoragePolicy> digisEB, digisEE;
+
+      void allocate(ConfigurationParameters const &config, cudaStream_t cudaStream) {
+        digisEB.data =  // EcalDataFrame::MAXSAMPLES 16-bit words per channel
+            cms::cuda::make_device_unique<uint16_t[]>(config.maxChannelsEB * EcalDataFrame::MAXSAMPLES, cudaStream);
+        digisEE.data =
+            cms::cuda::make_device_unique<uint16_t[]>(config.maxChannelsEE * EcalDataFrame::MAXSAMPLES, cudaStream);
+        digisEB.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsEB, cudaStream);
+        digisEE.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsEE, cudaStream);
+      }
+    };
+
+    struct ScratchDataGPU {
+      // [0] = EB
+      // [1] = EE
+      cms::cuda::device::unique_ptr<uint32_t[]> pChannelsCounter;
+    };
+
+    struct InputDataGPU {
+      cms::cuda::device::unique_ptr<unsigned char[]> data;
+      cms::cuda::device::unique_ptr<uint32_t[]> offsets;
+      cms::cuda::device::unique_ptr<int[]> feds;
+    };
+
+    struct ConditionsProducts {
+      ElectronicsMappingGPU::Product const &eMappingProduct;  // held by reference: the product must outlive this
+    };
+
+  }  // namespace raw
+}  // namespace ecal
+
+#endif  // EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
new file mode 100644
index 0000000000000..5563dd5b52cc8
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
@@ -0,0 +1,196 @@
+#include <iostream>
+
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h"
+#include "DataFormats/EcalDetId/interface/EcalDetIdCollections.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "DeclsForKernels.h"
+#include "UnpackGPU.h"
+
+class EcalCPUDigisProducer : public edm::stream::EDProducer<edm::ExternalWork> {  // GPU digis -> legacy host digis
+public:
+  explicit EcalCPUDigisProducer(edm::ParameterSet const& ps);
+  ~EcalCPUDigisProducer() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+  // input digi collections in GPU-friendly format
+  using InputProduct = cms::cuda::Product<ecal::DigisCollection<calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<InputProduct> digisInEBToken_;
+  edm::EDGetTokenT<InputProduct> digisInEEToken_;
+
+  // output digi collections in legacy format
+  edm::EDPutTokenT<EBDigiCollection> digisOutEBToken_;
+  edm::EDPutTokenT<EEDigiCollection> digisOutEEToken_;
+
+  // whether to produce dummy (empty) SRP and integrity collections
+  bool produceDummyIntegrityCollections_;
+
+  // dummy SRP collections
+  edm::EDPutTokenT<EBSrFlagCollection> ebSrFlagToken_;
+  edm::EDPutTokenT<EESrFlagCollection> eeSrFlagToken_;
+
+  // dummy integrity for xtal data
+  edm::EDPutTokenT<EBDetIdCollection> ebIntegrityGainErrorsToken_;
+  edm::EDPutTokenT<EBDetIdCollection> ebIntegrityGainSwitchErrorsToken_;
+  edm::EDPutTokenT<EBDetIdCollection> ebIntegrityChIdErrorsToken_;
+
+  // dummy integrity for xtal data - EE specific (to be revisited towards EB+EE common collection)
+  edm::EDPutTokenT<EEDetIdCollection> eeIntegrityGainErrorsToken_;
+  edm::EDPutTokenT<EEDetIdCollection> eeIntegrityGainSwitchErrorsToken_;
+  edm::EDPutTokenT<EEDetIdCollection> eeIntegrityChIdErrorsToken_;
+
+  // dummy integrity errors
+  edm::EDPutTokenT<EcalElectronicsIdCollection> integrityTTIdErrorsToken_;
+  edm::EDPutTokenT<EcalElectronicsIdCollection> integrityBlockSizeErrorsToken_;
+
+  // host buffers filled by the copies enqueued in acquire() and read back in produce()
+  // FIXME better way to pass pointers from acquire to produce?
+  std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> idsebtmp, idseetmp;
+  std::vector<uint16_t, cms::cuda::HostAllocator<uint16_t>> dataebtmp, dataeetmp;
+};
+
+// Declare the configuration parameters of the module together with their default values.
+void EcalCPUDigisProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription psd;
+  // device-side digis to read
+  psd.add<edm::InputTag>("digisInLabelEB", edm::InputTag{"ecalRawToDigiGPU", "ebDigis"});
+  psd.add<edm::InputTag>("digisInLabelEE", edm::InputTag{"ecalRawToDigiGPU", "eeDigis"});
+  // instance labels of the legacy collections to write
+  psd.add<std::string>("digisOutLabelEB", "ebDigis");
+  psd.add<std::string>("digisOutLabelEE", "eeDigis");
+  psd.add<bool>("produceDummyIntegrityCollections", false);
+
+  confDesc.add(std::string{"ecalCPUDigisProducer"}, psd);
+}
+
+EcalCPUDigisProducer::EcalCPUDigisProducer(const edm::ParameterSet& ps)
+    :  // input digi collections in GPU-friendly format
+      digisInEBToken_{consumes<InputProduct>(ps.getParameter<edm::InputTag>("digisInLabelEB"))},
+      digisInEEToken_{consumes<InputProduct>(ps.getParameter<edm::InputTag>("digisInLabelEE"))},
+      // output digi collections in legacy format
+      digisOutEBToken_{produces<EBDigiCollection>(ps.getParameter<std::string>("digisOutLabelEB"))},
+      digisOutEEToken_{produces<EEDigiCollection>(ps.getParameter<std::string>("digisOutLabelEE"))},
+      // whether to produce dummy integrity collections; every token below is registered only if this is set
+      produceDummyIntegrityCollections_{ps.getParameter<bool>("produceDummyIntegrityCollections")},
+      // dummy SRP collections
+      ebSrFlagToken_{produceDummyIntegrityCollections_ ? produces<EBSrFlagCollection>()
+                                                       : edm::EDPutTokenT<EBSrFlagCollection>{}},
+      eeSrFlagToken_{produceDummyIntegrityCollections_ ? produces<EESrFlagCollection>()
+                                                       : edm::EDPutTokenT<EESrFlagCollection>{}},
+      // dummy integrity for xtal data
+      ebIntegrityGainErrorsToken_{produceDummyIntegrityCollections_
+                                      ? produces<EBDetIdCollection>("EcalIntegrityGainErrors")
+                                      : edm::EDPutTokenT<EBDetIdCollection>{}},
+      ebIntegrityGainSwitchErrorsToken_{produceDummyIntegrityCollections_
+                                            ? produces<EBDetIdCollection>("EcalIntegrityGainSwitchErrors")
+                                            : edm::EDPutTokenT<EBDetIdCollection>{}},
+      ebIntegrityChIdErrorsToken_{produceDummyIntegrityCollections_
+                                      ? produces<EBDetIdCollection>("EcalIntegrityChIdErrors")
+                                      : edm::EDPutTokenT<EBDetIdCollection>{}},
+      // dummy integrity for xtal data - EE specific (to be revisited towards EB+EE common collection)
+      eeIntegrityGainErrorsToken_{produceDummyIntegrityCollections_
+                                      ? produces<EEDetIdCollection>("EcalIntegrityGainErrors")
+                                      : edm::EDPutTokenT<EEDetIdCollection>{}},
+      eeIntegrityGainSwitchErrorsToken_{produceDummyIntegrityCollections_
+                                            ? produces<EEDetIdCollection>("EcalIntegrityGainSwitchErrors")
+                                            : edm::EDPutTokenT<EEDetIdCollection>{}},
+      eeIntegrityChIdErrorsToken_{produceDummyIntegrityCollections_
+                                      ? produces<EEDetIdCollection>("EcalIntegrityChIdErrors")
+                                      : edm::EDPutTokenT<EEDetIdCollection>{}},
+      // dummy integrity errors
+      integrityTTIdErrorsToken_{produceDummyIntegrityCollections_
+                                    ? produces<EcalElectronicsIdCollection>("EcalIntegrityTTIdErrors")
+                                    : edm::EDPutTokenT<EcalElectronicsIdCollection>{}},
+      integrityBlockSizeErrorsToken_{produceDummyIntegrityCollections_
+                                         ? produces<EcalElectronicsIdCollection>("EcalIntegrityBlockSizeErrors")
+                                         : edm::EDPutTokenT<EcalElectronicsIdCollection>{}} {}
+
+EcalCPUDigisProducer::~EcalCPUDigisProducer() = default;
+
+void EcalCPUDigisProducer::acquire(edm::Event const& event,
+                                   edm::EventSetup const& setup,
+                                   edm::WaitingTaskWithArenaHolder taskHolder) {
+  // retrieve data/ctx (the context is built from the EB product)
+  auto const& ebdigisProduct = event.get(digisInEBToken_);
+  auto const& eedigisProduct = event.get(digisInEEToken_);
+  cms::cuda::ScopedContextAcquire ctx{ebdigisProduct, std::move(taskHolder)};
+  auto const& ebdigis = ctx.get(ebdigisProduct);
+  auto const& eedigis = ctx.get(eedigisProduct);
+
+  // resize tmp buffers: EcalDataFrame::MAXSAMPLES samples and one id per channel
+  dataebtmp.resize(ebdigis.size * EcalDataFrame::MAXSAMPLES);
+  dataeetmp.resize(eedigis.size * EcalDataFrame::MAXSAMPLES);
+  idsebtmp.resize(ebdigis.size);
+  idseetmp.resize(eedigis.size);
+
+  // enqueue the device-to-host transfers on the context's stream; the buffers are read in produce()
+  cudaCheck(cudaMemcpyAsync(
+      dataebtmp.data(), ebdigis.data.get(), dataebtmp.size() * sizeof(uint16_t), cudaMemcpyDeviceToHost, ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(
+      dataeetmp.data(), eedigis.data.get(), dataeetmp.size() * sizeof(uint16_t), cudaMemcpyDeviceToHost, ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(
+      idsebtmp.data(), ebdigis.ids.get(), idsebtmp.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(
+      idseetmp.data(), eedigis.ids.get(), idseetmp.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, ctx.stream()));
+}
+
+void EcalCPUDigisProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // legacy collections, sized to the number of channels copied back in acquire()
+  auto ebOut = std::make_unique<EBDigiCollection>();
+  ebOut->resize(idsebtmp.size());
+  auto eeOut = std::make_unique<EEDigiCollection>();
+  eeOut->resize(idseetmp.size());
+
+  // The collections expose their storage only as const: drop the constness and write
+  // through plain pointers, because the temporary buffers use a different allocator
+  // and so cannot be moved into the collections.
+  auto* const ebSamples = const_cast<uint16_t*>(ebOut->data().data());
+  auto* const eeSamples = const_cast<uint16_t*>(eeOut->data().data());
+  auto* const ebIds = const_cast<uint32_t*>(ebOut->ids().data());
+  auto* const eeIds = const_cast<uint32_t*>(eeOut->ids().data());
+
+  // copy the samples, then the ids
+  std::memcpy(ebSamples, dataebtmp.data(), dataebtmp.size() * sizeof(uint16_t));
+  std::memcpy(eeSamples, dataeetmp.data(), dataeetmp.size() * sizeof(uint16_t));
+  std::memcpy(ebIds, idsebtmp.data(), idsebtmp.size() * sizeof(uint32_t));
+  std::memcpy(eeIds, idseetmp.data(), idseetmp.size() * sizeof(uint32_t));
+
+  event.put(digisOutEBToken_, std::move(ebOut));
+  event.put(digisOutEEToken_, std::move(eeOut));
+
+  if (!produceDummyIntegrityCollections_)
+    return;
+  // dummy SRP collections
+  event.emplace(ebSrFlagToken_);
+  event.emplace(eeSrFlagToken_);
+  // dummy integrity for xtal data
+  event.emplace(ebIntegrityGainErrorsToken_);
+  event.emplace(ebIntegrityGainSwitchErrorsToken_);
+  event.emplace(ebIntegrityChIdErrorsToken_);
+  // dummy integrity for xtal data - EE specific (to be rivisited towards EB+EE common collection)
+  event.emplace(eeIntegrityGainErrorsToken_);
+  event.emplace(eeIntegrityGainSwitchErrorsToken_);
+  event.emplace(eeIntegrityChIdErrorsToken_);
+  // dummy integrity errors
+  event.emplace(integrityTTIdErrorsToken_);
+  event.emplace(integrityBlockSizeErrorsToken_);
+}
+
+DEFINE_FWK_MODULE(EcalCPUDigisProducer);
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc
new file mode 100644
index 0000000000000..84fcc7b2b2952
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc
@@ -0,0 +1,9 @@
+#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h"
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h"
+
+using EcalElectronicsMappingGPUESProducer =  // presumably converts EcalMappingElectronics to its GPU form - TODO confirm
+    ConvertingESProducerT<EcalMappingElectronicsRcd, ecal::raw::ElectronicsMappingGPU, EcalMappingElectronics>;
+
+DEFINE_FWK_EVENTSETUP_MODULE(EcalElectronicsMappingGPUESProducer);
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
new file mode 100644
index 0000000000000..4f0743c9b1b51
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
@@ -0,0 +1,156 @@
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/ESGetToken.h"
+#include "FWCore/Utilities/interface/Exception.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "DeclsForKernels.h"
+#include "UnpackGPU.h"
+
+class EcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {  // unpacks ECAL raw data on the GPU
+public:
+  explicit EcalRawToDigiGPU(edm::ParameterSet const& ps);
+  ~EcalRawToDigiGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+  edm::EDGetTokenT<FEDRawDataCollection> rawDataToken_;  // raw data to unpack
+  using OutputProduct = cms::cuda::Product<ecal::DigisCollection<calo::common::DevStoragePolicy>>;
+  edm::EDPutTokenT<OutputProduct> digisEBToken_, digisEEToken_;  // device-side EB and EE digis
+  edm::ESGetToken<ecal::raw::ElectronicsMappingGPU, EcalMappingElectronicsRcd> eMappingToken_;
+
+  cms::cuda::ContextState cudaState_;  // handed from acquire() to produce()
+
+  std::vector<int> fedsToUnpack_;  // ids of the FEDs to unpack ("FEDs" parameter)
+
+  ecal::raw::ConfigurationParameters config_;  // channel capacities ("maxChannelsEB"/"maxChannelsEE" parameters)
+  ecal::raw::OutputDataGPU outputGPU_;         // device buffers allocated in acquire(), moved to the event in produce()
+  ecal::raw::OutputDataCPU outputCPU_;         // host copy of the channel counters, read in produce()
+};
+
+void EcalRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription psd;
+
+  psd.add<edm::InputTag>("InputLabel", edm::InputTag("rawDataCollector"));
+  // by default unpack 54 consecutive FED ids, starting from 601
+  std::vector<int> defaultFEDs;
+  for (int fed = 601; fed < 601 + 54; ++fed)
+    defaultFEDs.push_back(fed);
+  psd.add<std::vector<int>>("FEDs", defaultFEDs);
+  psd.add<uint32_t>("maxChannelsEB", 61200);
+  psd.add<uint32_t>("maxChannelsEE", 14648);
+  psd.add<std::string>("digisLabelEB", "ebDigis");
+  psd.add<std::string>("digisLabelEE", "eeDigis");
+
+  confDesc.add(std::string{"ecalRawToDigiGPU"}, psd);
+}
+
+EcalRawToDigiGPU::EcalRawToDigiGPU(const edm::ParameterSet& ps)
+    : rawDataToken_{consumes<FEDRawDataCollection>(ps.getParameter<edm::InputTag>("InputLabel"))},
+      digisEBToken_{produces<OutputProduct>(ps.getParameter<std::string>("digisLabelEB"))},
+      digisEEToken_{produces<OutputProduct>(ps.getParameter<std::string>("digisLabelEE"))},
+      eMappingToken_{esConsumes<ecal::raw::ElectronicsMappingGPU, EcalMappingElectronicsRcd>()},
+      fedsToUnpack_{ps.getParameter<std::vector<int>>("FEDs")} {
+  config_.maxChannelsEB = ps.getParameter<uint32_t>("maxChannelsEB");  // capacities of the device output buffers
+  config_.maxChannelsEE = ps.getParameter<uint32_t>("maxChannelsEE");
+}
+
+EcalRawToDigiGPU::~EcalRawToDigiGPU() = default;
+
+// Stage the raw data of the FEDs to unpack and enqueue the transfers and the unpacking kernel;
+// throws cms::Exception if the data do not fit in the fixed-size input buffers.
+void EcalRawToDigiGPU::acquire(edm::Event const& event,
+                               edm::EventSetup const& setup,
+                               edm::WaitingTaskWithArenaHolder holder) {
+  // raii
+  cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
+
+  // conditions, bundled up for the kernel wrapper
+  edm::ESHandle<ecal::raw::ElectronicsMappingGPU> eMappingHandle = setup.getHandle(eMappingToken_);
+  auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream());
+  ecal::raw::ConditionsProducts conditions{eMappingProduct};
+
+  // event data
+  edm::Handle<FEDRawDataCollection> rawDataHandle;
+  event.getByToken(rawDataToken_, rawDataHandle);
+
+  // capacity of the input buffers, in number of FEDs and in bytes
+  size_t const maxFeds = ecal::raw::nfeds_max;
+  size_t const maxBytes = maxFeds * ecal::raw::nbytes_per_fed_max;
+
+  // scratch
+  ecal::raw::ScratchDataGPU scratchGPU = {cms::cuda::make_device_unique<uint32_t[]>(2, ctx.stream())};
+
+  // input cpu data
+  ecal::raw::InputDataCPU inputCPU = {cms::cuda::make_host_unique<unsigned char[]>(maxBytes, ctx.stream()),
+                                      cms::cuda::make_host_unique<uint32_t[]>(maxFeds, ctx.stream()),
+                                      cms::cuda::make_host_unique<int[]>(maxFeds, ctx.stream())};
+
+  // input data gpu
+  ecal::raw::InputDataGPU inputGPU = {cms::cuda::make_device_unique<unsigned char[]>(maxBytes, ctx.stream()),
+                                      cms::cuda::make_device_unique<uint32_t[]>(maxFeds, ctx.stream()),
+                                      cms::cuda::make_device_unique<int[]>(maxFeds, ctx.stream())};
+
+  // output cpu and gpu
+  outputCPU_ = {cms::cuda::make_host_unique<uint32_t[]>(2, ctx.stream())};
+  outputGPU_.allocate(config_, ctx.stream());
+
+  // iterate over feds
+  // TODO: another idea - loop over all feds to unpack and enqueue cuda memcpy, accumulate the sizes,
+  //   after the loop launch cuda memcpy for sizes, and enqueue the kernel
+  uint32_t currentCummOffset = 0;
+  uint32_t counter = 0;
+  for (auto const& fed : fedsToUnpack_) {
+    auto const& data = rawDataHandle->FEDData(fed);
+    auto const nbytes = data.size();
+    // skip empty feds
+    if (nbytes < ecal::raw::empty_event_size)
+      continue;
+    // never write past the end of the input buffers (too many FEDs, or too much data)
+    if (counter >= maxFeds || nbytes > maxBytes - currentCummOffset)
+      throw cms::Exception("EcalRawToDigiGPU") << "raw data of FED " << fed << " (" << nbytes
+                                               << " bytes) do not fit in the input buffers";
+
+    // copy raw data into plain buffer
+    std::memcpy(inputCPU.data.get() + currentCummOffset, data.data(), nbytes);
+    // set the offset in bytes from the start
+    inputCPU.offsets[counter] = currentCummOffset;
+    inputCPU.feds[counter] = fed;
+    // this is the current offset into the vector
+    currentCummOffset += nbytes;
+    ++counter;
+  }
+
+  ecal::raw::entryPoint(
+      inputCPU, inputGPU, outputGPU_, scratchGPU, outputCPU_, conditions, ctx.stream(), counter, currentCummOffset);
+}
+
+void EcalRawToDigiGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};
+
+  // attach the channel counts ([0] - eb, [1] - ee) to the collections, then hand them to the event
+  auto& ebDigis = outputGPU_.digisEB;
+  auto& eeDigis = outputGPU_.digisEE;
+  ebDigis.size = outputCPU_.nchannels[0];
+  eeDigis.size = outputCPU_.nchannels[1];
+  ctx.emplace(event, digisEBToken_, std::move(ebDigis));
+  ctx.emplace(event, digisEEToken_, std::move(eeDigis));
+  // drop the host-side counter buffer, which is carried as a member
+  outputCPU_.nchannels.reset();
+}
+
+DEFINE_FWK_MODULE(EcalRawToDigiGPU);
diff --git a/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu
new file mode 100644
index 0000000000000..a25bf235d15f6
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu
@@ -0,0 +1,333 @@
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h"
+
+#include "UnpackGPU.h"
+
+namespace ecal {
+  namespace raw {
+
+    // debugging aid: hex dump of the first nbytes bytes of buffer, nbytes_per_row values per row
+    __forceinline__ __device__ void print_raw_buffer(uint8_t const* const buffer,
+                                                     uint32_t const nbytes, uint32_t const nbytes_per_row = 20) {
+      for (uint32_t ibyte = 0; ibyte < nbytes; ++ibyte) {
+        if (ibyte % nbytes_per_row == 0 && ibyte > 0)
+          printf("\n");
+        printf("%02X ", buffer[ibyte]);
+      }
+    }
+
+    // debugging aid: print bits 63, 62 and 61 of each of the first `size` 64-bit words of buffer
+    __forceinline__ __device__ void print_first3bits(uint64_t const* buffer, uint32_t size) {
+      for (uint32_t iword = 0; iword < size; ++iword) {
+        auto const word = buffer[iword];
+        uint8_t const bit61 = (word >> 61) & 0x1, bit62 = (word >> 62) & 0x1, bit63 = (word >> 63) & 0x1;
+        printf("[word: %u] %u%u%u\n", iword, bit63, bit62, bit61);
+      }
+    }
+
+    __forceinline__ __device__ bool is_barrel(uint8_t dccid) {  // true for dccid in [MIN_DCCID_EBM, MAX_DCCID_EBP]
+      return !(dccid < ElectronicsIdGPU::MIN_DCCID_EBM || dccid > ElectronicsIdGPU::MAX_DCCID_EBP);
+    }
+
+    __forceinline__ __device__ uint8_t fed2dcc(int fed) { return static_cast<uint8_t>(fed - 600); }  // dcc = fed - 600
+
+    __forceinline__ __device__ int zside_for_eb(ElectronicsIdGPU const& eid) {  // -1 for EBM dcc ids, +1 otherwise
+      int const dccId = eid.dccId();
+      return (dccId < ElectronicsIdGPU::MIN_DCCID_EBM || dccId > ElectronicsIdGPU::MAX_DCCID_EBM) ? 1 : -1;
+    }
+
+    __forceinline__ __device__ bool is_synced_towerblock(uint16_t const dccbx,
+                                                         uint16_t const bx,
+                                                         uint16_t const dccl1,
+                                                         uint16_t const l1) {
+      // bx must equal the dcc bx (a dcc bx of 3564 pairs with bx 0); l1 must equal the dcc l1 minus one, on 12 bits
+      bool const bxsync = (dccbx == 3564) ? (bx == 0) : (bx == dccbx);
+      return bxsync && (l1 == ((dccl1 - 1) & 0xfff));
+    }
+
+    __forceinline__ __device__ bool right_tower_for_eb(int tower) {
+      // for EB, two types of tower (LVRB top/bottom): true for towers 13-20, 29-36, 45-52 and 61-68
+      bool const inRange1 = tower > 12 && tower < 21;
+      bool const inRange2 = tower > 28 && tower < 37;
+      bool const inRange3 = tower > 44 && tower < 53;
+      bool const inRange4 = tower > 60 && tower < 69;
+      return inRange1 || inRange2 || inRange3 || inRange4;
+    }
+
+    // Build the raw EB DetId corresponding to an electronics id (dcc, tower, strip, xtal).
+    // The steps are:
+    //   - derive smid and the base iphi of the tower from the dcc and tower ids, with a
+    //     different formula for each z side;
+    //   - derive ieta from the tower and strip ids, counting the strips in opposite
+    //     directions for the two tower types of right_tower_for_eb();
+    //   - add the crystal's position to iphi, counted upwards or downwards depending on the
+    //     tower type, the strip parity and the z side;
+    //   - pack the z side, |ieta| and iphi together with the Ecal/EcalBarrel DetId bits.
+    __forceinline__ __device__ uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) {
+      // as in Geometry/EcalMapping/.../EcalElectronicsMapping
+      auto const dcc = eid.dccId();
+      auto const tower = eid.towerId();
+      auto const strip = eid.stripId();
+      auto const xtal = eid.xtalId();
+
+      bool const EBMinus = zside_for_eb(eid) < 0;
+
+      // base iphi of the tower
+      int smid = 0;
+      int iphi = 0;
+      if (EBMinus) {
+        smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM;
+        iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi;
+        iphi += 5 * ((tower - 1) % ElectronicsIdGPU::kTowersInPhi);
+      } else {
+        smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP;
+        iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi;
+        iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - ((tower - 1) % ElectronicsIdGPU::kTowersInPhi) - 1);
+      }
+
+      // ieta grows with the strip number in the "right" towers
+      // and decreases with it in the other ones
+      bool const RightTower = right_tower_for_eb(tower);
+      int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1;
+      if (RightTower)
+        ieta += (strip - 1);
+      else
+        ieta += 4 - (strip - 1);
+
+      // the crystal is counted upwards in iphi when the strip parity matches the z side
+      // in a "right" tower, or differs from it in the other towers; downwards otherwise
+      bool const oddStrip = strip % 2 == 1;
+      bool const ascending = RightTower ? (oddStrip == EBMinus) : (oddStrip != EBMinus);
+      if (ascending)
+        iphi += (xtal - 1) + 1;
+      else
+        iphi += (4 - (xtal - 1)) + 1;
+
+      // the minus side has negative ieta
+      if (EBMinus)
+        ieta = -ieta;
+
+      // bit 16 flags positive ieta, |ieta| is stored from bit 9 upwards
+      // and iphi in the lowest 9 bits
+      DetId did{DetId::Ecal, EcalBarrel};
+      return did.rawId() | ((ieta > 0) ? (0x10000 | (ieta << 9)) : ((-ieta) << 9)) | (iphi & 0x1FF);
+    }
+
+    __forceinline__ __device__ int adc(uint16_t sample) { return sample & 0xfff; }  // ADC value: lowest 12 bits
+
+    __forceinline__ __device__ int gainId(uint16_t sample) { return (sample >> 12) & 0x3; }  // gain id: bits 12-13
+
+    // Unpack one FED per block: walk its tower blocks and, with one thread per channel, decode the ids and
+    // samples, drop channels failing the gain checks and append the others to the EB or EE output arrays.
+    // NOTE(review): the capacity of the output arrays is not known here, so `pos` is not range-checked.
+    template <int NTHREADS>
+    __global__ void kernel_unpack_test(unsigned char const* __restrict__ data,
+                                       uint32_t const* __restrict__ offsets,
+                                       int const* __restrict__ feds,
+                                       uint16_t* samplesEB,
+                                       uint16_t* samplesEE,
+                                       uint32_t* idsEB,
+                                       uint32_t* idsEE,
+                                       uint32_t* pChannelsCounterEBEE,
+                                       uint32_t const* eid2did,
+                                       uint32_t const nbytesTotal) {
+      // indices
+      auto const ifed = blockIdx.x;
+
+      // offset in bytes
+      auto const offset = offsets[ifed];
+      // fed id
+      auto const fed = feds[ifed];
+      auto const isBarrel = is_barrel(static_cast<uint8_t>(fed - 600));
+      // size in bytes: up to the next fed, or up to the end of the data for the last one
+      auto const size = ifed == gridDim.x - 1 ? nbytesTotal - offset : offsets[ifed + 1] - offset;
+      auto* samples = isBarrel ? samplesEB : samplesEE;
+      auto* ids = isBarrel ? idsEB : idsEE;
+      auto* pChannelsCounter = isBarrel ? &pChannelsCounterEBEE[0] : &pChannelsCounterEBEE[1];
+
+      // offset to the right raw buffer
+      uint64_t const* buffer = reinterpret_cast<uint64_t const*>(data + offset);
+
+      // dump first 3 bits for each 64-bit word
+      //print_first3bits(buffer, size / 8);
+
+      // fed header
+      auto const fed_header = buffer[0];
+      uint32_t bx = (fed_header >> 20) & 0xfff;
+      uint32_t lv1 = (fed_header >> 32) & 0xffffff;
+
+      // 9 for fed + dcc header
+      // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block
+      // 6 for SR block size
+
+      // dcc header w2
+      auto const w2 = buffer[2];
+      uint8_t const fov = (w2 >> 48) & 0xf;
+
+      // loop over the tower blocks, from the first one up to the trailer word
+      uint8_t ntccblockwords = isBarrel ? 18 : 36;
+      auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6;
+      auto const* trailer = buffer + (size / 8 - 1);
+      auto const* current_tower_block = tower_blocks_start;
+      while (current_tower_block < trailer) {
+        auto const w = *current_tower_block;
+        uint8_t ttid = w & 0xff;
+        uint16_t bxlocal = (w >> 16) & 0xfff;
+        uint16_t lv1local = (w >> 32) & 0xfff;
+        uint16_t block_length = (w >> 48) & 0x1ff;
+        // a block of zero length would never advance the loop, and one extending past the trailer
+        // would be read outside of this fed's data: stop unpacking the fed in both cases
+        if (block_length == 0 || current_tower_block + block_length > trailer)
+          break;
+
+        uint16_t const dccbx = bx & 0xfff;
+        uint16_t const dccl1 = lv1 & 0xfff;
+        // fov>=1 is required to support simulated data for which bx==bxlocal==0
+        if (fov >= 1 && !is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) {
+          current_tower_block += block_length;
+          continue;
+        }
+
+        // go through all the channels: one header word, then 3 words per channel
+        uint32_t nchannels = (block_length - 1) / 3;
+
+        // 1 thread per channel in this block
+        for (uint32_t ich = 0; ich < nchannels; ich += NTHREADS) {
+          auto const i_to_access = ich + threadIdx.x;
+          // threads outside of the range -> leave the loop
+          if (i_to_access >= nchannels)
+            break;
+
+          // decode the channel's electronics id from its first data word
+          auto const wdata = current_tower_block[1 + i_to_access * 3];
+          uint8_t const stripid = wdata & 0x7;
+          uint8_t const xtalid = (wdata >> 4) & 0x7;
+          ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid};
+          auto const didraw = isBarrel ? compute_ebdetid(eid) : eid2did[eid.linearIndex()];
+          // FIXME: what kind of channels are these guys
+          if (didraw == 0)
+            continue;
+
+          // get samples
+          uint16_t sampleValues[10];
+          sampleValues[0] = (wdata >> 16) & 0x3fff;
+          sampleValues[1] = (wdata >> 32) & 0x3fff;
+          sampleValues[2] = (wdata >> 48) & 0x3fff;
+          auto const wdata1 = current_tower_block[2 + i_to_access * 3];
+          sampleValues[3] = wdata1 & 0x3fff;
+          sampleValues[4] = (wdata1 >> 16) & 0x3fff;
+          sampleValues[5] = (wdata1 >> 32) & 0x3fff;
+          sampleValues[6] = (wdata1 >> 48) & 0x3fff;
+          auto const wdata2 = current_tower_block[3 + i_to_access * 3];
+          sampleValues[7] = wdata2 & 0x3fff;
+          sampleValues[8] = (wdata2 >> 16) & 0x3fff;
+          sampleValues[9] = (wdata2 >> 32) & 0x3fff;
+
+          // check gain
+          bool isSaturation = true;
+          short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1};
+          for (uint32_t si = 0; si < 10; si++) {
+            if (gainId(sampleValues[si]) == 0) {
+              firstGainZeroSampID = si;
+              firstGainZeroSampADC = adc(sampleValues[si]);
+              break;
+            }
+          }
+          if (firstGainZeroSampID != -1) {
+            unsigned int plateauEnd = std::min(10u, (unsigned int)(firstGainZeroSampID + 5));
+            for (unsigned int s = firstGainZeroSampID; s < plateauEnd; s++) {
+              if (gainId(sampleValues[s]) != 0 || adc(sampleValues[s]) != firstGainZeroSampADC) {
+                isSaturation = false;
+                break;
+              }  //it's not saturation
+            }
+            // get rid of channels which are stuck in gain0
+            if (firstGainZeroSampID < 3) {
+              isSaturation = false;
+            }
+            if (!isSaturation)
+              continue;
+          } else {  // there is no zero gainId sample
+            // gain switch check
+            short numGain = 1;
+            bool gainSwitchError = false;
+            for (unsigned int si = 1; si < 10; si++) {
+              if ((gainId(sampleValues[si - 1]) > gainId(sampleValues[si])) && numGain < 5)
+                gainSwitchError = true;
+              if (gainId(sampleValues[si - 1]) == gainId(sampleValues[si]))
+                numGain++;
+              else
+                numGain = 1;
+            }
+            if (gainSwitchError)
+              continue;
+          }
+
+          auto const pos = atomicAdd(pChannelsCounter, 1);
+
+          // store to global
+          ids[pos] = didraw;
+          samples[pos * 10] = sampleValues[0];
+          samples[pos * 10 + 1] = sampleValues[1];
+          samples[pos * 10 + 2] = sampleValues[2];
+          samples[pos * 10 + 3] = sampleValues[3];
+          samples[pos * 10 + 4] = sampleValues[4];
+          samples[pos * 10 + 5] = sampleValues[5];
+          samples[pos * 10 + 6] = sampleValues[6];
+          samples[pos * 10 + 7] = sampleValues[7];
+          samples[pos * 10 + 8] = sampleValues[8];
+          samples[pos * 10 + 9] = sampleValues[9];
+        }
+
+        current_tower_block += block_length;
+      }
+    }
+
+    // Enqueue the input copies, the unpacking kernel (one block per FED) and the copy-back of the channel counters.
+    void entryPoint(InputDataCPU const& inputCPU,
+                    InputDataGPU& inputGPU,
+                    OutputDataGPU& outputGPU,
+                    ScratchDataGPU& scratchGPU,
+                    OutputDataCPU& outputCPU,
+                    ConditionsProducts const& conditions,
+                    cudaStream_t cudaStream,
+                    uint32_t const nfedsWithData,
+                    uint32_t const nbytesTotal) {
+      // transfer
+      cudaCheck(cudaMemcpyAsync(inputGPU.data.get(),
+                                inputCPU.data.get(),
+                                nbytesTotal * sizeof(unsigned char),
+                                cudaMemcpyHostToDevice,
+                                cudaStream));
+      cudaCheck(cudaMemcpyAsync(inputGPU.offsets.get(),
+                                inputCPU.offsets.get(),
+                                nfedsWithData * sizeof(uint32_t),
+                                cudaMemcpyHostToDevice,
+                                cudaStream));
+      // reset the channel counters (EB + EE)
+      cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounter.get(), 0, sizeof(uint32_t) * 2, cudaStream));
+      cudaCheck(cudaMemcpyAsync(
+          inputGPU.feds.get(), inputCPU.feds.get(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
+      // an empty grid is not a valid launch configuration: with no FEDs the counters simply stay at zero
+      if (nfedsWithData > 0) {
+        kernel_unpack_test<32><<<nfedsWithData, 32, 0, cudaStream>>>(inputGPU.data.get(),
+                                                                     inputGPU.offsets.get(),
+                                                                     inputGPU.feds.get(),
+                                                                     outputGPU.digisEB.data.get(),
+                                                                     outputGPU.digisEE.data.get(),
+                                                                     outputGPU.digisEB.ids.get(),
+                                                                     outputGPU.digisEE.ids.get(),
+                                                                     scratchGPU.pChannelsCounter.get(),
+                                                                     conditions.eMappingProduct.eid2did,
+                                                                     nbytesTotal);
+        cudaCheck(cudaGetLastError());
+      }
+      // transfer the counters for how many eb and ee channels we got
+      cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.get(),
+                                scratchGPU.pChannelsCounter.get(),
+                                sizeof(uint32_t) * 2,
+                                cudaMemcpyDeviceToHost,
+                                cudaStream));
+    }
+
+  }  // namespace raw
+}  // namespace ecal
diff --git a/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h
new file mode 100644
index 0000000000000..d98906e7e24a7
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h
@@ -0,0 +1,23 @@
+#ifndef EventFilter_EcalRawToDigi_plugins_UnpackGPU_h
+#define EventFilter_EcalRawToDigi_plugins_UnpackGPU_h
+
+#include "DeclsForKernels.h"
+
+namespace ecal {
+  namespace raw {
+
+    // Entry point of the ECAL raw-data unpacker working on the given CUDA stream.
+    //
+    // The parameter names are omitted in this declaration; by type they are, in order:
+    //   - the input data on the host (read-only),
+    //   - the input data buffers on the device,
+    //   - the output data buffers on the device,
+    //   - the scratch buffers on the device,
+    //   - the output data on the host,
+    //   - the conditions products (read-only),
+    //   - the CUDA stream,
+    //   - two uint32_t values; presumably the number of FEDs with data and the total
+    //     size of the raw data in bytes - TODO confirm their order against the definition.
+    //
+    // FIXME: bundle up uint32_t values
+    void entryPoint(InputDataCPU const&,
+                    InputDataGPU&,
+                    OutputDataGPU&,
+                    ScratchDataGPU&,
+                    OutputDataCPU&,
+                    ConditionsProducts const&,
+                    cudaStream_t,
+                    uint32_t const,
+                    uint32_t const);
+
+  }  // namespace raw
+}  // namespace ecal
+
+#endif  // EventFilter_EcalRawToDigi_plugins_UnpackGPU_h
diff --git a/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py b/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py
index 849aaeeb414a4..00a54ad56c128 100644
--- a/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py
+++ b/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py
@@ -5,3 +5,24 @@
 ecalDigis = _ecalEBunpacker.clone()
 
 ecalDigisTask = cms.Task(ecalDigis)
+
+# process modifier to run on GPUs
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+# GPU-friendly EventSetup modules
+from EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi import ecalElectronicsMappingGPUESProducer
+
+# raw to digi on GPUs
+from EventFilter.EcalRawToDigi.ecalRawToDigiGPU_cfi import ecalRawToDigiGPU as _ecalRawToDigiGPU
+ecalDigisGPU = _ecalRawToDigiGPU.clone()
+
+# copy the digis from the GPU to the CPU and convert them to the legacy format;
+# the input labels point to the 'ebDigis' and 'eeDigis' products of ecalDigisGPU
+from EventFilter.EcalRawToDigi.ecalCPUDigisProducer_cfi import ecalCPUDigisProducer as _ecalCPUDigisProducer
+_ecalDigis_gpu = _ecalCPUDigisProducer.clone(
+  digisInLabelEB = ('ecalDigisGPU', 'ebDigis'),
+  digisInLabelEE = ('ecalDigisGPU', 'eeDigis'),
+  produceDummyIntegrityCollections = True
+)
+# with the gpu modifier, ecalDigis becomes the module defined just above,
+# so that it keeps its label while reading the digis unpacked on the GPU
+gpu.toReplaceWith(ecalDigis, _ecalDigis_gpu)
+
+# with the gpu modifier, the task holds the GPU mapping ESProducer, the GPU unpacker,
+# and the (replaced) ecalDigis module
+gpu.toReplaceWith(ecalDigisTask, cms.Task(ecalElectronicsMappingGPUESProducer, ecalDigisGPU, ecalDigis))
diff --git a/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc
new file mode 100644
index 0000000000000..8264c501a896c
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc
@@ -0,0 +1,57 @@
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "DataFormats/EcalDetId/interface/EcalElectronicsId.h"
+
+namespace ecal {
+  namespace raw {
+
+    // Builds the host-side lookup table from electronics id to detector id:
+    // the table is indexed by EcalElectronicsId::linearIndex() and stores the DetId raw id.
+    //
+    // TODO: 0x3FFFFF * 4B ~= 16MB
+    // tmp solution for linear mapping of eid -> did
+    //
+    // NOTE(review): the table holds 0x3FFFFF entries, so the largest valid index is 0x3FFFFE;
+    // confirm that linearIndex() can never return 0x3FFFFF.
+    ElectronicsMappingGPU::ElectronicsMappingGPU(EcalMappingElectronics const& mapping) : eid2did_(0x3FFFFF) {
+      // barrel: the item at position `hash` belongs to the crystal EBDetId::unhashIndex(hash)
+      // TODO: EB vector is actually empty
+      auto const& ebItems = mapping.barrelItems();
+      auto const nEB = ebItems.size();
+      for (unsigned int hash = 0; hash < nEB; ++hash) {
+        EBDetId detId{EBDetId::unhashIndex(hash)};
+        EcalElectronicsId elecId{ebItems[hash].electronicsid};
+        eid2did_[elecId.linearIndex()] = detId.rawId();
+      }
+
+      // endcap: the item at position `hash` belongs to the crystal EEDetId::unhashIndex(hash)
+      auto const& eeItems = mapping.endcapItems();
+      auto const nEE = eeItems.size();
+      for (unsigned int hash = 0; hash < nEE; ++hash) {
+        EEDetId detId{EEDetId::unhashIndex(hash)};
+        EcalElectronicsId elecId{eeItems[hash].electronicsid};
+        eid2did_[elecId.linearIndex()] = detId.rawId();
+      }
+    }
+
+    // Releases the device buffer holding the eid -> did table.
+    ElectronicsMappingGPU::Product::~Product() {
+      // eid2did is the pointer filled by cudaMalloc in getProduct()
+      cudaCheck(cudaFree(eid2did));
+    }
+
+    // Returns the device-side product obtained from product_.dataForCurrentDeviceAsync().
+    //
+    // The lambda passed to dataForCurrentDeviceAsync() fills a Product: it allocates
+    // eid2did_.size() uint32_t values on the device and queues the copy of the host table
+    // on the stream it is given.
+    // NOTE(review): presumably the lambda runs only the first time the product is requested
+    // for a device - confirm against the type of product_.
+    ElectronicsMappingGPU::Product const& ElectronicsMappingGPU::getProduct(cudaStream_t cudaStream) const {
+      auto const& product = product_.dataForCurrentDeviceAsync(
+          cudaStream, [this](ElectronicsMappingGPU::Product& product, cudaStream_t cudaStream) {
+            // malloc: one uint32_t per entry of the host table
+            cudaCheck(cudaMalloc((void**)&product.eid2did, this->eid2did_.size() * sizeof(uint32_t)));
+
+            // transfer: asynchronous host-to-device copy of the whole table
+            cudaCheck(cudaMemcpyAsync(product.eid2did,
+                                      this->eid2did_.data(),
+                                      this->eid2did_.size() * sizeof(uint32_t),
+                                      cudaMemcpyHostToDevice,
+                                      cudaStream));
+          });
+
+      return product;
+    }
+
+  }  // namespace raw
+}  // namespace ecal
+
+TYPELOOKUP_DATA_REG(ecal::raw::ElectronicsMappingGPU);
diff --git a/RecoLocalCalo/Configuration/python/customizeEcalOnlyForProfiling.py b/RecoLocalCalo/Configuration/python/customizeEcalOnlyForProfiling.py
new file mode 100644
index 0000000000000..4fa955bd33836
--- /dev/null
+++ b/RecoLocalCalo/Configuration/python/customizeEcalOnlyForProfiling.py
@@ -0,0 +1,51 @@
+import FWCore.ParameterSet.Config as cms
+
+# Customise the ECAL-only reconstruction to run on GPU
+#
+# Currently, this means running only the unpacker and multifit, up to the uncalibrated rechits
+def customizeEcalOnlyForProfilingGPUOnly(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('ecalMultiFitUncalibRecHitGPU')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
+
+
+# Customise the ECAL-only reconstruction to run on GPU, and copy the data to the host
+#
+# Currently, this means running only the unpacker and multifit, up to the uncalibrated rechits
+def customizeEcalOnlyForProfilingGPUWithHostCopy(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('ecalMultiFitUncalibRecHitSoA')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
+
+
+# Customise the ECAL-only reconstruction to run on GPU, copy the data to the host, and convert to legacy format
+#
+# Currently, this means running only the unpacker and multifit, up to the uncalibrated rechits, on the GPU
+# and the rechits producer on the CPU
+#
+# The same customisation can be also used on the CPU workflow, running up to the rechits on CPU.
+def customizeEcalOnlyForProfiling(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('ecalRecHit')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
diff --git a/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py b/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py
index 06fecf4787baf..75ae5fc0c202f 100644
--- a/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py
+++ b/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from Configuration.ProcessModifiers.gpu_cff import gpu
 
 # TPG condition needed by ecalRecHit producer if TT recovery is ON
 from RecoLocalCalo.EcalRecProducers.ecalRecHitTPGConditions_cff import *
@@ -43,6 +44,61 @@
 
 ecalOnlyLocalRecoSequence = cms.Sequence(ecalOnlyLocalRecoTask)
 
+# ECAL rechit calibrations on GPU: EventSetup producers added to the GPU task below
+from RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi import ecalRechitADCToGeVConstantGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi import ecalRechitChannelStatusGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi import ecalIntercalibConstantsGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi import ecalLaserAPDPNRatiosGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi import ecalLaserAPDPNRatiosRefGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi import ecalLaserAlphasGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi import ecalLinearCorrectionsGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalRecHitParametersGPUESProducer_cfi import ecalRecHitParametersGPUESProducer
+
+# ECAL rechits running on GPU, reading the uncalibrated rechits of ecalMultiFitUncalibRecHitGPU
+from RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi import ecalRecHitGPU as _ecalRecHitGPU
+ecalRecHitGPU = _ecalRecHitGPU.clone(
+    uncalibrecHitsInLabelEB = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEB'),
+    uncalibrecHitsInLabelEE = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEE')
+)
+
+# copy the rechits from GPU to CPU, reading the products of ecalRecHitGPU
+from RecoLocalCalo.EcalRecProducers.ecalCPURecHitProducer_cfi import ecalCPURecHitProducer as _ecalCPURecHitProducer
+ecalRecHitSoA = _ecalCPURecHitProducer.clone(
+    recHitsInLabelEB = cms.InputTag('ecalRecHitGPU', 'EcalRecHitsEB'),
+    recHitsInLabelEE = cms.InputTag('ecalRecHitGPU', 'EcalRecHitsEE')
+)
+
+# convert the rechits from SoA to legacy format, reading the products of ecalRecHitSoA
+from RecoLocalCalo.EcalRecProducers.ecalRecHitConvertGPU2CPUFormat_cfi import ecalRecHitConvertGPU2CPUFormat as _ecalRecHitConvertGPU2CPUFormat
+_ecalRecHit_gpu = _ecalRecHitConvertGPU2CPUFormat.clone(
+    recHitsLabelGPUEB = cms.InputTag('ecalRecHitSoA', 'EcalRecHitsEB'),
+    recHitsLabelGPUEE = cms.InputTag('ecalRecHitSoA', 'EcalRecHitsEE')
+)
+# TODO: the ECAL calibrated rechits produced on the GPU are not correct, yet.
+# When they are working and validated, remove this comment and uncomment the next line:
+#gpu.toReplaceWith(ecalRecHit, _ecalRecHit_gpu)
+
+# ECAL reconstruction on GPU: with the gpu modifier, ecalRecHitNoTPTask is replaced by this task
+gpu.toReplaceWith(ecalRecHitNoTPTask, cms.Task(
+  # ECAL rechit calibrations on GPU
+  ecalRechitADCToGeVConstantGPUESProducer,
+  ecalRechitChannelStatusGPUESProducer,
+  ecalIntercalibConstantsGPUESProducer,
+  ecalLaserAPDPNRatiosGPUESProducer,
+  ecalLaserAPDPNRatiosRefGPUESProducer,
+  ecalLaserAlphasGPUESProducer,
+  ecalLinearCorrectionsGPUESProducer,
+  ecalRecHitParametersGPUESProducer,
+  # ECAL rechits running on GPU
+  ecalRecHitGPU,
+  # copy the rechits from GPU to CPU
+  ecalRecHitSoA,
+  # ecalRecHit module: as long as the replacement with _ecalRecHit_gpu above stays commented out,
+  # this is not the SoA-to-legacy converter
+  ecalRecHit,
+  # ECAL preshower rechit legacy module
+  ecalPreshowerRecHit
+))
+
 # Phase 2 modifications
 from RecoLocalCalo.EcalRecProducers.ecalDetailedTimeRecHit_cfi import *
 _phase2_timing_ecalRecHitTask = cms.Task( ecalRecHitTask.copy() , ecalDetailedTimeRecHit )
diff --git a/RecoLocalCalo/EcalRecAlgos/BuildFile.xml b/RecoLocalCalo/EcalRecAlgos/BuildFile.xml
index 2eaf053c342dd..c2858ae76d7bc 100644
--- a/RecoLocalCalo/EcalRecAlgos/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecAlgos/BuildFile.xml
@@ -1,9 +1,13 @@
 <use name="clhep"/>
+<use name="cuda"/>
+<use name="CUDADataFormats/EcalRecHitSoA"/>
 <use name="DataFormats/EcalRecHit"/>
 <use name="DataFormats/EcalDigi"/>
 <use name="FWCore/MessageLogger"/>
 <use name="FWCore/ParameterSet"/>
 <use name="FWCore/Framework"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="CondFormats/ESObjects"/>
 <use name="CondFormats/EcalObjects"/>
 <use name="CondFormats/DataRecord"/>
@@ -11,6 +15,7 @@
 <use name="root"/>
 <use name="rootminuit"/>
 <use name="eigen"/>
+
 <export>
   <lib name="1"/>
 </export>
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml
new file mode 100644
index 0000000000000..4c98171091b84
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml
@@ -0,0 +1,17 @@
+<bin name="makeEcalMultifitResultsGpuValidationPlots" file="makeEcalMultifitResultsGpuValidationPlots.cpp">
+    <use name="root"/>
+    <use name="rootgraphics"/>
+    <use name="CUDADataFormats/EcalRecHitSoA"/>
+    <use name="DataFormats/Common"/>
+    <use name="DataFormats/EcalRecHit"/>
+</bin>
+
+<bin name="makeEcalRechitValidationPlots" file="makeEcalRechitValidationPlots.cpp">
+  <use name="root"/>
+  <use name="rootgraphics"/>
+  <use name="CUDADataFormats/EcalRecHitSoA"/>
+  <use name="DataFormats/Common"/>
+  <use name="DataFormats/EcalRecHit"/>
+</bin>
+
+
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
new file mode 100644
index 0000000000000..f010e3afdbb18
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
@@ -0,0 +1,564 @@
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <TCanvas.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TTree.h>
+#include <TPaveStats.h>
+
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h"
+
+#include "TStyle.h"
+
+// Label the axes of a GPU-vs-CPU histogram: "cpu" on x, "gpu" on y.
+void setAxis(TH2D *histo) {
+  auto *xAxis = histo->GetXaxis();
+  auto *yAxis = histo->GetYaxis();
+  xAxis->SetTitle("cpu");
+  yAxis->SetTitle("gpu");
+}
+
+// Label the axes of a difference-vs-CPU histogram: "cpu" on x, "#Delta gpu-cpu" on y.
+void setAxisDelta(TH2D *histo) {
+  auto *xAxis = histo->GetXaxis();
+  auto *yAxis = histo->GetYaxis();
+  xAxis->SetTitle("cpu");
+  yAxis->SetTitle("#Delta gpu-cpu");
+}
+
+namespace {
+
+  // English ordinal suffix ("st", "nd", "rd" or "th") of a non-negative number,
+  // including the special cases 11th, 12th and 13th.
+  char const *ordinalSuffix(int n) {
+    static char const *const suffixes[] = {"th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th"};
+    int const lastTwoDigits = n % 100;
+    if (lastTwoDigits >= 11 and lastTwoDigits <= 13)
+      return "th";
+    return suffixes[n % 10];
+  }
+
+  // Histograms filled hit by hit for one sub-detector (EB or EE); the histograms are not owned.
+  struct HitComparisonPlots {
+    TH1D *ampGPU;
+    TH1D *ampCPU;
+    TH2D *ampGPUvsCPU;
+    TH2D *ampDeltaVsCPU;
+    TH1D *ampRatio;
+    TH1D *chi2GPU;
+    TH1D *chi2CPU;
+    TH2D *chi2GPUvsCPU;
+    TH2D *chi2DeltaVsCPU;
+    TH1D *chi2Ratio;
+    TH1D *flagsGPU;
+    TH1D *flagsCPU;
+    TH2D *flagsGPUvsCPU;
+    TH2D *flagsDeltaVsCPU;
+    TH1D *flagsRatio;
+  };
+
+  // Compare every GPU hit of one sub-detector with the CPU hit of the same DetId:
+  // fill the plots and print the hits whose amplitude, chi2 or flags differ.
+  //
+  //   gpuHits     SoA collection with the did, amplitude, chi2 and flags arrays
+  //   cpuHits     legacy collection, searched by DetId; its size drives the loop
+  //   plots       histograms to fill
+  //   ie          index of the event, used in the printouts
+  //   label       name of the sub-detector used in the printouts ("EB" or "EE")
+  //   missingMsg  text printed in front of the DetId of a GPU hit without a CPU counterpart
+  //   chidOffset  value added to the hit index in the "chid" printout
+  //   eps_diff    absolute difference in amplitude or chi2 from which a hit is reported
+  template <typename GpuHits, typename CpuHits>
+  void compareHits(GpuHits const &gpuHits,
+                   CpuHits const &cpuHits,
+                   HitComparisonPlots const &plots,
+                   int ie,
+                   char const *label,
+                   char const *missingMsg,
+                   uint32_t chidOffset,
+                   float eps_diff) {
+    auto const nhits = cpuHits.size();
+    for (uint32_t i = 0; i < nhits; ++i) {
+      auto const did_gpu = gpuHits.did[i];
+      auto const soi_amp_gpu = gpuHits.amplitude[i];
+      auto const cpu_iter = cpuHits.find(DetId{did_gpu});
+      if (cpu_iter == cpuHits.end()) {
+        std::cerr << ie << ordinalSuffix(ie) << " entry\n" << missingMsg << did_gpu << " in a CPU collection\n";
+        continue;
+      }
+      auto const soi_amp_cpu = cpu_iter->amplitude();
+      auto const chi2_gpu = gpuHits.chi2[i];
+      auto const chi2_cpu = cpu_iter->chi2();
+
+      auto const flags_gpu = gpuHits.flags[i];
+      auto const flags_cpu = cpu_iter->flags();
+
+      plots.ampGPU->Fill(soi_amp_gpu);
+      plots.ampCPU->Fill(soi_amp_cpu);
+      plots.ampGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
+      plots.ampDeltaVsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu);
+      if (soi_amp_cpu > 0)
+        plots.ampRatio->Fill((float)soi_amp_gpu / soi_amp_cpu);
+
+      plots.chi2GPU->Fill(chi2_gpu);
+      plots.chi2CPU->Fill(chi2_cpu);
+      plots.chi2GPUvsCPU->Fill(chi2_cpu, chi2_gpu);
+      plots.chi2DeltaVsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
+      if (chi2_cpu > 0)
+        plots.chi2Ratio->Fill((float)chi2_gpu / chi2_cpu);
+
+      // print the hits whose chi2 or amplitude differ by more than 5%
+      if (std::abs(chi2_gpu / chi2_cpu - 1) > 0.05 || std::abs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) {
+        std::cout << " ---- " << label << "  " << std::endl;
+        std::cout << " eventid = " << ie << " xtal = " << i << std::endl;
+        std::cout << " chi2_gpu    = " << chi2_gpu << " chi2_cpu =    " << chi2_cpu << std::endl;
+        std::cout << " soi_amp_gpu = " << soi_amp_gpu << " soi_amp_cpu = " << soi_amp_cpu << std::endl;
+        std::cout << " flags_gpu   = " << flags_gpu << " flags_cpu =   " << flags_cpu << std::endl;
+      }
+
+      plots.flagsGPU->Fill(flags_gpu);
+      plots.flagsCPU->Fill(flags_cpu);
+      plots.flagsGPUvsCPU->Fill(flags_cpu, flags_gpu);
+      // subtract as doubles: with unsigned flags a negative difference would wrap around
+      plots.flagsDeltaVsCPU->Fill(flags_cpu, static_cast<double>(flags_gpu) - static_cast<double>(flags_cpu));
+      if (flags_cpu > 0)
+        plots.flagsRatio->Fill((float)flags_gpu / flags_cpu);
+
+      if (flags_cpu != flags_gpu) {
+        std::cout << "    >>  No! Different flag cpu:gpu = " << flags_cpu << " : " << flags_gpu;
+        std::cout << std::endl;
+      }
+
+      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or
+          std::isnan(chi2_gpu) or (flags_cpu != flags_gpu)) {
+        // the arguments are cast to the types expected by the format string
+        printf("%s eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
+               label,
+               ie,
+               static_cast<int>(chidOffset + i),
+               static_cast<double>(soi_amp_gpu),
+               static_cast<double>(soi_amp_cpu),
+               static_cast<double>(chi2_gpu),
+               static_cast<double>(chi2_cpu));
+        if (std::isnan(chi2_gpu))
+          printf("*** nan ***\n");
+      }
+    }
+  }
+
+  // Draw the CPU and the GPU version of a distribution on the current pad, in log scale,
+  // and move the statistics box of the GPU histogram down by its own height
+  // (presumably so that it does not hide the box of the CPU histogram).
+  //
+  //   cpuColor     line colour of the CPU histogram; the GPU histogram is always blue
+  //   lineWidth    line width of both histograms
+  //   padToUpdate  pad (or canvas) updated after drawing, before looking for the statistics box
+  void drawCpuGpuOverlay(TH1D *cpu, TH1D *gpu, Color_t cpuColor, Width_t lineWidth, TVirtualPad *padToUpdate) {
+    gPad->SetLogy();
+    cpu->SetLineColor(cpuColor);
+    cpu->SetLineWidth(lineWidth);
+    cpu->Draw("");
+    gpu->SetLineColor(kBlue);
+    gpu->SetLineWidth(lineWidth);
+    gpu->Draw("sames");
+    padToUpdate->Update();
+    auto stats = static_cast<TPaveStats *>(gpu->FindObject("stats"));
+    // nothing to move if the histogram has no statistics box
+    if (stats == nullptr)
+      return;
+    auto const y2 = stats->GetY2NDC();
+    auto const y1 = stats->GetY1NDC();
+    stats->SetY2NDC(y1);
+    stats->SetY1NDC(y1 - (y2 - y1));
+  }
+
+  // Fill a canvas divided in 3x2 pads with the comparison of one quantity and save it as
+  // <baseName>.root and <baseName>.png: EB on the top row, EE on the bottom row;
+  // CPU/GPU overlay on the left, GPU vs CPU in the middle, GPU/CPU ratio on the right.
+  void drawComparisonPage(TCanvas &canvas,
+                          TH1D *ebCPU,
+                          TH1D *ebGPU,
+                          TH1D *eeCPU,
+                          TH1D *eeGPU,
+                          TH2D *ebGPUvsCPU,
+                          TH2D *eeGPUvsCPU,
+                          TH1D *ebRatio,
+                          TH1D *eeRatio,
+                          std::string const &baseName) {
+    canvas.cd(1);
+    drawCpuGpuOverlay(ebCPU, ebGPU, kBlack, 1, gPad);
+
+    canvas.cd(4);
+    drawCpuGpuOverlay(eeCPU, eeGPU, kBlack, 1, gPad);
+
+    canvas.cd(2);
+    gPad->SetGrid();
+    ebGPUvsCPU->Draw("COLZ");
+
+    canvas.cd(5);
+    gPad->SetGrid();
+    eeGPUvsCPU->Draw("COLZ");
+
+    canvas.cd(3);
+    ebRatio->Draw("");
+
+    canvas.cd(6);
+    eeRatio->Draw("");
+
+    canvas.SaveAs((baseName + ".root").c_str());
+    canvas.SaveAs((baseName + ".png").c_str());
+  }
+
+}  // namespace
+
+// Compare the ECAL uncalibrated rechits reconstructed on GPU and on CPU.
+//
+// Usage: makeEcalMultifitResultsGpuValidationPlots <path to input file> <output file>
+//
+// Reads the "Events" tree of the input file, fills the comparison histograms, writes them
+// to the output file, and saves the summary canvases as ecal-amplitudes, ecal-chi2,
+// ecal-flags and ecal-rechits (.root and .png) in the current directory.
+// Returns 0 on success, 1 if the arguments are missing or the input cannot be read.
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    std::cout << "run with: ./makeEcalMultifitResultsGpuValidationPlots <path to input file> <output file>\n";
+    return 1;
+  }
+
+  gStyle->SetOptStat("ourme");
+
+  edm::Wrapper<ecal::UncalibratedRecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>> *wgpuEB =
+      nullptr;
+  edm::Wrapper<ecal::UncalibratedRecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>> *wgpuEE =
+      nullptr;
+  edm::Wrapper<EBUncalibratedRecHitCollection> *wcpuEB = nullptr;
+  edm::Wrapper<EEUncalibratedRecHitCollection> *wcpuEE = nullptr;
+
+  std::string fileName = argv[1];
+  std::string outFileName = argv[2];
+
+  // output: opened before the histograms are created
+  TFile rfout{outFileName.c_str(), "recreate"};
+
+  int nbins_count = 200;
+  float last_count = 5000.;
+  int nbins_count_delta = 201;
+  // the GPU vs CPU plots of the number of rechits use one bin per unit
+  // NOTE(review): this makes two TH2D of 5000 x 5000 bins; confirm that nbins_count was not intended
+  int nbins_count_2d = static_cast<int>(last_count);
+
+  int nbins = 300;
+  float last = 3000.;
+
+  int nbins_chi2 = 1000;
+  float last_chi2 = 200.;
+
+  int nbins_flags = 100;
+  float last_flags = 100.;
+  float delta_flags = 20;
+
+  int nbins_delta = 201;  // use an odd number to center around 0
+  float delta = 0.2;
+
+  // RecHits plots for EB and EE on both GPU and CPU
+  auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins_count, 0, last_count);
+  auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins_count, 0, last_count);
+  auto hRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins_count, 0, last_count);
+  auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins_count, 0, last_count);
+  auto hRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1);
+  auto hRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1);
+
+  auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last);
+  auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last);
+  auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last);
+  auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last);
+  auto hSOIAmplitudesEBGPUCPUratio =
+      new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  auto hSOIAmplitudesEEGPUCPUratio =
+      new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+
+  auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2);
+  auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2);
+  auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2);
+  auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2);
+  auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+
+  auto hFlagsEBGPU = new TH1D("hFlagsEBGPU", "hFlagsEBGPU", nbins_flags, 0, last_flags);
+  auto hFlagsEEGPU = new TH1D("hFlagsEEGPU", "hFlagsEEGPU", nbins_flags, 0, last_flags);
+  auto hFlagsEBCPU = new TH1D("hFlagsEBCPU", "hFlagsEBCPU", nbins_flags, 0, last_flags);
+  auto hFlagsEECPU = new TH1D("hFlagsEECPU", "hFlagsEECPU", nbins_flags, 0, last_flags);
+  auto hFlagsEBGPUCPUratio = new TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+
+  auto hSOIAmplitudesEBGPUvsCPU =
+      new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  setAxis(hSOIAmplitudesEBGPUvsCPU);
+  auto hSOIAmplitudesEEGPUvsCPU =
+      new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  setAxis(hSOIAmplitudesEEGPUvsCPU);
+  auto hSOIAmplitudesEBdeltavsCPU =
+      new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  setAxisDelta(hSOIAmplitudesEBdeltavsCPU);
+  auto hSOIAmplitudesEEdeltavsCPU =
+      new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  setAxisDelta(hSOIAmplitudesEEdeltavsCPU);
+
+  auto hChi2EBGPUvsCPU =
+      new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
+  setAxis(hChi2EBGPUvsCPU);
+  auto hChi2EEGPUvsCPU =
+      new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
+  setAxis(hChi2EEGPUvsCPU);
+  auto hChi2EBdeltavsCPU =
+      new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  setAxisDelta(hChi2EBdeltavsCPU);
+  auto hChi2EEdeltavsCPU =
+      new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  setAxisDelta(hChi2EEdeltavsCPU);
+
+  auto hFlagsEBGPUvsCPU =
+      new TH2D("hFlagsEBGPUvsCPU", "hFlagsEBGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags);
+  setAxis(hFlagsEBGPUvsCPU);
+  auto hFlagsEEGPUvsCPU =
+      new TH2D("hFlagsEEGPUvsCPU", "hFlagsEEGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags);
+  setAxis(hFlagsEEGPUvsCPU);
+  auto hFlagsEBdeltavsCPU = new TH2D(
+      "hFlagsEBdeltavsCPU", "hFlagsEBdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags);
+  setAxisDelta(hFlagsEBdeltavsCPU);
+  auto hFlagsEEdeltavsCPU = new TH2D(
+      "hFlagsEEdeltavsCPU", "hFlagsEEdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags);
+  setAxisDelta(hFlagsEEdeltavsCPU);
+
+  auto hRechitsEBGPUvsCPU = new TH2D(
+      "RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", nbins_count_2d, 0, last_count, nbins_count_2d, 0, last_count);
+  setAxis(hRechitsEBGPUvsCPU);
+  auto hRechitsEEGPUvsCPU = new TH2D(
+      "RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", nbins_count_2d, 0, last_count, nbins_count_2d, 0, last_count);
+  setAxis(hRechitsEEGPUvsCPU);
+  auto hRechitsEBdeltavsCPU = new TH2D(
+      "RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta);
+  setAxisDelta(hRechitsEBdeltavsCPU);
+  auto hRechitsEEdeltavsCPU = new TH2D(
+      "RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta);
+  setAxisDelta(hRechitsEEdeltavsCPU);
+
+  // histograms filled hit by hit, grouped by sub-detector
+  HitComparisonPlots const ebPlots{hSOIAmplitudesEBGPU,
+                                   hSOIAmplitudesEBCPU,
+                                   hSOIAmplitudesEBGPUvsCPU,
+                                   hSOIAmplitudesEBdeltavsCPU,
+                                   hSOIAmplitudesEBGPUCPUratio,
+                                   hChi2EBGPU,
+                                   hChi2EBCPU,
+                                   hChi2EBGPUvsCPU,
+                                   hChi2EBdeltavsCPU,
+                                   hChi2EBGPUCPUratio,
+                                   hFlagsEBGPU,
+                                   hFlagsEBCPU,
+                                   hFlagsEBGPUvsCPU,
+                                   hFlagsEBdeltavsCPU,
+                                   hFlagsEBGPUCPUratio};
+  HitComparisonPlots const eePlots{hSOIAmplitudesEEGPU,
+                                   hSOIAmplitudesEECPU,
+                                   hSOIAmplitudesEEGPUvsCPU,
+                                   hSOIAmplitudesEEdeltavsCPU,
+                                   hSOIAmplitudesEEGPUCPUratio,
+                                   hChi2EEGPU,
+                                   hChi2EECPU,
+                                   hChi2EEGPUvsCPU,
+                                   hChi2EEdeltavsCPU,
+                                   hChi2EEGPUCPUratio,
+                                   hFlagsEEGPU,
+                                   hFlagsEECPU,
+                                   hFlagsEEGPUvsCPU,
+                                   hFlagsEEdeltavsCPU,
+                                   hFlagsEEGPUCPUratio};
+
+  // input
+  std::cout << "validating file " << fileName << std::endl;
+  TFile rf{fileName.c_str()};
+  if (rf.IsZombie()) {
+    std::cerr << "could not open the input file " << fileName << std::endl;
+    return 1;
+  }
+  auto *rt = dynamic_cast<TTree *>(rf.Get("Events"));
+  if (rt == nullptr) {
+    std::cerr << "could not find the \"Events\" tree in " << fileName << std::endl;
+    return 1;
+  }
+  rt->SetBranchAddress(
+      "calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_"
+      "EcalUncalibRecHitsEB_RECO.",
+      &wgpuEB);
+  rt->SetBranchAddress(
+      "calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_"
+      "EcalUncalibRecHitsEE_RECO.",
+      &wgpuEE);
+  rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB);
+  rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE);
+
+  constexpr float eps_diff = 1e-3;
+
+  // accumulate
+  auto const nentries = rt->GetEntries();
+  std::cout << "#events to validate over: " << nentries << std::endl;
+  for (int ie = 0; ie < nentries; ++ie) {
+    rt->GetEntry(ie);
+
+    // a pointer that is still null here means that its branch could not be read
+    if (wgpuEB == nullptr or wgpuEE == nullptr or wcpuEB == nullptr or wcpuEE == nullptr) {
+      std::cerr << "could not read the GPU and CPU uncalibrated rechits from " << fileName << std::endl;
+      return 1;
+    }
+
+    auto const &gpuEB = wgpuEB->bareProduct();
+    auto const &gpuEE = wgpuEE->bareProduct();
+    auto const &cpuEB = wcpuEB->bareProduct();
+    auto const &cpuEE = wcpuEE->bareProduct();
+
+    auto const cpu_eb_size = cpuEB.size();
+    auto const cpu_ee_size = cpuEE.size();
+    auto const gpu_eb_size = gpuEB.amplitude.size();
+    auto const gpu_ee_size = gpuEE.amplitude.size();
+
+    float const eb_ratio = (float)gpu_eb_size / cpu_eb_size;
+    float const ee_ratio = (float)gpu_ee_size / cpu_ee_size;
+
+    // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
+    hRechitsEBGPU->Fill(gpu_eb_size);
+    hRechitsEBCPU->Fill(cpu_eb_size);
+    hRechitsEEGPU->Fill(gpu_ee_size);
+    hRechitsEECPU->Fill(cpu_ee_size);
+    hRechitsEBGPUvsCPU->Fill(cpu_eb_size, gpu_eb_size);
+    hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size);
+    hRechitsEBGPUCPUratio->Fill(eb_ratio);
+    hRechitsEEGPUCPUratio->Fill(ee_ratio);
+    // subtract as doubles: the sizes are unsigned, and a negative difference would wrap around
+    hRechitsEBdeltavsCPU->Fill(cpu_eb_size, static_cast<double>(gpu_eb_size) - static_cast<double>(cpu_eb_size));
+    hRechitsEEdeltavsCPU->Fill(cpu_ee_size, static_cast<double>(gpu_ee_size) - static_cast<double>(cpu_ee_size));
+
+    // the hit-by-hit comparison is done only for the events with the same number of hits
+    if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
+      std::cerr << ie << ordinalSuffix(ie) << " entry:\n"
+                << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size
+                << " (gpu)\n"
+                << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size
+                << " (gpu)" << std::endl;
+      continue;
+    }
+
+    compareHits(gpuEB, cpuEB, ebPlots, ie, "EB", "  Did not find a DetId ", 0, eps_diff);
+    // the EE channels are numbered after the EB ones in the printouts
+    compareHits(gpuEE, cpuEE, eePlots, ie, "EE", "  did not find a DetId ", static_cast<uint32_t>(cpu_eb_size), eps_diff);
+  }
+
+  {
+    // the same canvas is reused for the amplitudes, the chi2 and the flags
+    TCanvas c("plots", "plots", 1750, 860);
+    c.Divide(3, 2);
+
+    drawComparisonPage(c,
+                       hSOIAmplitudesEBCPU,
+                       hSOIAmplitudesEBGPU,
+                       hSOIAmplitudesEECPU,
+                       hSOIAmplitudesEEGPU,
+                       hSOIAmplitudesEBGPUvsCPU,
+                       hSOIAmplitudesEEGPUvsCPU,
+                       hSOIAmplitudesEBGPUCPUratio,
+                       hSOIAmplitudesEEGPUCPUratio,
+                       "ecal-amplitudes");
+
+    // chi2
+    drawComparisonPage(c,
+                       hChi2EBCPU,
+                       hChi2EBGPU,
+                       hChi2EECPU,
+                       hChi2EEGPU,
+                       hChi2EBGPUvsCPU,
+                       hChi2EEGPUvsCPU,
+                       hChi2EBGPUCPUratio,
+                       hChi2EEGPUCPUratio,
+                       "ecal-chi2");
+
+    // flags
+    drawComparisonPage(c,
+                       hFlagsEBCPU,
+                       hFlagsEBGPU,
+                       hFlagsEECPU,
+                       hFlagsEEGPU,
+                       hFlagsEBGPUvsCPU,
+                       hFlagsEEGPUvsCPU,
+                       hFlagsEBGPUCPUratio,
+                       hFlagsEEGPUCPUratio,
+                       "ecal-flags");
+
+    TCanvas cRechits("Rechits", "Rechits", 1750, 860);
+    cRechits.Divide(3, 2);
+
+    // Plotting the sizes of GPU vs CPU for each event of EB
+    cRechits.cd(1);
+    drawCpuGpuOverlay(hRechitsEBCPU, hRechitsEBGPU, kRed, 2, &cRechits);
+
+    // and of EE
+    cRechits.cd(4);
+    drawCpuGpuOverlay(hRechitsEECPU, hRechitsEEGPU, kRed, 2, &cRechits);
+
+    cRechits.cd(2);
+    hRechitsEBGPUvsCPU->Draw("COLZ");
+
+    cRechits.cd(5);
+    hRechitsEEGPUvsCPU->Draw("COLZ");
+
+    cRechits.cd(3);
+    gPad->SetLogy();
+    hRechitsEBGPUCPUratio->Draw("");
+
+    cRechits.cd(6);
+    gPad->SetLogy();
+    hRechitsEEGPUCPUratio->Draw("");
+
+    cRechits.SaveAs("ecal-rechits.root");
+    cRechits.SaveAs("ecal-rechits.png");
+  }
+
+  rf.Close();
+  rfout.Write();
+  rfout.Close();
+
+  return 0;
+}
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp
new file mode 100644
index 0000000000000..42d1fceaf8b76
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp
@@ -0,0 +1,864 @@
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <fstream>
+
+#include <TCanvas.h>
+#include <TStyle.h>
+#include <TPad.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TTree.h>
+#include <TPaveStats.h>
+
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h"
+
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    std::cout << "run with: ./makeEcalRechitValidationPlots <path to input file> <output file>\n";
+    exit(0);
+  }
+  // Set the GPU and CPU pointers for both EB and EE
+  edm::Wrapper<ecal::RecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>> *wgpuEB = nullptr;
+  edm::Wrapper<ecal::RecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>> *wgpuEE = nullptr;
+  edm::Wrapper<EBRecHitCollection> *wcpuEB = nullptr;
+  edm::Wrapper<EERecHitCollection> *wcpuEE = nullptr;
+
+  std::string fileName = argv[1];     // The input file containing the data to be validated (i.e. result.root)
+  std::string outFileName = argv[2];  //The output file in which the validation results will be saved (i.e. output.root)
+
+  //output
+  TFile rfout{outFileName.c_str(), "recreate"};
+
+  int nbins = 200;
+  int last = 5000.;
+
+  int nbins_energy = 300;
+  float last_energy = 2.;
+
+  int nbins_chi2 = 200;
+  float last_chi2 = 100.;
+
+  int nbins_flag = 40;
+  //   int nbins_flag = 1000;
+  int last_flag = 1500;
+  //   int nbins_flag = 40;
+  //   int last_flag = 10000;
+
+  int nbins_extra = 200;
+  int last_extra = 200;
+
+  int nbins_delta = 201;  // use an odd number to center around 0
+  float delta = 0.2;
+
+  // RecHits multiplicity plots for EB and EE on both GPU and CPU (all GPU entries, no selection applied)
+  auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits. No Filter GPU", nbins, 0, last);
+  auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits. No Filter GPU", nbins, 0, last);
+  auto hRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits. No Filter GPU", nbins, 0, last);
+  auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits. No Filter GPU", nbins, 0, last);
+  auto hRechitsEBGPUvsCPU =
+      new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last);
+  auto hRechitsEEGPUvsCPU =
+      new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last);
+  auto hRechitsEBGPUCPUratio =
+      new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05);
+  auto hRechitsEEGPUCPUratio =
+      new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05);
+  auto hRechitsEBdeltavsCPU =
+      new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hRechitsEEdeltavsCPU =
+      new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta);
+
+  // Selected RecHits multiplicity plots for EB and EE: GPU entries with energy >= 0 vs. all CPU RecHits
+  auto hSelectedRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last);
+  auto hSelectedRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last);
+  auto hSelectedRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last);
+  auto hSelectedRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins, 0, last);
+  auto hSelectedRechitsEBGPUvsCPU =
+      new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hSelectedRechitsEEGPUvsCPU =
+      new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hSelectedRechitsEBGPUCPUratio =
+      new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hSelectedRechitsEEGPUCPUratio =
+      new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hSelectedRechitsEBdeltavsCPU =
+      new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hSelectedRechitsEEdeltavsCPU =
+      new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+
+  // Positive RecHits multiplicity plots for EB and EE: entries with energy > 0 on both GPU and CPU
+  auto hPositiveRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last);
+  auto hPositiveRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last);
+  auto hPositiveRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last);
+  auto hPositiveRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins, 0, last);
+  auto hPositiveRechitsEBGPUvsCPU =
+      new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hPositiveRechitsEEGPUvsCPU =
+      new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hPositiveRechitsEBGPUCPUratio =
+      new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hPositiveRechitsEEGPUCPUratio =
+      new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hPositiveRechitsEBdeltavsCPU =
+      new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hPositiveRechitsEEdeltavsCPU =
+      new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+
+  // Energies plots for EB and EE on both GPU and CPU
+  auto hEnergiesEBGPU = new TH1D("EnergiesEBGPU", "EnergiesEBGPU; Energy [GeV]", nbins_energy, 0, last_energy);
+  auto hEnergiesEEGPU = new TH1D("EnergiesEEGPU", "EnergiesEEGPU; Energy [GeV]", nbins_energy, 0, last_energy);
+  auto hEnergiesEBCPU = new TH1D("EnergiesEBCPU", "EnergiesEBCPU; Energy [GeV]", nbins_energy, 0, last_energy);
+  auto hEnergiesEECPU = new TH1D("EnergiesEECPU", "EnergiesEECPU; Energy [GeV]", nbins_energy, 0, last_energy);
+  auto hEnergiesEBGPUvsCPU = new TH2D(
+      "EnergiesEBGPUvsCPU", "EnergiesEBGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy);
+  auto hEnergiesEEGPUvsCPU = new TH2D(
+      "EnergiesEEGPUvsCPU", "EnergiesEEGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy);
+  auto hEnergiesEBGPUCPUratio = new TH1D("EnergiesEBGPU/CPUratio", "EnergiesEBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
+  auto hEnergiesEEGPUCPUratio = new TH1D("EnergiesEEGPU/CPUratio", "EnergiesEEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
+  auto hEnergiesEBdeltavsCPU =
+      new TH2D("EnergiesEBdeltavsCPU", "EnergiesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hEnergiesEEdeltavsCPU =
+      new TH2D("EnergiesEEdeltavsCPU", "EnergiesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+
+  // Chi2 plots for EB and EE on both GPU and CPU
+  auto hChi2EBGPU = new TH1D("Chi2EBGPU", "Chi2EBGPU; Ch^{2}", nbins_chi2, 0, last_chi2);
+  auto hChi2EEGPU = new TH1D("Chi2EEGPU", "Chi2EEGPU; Ch^{2}", nbins_chi2, 0, last_chi2);
+  auto hChi2EBCPU = new TH1D("Chi2EBCPU", "Chi2EBCPU; Ch^{2}", nbins_chi2, 0, last_chi2);
+  auto hChi2EECPU = new TH1D("Chi2EECPU", "Chi2EECPU; Ch^{2}", nbins_chi2, 0, last_chi2);
+  auto hChi2EBGPUvsCPU = new TH2D("Chi2EBGPUvsCPU", "Chi2EBGPUvsCPU; CPU; GPU", nbins_chi2, 0, 100, nbins_chi2, 0, 100);
+  auto hChi2EEGPUvsCPU = new TH2D("Chi2EEGPUvsCPU", "Chi2EEGPUvsCPU; CPU; GPU", nbins_chi2, 0, 100, nbins_chi2, 0, 100);
+  auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
+  auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
+  auto hChi2EBdeltavsCPU =
+      new TH2D("Chi2EBdeltavsCPU", "Chi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  auto hChi2EEdeltavsCPU =
+      new TH2D("Chi2EEdeltavsCPU", "Chi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+
+  // Flags plots for EB and EE on both GPU and CPU
+  auto hFlagsEBGPU = new TH1D("FlagsEBGPU", "FlagsEBGPU; Flags", nbins_flag, -10, last_flag);
+  auto hFlagsEBCPU = new TH1D("FlagsEBCPU", "FlagsEBCPU; Flags", nbins_flag, -10, last_flag);
+  auto hFlagsEEGPU = new TH1D("FlagsEEGPU", "FlagsEEGPU; Flags", nbins_flag, -10, last_flag);
+  auto hFlagsEECPU = new TH1D("FlagsEECPU", "FlagsEECPU; Flags", nbins_flag, -10, last_flag);
+  auto hFlagsEBGPUvsCPU =
+      new TH2D("FlagsEBGPUvsCPU", "FlagsEBGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag);
+  auto hFlagsEEGPUvsCPU =
+      new TH2D("FlagsEEGPUvsCPU", "FlagsEEGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag);
+  auto hFlagsEBGPUCPUratio = new TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 50, -5, 10);
+  auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 50, -5, 10);
+  auto hFlagsEBdeltavsCPU =
+      new TH2D("FlagsEBdeltavsCPU", "FlagsEBdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta);
+  auto hFlagsEEdeltavsCPU =
+      new TH2D("FlagsEEdeltavsCPU", "FlagsEEdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta);
+
+  // Extras plots for EB and EE on both GPU and CPU
+  auto hExtrasEBGPU = new TH1D("ExtrasEBGPU", "ExtrasEBGPU; No. of Extras", nbins_extra, 0, last_extra);
+  auto hExtrasEBCPU = new TH1D("ExtrasEBCPU", "ExtrasEBCPU; No. of Extras", nbins_extra, 0, last_extra);
+  auto hExtrasEEGPU = new TH1D("ExtrasEEGPU", "ExtrasEEGPU; No. of Extras", nbins_extra, 0, last_extra);
+  auto hExtrasEECPU = new TH1D("ExtrasEECPU", "ExtrasEECPU; No. of Extras", nbins_extra, 0, last_extra);
+  auto hExtrasEBGPUvsCPU = new TH2D(
+      "ExtrasEBGPUvsCPU", "ExtrasEBGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra);
+  auto hExtrasEEGPUvsCPU = new TH2D(
+      "ExtrasEEGPUvsCPU", "ExtrasEEGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra);
+  auto hExtrasEBGPUCPUratio = new TH1D("ExtrasEBGPU/CPUratio", "ExtrasEBGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0);
+  auto hExtrasEEGPUCPUratio = new TH1D("ExtrasEEGPU/CPUratio", "ExtrasEEGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0);
+  auto hExtrasEBdeltavsCPU =
+      new TH2D("ExtrasEBdeltavsCPU", "ExtrasEBdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta);
+  auto hExtrasEEdeltavsCPU =
+      new TH2D("ExtrasEEdeltavsCPU", "ExtrasEEdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta);
+
+  // input file setup for tree
+  std::cout << "validating file " << fileName << std::endl;
+  TFile rf{fileName.c_str()};
+  TTree *rt = (TTree *)rf.Get("Events");
+
+  // Bind the GPU and CPU RecHit branches of the tree to their respective wrapper pointers
+  rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEB_RECO.", &wgpuEB);
+  rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEE_RECO.", &wgpuEE);
+  rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEB_RECO.", &wcpuEB);
+  rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEE_RECO.", &wcpuEE);
+
+  // constexpr float eps_diff = 1e-3;
+
+  // accumulate sizes for events and sizes of each event on both GPU and CPU
+  //   auto const nentries = rt->GetEntries();
+  int nentries = rt->GetEntries();
+
+  //---- AM: tests: validate at most the first 1000 events
+  if (nentries > 1000) {
+    nentries = 1000;
+  }
+  //   nentries = 1;
+
+  std::cout << "#events to validate over: " << nentries << std::endl;
+  for (int ie = 0; ie < nentries; ++ie) {
+    rt->GetEntry(ie);
+
+    //     const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" };
+    auto cpu_eb_size = wcpuEB->bareProduct().size();
+    auto cpu_ee_size = wcpuEE->bareProduct().size();
+    auto gpu_eb_size = wgpuEB->bareProduct().energy.size();
+    auto gpu_ee_size = wgpuEE->bareProduct().energy.size();
+    float eb_ratio = (float)gpu_eb_size / cpu_eb_size;
+    float ee_ratio = (float)gpu_ee_size / cpu_ee_size;
+
+    // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
+    hRechitsEBGPU->Fill(gpu_eb_size);
+    hRechitsEBCPU->Fill(cpu_eb_size);
+    hRechitsEEGPU->Fill(gpu_ee_size);
+    hRechitsEECPU->Fill(cpu_ee_size);
+    hRechitsEBGPUvsCPU->Fill(cpu_eb_size, gpu_eb_size);
+    hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size);
+    hRechitsEBGPUCPUratio->Fill(eb_ratio);
+    hRechitsEEGPUCPUratio->Fill(ee_ratio);
+    hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size - cpu_eb_size);
+    hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size - cpu_ee_size);
+
+    /*    
+     *    // condition that sizes on GPU and CPU should be the same for EB or EE
+     *       if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
+     *         std::cerr << ie << ordinal[ie % 10] << " entry:\n"
+     *                   << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n"
+     *                   << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl;
+     *                  
+     *         continue;
+  }
+  assert(wgpuEB->bareProduct().energy.size() == wcpuEB->bareProduct().size());
+  assert(wgpuEE->bareProduct().energy.size() == wcpuEE->bareProduct().size()); 
+  auto const neb = wcpuEB->bareProduct().size(); //like cpu_eb_size but set to constant
+  auto const nee = wcpuEE->bareProduct().size(); //like cpu_ee_size but set to constant
+  */
+
+    uint selected_gpu_eb_size = 0;
+    uint selected_gpu_ee_size = 0;
+
+    uint positive_gpu_eb_size = 0;
+    uint positive_gpu_ee_size = 0;
+
+    // EB:
+    for (uint32_t i = 0; i < gpu_eb_size; ++i) {
+      auto const did_gpu = wgpuEB->bareProduct().did[i];  // set the did for the current RecHit
+      // Set the variables for GPU
+      auto const enr_gpu = wgpuEB->bareProduct().energy[i];
+      auto const chi2_gpu = wgpuEB->bareProduct().chi2[i];
+      auto const flag_gpu = wgpuEB->bareProduct().flagBits[i];
+      auto const extra_gpu = wgpuEB->bareProduct().extra[i];
+
+      // the GPU energy is "-1" if the crystal is not selected: only entries with energy >= 0 are validated
+      if (enr_gpu >= 0) {
+        selected_gpu_eb_size++;
+
+        if (enr_gpu > 0) {
+          positive_gpu_eb_size++;
+        }
+
+        // find the Rechit on CPU reflecting the same did
+        auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu});
+        if (cpu_iter == wcpuEB->bareProduct().end()) {
+          //           std::cerr << ie << ordinal[ie % 10] << " entry\n"
+          //                   << "  Did not find a DetId " << did_gpu_eb
+          //                 << " in a CPU collection\n";
+          std::cerr << "  Did not find a DetId " << did_gpu << " in a CPU collection\n";
+          continue;
+        }
+        // Set the variables for CPU (flag and extra are placeholders fixed to 1, not read from the CPU RecHit)
+        auto const enr_cpu = cpu_iter->energy();
+        auto const chi2_cpu = cpu_iter->chi2();
+        //         auto const flag_cpu = cpu_iter->flagBits();
+        auto const flag_cpu = 1;
+        //         auto const extra_cpu = cpu_iter->extra();
+        auto const extra_cpu = 1;
+        //       auto const flag_cpu = cpu_iter->flagBits() ? cpu_iter->flagBits():-1;
+        //       auto const extra_cpu = cpu_iter->extra() ? cpu_iter->extra():-1;
+
+        // AM: TEST
+        //       if (extra_cpu != 10) continue;
+
+        // Fill the energy, Chi2, flag and extra histograms for GPU and CPU, and their ratio and delta comparisons
+        hEnergiesEBGPU->Fill(enr_gpu);
+        hEnergiesEBCPU->Fill(enr_cpu);
+        //       std::cout<<"EB CPU Energy:\t"<<enr_cpu<<std::endl;
+        hEnergiesEBGPUvsCPU->Fill(enr_cpu, enr_gpu);
+        hEnergiesEBGPUCPUratio->Fill(enr_gpu / enr_cpu);
+        hEnergiesEBdeltavsCPU->Fill(enr_cpu, enr_gpu - enr_cpu);
+
+        hChi2EBGPU->Fill(chi2_gpu);
+        hChi2EBCPU->Fill(chi2_cpu);
+        hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
+        hChi2EBGPUCPUratio->Fill(chi2_gpu / chi2_cpu);
+        hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
+
+        hFlagsEBGPU->Fill(flag_gpu);
+        hFlagsEBCPU->Fill(flag_cpu);
+        hFlagsEBGPUvsCPU->Fill(flag_cpu, flag_gpu);
+        hFlagsEBGPUCPUratio->Fill(flag_cpu ? flag_gpu / flag_cpu : -1);
+        hFlagsEBdeltavsCPU->Fill(flag_cpu, flag_gpu - flag_cpu);
+
+        hExtrasEBGPU->Fill(extra_gpu);
+        hExtrasEBCPU->Fill(extra_cpu);
+        hExtrasEBGPUvsCPU->Fill(extra_cpu, extra_gpu);
+        hExtrasEBGPUCPUratio->Fill(extra_cpu ? extra_gpu / extra_cpu : -1);
+        hExtrasEBdeltavsCPU->Fill(extra_cpu, extra_gpu - extra_cpu);
+
+        // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message
+        // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or
+        //      (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
+        //  {
+        //      printf("EB eventid = %d chid = %d energy_gpu = %f energy_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
+        //          ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu);
+        //      if (std::isnan(chi2_gpu))
+        //        printf("*** nan ***\n");
+        //  }
+      }
+    }
+
+    // EE:
+    for (uint32_t i = 0; i < gpu_ee_size; ++i) {
+      auto const did_gpu = wgpuEE->bareProduct().did[i];  // set the did for the current RecHit
+      // Set the variables for GPU
+      auto const enr_gpu = wgpuEE->bareProduct().energy[i];
+      auto const chi2_gpu = wgpuEE->bareProduct().chi2[i];
+      auto const flag_gpu = wgpuEE->bareProduct().flagBits[i];
+      auto const extra_gpu = wgpuEE->bareProduct().extra[i];
+
+      // the GPU energy is "-1" if the crystal is not selected: only entries with energy >= 0 are validated
+      if (enr_gpu >= 0) {
+        selected_gpu_ee_size++;
+
+        if (enr_gpu > 0) {
+          positive_gpu_ee_size++;
+        }
+
+        // find the Rechit on CPU reflecting the same did
+        auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu});
+        if (cpu_iter == wcpuEE->bareProduct().end()) {
+          //    std::cerr << ie << ordinal[ie % 10] << " entry\n"
+          //            << "  Did not find a DetId " << did_gpu
+          //          << " in a CPU collection\n";
+          std::cerr << "  Did not find a DetId " << did_gpu << " in a CPU collection\n";
+          continue;
+        }
+        // Set the variables for CPU (flag and extra are placeholders fixed to 1, not read from the CPU RecHit)
+        auto const enr_cpu = cpu_iter->energy();
+        auto const chi2_cpu = cpu_iter->chi2();
+        //         auto const flag_cpu = cpu_iter->flagBits();
+        auto const flag_cpu = 1;
+        //         auto const extra_cpu = cpu_iter->extra();
+        auto const extra_cpu = 1;
+        //       auto const flag_cpu = cpu_iter->flagBits()?cpu_iter->flagBits():-1;
+        //       auto const extra_cpu = cpu_iter->extra()?cpu_iter->extra():-1;
+
+        // AM: TEST
+        //       if (extra_cpu != 10) continue;
+
+        // Fill the energy, Chi2, flag and extra histograms for GPU and CPU, and their ratio and delta comparisons
+        hEnergiesEEGPU->Fill(enr_gpu);
+        hEnergiesEECPU->Fill(enr_cpu);
+        hEnergiesEEGPUvsCPU->Fill(enr_cpu, enr_gpu);
+        hEnergiesEEGPUCPUratio->Fill(enr_gpu / enr_cpu);
+        hEnergiesEEdeltavsCPU->Fill(enr_cpu, enr_gpu - enr_cpu);
+
+        hChi2EEGPU->Fill(chi2_gpu);
+        hChi2EECPU->Fill(chi2_cpu);
+        hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
+        hChi2EEGPUCPUratio->Fill(chi2_gpu / chi2_cpu);
+        hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
+
+        hFlagsEEGPU->Fill(flag_gpu);
+        hFlagsEECPU->Fill(flag_cpu);
+        hFlagsEEGPUvsCPU->Fill(flag_cpu, flag_gpu);
+        hFlagsEEGPUCPUratio->Fill(flag_cpu ? flag_gpu / flag_cpu : -1);
+        hFlagsEEdeltavsCPU->Fill(flag_cpu, flag_gpu - flag_cpu);
+
+        hExtrasEEGPU->Fill(extra_gpu);
+        hExtrasEECPU->Fill(extra_cpu);
+        hExtrasEEGPUvsCPU->Fill(extra_cpu, extra_gpu);
+        hExtrasEEGPUCPUratio->Fill(extra_cpu ? extra_gpu / extra_cpu : -1);
+        hExtrasEEdeltavsCPU->Fill(extra_cpu, extra_gpu - extra_cpu);
+
+        // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message
+        // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or
+        //      (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
+        //  {
+        //      printf("EE eventid = %d chid = %d energy_gpu = %f energy_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
+        //          ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu);
+        //      if (std::isnan(chi2_gpu))
+        //        printf("*** nan ***\n");
+        //  }
+      }
+    }
+
+    //
+    // now the rechit counting: selected GPU rechits (energy >= 0) vs. all CPU rechits
+    //
+    float selected_eb_ratio = (float)selected_gpu_eb_size / cpu_eb_size;
+    float selected_ee_ratio = (float)selected_gpu_ee_size / cpu_ee_size;
+
+    // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
+    hSelectedRechitsEBGPU->Fill(selected_gpu_eb_size);
+    hSelectedRechitsEBCPU->Fill(cpu_eb_size);
+    hSelectedRechitsEEGPU->Fill(selected_gpu_ee_size);
+    hSelectedRechitsEECPU->Fill(cpu_ee_size);
+    hSelectedRechitsEBGPUvsCPU->Fill(cpu_eb_size, selected_gpu_eb_size);
+    hSelectedRechitsEEGPUvsCPU->Fill(cpu_ee_size, selected_gpu_ee_size);
+    hSelectedRechitsEBGPUCPUratio->Fill(selected_eb_ratio);
+    hSelectedRechitsEEGPUCPUratio->Fill(selected_ee_ratio);
+    hSelectedRechitsEBdeltavsCPU->Fill(cpu_eb_size, selected_gpu_eb_size - cpu_eb_size);
+    hSelectedRechitsEEdeltavsCPU->Fill(cpu_ee_size, selected_gpu_ee_size - cpu_ee_size);
+
+    //
+    // now the rechit counting: rechits with positive energy on GPU vs. CPU
+    //
+
+    uint positive_cpu_eb_size = 0;
+    uint positive_cpu_ee_size = 0;
+
+    // EB:
+    for (uint32_t i = 0; i < cpu_eb_size; ++i) {
+      auto const enr_cpu = wcpuEB->bareProduct()[i].energy();
+      if (enr_cpu > 0) {
+        positive_cpu_eb_size++;
+      }
+    }
+    // EE:
+    for (uint32_t i = 0; i < cpu_ee_size; ++i) {
+      auto const enr_cpu = wcpuEE->bareProduct()[i].energy();
+      if (enr_cpu > 0) {
+        positive_cpu_ee_size++;
+      }
+    }
+
+    float positive_eb_ratio = (float)positive_gpu_eb_size / positive_cpu_eb_size;
+    float positive_ee_ratio = (float)positive_gpu_ee_size / positive_cpu_ee_size;
+
+    // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
+    hPositiveRechitsEBGPU->Fill(positive_gpu_eb_size);
+    hPositiveRechitsEBCPU->Fill(positive_cpu_eb_size);
+    hPositiveRechitsEEGPU->Fill(positive_gpu_ee_size);
+    hPositiveRechitsEECPU->Fill(positive_cpu_ee_size);
+    hPositiveRechitsEBGPUvsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size);
+    hPositiveRechitsEEGPUvsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size);
+    hPositiveRechitsEBGPUCPUratio->Fill(positive_eb_ratio);
+    hPositiveRechitsEEGPUCPUratio->Fill(positive_ee_ratio);
+    hPositiveRechitsEBdeltavsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size - positive_cpu_eb_size);
+    hPositiveRechitsEEdeltavsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size - positive_cpu_ee_size);
+
+    if (cpu_eb_size != selected_gpu_eb_size or cpu_ee_size != selected_gpu_ee_size) {
+      //       std::cerr << ie << ordinal[ie % 10] << " entry:\n"
+      std::cerr << ie << " entry:\n"
+                << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << selected_gpu_eb_size
+                << " (gpu)\n"
+                << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << selected_gpu_ee_size
+                << " (gpu)" << std::endl;
+    }
+  }
+
+  // Plotting the results:
+  {
+    // Canvases Setup:
+    TCanvas cAllRechits("AllRechits", "AllRechits", 1750, 860);
+    cAllRechits.Divide(3, 2);
+    TCanvas cRechits("Rechits", "Rechits", 1750, 860);
+    cRechits.Divide(3, 2);
+    TCanvas cRechitsPositive("RechitsPositive", "RechitsPositive", 1750, 860);
+    cRechitsPositive.Divide(3, 2);
+    TCanvas cEnergies("Energies", "Energies", 1750, 860);
+    cEnergies.Divide(3, 2);
+    TCanvas cChi2("Chi2", "Chi2", 1750, 860);
+    cChi2.Divide(3, 2);
+    TCanvas cFlags("Flags", "Flags", 1750, 860);
+    cFlags.Divide(3, 2);
+    TCanvas cExtras("Extras", "Extras", 1750, 860);
+    cExtras.Divide(3, 2);
+
+    // All RecHits: number of RecHits per event on GPU vs CPU, for EB (pads 1-3) and EE (pads 4-6)
+    cAllRechits.cd(1);
+    {
+      gPad->SetLogy();
+      hRechitsEBCPU->SetLineColor(kRed);
+      hRechitsEBCPU->SetLineWidth(2);
+      hRechitsEBCPU->Draw("");
+      hRechitsEBGPU->SetLineColor(kBlue);
+      hRechitsEBGPU->SetLineWidth(2);
+      hRechitsEBGPU->Draw("sames");
+      cAllRechits.Update();
+      auto stats = (TPaveStats *)hRechitsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cAllRechits.cd(4);
+    {
+      gPad->SetLogy();
+      hRechitsEECPU->SetLineColor(kRed);
+      hRechitsEECPU->SetLineWidth(2);
+      hRechitsEECPU->Draw("");
+      hRechitsEEGPU->SetLineColor(kBlue);
+      hRechitsEEGPU->SetLineWidth(2);
+      hRechitsEEGPU->Draw("sames");
+      cAllRechits.Update();
+      auto stats = (TPaveStats *)hRechitsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cAllRechits.cd(2);
+    {
+      gStyle->SetPalette(55);
+      hRechitsEBGPUvsCPU->Draw("COLZ");
+    }
+    cAllRechits.cd(5);
+    {
+      gStyle->SetPalette(55);
+      hRechitsEEGPUvsCPU->Draw("COLZ");
+    }
+    cAllRechits.cd(3);
+    {
+      gPad->SetLogy();
+      //hRechitsEBdeltavsCPU->Draw("COLZ");
+      hRechitsEBGPUCPUratio->Draw("");
+    }
+    cAllRechits.cd(6);
+    {
+      gPad->SetLogy();
+      //hRechitsEEdeltavsCPU->Draw("COLZ");
+      hRechitsEEGPUCPUratio->Draw("");
+    }
+    cAllRechits.SaveAs("ecal-allrechits.root");
+    cAllRechits.SaveAs("ecal-allrechits.png");
+
+    // Selected RecHits (GPU energy >= 0): number per event on GPU vs CPU, for EB (pads 1-3) and EE (pads 4-6)
+    cRechits.cd(1);
+    {
+      gPad->SetLogy();
+      hSelectedRechitsEBCPU->SetLineColor(kRed);
+      hSelectedRechitsEBCPU->SetLineWidth(2);
+      hSelectedRechitsEBCPU->Draw("");
+      hSelectedRechitsEBGPU->SetLineColor(kBlue);
+      hSelectedRechitsEBGPU->SetLineWidth(2);
+      hSelectedRechitsEBGPU->Draw("sames");
+      cRechits.Update();
+      auto stats = (TPaveStats *)hSelectedRechitsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cRechits.cd(4);
+    {
+      gPad->SetLogy();
+      hSelectedRechitsEECPU->SetLineColor(kRed);
+      hSelectedRechitsEECPU->SetLineWidth(2);
+      hSelectedRechitsEECPU->Draw("");
+      hSelectedRechitsEEGPU->SetLineColor(kBlue);
+      hSelectedRechitsEEGPU->SetLineWidth(2);
+      hSelectedRechitsEEGPU->Draw("sames");
+      cRechits.Update();
+      auto stats = (TPaveStats *)hSelectedRechitsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cRechits.cd(2);
+    {
+      gStyle->SetPalette(55);
+      hSelectedRechitsEBGPUvsCPU->Draw("COLZ");
+    }
+    cRechits.cd(5);
+    {
+      gStyle->SetPalette(55);
+      hSelectedRechitsEEGPUvsCPU->Draw("COLZ");
+    }
+    cRechits.cd(3);
+    {
+      gPad->SetLogy();
+      //hSelectedRechitsEBdeltavsCPU->Draw("COLZ");
+      hSelectedRechitsEBGPUCPUratio->Draw("");
+    }
+    cRechits.cd(6);
+    {
+      gPad->SetLogy();
+      //hSelectedRechitsEEdeltavsCPU->Draw("COLZ");
+      hSelectedRechitsEEGPUCPUratio->Draw("");
+    }
+    cRechits.SaveAs("ecal-rechits.root");
+    cRechits.SaveAs("ecal-rechits.png");
+
+    // Positive RecHits (energy > 0): number per event on GPU vs CPU, for EB (pads 1-3) and EE (pads 4-6)
+    cRechitsPositive.cd(1);
+    {
+      gPad->SetLogy();
+      hPositiveRechitsEBCPU->SetLineColor(kRed);
+      hPositiveRechitsEBCPU->SetLineWidth(2);
+      hPositiveRechitsEBCPU->Draw("");
+      hPositiveRechitsEBGPU->SetLineColor(kBlue);
+      hPositiveRechitsEBGPU->SetLineWidth(2);
+      hPositiveRechitsEBGPU->Draw("sames");
+      cRechitsPositive.Update();
+      auto stats = (TPaveStats *)hPositiveRechitsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cRechitsPositive.cd(4);
+    {
+      gPad->SetLogy();
+      hPositiveRechitsEECPU->SetLineColor(kRed);
+      hPositiveRechitsEECPU->SetLineWidth(2);
+      hPositiveRechitsEECPU->Draw("");
+      hPositiveRechitsEEGPU->SetLineColor(kBlue);
+      hPositiveRechitsEEGPU->SetLineWidth(2);
+      hPositiveRechitsEEGPU->Draw("sames");
+      cRechitsPositive.Update();
+      auto stats = (TPaveStats *)hPositiveRechitsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cRechitsPositive.cd(2);
+    {
+      gStyle->SetPalette(55);
+      hPositiveRechitsEBGPUvsCPU->Draw("COLZ");
+    }
+    cRechitsPositive.cd(5);
+    {
+      gStyle->SetPalette(55);
+      hPositiveRechitsEEGPUvsCPU->Draw("COLZ");
+    }
+    cRechitsPositive.cd(3);
+    {
+      gPad->SetLogy();
+      //hPositiveRechitsEBdeltavsCPU->Draw("COLZ");
+      hPositiveRechitsEBGPUCPUratio->Draw("");
+    }
+    cRechitsPositive.cd(6);
+    {
+      gPad->SetLogy();
+      //hPositiveRechitsEEdeltavsCPU->Draw("COLZ");
+      hPositiveRechitsEEGPUCPUratio->Draw("");
+    }
+    cRechitsPositive.SaveAs("ecal-rechits-positive.root");
+    cRechitsPositive.SaveAs("ecal-rechits-positive.png");
+
+    cEnergies.cd(1);
+    {
+      gPad->SetLogy();
+      hEnergiesEBCPU->SetLineColor(kBlack);
+      hEnergiesEBCPU->SetLineWidth(2);
+      hEnergiesEBCPU->Draw("");
+      hEnergiesEBGPU->SetLineColor(kBlue);
+      hEnergiesEBGPU->SetLineWidth(2);
+      hEnergiesEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hEnergiesEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cEnergies.cd(4);
+    {
+      gPad->SetLogy();
+      hEnergiesEECPU->SetLineColor(kBlack);
+      hEnergiesEECPU->SetLineWidth(2);
+      hEnergiesEECPU->Draw("");
+      hEnergiesEEGPU->SetLineColor(kBlue);
+      hEnergiesEEGPU->SetLineWidth(2);
+      hEnergiesEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hEnergiesEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cEnergies.cd(2);
+    { hEnergiesEBGPUvsCPU->Draw("COLZ"); }
+    cEnergies.cd(5);
+    { hEnergiesEEGPUvsCPU->Draw("COLZ"); }
+    cEnergies.cd(3);
+    {
+      gPad->SetLogy();
+      //hEnergiesEBdeltavsCPU->Draw("COLZ");
+      hEnergiesEBGPUCPUratio->Draw("");
+    }
+    cEnergies.cd(6);
+    {
+      gPad->SetLogy();
+      //hEnergiesEEdeltavsCPU->Draw("COLZ");
+      hEnergiesEEGPUCPUratio->Draw("");
+    }
+    cEnergies.SaveAs("ecal-energies.root");
+    cEnergies.SaveAs("ecal-energies.png");
+
+    cChi2.cd(1);
+    {
+      gPad->SetLogy();
+      hChi2EBCPU->SetLineColor(kBlack);
+      hChi2EBCPU->SetLineWidth(2);
+      hChi2EBCPU->Draw("");
+      hChi2EBGPU->SetLineColor(kBlue);
+      hChi2EBGPU->SetLineWidth(2);
+      hChi2EBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hChi2EBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cChi2.cd(4);
+    {
+      gPad->SetLogy();
+      hChi2EECPU->SetLineColor(kBlack);
+      hChi2EECPU->SetLineWidth(2);
+      hChi2EECPU->Draw("");
+      hChi2EEGPU->SetLineColor(kBlue);
+      hChi2EEGPU->SetLineWidth(2);
+      hChi2EEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hChi2EEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cChi2.cd(2);
+    { hChi2EBGPUvsCPU->Draw("COLZ"); }
+    cChi2.cd(5);
+    { hChi2EEGPUvsCPU->Draw("COLZ"); }
+    cChi2.cd(3);
+    {
+      gPad->SetLogy();
+      //hChi2EBdeltavsCPU->Draw("COLZ");
+      hChi2EBGPUCPUratio->Draw("");
+    }
+    cChi2.cd(6);
+    {
+      gPad->SetLogy();
+      //hChi2EEdeltavsCPU->Draw("COLZ");
+      hChi2EEGPUCPUratio->Draw("");
+    }
+    cChi2.SaveAs("ecal-chi2.root");
+    cChi2.SaveAs("ecal-chi2.png");
+
+    cFlags.cd(1);
+    {
+      gPad->SetLogy();
+      hFlagsEBCPU->SetLineColor(kBlack);
+      hFlagsEBCPU->SetLineWidth(2);
+      hFlagsEBCPU->Draw("");
+      hFlagsEBGPU->SetLineColor(kBlue);
+      hFlagsEBGPU->SetLineWidth(2);
+      hFlagsEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hFlagsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cFlags.cd(4);
+    {
+      gPad->SetLogy();
+      hFlagsEECPU->SetLineColor(kBlack);
+      hFlagsEECPU->SetLineWidth(2);
+      hFlagsEECPU->Draw("");
+      hFlagsEEGPU->SetLineColor(kBlue);
+      hFlagsEEGPU->SetLineWidth(2);
+      hFlagsEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hFlagsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cFlags.cd(2);
+    { hFlagsEBGPUvsCPU->Draw("COLZ"); }
+    cFlags.cd(5);
+    { hFlagsEEGPUvsCPU->Draw("COLZ"); }
+    cFlags.cd(3);
+    {
+      gPad->SetLogy();
+      //hFlagsEBdeltavsCPU->Draw("COLZ");
+      hFlagsEBGPUCPUratio->Draw("");
+    }
+    cFlags.cd(6);
+    {
+      gPad->SetLogy();
+      //hFlagsEEdeltavsCPU->Draw("COLZ");
+      hFlagsEEGPUCPUratio->Draw("");
+    }
+    cFlags.SaveAs("ecal-flags.root");
+    cFlags.SaveAs("ecal-flags.png");
+
+    cExtras.cd(1);
+    {
+      gPad->SetLogy();
+      hExtrasEBCPU->SetLineColor(kBlack);
+      hExtrasEBCPU->SetLineWidth(2);
+      hExtrasEBCPU->Draw("");
+      hExtrasEBGPU->SetLineColor(kBlue);
+      hExtrasEBGPU->SetLineWidth(2);
+      hExtrasEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hExtrasEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cExtras.cd(4);
+    {
+      gPad->SetLogy();
+      hExtrasEECPU->SetLineColor(kBlack);
+      hExtrasEECPU->SetLineWidth(2);
+      hExtrasEECPU->Draw("");
+      hExtrasEEGPU->SetLineColor(kBlue);
+      hExtrasEEGPU->SetLineWidth(2);
+      hExtrasEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hExtrasEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    cExtras.cd(2);
+    { hExtrasEBGPUvsCPU->Draw("COLZ"); }
+    cExtras.cd(5);
+    { hExtrasEEGPUvsCPU->Draw("COLZ"); }
+    cExtras.cd(3);
+    {
+      gPad->SetLogy();
+      //hExtrasEBdeltavsCPU->Draw("COLZ");
+      hExtrasEBGPUCPUratio->Draw("");
+    }
+    cExtras.cd(6);
+    {
+      gPad->SetLogy();
+      //hExtrasEEdeltavsCPU->Draw("COLZ");
+      hExtrasEEGPUCPUratio->Draw("");
+    }
+    cExtras.SaveAs("ecal-extras.root");
+    cExtras.SaveAs("ecal-extras.png");
+  }
+
+  // Close all open files
+  rf.Close();
+  rfout.Write();
+  rfout.Close();
+
+  return 0;
+}
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h
new file mode 100644
index 0000000000000..a3f65d0b509fc
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalGainRatiosGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalGainRatiosGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalGainRatios.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalGainRatiosGPU {  // host-side holder of the ECAL gain ratios and of their device copies
+public:
+  struct Product {  // device buffers, filled in getProduct() and released via cudaFree in ~Product
+    ~Product();
+    float *gain12Over6 = nullptr, *gain6Over1 = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // the declarations below are skipped whenever __CUDACC__ is defined
+  // flatten the gain12Over6 / gain6Over1 ratios into two arrays (barrel first, then endcap)
+  EcalGainRatiosGPU(EcalGainRatios const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalGainRatiosGPU() = default;
+
+  // get device pointers; the host-to-device copies are enqueued with cudaMemcpyAsync
+  Product const& getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalGainRatiosGPU"}; }
+
+private:
+  // in the future, we need to arrange so to avoid this copy on the host
+  // store eb first then ee
+  std::vector<float, cms::cuda::HostAllocator<float>> gain12Over6_;
+  std::vector<float, cms::cuda::HostAllocator<float>> gain6Over1_;
+
+  cms::cuda::ESProduct<Product> product_;
+
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalGainRatiosGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
new file mode 100644
index 0000000000000..4b5401ff0316f
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalIntercalibConstantsGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalIntercalibConstantsGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalIntercalibConstants.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalIntercalibConstantsGPU {  // exposes the ECAL intercalibration constants through a Product
+public:
+  struct Product {  // raw pointer handed to callers of getProduct(); ~Product is defined out of line
+    ~Product();
+    float* values = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // keeps references to the barrel/endcap values (presumably those of the argument - TODO confirm)
+  EcalIntercalibConstantsGPU(EcalIntercalibConstants const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalIntercalibConstantsGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // TODO: do this centrally
+  // get offset for hashes. equals number of barrel items
+  uint32_t getOffset() const { return valuesEB_.size(); }
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalIntercalibConstantsGPU"}; }
+
+private:
+  std::vector<float> const& valuesEB_;  // non-owning: the referenced vector must outlive this object
+  std::vector<float> const& valuesEE_;  // NOTE(review): confirm the lifetime of both referents
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalIntercalibConstantsGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
new file mode 100644
index 0000000000000..4a6cd34fcd171
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
@@ -0,0 +1,53 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalLaserAPDPNRatiosGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalLaserAPDPNRatiosGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatios.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalLaserAPDPNRatiosGPU {  // laser APD/PN ratio values and timestamps, plus their Product
+public:
+  struct Product {  // one raw pointer per private host vector (p1..p3, t1..t3); all default to null
+    ~Product();
+    float *p1 = nullptr;
+    float *p2 = nullptr;
+    float *p3 = nullptr;
+    edm::TimeValue_t *t1 = nullptr;
+    edm::TimeValue_t *t2 = nullptr;
+    edm::TimeValue_t *t3 = nullptr;
+  };
+
+#ifndef __CUDACC__
+
+  // presumably fills the host-side vectors below from the argument (definition not in this header)
+  EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const &);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalLaserAPDPNRatiosGPU() = default;
+
+  // get device pointers
+  Product const &getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalLaserAPDPNRatiosGPU"}; }
+
+private:
+  // in the future, we need to arrange so to avoid this copy on the host
+  // store eb first then ee
+  std::vector<float, cms::cuda::HostAllocator<float> > p1_;
+  std::vector<float, cms::cuda::HostAllocator<float> > p2_;
+  std::vector<float, cms::cuda::HostAllocator<float> > p3_;
+
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t> > t1_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t> > t2_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t> > t3_;
+
+  cms::cuda::ESProduct<Product> product_;
+
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalLaserAPDPNRatiosGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
new file mode 100644
index 0000000000000..985bfd9579f7c
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalLaserAPDPNRatiosRefGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalLaserAPDPNRatiosRefGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatiosRef.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalLaserAPDPNRatiosRefGPU {  // exposes the laser APD/PN reference ratios through a Product
+public:
+  struct Product {  // raw pointer handed to callers of getProduct(); ~Product is defined out of line
+    ~Product();
+    float* values = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // keeps references to the barrel/endcap values (presumably those of the argument - TODO confirm)
+  EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalLaserAPDPNRatiosRefGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // TODO: do this centrally
+  // get offset for hashes. equals number of barrel items
+  uint32_t getOffset() const { return valuesEB_.size(); }
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalLaserAPDPNRatiosRefGPU"}; }
+
+private:
+  std::vector<float> const& valuesEB_;  // non-owning: the referenced vector must outlive this object
+  std::vector<float> const& valuesEE_;  // NOTE(review): confirm the lifetime of both referents
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalLaserAPDPNRatiosRefGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
new file mode 100644
index 0000000000000..9dd05e9ee3c4d
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalLaserAlphasGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalLaserAlphasGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalLaserAlphas.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalLaserAlphasGPU {  // exposes the ECAL laser alpha values through a Product
+public:
+  struct Product {  // raw pointer handed to callers of getProduct(); ~Product is defined out of line
+    ~Product();
+    float* values = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // keeps references to the barrel/endcap values (presumably those of the argument - TODO confirm)
+  EcalLaserAlphasGPU(EcalLaserAlphas const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalLaserAlphasGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // TODO: do this centrally
+  // get offset for hashes. equals number of barrel items
+  uint32_t getOffset() const { return valuesEB_.size(); }
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalLaserAlphasGPU"}; }
+
+private:
+  std::vector<float> const& valuesEB_;  // non-owning: the referenced vector must outlive this object
+  std::vector<float> const& valuesEE_;  // NOTE(review): confirm the lifetime of both referents
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalLaserAlphasGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
new file mode 100644
index 0000000000000..343bdf1dd1afc
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
@@ -0,0 +1,53 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalLinearCorrectionsGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalLinearCorrectionsGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalLinearCorrections.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalLinearCorrectionsGPU {  // linear-correction values and timestamps, plus their Product
+public:
+  struct Product {  // one raw pointer per private host vector (p1..p3, t1..t3); all default to null
+    ~Product();
+    float *p1 = nullptr;
+    float *p2 = nullptr;
+    float *p3 = nullptr;
+    edm::TimeValue_t *t1 = nullptr;
+    edm::TimeValue_t *t2 = nullptr;
+    edm::TimeValue_t *t3 = nullptr;
+  };
+
+#ifndef __CUDACC__
+
+  // presumably fills the host-side vectors below from the argument (definition not in this header)
+  EcalLinearCorrectionsGPU(EcalLinearCorrections const &);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalLinearCorrectionsGPU() = default;
+
+  // get device pointers
+  Product const &getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalLinearCorrectionsGPU"}; }
+
+private:
+  // in the future, we need to arrange so to avoid this copy on the host
+  // store eb first then ee
+  std::vector<float, cms::cuda::HostAllocator<float>> p1_;
+  std::vector<float, cms::cuda::HostAllocator<float>> p2_;
+  std::vector<float, cms::cuda::HostAllocator<float>> p3_;
+
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t>> t1_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t>> t2_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t>> t3_;
+
+  cms::cuda::ESProduct<Product> product_;
+
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalLinearCorrectionsGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalMultifitParametersGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalMultifitParametersGPU.h
new file mode 100644
index 0000000000000..56aa0579ff77f
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalMultifitParametersGPU.h
@@ -0,0 +1,39 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalMultifitParametersGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalMultifitParametersGPU_h
+
+#include <array>
+
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalMultifitParametersGPU {  // amplitude/time fit parameters for EB and EE plus their Product
+public:
+  struct Product {  // pointers default to null so a never-filled Product holds no indeterminate values
+    ~Product();
+    double *amplitudeFitParametersEB = nullptr, *amplitudeFitParametersEE = nullptr;
+    double *timeFitParametersEB = nullptr, *timeFitParametersEE = nullptr;
+  };
+
+#ifndef __CUDACC__
+  EcalMultifitParametersGPU(edm::ParameterSet const&);
+  ~EcalMultifitParametersGPU() = default;
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+  // read-only references to the host-side vectors: EB amplitude, EE amplitude, EB time, EE time
+  std::array<std::reference_wrapper<std::vector<double, cms::cuda::HostAllocator<double>> const>, 4> getValues() const {
+    return {{amplitudeFitParametersEB_, amplitudeFitParametersEE_, timeFitParametersEB_, timeFitParametersEE_}};
+  }
+
+private:
+  std::vector<double, cms::cuda::HostAllocator<double>> amplitudeFitParametersEB_, amplitudeFitParametersEE_,
+      timeFitParametersEB_, timeFitParametersEE_;
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalMultifitParametersGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h
new file mode 100644
index 0000000000000..5387c422ddd9e
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h
@@ -0,0 +1,47 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalPedestalsGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalPedestalsGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalPedestals.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalPedestalsGPU {  // pedestal means and RMS values for the x12/x6/x1 gains, plus their Product
+public:
+  struct Product {  // one raw pointer per private host vector; all default to null
+    ~Product();
+    float *mean_x12 = nullptr, *mean_x6 = nullptr, *mean_x1 = nullptr;
+    float *rms_x12 = nullptr, *rms_x6 = nullptr, *rms_x1 = nullptr;
+  };
+
+#ifndef __CUDACC__
+
+  // rearrange pedestals
+  EcalPedestalsGPU(EcalPedestals const &);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalPedestalsGPU() = default;
+
+  // get device pointers
+  Product const &getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalPedestalsGPU"}; }
+
+private:
+  // in the future, we need to arrange so to avoid this copy on the host
+  // store eb first then ee
+  std::vector<float, cms::cuda::HostAllocator<float>> mean_x12_;
+  std::vector<float, cms::cuda::HostAllocator<float>> rms_x12_;
+  std::vector<float, cms::cuda::HostAllocator<float>> mean_x6_;
+  std::vector<float, cms::cuda::HostAllocator<float>> rms_x6_;
+  std::vector<float, cms::cuda::HostAllocator<float>> mean_x1_;
+  std::vector<float, cms::cuda::HostAllocator<float>> rms_x1_;
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalPedestalsGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h
new file mode 100644
index 0000000000000..6c5a3d9b95e2e
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h
@@ -0,0 +1,40 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalPulseCovariancesGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalPulseCovariancesGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalPulseCovariancesGPU {  // exposes the ECAL pulse covariances through a Product
+public:
+  struct Product {  // raw pointer handed to callers of getProduct(); ~Product is defined out of line
+    ~Product();
+    EcalPulseCovariance* values = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // keeps references to the barrel/endcap vectors (presumably those of the argument - TODO confirm)
+  EcalPulseCovariancesGPU(EcalPulseCovariances const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalPulseCovariancesGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalPulseCovariancesGPU"}; }
+
+private:
+  // reuse original vectors (although with default allocator)
+  std::vector<EcalPulseCovariance> const& valuesEB_;  // non-owning: referent must outlive this object
+  std::vector<EcalPulseCovariance> const& valuesEE_;  // NOTE(review): confirm the lifetime of both referents
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalPulseCovariancesGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h
new file mode 100644
index 0000000000000..3edb2c9bcdfd3
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h
@@ -0,0 +1,40 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalPulseShapesGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalPulseShapesGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalPulseShapesGPU {  // exposes the ECAL pulse shapes through a Product
+public:
+  struct Product {  // raw pointer handed to callers of getProduct(); ~Product is defined out of line
+    ~Product();
+    EcalPulseShape* values = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // keeps references to the barrel/endcap vectors (presumably those of the argument - TODO confirm)
+  EcalPulseShapesGPU(EcalPulseShapes const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalPulseShapesGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalPulseShapesGPU"}; }
+
+private:
+  // reuse original vectors (although with default allocator)
+  std::vector<EcalPulseShape> const& valuesEB_;  // non-owning: referent must outlive this object
+  std::vector<EcalPulseShape> const& valuesEE_;  // NOTE(review): confirm the lifetime of both referents
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalPulseShapesGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRecHitParametersGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRecHitParametersGPU.h
new file mode 100644
index 0000000000000..c5d3dd0388d15
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRecHitParametersGPU.h
@@ -0,0 +1,47 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalRecHitParametersGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalRecHitParametersGPU_h
+
+#include <array>
+
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalRecHitParametersGPU {  // rechit configuration arrays built from a ParameterSet, plus their Product
+public:
+  struct Product {  // pointers default to null so a never-filled Product holds no indeterminate values
+    ~Product();
+    int *ChannelStatusToBeExcluded = nullptr, *expanded_v_DB_reco_flags = nullptr;
+    uint32_t *expanded_Sizes_v_DB_reco_flags = nullptr, *expanded_flagbit_v_DB_reco_flags = nullptr;
+  };
+
+#ifndef __CUDACC__
+  EcalRecHitParametersGPU(edm::ParameterSet const &);
+
+  ~EcalRecHitParametersGPU() = default;
+  // get device pointers
+  Product const &getProduct(cudaStream_t) const;
+  // getValues() returns read-only references to the four host-side vectors, in declaration order
+  using intvec = std::reference_wrapper<std::vector<int, cms::cuda::HostAllocator<int>> const>;
+  using uint32vec = std::reference_wrapper<std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> const>;
+  std::tuple<intvec, intvec, uint32vec, uint32vec> getValues() const {
+    return {ChannelStatusToBeExcluded_,
+            expanded_v_DB_reco_flags_,
+            expanded_Sizes_v_DB_reco_flags_,
+            expanded_flagbit_v_DB_reco_flags_};
+  }
+
+private:
+  std::vector<int, cms::cuda::HostAllocator<int>> ChannelStatusToBeExcluded_;
+  std::vector<int, cms::cuda::HostAllocator<int>> expanded_v_DB_reco_flags_;
+  std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> expanded_Sizes_v_DB_reco_flags_,
+      expanded_flagbit_v_DB_reco_flags_;
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalRecHitParametersGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
new file mode 100644
index 0000000000000..7d4d3cc60fd5c
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
@@ -0,0 +1,42 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalRechitADCToGeVConstantGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalRechitADCToGeVConstantGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalADCToGeVConstant.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalRechitADCToGeVConstantGPU {  // ADC-to-GeV constants plus their Product
+public:
+  struct Product {  // raw pointer handed to callers of getProduct(); ~Product is defined out of line
+    ~Product();
+    float* adc2gev = nullptr;
+  };
+
+#ifndef __CUDACC__
+
+  // presumably fills adc2gev_ from the argument (definition not in this header)
+  EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalRechitADCToGeVConstantGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalRechitADCToGeVConstantGPU"}; }
+
+private:
+  // in the future, we need to arrange so to avoid this copy on the host
+  // store eb first then ee
+  std::vector<float, cms::cuda::HostAllocator<float>> adc2gev_;
+
+  cms::cuda::ESProduct<Product> product_;
+
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalRechitADCToGeVConstantGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
new file mode 100644
index 0000000000000..bab99ab656c2d
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
@@ -0,0 +1,42 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalRechitChannelStatusGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalRechitChannelStatusGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalRechitChannelStatusGPU {  // per-channel status words plus their Product
+public:
+  struct Product {  // raw pointer handed to callers of getProduct(); ~Product is defined out of line
+    ~Product();
+    uint16_t* status = nullptr;
+  };
+
+#ifndef __CUDACC__
+
+  // presumably fills status_ from the argument (definition not in this header)
+  EcalRechitChannelStatusGPU(EcalChannelStatus const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalRechitChannelStatusGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalRechitChannelStatusGPU"}; }
+
+private:
+  // in the future, we need to arrange so to avoid this copy on the host
+  // store eb first then ee
+  std::vector<uint16_t, cms::cuda::HostAllocator<uint16_t>> status_;
+
+  cms::cuda::ESProduct<Product> product_;
+
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalRechitChannelStatusGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h
new file mode 100644
index 0000000000000..e1dee2d505e6c
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h
@@ -0,0 +1,44 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalSamplesCorrelationGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalSamplesCorrelationGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalSamplesCorrelationGPU {  // sample-correlation arrays for EB/EE at gains 12, 6 and 1, plus their Product
+public:
+  struct Product {  // one raw pointer per referenced host vector; all default to null
+    ~Product();
+    double *EBG12SamplesCorrelation = nullptr, *EBG6SamplesCorrelation = nullptr, *EBG1SamplesCorrelation = nullptr;
+    double *EEG12SamplesCorrelation = nullptr, *EEG6SamplesCorrelation = nullptr, *EEG1SamplesCorrelation = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // keeps references to the six correlation vectors (presumably those of the argument - TODO confirm)
+  EcalSamplesCorrelationGPU(EcalSamplesCorrelation const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalSamplesCorrelationGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalSamplesCorrelationGPU"}; }
+
+private:
+  std::vector<double> const& EBG12SamplesCorrelation_;  // non-owning: referents must outlive this object
+  std::vector<double> const& EBG6SamplesCorrelation_;   // NOTE(review): confirm the lifetime of all six referents
+  std::vector<double> const& EBG1SamplesCorrelation_;
+  std::vector<double> const& EEG12SamplesCorrelation_;
+  std::vector<double> const& EEG6SamplesCorrelation_;
+  std::vector<double> const& EEG1SamplesCorrelation_;
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalSamplesCorrelationGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h
new file mode 100644
index 0000000000000..9e2bf0aa18909
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h
@@ -0,0 +1,49 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalTimeBiasCorrectionsGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalTimeBiasCorrectionsGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalTimeBiasCorrections.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalTimeBiasCorrectionsGPU {  // time-bias correction bins for EB and EE plus their Product
+public:
+  struct Product {  // members carry defaults so a never-filled Product holds null pointers and zero sizes
+    ~Product();
+    float *EBTimeCorrAmplitudeBins = nullptr, *EBTimeCorrShiftBins = nullptr;
+    float *EETimeCorrAmplitudeBins = nullptr, *EETimeCorrShiftBins = nullptr;
+    int EBTimeCorrAmplitudeBinsSize = 0, EETimeCorrAmplitudeBinsSize = 0;
+  };
+
+  // keeps references to the four bin vectors (presumably those of the argument - TODO confirm)
+  EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const&);
+
+#ifndef __CUDACC__
+
+  // will call deallocation for Product thru ~Product
+  ~EcalTimeBiasCorrectionsGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalTimeBiasCorrectionsGPU"}; }
+#endif  // __CUDACC__
+
+  std::vector<float> const& EBTimeCorrAmplitudeBins() const { return EBTimeCorrAmplitudeBins_; }
+  std::vector<float> const& EETimeCorrAmplitudeBins() const { return EETimeCorrAmplitudeBins_; }
+
+private:
+  std::vector<float> const& EBTimeCorrAmplitudeBins_;  // non-owning: referents must outlive this object
+  std::vector<float> const& EBTimeCorrShiftBins_;
+  std::vector<float> const& EETimeCorrAmplitudeBins_;
+  std::vector<float> const& EETimeCorrShiftBins_;
+
+#ifndef __CUDACC__
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalTimeBiasCorrectionsGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h
new file mode 100644
index 0000000000000..823334d433cc2
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_interface_EcalTimeCalibConstantsGPU_h
+#define RecoLocalCalo_EcalRecAlgos_interface_EcalTimeCalibConstantsGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalTimeCalibConstants.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif  // __CUDACC__
+
+class EcalTimeCalibConstantsGPU {  // exposes the ECAL time calibration constants through a Product
+public:
+  struct Product {  // raw pointer handed to callers of getProduct(); ~Product is defined out of line
+    ~Product();
+    float* values = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // keeps references to the barrel/endcap values (presumably those of the argument - TODO confirm)
+  EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const&);
+
+  // will call deallocation for Product thru ~Product
+  ~EcalTimeCalibConstantsGPU() = default;
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+  // TODO: do this centrally
+  // get offset for hashes. equals number of barrel items
+  uint32_t getOffset() const { return valuesEB_.size(); }
+
+  // identifier string for this type
+  static std::string name() { return std::string{"ecalTimeCalibConstantsGPU"}; }
+
+private:
+  std::vector<float> const& valuesEB_;  // non-owning: the referenced vector must outlive this object
+  std::vector<float> const& valuesEE_;  // NOTE(review): confirm the lifetime of both referents
+
+  cms::cuda::ESProduct<Product> product_;
+#endif  // __CUDACC__
+};
+
+#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalTimeCalibConstantsGPU_h
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc
new file mode 100644
index 0000000000000..d5980d8a757aa
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc
@@ -0,0 +1,52 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalGainRatiosGPU::EcalGainRatiosGPU(EcalGainRatios const& values)
+    : gain12Over6_(values.size()), gain6Over1_(values.size()) {
+  // flatten the ratios into two arrays: barrel entries first, endcap entries appended after them
+  auto const& eb = values.barrelItems();
+  auto const& ee = values.endcapItems();
+  auto const nBarrel = eb.size();
+  for (unsigned int ieb = 0; ieb < nBarrel; ++ieb) {
+    gain12Over6_[ieb] = eb[ieb].gain12Over6();
+    gain6Over1_[ieb] = eb[ieb].gain6Over1();
+  }
+
+  // endcap slots start at index nBarrel
+  for (unsigned int iee = 0; iee < ee.size(); ++iee) {
+    gain12Over6_[nBarrel + iee] = ee[iee].gain12Over6();
+    gain6Over1_[nBarrel + iee] = ee[iee].gain6Over1();
+  }
+}
+
+EcalGainRatiosGPU::Product::~Product() {
+  // free both device arrays; the status of each cudaFree is handed to cudaCheck
+  cudaCheck(cudaFree(gain12Over6));
+  cudaCheck(cudaFree(gain6Over1));
+}
+
+// Device-side view of the gain ratios.
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates both device arrays and queues
+// their host-to-device copies on the stream it receives. No stream synchronization is issued here.
+EcalGainRatiosGPU::Product const& EcalGainRatiosGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalGainRatiosGPU::Product& devProduct, cudaStream_t stream) {
+        // buffer sizes in bytes
+        auto const bytes12Over6 = gain12Over6_.size() * sizeof(float);
+        auto const bytes6Over1 = gain6Over1_.size() * sizeof(float);
+
+        // device allocations
+        cudaCheck(cudaMalloc((void**)&devProduct.gain12Over6, bytes12Over6));
+        cudaCheck(cudaMalloc((void**)&devProduct.gain6Over1, bytes6Over1));
+
+        // asynchronous host -> device copies
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.gain12Over6, gain12Over6_.data(), bytes12Over6, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.gain6Over1, gain6Over1_.data(), bytes6Over1, cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalGainRatiosGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc
new file mode 100644
index 0000000000000..dec10cff57dd0
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc
@@ -0,0 +1,40 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// Takes the barrel and endcap items of the conditions object for the later upload in getProduct.
+// NOTE(review): if valuesEB_/valuesEE_ are references (declared outside this file), `values` must outlive us.
+EcalIntercalibConstantsGPU::EcalIntercalibConstantsGPU(EcalIntercalibConstants const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
+
+// free the device buffer; the cudaFree status is handed to cudaCheck
+EcalIntercalibConstantsGPU::Product::~Product() { cudaCheck(cudaFree(values)); }
+
+// Device-side view of the intercalibration constants.
+//
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates Product::values and queues the
+// host-to-device copies on the stream it receives. The buffer is a single float array: [ barrel | endcap ].
+// No stream synchronization is issued here.
+// NOTE(review): presumably the callback only runs while the data for the current device is missing - confirm.
+EcalIntercalibConstantsGPU::Product const& EcalIntercalibConstantsGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalIntercalibConstantsGPU::Product& devProduct, cudaStream_t stream) {
+        // element counts of the two host containers
+        auto const nEB = valuesEB_.size();
+        auto const nEE = valuesEE_.size();
+
+        // a single device allocation holds barrel and endcap together
+        cudaCheck(cudaMalloc((void**)&devProduct.values, (nEB + nEE) * sizeof(float)));
+
+        // barrel goes to the front of the buffer
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.values, valuesEB_.data(), nEB * sizeof(float), cudaMemcpyHostToDevice, stream));
+
+        // endcap follows, nEB elements (not bytes) past the start
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.values + nEB, valuesEE_.data(), nEE * sizeof(float), cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalIntercalibConstantsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc
new file mode 100644
index 0000000000000..4aa92ea6750fe
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc
@@ -0,0 +1,86 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalLaserAPDPNRatiosGPU::EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const& values)
+    : p1_(values.getLaserMap().size()),
+      p2_(values.getLaserMap().size()),
+      p3_(values.getLaserMap().size()),
+      t1_(values.getTimeMap().size()),
+      t2_(values.getTimeMap().size()),
+      t3_(values.getTimeMap().size()) {
+  // laser parameters: barrel entries fill the front of p1_/p2_/p3_, endcap entries are appended after them
+  auto const& laserMap = values.getLaserMap();
+  auto const& laserEB = laserMap.barrelItems();
+  auto const& laserEE = laserMap.endcapItems();
+  auto const nLaserEB = laserEB.size();
+
+  for (unsigned int ieb = 0; ieb < nLaserEB; ++ieb) {
+    p1_[ieb] = laserEB[ieb].p1;
+    p2_[ieb] = laserEB[ieb].p2;
+    p3_[ieb] = laserEB[ieb].p3;
+  }
+  for (unsigned int iee = 0; iee < laserEE.size(); ++iee) {
+    p1_[nLaserEB + iee] = laserEE[iee].p1;
+    p2_[nLaserEB + iee] = laserEE[iee].p2;
+    p3_[nLaserEB + iee] = laserEE[iee].p3;
+  }
+
+  // time stamps: a plain std::vector<EcalLaserTimeStamp>; only the .value() of t1/t2/t3 is kept
+  auto const& timeMap = values.getTimeMap();
+  for (unsigned int its = 0; its < timeMap.size(); ++its) {
+    t1_[its] = timeMap[its].t1.value();
+    t2_[its] = timeMap[its].t2.value();
+    t3_[its] = timeMap[its].t3.value();
+  }
+}
+
+EcalLaserAPDPNRatiosGPU::Product::~Product() {
+  // free the six device arrays (p1-p3, t1-t3); the status of each cudaFree is handed to cudaCheck
+  cudaCheck(cudaFree(p1));
+  cudaCheck(cudaFree(p2));
+  cudaCheck(cudaFree(p3));
+  cudaCheck(cudaFree(t1));
+  cudaCheck(cudaFree(t2));
+  cudaCheck(cudaFree(t3));
+}
+
+// Device-side view of the EcalLaserAPDPNRatios payload.
+//
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates the six device arrays of the
+// Product and queues their host-to-device copies on the stream it receives. No synchronization is issued here.
+// NOTE(review): presumably the callback only runs while the data for the current device is missing - confirm.
+EcalLaserAPDPNRatiosGPU::Product const& EcalLaserAPDPNRatiosGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalLaserAPDPNRatiosGPU::Product& devProduct, cudaStream_t stream) {
+        // byte sizes: p1-p3 are sized with sizeof(float), t1-t3 with sizeof(edm::TimeValue_t)
+        auto const bytesP1 = p1_.size() * sizeof(float);
+        auto const bytesP2 = p2_.size() * sizeof(float);
+        auto const bytesP3 = p3_.size() * sizeof(float);
+
+        auto const bytesT1 = t1_.size() * sizeof(edm::TimeValue_t);
+        auto const bytesT2 = t2_.size() * sizeof(edm::TimeValue_t);
+        auto const bytesT3 = t3_.size() * sizeof(edm::TimeValue_t);
+
+        // device allocations
+        cudaCheck(cudaMalloc((void**)&devProduct.p1, bytesP1));
+        cudaCheck(cudaMalloc((void**)&devProduct.p2, bytesP2));
+        cudaCheck(cudaMalloc((void**)&devProduct.p3, bytesP3));
+
+        cudaCheck(cudaMalloc((void**)&devProduct.t1, bytesT1));
+        cudaCheck(cudaMalloc((void**)&devProduct.t2, bytesT2));
+        cudaCheck(cudaMalloc((void**)&devProduct.t3, bytesT3));
+
+        // asynchronous host -> device copies
+        cudaCheck(cudaMemcpyAsync(devProduct.p1, p1_.data(), bytesP1, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(devProduct.p2, p2_.data(), bytesP2, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(devProduct.p3, p3_.data(), bytesP3, cudaMemcpyHostToDevice, stream));
+
+        cudaCheck(cudaMemcpyAsync(devProduct.t1, t1_.data(), bytesT1, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(devProduct.t2, t2_.data(), bytesT2, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(devProduct.t3, t3_.data(), bytesT3, cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc
new file mode 100644
index 0000000000000..8f77cf48fe1d1
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc
@@ -0,0 +1,40 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// Takes the barrel and endcap items of the conditions object for the later upload in getProduct.
+// NOTE(review): if valuesEB_/valuesEE_ are references (declared outside this file), `values` must outlive us.
+EcalLaserAPDPNRatiosRefGPU::EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
+
+// free the device buffer; the cudaFree status is handed to cudaCheck
+EcalLaserAPDPNRatiosRefGPU::Product::~Product() { cudaCheck(cudaFree(values)); }
+
+// Device-side view of the EcalLaserAPDPNRatiosRef payload.
+//
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates Product::values and queues the
+// host-to-device copies on the stream it receives. The buffer is a single float array: [ barrel | endcap ].
+// No stream synchronization is issued here.
+// NOTE(review): presumably the callback only runs while the data for the current device is missing - confirm.
+EcalLaserAPDPNRatiosRefGPU::Product const& EcalLaserAPDPNRatiosRefGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalLaserAPDPNRatiosRefGPU::Product& devProduct, cudaStream_t stream) {
+        // element counts of the two host containers
+        auto const nEB = valuesEB_.size();
+        auto const nEE = valuesEE_.size();
+
+        // a single device allocation holds barrel and endcap together
+        cudaCheck(cudaMalloc((void**)&devProduct.values, (nEB + nEE) * sizeof(float)));
+
+        // barrel goes to the front of the buffer
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.values, valuesEB_.data(), nEB * sizeof(float), cudaMemcpyHostToDevice, stream));
+
+        // endcap follows, nEB elements (not bytes) past the start
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.values + nEB, valuesEE_.data(), nEE * sizeof(float), cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosRefGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc
new file mode 100644
index 0000000000000..91de441bff683
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc
@@ -0,0 +1,40 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// Takes the barrel and endcap items of the conditions object for the later upload in getProduct.
+// NOTE(review): if valuesEB_/valuesEE_ are references (declared outside this file), `values` must outlive us.
+EcalLaserAlphasGPU::EcalLaserAlphasGPU(EcalLaserAlphas const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
+
+// free the device buffer; the cudaFree status is handed to cudaCheck
+EcalLaserAlphasGPU::Product::~Product() { cudaCheck(cudaFree(values)); }
+
+// Device-side view of the EcalLaserAlphas payload.
+//
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates Product::values and queues the
+// host-to-device copies on the stream it receives. The buffer is a single float array: [ barrel | endcap ].
+// No stream synchronization is issued here.
+// NOTE(review): presumably the callback only runs while the data for the current device is missing - confirm.
+EcalLaserAlphasGPU::Product const& EcalLaserAlphasGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalLaserAlphasGPU::Product& devProduct, cudaStream_t stream) {
+        // element counts of the two host containers
+        auto const nEB = valuesEB_.size();
+        auto const nEE = valuesEE_.size();
+
+        // a single device allocation holds barrel and endcap together
+        cudaCheck(cudaMalloc((void**)&devProduct.values, (nEB + nEE) * sizeof(float)));
+
+        // barrel goes to the front of the buffer
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.values, valuesEB_.data(), nEB * sizeof(float), cudaMemcpyHostToDevice, stream));
+
+        // endcap follows, nEB elements (not bytes) past the start
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.values + nEB, valuesEE_.data(), nEE * sizeof(float), cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalLaserAlphasGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
new file mode 100644
index 0000000000000..0af2a9044ab65
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
@@ -0,0 +1,84 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalLinearCorrectionsGPU::EcalLinearCorrectionsGPU(EcalLinearCorrections const& values)
+    : p1_(values.getValueMap().size()),
+      p2_(values.getValueMap().size()),
+      p3_(values.getValueMap().size()),
+      t1_(values.getTimeMap().size()),
+      t2_(values.getTimeMap().size()),
+      t3_(values.getTimeMap().size()) {
+  // correction parameters: barrel entries fill the front of p1_/p2_/p3_, endcap entries are appended after them
+  auto const& valueMap = values.getValueMap();
+  auto const& corrEB = valueMap.barrelItems();
+  auto const& corrEE = valueMap.endcapItems();
+  for (unsigned int ieb = 0; ieb < corrEB.size(); ++ieb) {
+    p1_[ieb] = corrEB[ieb].p1;
+    p2_[ieb] = corrEB[ieb].p2;
+    p3_[ieb] = corrEB[ieb].p3;
+  }
+  for (unsigned int iee = 0; iee < corrEE.size(); ++iee) {
+    p1_[corrEB.size() + iee] = corrEE[iee].p1;
+    p2_[corrEB.size() + iee] = corrEE[iee].p2;
+    p3_[corrEB.size() + iee] = corrEE[iee].p3;
+  }
+
+  // time stamps: a plain std::vector<EcalLaserTimeStamp>; only the .value() of t1/t2/t3 is kept
+  auto const& timeMap = values.getTimeMap();
+  for (unsigned int its = 0; its < timeMap.size(); ++its) {
+    t1_[its] = timeMap[its].t1.value();
+    t2_[its] = timeMap[its].t2.value();
+    t3_[its] = timeMap[its].t3.value();
+  }
+}
+
+EcalLinearCorrectionsGPU::Product::~Product() {
+  // free the six device arrays (p1-p3, t1-t3); the status of each cudaFree is handed to cudaCheck
+  cudaCheck(cudaFree(p1));
+  cudaCheck(cudaFree(p2));
+  cudaCheck(cudaFree(p3));
+  cudaCheck(cudaFree(t1));
+  cudaCheck(cudaFree(t2));
+  cudaCheck(cudaFree(t3));
+}
+
+// Device-side view of the EcalLinearCorrections payload.
+//
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates the six device arrays of the
+// Product and queues their host-to-device copies on the stream it receives. No synchronization is issued here.
+// NOTE(review): presumably the callback only runs while the data for the current device is missing - confirm.
+EcalLinearCorrectionsGPU::Product const& EcalLinearCorrectionsGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalLinearCorrectionsGPU::Product& devProduct, cudaStream_t stream) {
+        // byte sizes: p1-p3 are sized with sizeof(float), t1-t3 with sizeof(edm::TimeValue_t)
+        auto const bytesP1 = p1_.size() * sizeof(float);
+        auto const bytesP2 = p2_.size() * sizeof(float);
+        auto const bytesP3 = p3_.size() * sizeof(float);
+
+        auto const bytesT1 = t1_.size() * sizeof(edm::TimeValue_t);
+        auto const bytesT2 = t2_.size() * sizeof(edm::TimeValue_t);
+        auto const bytesT3 = t3_.size() * sizeof(edm::TimeValue_t);
+
+        // device allocations
+        cudaCheck(cudaMalloc((void**)&devProduct.p1, bytesP1));
+        cudaCheck(cudaMalloc((void**)&devProduct.p2, bytesP2));
+        cudaCheck(cudaMalloc((void**)&devProduct.p3, bytesP3));
+
+        cudaCheck(cudaMalloc((void**)&devProduct.t1, bytesT1));
+        cudaCheck(cudaMalloc((void**)&devProduct.t2, bytesT2));
+        cudaCheck(cudaMalloc((void**)&devProduct.t3, bytesT3));
+
+        // asynchronous host -> device copies
+        cudaCheck(cudaMemcpyAsync(devProduct.p1, p1_.data(), bytesP1, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(devProduct.p2, p2_.data(), bytesP2, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(devProduct.p3, p3_.data(), bytesP3, cudaMemcpyHostToDevice, stream));
+
+        cudaCheck(cudaMemcpyAsync(devProduct.t1, t1_.data(), bytesT1, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(devProduct.t2, t2_.data(), bytesT2, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(devProduct.t3, t3_.data(), bytesT3, cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalLinearCorrectionsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalMultifitParametersGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalMultifitParametersGPU.cc
new file mode 100644
index 0000000000000..010da6444b614
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalMultifitParametersGPU.cc
@@ -0,0 +1,66 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalMultifitParametersGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// Builds the host-side fit-parameter containers from the module configuration.
+// The parameters EBamplitudeFitParameters, EEamplitudeFitParameters, EBtimeFitParameters and
+// EEtimeFitParameters are each requested as std::vector<double>.
+EcalMultifitParametersGPU::EcalMultifitParametersGPU(edm::ParameterSet const& ps) {
+  auto const& ampEB = ps.getParameter<std::vector<double>>("EBamplitudeFitParameters");
+  auto const& ampEE = ps.getParameter<std::vector<double>>("EEamplitudeFitParameters");
+  auto const& timeEB = ps.getParameter<std::vector<double>>("EBtimeFitParameters");
+  auto const& timeEE = ps.getParameter<std::vector<double>>("EEtimeFitParameters");
+
+  // assign() leaves every member holding exactly the configured values, in the configured order
+  amplitudeFitParametersEB_.assign(ampEB.begin(), ampEB.end());
+  amplitudeFitParametersEE_.assign(ampEE.begin(), ampEE.end());
+
+  timeFitParametersEB_.assign(timeEB.begin(), timeEB.end());
+  timeFitParametersEE_.assign(timeEE.begin(), timeEE.end());
+}
+
+EcalMultifitParametersGPU::Product::~Product() {  // frees the four device arrays; each status goes to cudaCheck
+  cudaCheck(cudaFree(amplitudeFitParametersEB));
+  cudaCheck(cudaFree(amplitudeFitParametersEE));
+  cudaCheck(cudaFree(timeFitParametersEB));
+  cudaCheck(cudaFree(timeFitParametersEE));
+}
+
+// Device-side view of the multifit parameters.
+//
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates the four device arrays of the
+// Product and queues their host-to-device copies on the stream it receives. All sizes use sizeof(double).
+// No stream synchronization is issued here.
+// NOTE(review): presumably the callback only runs while the data for the current device is missing - confirm.
+EcalMultifitParametersGPU::Product const& EcalMultifitParametersGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalMultifitParametersGPU::Product& devProduct, cudaStream_t stream) {
+        // buffer sizes in bytes
+        auto const bytesAmpEB = amplitudeFitParametersEB_.size() * sizeof(double);
+        auto const bytesAmpEE = amplitudeFitParametersEE_.size() * sizeof(double);
+        auto const bytesTimeEB = timeFitParametersEB_.size() * sizeof(double);
+        auto const bytesTimeEE = timeFitParametersEE_.size() * sizeof(double);
+
+        // device allocations
+        cudaCheck(cudaMalloc((void**)&devProduct.amplitudeFitParametersEB, bytesAmpEB));
+        cudaCheck(cudaMalloc((void**)&devProduct.amplitudeFitParametersEE, bytesAmpEE));
+        cudaCheck(cudaMalloc((void**)&devProduct.timeFitParametersEB, bytesTimeEB));
+        cudaCheck(cudaMalloc((void**)&devProduct.timeFitParametersEE, bytesTimeEE));
+
+        // asynchronous host -> device copies
+        cudaCheck(cudaMemcpyAsync(devProduct.amplitudeFitParametersEB,
+                                  amplitudeFitParametersEB_.data(), bytesAmpEB, cudaMemcpyHostToDevice, stream));
+
+        cudaCheck(cudaMemcpyAsync(devProduct.amplitudeFitParametersEE,
+                                  amplitudeFitParametersEE_.data(), bytesAmpEE, cudaMemcpyHostToDevice, stream));
+
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.timeFitParametersEB, timeFitParametersEB_.data(), bytesTimeEB, cudaMemcpyHostToDevice, stream));
+
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.timeFitParametersEE, timeFitParametersEE_.data(), bytesTimeEE, cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalMultifitParametersGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc
new file mode 100644
index 0000000000000..9e3284cd9c7c8
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc
@@ -0,0 +1,94 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalPedestalsGPU::EcalPedestalsGPU(EcalPedestals const& pedestals)
+    : mean_x12_(pedestals.size()),
+      rms_x12_(pedestals.size()),
+      mean_x6_(pedestals.size()),
+      rms_x6_(pedestals.size()),
+      mean_x1_(pedestals.size()),
+      rms_x1_(pedestals.size()) {
+  // flatten the pedestals into six arrays: barrel entries first, endcap entries appended after them
+  auto const& eb = pedestals.barrelItems();
+  auto const& ee = pedestals.endcapItems();
+  auto const nBarrel = eb.size();
+  for (unsigned int ieb = 0; ieb < nBarrel; ++ieb) {
+    mean_x12_[ieb] = eb[ieb].mean_x12;
+    rms_x12_[ieb] = eb[ieb].rms_x12;
+    mean_x6_[ieb] = eb[ieb].mean_x6;
+    rms_x6_[ieb] = eb[ieb].rms_x6;
+    mean_x1_[ieb] = eb[ieb].mean_x1;
+    rms_x1_[ieb] = eb[ieb].rms_x1;
+  }
+
+  // endcap slots start at index nBarrel
+  for (unsigned int iee = 0; iee < ee.size(); ++iee) {
+    mean_x12_[nBarrel + iee] = ee[iee].mean_x12;
+    rms_x12_[nBarrel + iee] = ee[iee].rms_x12;
+    mean_x6_[nBarrel + iee] = ee[iee].mean_x6;
+    rms_x6_[nBarrel + iee] = ee[iee].rms_x6;
+    mean_x1_[nBarrel + iee] = ee[iee].mean_x1;
+    rms_x1_[nBarrel + iee] = ee[iee].rms_x1;
+  }
+}
+
+EcalPedestalsGPU::Product::~Product() {
+  // free the six device arrays (mean and rms for x12, x6, x1); the status of each cudaFree is handed to cudaCheck
+  cudaCheck(cudaFree(mean_x12));
+  cudaCheck(cudaFree(rms_x12));
+  cudaCheck(cudaFree(mean_x6));
+  cudaCheck(cudaFree(rms_x6));
+  cudaCheck(cudaFree(mean_x1));
+  cudaCheck(cudaFree(rms_x1));
+}
+
+// Device-side view of the pedestals.
+//
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates the six device arrays of the
+// Product and queues their host-to-device copies on the stream it receives. Every allocation is sized from the
+// host vector that is copied into it, so an allocation and its copy always agree on the byte count.
+// No stream synchronization is issued here.
+EcalPedestalsGPU::Product const& EcalPedestalsGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalPedestalsGPU::Product& devProduct, cudaStream_t stream) {
+        // buffer sizes in bytes, one per host vector
+        auto const bytesMeanX12 = mean_x12_.size() * sizeof(float);
+        auto const bytesRmsX12 = rms_x12_.size() * sizeof(float);
+
+        auto const bytesMeanX6 = mean_x6_.size() * sizeof(float);
+        auto const bytesRmsX6 = rms_x6_.size() * sizeof(float);
+
+        auto const bytesMeanX1 = mean_x1_.size() * sizeof(float);
+        auto const bytesRmsX1 = rms_x1_.size() * sizeof(float);
+
+        // device allocations
+        cudaCheck(cudaMalloc((void**)&devProduct.mean_x12, bytesMeanX12));
+        cudaCheck(cudaMalloc((void**)&devProduct.rms_x12, bytesRmsX12));
+
+        cudaCheck(cudaMalloc((void**)&devProduct.mean_x6, bytesMeanX6));
+        cudaCheck(cudaMalloc((void**)&devProduct.rms_x6, bytesRmsX6));
+
+        cudaCheck(cudaMalloc((void**)&devProduct.mean_x1, bytesMeanX1));
+        cudaCheck(cudaMalloc((void**)&devProduct.rms_x1, bytesRmsX1));
+
+        // asynchronous host -> device copies
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.mean_x12, mean_x12_.data(), bytesMeanX12, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.rms_x12, rms_x12_.data(), bytesRmsX12, cudaMemcpyHostToDevice, stream));
+
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.mean_x6, mean_x6_.data(), bytesMeanX6, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.rms_x6, rms_x6_.data(), bytesRmsX6, cudaMemcpyHostToDevice, stream));
+
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.mean_x1, mean_x1_.data(), bytesMeanX1, cudaMemcpyHostToDevice, stream));
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.rms_x1, rms_x1_.data(), bytesRmsX1, cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalPedestalsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc
new file mode 100644
index 0000000000000..bbeda99652e22
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc
@@ -0,0 +1,42 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// Takes the barrel and endcap items of the conditions object for the later upload in getProduct.
+// NOTE(review): if valuesEB_/valuesEE_ are references (declared outside this file), `values` must outlive us.
+EcalPulseCovariancesGPU::EcalPulseCovariancesGPU(EcalPulseCovariances const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
+
+// free the device buffer; the cudaFree status is handed to cudaCheck
+EcalPulseCovariancesGPU::Product::~Product() { cudaCheck(cudaFree(values)); }
+
+// Device-side view of the pulse covariances.
+//
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates Product::values and queues the
+// host-to-device copies on the stream it receives. The buffer holds EcalPulseCovariance items: [ barrel | endcap ].
+// No stream synchronization is issued here.
+// NOTE(review): presumably the callback only runs while the data for the current device is missing - confirm.
+EcalPulseCovariancesGPU::Product const& EcalPulseCovariancesGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalPulseCovariancesGPU::Product& devProduct, cudaStream_t stream) {
+        // byte sizes of the two host containers
+        auto const bytesEB = valuesEB_.size() * sizeof(EcalPulseCovariance);
+        auto const bytesEE = valuesEE_.size() * sizeof(EcalPulseCovariance);
+
+        // a single device allocation holds barrel and endcap together
+        cudaCheck(cudaMalloc((void**)&devProduct.values, bytesEE + bytesEB));
+
+        // where the endcap part starts, counted in EcalPulseCovariance items (not bytes)
+        uint32_t const endcapStart = valuesEB_.size();
+
+        // barrel first ...
+        cudaCheck(
+            cudaMemcpyAsync(devProduct.values, valuesEB_.data(), bytesEB, cudaMemcpyHostToDevice, stream));
+        // ... then endcap, right behind it
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.values + endcapStart, valuesEE_.data(), bytesEE, cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalPulseCovariancesGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc
new file mode 100644
index 0000000000000..aee122a01627d
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc
@@ -0,0 +1,42 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// Takes the barrel and endcap items of the conditions object for the later upload in getProduct.
+// NOTE(review): if valuesEB_/valuesEE_ are references (declared outside this file), `values` must outlive us.
+EcalPulseShapesGPU::EcalPulseShapesGPU(EcalPulseShapes const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
+
+// free the device buffer; the cudaFree status is handed to cudaCheck
+EcalPulseShapesGPU::Product::~Product() { cudaCheck(cudaFree(values)); }
+
+// Device-side view of the pulse shapes.
+//
+// Forwards to product_.dataForCurrentDeviceAsync with a callback that allocates Product::values and queues the
+// host-to-device copies on the stream it receives. The buffer holds EcalPulseShape items: [ barrel | endcap ].
+// No stream synchronization is issued here.
+// NOTE(review): presumably the callback only runs while the data for the current device is missing - confirm.
+EcalPulseShapesGPU::Product const& EcalPulseShapesGPU::getProduct(cudaStream_t cudaStream) const {
+  return product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalPulseShapesGPU::Product& devProduct, cudaStream_t stream) {
+        // byte sizes of the two host containers
+        auto const bytesEB = valuesEB_.size() * sizeof(EcalPulseShape);
+        auto const bytesEE = valuesEE_.size() * sizeof(EcalPulseShape);
+
+        // a single device allocation holds barrel and endcap together
+        cudaCheck(cudaMalloc((void**)&devProduct.values, bytesEE + bytesEB));
+
+        // where the endcap part starts, counted in EcalPulseShape items (not bytes)
+        uint32_t const endcapStart = valuesEB_.size();
+
+        // barrel first ...
+        cudaCheck(
+            cudaMemcpyAsync(devProduct.values, valuesEB_.data(), bytesEB, cudaMemcpyHostToDevice, stream));
+        // ... then endcap, right behind it
+        cudaCheck(cudaMemcpyAsync(
+            devProduct.values + endcapStart, valuesEE_.data(), bytesEE, cudaMemcpyHostToDevice, stream));
+      });
+}
+
+TYPELOOKUP_DATA_REG(EcalPulseShapesGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitParametersGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitParametersGPU.cc
new file mode 100644
index 0000000000000..0f6812d6d6ffe
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitParametersGPU.cc
@@ -0,0 +1,82 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRecHitParametersGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "CommonTools/Utils/interface/StringToEnumValue.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
+
+EcalRecHitParametersGPU::EcalRecHitParametersGPU(edm::ParameterSet const& ps) {
+  auto const& ChannelStatusToBeExcluded = StringToEnumValue<EcalChannelStatusCode::Code>(  // names -> enum values
+      ps.getParameter<std::vector<std::string>>("ChannelStatusToBeExcluded"));
+
+  ChannelStatusToBeExcluded_.resize(ChannelStatusToBeExcluded.size());
+  std::copy(ChannelStatusToBeExcluded.begin(), ChannelStatusToBeExcluded.end(), ChannelStatusToBeExcluded_.begin());
+
+  //     https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.cc
+
+  // Translate the string representation of flagsMapDBReco into enum values
+  const edm::ParameterSet& p = ps.getParameter<edm::ParameterSet>("flagsMapDBReco");
+  std::vector<std::string> recoflagbitsStrings = p.getParameterNames();  // each name is parsed as a reco flag
+
+  for (unsigned int i = 0; i != recoflagbitsStrings.size(); ++i) {  // one iteration per reco flag name
+    EcalRecHit::Flags recoflagbit = (EcalRecHit::Flags)StringToEnumValue<EcalRecHit::Flags>(recoflagbitsStrings[i]);
+    std::vector<std::string> dbstatus_s = p.getParameter<std::vector<std::string>>(recoflagbitsStrings[i]);
+    //     std::vector<uint32_t> dbstatuses;
+    for (unsigned int j = 0; j != dbstatus_s.size(); ++j) {
+      EcalChannelStatusCode::Code dbstatus =
+          (EcalChannelStatusCode::Code)StringToEnumValue<EcalChannelStatusCode::Code>(dbstatus_s[j]);
+      expanded_v_DB_reco_flags_.push_back(dbstatus);  // flattened: statuses of all flags stored back to back
+    }
+
+    expanded_Sizes_v_DB_reco_flags_.push_back(dbstatus_s.size());  // how many statuses this flag contributed
+    expanded_flagbit_v_DB_reco_flags_.push_back(recoflagbit);      // the flag those statuses belong to
+  }
+}
+
+EcalRecHitParametersGPU::Product::~Product() {  // frees the four device arrays allocated in getProduct()
+  cudaCheck(cudaFree(ChannelStatusToBeExcluded));
+  cudaCheck(cudaFree(expanded_v_DB_reco_flags));
+  cudaCheck(cudaFree(expanded_Sizes_v_DB_reco_flags));
+  cudaCheck(cudaFree(expanded_flagbit_v_DB_reco_flags));
+}
+
+EcalRecHitParametersGPU::Product const& EcalRecHitParametersGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(  // lambda below allocates and uploads the data
+      cudaStream, [this](EcalRecHitParametersGPU::Product& product, cudaStream_t cudaStream) {
+        // one device array per host vector (element sizes assumed to match the header's vector types - TODO confirm)
+        cudaCheck(cudaMalloc((void**)&product.ChannelStatusToBeExcluded,
+                             this->ChannelStatusToBeExcluded_.size() * sizeof(int)));
+        cudaCheck(cudaMalloc((void**)&product.expanded_v_DB_reco_flags,
+                             this->expanded_v_DB_reco_flags_.size() * sizeof(int)));
+        cudaCheck(cudaMalloc((void**)&product.expanded_Sizes_v_DB_reco_flags,
+                             this->expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&product.expanded_flagbit_v_DB_reco_flags,
+                             this->expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t)));
+
+        // host-to-device copies issued on cudaStream, in the same order as the allocations above
+        cudaCheck(cudaMemcpyAsync(product.ChannelStatusToBeExcluded,
+                                  this->ChannelStatusToBeExcluded_.data(),
+                                  this->ChannelStatusToBeExcluded_.size() * sizeof(int),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.expanded_v_DB_reco_flags,
+                                  this->expanded_v_DB_reco_flags_.data(),
+                                  this->expanded_v_DB_reco_flags_.size() * sizeof(int),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.expanded_Sizes_v_DB_reco_flags,
+                                  this->expanded_Sizes_v_DB_reco_flags_.data(),
+                                  this->expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.expanded_flagbit_v_DB_reco_flags,
+                                  this->expanded_flagbit_v_DB_reco_flags_.data(),
+                                  this->expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalRecHitParametersGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc
new file mode 100644
index 0000000000000..5f01068f95186
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc
@@ -0,0 +1,34 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalRechitADCToGeVConstantGPU::EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const& values)
+    : adc2gev_(2)  // size is 2, one for EB and one for EE
+{
+  adc2gev_[0] = values.getEBValue();  // index 0: barrel (EB) value
+  adc2gev_[1] = values.getEEValue();  // index 1: endcap (EE) value
+}
+
+EcalRechitADCToGeVConstantGPU::Product::~Product() {
+  // free the device array that getProduct() allocates with cudaMalloc
+  cudaCheck(cudaFree(adc2gev));
+}
+
+EcalRechitADCToGeVConstantGPU::Product const& EcalRechitADCToGeVConstantGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(  // lambda below allocates and uploads the data
+      cudaStream, [this](EcalRechitADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) {
+        // device array with one float per adc2gev_ entry
+        cudaCheck(cudaMalloc((void**)&product.adc2gev, this->adc2gev_.size() * sizeof(float)));
+        // host-to-device copy of adc2gev_, issued on cudaStream via cudaMemcpyAsync
+        cudaCheck(cudaMemcpyAsync(product.adc2gev,
+                                  this->adc2gev_.data(),
+                                  this->adc2gev_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalRechitADCToGeVConstantGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
new file mode 100644
index 0000000000000..1e6801fbd326a
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
@@ -0,0 +1,42 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalRechitChannelStatusGPU::EcalRechitChannelStatusGPU(EcalChannelStatus const& values) : status_(values.size()) {
+  // barrel (EB) encoded status codes go first, at indices [0, barrelValues.size())
+  auto const& barrelValues = values.barrelItems();
+  for (unsigned int i = 0; i < barrelValues.size(); i++) {
+    status_[i] = barrelValues[i].getEncodedStatusCode();
+  }
+
+  // endcap (EE) codes follow the barrel ones; presumably values.size() == EB + EE item counts - TODO confirm
+  auto const& endcapValues = values.endcapItems();
+  auto const offset = barrelValues.size();  // first EE slot in status_
+  for (unsigned int i = 0; i < endcapValues.size(); i++) {
+    status_[offset + i] = endcapValues[i].getEncodedStatusCode();
+  }
+}
+
+EcalRechitChannelStatusGPU::Product::~Product() {
+  // free the device array that getProduct() allocates with cudaMalloc
+  cudaCheck(cudaFree(status));
+}
+
+EcalRechitChannelStatusGPU::Product const& EcalRechitChannelStatusGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(  // lambda below allocates and uploads the data
+      cudaStream, [this](EcalRechitChannelStatusGPU::Product& product, cudaStream_t cudaStream) {
+        // device array with one uint16_t per status_ entry (EB entries first, then EE, as filled by the constructor)
+        cudaCheck(cudaMalloc((void**)&product.status, this->status_.size() * sizeof(uint16_t)));
+        // host-to-device copy of status_, issued on cudaStream via cudaMemcpyAsync
+        cudaCheck(cudaMemcpyAsync(product.status,
+                                  this->status_.data(),
+                                  this->status_.size() * sizeof(uint16_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalRechitChannelStatusGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc
new file mode 100644
index 0000000000000..2a98067f51d9e
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc
@@ -0,0 +1,76 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalSamplesCorrelationGPU::EcalSamplesCorrelationGPU(EcalSamplesCorrelation const& values)
+    : EBG12SamplesCorrelation_{values.EBG12SamplesCorrelation},  // each member mirrors the same-named field
+      EBG6SamplesCorrelation_{values.EBG6SamplesCorrelation},
+      EBG1SamplesCorrelation_{values.EBG1SamplesCorrelation},
+      EEG12SamplesCorrelation_{values.EEG12SamplesCorrelation},
+      EEG6SamplesCorrelation_{values.EEG6SamplesCorrelation},
+      EEG1SamplesCorrelation_{values.EEG1SamplesCorrelation} {}
+
+EcalSamplesCorrelationGPU::Product::~Product() {
+  // free the six device arrays that getProduct() allocates with cudaMalloc
+  cudaCheck(cudaFree(EBG12SamplesCorrelation));
+  cudaCheck(cudaFree(EBG6SamplesCorrelation));
+  cudaCheck(cudaFree(EBG1SamplesCorrelation));
+  cudaCheck(cudaFree(EEG12SamplesCorrelation));
+  cudaCheck(cudaFree(EEG6SamplesCorrelation));
+  cudaCheck(cudaFree(EEG1SamplesCorrelation));
+}
+
+EcalSamplesCorrelationGPU::Product const& EcalSamplesCorrelationGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(  // lambda below allocates and uploads the data
+      cudaStream, [this](EcalSamplesCorrelationGPU::Product& product, cudaStream_t cudaStream) {
+        // one device array of doubles per host vector (EB then EE; G12, G6, G1 within each)
+        cudaCheck(cudaMalloc((void**)&product.EBG12SamplesCorrelation,
+                             this->EBG12SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(
+            cudaMalloc((void**)&product.EBG6SamplesCorrelation, this->EBG6SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(
+            cudaMalloc((void**)&product.EBG1SamplesCorrelation, this->EBG1SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(cudaMalloc((void**)&product.EEG12SamplesCorrelation,
+                             this->EEG12SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(
+            cudaMalloc((void**)&product.EEG6SamplesCorrelation, this->EEG6SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(
+            cudaMalloc((void**)&product.EEG1SamplesCorrelation, this->EEG1SamplesCorrelation_.size() * sizeof(double)));
+        // host-to-device copies issued on cudaStream, in the same order as the allocations above
+        cudaCheck(cudaMemcpyAsync(product.EBG12SamplesCorrelation,
+                                  this->EBG12SamplesCorrelation_.data(),
+                                  this->EBG12SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EBG6SamplesCorrelation,
+                                  this->EBG6SamplesCorrelation_.data(),
+                                  this->EBG6SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EBG1SamplesCorrelation,
+                                  this->EBG1SamplesCorrelation_.data(),
+                                  this->EBG1SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EEG12SamplesCorrelation,
+                                  this->EEG12SamplesCorrelation_.data(),
+                                  this->EEG12SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EEG6SamplesCorrelation,
+                                  this->EEG6SamplesCorrelation_.data(),
+                                  this->EEG6SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EEG1SamplesCorrelation,
+                                  this->EEG1SamplesCorrelation_.data(),
+                                  this->EEG1SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalSamplesCorrelationGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc
new file mode 100644
index 0000000000000..9ab0a6302a9c4
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc
@@ -0,0 +1,61 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalTimeBiasCorrectionsGPU::EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const& values)
+    : EBTimeCorrAmplitudeBins_{values.EBTimeCorrAmplitudeBins},  // each member mirrors the same-named field
+      EBTimeCorrShiftBins_{values.EBTimeCorrShiftBins},
+      EETimeCorrAmplitudeBins_{values.EETimeCorrAmplitudeBins},
+      EETimeCorrShiftBins_{values.EETimeCorrShiftBins} {}
+
+EcalTimeBiasCorrectionsGPU::Product::~Product() {
+  // free the four device arrays that getProduct() allocates with cudaMalloc
+  cudaCheck(cudaFree(EBTimeCorrAmplitudeBins));
+  cudaCheck(cudaFree(EBTimeCorrShiftBins));
+  cudaCheck(cudaFree(EETimeCorrAmplitudeBins));
+  cudaCheck(cudaFree(EETimeCorrShiftBins));
+}
+
+EcalTimeBiasCorrectionsGPU::Product const& EcalTimeBiasCorrectionsGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(  // lambda below allocates and uploads the data
+      cudaStream, [this](EcalTimeBiasCorrectionsGPU::Product& product, cudaStream_t cudaStream) {
+        // record the amplitude-bin counts in the product so the vector sizes are available later on
+        // TODO: should be removed and the host conditions' objects used directly
+        product.EBTimeCorrAmplitudeBinsSize = this->EBTimeCorrAmplitudeBins_.size();
+        product.EETimeCorrAmplitudeBinsSize = this->EETimeCorrAmplitudeBins_.size();  // shift-bin sizes not recorded
+
+        // one device array of floats per host vector
+        cudaCheck(cudaMalloc((void**)&product.EBTimeCorrAmplitudeBins,
+                             this->EBTimeCorrAmplitudeBins_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.EBTimeCorrShiftBins, this->EBTimeCorrShiftBins_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.EETimeCorrAmplitudeBins,
+                             this->EETimeCorrAmplitudeBins_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.EETimeCorrShiftBins, this->EETimeCorrShiftBins_.size() * sizeof(float)));
+        // host-to-device copies issued on cudaStream, in the same order as the allocations above
+        cudaCheck(cudaMemcpyAsync(product.EBTimeCorrAmplitudeBins,
+                                  this->EBTimeCorrAmplitudeBins_.data(),
+                                  this->EBTimeCorrAmplitudeBins_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EBTimeCorrShiftBins,
+                                  this->EBTimeCorrShiftBins_.data(),
+                                  this->EBTimeCorrShiftBins_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EETimeCorrAmplitudeBins,
+                                  this->EETimeCorrAmplitudeBins_.data(),
+                                  this->EETimeCorrAmplitudeBins_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EETimeCorrShiftBins,
+                                  this->EETimeCorrShiftBins_.data(),
+                                  this->EETimeCorrShiftBins_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalTimeBiasCorrectionsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc
new file mode 100644
index 0000000000000..d724a33f1d4e1
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc
@@ -0,0 +1,40 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalTimeCalibConstantsGPU::EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const& values)  // host-side source
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}  // EB = barrel items, EE = endcap items
+
+EcalTimeCalibConstantsGPU::Product::~Product() {
+  // free the device buffer that getProduct() allocates with cudaMalloc
+  cudaCheck(cudaFree(values));
+}
+
+EcalTimeCalibConstantsGPU::Product const& EcalTimeCalibConstantsGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(  // lambda below allocates and uploads the data
+      cudaStream, [this](EcalTimeCalibConstantsGPU::Product& product, cudaStream_t cudaStream) {
+        // one device buffer of floats sized to hold all EB and all EE values
+        cudaCheck(
+            cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float)));
+
+        // EE data starts right after EB; offset counts floats, not bytes
+        auto const offset = this->valuesEB_.size();
+
+        // host-to-device copies on cudaStream: EB to the start of the buffer, EE at values + offset
+        cudaCheck(cudaMemcpyAsync(product.values,
+                                  this->valuesEB_.data(),
+                                  this->valuesEB_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.values + offset,
+                                  this->valuesEE_.data(),
+                                  this->valuesEE_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalTimeCalibConstantsGPU);
diff --git a/RecoLocalCalo/EcalRecProducers/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/BuildFile.xml
index 25939a2a69b8e..aa19516964fd9 100644
--- a/RecoLocalCalo/EcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/BuildFile.xml
@@ -1,5 +1,11 @@
-<use name="FWCore/Framework"/>
 <use name="clhep"/>
+<use name="cuda"/>
+<use name="CUDADataFormats/EcalRecHitSoA"/>
+<use name="CondFormats/EcalObjects"/>
+<use name="FWCore/Framework"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
 <export>
   <lib name="1"/>
 </export>
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu
new file mode 100644
index 0000000000000..f1b1a53a78a30
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu
@@ -0,0 +1,486 @@
+#include <cstdlib>
+#include <limits>
+
+#include <cuda.h>
+
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
+
+#include "AmplitudeComputationCommonKernels.h"
+#include "KernelHelpers.h"
+
+namespace ecal {
+  namespace multifit {
+
+    ///
+    /// assume kernel launch configuration is
+    /// (MAXSAMPLES * nchannels, blocks)
+    ///
+    __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
+                                                  uint16_t const* digis_in_eb,
+                                                  uint32_t const* dids_eb,
+                                                  uint16_t const* digis_in_ee,
+                                                  uint32_t const* dids_ee,
+                                                  SampleVector* amplitudes,
+                                                  SampleVector* amplitudesForMinimizationEB,
+                                                  SampleVector* amplitudesForMinimizationEE,
+                                                  SampleGainVector* gainsNoise,
+                                                  float const* mean_x1,
+                                                  float const* mean_x12,
+                                                  float const* rms_x12,
+                                                  float const* mean_x6,
+                                                  float const* gain6Over1,
+                                                  float const* gain12Over6,
+                                                  bool* hasSwitchToGain6,
+                                                  bool* hasSwitchToGain1,
+                                                  bool* isSaturated,
+                                                  ::ecal::reco::StorageScalarType* energiesEB,
+                                                  ::ecal::reco::StorageScalarType* energiesEE,
+                                                  ::ecal::reco::StorageScalarType* chi2EB,
+                                                  ::ecal::reco::StorageScalarType* chi2EE,
+                                                  ::ecal::reco::StorageScalarType* g_pedestalEB,
+                                                  ::ecal::reco::StorageScalarType* g_pedestalEE,
+                                                  uint32_t* dids_outEB,
+                                                  uint32_t* dids_outEE,
+                                                  uint32_t* flagsEB,
+                                                  uint32_t* flagsEE,
+                                                  char* acState,
+                                                  BXVectorType* bxs,
+                                                  uint32_t const offsetForHashes,
+                                                  uint32_t const offsetForInputs,
+                                                  bool const gainSwitchUseMaxSampleEB,
+                                                  bool const gainSwitchUseMaxSampleEE,
+                                                  int const nchannels) {
+      constexpr bool dynamicPedestal = false;  //---- default to false, ok
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+      constexpr int sample_max = 5;
+      constexpr int full_pulse_max = 9;
+      int const tx = threadIdx.x + blockIdx.x * blockDim.x;
+      int const nchannels_per_block = blockDim.x / nsamples;
+      int const ch = tx / nsamples;
+      // for accessing input arrays
+      int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+      int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * 10 : tx;
+      // eb is first and then ee
+      auto const* digis_in = ch >= offsetForInputs ? digis_in_ee : digis_in_eb;
+      auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+      int const sample = threadIdx.x % nsamples;
+
+      // need to ref the right ptr
+      // macro is for clarity and safety
+#define ARRANGE(var) auto* var = ch >= offsetForInputs ? var##EE : var##EB
+      ARRANGE(amplitudesForMinimization);
+      ARRANGE(energies);
+      ARRANGE(chi2);
+      ARRANGE(g_pedestal);
+      ARRANGE(dids_out);
+      ARRANGE(flags);
+#undef ARRANGE
+
+      if (ch < nchannels) {
+        // array of 10 x channels per block
+        // TODO: any other way of doing simple reduction
+        // assume bool is 1 byte, should be quite safe
+        extern __shared__ char shared_mem[];
+        bool* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(shared_mem);
+        bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples;
+        bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples;
+        bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples;
+        bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples;
+        char* shr_counts = reinterpret_cast<char*>(shr_hasSwitchToGain0_tmp) + nchannels_per_block * nsamples;
+
+        //
+        // indices
+        //
+        auto const did = DetId{dids[inputCh]};
+        auto const isBarrel = did.subdetId() == EcalBarrel;
+        // TODO offset for ee, 0 for eb
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+        //
+        // pulse shape template
+
+        // will be used in the future for setting state
+        auto const rmsForChecking = rms_x12[hashedId];
+
+        //
+        // amplitudes
+        //
+        int const adc = ecal::mgpa::adc(digis_in[inputTx]);
+        int const gainId = ecal::mgpa::gainId(digis_in[inputTx]);
+        SampleVector::Scalar amplitude = 0.;
+        SampleVector::Scalar pedestal = 0.;
+        SampleVector::Scalar gainratio = 0.;
+
+        // store into shared mem for initialization
+        shr_hasSwitchToGain6[threadIdx.x] = gainId == EcalMgpaBitwiseGain6;
+        shr_hasSwitchToGain1[threadIdx.x] = gainId == EcalMgpaBitwiseGain1;
+        shr_hasSwitchToGain0_tmp[threadIdx.x] = gainId == EcalMgpaBitwiseGain0;
+        shr_hasSwitchToGain0[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x];
+        shr_counts[threadIdx.x] = 0;
+        __syncthreads();
+
+        // non-divergent branch (except for the last 4 threads)
+        if (threadIdx.x <= blockDim.x - 5) {
+          CMS_UNROLL_LOOP
+          for (int i = 0; i < 5; i++)
+            shr_counts[threadIdx.x] += shr_hasSwitchToGain0[threadIdx.x + i];
+        }
+        shr_isSaturated[threadIdx.x] = shr_counts[threadIdx.x] == 5;
+
+        //
+        // unrolled reductions
+        //
+        if (sample < 5) {
+          shr_hasSwitchToGain6[threadIdx.x] =
+              shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 5];
+          shr_hasSwitchToGain1[threadIdx.x] =
+              shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 5];
+
+          // duplication of hasSwitchToGain0 in order not to
+          // introduce another syncthreads
+          shr_hasSwitchToGain0_tmp[threadIdx.x] =
+              shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 5];
+        }
+        __syncthreads();
+
+        if (sample < 2) {
+          // note, both threads per channel take value [3] twice to avoid another if
+          shr_hasSwitchToGain6[threadIdx.x] = shr_hasSwitchToGain6[threadIdx.x] ||
+                                              shr_hasSwitchToGain6[threadIdx.x + 2] ||
+                                              shr_hasSwitchToGain6[threadIdx.x + 3];
+          shr_hasSwitchToGain1[threadIdx.x] = shr_hasSwitchToGain1[threadIdx.x] ||
+                                              shr_hasSwitchToGain1[threadIdx.x + 2] ||
+                                              shr_hasSwitchToGain1[threadIdx.x + 3];
+
+          shr_hasSwitchToGain0_tmp[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x] ||
+                                                  shr_hasSwitchToGain0_tmp[threadIdx.x + 2] ||
+                                                  shr_hasSwitchToGain0_tmp[threadIdx.x + 3];
+
+          // sample < 2 -> first 2 threads of each channel will be used here
+          // => 0 -> will compare 3 and 4 and put into 0
+          // => 1 -> will compare 4 and 5 and put into 1
+          shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4];
+        }
+        __syncthreads();
+
+        bool check_hasSwitchToGain0 = false;
+
+        if (sample == 0) {
+          shr_hasSwitchToGain6[threadIdx.x] =
+              shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1];
+          shr_hasSwitchToGain1[threadIdx.x] =
+              shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1];
+          shr_hasSwitchToGain0_tmp[threadIdx.x] =
+              shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1];
+
+          hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x];
+          hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x];
+
+          // set only for the threadIdx.x corresponding to sample==0
+          check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x];
+
+          shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1];
+          isSaturated[ch] = shr_isSaturated[threadIdx.x + 3];
+        }
+
+        // TODO: w/o this sync, there is a race
+        // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample
+        // check if we can remove it
+        __syncthreads();
+
+        // TODO: divergent branch
+        if (gainId == 0 || gainId == 3) {
+          pedestal = mean_x1[hashedId];
+          gainratio = gain6Over1[hashedId] * gain12Over6[hashedId];
+          gainsNoise[ch](sample) = 2;
+        } else if (gainId == 1) {
+          pedestal = mean_x12[hashedId];
+          gainratio = 1.;
+          gainsNoise[ch](sample) = 0;
+        } else if (gainId == 2) {
+          pedestal = mean_x6[hashedId];
+          gainratio = gain12Over6[hashedId];
+          gainsNoise[ch](sample) = 1;
+        }
+
+        // TODO: compile time constant -> branch should be non-divergent
+        if (dynamicPedestal)
+          amplitude = static_cast<SampleVector::Scalar>(adc) * gainratio;
+        else
+          amplitude = (static_cast<SampleVector::Scalar>(adc) - pedestal) * gainratio;
+        amplitudes[ch][sample] = amplitude;
+
+#ifdef ECAL_RECO_CUDA_DEBUG
+        printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio);
+        if (adc == 0)
+          printf("adc is zero\n");
+#endif
+
+        //
+        // initialization
+        //
+        amplitudesForMinimization[inputCh](sample) = 0;
+        bxs[ch](sample) = sample - 5;
+
+        // select the thread for the max sample
+        //---> hardcoded above to be 5th sample, ok
+        if (sample == sample_max) {
+          //
+          // initialization
+          //
+          acState[ch] = static_cast<char>(MinimizationState::NotFinished);
+          energies[inputCh] = 0;
+          chi2[inputCh] = 0;
+          g_pedestal[inputCh] = 0;
+          uint32_t flag = 0;
+          dids_out[inputCh] = did.rawId();
+
+          // start of this channel in shared mem
+          int const chStart = threadIdx.x - sample_max;
+          // thread for the max sample in shared mem
+          int const threadMax = threadIdx.x;
+          auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE;
+
+          // this flag setting is applied to all of the cases
+          if (shr_hasSwitchToGain6[chStart])
+            flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
+          if (shr_hasSwitchToGain1[chStart])
+            flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
+
+          // this corresponds to cpu branching on lastSampleBeforeSaturation
+          // likely false
+          if (check_hasSwitchToGain0) {
+            // assign for the case some sample having gainId == 0
+            //energies[inputCh] = amplitudes[ch][sample_max];
+            energies[inputCh] = amplitude;
+
+            // check if samples before sample_max have true
+            bool saturated_before_max = false;
+            CMS_UNROLL_LOOP
+            for (char ii = 0; ii < 5; ii++)
+              saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
+
+            // if saturation is in the max sample and not in the first 5
+            if (!saturated_before_max && shr_hasSwitchToGain0[threadMax])
+              energies[inputCh] = 49140;  // 4095 * 12 (maximum ADC range * MultiGainPreAmplifier (MGPA) gain)
+                                          // This is the actual maximum range that is set when we saturate.
+                                          //---- AM FIXME : no pedestal subtraction???
+                                          //It should be "(4095. - pedestal) * gainratio"
+
+            // set state flag to terminate further processing of this channel
+            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+            flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
+            flags[inputCh] = flag;
+            return;
+          }
+
+          // according to cpu version
+          //            auto max_amplitude = amplitudes[ch][sample_max];
+          auto const max_amplitude = amplitude;
+          // according to cpu version
+          auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7];
+          // note, no syncing as the same thread will be accessing here
+          bool hasGainSwitch =
+              shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3];
+
+          // pedestal is final unconditionally
+          g_pedestal[inputCh] = pedestal;
+          if (hasGainSwitch && gainSwitchUseMaxSample) {
+            // thread for sample=0 will access the right guys
+            energies[inputCh] = max_amplitude / shape_value;
+            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+            flags[inputCh] = flag;
+            return;
+          }
+
+          // this happens cause sometimes rms_x12 is 0...
+          // needs to be checked why this is the case
+          // general case here is that noisecov is a Zero matrix
+          if (rmsForChecking == 0) {
+            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+            flags[inputCh] = flag;
+            return;
+          }
+
+          // for the case when no shortcuts were taken
+          flags[inputCh] = flag;
+        }
+      }
+    }
+
+    /// Fill one element (ty, tx) of the noise covariance and pulse matrices of a channel per thread.
+    /// assume kernel launch configuration is
+    /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
+    /// i.e. blockIdx.x is the channel and (threadIdx.y, threadIdx.x) the (row, column) written by this thread
+    __global__ void kernel_prep_2d(SampleGainVector const* gainNoise,
+                                   uint32_t const* dids_eb,
+                                   uint32_t const* dids_ee,
+                                   float const* rms_x12,
+                                   float const* rms_x6,
+                                   float const* rms_x1,
+                                   float const* gain12Over6,
+                                   float const* gain6Over1,
+                                   double const* G12SamplesCorrelationEB,
+                                   double const* G6SamplesCorrelationEB,
+                                   double const* G1SamplesCorrelationEB,
+                                   double const* G12SamplesCorrelationEE,
+                                   double const* G6SamplesCorrelationEE,
+                                   double const* G1SamplesCorrelationEE,
+                                   SampleMatrix* noisecov,
+                                   PulseMatrixType* pulse_matrix,
+                                   EcalPulseShape const* pulse_shape,
+                                   bool const* hasSwitchToGain6,
+                                   bool const* hasSwitchToGain1,
+                                   bool const* isSaturated,
+                                   uint32_t const offsetForHashes,
+                                   uint32_t const offsetForInputs) {
+      int const ch = blockIdx.x;
+      int const tx = threadIdx.x;
+      int const ty = threadIdx.y;
+      constexpr float addPedestalUncertainty = 0.f;
+      constexpr bool dynamicPedestal = false;
+      constexpr bool simplifiedNoiseModelForGainSwitch = true;  //---- default is true
+
+      // to access input arrays (ids and digis only): channels >= offsetForInputs index the EE arrays
+      int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+      auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+
+      bool tmp0 = hasSwitchToGain6[ch];
+      bool tmp1 = hasSwitchToGain1[ch];
+      auto const did = DetId{dids[inputCh]};
+      auto const isBarrel = did.subdetId() == EcalBarrel;
+      auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                     : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+      auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE;
+      auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE;
+      auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE;
+      bool tmp2 = isSaturated[ch];
+      bool hasGainSwitch = tmp0 || tmp1 || tmp2;
+      auto const vidx = std::abs(ty - tx);  // |row - column|: index into the sample-correlation arrays
+
+      // non-divergent branch for all threads per block (hasGainSwitch depends only on ch)
+      if (hasGainSwitch) {
+        // TODO: did not include simplified noise model
+        float noise_value = 0;
+
+        // non-divergent branch - all threads per block
+        // TODO: all of these constants indicate
+        // that these parts could be split into completely different
+        // kernels and run one of them only depending on the config
+        if (simplifiedNoiseModelForGainSwitch) {
+          int isample_max = 5;  // according to cpu defs
+          int gainidx = gainNoise[ch][isample_max];  // gain index at the max sample: 0 -> x12, 1 -> x6, 2 -> x1
+
+          // non-divergent branches
+          if (gainidx == 0)
+            noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx];
+          if (gainidx == 1)
+            noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
+                          G6SamplesCorrelation[vidx];
+          if (gainidx == 2)
+            noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] *
+                          rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx];
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f)
+            noise_value += addPedestalUncertainty * addPedestalUncertainty;
+        } else {
+          int gainidx = 0;
+          char mask = gainidx;
+          int pedestal = gainNoise[ch][ty] == mask ? 1 : 0;  // 0/1 selector: 1 if row ty has this gain index
+          //            NB: gainratio is 1, that is why it does not appear in the formula
+          noise_value += rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx];
+          // non-divergent branch
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+            noise_value += addPedestalUncertainty * addPedestalUncertainty * pedestal;  // gainratio is 1
+          }
+
+          // same contribution for gain index 1, scaled by gain12Over6^2
+          gainidx = 1;
+          mask = gainidx;
+          pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+          noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
+                         pedestal * G6SamplesCorrelation[vidx];
+          // non-divergent branch
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+            noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty *
+                           addPedestalUncertainty * pedestal;
+          }
+
+          // same contribution for gain index 2, scaled by (gain6Over1 * gain12Over6)^2
+          gainidx = 2;
+          mask = gainidx;
+          pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+          float tmp = gain6Over1[hashedId] * gain12Over6[hashedId];
+          noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx];
+          // non-divergent branch
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+            noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal;
+          }
+        }
+
+        noisecov[ch](ty, tx) = noise_value;
+      } else {
+        auto rms = rms_x12[hashedId];
+        float noise_value = rms * rms * G12SamplesCorrelation[vidx];
+        if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+          //----  add fully correlated component to noise covariance to inflate pedestal uncertainty
+          noise_value += addPedestalUncertainty * addPedestalUncertainty;
+        }
+        noisecov[ch](ty, tx) = noise_value;
+      }
+
+      // pulse matrix: elements with posToAccess < 7 are set to 0, the others are read from the pulse shape
+      int const posToAccess = 9 - tx + ty;  // see cpu for reference
+      float const value = posToAccess >= 7 ? pulse_shape[hashedId].pdfval[posToAccess - 7] : 0;
+      pulse_matrix[ch](ty, tx) = value;
+    }
+
+    __global__ void kernel_permute_results(SampleVector* amplitudes,  // in/out: reordered in place per channel
+                                           BXVectorType const* activeBXs,  // bx associated with each input slot
+                                           ::ecal::reco::StorageScalarType* energies,  // out: value moved to slot 5
+                                           char const* acState,  // MinimizationState of each channel
+                                           int const nchannels) {
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices: one global thread per (channel, slot) pair
+      int const tx = threadIdx.x + blockIdx.x * blockDim.x;
+      int const ch = tx / nsamples;
+      int const sampleidx = tx % nsamples;  // this is to address activeBXs
+
+      if (ch >= nchannels)
+        return;  // NOTE(review): threads returning here never reach the __syncthreads() below
+
+      // channels that have amplitude precomputed do not need results to be permuted
+      auto const state = static_cast<MinimizationState>(acState[ch]);
+      if (state == MinimizationState::Precomputed)
+        return;  // NOTE(review): same as above, this return precedes the barrier
+
+      // configure shared memory and copy this thread's value into it
+      extern __shared__ char smem[];
+      SampleVector::Scalar* values = reinterpret_cast<SampleVector::Scalar*>(smem);
+      values[threadIdx.x] = amplitudes[ch](sampleidx);
+      __syncthreads();  // NOTE(review): presumably orders the reads above before the in-place writes below; confirm
+
+      // get the sample for this bx (bx + 5 is the destination slot)
+      auto const sample = static_cast<int>(activeBXs[ch](sampleidx)) + 5;
+
+      // store back to global
+      amplitudes[ch](sample) = values[threadIdx.x];
+
+      // store sample 5 separately
+      // only for the case when minimization was performed
+      // not for cases with precomputed amplitudes
+      if (sample == 5)
+        energies[ch] = values[threadIdx.x];
+    }
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h
new file mode 100644
index 0000000000000..479c623e83f62
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h
@@ -0,0 +1,104 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EigenMatrixTypes_gpu.h"
+
+class EcalPulseShape;
+// forward declarations: complete types are not required by the declarations in this header
+class EcalPulseCovariance;
+class EcalUncalibratedRecHit;
+
+namespace ecal {
+  namespace multifit {
+
+    /// Per-sample preparation of the amplitudes and per-channel initialization of the outputs and state.
+    /// assume kernel launch configuration is
+    /// (MAXSAMPLES * nchannels, blocks)
+    /// TODO: is there a point to split this kernel further to separate reductions
+    /// EB and EE inputs and outputs are passed as separate arrays (..._eb / ..._ee, ...EB / ...EE)
+    __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
+                                                  uint16_t const* digis_in_eb,
+                                                  uint32_t const* dids_eb,
+                                                  uint16_t const* digis_in_ee,
+                                                  uint32_t const* dids_ee,
+                                                  SampleVector* amplitudes,
+                                                  SampleVector* amplitudesForMinimizationEB,
+                                                  SampleVector* amplitudesForMinimizationEE,
+                                                  SampleGainVector* gainsNoise,
+                                                  float const* mean_x1,
+                                                  float const* mean_x12,
+                                                  float const* rms_x12,
+                                                  float const* mean_x6,
+                                                  float const* gain6Over1,
+                                                  float const* gain12Over6,
+                                                  bool* hasSwitchToGain6,
+                                                  bool* hasSwitchToGain1,
+                                                  bool* isSaturated,
+                                                  ::ecal::reco::StorageScalarType* energiesEB,
+                                                  ::ecal::reco::StorageScalarType* energiesEE,
+                                                  ::ecal::reco::StorageScalarType* chi2EB,
+                                                  ::ecal::reco::StorageScalarType* chi2EE,
+                                                  ::ecal::reco::StorageScalarType* pedestalEB,
+                                                  ::ecal::reco::StorageScalarType* pedestalEE,
+                                                  uint32_t* dids_outEB,
+                                                  uint32_t* dids_outEE,
+                                                  uint32_t* flagsEB,
+                                                  uint32_t* flagsEE,
+                                                  char* acState,
+                                                  BXVectorType* bxs,
+                                                  uint32_t const offsetForHashes,
+                                                  uint32_t const offsetForInputs,
+                                                  bool const gainSwitchUseMaxSampleEB,
+                                                  bool const gainSwitchUseMaxSampleEE,
+                                                  int const nchannels);
+
+    /// Fill the per-channel noise covariance matrix (noisecov) and pulse matrix (pulse_matrix).
+    /// assume kernel launch configuration is
+    /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
+    /// i.e. one block per channel and one thread per matrix element
+    __global__ void kernel_prep_2d(SampleGainVector const* gainNoise,
+                                   uint32_t const* dids_eb,
+                                   uint32_t const* dids_ee,
+                                   float const* rms_x12,
+                                   float const* rms_x6,
+                                   float const* rms_x1,
+                                   float const* gain12Over6,
+                                   float const* gain6Over1,
+                                   double const* G12SamplesCorrelationEB,
+                                   double const* G6SamplesCorrelationEB,
+                                   double const* G1SamplesCorrelationEB,
+                                   double const* G12SamplesCorrelationEE,
+                                   double const* G6SamplesCorrelationEE,
+                                   double const* G1SamplesCorrelationEE,
+                                   SampleMatrix* noisecov,
+                                   PulseMatrixType* pulse_matrix,
+                                   EcalPulseShape const* pulse_shape,
+                                   bool const* hasSwitchToGain6,
+                                   bool const* hasSwitchToGain1,
+                                   bool const* isSaturated,
+                                   uint32_t const offsetForHashes,
+                                   uint32_t const offsetForInputs);
+
+    __global__ void kernel_permute_results(SampleVector* amplitudes,  // in/out: reordered in place per channel
+                                           BXVectorType const* activeBXs,  // bx associated with each input slot
+                                           ::ecal::reco::StorageScalarType* energies,  // out: value moved to slot 5
+                                           char const* acState,  // Precomputed channels are left untouched
+                                           int const nchannels);
+
+///
+/// Build an Ecal RecHit.
+/// TODO: Use SoA data structures on the host directly
+/// the reason for removing this from the minimize kernel is to isolate the minimization;
+/// in addition, building an AoS rec hit involves strides... -> bad memory access pattern
+/// (only declared when RUN_BUILD_AOS_RECHIT is defined)
+#ifdef RUN_BUILD_AOS_RECHIT
+    __global__ void kernel_build_rechit(
+        float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels);
+#endif  // RUN_BUILD_AOS_RECHIT
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
new file mode 100644
index 0000000000000..e5eff86d15ec7
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
@@ -0,0 +1,305 @@
+#include <cmath>
+#include <limits>
+
+#include <cuda.h>
+
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
+
+#include "AmplitudeComputationCommonKernels.h"
+#include "AmplitudeComputationKernels.h"
+#include "KernelHelpers.h"
+
+namespace ecal {
+  namespace multifit {
+
+    template <typename MatrixType>  // MatrixType must be indexable as inverse_cov(row, col)
+    __device__ __forceinline__ bool update_covariance(EcalPulseCovariance const& pulse_covariance,
+                                                      MatrixType& inverse_cov,
+                                                      SampleVector const& amplitudes) {
+      constexpr int nsamples = SampleVector::RowsAtCompileTime;
+      constexpr int npulses = BXVectorType::RowsAtCompileTime;
+
+      CMS_UNROLL_LOOP
+      for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) {  // accumulate amplitude^2 * pulse covariance
+        auto const amplitude = amplitudes.coeff(ipulse);
+        if (amplitude == 0)
+          continue;  // a zero amplitude adds nothing
+
+        // FIXME: ipulse - 5 -> ipulse - firstOffset
+        int bx = ipulse - 5;
+        int first_sample_t = std::max(0, bx + 3);  // first row/column touched by this pulse
+        int offset = -3 - bx;                      // shift from sample index to covval index
+
+        auto const value_sq = amplitude * amplitude;
+
+        for (int col = first_sample_t; col < nsamples; col++) {
+          for (int row = col; row < nsamples; row++) {  // row >= col: only the lower triangle is updated
+            inverse_cov(row, col) += value_sq * __ldg(&pulse_covariance.covval[row + offset][col + offset]);
+          }
+        }
+      }
+
+      return true;  // unconditionally true
+    }
+
+    /// Iterative minimization, one thread per channel; channels marked Precomputed are skipped.
+    /// launch ctx parameters are (nchannels / block, blocks)
+    /// TODO: trivial impl for now, there must be a way to improve
+    /// Dynamic shared memory: two buffers of MapSymM<DataType, NPULSES>::total elements per thread.
+    /// Conventions:
+    ///   - amplitudes -> solution vector, what we are fitting for
+    ///   - samples -> raw detector responses
+    ///   - passive constraint - satisfied constraint
+    ///   - active constraint - unsatisfied (yet) constraint
+    ///
+    __global__ void kernel_minimize(uint32_t const* dids_eb,
+                                    uint32_t const* dids_ee,
+                                    SampleMatrix const* __restrict__ noisecov,
+                                    EcalPulseCovariance const* __restrict__ pulse_covariance,
+                                    BXVectorType* bxs,
+                                    SampleVector const* __restrict__ samples,
+                                    SampleVector* amplitudesEB,
+                                    SampleVector* amplitudesEE,
+                                    PulseMatrixType const* __restrict__ pulse_matrix,
+                                    ::ecal::reco::StorageScalarType* chi2sEB,
+                                    ::ecal::reco::StorageScalarType* chi2sEE,
+                                    ::ecal::reco::StorageScalarType* energiesEB,
+                                    ::ecal::reco::StorageScalarType* energiesEE,
+                                    char* acState,
+                                    int nchannels,
+                                    int max_iterations,
+                                    uint32_t const offsetForHashes,
+                                    uint32_t const offsetForInputs) {
+      // FIXME: ecal has 10 samples and 10 pulses....
+      // but this needs to be properly treated and renamed everywhere
+      constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
+      constexpr auto NPULSES = SampleMatrix::ColsAtCompileTime;
+      static_assert(NSAMPLES == NPULSES);
+
+      using DataType = SampleVector::Scalar;
+
+      extern __shared__ char shrmem[];
+      DataType* shrMatrixLForFnnlsStorage =
+          reinterpret_cast<DataType*>(shrmem) + calo::multifit::MapSymM<DataType, NPULSES>::total * threadIdx.x;
+      DataType* shrAtAStorage = reinterpret_cast<DataType*>(shrmem) +
+                                calo::multifit::MapSymM<DataType, NPULSES>::total * (threadIdx.x + blockDim.x);
+
+      // channel (global thread index)
+      int idx = threadIdx.x + blockDim.x * blockIdx.x;
+
+// ref the right ptr: the EE array for idx >= offsetForInputs, the EB array otherwise
+#define ARRANGE(var) auto* var = idx >= offsetForInputs ? var##EE : var##EB
+      ARRANGE(amplitudes);
+      ARRANGE(chi2s);
+      ARRANGE(energies);
+#undef ARRANGE
+
+      if (idx < nchannels) {
+        if (static_cast<MinimizationState>(acState[idx]) == MinimizationState::Precomputed)
+          return;  // nothing to minimize for this channel
+
+        // get the hash
+        int const inputCh = idx >= offsetForInputs ? idx - offsetForInputs : idx;
+        auto const* dids = idx >= offsetForInputs ? dids_ee : dids_eb;
+        auto const did = DetId{dids[inputCh]};
+        auto const isBarrel = did.subdetId() == EcalBarrel;
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+        // inits
+        int iter = 0;
+        int npassive = 0;
+
+        calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
+        CMS_UNROLL_LOOP
+        for (int i = 0; i < NPULSES; ++i)
+          pulseOffsets(i) = i;
+
+        calo::multifit::ColumnVector<NPULSES, DataType> resultAmplitudes;
+        CMS_UNROLL_LOOP
+        for (int counter = 0; counter < NPULSES; counter++)
+          resultAmplitudes(counter) = 0;
+
+        // inits
+        //SampleDecompLLT covariance_decomposition;
+        //SampleMatrix inverse_cov;
+        //        SampleVector::Scalar chi2 = 0, chi2_now = 0;
+        float chi2 = 0, chi2_now = 0;
+
+        // loop until convergence (|delta chi2| < 1e-3) or until max_iterations is reached
+        while (true) {
+          if (iter >= max_iterations)
+            break;
+
+          //inverse_cov = noisecov[idx];
+          //DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
+          DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;  // same buffer is handed to fnnls further down
+          calo::multifit::MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
+          int counter = 0;
+          CMS_UNROLL_LOOP
+          for (int col = 0; col < NSAMPLES; col++) {
+            CMS_UNROLL_LOOP
+            for (int row = col; row < NSAMPLES; row++)
+              covMatrixStorage[counter++] = __ldg(&noisecov[idx].coeffRef(row, col));
+          }
+          update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes);
+
+          // compute actual covariance decomposition
+          //covariance_decomposition.compute(inverse_cov);
+          //auto const& matrixL = covariance_decomposition.matrixL();
+          DataType matrixLStorage[calo::multifit::MapSymM<DataType, NSAMPLES>::total];
+          calo::multifit::MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
+          calo::multifit::compute_decomposition_unrolled(matrixL, covMatrix);
+
+          // L * A = P
+          calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
+          calo::multifit::solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
+
+          // L b = s
+          float reg_b[NSAMPLES];
+          calo::multifit::solve_forward_subst_vector(reg_b, samples[idx], matrixL);
+
+          // FIXME: shared mem
+          //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
+          calo::multifit::MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
+          //SampleMatrix AtA;
+          SampleVector Atb;
+          CMS_UNROLL_LOOP
+          for (int icol = 0; icol < NPULSES; icol++) {
+            float reg_ai[NSAMPLES];
+
+            // load column icol
+            CMS_UNROLL_LOOP
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              reg_ai[counter] = A(counter, icol);
+
+            // compute diagonal
+            float sum = 0.f;
+            CMS_UNROLL_LOOP
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              sum += reg_ai[counter] * reg_ai[counter];
+
+            // store
+            AtA(icol, icol) = sum;
+
+            // go thru the other columns
+            CMS_UNROLL_LOOP
+            for (int j = icol + 1; j < NPULSES; j++) {
+              // load column j
+              float reg_aj[NSAMPLES];
+              CMS_UNROLL_LOOP
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                reg_aj[counter] = A(counter, j);
+
+              // accum
+              float sum = 0.f;
+              CMS_UNROLL_LOOP
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                sum += reg_aj[counter] * reg_ai[counter];
+
+              // store (only the j > icol entry, i.e. below the diagonal)
+              //AtA(icol, j) = sum;
+              AtA(j, icol) = sum;
+            }
+
+            // Atb accum
+            float sum_atb = 0.f;
+            CMS_UNROLL_LOOP
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              sum_atb += reg_ai[counter] * reg_b[counter];
+
+            // store atb
+            Atb(icol) = sum_atb;
+          }
+
+          // FIXME: shared mem
+          //DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
+          calo::multifit::MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
+
+          calo::multifit::fnnls(AtA,
+                                Atb,
+                                //amplitudes[idx],
+                                resultAmplitudes,
+                                npassive,
+                                pulseOffsets,
+                                matrixLForFnnls,
+                                1e-11,
+                                500,
+                                16,
+                                2);
+
+          calo::multifit::calculateChiSq(matrixL, pulse_matrix[idx], resultAmplitudes, samples[idx], chi2_now);
+
+          auto deltachi2 = chi2_now - chi2;
+          chi2 = chi2_now;
+
+          if (std::abs(deltachi2) < 1e-3)
+            break;
+
+          //---- AM: TEST
+          //---- it was 3 lines above, now here as in the CPU version
+          ++iter;
+        }
+
+        // store to global output values
+        // FIXME: amplitudes are used in global directly
+        chi2s[inputCh] = chi2;
+        energies[inputCh] = resultAmplitudes(5);
+
+        CMS_UNROLL_LOOP
+        for (int counter = 0; counter < NPULSES; counter++)
+          amplitudes[inputCh](counter) = resultAmplitudes(counter);
+      }
+    }
+
+    namespace v1 {
+
+      void minimization_procedure(EventInputDataGPU const& eventInputGPU,  // EB/EE digi ids and sizes
+                                  EventOutputDataGPU& eventOutputGPU,  // amplitudesAll, chi2, amplitude per EB/EE
+                                  EventDataForScratchGPU& scratch,  // noisecov, activeBXs, samples, pulse_matrix, acState
+                                  ConditionsProducts const& conditions,  // pulseCovariances, offsetForHashes
+                                  ConfigurationParameters const& configParameters,  // kernelMinimizeThreads[0]
+                                  cudaStream_t cudaStream) {  // stream kernel_minimize is launched on
+        using DataType = SampleVector::Scalar;
+        unsigned int totalChannels = eventInputGPU.ebDigis.size + eventInputGPU.eeDigis.size;
+        //    unsigned int threads_min = conf.threads.x;
+        // TODO: configure from python
+        unsigned int threads_min = configParameters.kernelMinimizeThreads[0];
+        unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min;
+        uint32_t const offsetForHashes = conditions.offsetForHashes;
+        uint32_t const offsetForInputs = eventInputGPU.ebDigis.size;  // channels >= this value are EE
+        auto const nbytesShared = 2 * threads_min *
+                                  calo::multifit::MapSymM<DataType, SampleVector::RowsAtCompileTime>::total *
+                                  sizeof(DataType);  // two MapSymM-sized buffers per thread
+        kernel_minimize<<<blocks_min, threads_min, nbytesShared, cudaStream>>>(
+            eventInputGPU.ebDigis.ids.get(),
+            eventInputGPU.eeDigis.ids.get(),
+            (SampleMatrix*)scratch.noisecov.get(),
+            conditions.pulseCovariances.values,
+            (BXVectorType*)scratch.activeBXs.get(),
+            (SampleVector*)scratch.samples.get(),
+            (SampleVector*)eventOutputGPU.recHitsEB.amplitudesAll.get(),
+            (SampleVector*)eventOutputGPU.recHitsEE.amplitudesAll.get(),
+            (PulseMatrixType*)scratch.pulse_matrix.get(),
+            eventOutputGPU.recHitsEB.chi2.get(),
+            eventOutputGPU.recHitsEE.chi2.get(),
+            eventOutputGPU.recHitsEB.amplitude.get(),
+            eventOutputGPU.recHitsEE.amplitude.get(),
+            scratch.acState.get(),
+            totalChannels,
+            50,  // max_iterations
+            offsetForHashes,
+            offsetForInputs);
+        cudaCheck(cudaGetLastError());  // check the result of the kernel launch
+      }
+
+    }  // namespace v1
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h
new file mode 100644
index 0000000000000..b8202f75b653b
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h
@@ -0,0 +1,29 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EigenMatrixTypes_gpu.h"
+
+class EcalPulseShape;
+class EcalPulseCovariance;
+class EcalUncalibratedRecHit;
+
+namespace ecal {
+  namespace multifit {
+
+    namespace v1 {
+
+      void minimization_procedure(EventInputDataGPU const& eventInputGPU,  // EB/EE digi ids and sizes
+                                  EventOutputDataGPU& eventOutputGPU,  // receives amplitudes, chi2 and energies
+                                  EventDataForScratchGPU& scratch,  // intermediate per-channel buffers
+                                  ConditionsProducts const& conditions,
+                                  ConfigurationParameters const& configParameters,
+                                  cudaStream_t cudaStream);  // stream the minimization kernel is launched on
+
+    }  // namespace v1
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
index 25fe21603e864..61eed4689fd20 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
@@ -1,18 +1,22 @@
-<use name="CalibCalorimetry/EcalTPGTools"/>
-<use name="RecoLocalCalo/EcalDeadChannelRecoveryAlgos"/>
-<use name="Geometry/CaloGeometry"/>
-<use name="RecoLocalCalo/EcalRecProducers"/>
-<use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
+<use name="cuda"/>
+<use name="CUDADataFormats/EcalRecHitSoA"/>
 <use name="CalibCalorimetry/EcalLaserCorrection"/>
+<use name="CalibCalorimetry/EcalTPGTools"/>
+<use name="CondFormats/DataRecord"/>
+<use name="CondFormats/ESObjects"/>
+<use name="CondFormats/EcalObjects"/>
 <use name="DataFormats/EcalDigi"/>
 <use name="DataFormats/EcalRecHit"/>
-<use name="CondFormats/EcalObjects"/>
-<use name="CondFormats/ESObjects"/>
-<use name="CondFormats/DataRecord"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/ParameterSet"/>
+<use name="Geometry/CaloGeometry"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="RecoLocalCalo/EcalDeadChannelRecoveryAlgos"/>
 <use name="RecoLocalCalo/EcalRecAlgos"/>
-<use name="FWCore/MessageLogger"/>
+<use name="RecoLocalCalo/EcalRecProducers"/>
 <use name="SimCalorimetry/EcalSimAlgos"/>
-<library file="*.cc" name="RecoLocalCaloEcalRecProducersPlugins">
+
+<library file="*.cc *.cu" name="RecoLocalCaloEcalRecProducersPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/Common.h b/RecoLocalCalo/EcalRecProducers/plugins/Common.h
new file mode 100644
index 0000000000000..55f5f613ed356
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/Common.h
@@ -0,0 +1,17 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_Common_h
+#define RecoLocalCalo_EcalRecProducers_plugins_Common_h
+
+// a workaround for std::abs not being a constexpr function
+namespace ecal {
+
+  // temporary
+  namespace mgpa {
+
+    constexpr int adc(uint16_t sample) { return sample & 0xfff; }
+    constexpr int gainId(uint16_t sample) { return (sample >> 12) & 0x3; }
+
+  }  // namespace mgpa
+
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_Common_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h
new file mode 100644
index 0000000000000..cac63b6b30112
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h
@@ -0,0 +1,325 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h
+
+#include <vector>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
+#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h"
+#include "CondFormats/EcalObjects/interface/EcalGainRatios.h"
+#include "CondFormats/EcalObjects/interface/EcalPedestals.h"
+#include "CondFormats/EcalObjects/interface/EcalTimeBiasCorrections.h"
+#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h"
+#include "CondFormats/EcalObjects/interface/EcalWeightSet.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalMultifitParametersGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
+
+#include "EigenMatrixTypes_gpu.h"
+
+struct EcalPulseShape;
+class EcalSampleMask;
+class EcalTimeBiasCorrections;
+struct EcalPulseCovariance;
+class EcalDigiCollection;
+class EcalXtalGroupId;
+class EcalSamplesCorrelation;
+class EBDigiCollection;
+class EEDigiCollection;
+
+namespace ecal {
+  namespace multifit {
+
+    enum class TimeComputationState : char { NotFinished = 0, Finished = 1 };  // element type of the tcState buffer
+    enum class MinimizationState : char {  // underlying type char, i.e. one byte per stored value
+      NotFinished = 0,
+      Finished = 1,
+      Precomputed = 2,
+    };
+
+    // Input of the multifit: const references (non-owning) to the EB and EE digi collections.
+    struct EventInputDataGPU {
+      ecal::DigisCollection<calo::common::DevStoragePolicy> const& ebDigis;
+      ecal::DigisCollection<calo::common::DevStoragePolicy> const& eeDigis;
+    };
+
+    // Configuration of the multifit; all floating-point parameters share the fixed type alias below.
+    // TODO: can we get by with single precision?
+    struct ConfigurationParameters {
+      using type = double;
+      // device ptrs
+      type *amplitudeFitParametersEB = nullptr, *amplitudeFitParametersEE = nullptr;
+      // NOTE: only the pointer members have default initializers; all other members start uninitialized
+      uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE;
+      // device ptrs
+      type *timeFitParametersEB = nullptr, *timeFitParametersEE = nullptr;
+
+      type timeFitLimitsFirstEB, timeFitLimitsFirstEE;
+      type timeFitLimitsSecondEB, timeFitLimitsSecondEE;
+
+      type timeConstantTermEB, timeConstantTermEE;
+
+      type timeNconstEB, timeNconstEE;
+
+      type amplitudeThreshEE, amplitudeThreshEB;
+
+      type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB;
+      type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE;
+      type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE;
+      type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB;
+
+      std::array<uint32_t, 3> kernelMinimizeThreads;
+      // when set, the allocate() methods in this header also create the timing-related buffers
+      bool shouldRunTimingComputation;
+      // element counts from which the allocate() methods in this header size the EB / EE buffers
+      uint32_t maxNumberHitsEB;
+      uint32_t maxNumberHitsEE;
+    };
+
+    struct EventOutputDataGPU {
+      UncalibratedRecHit<::calo::common::DevStoragePolicy> recHitsEB, recHitsEE;
+
+      // Allocates every output buffer of both collections, passing cudaStream to each allocation.
+      // Buffer lengths come from maxNumberHitsEB / maxNumberHitsEE; jitter and jitterError are
+      // only created when shouldRunTimingComputation is set.
+      void allocate(ConfigurationParameters const& configParameters, cudaStream_t cudaStream) {
+        bool const withTiming = configParameters.shouldRunTimingComputation;
+
+        // the allocation sequence is identical for the two collections
+        auto allocateCollection = [cudaStream, withTiming](auto& recHits, uint32_t const nHits) {
+          using Computation = reco::ComputationScalarType;
+          using Storage = reco::StorageScalarType;
+
+          // amplitudesAll gets EcalDataFrame::MAXSAMPLES entries per hit
+          recHits.amplitudesAll =
+              cms::cuda::make_device_unique<Computation[]>(nHits * EcalDataFrame::MAXSAMPLES, cudaStream);
+          recHits.amplitude = cms::cuda::make_device_unique<Storage[]>(nHits, cudaStream);
+          recHits.chi2 = cms::cuda::make_device_unique<Storage[]>(nHits, cudaStream);
+          recHits.pedestal = cms::cuda::make_device_unique<Storage[]>(nHits, cudaStream);
+
+          if (withTiming) {
+            recHits.jitter = cms::cuda::make_device_unique<Storage[]>(nHits, cudaStream);
+            recHits.jitterError = cms::cuda::make_device_unique<Storage[]>(nHits, cudaStream);
+          }
+
+          recHits.did = cms::cuda::make_device_unique<uint32_t[]>(nHits, cudaStream);
+          recHits.flags = cms::cuda::make_device_unique<uint32_t[]>(nHits, cudaStream);
+        };
+
+        // EB first, then EE
+        allocateCollection(recHitsEB, configParameters.maxNumberHitsEB);
+        allocateCollection(recHitsEE, configParameters.maxNumberHitsEE);
+      }
+    };
+
+    template <typename EigenM>
+    constexpr uint32_t getLength() {
+      return EigenM::RowsAtCompileTime * EigenM::ColsAtCompileTime;  // rows * columns, both compile-time constants
+    }
+
+    struct EventDataForScratchGPU {
+      using SVT = SampleVector::Scalar;
+      using SGVT = SampleGainVector::Scalar;
+      using SMT = SampleMatrix::Scalar;
+      using PMT = PulseMatrixType::Scalar;
+      using BXVT = BXVectorType::Scalar;
+      // intermediate buffers; every one of them is created by allocate() below
+      cms::cuda::device::unique_ptr<SVT[]> samples;
+      cms::cuda::device::unique_ptr<SGVT[]> gainsNoise;
+
+      cms::cuda::device::unique_ptr<SMT[]> noisecov;
+      cms::cuda::device::unique_ptr<PMT[]> pulse_matrix;
+      cms::cuda::device::unique_ptr<BXVT[]> activeBXs;
+      cms::cuda::device::unique_ptr<char[]> acState;  // NOTE(review): presumably MinimizationState values - confirm
+
+      cms::cuda::device::unique_ptr<bool[]> hasSwitchToGain6, hasSwitchToGain1, isSaturated;
+      // the buffers from here on are allocated only if shouldRunTimingComputation is set
+      cms::cuda::device::unique_ptr<SVT[]> sample_values, sample_value_errors;
+      cms::cuda::device::unique_ptr<bool[]> useless_sample_values;
+      cms::cuda::device::unique_ptr<SVT[]> chi2sNullHypot;
+      cms::cuda::device::unique_ptr<SVT[]> sum0sNullHypot;
+      cms::cuda::device::unique_ptr<SVT[]> sumAAsNullHypot;
+      cms::cuda::device::unique_ptr<char[]> pedestal_nums;
+      cms::cuda::device::unique_ptr<SVT[]> tMaxAlphaBetas, tMaxErrorAlphaBetas;
+      cms::cuda::device::unique_ptr<SVT[]> accTimeMax, accTimeWgt;
+      cms::cuda::device::unique_ptr<SVT[]> ampMaxAlphaBeta, ampMaxError;
+      cms::cuda::device::unique_ptr<SVT[]> timeMax, timeError;
+      cms::cuda::device::unique_ptr<TimeComputationState[]> tcState;
+      // Allocates the scratch buffers for maxNumberHitsEB + maxNumberHitsEE entries, using cudaStream.
+      void allocate(ConfigurationParameters const& configParameters, cudaStream_t cudaStream) {
+        constexpr auto svlength = getLength<SampleVector>();
+        constexpr auto sgvlength = getLength<SampleGainVector>();
+        constexpr auto smlength = getLength<SampleMatrix>();
+        constexpr auto pmlength = getLength<PulseMatrixType>();
+        constexpr auto bxvlength = getLength<BXVectorType>();
+        auto const size = configParameters.maxNumberHitsEB + configParameters.maxNumberHitsEE;
+        // helper: allocates `size` elements of var's element type and stores the new buffer in var
+        auto alloc = [cudaStream](auto& var, uint32_t size) {
+          using element_type = typename std::remove_reference_t<decltype(var)>::element_type;
+          var = cms::cuda::make_device_unique<element_type[]>(size, cudaStream);
+        };
+
+        alloc(samples, size * svlength);
+        alloc(gainsNoise, size * sgvlength);
+
+        alloc(noisecov, size * smlength);
+        alloc(pulse_matrix, size * pmlength);
+        alloc(activeBXs, size * bxvlength);
+        alloc(acState, size);
+
+        alloc(hasSwitchToGain6, size);
+        alloc(hasSwitchToGain1, size);
+        alloc(isSaturated, size);
+        // timing-only buffers
+        if (configParameters.shouldRunTimingComputation) {
+          alloc(sample_values, size * svlength);
+          alloc(sample_value_errors, size * svlength);
+          alloc(useless_sample_values, size * EcalDataFrame::MAXSAMPLES);
+          alloc(chi2sNullHypot, size);
+          alloc(sum0sNullHypot, size);
+          alloc(sumAAsNullHypot, size);
+          alloc(pedestal_nums, size);
+
+          alloc(tMaxAlphaBetas, size);
+          alloc(tMaxErrorAlphaBetas, size);
+          alloc(accTimeMax, size);
+          alloc(accTimeWgt, size);
+          alloc(ampMaxAlphaBeta, size);
+          alloc(ampMaxError, size);
+          alloc(timeMax, size);
+          alloc(timeError, size);
+          alloc(tcState, size);
+        }
+      }
+    };
+
+    // Non-owning bundle: const references to the conditions products, plus offsetForHashes held by value.
+    struct ConditionsProducts {
+      EcalPedestalsGPU::Product const& pedestals;
+      EcalGainRatiosGPU::Product const& gainRatios;
+      EcalPulseShapesGPU::Product const& pulseShapes;
+      EcalPulseCovariancesGPU::Product const& pulseCovariances;
+      EcalSamplesCorrelationGPU::Product const& samplesCorrelation;
+      EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections;
+      EcalTimeCalibConstantsGPU::Product const& timeCalibConstants;
+      EcalSampleMask const& sampleMask;
+      EcalTimeOffsetConstant const& timeOffsetConstant;
+      uint32_t offsetForHashes;
+      EcalMultifitParametersGPU::Product const& multifitParameters;
+    };
+
+    struct xyz {  // plain triple of ints
+      int x, y, z;
+    };
+    // NOTE(review): nothing else in this header uses conf_data; presumably a launch-configuration helper - confirm
+    struct conf_data {
+      xyz threads;
+      bool runV1;
+      cudaStream_t cuStream;
+    };
+
+  }  // namespace multifit
+}  // namespace ecal
+
+//
+// ECAL Rechit producer
+//
+
+namespace ecal {
+  namespace rechit {
+
+    // parameters that are read in the configuration file for rechit producer
+    struct ConfigurationParameters {
+      // device ptr, followed by (presumably) the number of entries it points to - TODO confirm
+      int* ChannelStatusToBeExcluded = nullptr;
+      uint32_t ChannelStatusToBeExcludedSize;
+
+      bool killDeadChannels;
+
+      bool recoverEBIsolatedChannels;
+      bool recoverEEIsolatedChannels;
+      bool recoverEBVFE;
+      bool recoverEEVFE;
+      bool recoverEBFE;
+      bool recoverEEFE;
+
+      float EBLaserMIN;
+      float EELaserMIN;
+      float EBLaserMAX;
+      float EELaserMAX;
+      // NOTE: unlike ChannelStatusToBeExcluded, the pointers below have no default initializer
+      int* expanded_v_DB_reco_flags;
+      uint32_t* expanded_Sizes_v_DB_reco_flags;
+      uint32_t* expanded_flagbit_v_DB_reco_flags;
+      uint32_t expanded_v_DB_reco_flagsSize;
+
+      uint32_t flagmask;
+      uint32_t maxNumberHitsEB;  // length of the EB output buffers, see EventOutputDataGPU::allocate()
+      uint32_t maxNumberHitsEE;  // length of the EE output buffers, see EventOutputDataGPU::allocate()
+    };
+
+    struct EventOutputDataGPU {
+      RecHit<::calo::common::DevStoragePolicy> recHitsEB, recHitsEE;
+
+      // Allocates the output buffers of both collections, passing cudaStream to each allocation.
+      // configParameters is only read for the buffer lengths (maxNumberHitsEB / maxNumberHitsEE).
+      void allocate(ConfigurationParameters const& configParameters, cudaStream_t cudaStream) {
+        auto allocateCollection = [cudaStream](auto& recHits, uint32_t const nHits) {
+          using Storage = ::ecal::reco::StorageScalarType;
+
+          recHits.energy = cms::cuda::make_device_unique<Storage[]>(nHits, cudaStream);
+          recHits.time = cms::cuda::make_device_unique<Storage[]>(nHits, cudaStream);
+          recHits.chi2 = cms::cuda::make_device_unique<Storage[]>(nHits, cudaStream);
+          recHits.flagBits = cms::cuda::make_device_unique<uint32_t[]>(nHits, cudaStream);
+          recHits.extra = cms::cuda::make_device_unique<uint32_t[]>(nHits, cudaStream);
+          recHits.did = cms::cuda::make_device_unique<uint32_t[]>(nHits, cudaStream);
+        };
+
+        // EB first, then EE
+        allocateCollection(recHitsEB, configParameters.maxNumberHitsEB);
+        allocateCollection(recHitsEE, configParameters.maxNumberHitsEE);
+      }
+    };
+
+    struct EventInputDataGPU {  // non-owning: const references to the EB / EE uncalibrated rechits
+      ecal::UncalibratedRecHit<calo::common::DevStoragePolicy> const& ebUncalibRecHits;
+      ecal::UncalibratedRecHit<calo::common::DevStoragePolicy> const& eeUncalibRecHits;
+    };
+
+    // Non-owning bundle: const references to the conditions products, plus offsetForHashes held by value.
+    struct ConditionsProducts {
+      EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV;
+      EcalIntercalibConstantsGPU::Product const& Intercalib;
+      EcalRechitChannelStatusGPU::Product const& ChannelStatus;
+
+      EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios;
+      EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef;
+      EcalLaserAlphasGPU::Product const& LaserAlphas;
+      EcalLinearCorrectionsGPU::Product const& LinearCorrections;
+
+      uint32_t offsetForHashes;
+    };
+
+  }  // namespace rechit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
new file mode 100644
index 0000000000000..3de6b62898925
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
@@ -0,0 +1,168 @@
+//#define ECAL_RECO_CUDA_DEBUG
+
+#ifdef ECAL_RECO_CUDA_DEBUG
+#include <iostream>
+#endif
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+// algorithm specific
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h"
+
+// Copies the EB / EE rechit buffers of the input device products into host-side collections and puts those in the event.
+class EcalCPURecHitProducer : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit EcalCPURecHitProducer(edm::ParameterSet const& ps);
+  ~EcalCPURecHitProducer() override = default;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+  using InputProduct = cms::cuda::Product<ecal::RecHit<calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<InputProduct> recHitsInEBToken_, recHitsInEEToken_;
+  using OutputProduct = ecal::RecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OutputProduct> recHitsOutEBToken_, recHitsOutEEToken_;
+  OutputProduct recHitsEB_, recHitsEE_;  // host buffers: resized in acquire(), moved into the event in produce()
+  bool containsTimingInformation_;       // read from the configuration; not used by acquire() or produce()
+};
+
+void EcalCPURecHitProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+  // defaults: input tags of the two inputs, instance labels of the two outputs, and the timing flag
+  desc.add<edm::InputTag>("recHitsInLabelEB", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEB"});
+  desc.add<edm::InputTag>("recHitsInLabelEE", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEE"});
+  desc.add<std::string>("recHitsOutLabelEB", "EcalRecHitsEB");
+  desc.add<std::string>("recHitsOutLabelEE", "EcalRecHitsEE");
+  desc.add<bool>("containsTimingInformation", false);
+
+  confDesc.addWithDefaultLabel(desc);
+}
+
+EcalCPURecHitProducer::EcalCPURecHitProducer(const edm::ParameterSet& ps)  // sets up every token from ps
+    : recHitsInEBToken_{consumes<InputProduct>(ps.getParameter<edm::InputTag>("recHitsInLabelEB"))},
+      recHitsInEEToken_{consumes<InputProduct>(ps.getParameter<edm::InputTag>("recHitsInLabelEE"))},
+      recHitsOutEBToken_{produces<OutputProduct>(ps.getParameter<std::string>("recHitsOutLabelEB"))},
+      recHitsOutEEToken_{produces<OutputProduct>(ps.getParameter<std::string>("recHitsOutLabelEE"))},
+      containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")} {}
+
+void EcalCPURecHitProducer::acquire(edm::Event const& event,
+                                    edm::EventSetup const& setup,
+                                    edm::WaitingTaskWithArenaHolder taskHolder) {
+  // retrieve the device products; the context is built from the EB product and takes over the task holder
+  auto const& ebRecHitsProduct = event.get(recHitsInEBToken_);
+  auto const& eeRecHitsProduct = event.get(recHitsInEEToken_);
+  cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)};
+  auto const& ebRecHits = ctx.get(ebRecHitsProduct);
+  auto const& eeRecHits = ctx.get(eeRecHitsProduct);
+
+  // resize the output buffers
+  recHitsEB_.resize(ebRecHits.size);
+  recHitsEE_.resize(eeRecHits.size);
+
+#ifdef ECAL_RECO_CUDA_DEBUG
+  std::cout << " [EcalCPURecHitProducer::acquire] ebRecHits.size = " << ebRecHits.size << std::endl;
+  std::cout << " [EcalCPURecHitProducer::acquire] eeRecHits.size = " << eeRecHits.size << std::endl;
+#endif
+
+  // enqueue the device-to-host transfers on the stream of the context, one per field and collection
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.did.data(),
+                            ebRecHits.did.get(),
+                            recHitsEB_.did.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.did.data(),
+                            eeRecHits.did.get(),
+                            recHitsEE_.did.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+
+  // energy and chi2: StorageScalarType, see CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.energy.data(),
+                            ebRecHits.energy.get(),
+                            recHitsEB_.energy.size() * sizeof(::ecal::reco::StorageScalarType),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.energy.data(),
+                            eeRecHits.energy.get(),
+                            recHitsEE_.energy.size() * sizeof(::ecal::reco::StorageScalarType),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.chi2.data(),
+                            ebRecHits.chi2.get(),
+                            recHitsEB_.chi2.size() * sizeof(::ecal::reco::StorageScalarType),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.chi2.data(),
+                            eeRecHits.chi2.get(),
+                            recHitsEE_.chi2.size() * sizeof(::ecal::reco::StorageScalarType),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.extra.data(),
+                            ebRecHits.extra.get(),
+                            recHitsEB_.extra.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.extra.data(),
+                            eeRecHits.extra.get(),
+                            recHitsEE_.extra.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.flagBits.data(),
+                            ebRecHits.flagBits.get(),
+                            recHitsEB_.flagBits.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.flagBits.data(),
+                            eeRecHits.flagBits.get(),
+                            recHitsEE_.flagBits.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  // NOTE(review): the device collections also have a `time` buffer, which is not transferred here - confirm intended
+#ifdef ECAL_RECO_CUDA_DEBUG
+  // cudaMemcpyAsync may return before the data has arrived: wait for the stream before reading the host buffers
+  cudaCheck(cudaStreamSynchronize(ctx.stream()));
+  for (unsigned int ieb = 0; ieb < ebRecHits.size; ieb++) {
+    if (recHitsEB_.extra[ieb] != 0)
+      std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb]
+                << " ] eb extra = " << recHitsEB_.extra[ieb] << std::endl;
+  }
+
+  for (unsigned int ieb = 0; ieb < ebRecHits.size; ieb++) {
+    if (recHitsEB_.energy[ieb] != 0)
+      std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb]
+                << " ] eb energy = " << recHitsEB_.energy[ieb] << std::endl;
+  }
+
+  for (unsigned int iee = 0; iee < eeRecHits.size; iee++) {
+    if (recHitsEE_.energy[iee] != 0)
+      std::cout << " [ " << iee << " :: " << eeRecHits.size << " ] [ " << recHitsEE_.did[iee]
+                << " ] ee energy = " << recHitsEE_.energy[iee] << std::endl;
+  }
+#endif
+}
+
+void EcalCPURecHitProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // move the host buffers (resized and targeted by the copies enqueued in acquire()) into the event
+  event.emplace(recHitsOutEBToken_, std::move(recHitsEB_));
+  event.emplace(recHitsOutEEToken_, std::move(recHitsEE_));
+}
+
+DEFINE_FWK_MODULE(EcalCPURecHitProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
new file mode 100644
index 0000000000000..801d378c7c391
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
@@ -0,0 +1,120 @@
+#include <iostream>
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+// algorithm specific
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h"
+
+// Copies the EB / EE uncalibrated rechit buffers of the input device products into host-side collections.
+class EcalCPUUncalibRecHitProducer : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit EcalCPUUncalibRecHitProducer(edm::ParameterSet const& ps);
+  ~EcalCPUUncalibRecHitProducer() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+  using InputProduct = cms::cuda::Product<ecal::UncalibratedRecHit<calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<InputProduct> recHitsInEBToken_, recHitsInEEToken_;
+  using OutputProduct = ecal::UncalibratedRecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OutputProduct> recHitsOutEBToken_, recHitsOutEEToken_;
+  OutputProduct recHitsEB_, recHitsEE_;  // host buffers: resized in acquire(), moved into the event in produce()
+  bool containsTimingInformation_;       // when true, acquire() also copies jitter and jitterError
+};
+
+void EcalCPUUncalibRecHitProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+  // defaults: input tags of the two inputs, instance labels of the two outputs, and the timing flag
+  desc.add<edm::InputTag>("recHitsInLabelEB", edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"});
+  desc.add<edm::InputTag>("recHitsInLabelEE", edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"});
+  desc.add<std::string>("recHitsOutLabelEB", "EcalUncalibRecHitsEB");
+  desc.add<std::string>("recHitsOutLabelEE", "EcalUncalibRecHitsEE");
+  desc.add<bool>("containsTimingInformation", false);
+
+  confDesc.add("ecalCPUUncalibRecHitProducer", desc);
+}
+
+EcalCPUUncalibRecHitProducer::EcalCPUUncalibRecHitProducer(const edm::ParameterSet& ps)  // sets up every token from ps
+    : recHitsInEBToken_{consumes<InputProduct>(ps.getParameter<edm::InputTag>("recHitsInLabelEB"))},
+      recHitsInEEToken_{consumes<InputProduct>(ps.getParameter<edm::InputTag>("recHitsInLabelEE"))},
+      recHitsOutEBToken_{produces<OutputProduct>(ps.getParameter<std::string>("recHitsOutLabelEB"))},
+      recHitsOutEEToken_{produces<OutputProduct>(ps.getParameter<std::string>("recHitsOutLabelEE"))},
+      containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")} {}
+
+EcalCPUUncalibRecHitProducer::~EcalCPUUncalibRecHitProducer() = default;  // no explicit cleanup needed
+
+void EcalCPUUncalibRecHitProducer::acquire(edm::Event const& event,
+                                           edm::EventSetup const& setup,
+                                           edm::WaitingTaskWithArenaHolder taskHolder) {
+  // retrieve the device products; the context is built from the EB product and takes over the task holder
+  auto const& ebRecHitsProduct = event.get(recHitsInEBToken_);
+  auto const& eeRecHitsProduct = event.get(recHitsInEEToken_);
+  cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)};
+  auto const& ebRecHits = ctx.get(ebRecHitsProduct);
+  auto const& eeRecHits = ctx.get(eeRecHitsProduct);
+
+  // resize the output buffers
+  recHitsEB_.resize(ebRecHits.size);
+  recHitsEE_.resize(eeRecHits.size);
+  // enqueues an asynchronous device-to-host copy of dest.size() elements from src into dest, on the stream of ctx
+  auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
+    using vector_type = typename std::remove_reference<decltype(dest)>::type;
+    using type = typename vector_type::value_type;
+    using src_type = typename std::remove_pointer<decltype(src)>::type;
+    static_assert(std::is_same<src_type, type>::value, "dst and src data types do not match");
+    cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream()));
+  };
+
+  // enqueue transfers
+  lambdaToTransfer(recHitsEB_.did, ebRecHits.did.get());
+  lambdaToTransfer(recHitsEE_.did, eeRecHits.did.get());
+
+  lambdaToTransfer(recHitsEB_.amplitudesAll, ebRecHits.amplitudesAll.get());
+  lambdaToTransfer(recHitsEE_.amplitudesAll, eeRecHits.amplitudesAll.get());
+
+  lambdaToTransfer(recHitsEB_.amplitude, ebRecHits.amplitude.get());
+  lambdaToTransfer(recHitsEE_.amplitude, eeRecHits.amplitude.get());
+
+  lambdaToTransfer(recHitsEB_.chi2, ebRecHits.chi2.get());
+  lambdaToTransfer(recHitsEE_.chi2, eeRecHits.chi2.get());
+
+  lambdaToTransfer(recHitsEB_.pedestal, ebRecHits.pedestal.get());
+  lambdaToTransfer(recHitsEE_.pedestal, eeRecHits.pedestal.get());
+
+  lambdaToTransfer(recHitsEB_.flags, ebRecHits.flags.get());
+  lambdaToTransfer(recHitsEE_.flags, eeRecHits.flags.get());
+  // NOTE(review): presumably this flag must agree with whether the upstream module allocated the jitter buffers
+  if (containsTimingInformation_) {
+    lambdaToTransfer(recHitsEB_.jitter, ebRecHits.jitter.get());
+    lambdaToTransfer(recHitsEE_.jitter, eeRecHits.jitter.get());
+
+    lambdaToTransfer(recHitsEB_.jitterError, ebRecHits.jitterError.get());
+    lambdaToTransfer(recHitsEE_.jitterError, eeRecHits.jitterError.get());
+  }
+}
+
+void EcalCPUUncalibRecHitProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // Hand the host-side collections over to the event, EB first and then EE.
+  // Each product is move-constructed from the member buffer; acquire() resizes the
+  // moved-from members again before they are reused.
+  event.put(recHitsOutEBToken_, std::make_unique<OutputProduct>(std::move(recHitsEB_)));
+
+  // same for the EE collection
+  event.put(recHitsOutEEToken_, std::make_unique<OutputProduct>(std::move(recHitsEE_)));
+}
+
+DEFINE_FWK_MODULE(EcalCPUUncalibRecHitProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
new file mode 100644
index 0000000000000..3118d54c6a7e9
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
@@ -0,0 +1,88 @@
+#include "CondFormats/DataRecord/interface/EcalADCToGeVConstantRcd.h"
+#include "CondFormats/DataRecord/interface/EcalChannelStatusRcd.h"
+#include "CondFormats/DataRecord/interface/EcalGainRatiosRcd.h"
+#include "CondFormats/DataRecord/interface/EcalIntercalibConstantsRcd.h"
+#include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRcd.h"
+#include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRefRcd.h"
+#include "CondFormats/DataRecord/interface/EcalLaserAlphasRcd.h"
+#include "CondFormats/DataRecord/interface/EcalLinearCorrectionsRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPedestalsRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPulseCovariancesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPulseShapesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalSamplesCorrelationRcd.h"
+#include "CondFormats/DataRecord/interface/EcalTimeBiasCorrectionsRcd.h"
+#include "CondFormats/DataRecord/interface/EcalTimeCalibConstantsRcd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
+
+// Event-setup producers for the ECAL conditions used on GPU.
+// Every alias instantiates ConvertingESProducerT with three arguments: a record type,
+// a *GPU product type and the matching CPU-side condition type.
+// NOTE(review): presumably the template builds the GPU product from the CPU condition
+// found in that record -- confirm in HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h.
+
+// --- conditions named after the uncalibrated-rechit (multifit) inputs ---
+using EcalPedestalsGPUESProducer = ConvertingESProducerT<EcalPedestalsRcd, EcalPedestalsGPU, EcalPedestals>;
+
+using EcalGainRatiosGPUESProducer = ConvertingESProducerT<EcalGainRatiosRcd, EcalGainRatiosGPU, EcalGainRatios>;
+
+using EcalPulseShapesGPUESProducer = ConvertingESProducerT<EcalPulseShapesRcd, EcalPulseShapesGPU, EcalPulseShapes>;
+
+using EcalPulseCovariancesGPUESProducer =
+    ConvertingESProducerT<EcalPulseCovariancesRcd, EcalPulseCovariancesGPU, EcalPulseCovariances>;
+
+using EcalSamplesCorrelationGPUESProducer =
+    ConvertingESProducerT<EcalSamplesCorrelationRcd, EcalSamplesCorrelationGPU, EcalSamplesCorrelation>;
+
+using EcalTimeBiasCorrectionsGPUESProducer =
+    ConvertingESProducerT<EcalTimeBiasCorrectionsRcd, EcalTimeBiasCorrectionsGPU, EcalTimeBiasCorrections>;
+
+using EcalTimeCalibConstantsGPUESProducer =
+    ConvertingESProducerT<EcalTimeCalibConstantsRcd, EcalTimeCalibConstantsGPU, EcalTimeCalibConstants>;
+
+// --- conditions named after the calibrated-rechit inputs ---
+// Note that the ADCToGeV and ChannelStatus products use the "Rechit"-prefixed GPU types.
+using EcalRechitADCToGeVConstantGPUESProducer =
+    ConvertingESProducerT<EcalADCToGeVConstantRcd, EcalRechitADCToGeVConstantGPU, EcalADCToGeVConstant>;
+
+using EcalIntercalibConstantsGPUESProducer =
+    ConvertingESProducerT<EcalIntercalibConstantsRcd, EcalIntercalibConstantsGPU, EcalIntercalibConstants>;
+
+using EcalRechitChannelStatusGPUESProducer =
+    ConvertingESProducerT<EcalChannelStatusRcd, EcalRechitChannelStatusGPU, EcalChannelStatus>;
+
+using EcalLaserAPDPNRatiosGPUESProducer =
+    ConvertingESProducerT<EcalLaserAPDPNRatiosRcd, EcalLaserAPDPNRatiosGPU, EcalLaserAPDPNRatios>;
+
+using EcalLaserAPDPNRatiosRefGPUESProducer =
+    ConvertingESProducerT<EcalLaserAPDPNRatiosRefRcd, EcalLaserAPDPNRatiosRefGPU, EcalLaserAPDPNRatiosRef>;
+
+using EcalLaserAlphasGPUESProducer = ConvertingESProducerT<EcalLaserAlphasRcd, EcalLaserAlphasGPU, EcalLaserAlphas>;
+
+using EcalLinearCorrectionsGPUESProducer =
+    ConvertingESProducerT<EcalLinearCorrectionsRcd, EcalLinearCorrectionsGPU, EcalLinearCorrections>;
+
+//
+// The module definitions below also create the .py config files, as described in
+// HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h
+//
+
+// one definition per alias declared above, in the same order
+DEFINE_FWK_EVENTSETUP_MODULE(EcalPedestalsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalGainRatiosGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalPulseShapesGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalPulseCovariancesGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalSamplesCorrelationGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalTimeBiasCorrectionsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalTimeCalibConstantsGPUESProducer);
+
+DEFINE_FWK_EVENTSETUP_MODULE(EcalRechitADCToGeVConstantGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalIntercalibConstantsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalRechitChannelStatusGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosRefGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAlphasGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalLinearCorrectionsGPUESProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalMultifitParametersGPUESProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalMultifitParametersGPUESProducer.cc
new file mode 100644
index 0000000000000..1743df5aa945d
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalMultifitParametersGPUESProducer.cc
@@ -0,0 +1,78 @@
+#include <array>
+#include <tuple>
+#include <utility>
+
+#include "FWCore/Framework/interface/ESProducer.h"
+#include "FWCore/Framework/interface/ESProductHost.h"
+#include "FWCore/Framework/interface/ESTransientHandle.h"
+#include "FWCore/Framework/interface/EventSetupRecordIntervalFinder.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/ModuleFactory.h"
+#include "FWCore/Framework/interface/SourceFactory.h"
+#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalMultifitParametersGPU.h"
+
+// ESProducer that builds an EcalMultifitParametersGPU from this module's own ParameterSet.
+// The class derives from both edm::ESProducer and edm::EventSetupRecordIntervalFinder, so it
+// supplies the product as well as the validity interval of the record it is produced for.
+class EcalMultifitParametersGPUESProducer : public edm::ESProducer, public edm::EventSetupRecordIntervalFinder {
+public:
+  // explicit: a ParameterSet must never convert silently into a producer object
+  explicit EcalMultifitParametersGPUESProducer(edm::ParameterSet const&);
+  ~EcalMultifitParametersGPUESProducer() override = default;
+
+  // Declares the configuration parameters of this module together with their defaults.
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+  // Creates the product from the stored configuration; the record argument is not read.
+  std::unique_ptr<EcalMultifitParametersGPU> produce(JobConfigurationGPURecord const&);
+
+protected:
+  // EventSetupRecordIntervalFinder interface: fills in the interval of validity.
+  void setIntervalFor(const edm::eventsetup::EventSetupRecordKey&,
+                      const edm::IOVSyncValue&,
+                      edm::ValidityInterval&) override;
+
+private:
+  // copy of the module configuration, kept so that produce() can build the product later
+  edm::ParameterSet const pset_;
+};
+
+// Stores a copy of the configuration, then calls setWhatProduced(this) and
+// findingRecord<JobConfigurationGPURecord>().
+// NOTE(review): presumably these two calls register produce() and this object's
+// interval finder with the framework -- confirm against the ESProducer documentation.
+EcalMultifitParametersGPUESProducer::EcalMultifitParametersGPUESProducer(edm::ParameterSet const& pset) : pset_{pset} {
+  setWhatProduced(this);
+  findingRecord<JobConfigurationGPURecord>();
+}
+
+// Reports an interval of validity running from IOVSyncValue::beginOfTime() to
+// IOVSyncValue::endOfTime(); neither the record key nor the requested time is inspected.
+void EcalMultifitParametersGPUESProducer::setIntervalFor(const edm::eventsetup::EventSetupRecordKey& /* key */,
+                                                         const edm::IOVSyncValue& /* time */,
+                                                         edm::ValidityInterval& oInterval) {
+  auto const& firstValid = edm::IOVSyncValue::beginOfTime();
+  auto const& lastValid = edm::IOVSyncValue::endOfTime();
+  oInterval = edm::ValidityInterval(firstValid, lastValid);
+}
+
+// Declares the five configuration parameters of this module and their default values,
+// and adds the description under the module's default label.
+void EcalMultifitParametersGPUESProducer::fillDescriptions(edm::ConfigurationDescriptions& desc) {
+  // default values, spelled out first so that the declarations below stay short
+  std::vector<int> const pulseOffsets{-3, -2, -1, 0, 1, 2, 3, 4};
+  std::vector<double> const timeFitEB{-2.015452e+00,
+                                      3.130702e+00,
+                                      -1.234730e+01,
+                                      4.188921e+01,
+                                      -8.283944e+01,
+                                      9.101147e+01,
+                                      -5.035761e+01,
+                                      1.105621e+01};
+  std::vector<double> const timeFitEE{-2.390548e+00,
+                                      3.553628e+00,
+                                      -1.762341e+01,
+                                      6.767538e+01,
+                                      -1.332130e+02,
+                                      1.407432e+02,
+                                      -7.541106e+01,
+                                      1.620277e+01};
+  std::vector<double> const amplitudeFitEB{1.138, 1.652};
+  std::vector<double> const amplitudeFitEE{1.890, 1.400};
+
+  edm::ParameterSetDescription psd;
+  psd.add<std::vector<int>>("pulseOffsets", pulseOffsets);
+  psd.add<std::vector<double>>("EBtimeFitParameters", timeFitEB);
+  psd.add<std::vector<double>>("EEtimeFitParameters", timeFitEE);
+  psd.add<std::vector<double>>("EBamplitudeFitParameters", amplitudeFitEB);
+  psd.add<std::vector<double>>("EEamplitudeFitParameters", amplitudeFitEE);
+  desc.addWithDefaultLabel(psd);
+}
+
+// Builds the product from the ParameterSet stored at construction; the record is not read.
+std::unique_ptr<EcalMultifitParametersGPU> EcalMultifitParametersGPUESProducer::produce(
+    JobConfigurationGPURecord const& /* record */) {
+  auto parameters = std::make_unique<EcalMultifitParametersGPU>(pset_);
+  return parameters;
+}
+
+// Defined with the _SOURCE flavour of the macro; the class is an
+// EventSetupRecordIntervalFinder in addition to an ESProducer.
+// NOTE(review): presumably that is why _SOURCE rather than _MODULE is used -- confirm.
+DEFINE_FWK_EVENTSETUP_SOURCE(EcalMultifitParametersGPUESProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu
new file mode 100644
index 0000000000000..6e1b2a66c2507
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu
@@ -0,0 +1,676 @@
+#include <cuda.h>
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h"
+
+#include "EcalRecHitBuilderKernels.h"
+#include "KernelHelpers.h"
+
+namespace ecal {
+  namespace rechit {
+
+    // Flags of the uncalibrated rechits. Apart from kGood, each value is used below
+    // as a bit position in the input flags word.
+    enum UncalibRecHitFlags {
+      // channel is good (mutually exclusive with other states)  setFlagBit(kGood) reset flags_ to zero
+      kGood = -1,
+      // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.)
+      kPoorReco = 0,
+      // saturated channel
+      kSaturated = 1,
+      // channel out of time
+      kOutOfTime = 2,
+      // saturated channel: energy estimated from the leading edge before saturation
+      kLeadingEdgeRecovered = 3,
+      // at least one data frame is in G6
+      kHasSwitchToGain6 = 4,
+      // at least one data frame is in G1
+      kHasSwitchToGain1 = 5
+    };
+
+    // Flags of the calibrated rechits; each value is used below as a bit position in flagBits.
+    enum RecHitFlags {
+      // channel ok, the energy and time measurement are reliable
+      RecHitFlags_kGood = 0,
+      // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2)
+      RecHitFlags_kPoorReco = 1,
+      // the energy is available from the UncalibRecHit (sync reco), but the event is out of time
+      RecHitFlags_kOutOfTime = 2,
+      // the energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. noisy)
+      RecHitFlags_kFaultyHardware = 3,
+      // the channel is very noisy
+      RecHitFlags_kNoisy = 4,
+      // the energy is available from the UncalibRecHit, but the calibration of the channel is poor
+      RecHitFlags_kPoorCalib = 5,
+      // saturated channel (recovery not tried)
+      RecHitFlags_kSaturated = 6,
+      // saturated channel: energy estimated from the leading edge before saturation
+      RecHitFlags_kLeadingEdgeRecovered = 7,
+      // saturated/isolated dead: energy estimated from neighbours
+      RecHitFlags_kNeighboursRecovered = 8,
+      // channel in TT with no data link, info retrieved from Trigger Primitive
+      RecHitFlags_kTowerRecovered = 9,
+      // channel is dead and any recovery fails
+      RecHitFlags_kDead = 10,
+      // MC only flag: the channel is killed in the real detector
+      RecHitFlags_kKilled = 11,
+      // the channel is in a region with saturated TP
+      RecHitFlags_kTPSaturated = 12,
+      // the channel is in a region with TP with sFGVB = 0
+      RecHitFlags_kL1SpikeFlag = 13,
+      // the signal is believed to originate from an anomalous deposit (spike)
+      RecHitFlags_kWeird = 14,
+      // the signal is anomalous, and neighbors another anomalous signal
+      RecHitFlags_kDiWeird = 15,
+      // at least one data frame is in G6
+      RecHitFlags_kHasSwitchToGain6 = 16,
+      // at least one data frame is in G1
+      RecHitFlags_kHasSwitchToGain1 = 17,
+      // to ease the interface with functions returning flags.
+      RecHitFlags_kUnknown = 18
+    };
+
+    // Channel status codes, compared below against the masked entries of the "status" array.
+    enum EcalChannelStatusCode_Code {
+      kOk = 0,
+      kDAC = 1,
+      kNoLaser = 2,
+      kNoisy = 3,
+      kNNoisy = 4,
+      kNNNoisy = 5,
+      kNNNNoisy = 6,
+      kNNNNNoisy = 7,
+      kFixedG6 = 8,
+      kFixedG1 = 9,
+      kFixedG0 = 10,
+      kNonRespondingIsolated = 11,
+      kDeadVFE = 12,
+      kDeadFE = 13,
+      kNoDataNoTP = 14
+    };
+
+    /**
+     * Builds calibrated ECAL rechits from uncalibrated rechits.
+     *
+     * Every thread walks the channel range [0, nchannels) with a stride of
+     * blockDim.x * gridDim.x. Channels below nChannelsBarrel read the *_eb inputs and
+     * write the *EB outputs at index ch; the remaining ones use the *_ee / *EE arrays
+     * at index ch - nChannelsBarrel.
+     *
+     * For each channel the kernel
+     *   - copies the detector id,
+     *   - stores the chi2 truncated to 64,
+     *   - derives the laser (transparency) correction by linear interpolation in time,
+     *   - translates the DB channel status into a rechit flag bit,
+     *   - writes energy = amplitude * adc2gev * intercalib * lasercalib,
+     *   - packs the chi2 into the low 7 bits of "extra",
+     *   - propagates the uncalibrated-rechit flags into flagBits.
+     *
+     * Channels whose status is listed in ChannelStatusToBeExcluded, or whose flag bit is
+     * selected by flagmask while killDeadChannels is set, keep energy = -1.
+     *
+     * Not used by the current implementation: time_eb, time_ee, timeEB, timeEE (the time
+     * is not propagated yet), recoverEBVFE and recoverEEVFE.
+     *
+     * NOTE(review): the declaration visible in EcalRecHitBuilderKernels.h does not list the
+     * EBLaserMIN / EELaserMIN / EBLaserMAX / EELaserMAX parameters -- confirm that the two
+     * signatures agree.
+     */
+    __global__ void kernel_create_ecal_rehit(
+        // configuration
+        int const* ChannelStatusToBeExcluded,
+        uint32_t ChannelStatusToBeExcludedSize,
+        bool const killDeadChannels,
+        bool const recoverEBIsolatedChannels,
+        bool const recoverEEIsolatedChannels,
+        bool const recoverEBVFE,
+        bool const recoverEEVFE,
+        bool const recoverEBFE,
+        bool const recoverEEFE,
+        float const EBLaserMIN,
+        float const EELaserMIN,
+        float const EBLaserMAX,
+        float const EELaserMAX,
+        // for flags setting
+        int const* expanded_v_DB_reco_flags,  // FIXME AM: to be checked
+        uint32_t const* expanded_Sizes_v_DB_reco_flags,
+        uint32_t const* expanded_flagbit_v_DB_reco_flags,
+        uint32_t expanded_v_DB_reco_flagsSize,
+        uint32_t flagmask,
+        // conditions
+        float const* adc2gev,
+        float const* intercalib,
+        uint16_t const* status,
+        float const* apdpnrefs,
+        float const* alphas,
+        // input for transparency corrections
+        float const* p1,
+        float const* p2,
+        float const* p3,
+        edm::TimeValue_t const* t1,
+        edm::TimeValue_t const* t2,
+        edm::TimeValue_t const* t3,
+        // input for linear corrections
+        float const* lp1,
+        float const* lp2,
+        float const* lp3,
+        edm::TimeValue_t const* lt1,
+        edm::TimeValue_t const* lt2,
+        edm::TimeValue_t const* lt3,
+        // time, used for time dependent corrections
+        edm::TimeValue_t const event_time,
+        // input
+        uint32_t const* did_eb,
+        uint32_t const* did_ee,
+        ::ecal::reco::StorageScalarType const* amplitude_eb,  // in adc counts
+        ::ecal::reco::StorageScalarType const* amplitude_ee,  // in adc counts
+        ::ecal::reco::StorageScalarType const* time_eb,
+        ::ecal::reco::StorageScalarType const* time_ee,
+        ::ecal::reco::StorageScalarType const* chi2_eb,
+        ::ecal::reco::StorageScalarType const* chi2_ee,
+        uint32_t const* flags_eb,
+        uint32_t const* flags_ee,
+        // output
+        uint32_t* didEB,
+        uint32_t* didEE,
+        ::ecal::reco::StorageScalarType* energyEB,  // in energy [GeV]
+        ::ecal::reco::StorageScalarType* energyEE,  // in energy [GeV]
+        ::ecal::reco::StorageScalarType* timeEB,
+        ::ecal::reco::StorageScalarType* timeEE,
+        ::ecal::reco::StorageScalarType* chi2EB,
+        ::ecal::reco::StorageScalarType* chi2EE,
+        uint32_t* flagBitsEB,
+        uint32_t* flagBitsEE,
+        uint32_t* extraEB,
+        uint32_t* extraEE,
+        // other
+        int const nchannels,
+        uint32_t const nChannelsBarrel,
+        uint32_t const offsetForHashes) {
+      //
+      //    NB: energy   "type_wrapper<reco::StorageScalarType, L>::type" most likely std::vector<float>
+      //
+
+      for (int ch = threadIdx.x + blockDim.x * blockIdx.x; ch < nchannels; ch += blockDim.x * gridDim.x) {
+        // first EB and then EE: the global channel index selects the partition
+        bool const isEndcap = (ch >= nChannelsBarrel);
+
+        // index inside the selected (EB or EE) arrays
+        int const inputCh = isEndcap ? ch - nChannelsBarrel : ch;
+
+        uint32_t const* didCh = isEndcap ? did_ee : did_eb;
+
+        // arrange to access the right output ptrs
+#define ARRANGE(var) auto* var = isEndcap ? var##EE : var##EB
+        ARRANGE(did);
+        ARRANGE(energy);
+        ARRANGE(chi2);
+        ARRANGE(flagBits);
+        ARRANGE(extra);
+#undef ARRANGE
+
+        // only two values, EB or EE
+        // AM : FIXME : why not using "isBarrel" ?    isBarrel ? adc2gev[0] : adc2gev[1]
+        float const adc2gev_to_use = isEndcap ? adc2gev[1]   // ee
+                                              : adc2gev[0];  // eb
+
+        // matching input ptrs
+        ::ecal::reco::StorageScalarType const* amplitude = isEndcap ? amplitude_ee : amplitude_eb;
+        ::ecal::reco::StorageScalarType const* chi2_in = isEndcap ? chi2_ee : chi2_eb;
+        uint32_t const* flags_in = isEndcap ? flags_ee : flags_eb;
+
+        // simple copy
+        did[inputCh] = didCh[inputCh];
+
+        auto const did_to_use = DetId{didCh[inputCh]};
+
+        // the conditions are indexed by hashed id, EE entries following EB ones by offsetForHashes
+        auto const isBarrel = did_to_use.subdetId() == EcalBarrel;
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId());
+
+        float const intercalib_to_use = intercalib[hashedId];
+
+        // get laser coefficient
+        float lasercalib = 1.;
+
+        //
+        // AM: ideas
+        //
+        //    One possibility is to create the map of laser corrections once on CPU
+        //    for all crystals and push them on GPU.
+        //    Then only if the LS is different, update the laser correction
+        //    The variation within a LS is not worth pursuing (<< 0.1% !!)
+        //    and below the precision we can claim on the laser corrections (right?).
+        //    This will save quite some time (also for the CPU version?)
+        //
+
+        // laser monitoring region; the time stamps below are indexed by iLM - 1
+        int iLM = 1;
+
+        if (isBarrel) {
+          iLM = ecal::reconstruction::laser_monitoring_region_EB(did_to_use.rawId());
+        } else {
+          iLM = ecal::reconstruction::laser_monitoring_region_EE(did_to_use.rawId());
+        }
+
+        long long t_i = 0, t_f = 0;
+        float p_i = 0, p_f = 0;
+        long long lt_i = 0, lt_f = 0;
+        float lp_i = 0, lp_f = 0;
+
+        // laser: pick the (t1,t2) or the (t2,t3) interval; an event time outside
+        // [t1,t3] uses the nearest interval
+        if (event_time >= t1[iLM - 1] && event_time < t2[iLM - 1]) {
+          t_i = t1[iLM - 1];
+          t_f = t2[iLM - 1];
+          p_i = p1[hashedId];
+          p_f = p2[hashedId];
+        } else if (event_time >= t2[iLM - 1] && event_time <= t3[iLM - 1]) {
+          t_i = t2[iLM - 1];
+          t_f = t3[iLM - 1];
+          p_i = p2[hashedId];
+          p_f = p3[hashedId];
+        } else if (event_time < t1[iLM - 1]) {
+          t_i = t1[iLM - 1];
+          t_f = t2[iLM - 1];
+          p_i = p1[hashedId];
+          p_f = p2[hashedId];
+
+        } else if (event_time > t3[iLM - 1]) {
+          t_i = t2[iLM - 1];
+          t_f = t3[iLM - 1];
+          p_i = p2[hashedId];
+          p_f = p3[hashedId];
+        }
+
+        // linear corrections: same interval selection on the lt/lp arrays
+        if (event_time >= lt1[iLM - 1] && event_time < lt2[iLM - 1]) {
+          lt_i = lt1[iLM - 1];
+          lt_f = lt2[iLM - 1];
+          lp_i = lp1[hashedId];
+          lp_f = lp2[hashedId];
+        } else if (event_time >= lt2[iLM - 1] && event_time <= lt3[iLM - 1]) {
+          lt_i = lt2[iLM - 1];
+          lt_f = lt3[iLM - 1];
+          lp_i = lp2[hashedId];
+          lp_f = lp3[hashedId];
+        } else if (event_time < lt1[iLM - 1]) {
+          lt_i = lt1[iLM - 1];
+          lt_f = lt2[iLM - 1];
+          lp_i = lp1[hashedId];
+          lp_f = lp2[hashedId];
+
+        } else if (event_time > lt3[iLM - 1]) {
+          lt_i = lt2[iLM - 1];
+          lt_f = lt3[iLM - 1];
+          lp_i = lp2[hashedId];
+          lp_f = lp3[hashedId];
+        }
+
+        // apdpnref and alpha
+        float const apdpnref = apdpnrefs[hashedId];
+        float const alpha = alphas[hashedId];
+
+        // now calculate transparency correction; skipped (lasercalib stays 1) when the
+        // reference is zero or one of the intervals is empty
+        if (apdpnref != 0 && (t_i - t_f) != 0 && (lt_i - lt_f) != 0) {
+          long long tt = event_time;  // never subtract two unsigned!
+          float interpolatedLaserResponse =
+              p_i / apdpnref + float(tt - t_i) * (p_f - p_i) / (apdpnref * float(t_f - t_i));
+
+          float interpolatedLinearResponse =
+              lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i));  // FIXED BY FC
+
+          if (interpolatedLinearResponse > 2.f || interpolatedLinearResponse < 0.1f) {
+            interpolatedLinearResponse = 1.f;
+          }
+          if (interpolatedLaserResponse <= 0.) {
+            // AM :  how the heck is it possible?
+            //             interpolatedLaserResponse = 0.0001;
+            lasercalib = 1.;
+
+          } else {
+            float interpolatedTransparencyResponse = interpolatedLaserResponse / interpolatedLinearResponse;
+
+            // ... and now this:
+            lasercalib = 1.f / (std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse);
+          }
+        }
+
+        //
+        // Check for channels to be excluded from reconstruction
+        //
+        //
+        // Default energy? Not to be updated if "ChannelStatusToBeExcluded"
+        // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat"
+        //
+        energy[inputCh] = -1;  //---- AM: default, un-physical, ok
+
+        // truncate the chi2
+        if (chi2_in[inputCh] > 64)
+          chi2[inputCh] = 64;
+        else
+          chi2[inputCh] = chi2_in[inputCh];
+
+        // default values for the flags
+        flagBits[inputCh] = 0;
+        extra[inputCh] = 0;
+
+        constexpr int chStatusMask = 0x1f;
+        // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same.
+        // The masked value (0..31) is kept as a plain int: converting it to
+        // EcalChannelStatusCode_Code would leave the range of that enumeration for values above 15.
+        int const dbstatus = status[hashedId] & chStatusMask;
+
+        bool skip_this_channel = false;
+        for (uint32_t ich_to_check = 0; ich_to_check < ChannelStatusToBeExcludedSize; ich_to_check++) {
+          if (ChannelStatusToBeExcluded[ich_to_check] == dbstatus) {
+            skip_this_channel = true;
+            break;
+          }
+        }
+        if (skip_this_channel) {
+          // skip this channel, leaving the defaults written above
+          continue;
+        }
+
+        // Take our association map of dbstatuses-> recHit flagbits and return the appropriate flagbit word
+
+        //
+        // AM: get the smaller "flagbit_counter" with match
+        //
+        // expanded_v_DB_reco_flags is a flat list of statuses made of consecutive groups:
+        // group i holds expanded_Sizes_v_DB_reco_flags[i] entries and maps to the bit
+        // expanded_flagbit_v_DB_reco_flags[i]. A single pass keeps the lowest bit among the
+        // groups that contain dbstatus. If no group contains it, the word stays 0 (searching
+        // bit by bit until a match shows up would never terminate in that case).
+        //
+
+        uint32_t temporary_flagBits = 0;
+        {
+          bool matched = false;
+          uint32_t lowestFlagbit = 0;
+          uint32_t groupStart = 0;
+          for (uint32_t i = 0; i != expanded_v_DB_reco_flagsSize; ++i) {
+            uint32_t const groupSize = expanded_Sizes_v_DB_reco_flags[i];
+            uint32_t const groupFlagbit = expanded_flagbit_v_DB_reco_flags[i];
+            // only groups that could lower the current result need to be scanned
+            if (!matched || groupFlagbit < lowestFlagbit) {
+              for (uint32_t j = 0; j < groupSize; ++j) {
+                if (expanded_v_DB_reco_flags[groupStart + j] == dbstatus) {
+                  matched = true;
+                  lowestFlagbit = groupFlagbit;
+                  break;
+                }
+              }
+            }
+            groupStart += groupSize;
+          }
+          if (matched) {
+            temporary_flagBits = 0x1 << lowestFlagbit;
+          }
+        }
+
+        flagBits[inputCh] = temporary_flagBits;
+
+        if ((flagmask & temporary_flagBits) && killDeadChannels) {
+          // skip this channel: energy stays at the default -1
+          continue;
+        }
+
+        //
+        // multiply the adc counts with factors to get GeV
+        //
+
+        //         energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use ;
+        energy[inputCh] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib;
+
+        // Time is not saved so far, FIXME
+        //         time[ch] = time_in[inputCh];
+
+        // NB: calculate the "flagBits extra"  --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ...
+
+        //
+        // extra packing ...
+        //
+
+        uint32_t offset;
+        uint32_t width;
+        uint32_t value;
+
+        float chi2_temp = chi2[inputCh];
+        if (chi2_temp > 64)
+          chi2_temp = 64;
+        // use 7 bits
+        uint32_t rawChi2 = lround(chi2_temp / 64. * ((1 << 7) - 1));
+
+        offset = 0;
+        width = 7;
+        value = 0;
+
+        uint32_t mask = ((1 << width) - 1) << offset;
+        value &= ~mask;
+        value |= (rawChi2 & ((1U << width) - 1)) << offset;
+
+        // rawEnergy is actually "error" !!!
+        uint32_t rawEnergy = 0;
+
+        // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA
+        //            if you want to store this in "extra", we need first to add it to the uncalibrecHit results
+        //            then it will be something like the following
+        //         amplitudeError[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib
+        //
+        //
+
+        float amplitudeError_ch = 0.;  // amplitudeError[ch];
+
+        // with amplitudeError_ch fixed to 0 this branch is never taken for now
+        if (amplitudeError_ch > 0.001) {
+          static constexpr float p10[] = {1.e-2f, 1.e-1f, 1.f, 1.e1f, 1.e2f, 1.e3f, 1.e4f, 1.e5f, 1.e6f};
+          int b = amplitudeError_ch < p10[4] ? 0 : 5;
+          for (; b < 9; ++b)
+            if (amplitudeError_ch < p10[b])
+              break;
+
+          uint16_t exponent = b;
+
+          static constexpr float ip10[] = {1.e5f, 1.e4f, 1.e3f, 1.e2f, 1.e1f, 1.e0f, 1.e-1f, 1.e-2f, 1.e-3f, 1.e-4f};
+          uint16_t significand = lround(amplitudeError_ch * ip10[exponent]);
+          // use 13 bits (3 exponent, 10 significand)
+          rawEnergy = exponent << 10 | significand;
+        }
+
+        offset = 8;
+        width = 13;
+        // value from last change, ok
+
+        mask = ((1 << width) - 1) << offset;
+        value &= ~mask;
+        value |= (rawEnergy & ((1U << width) - 1)) << offset;
+
+        uint32_t jitterErrorBits = 0;
+        jitterErrorBits = jitterErrorBits & 0xFF;
+
+        offset = 24;
+        width = 8;
+        // value from last change, ok
+
+        mask = ((1 << width) - 1) << offset;
+        value &= ~mask;
+        value |= (jitterErrorBits & ((1U << width) - 1)) << offset;
+
+        //
+        // now finally set "extra[ch]"
+        //
+        extra[inputCh] = value;
+
+        //
+        // additional flags setting
+        //
+        // using correctly the flags as calculated at the UncalibRecHit stage
+        //
+        // Now fill flags
+
+        bool good = true;
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kLeadingEdgeRecovered))) {
+          flagBits[inputCh] |= (0x1 << (RecHitFlags::RecHitFlags_kLeadingEdgeRecovered));
+          good = false;
+        }
+
+        // The saturation flag is tested once; testing it a second time would only repeat
+        // the same OR and the same assignment.
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) {
+          // leading edge recovery failed - still keep the information
+          // about the saturation and do not flag as dead
+          flagBits[inputCh] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated));
+          good = false;
+        }
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kOutOfTime))) {
+          flagBits[inputCh] |= (0x1 << (RecHitFlags::RecHitFlags_kOutOfTime));
+          good = false;
+        }
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kPoorReco))) {
+          flagBits[inputCh] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorReco));
+          good = false;
+        }
+        // the gain-switch flags are propagated without clearing "good"
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain6))) {
+          flagBits[inputCh] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain6));
+        }
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain1))) {
+          flagBits[inputCh] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain1));
+        }
+
+        if (good) {
+          flagBits[inputCh] |= (0x1 << (RecHitFlags::RecHitFlags_kGood));
+        }
+
+        // a laser correction outside the configured [MIN, MAX] window marks the calibration as poor
+        if ((isBarrel && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) ||
+            (!isBarrel && (lasercalib < EELaserMIN || lasercalib > EELaserMAX))) {
+          flagBits[inputCh] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib));
+        }
+
+        // recover, killing, and other stuff
+
+        //
+        // Structure:
+        //  EB
+        //  EE
+        //
+        //
+        //  - single MVA
+        //  - democratic sharing
+        //  - kill all the other cases
+        //
+
+        // DetIdToBeRecovered: statuses 10, 11 and 12
+        // NOTE(review): kDeadFE (13) is not part of this selection, so the is_FE case below
+        // cannot be reached as things stand -- confirm whether the intended set is 11, 12, 13.
+        bool const is_recoverable = dbstatus == EcalChannelStatusCode_Code::kFixedG0 ||
+                                    dbstatus == EcalChannelStatusCode_Code::kNonRespondingIsolated ||
+                                    dbstatus == EcalChannelStatusCode_Code::kDeadVFE;
+
+        if (is_recoverable) {
+          // exactly one of the three categories applies
+          bool const is_VFE = dbstatus == EcalChannelStatusCode_Code::kDeadVFE;
+          bool const is_FE = dbstatus == EcalChannelStatusCode_Code::kDeadFE;
+          bool const is_Single = !is_VFE && !is_FE;
+
+          // each partition uses its own recovery switches
+          bool const recoverIsolatedChannels = isBarrel ? recoverEBIsolatedChannels : recoverEEIsolatedChannels;
+          bool const recoverFE = isBarrel ? recoverEBFE : recoverEEFE;
+
+          // single MVA / democratic sharing: the energy computed above is left untouched
+          bool const keepEnergy = (is_Single && (recoverIsolatedChannels || !killDeadChannels)) ||
+                                  (is_FE && (recoverFE || !killDeadChannels));
+
+          // kill all the other cases
+          if (!keepEnergy) {
+            energy[inputCh] = 0.;  // Need to set also the flags ...
+          }
+        }
+
+      }  // end channel
+    }
+
+    // host version, to be called by the plugin
+    //
+    // Launches kernel_create_ecal_rehit on cudaStream for all EB + EE uncalibrated rechits of
+    // the event, forwarding the configuration, the conditions and the input/output device
+    // pointers. The launch is asynchronous with respect to the host: nothing here waits for
+    // the kernel to finish. Nothing is launched when the event has no uncalibrated rechits.
+    void create_ecal_rehit(EventInputDataGPU const& eventInputGPU,
+                           EventOutputDataGPU& eventOutputGPU,
+                           //     eventDataForScratchGPU_,
+                           ConditionsProducts const& conditions,
+                           ConfigurationParameters const& configParameters,
+                           uint32_t const nChannelsBarrel,
+                           edm::TimeValue_t const event_time,
+                           cudaStream_t cudaStream) {
+      int const nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size;
+
+      // With no channels the grid below would have zero blocks, which the CUDA runtime
+      // rejects as an invalid launch configuration.
+      if (nchannels == 0) {
+        return;
+      }
+
+      unsigned int const nchannels_per_block = 16;
+      unsigned int const threads_min = nchannels_per_block;
+      // enough blocks to give every channel its own thread (rounded up)
+      unsigned int const blocks_min = (nchannels + threads_min - 1) / threads_min;  // TEST : to be optimized (AM)
+
+      //
+      // kernel create rechit
+      //
+
+      kernel_create_ecal_rehit<<<blocks_min, threads_min, 0, cudaStream>>>(
+          // configuration
+          configParameters.ChannelStatusToBeExcluded,
+          configParameters.ChannelStatusToBeExcludedSize,
+          configParameters.killDeadChannels,
+          configParameters.recoverEBIsolatedChannels,
+          configParameters.recoverEEIsolatedChannels,
+          configParameters.recoverEBVFE,
+          configParameters.recoverEEVFE,
+          configParameters.recoverEBFE,
+          configParameters.recoverEEFE,
+          configParameters.EBLaserMIN,
+          configParameters.EELaserMIN,
+          configParameters.EBLaserMAX,
+          configParameters.EELaserMAX,
+          // for flags setting
+          configParameters.expanded_v_DB_reco_flags,
+          configParameters.expanded_Sizes_v_DB_reco_flags,
+          configParameters.expanded_flagbit_v_DB_reco_flags,
+          configParameters.expanded_v_DB_reco_flagsSize,
+          configParameters.flagmask,
+          // conditions
+          conditions.ADCToGeV.adc2gev,
+          conditions.Intercalib.values,
+          conditions.ChannelStatus.status,
+          conditions.LaserAPDPNRatiosRef.values,
+          conditions.LaserAlphas.values,
+          // input for transparency corrections
+          conditions.LaserAPDPNRatios.p1,
+          conditions.LaserAPDPNRatios.p2,
+          conditions.LaserAPDPNRatios.p3,
+          conditions.LaserAPDPNRatios.t1,
+          conditions.LaserAPDPNRatios.t2,
+          conditions.LaserAPDPNRatios.t3,
+          // input for linear corrections
+          conditions.LinearCorrections.p1,
+          conditions.LinearCorrections.p2,
+          conditions.LinearCorrections.p3,
+          conditions.LinearCorrections.t1,
+          conditions.LinearCorrections.t2,
+          conditions.LinearCorrections.t3,
+          // time, used for time dependent corrections
+          event_time,
+          // input; the uncalibrated "jitter" arrays are passed as the kernel's time inputs
+          eventInputGPU.ebUncalibRecHits.did.get(),
+          eventInputGPU.eeUncalibRecHits.did.get(),
+          eventInputGPU.ebUncalibRecHits.amplitude.get(),
+          eventInputGPU.eeUncalibRecHits.amplitude.get(),
+          eventInputGPU.ebUncalibRecHits.jitter.get(),
+          eventInputGPU.eeUncalibRecHits.jitter.get(),
+          eventInputGPU.ebUncalibRecHits.chi2.get(),
+          eventInputGPU.eeUncalibRecHits.chi2.get(),
+          eventInputGPU.ebUncalibRecHits.flags.get(),
+          eventInputGPU.eeUncalibRecHits.flags.get(),
+          // output
+          eventOutputGPU.recHitsEB.did.get(),
+          eventOutputGPU.recHitsEE.did.get(),
+          eventOutputGPU.recHitsEB.energy.get(),
+          eventOutputGPU.recHitsEE.energy.get(),
+          eventOutputGPU.recHitsEB.time.get(),
+          eventOutputGPU.recHitsEE.time.get(),
+          eventOutputGPU.recHitsEB.chi2.get(),
+          eventOutputGPU.recHitsEE.chi2.get(),
+          eventOutputGPU.recHitsEB.flagBits.get(),
+          eventOutputGPU.recHitsEE.flagBits.get(),
+          eventOutputGPU.recHitsEB.extra.get(),
+          eventOutputGPU.recHitsEE.extra.get(),
+          // other
+          nchannels,
+          nChannelsBarrel,
+          conditions.offsetForHashes);
+    }
+
+  }  // namespace rechit
+
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h
new file mode 100644
index 0000000000000..cb9c7f435d7b3
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h
@@ -0,0 +1,93 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h
+
+//
+// Builder of ECAL RecHits on GPU
+//
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/Provenance/interface/Timestamp.h"
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+
+namespace ecal {
+  namespace rechit {
+
+    __global__ void kernel_create_ecal_rehit(  // CUDA kernel: builds ECAL rechits from EB/EE uncalibrated rechits
+        // configuration: channel-status exclusion list and recovery switches
+        int const* ChannelStatusToBeExcluded,
+        uint32_t ChannelStatusToBeExcludedSize,  // presumably the number of entries in ChannelStatusToBeExcluded
+        bool killDeadChannels,
+        bool const recoverEBIsolatedChannels,
+        bool const recoverEEIsolatedChannels,
+        bool const recoverEBVFE,
+        bool const recoverEEVFE,
+        bool const recoverEBFE,
+        bool const recoverEEFE,
+        // for flags setting: flattened association of DB status flags to reco flag bits
+        int const* expanded_v_DB_reco_flags,
+        uint32_t const* expanded_Sizes_v_DB_reco_flags,
+        uint32_t const* expanded_flagbit_v_DB_reco_flags,
+        uint32_t expanded_v_DB_reco_flagsSize,
+        uint32_t flagmask,  // presumably: channels with any of these flag bits set are not propagated - confirm
+        // conditions
+        float const* adc2gev,
+        float const* intercalib,
+        uint16_t const* status,
+        float const* apdpnrefs,
+        float const* alphas,
+        // input for transparency corrections: three values and three timestamps
+        float const* p1,
+        float const* p2,
+        float const* p3,
+        edm::TimeValue_t const* t1,
+        edm::TimeValue_t const* t2,
+        edm::TimeValue_t const* t3,
+        // input for linear corrections: same shape as the transparency inputs (three values, three timestamps)
+        float const* lp1,
+        float const* lp2,
+        float const* lp3,
+        edm::TimeValue_t const* lt1,
+        edm::TimeValue_t const* lt2,
+        edm::TimeValue_t const* lt3,
+        // time, used for time dependent corrections
+        edm::TimeValue_t const event_time,
+        // input: uncalibrated rechits, one array per field and per subdetector (eb, ee)
+        uint32_t const* did_eb,
+        uint32_t const* did_ee,
+        ::ecal::reco::StorageScalarType const* amplitude_eb,  // in adc counts
+        ::ecal::reco::StorageScalarType const* amplitude_ee,  // in adc counts
+        ::ecal::reco::StorageScalarType const* time_eb,  // presumably the uncalibrated rechit jitter - confirm
+        ::ecal::reco::StorageScalarType const* time_ee,  // presumably the uncalibrated rechit jitter - confirm
+        ::ecal::reco::StorageScalarType const* chi2_eb,
+        ::ecal::reco::StorageScalarType const* chi2_ee,
+        uint32_t const* flags_eb,
+        uint32_t const* flags_ee,
+        // output; NOTE(review): a call visible elsewhere in this patch passes separate EB and EE output pointers - confirm this list matches the kernel definition
+        uint32_t* did,
+        ::ecal::reco::StorageScalarType* energy,  // in energy [GeV]
+        ::ecal::reco::StorageScalarType* time,
+        ::ecal::reco::StorageScalarType* chi2,
+        uint32_t* flagBits,
+        uint32_t* extra,
+        int const nchannels,  // presumably the total number of channels to process (EB + EE) - confirm
+        uint32_t const nChannelsBarrel,  // presumably the number of EB entries, i.e. where the EE part starts - confirm
+        uint32_t const offsetForHashes);  // presumably the offset carried by the conditions bundle - confirm
+
+    // host version, to be called by the plugin (EcalRecHitProducerGPU::acquire in this patch); declaration only
+
+    void create_ecal_rehit(EventInputDataGPU const& eventInputGPU,
+                           EventOutputDataGPU& eventOutputGPU,  // non-const: receives the produced rechits
+                           ConditionsProducts const& conditions,
+                           ConfigurationParameters const& configParameters,
+                           uint32_t const nChannelsBarrel,     // the plugin passes the EB uncalibrated rechit count
+                           edm::TimeValue_t const event_time,  // the plugin passes event.time().value()
+                           cudaStream_t cudaStream);           // the plugin passes ctx.stream()
+
+  }  // namespace rechit
+
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
new file mode 100644
index 0000000000000..6df36f4a8b592
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
@@ -0,0 +1,98 @@
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+
+#include "Common.h"
+
+class EcalRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
+  // SoA rechit product read from the event, one instance per subdetector (EB, EE)
+  using InputProduct = ecal::RecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+
+public:
+  explicit EcalRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps);
+  ~EcalRecHitConvertGPU2CPUFormat() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+  edm::EDGetTokenT<InputProduct> const recHitsGPUEB_;  // tokens of the two input products
+  edm::EDGetTokenT<InputProduct> const recHitsGPUEE_;
+  std::string const recHitsLabelCPUEB_, recHitsLabelCPUEE_;  // instance labels of the two outputs
+};
+
+void EcalRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  // parameter defaults; the description is registered under the module's default label
+  edm::ParameterSetDescription psd;
+  // input tags of the SoA rechit products
+  psd.add<edm::InputTag>("recHitsLabelGPUEB", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsGPUEB"});
+  psd.add<edm::InputTag>("recHitsLabelGPUEE", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsGPUEE"});
+  // product instance labels of the output collections
+  psd.add<std::string>("recHitsLabelCPUEB", "EcalRecHitsEB");
+  psd.add<std::string>("recHitsLabelCPUEE", "EcalRecHitsEE");
+  confDesc.addWithDefaultLabel(psd);
+}
+
+EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps)
+    : recHitsGPUEB_(consumes<InputProduct>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))),
+      recHitsGPUEE_(consumes<InputProduct>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))),
+      recHitsLabelCPUEB_(ps.getParameter<std::string>("recHitsLabelCPUEB")),
+      recHitsLabelCPUEE_(ps.getParameter<std::string>("recHitsLabelCPUEE")) {
+  produces<EBRecHitCollection>(recHitsLabelCPUEB_);  // EB output, under the configured instance label
+  produces<EERecHitCollection>(recHitsLabelCPUEE_);  // EE output, under the configured instance label
+}
+
+EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() = default;  // nothing to release by hand
+
+void EcalRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) {
+  //
+  // Copy the SoA rechit products (EB and EE) found in the event into
+  // EBRecHitCollection / EERecHitCollection and put those into the event
+  // under the configured instance labels.
+  //
+  auto const& soaEB = event.get(recHitsGPUEB_);
+  auto const& soaEE = event.get(recHitsGPUEE_);
+
+  // reserve for the case in which every input hit is kept
+  auto legacyEB = std::make_unique<EBRecHitCollection>();
+  auto legacyEE = std::make_unique<EERecHitCollection>();
+  legacyEB->reserve(soaEB.energy.size());
+  legacyEE->reserve(soaEE.energy.size());
+
+  //
+  // Appends the accepted hits of `soa` to `legacy`, preserving the input order.
+  //
+  auto const convert = [](auto const& soa, auto& legacy) {
+    auto const nHits = soa.energy.size();
+    for (uint32_t idx = 0; idx < nHits; ++idx) {
+      //
+      // Save only if energy is >= 0 !
+      // This is extremely important because the channels that were supposed
+      // to be excluded get "-1" as energy
+      //
+      if (!(soa.energy[idx] >= 0))
+        continue;
+
+      legacy.emplace_back(DetId{soa.did[idx]},
+                          soa.energy[idx],
+                          soa.time[idx],
+                          soa.extra[idx],
+                          soa.flagBits[idx]);
+    }
+  };
+
+  convert(soaEB, *legacyEB);
+  convert(soaEE, *legacyEE);
+
+  // hand the collections over to the event
+  event.put(std::move(legacyEB), recHitsLabelCPUEB_);
+  event.put(std::move(legacyEE), recHitsLabelCPUEE_);
+}
+
+DEFINE_FWK_MODULE(EcalRecHitConvertGPU2CPUFormat);  // exposes the producer above as a framework module plugin
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitParametersGPUESProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitParametersGPUESProducer.cc
new file mode 100644
index 0000000000000..a63ed42cb2b70
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitParametersGPUESProducer.cc
@@ -0,0 +1,83 @@
+#include <array>
+#include <tuple>
+#include <utility>
+
+#include "FWCore/Framework/interface/ESProducer.h"
+#include "FWCore/Framework/interface/ESProductHost.h"
+#include "FWCore/Framework/interface/ESTransientHandle.h"
+#include "FWCore/Framework/interface/EventSetupRecordIntervalFinder.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/ModuleFactory.h"
+#include "FWCore/Framework/interface/SourceFactory.h"
+#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRecHitParametersGPU.h"
+
+class EcalRecHitParametersGPUESProducer : public edm::ESProducer, public edm::EventSetupRecordIntervalFinder {
+  // copy of the module configuration; produce() builds each EcalRecHitParametersGPU from it
+  edm::ParameterSet const pset_;
+
+public:
+  EcalRecHitParametersGPUESProducer(edm::ParameterSet const&);
+  ~EcalRecHitParametersGPUESProducer() override = default;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+  std::unique_ptr<EcalRecHitParametersGPU> produce(JobConfigurationGPURecord const&);
+
+protected:
+  // EventSetupRecordIntervalFinder interface: reports the validity interval through the last argument
+  void setIntervalFor(edm::eventsetup::EventSetupRecordKey const&,
+                      edm::IOVSyncValue const&,
+                      edm::ValidityInterval&) override;
+};
+
+EcalRecHitParametersGPUESProducer::EcalRecHitParametersGPUESProducer(edm::ParameterSet const& pset) : pset_(pset) {
+  setWhatProduced(this);                       // produce() is the method that makes the product
+  findingRecord<JobConfigurationGPURecord>();  // this module also supplies that record's validity interval
+}
+
+void EcalRecHitParametersGPUESProducer::setIntervalFor(const edm::eventsetup::EventSetupRecordKey& /* key */,
+                                                       const edm::IOVSyncValue& /* syncValue */,
+                                                       edm::ValidityInterval& oInterval) {  // inputs are ignored
+  oInterval = edm::ValidityInterval{edm::IOVSyncValue::beginOfTime(), edm::IOVSyncValue::endOfTime()};
+}
+
+void EcalRecHitParametersGPUESProducer::fillDescriptions(edm::ConfigurationDescriptions& desc) {
+  using StatusNames = std::vector<std::string>;
+  edm::ParameterSetDescription psd;
+
+  //---- db statuses to be excluded from reconstruction
+  psd.add<StatusNames>("ChannelStatusToBeExcluded",
+                       {
+                           "kDAC",
+                           "kNoisy",
+                           "kNNoisy",
+                           "kFixedG6",
+                           "kFixedG1",
+                           "kFixedG0",
+                           "kNonRespondingIsolated",
+                           "kDeadVFE",
+                           "kDeadFE",
+                           "kNoDataNoTP",
+                       });
+
+  // reco flags association to DB flag: parameter name = reco flag, value = DB statuses listed for it
+  edm::ParameterSetDescription flagsMap;
+  flagsMap.add<StatusNames>("kGood", {"kOk", "kDAC", "kNoLaser", "kNoisy"});
+  flagsMap.add<StatusNames>("kNoisy", {"kNNoisy", "kFixedG6", "kFixedG1"});
+  flagsMap.add<StatusNames>("kNeighboursRecovered", {"kFixedG0", "kNonRespondingIsolated", "kDeadVFE"});
+  flagsMap.add<StatusNames>("kTowerRecovered", {"kDeadFE"});
+  flagsMap.add<StatusNames>("kDead", {"kNoDataNoTP"});
+
+  psd.add<edm::ParameterSetDescription>("flagsMapDBReco", flagsMap);
+
+  desc.addWithDefaultLabel(psd);
+}
+
+std::unique_ptr<EcalRecHitParametersGPU> EcalRecHitParametersGPUESProducer::produce(JobConfigurationGPURecord const&) {
+  return std::make_unique<EcalRecHitParametersGPU>(pset_);  // built from the stored configuration; the record is unused
+}
+
+DEFINE_FWK_EVENTSETUP_SOURCE(EcalRecHitParametersGPUESProducer);  // plugin registration for the ESProducer + interval finder above
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
new file mode 100644
index 0000000000000..a6dabd37f8439
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -0,0 +1,244 @@
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+#include "CommonTools/Utils/interface/StringToEnumValue.h"
+#include "CondFormats/DataRecord/interface/EcalADCToGeVConstantRcd.h"
+#include "CondFormats/DataRecord/interface/EcalChannelStatusRcd.h"
+#include "CondFormats/DataRecord/interface/EcalIntercalibConstantsRcd.h"
+#include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRcd.h"
+#include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRefRcd.h"
+#include "CondFormats/DataRecord/interface/EcalLaserAlphasRcd.h"
+#include "CondFormats/DataRecord/interface/EcalLinearCorrectionsRcd.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRecHitParametersGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
+
+#include "EcalRecHitBuilderKernels.h"
+
+class EcalRecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+  // product types: uncalibrated rechits in, rechits out, both wrapped in cms::cuda::Product
+  using InputProduct = cms::cuda::Product<ecal::UncalibratedRecHit<calo::common::DevStoragePolicy>>;
+  using OutputProduct = cms::cuda::Product<ecal::RecHit<calo::common::DevStoragePolicy>>;
+
+public:
+  explicit EcalRecHitProducerGPU(edm::ParameterSet const& ps);
+  ~EcalRecHitProducerGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  // ExternalWork interface: acquire() schedules the work, produce() puts the result into the event
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+  // EB / EE hit counts set in acquire() and read in produce(); extremely important, in particular neb_
+  uint32_t neb_, nee_;
+
+  // tokens of the input products
+  edm::EDGetTokenT<InputProduct> uncalibRecHitsInEBToken_;
+  edm::EDGetTokenT<InputProduct> uncalibRecHitsInEEToken_;
+
+  // output data of the current event
+  ecal::rechit::EventOutputDataGPU eventOutputDataGPU_;
+
+  // context state handed from acquire() to produce()
+  cms::cuda::ContextState cudaState_;
+
+  // tokens of the output products
+  edm::EDPutTokenT<OutputProduct> recHitsTokenEB_, recHitsTokenEE_;
+
+  // configuration parameters
+  ecal::rechit::ConfigurationParameters configParameters_;
+
+  // conditions handles, fetched from the EventSetup in acquire()
+  edm::ESHandle<EcalRechitADCToGeVConstantGPU> ADCToGeVConstantHandle_;
+  edm::ESHandle<EcalIntercalibConstantsGPU> IntercalibConstantsHandle_;
+  edm::ESHandle<EcalRechitChannelStatusGPU> ChannelStatusHandle_;
+  edm::ESHandle<EcalLaserAPDPNRatiosGPU> LaserAPDPNRatiosHandle_;
+  edm::ESHandle<EcalLaserAPDPNRatiosRefGPU> LaserAPDPNRatiosRefHandle_;
+  edm::ESHandle<EcalLaserAlphasGPU> LaserAlphasHandle_;
+  edm::ESHandle<EcalLinearCorrectionsGPU> LinearCorrectionsHandle_;
+  edm::ESHandle<EcalRecHitParametersGPU> recHitParametersHandle_;
+
+  // Associate reco flagbit (outer vector) to many db status flags (inner vector)
+  std::vector<int> expanded_v_DB_reco_flags_;  // Transform a map in a vector; FIXME AM: int or uint32 to be checked
+  std::vector<uint32_t> expanded_Sizes_v_DB_reco_flags_;    // Saving the size for each piece
+  std::vector<uint32_t> expanded_flagbit_v_DB_reco_flags_;  // And the "key" for each key
+  uint32_t flagmask_;  // do not propagate channels with these flags on
+};
+
+void EcalRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  // Describes every parameter the constructor reads and registers the
+  // description, so that configurations can be validated against it.
+  edm::ParameterSetDescription desc;
+
+  desc.add<edm::InputTag>("uncalibrecHitsInLabelEB",
+                          edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"));
+  desc.add<edm::InputTag>("uncalibrecHitsInLabelEE",
+                          edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"));
+
+  desc.add<std::string>("recHitsLabelEB", "EcalRecHitsGPUEB");
+  desc.add<std::string>("recHitsLabelEE", "EcalRecHitsGPUEE");
+
+  desc.add<bool>("killDeadChannels", true);
+
+  // Recovery switches: the constructor reads them with getParameter, so they
+  // have to be part of the description once it is registered.
+  // NOTE(review): default values are an assumption - confirm against the
+  // configuration this module is meant to be run with.
+  desc.add<bool>("recoverEBIsolatedChannels", false);
+  desc.add<bool>("recoverEEIsolatedChannels", false);
+  desc.add<bool>("recoverEBVFE", false);
+  desc.add<bool>("recoverEEVFE", false);
+  desc.add<bool>("recoverEBFE", true);
+  desc.add<bool>("recoverEEFE", true);
+
+  desc.add<double>("EBLaserMIN", 0.01);
+  desc.add<double>("EELaserMIN", 0.01);
+  desc.add<double>("EBLaserMAX", 30.0);
+  desc.add<double>("EELaserMAX", 30.0);
+
+  desc.add<uint32_t>("maxNumberHitsEB", 61200);
+  desc.add<uint32_t>("maxNumberHitsEE", 14648);
+
+  // Without this call the description built above is discarded when the function returns.
+  confDesc.addWithDefaultLabel(desc);
+}
+
+EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
+  //---- input
+  uncalibRecHitsInEBToken_ = consumes<InputProduct>(ps.getParameter<edm::InputTag>("uncalibrecHitsInLabelEB"));
+  uncalibRecHitsInEEToken_ = consumes<InputProduct>(ps.getParameter<edm::InputTag>("uncalibrecHitsInLabelEE"));
+
+  //---- output
+  recHitsTokenEB_ = produces<OutputProduct>(ps.getParameter<std::string>("recHitsLabelEB"));
+  recHitsTokenEE_ = produces<OutputProduct>(ps.getParameter<std::string>("recHitsLabelEE"));
+
+  //---- scalar configuration, stored in the parameter bundle that acquire() passes on
+  auto& cfg = configParameters_;
+  cfg.killDeadChannels = ps.getParameter<bool>("killDeadChannels");
+  cfg.EBLaserMIN = ps.getParameter<double>("EBLaserMIN");
+  cfg.EELaserMIN = ps.getParameter<double>("EELaserMIN");
+  cfg.EBLaserMAX = ps.getParameter<double>("EBLaserMAX");
+  cfg.EELaserMAX = ps.getParameter<double>("EELaserMAX");
+
+  // max number of digis to allocate for
+  cfg.maxNumberHitsEB = ps.getParameter<uint32_t>("maxNumberHitsEB");
+  cfg.maxNumberHitsEE = ps.getParameter<uint32_t>("maxNumberHitsEE");
+
+  //---- flag mask: one bit per reco flag listed below
+  flagmask_ = 0;
+  for (auto const flag : {EcalRecHit::kNeighboursRecovered,
+                          EcalRecHit::kTowerRecovered,
+                          EcalRecHit::kDead,
+                          EcalRecHit::kKilled,
+                          EcalRecHit::kTPSaturated,
+                          EcalRecHit::kL1SpikeFlag})
+    flagmask_ |= 0x1 << flag;
+  cfg.flagmask = flagmask_;
+
+  //---- for recovery and killing
+  cfg.recoverEBIsolatedChannels = ps.getParameter<bool>("recoverEBIsolatedChannels");
+  cfg.recoverEEIsolatedChannels = ps.getParameter<bool>("recoverEEIsolatedChannels");
+  cfg.recoverEBVFE = ps.getParameter<bool>("recoverEBVFE");
+  cfg.recoverEEVFE = ps.getParameter<bool>("recoverEEVFE");
+  cfg.recoverEBFE = ps.getParameter<bool>("recoverEBFE");
+  cfg.recoverEEFE = ps.getParameter<bool>("recoverEEFE");
+}
+
+EcalRecHitProducerGPU::~EcalRecHitProducerGPU() = default;  // nothing to release by hand
+
+void EcalRecHitProducerGPU::acquire(edm::Event const& event,
+                                    edm::EventSetup const& setup,
+                                    edm::WaitingTaskWithArenaHolder holder) {
+  // Schedules the rechit building for this event; starts from the EB/EE uncalibrated rechit cuda products
+  auto const& ebUncalibRecHitsProduct = event.get(uncalibRecHitsInEBToken_);
+  auto const& eeUncalibRecHitsProduct = event.get(uncalibRecHitsInEEToken_);
+  // raii: the context is built from the EB product, the task holder and the state reused in produce()
+  cms::cuda::ScopedContextAcquire ctx{ebUncalibRecHitsProduct, std::move(holder), cudaState_};
+  // get actual object
+  auto const& ebUncalibRecHits = ctx.get(ebUncalibRecHitsProduct);
+  auto const& eeUncalibRecHits = ctx.get(eeUncalibRecHitsProduct);
+
+  ecal::rechit::EventInputDataGPU inputDataGPU{ebUncalibRecHits, eeUncalibRecHits};
+
+  neb_ = ebUncalibRecHits.size;  // hit counts are kept for produce()
+  nee_ = eeUncalibRecHits.size;
+
+  if ((neb_ > configParameters_.maxNumberHitsEB) || (nee_ > configParameters_.maxNumberHitsEE)) {
+    edm::LogError("EcalRecHitProducerGPU")
+        << "max number of channels exceeded. See options 'maxNumberHitsEB and maxNumberHitsEE' ";
+  }  // NOTE(review): execution continues after the error - confirm the output allocated below can hold neb_/nee_ hits
+
+  int nchannelsEB = ebUncalibRecHits.size;  // --> offsetForInput, first EB and then EE
+
+  // conditions
+  // - laser correction
+  // - IC
+  // - adc2gev
+
+  // fetch the condition handles from their EventSetup records
+  setup.get<EcalADCToGeVConstantRcd>().get(ADCToGeVConstantHandle_);
+  setup.get<EcalIntercalibConstantsRcd>().get(IntercalibConstantsHandle_);
+  setup.get<EcalChannelStatusRcd>().get(ChannelStatusHandle_);
+
+  setup.get<EcalLaserAPDPNRatiosRcd>().get(LaserAPDPNRatiosHandle_);
+  setup.get<EcalLaserAPDPNRatiosRefRcd>().get(LaserAPDPNRatiosRefHandle_);
+  setup.get<EcalLaserAlphasRcd>().get(LaserAlphasHandle_);
+  setup.get<EcalLinearCorrectionsRcd>().get(LinearCorrectionsHandle_);
+  setup.get<JobConfigurationGPURecord>().get(recHitParametersHandle_);
+
+  auto const& ADCToGeVConstantProduct = ADCToGeVConstantHandle_->getProduct(ctx.stream());
+  auto const& IntercalibConstantsProduct = IntercalibConstantsHandle_->getProduct(ctx.stream());
+  auto const& ChannelStatusProduct = ChannelStatusHandle_->getProduct(ctx.stream());
+
+  auto const& LaserAPDPNRatiosProduct = LaserAPDPNRatiosHandle_->getProduct(ctx.stream());
+  auto const& LaserAPDPNRatiosRefProduct = LaserAPDPNRatiosRefHandle_->getProduct(ctx.stream());
+  auto const& LaserAlphasProduct = LaserAlphasHandle_->getProduct(ctx.stream());
+  auto const& LinearCorrectionsProduct = LinearCorrectionsHandle_->getProduct(ctx.stream());
+  auto const& recHitParametersProduct = recHitParametersHandle_->getProduct(ctx.stream());
+
+  // set config ptrs : this is done to avoid changing things downstream (sizes come from the handle's getValues())
+  configParameters_.ChannelStatusToBeExcluded = recHitParametersProduct.ChannelStatusToBeExcluded;
+  configParameters_.ChannelStatusToBeExcludedSize = std::get<0>(recHitParametersHandle_->getValues()).get().size();
+  configParameters_.expanded_v_DB_reco_flags = recHitParametersProduct.expanded_v_DB_reco_flags;
+  configParameters_.expanded_Sizes_v_DB_reco_flags = recHitParametersProduct.expanded_Sizes_v_DB_reco_flags;
+  configParameters_.expanded_flagbit_v_DB_reco_flags = recHitParametersProduct.expanded_flagbit_v_DB_reco_flags;
+  configParameters_.expanded_v_DB_reco_flagsSize = std::get<3>(recHitParametersHandle_->getValues()).get().size();
+
+  // bundle up conditions; the last entry is the offset reported by the intercalibration constants
+  ecal::rechit::ConditionsProducts conditions{ADCToGeVConstantProduct,
+                                              IntercalibConstantsProduct,
+                                              ChannelStatusProduct,
+                                              LaserAPDPNRatiosProduct,
+                                              LaserAPDPNRatiosRefProduct,
+                                              LaserAlphasProduct,
+                                              LinearCorrectionsProduct,
+                                              IntercalibConstantsHandle_->getOffset()};
+
+  // dev mem: allocate the output for this event on the context's stream
+  eventOutputDataGPU_.allocate(configParameters_, ctx.stream());
+
+  //
+  // schedule algorithms
+  //
+
+  edm::TimeValue_t event_time = event.time().value();
+
+  ecal::rechit::create_ecal_rehit(
+      inputDataGPU, eventOutputDataGPU_, conditions, configParameters_, nchannelsEB, event_time, ctx.stream());
+
+  cudaCheck(cudaGetLastError());  // check the error state left by the CUDA calls above
+}
+
+void EcalRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};
+  // record the hit counts stored by acquire(), then move each collection into the event
+  auto& ebHits = eventOutputDataGPU_.recHitsEB;
+  auto& eeHits = eventOutputDataGPU_.recHitsEE;
+  ebHits.size = neb_;
+  eeHits.size = nee_;
+  ctx.emplace(event, recHitsTokenEB_, std::move(ebHits));
+  ctx.emplace(event, recHitsTokenEE_, std::move(eeHits));
+}
+
+DEFINE_FWK_MODULE(EcalRecHitProducerGPU);  // exposes the producer above as a framework module plugin
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
new file mode 100644
index 0000000000000..f7e57a61fdd96
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
@@ -0,0 +1,93 @@
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+
+#include "Common.h"
+
+class EcalUncalibRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
+  // SoA uncalibrated rechit product read from the event, one instance per subdetector (EB, EE)
+  using InputProduct = ecal::UncalibratedRecHit<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+
+public:
+  explicit EcalUncalibRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps);
+  ~EcalUncalibRecHitConvertGPU2CPUFormat() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+  edm::EDGetTokenT<InputProduct> const recHitsGPUEB_;  // tokens of the two input products
+  edm::EDGetTokenT<InputProduct> const recHitsGPUEE_;
+  std::string const recHitsLabelCPUEB_, recHitsLabelCPUEE_;  // instance labels of the two outputs
+};
+
+void EcalUncalibRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  // parameter defaults, registered under the label "ecalUncalibRecHitConvertGPU2CPUFormat"
+  edm::ParameterSetDescription psd;
+  // input tags of the SoA uncalibrated rechit products
+  psd.add<edm::InputTag>("recHitsLabelGPUEB", edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"});
+  psd.add<edm::InputTag>("recHitsLabelGPUEE", edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"});
+  // product instance labels of the output collections
+  psd.add<std::string>("recHitsLabelCPUEB", "EcalUncalibRecHitsEB");
+  psd.add<std::string>("recHitsLabelCPUEE", "EcalUncalibRecHitsEE");
+  confDesc.add("ecalUncalibRecHitConvertGPU2CPUFormat", psd);
+}
+
+EcalUncalibRecHitConvertGPU2CPUFormat::EcalUncalibRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps)
+    : recHitsGPUEB_(consumes<InputProduct>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))),
+      recHitsGPUEE_(consumes<InputProduct>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))),
+      recHitsLabelCPUEB_(ps.getParameter<std::string>("recHitsLabelCPUEB")),
+      recHitsLabelCPUEE_(ps.getParameter<std::string>("recHitsLabelCPUEE")) {
+  produces<EBUncalibratedRecHitCollection>(recHitsLabelCPUEB_);  // EB output, under the configured instance label
+  produces<EEUncalibratedRecHitCollection>(recHitsLabelCPUEE_);  // EE output, under the configured instance label
+}
+
+EcalUncalibRecHitConvertGPU2CPUFormat::~EcalUncalibRecHitConvertGPU2CPUFormat() = default;  // nothing to release by hand
+
+void EcalUncalibRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // fetch the SoA uncalibrated rechit products (EB, EE)
+  edm::Handle<InputProduct> hSoAEB, hSoAEE;
+  event.getByToken(recHitsGPUEB_, hSoAEB);
+  event.getByToken(recHitsGPUEE_, hSoAEE);
+
+  auto legacyEB = std::make_unique<EBUncalibratedRecHitCollection>();
+  auto legacyEE = std::make_unique<EEUncalibratedRecHitCollection>();
+  legacyEB->reserve(hSoAEB->amplitude.size());
+  legacyEE->reserve(hSoAEE->amplitude.size());
+
+  //
+  // Appends one uncalibrated rechit to `legacy` per entry of `soa`, so the hit
+  // built from entry `idx` is also element `idx` of `legacy`.
+  //
+  auto const convert = [](auto const& soa, auto& legacy) {
+    auto const nHits = soa.amplitude.size();
+    for (uint32_t idx = 0; idx < nHits; ++idx) {
+      legacy.emplace_back(DetId{soa.did[idx]},
+                          soa.amplitude[idx],
+                          soa.pedestal[idx],
+                          soa.jitter[idx],
+                          soa.chi2[idx],
+                          soa.flags[idx]);
+      auto& hit = legacy[idx];
+      hit.setJitterError(soa.jitterError[idx]);
+
+      // amplitudesAll is read as MAXSAMPLES consecutive values per hit
+      auto const first = idx * EcalDataFrame::MAXSAMPLES;
+      for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample)
+        hit.setOutOfTimeAmplitude(sample, soa.amplitudesAll[first + sample]);
+    }
+  };
+  convert(*hSoAEB, *legacyEB);
+  convert(*hSoAEE, *legacyEE);
+
+  event.put(std::move(legacyEB), recHitsLabelCPUEB_);
+  event.put(std::move(legacyEE), recHitsLabelCPUEE_);
+}
+
+DEFINE_FWK_MODULE(EcalUncalibRecHitConvertGPU2CPUFormat);  // exposes the producer above as a framework module plugin
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgoGPU.cu b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgoGPU.cu
new file mode 100644
index 0000000000000..9d5a8a2ad1bd3
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgoGPU.cu
@@ -0,0 +1,305 @@
+#include <iostream>
+#include <limits>
+
+#include <cuda.h>
+
+#include "CondFormats/EcalObjects/interface/EcalMGPAGainRatio.h"
+#include "CondFormats/EcalObjects/interface/EcalPedestals.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "CondFormats/EcalObjects/interface/EcalSampleMask.h"
+#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
+#include "CondFormats/EcalObjects/interface/EcalXtalGroupId.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+
+#include "AmplitudeComputationCommonKernels.h"
+#include "AmplitudeComputationKernels.h"
+#include "Common.h"
+#include "EcalUncalibRecHitMultiFitAlgoGPU.h"
+#include "TimeComputationKernels.h"
+
+//#define DEBUG
+
+//#define ECAL_RECO_CUDA_DEBUG
+
+namespace ecal {
+  namespace multifit {
+
+    // Schedule the ECAL multifit reconstruction of one event on `cudaStream`.
+    //
+    // Launches, in this order: the 1d preparation kernel, the 2d preparation kernel, the minimization procedure
+    // and, only if configParameters.shouldRunTimingComputation is set, the chain of timing kernels.
+    // totalChannels is the number of EB plus EE digis; offsetForInputs is the number of EB digis.
+    // Every kernel launch done here is followed by cudaCheck(cudaGetLastError()).
+    // Returns right away, without launching anything, when the event holds no digis at all.
+    void entryPoint(EventInputDataGPU const& eventInputGPU,
+                    EventOutputDataGPU& eventOutputGPU,
+                    EventDataForScratchGPU& scratch,
+                    ConditionsProducts const& conditions,
+                    ConfigurationParameters const& configParameters,
+                    cudaStream_t cudaStream) {
+      // according to the cpu setup  //----> hardcoded
+      bool const gainSwitchUseMaxSampleEB = true;
+      bool const gainSwitchUseMaxSampleEE = false;
+
+      uint32_t const offsetForHashes = conditions.offsetForHashes;
+      uint32_t const offsetForInputs = eventInputGPU.ebDigis.size;
+      unsigned int const totalChannels = eventInputGPU.ebDigis.size + eventInputGPU.eeDigis.size;
+
+      // An event without digis needs no work, and it must not reach kernel_prep_2d: that kernel is launched
+      // with one block per channel, and CUDA rejects a launch with zero blocks as an invalid configuration.
+      if (totalChannels == 0)
+        return;
+
+      // 1d preparation kernel
+      unsigned int nchannels_per_block = 32;
+      unsigned int threads_1d = 10 * nchannels_per_block;
+      unsigned int blocks_1d = threads_1d > 10 * totalChannels ? 1 : (totalChannels * 10 + threads_1d - 1) / threads_1d;
+      int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES *
+                         (sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + sizeof(bool));
+      kernel_prep_1d_and_initialize<<<blocks_1d, threads_1d, shared_bytes, cudaStream>>>(
+          conditions.pulseShapes.values,
+          eventInputGPU.ebDigis.data.get(),
+          eventInputGPU.ebDigis.ids.get(),
+          eventInputGPU.eeDigis.data.get(),
+          eventInputGPU.eeDigis.ids.get(),
+          (SampleVector*)scratch.samples.get(),
+          (SampleVector*)eventOutputGPU.recHitsEB.amplitudesAll.get(),
+          (SampleVector*)eventOutputGPU.recHitsEE.amplitudesAll.get(),
+          (SampleGainVector*)scratch.gainsNoise.get(),
+          conditions.pedestals.mean_x1,
+          conditions.pedestals.mean_x12,
+          conditions.pedestals.rms_x12,
+          conditions.pedestals.mean_x6,
+          conditions.gainRatios.gain6Over1,
+          conditions.gainRatios.gain12Over6,
+          scratch.hasSwitchToGain6.get(),
+          scratch.hasSwitchToGain1.get(),
+          scratch.isSaturated.get(),
+          eventOutputGPU.recHitsEB.amplitude.get(),
+          eventOutputGPU.recHitsEE.amplitude.get(),
+          eventOutputGPU.recHitsEB.chi2.get(),
+          eventOutputGPU.recHitsEE.chi2.get(),
+          eventOutputGPU.recHitsEB.pedestal.get(),
+          eventOutputGPU.recHitsEE.pedestal.get(),
+          eventOutputGPU.recHitsEB.did.get(),
+          eventOutputGPU.recHitsEE.did.get(),
+          eventOutputGPU.recHitsEB.flags.get(),
+          eventOutputGPU.recHitsEE.flags.get(),
+          scratch.acState.get(),
+          (BXVectorType*)scratch.activeBXs.get(),
+          offsetForHashes,
+          offsetForInputs,
+          gainSwitchUseMaxSampleEB,
+          gainSwitchUseMaxSampleEE,
+          totalChannels);
+      cudaCheck(cudaGetLastError());
+
+      // 2d preparation kernel: one block per channel, 10 x 10 threads per block
+      int blocks_2d = totalChannels;
+      dim3 threads_2d{10, 10};
+      kernel_prep_2d<<<blocks_2d, threads_2d, 0, cudaStream>>>((SampleGainVector*)scratch.gainsNoise.get(),
+                                                               eventInputGPU.ebDigis.ids.get(),
+                                                               eventInputGPU.eeDigis.ids.get(),
+                                                               conditions.pedestals.rms_x12,
+                                                               conditions.pedestals.rms_x6,
+                                                               conditions.pedestals.rms_x1,
+                                                               conditions.gainRatios.gain12Over6,
+                                                               conditions.gainRatios.gain6Over1,
+                                                               conditions.samplesCorrelation.EBG12SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EBG6SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EBG1SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EEG12SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EEG6SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EEG1SamplesCorrelation,
+                                                               (SampleMatrix*)scratch.noisecov.get(),
+                                                               (PulseMatrixType*)scratch.pulse_matrix.get(),
+                                                               conditions.pulseShapes.values,
+                                                               scratch.hasSwitchToGain6.get(),
+                                                               scratch.hasSwitchToGain1.get(),
+                                                               scratch.isSaturated.get(),
+                                                               offsetForHashes,
+                                                               offsetForInputs);
+      cudaCheck(cudaGetLastError());
+
+      // run minimization kernels
+      v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream);
+
+      // optional timing computation
+      if (configParameters.shouldRunTimingComputation) {
+        // TODO: this guy can run concurrently with other kernels,
+        // there is no dependence on the order of execution
+        unsigned int threads_time_init = threads_1d;
+        unsigned int blocks_time_init = blocks_1d;
+        int sharedBytesInit = 2 * threads_time_init * sizeof(SampleVector::Scalar);
+        kernel_time_computation_init<<<blocks_time_init, threads_time_init, sharedBytesInit, cudaStream>>>(
+            eventInputGPU.ebDigis.data.get(),
+            eventInputGPU.ebDigis.ids.get(),
+            eventInputGPU.eeDigis.data.get(),
+            eventInputGPU.eeDigis.ids.get(),
+            conditions.pedestals.rms_x12,
+            conditions.pedestals.rms_x6,
+            conditions.pedestals.rms_x1,
+            conditions.pedestals.mean_x12,
+            conditions.pedestals.mean_x6,
+            conditions.pedestals.mean_x1,
+            conditions.gainRatios.gain12Over6,
+            conditions.gainRatios.gain6Over1,
+            scratch.sample_values.get(),
+            scratch.sample_value_errors.get(),
+            scratch.ampMaxError.get(),
+            scratch.useless_sample_values.get(),
+            scratch.pedestal_nums.get(),
+            offsetForHashes,
+            offsetForInputs,
+            conditions.sampleMask.getEcalSampleMaskRecordEB(),
+            conditions.sampleMask.getEcalSampleMaskRecordEE(),
+            totalChannels);
+        cudaCheck(cudaGetLastError());
+
+        // TODO: small kernel only for EB. It needs to be checked if fusing such small kernels is beneficial here.
+        // We are running only over EB digis, therefore we need to create threads/blocks only for that.
+        unsigned int const threadsFixMGPA = threads_1d;
+        unsigned int const blocksFixMGPA =
+            threadsFixMGPA > 10 * eventInputGPU.ebDigis.size
+                ? 1
+                : (10 * eventInputGPU.ebDigis.size + threadsFixMGPA - 1) / threadsFixMGPA;
+        kernel_time_compute_fixMGPAslew<<<blocksFixMGPA, threadsFixMGPA, 0, cudaStream>>>(
+            eventInputGPU.ebDigis.data.get(),
+            eventInputGPU.eeDigis.data.get(),
+            scratch.sample_values.get(),
+            scratch.sample_value_errors.get(),
+            scratch.useless_sample_values.get(),
+            conditions.sampleMask.getEcalSampleMaskRecordEB(),
+            totalChannels,
+            offsetForInputs);
+        cudaCheck(cudaGetLastError());
+
+        int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * 4 * sizeof(SampleVector::Scalar);
+        auto const threads_nullhypot = threads_1d;
+        auto const blocks_nullhypot = blocks_1d;
+        kernel_time_compute_nullhypot<<<blocks_nullhypot, threads_nullhypot, sharedBytes, cudaStream>>>(
+            scratch.sample_values.get(),
+            scratch.sample_value_errors.get(),
+            scratch.useless_sample_values.get(),
+            scratch.chi2sNullHypot.get(),
+            scratch.sum0sNullHypot.get(),
+            scratch.sumAAsNullHypot.get(),
+            totalChannels);
+        cudaCheck(cudaGetLastError());
+
+        unsigned int nchannels_per_block_makeratio = 10;
+        unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio;
+        unsigned int blocks_makeratio = threads_makeratio > 45 * totalChannels
+                                            ? 1
+                                            : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio;
+        int sharedBytesMakeRatio = 5 * threads_makeratio * sizeof(SampleVector::Scalar);
+        kernel_time_compute_makeratio<<<blocks_makeratio, threads_makeratio, sharedBytesMakeRatio, cudaStream>>>(
+            scratch.sample_values.get(),
+            scratch.sample_value_errors.get(),
+            eventInputGPU.ebDigis.ids.get(),
+            eventInputGPU.eeDigis.ids.get(),
+            scratch.useless_sample_values.get(),
+            scratch.pedestal_nums.get(),
+            configParameters.amplitudeFitParametersEB,
+            configParameters.amplitudeFitParametersEE,
+            configParameters.timeFitParametersEB,
+            configParameters.timeFitParametersEE,
+            scratch.sumAAsNullHypot.get(),
+            scratch.sum0sNullHypot.get(),
+            scratch.tMaxAlphaBetas.get(),
+            scratch.tMaxErrorAlphaBetas.get(),
+            scratch.accTimeMax.get(),
+            scratch.accTimeWgt.get(),
+            scratch.tcState.get(),
+            configParameters.timeFitParametersSizeEB,
+            configParameters.timeFitParametersSizeEE,
+            configParameters.timeFitLimitsFirstEB,
+            configParameters.timeFitLimitsFirstEE,
+            configParameters.timeFitLimitsSecondEB,
+            configParameters.timeFitLimitsSecondEE,
+            totalChannels,
+            offsetForInputs);
+        cudaCheck(cudaGetLastError());
+
+        auto const threads_findamplchi2 = threads_1d;
+        auto const blocks_findamplchi2 = blocks_1d;
+        int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * sizeof(SampleVector::Scalar);
+        kernel_time_compute_findamplchi2_and_finish<<<blocks_findamplchi2,
+                                                      threads_findamplchi2,
+                                                      sharedBytesFindAmplChi2,
+                                                      cudaStream>>>(scratch.sample_values.get(),
+                                                                    scratch.sample_value_errors.get(),
+                                                                    eventInputGPU.ebDigis.ids.get(),
+                                                                    eventInputGPU.eeDigis.ids.get(),
+                                                                    scratch.useless_sample_values.get(),
+                                                                    scratch.tMaxAlphaBetas.get(),
+                                                                    scratch.tMaxErrorAlphaBetas.get(),
+                                                                    scratch.accTimeMax.get(),
+                                                                    scratch.accTimeWgt.get(),
+                                                                    configParameters.amplitudeFitParametersEB,
+                                                                    configParameters.amplitudeFitParametersEE,
+                                                                    scratch.sumAAsNullHypot.get(),
+                                                                    scratch.sum0sNullHypot.get(),
+                                                                    scratch.chi2sNullHypot.get(),
+                                                                    scratch.tcState.get(),
+                                                                    scratch.ampMaxAlphaBeta.get(),
+                                                                    scratch.ampMaxError.get(),
+                                                                    scratch.timeMax.get(),
+                                                                    scratch.timeError.get(),
+                                                                    totalChannels,
+                                                                    offsetForInputs);
+        cudaCheck(cudaGetLastError());
+
+        unsigned int const threads_timecorr = 32;
+        auto const blocks_timecorr =
+            threads_timecorr > totalChannels ? 1 : (totalChannels + threads_timecorr - 1) / threads_timecorr;
+        kernel_time_correction_and_finalize<<<blocks_timecorr, threads_timecorr, 0, cudaStream>>>(
+            eventOutputGPU.recHitsEB.amplitude.get(),
+            eventOutputGPU.recHitsEE.amplitude.get(),
+            eventInputGPU.ebDigis.data.get(),
+            eventInputGPU.ebDigis.ids.get(),
+            eventInputGPU.eeDigis.data.get(),
+            eventInputGPU.eeDigis.ids.get(),
+            conditions.timeBiasCorrections.EBTimeCorrAmplitudeBins,
+            conditions.timeBiasCorrections.EETimeCorrAmplitudeBins,
+            conditions.timeBiasCorrections.EBTimeCorrShiftBins,
+            conditions.timeBiasCorrections.EETimeCorrShiftBins,
+            scratch.timeMax.get(),
+            scratch.timeError.get(),
+            conditions.pedestals.rms_x12,
+            conditions.timeCalibConstants.values,
+            eventOutputGPU.recHitsEB.jitter.get(),
+            eventOutputGPU.recHitsEE.jitter.get(),
+            eventOutputGPU.recHitsEB.jitterError.get(),
+            eventOutputGPU.recHitsEE.jitterError.get(),
+            eventOutputGPU.recHitsEB.flags.get(),
+            eventOutputGPU.recHitsEE.flags.get(),
+            conditions.timeBiasCorrections.EBTimeCorrAmplitudeBinsSize,
+            conditions.timeBiasCorrections.EETimeCorrAmplitudeBinsSize,
+            configParameters.timeConstantTermEB,
+            configParameters.timeConstantTermEE,
+            conditions.timeOffsetConstant.getEBValue(),
+            conditions.timeOffsetConstant.getEEValue(),
+            configParameters.timeNconstEB,
+            configParameters.timeNconstEE,
+            configParameters.amplitudeThreshEB,
+            configParameters.amplitudeThreshEE,
+            configParameters.outOfTimeThreshG12pEB,
+            configParameters.outOfTimeThreshG12pEE,
+            configParameters.outOfTimeThreshG12mEB,
+            configParameters.outOfTimeThreshG12mEE,
+            configParameters.outOfTimeThreshG61pEB,
+            configParameters.outOfTimeThreshG61pEE,
+            configParameters.outOfTimeThreshG61mEB,
+            configParameters.outOfTimeThreshG61mEE,
+            offsetForHashes,
+            offsetForInputs,
+            totalChannels);
+        cudaCheck(cudaGetLastError());
+      }
+    }
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgoGPU.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgoGPU.h
new file mode 100644
index 0000000000000..c84047a8bf8e7
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgoGPU.h
@@ -0,0 +1,23 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgoGPU_h
+#define RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgoGPU_h
+
+#include <vector>
+
+#include <cuda.h>
+
+#include "DeclsForKernels.h"
+
+namespace ecal {
+  namespace multifit {
+    // Host-side entry point of the ECAL multifit reconstruction; cudaStream is the CUDA stream to work on.
+    void entryPoint(EventInputDataGPU const& eventInputGPU,
+                    EventOutputDataGPU& eventOutputGPU,
+                    EventDataForScratchGPU& scratch,
+                    ConditionsProducts const& conditions,
+                    ConfigurationParameters const& configParameters,
+                    cudaStream_t cudaStream);
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgoGPU_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
new file mode 100644
index 0000000000000..a321f35144c39
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
@@ -0,0 +1,279 @@
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h"
+#include "CondFormats/DataRecord/interface/EcalGainRatiosRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPedestalsRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPulseCovariancesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPulseShapesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalSampleMaskRcd.h"
+#include "CondFormats/DataRecord/interface/EcalSamplesCorrelationRcd.h"
+#include "CondFormats/DataRecord/interface/EcalTimeBiasCorrectionsRcd.h"
+#include "CondFormats/DataRecord/interface/EcalTimeCalibConstantsRcd.h"
+#include "CondFormats/DataRecord/interface/EcalTimeOffsetConstantRcd.h"
+#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalMultifitParametersGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EcalUncalibRecHitMultiFitAlgoGPU.h"
+
+// Stream producer using edm::ExternalWork: consumes the EB and EE digi collections wrapped in
+// cms::cuda::Product and produces the matching uncalibrated rec hit collections, wrapped the same way.
+class EcalUncalibRecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit EcalUncalibRecHitProducerGPU(edm::ParameterSet const& ps);
+  ~EcalUncalibRecHitProducerGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  using InputProduct = cms::cuda::Product<ecal::DigisCollection<calo::common::DevStoragePolicy>>;
+  using OutputProduct = cms::cuda::Product<ecal::UncalibratedRecHit<calo::common::DevStoragePolicy>>;
+
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+  // event data tokens (the declaration order of all tokens matches the constructor's initializer list)
+  edm::EDGetTokenT<InputProduct> const digisTokenEB_;
+  edm::EDGetTokenT<InputProduct> const digisTokenEE_;
+  edm::EDPutTokenT<OutputProduct> const recHitsTokenEB_;
+  edm::EDPutTokenT<OutputProduct> const recHitsTokenEE_;
+  // conditions tokens
+  edm::ESGetToken<EcalPedestalsGPU, EcalPedestalsRcd> const pedestalsToken_;
+  edm::ESGetToken<EcalGainRatiosGPU, EcalGainRatiosRcd> const gainRatiosToken_;
+  edm::ESGetToken<EcalPulseShapesGPU, EcalPulseShapesRcd> const pulseShapesToken_;
+  edm::ESGetToken<EcalPulseCovariancesGPU, EcalPulseCovariancesRcd> const pulseCovariancesToken_;
+  edm::ESGetToken<EcalSamplesCorrelationGPU, EcalSamplesCorrelationRcd> const samplesCorrelationToken_;
+  edm::ESGetToken<EcalTimeBiasCorrectionsGPU, EcalTimeBiasCorrectionsRcd> const timeBiasCorrectionsToken_;
+  edm::ESGetToken<EcalTimeCalibConstantsGPU, EcalTimeCalibConstantsRcd> const timeCalibConstantsToken_;
+  edm::ESGetToken<EcalSampleMask, EcalSampleMaskRcd> const sampleMaskToken_;
+  edm::ESGetToken<EcalTimeOffsetConstant, EcalTimeOffsetConstantRcd> const timeOffsetConstantToken_;
+  edm::ESGetToken<EcalMultifitParametersGPU, JobConfigurationGPURecord> const multifitParametersToken_;
+  // values read from the configuration in the constructor, plus the fit-parameter fields set in acquire()
+  ecal::multifit::ConfigurationParameters configParameters_;
+  // output buffers: allocated in acquire(), moved into the event in produce()
+  ecal::multifit::EventOutputDataGPU eventOutputDataGPU_;
+  cms::cuda::ContextState cudaState_;  // handed to the scoped contexts of acquire() and produce()
+  uint32_t neb_, nee_;                 // sizes of the EB / EE digi collections, set in acquire()
+};
+
+// Declare the configuration parameters of the module together with their default values.
+void EcalUncalibRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+
+  // input digi collections and instance labels of the produced collections
+  desc.add<edm::InputTag>("digisLabelEB", edm::InputTag{"ecalRawToDigiGPU", "ebDigis"});
+  desc.add<edm::InputTag>("digisLabelEE", edm::InputTag{"ecalRawToDigiGPU", "eeDigis"});
+  desc.add<std::string>("recHitsLabelEB", "EcalUncalibRecHitsEB");
+  desc.add<std::string>("recHitsLabelEE", "EcalUncalibRecHitsEE");
+
+  // double-valued parameters, added to the description in the order they are listed
+  struct DoubleDefault {
+    char const* label;
+    double value;
+  };
+  DoubleDefault const doubleDefaults[] = {
+      {"EBtimeFitLimits_Lower", 0.2}, {"EBtimeFitLimits_Upper", 1.4},
+      {"EEtimeFitLimits_Lower", 0.2}, {"EEtimeFitLimits_Upper", 1.4},
+      {"EBtimeConstantTerm", 0.6}, {"EEtimeConstantTerm", 1.0},
+      {"EBtimeNconst", 28.5}, {"EEtimeNconst", 31.8},
+      {"outOfTimeThresholdGain12pEB", 5.0}, {"outOfTimeThresholdGain12mEB", 5.0},
+      {"outOfTimeThresholdGain61pEB", 5.0}, {"outOfTimeThresholdGain61mEB", 5.0},
+      {"outOfTimeThresholdGain12pEE", 1000.0}, {"outOfTimeThresholdGain12mEE", 1000.0},
+      {"outOfTimeThresholdGain61pEE", 1000.0}, {"outOfTimeThresholdGain61mEE", 1000.0},
+      {"amplitudeThresholdEB", 10.0}, {"amplitudeThresholdEE", 10.0}};
+  for (auto const& entry : doubleDefaults)
+    desc.add<double>(entry.label, entry.value);
+  desc.add<uint32_t>("maxNumberHitsEB", 61200);
+  desc.add<uint32_t>("maxNumberHitsEE", 14648);
+  desc.addUntracked<std::vector<uint32_t>>("kernelMinimizeThreads", {32, 1, 1});
+  desc.add<bool>("shouldRunTimingComputation", true);
+  confDesc.addWithDefaultLabel(desc);
+}
+
+// Register the consumed and produced event products and the consumed conditions, then cache the
+// configuration parameters in configParameters_.
+// Throws std::out_of_range if "kernelMinimizeThreads" is configured with fewer than three values.
+EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU(const edm::ParameterSet& ps)
+    : digisTokenEB_{consumes<InputProduct>(ps.getParameter<edm::InputTag>("digisLabelEB"))},
+      digisTokenEE_{consumes<InputProduct>(ps.getParameter<edm::InputTag>("digisLabelEE"))},
+      recHitsTokenEB_{produces<OutputProduct>(ps.getParameter<std::string>("recHitsLabelEB"))},
+      recHitsTokenEE_{produces<OutputProduct>(ps.getParameter<std::string>("recHitsLabelEE"))},
+      pedestalsToken_{esConsumes<EcalPedestalsGPU, EcalPedestalsRcd>()},
+      gainRatiosToken_{esConsumes<EcalGainRatiosGPU, EcalGainRatiosRcd>()},
+      pulseShapesToken_{esConsumes<EcalPulseShapesGPU, EcalPulseShapesRcd>()},
+      pulseCovariancesToken_{esConsumes<EcalPulseCovariancesGPU, EcalPulseCovariancesRcd>()},
+      samplesCorrelationToken_{esConsumes<EcalSamplesCorrelationGPU, EcalSamplesCorrelationRcd>()},
+      timeBiasCorrectionsToken_{esConsumes<EcalTimeBiasCorrectionsGPU, EcalTimeBiasCorrectionsRcd>()},
+      timeCalibConstantsToken_{esConsumes<EcalTimeCalibConstantsGPU, EcalTimeCalibConstantsRcd>()},
+      sampleMaskToken_{esConsumes<EcalSampleMask, EcalSampleMaskRcd>()},
+      timeOffsetConstantToken_{esConsumes<EcalTimeOffsetConstant, EcalTimeOffsetConstantRcd>()},
+      multifitParametersToken_{esConsumes<EcalMultifitParametersGPU, JobConfigurationGPURecord>()} {
+  std::pair<double, double> EBtimeFitLimits, EEtimeFitLimits;
+  EBtimeFitLimits.first = ps.getParameter<double>("EBtimeFitLimits_Lower");
+  EBtimeFitLimits.second = ps.getParameter<double>("EBtimeFitLimits_Upper");
+  EEtimeFitLimits.first = ps.getParameter<double>("EEtimeFitLimits_Lower");
+  EEtimeFitLimits.second = ps.getParameter<double>("EEtimeFitLimits_Upper");
+
+  auto EBtimeConstantTerm = ps.getParameter<double>("EBtimeConstantTerm");
+  auto EEtimeConstantTerm = ps.getParameter<double>("EEtimeConstantTerm");
+  auto EBtimeNconst = ps.getParameter<double>("EBtimeNconst");
+  auto EEtimeNconst = ps.getParameter<double>("EEtimeNconst");
+
+  auto outOfTimeThreshG12pEB = ps.getParameter<double>("outOfTimeThresholdGain12pEB");
+  auto outOfTimeThreshG12mEB = ps.getParameter<double>("outOfTimeThresholdGain12mEB");
+  auto outOfTimeThreshG61pEB = ps.getParameter<double>("outOfTimeThresholdGain61pEB");
+  auto outOfTimeThreshG61mEB = ps.getParameter<double>("outOfTimeThresholdGain61mEB");
+  auto outOfTimeThreshG12pEE = ps.getParameter<double>("outOfTimeThresholdGain12pEE");
+  auto outOfTimeThreshG12mEE = ps.getParameter<double>("outOfTimeThresholdGain12mEE");
+  auto outOfTimeThreshG61pEE = ps.getParameter<double>("outOfTimeThresholdGain61pEE");
+  auto outOfTimeThreshG61mEE = ps.getParameter<double>("outOfTimeThresholdGain61mEE");
+  auto amplitudeThreshEB = ps.getParameter<double>("amplitudeThresholdEB");
+  auto amplitudeThreshEE = ps.getParameter<double>("amplitudeThresholdEE");
+
+  // max number of digis to allocate for
+  configParameters_.maxNumberHitsEB = ps.getParameter<uint32_t>("maxNumberHitsEB");
+  configParameters_.maxNumberHitsEE = ps.getParameter<uint32_t>("maxNumberHitsEE");
+
+  // switch to run timing computation kernels
+  configParameters_.shouldRunTimingComputation = ps.getParameter<bool>("shouldRunTimingComputation");
+
+  // minimize kernel launch conf. The list comes from the user configuration, hence the checked access:
+  // a list that is too short raises std::out_of_range instead of reading past the end of the vector.
+  auto const threadsMinimize = ps.getUntrackedParameter<std::vector<uint32_t>>("kernelMinimizeThreads");
+  configParameters_.kernelMinimizeThreads[0] = threadsMinimize.at(0);
+  configParameters_.kernelMinimizeThreads[1] = threadsMinimize.at(1);
+  configParameters_.kernelMinimizeThreads[2] = threadsMinimize.at(2);
+
+  // configuration and physics parameters: done once
+
+  // time fit parameters and limits
+  configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first;
+  configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second;
+  configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first;
+  configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second;
+
+  // time constant terms
+  configParameters_.timeConstantTermEB = EBtimeConstantTerm;
+  configParameters_.timeConstantTermEE = EEtimeConstantTerm;
+
+  // time N const
+  configParameters_.timeNconstEB = EBtimeNconst;
+  configParameters_.timeNconstEE = EEtimeNconst;
+
+  // amplitude threshold for time flags
+  configParameters_.amplitudeThreshEB = amplitudeThreshEB;
+  configParameters_.amplitudeThreshEE = amplitudeThreshEE;
+
+  // out of time thresholds gain-dependent
+  configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB;
+  configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE;
+  configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB;
+  configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE;
+  configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB;
+  configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE;
+  configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB;
+  configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE;
+}
+
+EcalUncalibRecHitProducerGPU::~EcalUncalibRecHitProducerGPU() = default;  // no explicit cleanup is done here
+
+// Read the digi products and the conditions, allocate the output and scratch buffers and pass everything to
+// ecal::multifit::entryPoint() together with the CUDA stream of the scoped context.
+void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event,
+                                           edm::EventSetup const& setup,
+                                           edm::WaitingTaskWithArenaHolder holder) {
+  // wrapped digi collections; the scoped context is built from the EB product and stores into cudaState_
+  auto const& productEB = event.get(digisTokenEB_);
+  auto const& productEE = event.get(digisTokenEE_);
+  cms::cuda::ScopedContextAcquire ctx{productEB, std::move(holder), cudaState_};
+  auto const stream = ctx.stream();
+  // unwrap the collections and remember their sizes for produce()
+  auto const& digisEB = ctx.get(productEB);
+  auto const& digisEE = ctx.get(productEE);
+  ecal::multifit::EventInputDataGPU inputDataGPU{digisEB, digisEE};
+  neb_ = digisEB.size;
+  nee_ = digisEE.size;
+
+  // NOTE(review): exceeding the configured maxima is only logged; the event is still processed below
+  bool const fitsEB = neb_ <= configParameters_.maxNumberHitsEB;
+  bool const fitsEE = nee_ <= configParameters_.maxNumberHitsEE;
+  if (!(fitsEB && fitsEE)) {
+    edm::LogError("EcalUncalibRecHitProducerGPU")
+        << "max number of channels exceeded. See options 'maxNumberHitsEB and maxNumberHitsEE' ";
+  }
+
+  // conditions objects that are also needed as they come from the event setup
+  auto const& timeCalibConstantsData = setup.getData(timeCalibConstantsToken_);
+  auto const& sampleMaskData = setup.getData(sampleMaskToken_);
+  auto const& timeOffsetConstantData = setup.getData(timeOffsetConstantToken_);
+  auto const& multifitParametersData = setup.getData(multifitParametersToken_);
+
+  // conditions products requested for the CUDA stream of this event
+  auto const& pedestals = setup.getData(pedestalsToken_).getProduct(stream);
+  auto const& gainRatios = setup.getData(gainRatiosToken_).getProduct(stream);
+  auto const& pulseShapes = setup.getData(pulseShapesToken_).getProduct(stream);
+  auto const& pulseCovariances = setup.getData(pulseCovariancesToken_).getProduct(stream);
+  auto const& samplesCorrelation = setup.getData(samplesCorrelationToken_).getProduct(stream);
+  auto const& timeBiasCorrections = setup.getData(timeBiasCorrectionsToken_).getProduct(stream);
+  auto const& timeCalibConstants = timeCalibConstantsData.getProduct(stream);
+  auto const& multifitParameters = multifitParametersData.getProduct(stream);
+
+  // assign ptrs/values: this is done not to change how things look downstream
+  configParameters_.amplitudeFitParametersEB = multifitParameters.amplitudeFitParametersEB;
+  configParameters_.amplitudeFitParametersEE = multifitParameters.amplitudeFitParametersEE;
+  configParameters_.timeFitParametersEB = multifitParameters.timeFitParametersEB;
+  configParameters_.timeFitParametersEE = multifitParameters.timeFitParametersEE;
+  configParameters_.timeFitParametersSizeEB = multifitParametersData.getValues()[2].get().size();
+  configParameters_.timeFitParametersSizeEE = multifitParametersData.getValues()[3].get().size();
+
+  // bundle up conditions
+  ecal::multifit::ConditionsProducts conditions{pedestals,
+                                                gainRatios,
+                                                pulseShapes,
+                                                pulseCovariances,
+                                                samplesCorrelation,
+                                                timeBiasCorrections,
+                                                timeCalibConstants,
+                                                sampleMaskData,
+                                                timeOffsetConstantData,
+                                                timeCalibConstantsData.getOffset(),
+                                                multifitParameters};
+
+  // the output buffers live in a member so that produce() can publish them; the scratch buffers are local
+  eventOutputDataGPU_.allocate(configParameters_, stream);
+  ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU;
+  eventDataForScratchGPU.allocate(configParameters_, stream);
+
+  // schedule algorithms
+  ecal::multifit::entryPoint(
+      inputDataGPU, eventOutputDataGPU_, eventDataForScratchGPU, conditions, configParameters_, stream);
+}
+
+// Publish the output buffers of the current event; their sizes are the digi counts stored by acquire().
+void EcalUncalibRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};
+
+  // set the size of each collection before it is moved out of the member
+  auto& hitsEB = eventOutputDataGPU_.recHitsEB;
+  auto& hitsEE = eventOutputDataGPU_.recHitsEE;
+  hitsEB.size = neb_;
+  hitsEE.size = nee_;
+  ctx.emplace(event, recHitsTokenEB_, std::move(hitsEB));
+  ctx.emplace(event, recHitsTokenEE_, std::move(hitsEE));
+}
+
+DEFINE_FWK_MODULE(EcalUncalibRecHitProducerGPU);  // presumably registers the class as a framework module plugin (macro defined elsewhere)
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h b/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h
new file mode 100644
index 0000000000000..bbf9cb0dbb5c9
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h
@@ -0,0 +1,49 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h
+#define RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h
+
+#include <array>
+
+#include <Eigen/Dense>
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+
+namespace ecal {
+  namespace multifit {
+    // compile-time dimensions used by the Eigen types declared in this namespace
+    constexpr int SampleVectorSize = 10;
+    constexpr int FullSampleVectorSize = 19;
+    constexpr int PulseVectorSize = 12;
+    constexpr int NGains = 3;
+    // scalar type of every matrix / vector below that is not explicitly char or double
+    using data_type = ::ecal::reco::ComputationScalarType;
+    // fixed-size helper types
+    using PulseMatrixType = Eigen::Matrix<data_type, SampleVectorSize, SampleVectorSize>;
+    using BXVectorType = Eigen::Matrix<char, SampleVectorSize, 1>;
+    using SampleMatrixD = Eigen::Matrix<double, SampleVectorSize, SampleVectorSize>;
+    // template arguments after the dimensions are <Options = 0, MaxRows, MaxCols>: dynamic size, fixed upper bound
+    using SampleVector = Eigen::Matrix<data_type, SampleVectorSize, 1>;
+    using FullSampleVector = Eigen::Matrix<data_type, FullSampleVectorSize, 1>;
+    using PulseVector = Eigen::Matrix<data_type, Eigen::Dynamic, 1, 0, PulseVectorSize, 1>;
+    using BXVector = Eigen::Matrix<char, Eigen::Dynamic, 1, 0, PulseVectorSize, 1>;
+    using SampleGainVector = Eigen::Matrix<char, SampleVectorSize, 1>;
+    using SampleMatrix = Eigen::Matrix<data_type, SampleVectorSize, SampleVectorSize>;
+    using FullSampleMatrix = Eigen::Matrix<data_type, FullSampleVectorSize, FullSampleVectorSize>;
+    using PulseMatrix = Eigen::Matrix<data_type, Eigen::Dynamic, Eigen::Dynamic, 0, PulseVectorSize, PulseVectorSize>;
+    using SamplePulseMatrix =
+        Eigen::Matrix<data_type, SampleVectorSize, Eigen::Dynamic, 0, SampleVectorSize, PulseVectorSize>;
+    using SampleDecompLLT = Eigen::LLT<SampleMatrix>;
+    using SampleDecompLLTD = Eigen::LLT<SampleMatrixD>;
+    using PulseDecompLLT = Eigen::LLT<PulseMatrix>;
+    using PulseDecompLDLT = Eigen::LDLT<PulseMatrix>;
+    // 1x1 types; both names alias the same Eigen type
+    using SingleMatrix = Eigen::Matrix<data_type, 1, 1>;
+    using SingleVector = Eigen::Matrix<data_type, 1, 1>;
+    // one double-precision sample matrix per gain
+    using SampleMatrixGainArray = std::array<SampleMatrixD, NGains>;
+    // permutation matrix with the row count of SampleMatrix
+    using PermutationMatrix = Eigen::PermutationMatrix<SampleMatrix::RowsAtCompileTime>;
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu
new file mode 100644
index 0000000000000..5316ed87d6ecc
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu
@@ -0,0 +1,308 @@
+#include "DataFormats/EcalDetId/interface/EBDetId.h"
+#include "DataFormats/EcalDetId/interface/EEDetId.h"
+
+#include "KernelHelpers.h"
+
+namespace ecal {
+  namespace reconstruction {
+
+    namespace internal {
+
+      namespace barrel {
+
+        __device__ __forceinline__ bool positiveZ(uint32_t id) { return (id & (1u << 16)) != 0; }  // bit 16
+
+        __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id & (0x7Fu << 9)) >> 9; }  // bits 9..15
+
+        __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & ((1u << 9) - 1u); }  // bits 0..8
+
+        // Map a supermodule number to a DCC number:
+        // supermodule numbers above 18 give ism - 9, all others give ism + 27.
+        __device__ int dccFromSm(int ism) {
+          constexpr int kSmPerSide = 18;
+          constexpr int kDccOffset = 9;
+          if (ism > kSmPerSide) {
+            // second group of supermodules: renumber from 1 before adding the offset
+            return kDccOffset + (ism - kSmPerSide);
+          }
+          return kDccOffset + ism + kSmPerSide;
+        }
+
+        // Supermodule number of (ieta, iphi): (iphi - 1) / 20 + 1, where iphi values above 360
+        // are first reduced by 360 (once); 18 is added when ieta is negative.
+        __device__ int sm(int ieta, int iphi) {
+          const bool negativeEta = ieta < 0;
+          const int phi = (iphi > 360) ? iphi - 360 : iphi;
+
+          int supermodule = (phi - 1) / 20 + 1;
+          if (negativeEta) {
+            // negative ieta selects the second group of 18 supermodule numbers
+            supermodule += 18;
+          }
+          return supermodule;
+        }
+
+        // DCC number for (ieta, iphi): supermodule lookup via sm(), then dccFromSm().
+        __device__ int dcc(int ieta, int iphi) {
+          return dccFromSm(sm(ieta, iphi));
+        }
+
+        //
+        // ---- helpers for the laser monitoring region lookup (the mapping is table-driven, not a closed formula)
+        //
+
+        // Table lookup for column iX (0..16) and row iY (0..3); returns -1 if the flattened index is outside the table.
+        __device__ int lm_channel(int iX, int iY) {
+          static const int idx_[] = {
+              // clang-format off
+         // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+            1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8,  // 3
+            1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8,  // 2
+            1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9,  // 1
+            1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9  // 0
+         // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+              // clang-format on
+          };
+          constexpr int iym = 4;   // number of table rows
+          constexpr int ixm = 17;  // number of table columns
+          constexpr int nEntries = static_cast<int>(sizeof(idx_) / sizeof(idx_[0]));
+
+          // the table is stored top row first (row 3 down to row 0), hence the flip of iY
+          const int il = iym - (iY + 1);
+          const int ii = il * ixm + iX;
+          // valid indices are [0, nEntries); a test with '>' would let ii == nEntries read past the end of idx_
+          if (ii < 0 || ii >= nEntries) {
+            return -1;
+          }
+          return idx_[ii];
+        }
+
+        // Local x coordinate derived from ieta.
+        //
+        // Returns |ieta| - 1: the sign of ieta is dropped and the result is shifted
+        // down by one (ieta == 0 therefore yields -1). The iphi argument is not used
+        // by this function.
+        __device__ int localCoord_x(int ieta, int iphi) {
+          const int absEta = (ieta < 0) ? -ieta : ieta;
+
+          return absEta - 1;
+        }
+
+        // Local y coordinate derived from iphi.
+        //
+        // iphi values above 360 are first reduced by 360 (once); the coordinate is then
+        // (iphi - 1) % 20, which lies in 0..19 for iphi >= 1.
+        // For negative ieta the coordinate is mirrored: 19 - y.
+        __device__ int localCoord_y(int ieta, int iphi) {
+          const int phi = (iphi > 360) ? iphi - 360 : iphi;
+          const int y = (phi - 1) % 20;
+
+          if (ieta < 0) {
+            // mirrored numbering on the negative-ieta side
+            return 19 - y;
+          }
+
+          return y;
+        }
+
+        // Laser-monitoring module for (ieta, iphi): both local coordinates are divided by 5
+        // and the resulting pair is looked up through lm_channel().
+        __device__ int lmmod(int ieta, int iphi) {
+          const int blockX = localCoord_x(ieta, iphi) / 5;
+          return lm_channel(blockX, localCoord_y(ieta, iphi) / 5);
+        }
+
+        // 1 when the laser-monitoring module number of (ieta, iphi) is even, 0 otherwise.
+        __device__ int side(int ieta, int iphi) {
+          return (lmmod(ieta, iphi) % 2 != 0) ? 0 : 1;
+        }
+
+      }  // namespace barrel
+
+    }  // namespace internal
+
+    __device__ uint32_t hashedIndexEB(uint32_t id) {  // hashed index built from the z-side, ieta and iphi fields of id
+      using namespace internal::barrel;  // ietaAbs() is unsigned: -ietaAbs(id) wraps and the sum is taken modulo 2^32
+      return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1;
+    }
+
+    //
+    // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEBGeom.cc
+    //  function: "lmr"
+
+    __device__ int laser_monitoring_region_EB(uint32_t id) {
+      using namespace internal::barrel;
+
+      // signed ieta: the magnitude comes from the id, the sign from its z-side bit
+      const int etaMagnitude = ietaAbs(id);
+      const int ieta = positiveZ(id) ? etaMagnitude : -etaMagnitude;
+      const int phi = static_cast<int>(iphi(id));
+
+      // the index used below is the DCC number shifted down by 9
+      const int ism = dcc(ieta, phi) - 9;
+
+      // 0 or 1, depending on the parity of the laser-monitoring module number
+      const int iside = side(ieta, phi);
+
+      // two consecutive region numbers per value of ism, the second one for iside == 1
+      return 1 + 2 * (ism - 1) + iside;
+    }
+
+    namespace internal {
+
+      namespace endcap {
+
+        __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id & (0x7Fu << 7)) >> 7; }  // bits 7..13
+
+        __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & ((1u << 7) - 1u); }  // bits 0..6
+
+        __device__ __forceinline__ bool positiveZ(uint32_t id) { return (id & (1u << 14)) != 0; }  // bit 14
+
+        // these constants come from EE Det Id; hashedIndexEE() subtracts kxf[jd] from ix, jd = 2 * (iy - 1) + (ix - 1) / 50
+        __constant__ const unsigned short kxf[] = {
+            41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21,
+            51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51,
+            6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 4,  51, 4,  51, 4,
+            51, 4,  51, 4,  56, 1,  58, 1,  59, 1,  60, 1,  61, 1,  61, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62,
+            1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  61, 1,  61, 1,  60, 1,  59, 1,  58, 4,  56, 4,  51, 4,
+            51, 4,  51, 4,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51,
+            9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21,
+            51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51};
+        // kdi[jd] is the offset that hashedIndexEE() adds for the same table entry jd
+        __constant__ const unsigned short kdi[] = {
+            0,    10,   20,   30,   40,   50,   60,   75,   90,   105,  120,  145,  170,  195,  220,  245,  270,
+            300,  330,  360,  390,  420,  450,  480,  510,  540,  570,  605,  640,  675,  710,  747,  784,  821,
+            858,  895,  932,  969,  1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500,
+            1545, 1590, 1635, 1680, 1725, 1770, 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265,
+            2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030,
+            3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, 3467, 3506, 3545, 3584, 3623, 3662, 3701,
+            3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, 4212, 4253, 4294, 4336, 4378,
+            4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014, 5059, 5104, 5149,
+            5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, 5908,
+            5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577,
+            6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104,
+            7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
+
+        // Quadrant (1..4) of the position (iX, iY); the split is at 11 on both axes:
+        //
+        //   iY >= 11 :      2    |    1
+        //   iY <  11 :      3    |    4
+        //               iX < 11  | iX >= 11
+        //
+        __device__ int quadrant(int iX, int iY) {
+          const bool near = iX >= 11;
+          const bool top = iY >= 11;
+
+          if (top) {
+            // upper half
+            return near ? 1 : 2;
+          }
+
+          // lower half
+          return near ? 4 : 3;
+        }
+
+        __device__ int sector(int iX, int iY) {
+          // Sector (1..9) of position (iX, iY), both 1-based; -1 for empty table cells (0) or out-of-table indices.
+          //  Y (towards the surface)
+          //  T
+          //  |
+          //  |
+          //  o---------| X  (towards center of LHC)
+          //
+          static const int idx_[] = {
+              // clang-format off
+             // 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
+                0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0,  // 20
+                0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0,  // 19
+                0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 8, 0, 0, 0,  // 18
+                0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 0, 0,  // 17
+                0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 8, 0,  // 16
+                0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9, 9, 9, 9, 8, 8, 8, 8, 8, 0,  // 15
+                0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9, 9, 9, 8, 8, 8, 8, 8, 8, 0,  // 14
+                2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8,  // 13
+                3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0, 8, 8, 8, 8, 8, 8, 8, 7, 7,  // 12
+                3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0, 0, 8, 7, 7, 7, 7, 7, 7, 7,  // 11
+                3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7,  // 10
+                3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0, 6, 6, 7, 7, 7, 7, 7, 7, 7,  // 9
+                3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 7,  // 8
+                0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 0,  // 7
+                0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 0,  // 6
+                0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 0,  // 5
+                0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 0, 0,  // 4
+                0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 0, 0, 0,  // 3
+                0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 0, 0, 0, 0,  // 2
+                0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0   // 1
+             // 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
+              // clang-format on
+          };
+
+          constexpr int iym = 20;  // number of table rows
+          constexpr int ixm = 20;  // number of table columns
+          constexpr int nEntries = static_cast<int>(sizeof(idx_) / sizeof(idx_[0]));
+
+          // the table is stored top row first (iY = 20 down to iY = 1)
+          const int il = iym - iY;
+          const int ic = iX - 1;
+          const int ii = il * ixm + ic;
+          // valid indices are [0, nEntries); the bound has to hold before idx_[ii] is read in the same condition
+          if (ii < 0 || ii >= nEntries || idx_[ii] == 0) {
+            return -1;
+          }
+          return idx_[ii];
+        }
+
+      }  // namespace endcap
+
+    }  // namespace internal
+
+    __device__ uint32_t hashedIndexEE(uint32_t id) {
+      using namespace internal::endcap;
+      const uint32_t col = ix(id);
+      const uint32_t entry = 2 * (iy(id) - 1) + (col - 1) / 50;  // two table entries per iy, picked by (ix - 1) / 50
+      const auto sideOffset = positiveZ(id) ? EEDetId::kEEhalf : 0;  // the z+ side is shifted by kEEhalf
+      return sideOffset + kdi[entry] + col - kxf[entry];
+    }
+
+    //
+    // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEEGeom.cc
+    // https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc
+    //
+
+    __device__ int laser_monitoring_region_EE(uint32_t id) {
+      using namespace internal::endcap;
+
+      // SuperCrysCoord: crystal coordinates grouped in blocks of 5, numbered from 1
+      const uint32_t iX = (ix(id) - 1) / 5 + 1;
+      const uint32_t iY = (iy(id) - 1) / 5 + 1;
+
+      // Correct convention
+      //   * @param iz iz/zside index: -1 for EE-, +1 for EE+
+      //   https://github.com/cms-sw/cmssw/blob/master/DataFormats/EcalDetId/interface/EEDetId.h#L68-L71
+      //   zside in https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc#L63
+      //
+      const int iz = positiveZ(id) ? 1 : -1;
+
+      // positions that map to no sector have no laser monitoring region
+      const int isect = sector(iX, iY);
+      if (isect < 0)
+        return -1;
+
+      const int iquad = quadrant(iX, iY);
+
+      // shift the sector number down by 6; non-positive results wrap around by 9
+      int ilmr = isect - 6;
+      if (ilmr <= 0)
+        ilmr += 9;
+      // 9 is bumped to 10 first; only afterwards is 8 in quadrant 4 turned into 9
+      if (ilmr == 9)
+        ilmr++;
+      if (ilmr == 8 && iquad == 4)
+        ilmr++;
+
+      // the z+ side is offset by 72, the z- side by 82
+      return ilmr + (iz == +1 ? 72 : 82);
+    }
+
+  }  // namespace reconstruction
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
new file mode 100644
index 0000000000000..74c5b68d8e137
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
@@ -0,0 +1,26 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
+#define RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
+
+#include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
+
+#include <cmath>
+#include <limits>
+#include <type_traits>
+
+#include <Eigen/Dense>
+
+namespace ecal {
+  namespace reconstruction {
+    // hashed index of a barrel (EB) channel, computed from the bit fields of its raw detector id
+    __device__ uint32_t hashedIndexEB(uint32_t id);
+    // hashed index of an endcap (EE) channel, computed from the bit fields of its raw detector id
+    __device__ uint32_t hashedIndexEE(uint32_t id);
+    // laser monitoring region of a barrel channel
+    __device__ int laser_monitoring_region_EB(uint32_t id);
+    // laser monitoring region of an endcap channel; -1 when the channel maps to no sector
+    __device__ int laser_monitoring_region_EE(uint32_t id);
+
+  }  // namespace reconstruction
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu
new file mode 100644
index 0000000000000..9c2d2fc986c08
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu
@@ -0,0 +1,1133 @@
+#include <cmath>
+#include <limits>
+
+#include <cuda.h>
+
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
+
+#include "Common.h"
+#include "TimeComputationKernels.h"
+#include "KernelHelpers.h"
+
+//#define DEBUG
+
+//#define ECAL_RECO_CUDA_DEBUG
+
+namespace ecal {
+  namespace multifit {
+
+    __device__ __forceinline__ bool use_sample(unsigned int sample_mask, unsigned int sample) {
+      return (sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - 1 - sample))) != 0;  // bit (MAXSAMPLES - 1 - sample)
+    }
+
+    __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,  // in: per (channel, sample)
+                                                  SampleVector::Scalar const* sample_value_errors,  // in: same layout
+                                                  bool const* useless_sample_values,  // in: true = sample is skipped
+                                                  SampleVector::Scalar* chi2s,   // out: one value per channel
+                                                  SampleVector::Scalar* sum0s,   // out: number of used samples
+                                                  SampleVector::Scalar* sumAAs,  // out: weighted sum of value^2
+                                                  const int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+      // one thread per (channel, sample); the per-channel sums are reduced in shared memory
+      // indices: tx = global thread id, ltx = id within the block, ch = channel handled by this thread
+      int tx = threadIdx.x + blockDim.x * blockIdx.x;
+      int ltx = threadIdx.x;
+      int ch = tx / nsamples;
+      int nchannels_per_block = blockDim.x / nsamples;
+
+      // threads that return here should not affect the __syncthreads() below since they have exited the kernel
+      if (ch >= nchannels)
+        return;
+
+      int sample = tx % nsamples;
+
+      // shared mem layout: one char array (sum0) followed by three Scalar arrays, nchannels_per_block * nsamples each
+      extern __shared__ char sdata[];
+      char* s_sum0 = sdata;
+      SampleVector::Scalar* s_sum1 = reinterpret_cast<SampleVector::Scalar*>(s_sum0 + nchannels_per_block * nsamples);
+      SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples;
+      SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples;
+      // NOTE(review): the arrays are indexed with ltx, so blockDim.x is presumably a multiple of nsamples - confirm
+      // TODO make sure no div by 0 (a zero error on a used sample makes inv_error infinite)
+      const auto inv_error =
+          useless_sample_values[tx] ? 0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]);
+      const auto sample_value = sample_values[tx];
+      s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
+      s_sum1[ltx] = inv_error;
+      s_sumA[ltx] = sample_value * inv_error;
+      s_sumAA[ltx] = sample_value * sample_value * inv_error;
+      __syncthreads();
+
+      // 5 threads for [0, 4] samples: fold the upper half onto the lower half (offsets hard-coded for 10 samples)
+      if (sample < 5) {
+        s_sum0[ltx] += s_sum0[ltx + 5];
+        s_sum1[ltx] += s_sum1[ltx + 5];
+        s_sumA[ltx] += s_sumA[ltx + 5];
+        s_sumAA[ltx] += s_sumAA[ltx + 5];
+      }
+      __syncthreads();
+
+      if (sample < 2) {
+        // note double counting of sample 3: thread 0 adds slots 2 and 3, thread 1 adds slots 3 and 4
+        s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
+        s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
+        s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
+        s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
+      }
+      __syncthreads();
+
+      if (sample == 0) {
+        // note, subtract to remove the double counting of sample == 3 (slot 3 is not modified by the step above)
+        const auto sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
+        const auto sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
+        const auto sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
+        const auto sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
+        const auto chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
+        chi2s[ch] = chi2;
+        sum0s[ch] = sum0;
+        sumAAs[ch] = sumAA;
+
+#ifdef DEBUG_TC_NULLHYPOT
+        if (ch == 0) {
+          printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
+        }
+#endif
+      }
+    }
+
+    constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }  // forwards to unsafe_expf; <6> presumably the degree
+    constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }  // forwards to unsafe_logf; <7> presumably the degree
+
+    //#define DEBUG_TC_MAKERATIO
+    //
+    // launch ctx parameters are
+    // 45 threads per channel, X channels per block, Y blocks
+    // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
+    // TODO: it might be much better to use 32 threads per channel instead of 45
+    // to simplify the synchronization
+    //
+    __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
+                                                  SampleVector::Scalar const* sample_value_errors,
+                                                  uint32_t const* dids_eb,
+                                                  uint32_t const* dids_ee,
+                                                  bool const* useless_sample_values,
+                                                  char const* pedestal_nums,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEB,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEE,
+                                                  ConfigurationParameters::type const* timeFitParametersEB,
+                                                  ConfigurationParameters::type const* timeFitParametersEE,
+                                                  SampleVector::Scalar const* sumAAsNullHypot,
+                                                  SampleVector::Scalar const* sum0sNullHypot,
+                                                  SampleVector::Scalar* tMaxAlphaBetas,
+                                                  SampleVector::Scalar* tMaxErrorAlphaBetas,
+                                                  SampleVector::Scalar* g_accTimeMax,
+                                                  SampleVector::Scalar* g_accTimeWgt,
+                                                  TimeComputationState* g_state,
+                                                  unsigned const int timeFitParameters_sizeEB,
+                                                  unsigned const int timeFitParameters_sizeEE,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEB,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEE,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEB,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEE,
+                                                  const int nchannels,
+                                                  uint32_t const offsetForInputs) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nthreads_per_channel = 45;  // n=10, n(n-1)/2
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockDim.x * blockIdx.x;
+      const int ch = gtx / nthreads_per_channel;
+      const int ltx = threadIdx.x % nthreads_per_channel;
+      const int ch_start = ch * nsamples;
+      const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+      const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+
+      // remove inactive threads
+      // threads that return here should not affect the __syncthreads() below since they have exitted the kernel
+      if (ch >= nchannels)
+        return;
+
+      const auto did = DetId{dids[inputCh]};
+      const auto isBarrel = did.subdetId() == EcalBarrel;
+      const auto* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+      const auto* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE;
+      const auto timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE;
+      const auto timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
+      const auto timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
+
+      extern __shared__ char smem[];
+      ScalarType* shr_chi2s = reinterpret_cast<ScalarType*>(smem);
+      ScalarType* shr_time_wgt = shr_chi2s + blockDim.x;
+      ScalarType* shr_time_max = shr_time_wgt + blockDim.x;
+      ScalarType* shrTimeMax = shr_time_max + blockDim.x;
+      ScalarType* shrTimeWgt = shrTimeMax + blockDim.x;
+
+      // map tx -> (sample_i, sample_j)
+      int sample_i, sample_j = 0;
+      if (ltx >= 0 && ltx <= 8) {
+        sample_i = 0;
+        sample_j = 1 + ltx;
+      } else if (ltx <= 16) {
+        sample_i = 1;
+        sample_j = 2 + ltx - 9;
+      } else if (ltx <= 23) {
+        sample_i = 2;
+        sample_j = 3 + ltx - 17;
+      } else if (ltx <= 29) {
+        sample_i = 3;
+        sample_j = 4 + ltx - 24;
+      } else if (ltx <= 34) {
+        sample_i = 4;
+        sample_j = 5 + ltx - 30;
+      } else if (ltx <= 38) {
+        sample_i = 5;
+        sample_j = 6 + ltx - 35;
+      } else if (ltx <= 41) {
+        sample_i = 6;
+        sample_j = 7 + ltx - 39;
+      } else if (ltx <= 43) {
+        sample_i = 7;
+        sample_j = 8 + ltx - 42;
+      } else if (ltx <= 44) {
+        sample_i = 8;
+        sample_j = 9;
+      } else
+        assert(false);
+
+      const auto tx_i = ch_start + sample_i;
+      const auto tx_j = ch_start + sample_j;
+
+      //
+      // note, given the way we partition the block, with 45 threads per channel
+      // we will end up with inactive threads which need to be dragged along
+      // through the synching point
+      //
+      bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
+                                         sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
+
+      //
+      // see cpu implementation for explanation
+      //
+      ScalarType chi2 = std::numeric_limits<ScalarType>::max();
+      ScalarType tmax = 0;
+      ScalarType tmaxerr = 0;
+      shrTimeMax[threadIdx.x] = 0;
+      shrTimeWgt[threadIdx.x] = 0;
+      bool internalCondForSkipping1 = true;
+      bool internalCondForSkipping2 = true;
+      if (!condForUselessSamples) {
+        const auto rtmp = sample_values[tx_i] / sample_values[tx_j];
+        const auto invampl_i = 1.0 / sample_values[tx_i];
+        const auto relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
+        const auto invampl_j = 1.0 / sample_values[tx_j];
+        const auto relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
+        const auto err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
+        auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
+        // TODO non-divergent branch for a block if each block has 1 channel
+        // otherwise non-divergent for groups of 45 threads
+        // at this point, pedestal_nums[ch] can be either 0, 1 or 2
+        if (pedestal_nums[ch] == 2)
+          err2 *= err2 * 0.5;
+        const auto err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
+        const auto total_error = std::sqrt(err1 + err2 + err3);
+
+        const auto alpha = amplitudeFitParameters[0];
+        const auto beta = amplitudeFitParameters[1];
+        const auto alphabeta = alpha * beta;
+        const auto invalphabeta = 1.0 / alphabeta;
+
+        // variables instead of a struct
+        const auto ratio_index = sample_i;
+        const auto ratio_step = sample_j - sample_i;
+        const auto ratio_value = rtmp;
+        const auto ratio_error = total_error;
+
+        const auto rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
+        internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j);
+        if (!internalCondForSkipping1) {
+          //
+          // precompute.
+          // in cpu version this was done conditionally
+          // however easier to do it here (precompute) and then just filter out
+          // if not needed
+          //
+          const auto l_timeFitLimits_first = timeFitLimits_first;
+          const auto l_timeFitLimits_second = timeFitLimits_second;
+          if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
+            const auto time_max_i = static_cast<ScalarType>(ratio_index);
+            auto u = timeFitParameters[timeFitParameters_size - 1];
+            CMS_UNROLL_LOOP
+            for (int k = timeFitParameters_size - 2; k >= 0; k--)
+              u = u * ratio_value + timeFitParameters[k];
+
+            auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
+            for (int k = timeFitParameters_size - 2; k >= 1; k--)
+              du = du * ratio_value + k * timeFitParameters[k];
+
+            const auto error2 = ratio_error * ratio_error * du * du;
+            const auto time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
+            const auto time_wgt = error2 > 0 ? 1.0 / error2 : static_cast<ScalarType>(0);
+
+            // store into shared mem
+            // note, this name is essentially identical to the one used
+            // below.
+            shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0;
+            shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0;
+          } else {
+            shrTimeMax[threadIdx.x] = 0;
+            shrTimeWgt[threadIdx.x] = 0;
+          }
+
+          // continue with ratios
+          const auto stepOverBeta = static_cast<SampleVector::Scalar>(ratio_step) / beta;
+          const auto offset = static_cast<SampleVector::Scalar>(ratio_index) + alphabeta;
+          const auto rmin = std::max(ratio_value - ratio_error, 0.001);
+          const auto rmax = std::min(ratio_value + ratio_error,
+                                     fast_expf(static_cast<SampleVector::Scalar>(ratio_step) / beta) - 0.001);
+          const auto time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0);
+          const auto time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0);
+
+          // set these guys
+          tmax = 0.5 * (time1 + time2);
+          tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
+#ifdef DEBUG_TC_MAKERATIO
+          if (ch == 1 || ch == 0)
+            printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n",
+                   ch,
+                   ltx,
+                   tmax,
+                   tmaxerr,
+                   time1,
+                   time2,
+                   offset,
+                   rmin,
+                   rmax);
+#endif
+
+          SampleVector::Scalar sumAf = 0;
+          SampleVector::Scalar sumff = 0;
+          const int itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
+          auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
+          // TODO: data dependence
+          for (int it = itmin + 1; it < nsamples; it++) {
+            loffset += invalphabeta;
+            if (useless_sample_values[ch_start + it])
+              continue;
+            const auto inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
+            const auto term1 = 1.0 + loffset;
+            const auto f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
+            sumAf += sample_values[ch_start + it] * (f * inverr2);
+            sumff += f * (f * inverr2);
+          }
+
+          const auto sumAA = sumAAsNullHypot[ch];
+          const auto sum0 = sum0sNullHypot[ch];
+          chi2 = sumAA;
+          // TODO: sum0 can not be 0 below, need to introduce the check upfront
+          if (sumff > 0) {
+            chi2 = sumAA - sumAf * (sumAf / sumff);
+          }
+          chi2 /= sum0;
+
+#ifdef DEBUG_TC_MAKERATIO
+          if (ch == 1 || ch == 0)
+            printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n",
+                   ch,
+                   ltx,
+                   sumAf,
+                   sumff,
+                   sumAA,
+                   static_cast<int>(sum0),
+                   tmax,
+                   tmaxerr,
+                   chi2);
+#endif
+
+          if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
+            internalCondForSkipping2 = false;
+          else
+            chi2 = std::numeric_limits<ScalarType>::max();
+        }
+      }
+
+      // store into smem
+      shr_chi2s[threadIdx.x] = chi2;
+      __syncthreads();
+
+      // find min chi2 - quite crude for now
+      // TODO validate/check
+      char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+      bool oddElements = nthreads_per_channel % 2;
+      CMS_UNROLL_LOOP
+      while (iter >= 1) {
+        if (ltx < iter)
+          // for odd ns, the last guy will just store itself
+          // exception is for ltx == 0 and iter==1
+          shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                       ? shr_chi2s[threadIdx.x]
+                                       : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]);
+        __syncthreads();
+        oddElements = iter % 2;
+        iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
+      }
+
+      // filter out inactive or useless samples threads
+      if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
+        // min chi2, now compute weighted average of tmax measurements
+        // see cpu version for more explanation
+        const auto chi2min = shr_chi2s[threadIdx.x - ltx];
+        const auto chi2Limit = chi2min + 1.0;
+        const auto inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0;
+
+#ifdef DEBUG_TC_MAKERATIO
+        if (ch == 1 || ch == 0)
+          printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
+                 ch,
+                 ltx,
+                 chi2min,
+                 chi2Limit,
+                 inverseSigmaSquared);
+#endif
+
+        // store into shared mem and run reduction
+        // TODO: check if cooperative groups would be better
+        // TODO: check if shuffling intrinsics are better
+        shr_time_wgt[threadIdx.x] = inverseSigmaSquared;
+        shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared;
+      } else {
+        shr_time_wgt[threadIdx.x] = 0;
+        shr_time_max[threadIdx.x] = 0;
+      }
+      __syncthreads();
+
+      // reduce to compute time_max and time_wgt
+      iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+      oddElements = nthreads_per_channel % 2;
+      CMS_UNROLL_LOOP
+      while (iter >= 1) {
+        if (ltx < iter) {
+          shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                          ? shr_time_wgt[threadIdx.x]
+                                          : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter];
+          shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                          ? shr_time_max[threadIdx.x]
+                                          : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter];
+          shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                        ? shrTimeMax[threadIdx.x]
+                                        : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter];
+          shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                        ? shrTimeWgt[threadIdx.x]
+                                        : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter];
+        }
+
+        __syncthreads();
+        oddElements = iter % 2;
+        iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
+      }
+
+      // load from shared memory the 0th guy (will contain accumulated values)
+      // compute
+      // store into global mem
+      if (ltx == 0) {
+        const auto tmp_time_max = shr_time_max[threadIdx.x];
+        const auto tmp_time_wgt = shr_time_wgt[threadIdx.x];
+
+        // we are done if there number of time ratios is 0
+        if (tmp_time_wgt == 0 && tmp_time_max == 0) {
+          g_state[ch] = TimeComputationState::Finished;
+          return;
+        }
+
+        // no div by 0
+        const auto tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
+        const auto tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt);
+
+        tMaxAlphaBetas[ch] = tMaxAlphaBeta;
+        tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
+        g_accTimeMax[ch] = shrTimeMax[threadIdx.x];
+        g_accTimeWgt[ch] = shrTimeWgt[threadIdx.x];
+        g_state[ch] = TimeComputationState::NotFinished;
+
+#ifdef DEBUG_TC_MAKERATIO
+        printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
+        printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
+               ch,
+               tMaxAlphaBeta,
+               tMaxErrorAlphaBeta,
+               shrTimeMax[threadIdx.x],
+               shrTimeWgt[threadIdx.x]);
+#endif
+      }
+    }
+
+    /// Amplitude fit at the alpha-beta time estimate, followed by the final choice
+    /// of the reported time for each channel.
+    ///
+    /// Every thread handles one sample of one channel. For channels whose state is
+    /// NotFinished, each thread evaluates the pulse-shape value f for its sample at
+    /// g_tMaxAlphaBeta[ch] and stores its weighted products in shared memory; the
+    /// products are then summed per channel. The thread with sample == 0 computes
+    /// the fitted amplitude, compares the fit chi2 with chi2sNullHypot[ch] and
+    /// writes g_state, g_timeMax and g_timeError (and g_ampMaxAlphaBeta when the
+    /// fit is defined). Channels that end up Finished before a time is chosen get
+    /// timeMax = 5 and timeError = -999.
+    ///
+    /// launch ctx parameters are
+    /// 10 threads per channel, N channels per block, Y blocks
+    /// dynamic shared memory: 2 * blockDim.x * sizeof(SampleVector::Scalar) bytes
+    /// TODO: do we need to keep the state around or can be removed?!
+    //#define DEBUG_FINDAMPLCHI2_AND_FINISH
+    __global__ void kernel_time_compute_findamplchi2_and_finish(
+        SampleVector::Scalar const* sample_values,
+        SampleVector::Scalar const* sample_value_errors,
+        uint32_t const* dids_eb,
+        uint32_t const* dids_ee,
+        bool const* useless_samples,
+        SampleVector::Scalar const* g_tMaxAlphaBeta,
+        SampleVector::Scalar const* g_tMaxErrorAlphaBeta,
+        SampleVector::Scalar const* g_accTimeMax,
+        SampleVector::Scalar const* g_accTimeWgt,
+        ConfigurationParameters::type const* amplitudeFitParametersEB,
+        ConfigurationParameters::type const* amplitudeFitParametersEE,
+        SampleVector::Scalar const* sumAAsNullHypot,
+        SampleVector::Scalar const* sum0sNullHypot,
+        SampleVector::Scalar const* chi2sNullHypot,
+        TimeComputationState* g_state,
+        SampleVector::Scalar* g_ampMaxAlphaBeta,
+        SampleVector::Scalar* g_ampMaxError,
+        SampleVector::Scalar* g_timeMax,
+        SampleVector::Scalar* g_timeError,
+        const int nchannels,
+        uint32_t const offsetForInputs) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      // gtx addresses one (channel, sample) pair; channels at or beyond
+      // offsetForInputs take their det ids from dids_ee, re-based to start at 0
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int ch = gtx / nsamples;
+      const int sample = threadIdx.x % nsamples;
+      const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+      const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+
+      // configure shared mem
+      // per block, we need #threads per block * 2 * sizeof(ScalarType)
+      // we run with N channels per block
+      extern __shared__ char smem[];
+      ScalarType* shr_sumAf = reinterpret_cast<ScalarType*>(smem);
+      ScalarType* shr_sumff = shr_sumAf + blockDim.x;
+
+      // threads beyond the last channel leave before any barrier is reached
+      if (ch >= nchannels)
+        return;
+
+      auto state = g_state[ch];
+      const auto did = DetId{dids[inputCh]};
+      const auto* amplitudeFitParameters =
+          did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+
+      // TODO is that better than storing into global and launching another kernel
+      // for the first 10 threads
+      if (state == TimeComputationState::NotFinished) {
+        // per-sample contribution to the sums of the amplitude fit
+        const auto alpha = amplitudeFitParameters[0];
+        const auto beta = amplitudeFitParameters[1];
+        const auto alphabeta = alpha * beta;
+        const auto invalphabeta = 1.0 / alphabeta;
+        const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
+        const auto sample_value = sample_values[gtx];
+        const auto sample_value_error = sample_value_errors[gtx];
+        // useless samples get zero weight, so they drop out of both sums
+        const auto inverr2 =
+            useless_samples[gtx] ? static_cast<ScalarType>(0) : 1.0 / (sample_value_error * sample_value_error);
+        const auto offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
+        const auto term1 = 1.0 + offset;
+        // f is set to 0 where the argument of the logarithm is not safely positive
+        const auto f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.0);
+        const auto sumAf = sample_value * (f * inverr2);
+        const auto sumff = f * (f * inverr2);
+
+        // store into shared mem
+        shr_sumAf[threadIdx.x] = sumAf;
+        shr_sumff[threadIdx.x] = sumff;
+      } else {
+        // finished channels still take part in the barriers below, with zeros
+        shr_sumAf[threadIdx.x] = 0;
+        shr_sumff[threadIdx.x] = 0;
+      }
+      __syncthreads();
+
+      // reduce
+      // unroll completely here (but hardcoded)
+      // step 1: fold elements 5..9 of the channel onto elements 0..4
+      if (sample < 5) {
+        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5];
+        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5];
+      }
+      __syncthreads();
+
+      // step 2: element 0 takes 2 and 3, element 1 takes 3 and 4
+      if (sample < 2) {
+        // element 3 is added by both threads here; the duplicate is subtracted
+        // again in the final step below
+        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3];
+        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3];
+      }
+      __syncthreads();
+
+      // from here on a single thread per channel finishes the computation
+      if (sample == 0) {
+        // exit if the state is done
+        // note, we do not exit before all __syncthreads are finished
+        if (state == TimeComputationState::Finished) {
+          g_timeMax[ch] = 5;
+          g_timeError[ch] = -999;
+          return;
+        }
+
+        // subtract to avoid double counting
+        const auto sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - shr_sumff[threadIdx.x + 3];
+        const auto sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3];
+
+        const auto ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
+        const auto sumAA = sumAAsNullHypot[ch];
+        const auto sum0 = sum0sNullHypot[ch];
+        const auto nullChi2 = chi2sNullHypot[ch];
+        if (sumff > 0) {
+          // NOTE(review): sum0 is used as a divisor without a check -- confirm it
+          // cannot be 0 for channels that reach this point
+          const auto chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
+          if (chi2AlphaBeta > nullChi2) {
+            // null hypothesis is better
+            state = TimeComputationState::Finished;
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+            printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
+                   ch,
+                   chi2AlphaBeta,
+                   nullChi2,
+                   sumAA,
+                   sumAf,
+                   sumff,
+                   sum0);
+#endif
+          }
+
+          // store to global
+          g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
+        } else {
+          // no sample contributed to the fit: nothing to report for this channel
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+          printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
+#endif
+          state = TimeComputationState::Finished;
+        }
+
+        // store the state to global and finish calcs
+        g_state[ch] = state;
+        if (state == TimeComputationState::Finished) {
+          // store default values into global
+          g_timeMax[ch] = 5;
+          g_timeError[ch] = -999;
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+          printf("ch = %d finished state\n", ch);
+#endif
+          return;
+        }
+
+        // NOTE(review): ampMaxError is used as a divisor without a check
+        const auto ampMaxError = g_ampMaxError[ch];
+        const auto test_ratio = ampMaxAlphaBeta / ampMaxError;
+        const auto accTimeMax = g_accTimeMax[ch];
+        const auto accTimeWgt = g_accTimeWgt[ch];
+        const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
+        const auto tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
+        // branch to separate large vs small pulses
+        // see cpu version for more info
+        if (test_ratio > 5.0 && accTimeWgt > 0) {
+          // accTimeWgt > 0 is already guaranteed by the enclosing condition, so
+          // the zero alternatives of the two selections below are never taken
+          const auto tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
+          const auto tMaxErrorRatio = accTimeWgt > 0 ? 1.0 / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
+
+          if (test_ratio > 10.0) {
+            // large pulse: report the accumulated ratio-based time
+            // NOTE(review): unlike the two other outcomes, this branch leaves
+            // g_state[ch] at NotFinished -- confirm that this is intended
+            g_timeMax[ch] = tMaxRatio;
+            g_timeError[ch] = tMaxErrorRatio;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+            printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
+#endif
+          } else {
+            // 5 < test_ratio <= 10: linear interpolation between the alpha-beta
+            // result (full weight at 5) and the ratio result (full weight at 10)
+            const auto timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
+                                  tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
+                                 5.0;
+            const auto timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
+                                    tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
+                                   5.0;
+            state = TimeComputationState::Finished;
+            g_state[ch] = state;
+            g_timeMax[ch] = timeMax;
+            g_timeError[ch] = timeError;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+            printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
+#endif
+          }
+        } else {
+          // small pulse (or no accumulated weight): report the alpha-beta result
+          state = TimeComputationState::Finished;
+          g_state[ch] = state;
+          g_timeMax[ch] = tMaxAlphaBeta;
+          g_timeError[ch] = tMaxErrorAlphaBeta;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+          printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
+#endif
+        }
+      }
+    }
+
+    /// Invalidates the sample that precedes an upward change of gain id.
+    ///
+    /// One thread per sample. A thread compares the gain id of its own digi with
+    /// that of the previous digi of the same channel; when both ids lie in [1, 3]
+    /// and the previous one is smaller, the previous sample gets value 0, error
+    /// 1e+9 and is flagged as useless. Threads for sample 0, for masked samples
+    /// and for channels beyond nchannels do nothing.
+    __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
+                                                    uint16_t const* digis_ee,
+                                                    SampleVector::Scalar* sample_values,
+                                                    SampleVector::Scalar* sample_value_errors,
+                                                    bool* useless_sample_values,
+                                                    unsigned const int sample_mask,
+                                                    const int nchannels,
+                                                    uint32_t const offsetForInputs) {
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // position of this thread: flat (channel, sample) index and its two parts
+      const int globalIdx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int channel = globalIdx / nsamples;
+      const int isample = threadIdx.x % nsamples;
+
+      // sample 0 has no predecessor, so its thread is launched but stays idle;
+      // the same holds for out-of-range channels and masked samples
+      if (channel >= nchannels || isample == 0 || !use_sample(sample_mask, isample))
+        return;
+
+      // channels at or beyond offsetForInputs are read from digis_ee, whose
+      // indexing starts again at 0
+      const bool fromSecondInput = channel >= offsetForInputs;
+      const auto* digis = fromSecondInput ? digis_ee : digis_eb;
+      const int digiIdx = fromSecondInput ? globalIdx - offsetForInputs * nsamples : globalIdx;
+
+      const auto gainBefore = ecal::mgpa::gainId(digis[digiIdx - 1]);
+      const auto gainHere = ecal::mgpa::gainId(digis[digiIdx]);
+      const bool bothValid = gainBefore >= 1 && gainBefore <= 3 && gainHere >= 1 && gainHere <= 3;
+      if (!bothValid || !(gainBefore < gainHere))
+        return;
+
+      // knock out the sample taken just before the gain id went up
+      const int previous = globalIdx - 1;
+      sample_values[previous] = 0;
+      sample_value_errors[previous] = 1e+9;
+      useless_sample_values[previous] = true;
+    }
+
+    /// Least-squares estimate of the pulse amplitude at the already chosen time.
+    ///
+    /// One thread per sample evaluates the pulse-shape value f at g_timeMax[ch],
+    /// decides whether its sample enters the fit and publishes its weighted terms
+    /// to shared memory. The terms are summed per channel and the thread with
+    /// sample == 0 solves for the amplitude and stores it in g_amplitudeMax[ch]
+    /// (0 when no sample was selected or the system is degenerate).
+    ///
+    /// launch ctx parameters: the reduction offsets are hard-coded for 10 threads
+    /// (samples) per channel; dynamic shared memory must hold
+    /// 5 * blockDim.x * sizeof(SampleVector::Scalar) bytes.
+    /// NOTE(review): assumes blockDim.x is a multiple of the number of samples so
+    /// that all threads of a channel sit in the same block -- confirm at the launch.
+    __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
+                                             SampleVector::Scalar const* sample_value_errors,
+                                             uint32_t const* dids,
+                                             bool const* useless_samples,
+                                             SampleVector::Scalar const* g_timeMax,
+                                             SampleVector::Scalar const* amplitudeFitParametersEB,
+                                             SampleVector::Scalar const* amplitudeFitParametersEE,
+                                             SampleVector::Scalar* g_amplitudeMax,
+                                             const int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr ScalarType corr4 = 1.;
+      constexpr ScalarType corr6 = 1.;
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int ch = gtx / nsamples;
+      const int sample = threadIdx.x % nsamples;
+
+      // threads beyond the last channel leave before any barrier is reached
+      if (ch >= nchannels)
+        return;
+
+      const auto did = DetId{dids[ch]};
+      const auto* amplitudeFitParameters =
+          did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+
+      // configure shared mem: five consecutive arrays of blockDim.x scalars
+      extern __shared__ char smem[];
+      ScalarType* shr_sum1 = reinterpret_cast<ScalarType*>(smem);
+      auto* shr_sumA = shr_sum1 + blockDim.x;
+      auto* shr_sumF = shr_sumA + blockDim.x;
+      auto* shr_sumAF = shr_sumF + blockDim.x;
+      auto* shr_sumFF = shr_sumAF + blockDim.x;
+
+      const auto alpha = amplitudeFitParameters[0];
+      const auto beta = amplitudeFitParameters[1];
+      const auto timeMax = g_timeMax[ch];
+      const auto pedestalLimit = timeMax - (alpha * beta) - 1.0;
+      const auto sample_value = sample_values[gtx];
+      const auto sample_value_error = sample_value_errors[gtx];
+      // samples without a positive error get zero weight
+      const auto inverr2 =
+          sample_value_error > 0 ? 1. / (sample_value_error * sample_value_error) : static_cast<ScalarType>(0);
+      const auto termOne = 1 + (sample - timeMax) / (alpha * beta);
+      // f is set to 0 where the argument of the logarithm is not safely positive
+      const auto f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta)
+                                     : static_cast<ScalarType>(0.);
+
+      // a sample enters the fit if it lies before pedestalLimit or if f exceeds
+      // the threshold for its side of timeMax, and it is not flagged as useless
+      bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) ||
+                         (f > 0.4 * corr4 && sample >= timeMax)) &&
+                        !useless_samples[gtx];
+
+      // store into shared mem
+      shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast<ScalarType>(0);
+      shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast<ScalarType>(0);
+      shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast<ScalarType>(0);
+      shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast<ScalarType>(0);
+      shr_sumFF[threadIdx.x] = cond ? f * (f * inverr2) : static_cast<ScalarType>(0);
+      // the reduction reads slots written by other threads, so every store
+      // above must be visible before it starts
+      __syncthreads();
+
+      // reduction
+      // step 1: fold elements 5..9 of the channel onto elements 0..4
+      if (sample <= 4) {
+        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5];
+        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5];
+        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5];
+        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5];
+        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5];
+      }
+      __syncthreads();
+
+      // step 2: element 0 takes 2 and 3, element 1 takes 3 and 4
+      if (sample < 2) {
+        // note: we double count sample 3
+        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3];
+        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3];
+        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3];
+        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3];
+        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3];
+      }
+      __syncthreads();
+
+      if (sample == 0) {
+        // combine elements 0 and 1 and remove the element counted twice above
+        const auto sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3];
+        const auto sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3];
+        const auto sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3];
+        const auto sumAF = shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3];
+        const auto sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3];
+
+        // the amplitude is only defined for a non-degenerate system
+        const auto denom = sumFF * sum1 - sumF * sumF;
+        const auto condForDenom = sum1 > 0 && std::abs(denom) > 1.e-20;
+        const auto amplitudeMax = condForDenom ? (sumAF * sum1 - sumA * sumF) / denom : static_cast<ScalarType>(0.);
+
+        // store into global mem
+        g_amplitudeMax[ch] = amplitudeMax;
+      }
+    }
+
+    /// Prepares the per-sample inputs of the time reconstruction.
+    ///
+    /// One thread per sample: derives the pedestal of the channel, converts the
+    /// digi into a pedestal-subtracted, gain-renormalized value with its error,
+    /// flags unusable samples and stores all three to global memory. A shared
+    /// memory max-reduction then finds the error belonging to the largest usable
+    /// sample of each channel; the thread with sample == 0 writes it to
+    /// ampMaxError[ch] together with the number of samples used for the pedestal
+    /// (pedestal_nums[ch]).
+    ///
+    /// launch ctx parameters: the reduction offsets are hard-coded for 10 threads
+    /// (samples) per channel; dynamic shared memory must hold
+    /// 2 * blockDim.x * sizeof(SampleVector::Scalar) bytes.
+    //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
+    __global__ void kernel_time_computation_init(uint16_t const* digis_eb,
+                                                 uint32_t const* dids_eb,
+                                                 uint16_t const* digis_ee,
+                                                 uint32_t const* dids_ee,
+                                                 float const* rms_x12,
+                                                 float const* rms_x6,
+                                                 float const* rms_x1,
+                                                 float const* mean_x12,
+                                                 float const* mean_x6,
+                                                 float const* mean_x1,
+                                                 float const* gain12Over6,
+                                                 float const* gain6Over1,
+                                                 SampleVector::Scalar* sample_values,
+                                                 SampleVector::Scalar* sample_value_errors,
+                                                 SampleVector::Scalar* ampMaxError,
+                                                 bool* useless_sample_values,
+                                                 char* pedestal_nums,
+                                                 uint32_t const offsetForHashes,
+                                                 uint32_t const offsetForInputs,
+                                                 unsigned const int sample_maskEB,
+                                                 unsigned const int sample_maskEE,
+                                                 int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      // tx addresses one (channel, sample) pair; channels at or beyond
+      // offsetForInputs are read from the *_ee inputs, re-based to start at 0
+      const int tx = threadIdx.x + blockDim.x * blockIdx.x;
+      const int ch = tx / nsamples;
+      const int inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
+      const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+      const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
+      const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+
+      // threads that return here should not affect the __syncthreads() below since they have exited the kernel
+      if (ch >= nchannels)
+        return;
+
+      // indices/inits
+      const int sample = tx % nsamples;
+      const int input_ch_start = inputCh * nsamples;
+      SampleVector::Scalar pedestal = 0.;
+      int num = 0;
+
+      // configure shared mem: two consecutive arrays of blockDim.x scalars
+      extern __shared__ char smem[];
+      ScalarType* shrSampleValues = reinterpret_cast<SampleVector::Scalar*>(smem);
+      ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x;
+
+      // 0 and 1 sample values
+      const auto adc0 = ecal::mgpa::adc(digis[input_ch_start]);
+      const auto gainId0 = ecal::mgpa::gainId(digis[input_ch_start]);
+      const auto adc1 = ecal::mgpa::adc(digis[input_ch_start + 1]);
+      const auto gainId1 = ecal::mgpa::gainId(digis[input_ch_start + 1]);
+      const auto did = DetId{dids[inputCh]};
+      const auto isBarrel = did.subdetId() == EcalBarrel;
+      const auto sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE;
+      // index into the conditions arrays (rms_*, mean_*, gain*)
+      const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                     : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+      // set pedestal
+      // TODO this branch is non-divergent for a group of 10 threads
+      if (gainId0 == 1 && use_sample(sample_mask, 0)) {
+        // take the pedestal from the first sample, averaged with the second one
+        // when that is compatible within 3 rms
+        pedestal = static_cast<SampleVector::Scalar>(adc0);
+        num = 1;
+
+        const auto diff = adc1 - adc0;
+        if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) {
+          pedestal = (pedestal + static_cast<SampleVector::Scalar>(adc1)) / 2.0;
+          num = 2;
+        }
+      } else {
+        // fall back to the stored pedestal; like every other conditions array
+        // in this kernel it is addressed by the hashed id, not by the channel
+        // position in the event
+        pedestal = mean_x12[hashedId];
+      }
+
+      // ped subtracted and gain-renormalized samples.
+      const auto gainId = ecal::mgpa::gainId(digis[inputTx]);
+      const auto adc = ecal::mgpa::adc(digis[inputTx]);
+
+      bool bad = false;
+      SampleVector::Scalar sample_value, sample_value_error;
+      // TODO divergent branch
+      // TODO: piece below is general both for amplitudes and timing
+      // potentially there is a way to reduce the amount of code...
+      if (!use_sample(sample_mask, sample)) {
+        bad = true;
+        sample_value = 0;
+        sample_value_error = 0;
+      } else if (gainId == 1) {
+        sample_value = static_cast<SampleVector::Scalar>(adc) - pedestal;
+        sample_value_error = rms_x12[hashedId];
+      } else if (gainId == 2) {
+        sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x6[hashedId]) * gain12Over6[hashedId];
+        sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
+      } else if (gainId == 3) {
+        sample_value =
+            (static_cast<SampleVector::Scalar>(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] * gain12Over6[hashedId];
+        sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId];
+      } else {
+        // any other gain id cannot be converted
+        sample_value = 0;
+        sample_value_error = 0;
+        bad = true;
+      }
+
+      // TODO: make sure we save things correctly when sample is useless
+      const auto useless_sample = (sample_value_error <= 0) | bad;
+      useless_sample_values[tx] = useless_sample;
+      sample_values[tx] = sample_value;
+      // useless samples get a huge error so that their weight is negligible
+      sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
+
+      // DEBUG
+#ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
+      if (ch == 0) {
+        printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
+               sample,
+               sample_value,
+               sample_value_error,
+               useless_sample ? '1' : '0');
+      }
+#endif
+
+      // store into the shared mem
+      // samples without a positive error must never win the max-reduction below:
+      // lowest() is the most negative finite value, whereas min() is the smallest
+      // positive one for floating-point types and would beat any negative sample
+      shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::lowest();
+      shrSampleValueErrors[threadIdx.x] = sample_value_error;
+      __syncthreads();
+
+      // reduce to the largest sample value, carrying the error of the winning
+      // sample along; each step updates the error before overwriting the value
+      if (sample < 5) {
+        // note, if equal -> we keep the value with lower sample as for cpu
+        shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5]
+                                                ? shrSampleValueErrors[threadIdx.x + 5]
+                                                : shrSampleValueErrors[threadIdx.x];
+        shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]);
+      }
+      __syncthreads();
+
+      // a bit of an overkill, but easier than to compare across 3 values
+      if (sample < 3) {
+        shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3]
+                                                ? shrSampleValueErrors[threadIdx.x + 3]
+                                                : shrSampleValueErrors[threadIdx.x];
+        shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]);
+      }
+      __syncthreads();
+
+      if (sample < 2) {
+        shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2]
+                                                ? shrSampleValueErrors[threadIdx.x + 2]
+                                                : shrSampleValueErrors[threadIdx.x];
+        shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]);
+      }
+      __syncthreads();
+
+      if (sample == 0) {
+        // we only need the max error
+        const auto maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1]
+                                             ? shrSampleValueErrors[threadIdx.x + 1]
+                                             : shrSampleValueErrors[threadIdx.x];
+
+        // # pedestal samples used
+        pedestal_nums[ch] = num;
+        // this is used downstream
+        ampMaxError[ch] = maxSampleValueError;
+
+        // DEBUG
+#ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
+        if (ch == 0) {
+          printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
+        }
+#endif
+      }
+    }
+
+    ///
+    /// Applies the amplitude-dependent time shift, stores the jitter and its
+    /// error for every channel and raises the out-of-time flag where the
+    /// corrected time lies outside the allowed window.
+    ///
+    /// launch context parameters: 1 thread per channel
+    ///
+    //#define DEBUG_TIME_CORRECTION
+    __global__ void kernel_time_correction_and_finalize(
+        //        SampleVector::Scalar const* g_amplitude,
+        ::ecal::reco::StorageScalarType const* g_amplitudeEB,
+        ::ecal::reco::StorageScalarType const* g_amplitudeEE,
+        uint16_t const* digis_eb,
+        uint32_t const* dids_eb,
+        uint16_t const* digis_ee,
+        uint32_t const* dids_ee,
+        float const* amplitudeBinsEB,
+        float const* amplitudeBinsEE,
+        float const* shiftBinsEB,
+        float const* shiftBinsEE,
+        SampleVector::Scalar const* g_timeMax,
+        SampleVector::Scalar const* g_timeError,
+        float const* g_rms_x12,
+        float const* timeCalibConstant,
+        float* g_jitterEB,
+        float* g_jitterEE,
+        float* g_jitterErrorEB,
+        float* g_jitterErrorEE,
+        uint32_t* flagsEB,
+        uint32_t* flagsEE,
+        const int amplitudeBinsSizeEB,
+        const int amplitudeBinsSizeEE,
+        ConfigurationParameters::type const timeConstantTermEB,
+        ConfigurationParameters::type const timeConstantTermEE,
+        float const offsetTimeValueEB,
+        float const offsetTimeValueEE,
+        ConfigurationParameters::type const timeNconstEB,
+        ConfigurationParameters::type const timeNconstEE,
+        ConfigurationParameters::type const amplitudeThresholdEB,
+        ConfigurationParameters::type const amplitudeThresholdEE,
+        ConfigurationParameters::type const outOfTimeThreshG12pEB,
+        ConfigurationParameters::type const outOfTimeThreshG12pEE,
+        ConfigurationParameters::type const outOfTimeThreshG12mEB,
+        ConfigurationParameters::type const outOfTimeThreshG12mEE,
+        ConfigurationParameters::type const outOfTimeThreshG61pEB,
+        ConfigurationParameters::type const outOfTimeThreshG61pEE,
+        ConfigurationParameters::type const outOfTimeThreshG61mEB,
+        ConfigurationParameters::type const outOfTimeThreshG61mEE,
+        uint32_t const offsetForHashes,
+        uint32_t const offsetForInputs,
+        const int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+
+      // number of samples stored per channel in the digi buffers
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // one thread per channel; surplus threads have nothing to do
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      if (gtx >= nchannels)
+        return;
+
+      // channels at or beyond offsetForInputs use the *_ee / *EE buffers, whose
+      // indexing starts again at 0
+      const bool fromSecondInput = gtx >= offsetForInputs;
+      const int localCh = fromSecondInput ? gtx - offsetForInputs : gtx;
+      const auto* dids = fromSecondInput ? dids_ee : dids_eb;
+      const auto* digis = fromSecondInput ? digis_ee : digis_eb;
+      const auto* amplitudes = fromSecondInput ? g_amplitudeEE : g_amplitudeEB;
+      auto* jitters = fromSecondInput ? g_jitterEE : g_jitterEB;
+      auto* jitterErrors = fromSecondInput ? g_jitterErrorEE : g_jitterErrorEB;
+      auto* outFlags = fromSecondInput ? flagsEE : flagsEB;
+
+      // detector id decides which set of parameters applies
+      const auto did = DetId{dids[localCh]};
+      const auto isBarrel = did.subdetId() == EcalBarrel;
+      const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                     : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+      const auto amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE;
+      const auto* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE;
+      const auto* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE;
+      const auto timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
+      const auto timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
+      const auto offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE;
+      const auto amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
+      const auto outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
+      const auto outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
+      const auto outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
+      const auto outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
+
+      // per-channel inputs
+      const auto amplitude = amplitudes[localCh];
+      const auto rms_x12 = g_rms_x12[hashedId];
+      const auto timeCalibConst = timeCalibConstant[hashedId];
+      const auto timeMax = g_timeMax[gtx];
+      const auto timeError = g_timeError[gtx];
+
+      // last bin, counted from the first, whose edge lies below the amplitude;
+      // stays at -1 when the amplitude does not exceed the first edge
+      int myBin = -1;
+      while (myBin + 1 < amplitudeBinsSize && amplitude > amplitudeBins[myBin + 1])
+        ++myBin;
+
+      // outside the table take the edge value, inside interpolate linearly
+      ScalarType correction = 0;
+      const bool belowFirstBin = myBin == -1;
+      const bool inLastBin = myBin == amplitudeBinsSize - 1;
+      if (belowFirstBin || inLastBin) {
+        correction = shiftBins[belowFirstBin ? 0 : myBin];
+      } else {
+        const auto binFraction =
+            (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
+        correction = shiftBins[myBin + 1] - shiftBins[myBin];
+        correction *= binFraction;
+        correction += shiftBins[myBin];
+      }
+
+      // 0.04 = 1./25.
+      correction *= 0.04;
+      const auto jitter = timeMax - 5 + correction;
+      const auto jitterError =
+          std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04);  // 0.04 = 1./25.
+
+#ifdef DEBUG_TIME_CORRECTION
+      printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
+             gtx,
+             timeMax,
+             timeError,
+             jitter,
+             correction);
+#endif
+
+      // results go to the buffer selected for this channel
+      jitters[localCh] = jitter;
+      jitterErrors[localCh] = jitterError;
+
+      // the out-of-time test is only done for pulses above the threshold
+      // TODO: replace with something more efficient (if required),
+      // for now just to make it work
+      if (!(amplitude > amplitudeThreshold * rms_x12))
+        return;
+
+      // large pulses with any sample not in gain id 1 use the G61 thresholds
+      bool gainSwitched = false;
+      if (amplitude > 3000.) {
+        for (int isample = 0; isample < nsamples && !gainSwitched; ++isample) {
+          const int gainid = ecal::mgpa::gainId(digis[nsamples * localCh + isample]);
+          gainSwitched = gainid != 1;
+        }
+      }
+      const auto threshP = gainSwitched ? outOfTimeThreshG61p : outOfTimeThreshG12p;
+      const auto threshM = gainSwitched ? outOfTimeThreshG61m : outOfTimeThreshG12m;
+
+      const auto correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
+      const auto nterm = timeNconst * rms_x12 / amplitude;
+      const auto sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
+      const bool tooLate = correctedTime > sigmat * threshP;
+      if (tooLate || correctedTime < -sigmat * threshM)
+        outFlags[localCh] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
+    }
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h
new file mode 100644
index 0000000000000..a9b1c69678abd
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h
@@ -0,0 +1,186 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h
+
+#include <iostream>
+#include <limits>
+
+#include <cuda.h>
+
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EigenMatrixTypes_gpu.h"
+
+//#define DEBUG
+
+//#define ECAL_RECO_CUDA_DEBUG
+
+namespace ecal {
+  namespace multifit {
+
+    __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,  // read-only
+                                                  SampleVector::Scalar const* sample_value_errors,  // read-only
+                                                  bool const* useless_sample_values,  // read-only
+                                                  SampleVector::Scalar* chi2s,   // writable (non-const) buffer
+                                                  SampleVector::Scalar* sum0s,   // writable (non-const) buffer
+                                                  SampleVector::Scalar* sumAAs,  // writable (non-const) buffer
+                                                  int const nchannels);  // outputs presumably feed the *NullHypot inputs below - confirm
+    // Declaration only: the kernel body is not part of this header.
+    // launch ctx parameters are
+    // 45 threads per channel, X channels per block, Y blocks
+    // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 (9 + 8 + ... + 1 = 45 pairs)
+    // TODO: it might be much better to use 32 threads per channel instead of 45
+    // to simplify the synchronization
+    // Parameters ending in EB / EE come in pairs, one per EB / EE input.
+    __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
+                                                  SampleVector::Scalar const* sample_value_errors,
+                                                  uint32_t const* dids_eb,
+                                                  uint32_t const* dids_ee,
+                                                  bool const* useless_sample_values,
+                                                  char const* pedestal_nums,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEB,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEE,
+                                                  ConfigurationParameters::type const* timeFitParametersEB,
+                                                  ConfigurationParameters::type const* timeFitParametersEE,
+                                                  SampleVector::Scalar const* sumAAsNullHypot,  // presumably from kernel_time_compute_nullhypot - confirm
+                                                  SampleVector::Scalar const* sum0sNullHypot,  // presumably from kernel_time_compute_nullhypot - confirm
+                                                  SampleVector::Scalar* tMaxAlphaBetas,       // writable (non-const)
+                                                  SampleVector::Scalar* tMaxErrorAlphaBetas,  // writable (non-const)
+                                                  SampleVector::Scalar* g_accTimeMax,  // writable here, const in findamplchi2_and_finish
+                                                  SampleVector::Scalar* g_accTimeWgt,  // writable here, const in findamplchi2_and_finish
+                                                  TimeComputationState* g_state,
+                                                  unsigned int const timeFitParameters_sizeEB,
+                                                  unsigned int const timeFitParameters_sizeEE,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEB,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEE,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEB,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEE,
+                                                  int const nchannels,
+                                                  uint32_t const offsetForInputs);
+
+    /// launch ctx parameters are
+    /// 10 threads per channel, N channels per block, Y blocks
+    /// TODO: do we need to keep the state around, or can it be removed?!
+    //#define DEBUG_FINDAMPLCHI2_AND_FINISH
+    __global__ void kernel_time_compute_findamplchi2_and_finish(
+        SampleVector::Scalar const* sample_values,
+        SampleVector::Scalar const* sample_value_errors,
+        uint32_t const* dids_eb,
+        uint32_t const* dids_ee,
+        bool const* useless_samples,
+        SampleVector::Scalar const* g_tMaxAlphaBeta,       // presumably makeratio's tMaxAlphaBetas - confirm
+        SampleVector::Scalar const* g_tMaxErrorAlphaBeta,  // presumably makeratio's tMaxErrorAlphaBetas - confirm
+        SampleVector::Scalar const* g_accTimeMax,  // read-only here; writable in kernel_time_compute_makeratio
+        SampleVector::Scalar const* g_accTimeWgt,  // read-only here; writable in kernel_time_compute_makeratio
+        ConfigurationParameters::type const* amplitudeFitParametersEB,
+        ConfigurationParameters::type const* amplitudeFitParametersEE,
+        SampleVector::Scalar const* sumAAsNullHypot,
+        SampleVector::Scalar const* sum0sNullHypot,
+        SampleVector::Scalar const* chi2sNullHypot,
+        TimeComputationState* g_state,
+        SampleVector::Scalar* g_ampMaxAlphaBeta,  // writable (non-const)
+        SampleVector::Scalar* g_ampMaxError,      // writable (non-const)
+        SampleVector::Scalar* g_timeMax,    // writable here; const input of kernel_time_correction_and_finalize
+        SampleVector::Scalar* g_timeError,  // writable here; const input of kernel_time_correction_and_finalize
+        int const nchannels,
+        uint32_t const offsetForInputs);
+
+    __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,  // read-only
+                                                    uint16_t const* digis_ee,  // read-only
+                                                    SampleVector::Scalar* sample_values,        // non-const: may be modified
+                                                    SampleVector::Scalar* sample_value_errors,  // non-const: may be modified
+                                                    bool* useless_sample_values,                // non-const: may be modified
+                                                    unsigned int const sample_mask,  // single mask; init takes sample_maskEB / EE
+                                                    int const nchannels,
+                                                    uint32_t const offsetForInputs);
+
+    __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
+                                             SampleVector::Scalar const* sample_value_errors,
+                                             uint32_t const* dids_eb,
+                                             uint32_t const* dids_ee,  // EE counterpart of dids_eb, named as in the sibling kernels
+                                             bool const* useless_samples,
+                                             SampleVector::Scalar const* g_timeMax,  // read-only here
+                                             SampleVector::Scalar const* amplitudeFitParametersEB,
+                                             SampleVector::Scalar const* amplitudeFitParametersEE,
+                                             SampleVector::Scalar* g_amplitudeMax,  // writable (non-const) output buffer
+                                             int const nchannels,  // NOTE(review): fit params are Scalar here, ConfigurationParameters::type in siblings
+                                             uint32_t const offsetForInputs);
+
+    //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
+    __global__ void kernel_time_computation_init(uint16_t const* digis_eb,  // NOTE: digis/dids are interleaved per EB / EE here
+                                                 uint32_t const* dids_eb,
+                                                 uint16_t const* digis_ee,
+                                                 uint32_t const* dids_ee,
+                                                 float const* rms_x12,   // read-only
+                                                 float const* rms_x6,    // read-only
+                                                 float const* rms_x1,    // read-only
+                                                 float const* mean_x12,  // read-only
+                                                 float const* mean_x6,   // read-only
+                                                 float const* mean_x1,   // read-only
+                                                 float const* gain12Over6,  // read-only
+                                                 float const* gain6Over1,   // read-only
+                                                 SampleVector::Scalar* sample_values,        // writable (non-const)
+                                                 SampleVector::Scalar* sample_value_errors,  // writable (non-const)
+                                                 SampleVector::Scalar* ampMaxError,          // writable (non-const)
+                                                 bool* useless_sample_values,                // writable (non-const)
+                                                 char* pedestal_nums,                        // writable (non-const)
+                                                 uint32_t const offsetForHashes,
+                                                 uint32_t const offsetForInputs,
+                                                 unsigned int const sample_maskEB,
+                                                 unsigned int const sample_maskEE,
+                                                 int nchannels);  // NOTE(review): not const-qualified unlike siblings (signature unaffected)
+
+    /// Declaration only; each per-collection argument is passed twice, once for EB and once for EE.
+    /// launch context parameters: 1 thread per channel
+    /// NOTE(review): digis/dids are interleaved (digis_eb, dids_eb, digis_ee, dids_ee) - mind the order.
+    //#define DEBUG_TIME_CORRECTION
+    __global__ void kernel_time_correction_and_finalize(
+        //        SampleVector::Scalar const* g_amplitude,
+        ::ecal::reco::StorageScalarType const* g_amplitudeEB,
+        ::ecal::reco::StorageScalarType const* g_amplitudeEE,
+        uint16_t const* digis_eb,
+        uint32_t const* dids_eb,
+        uint16_t const* digis_ee,
+        uint32_t const* dids_ee,
+        float const* amplitudeBinsEB,
+        float const* amplitudeBinsEE,
+        float const* shiftBinsEB,
+        float const* shiftBinsEE,
+        SampleVector::Scalar const* g_timeMax,    // read-only here; writable in findamplchi2_and_finish
+        SampleVector::Scalar const* g_timeError,  // read-only here; writable in findamplchi2_and_finish
+        float const* g_rms_x12,
+        float const* timeCalibConstant,
+        ::ecal::reco::StorageScalarType* g_jitterEB,       // writable (non-const) output
+        ::ecal::reco::StorageScalarType* g_jitterEE,       // writable (non-const) output
+        ::ecal::reco::StorageScalarType* g_jitterErrorEB,  // writable (non-const) output
+        ::ecal::reco::StorageScalarType* g_jitterErrorEE,  // writable (non-const) output
+        uint32_t* flagsEB,  // writable (non-const)
+        uint32_t* flagsEE,  // writable (non-const)
+        int const amplitudeBinsSizeEB,
+        int const amplitudeBinsSizeEE,
+        ConfigurationParameters::type const timeConstantTermEB,
+        ConfigurationParameters::type const timeConstantTermEE,
+        float const offsetTimeValueEB,
+        float const offsetTimeValueEE,
+        ConfigurationParameters::type const timeNconstEB,
+        ConfigurationParameters::type const timeNconstEE,
+        ConfigurationParameters::type const amplitudeThresholdEB,
+        ConfigurationParameters::type const amplitudeThresholdEE,
+        ConfigurationParameters::type const outOfTimeThreshG12pEB,
+        ConfigurationParameters::type const outOfTimeThreshG12pEE,
+        ConfigurationParameters::type const outOfTimeThreshG12mEB,
+        ConfigurationParameters::type const outOfTimeThreshG12mEE,
+        ConfigurationParameters::type const outOfTimeThreshG61pEB,
+        ConfigurationParameters::type const outOfTimeThreshG61pEE,
+        ConfigurationParameters::type const outOfTimeThreshG61mEB,
+        ConfigurationParameters::type const outOfTimeThreshG61mEE,
+        uint32_t const offsetForHashes,
+        uint32_t const offsetForInputs,
+        int const nchannels);
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py
index 1eef78d42e940..72a3efaae38ba 100644
--- a/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py
+++ b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py
@@ -1,6 +1,57 @@
 import FWCore.ParameterSet.Config as cms
+from Configuration.ProcessModifiers.gpu_cff import gpu
 
 # ECAL multifit running on CPU
 from RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi import ecalMultiFitUncalibRecHit
 
 ecalMultiFitUncalibRecHitTask = cms.Task(ecalMultiFitUncalibRecHit)
+
+# ECAL conditions used by the multifit running on GPU
+from RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi import ecalPedestalsGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi import ecalGainRatiosGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi import ecalPulseShapesGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi import ecalPulseCovariancesGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi import ecalSamplesCorrelationGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi import ecalTimeBiasCorrectionsGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi import ecalTimeCalibConstantsGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalMultifitParametersGPUESProducer_cfi import ecalMultifitParametersGPUESProducer
+
+# ECAL multifit running on GPU
+from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi import ecalUncalibRecHitProducerGPU as _ecalUncalibRecHitProducerGPU
+ecalMultiFitUncalibRecHitGPU = _ecalUncalibRecHitProducerGPU.clone(
+  digisLabelEB = cms.InputTag('ecalDigisGPU', 'ebDigis'),
+  digisLabelEE = cms.InputTag('ecalDigisGPU', 'eeDigis'),
+)
+
+# copy the uncalibrated rechits from GPU to CPU
+from RecoLocalCalo.EcalRecProducers.ecalCPUUncalibRecHitProducer_cfi import ecalCPUUncalibRecHitProducer as _ecalCPUUncalibRecHitProducer
+ecalMultiFitUncalibRecHitSoA = _ecalCPUUncalibRecHitProducer.clone(
+  recHitsInLabelEB = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEB'),
+  recHitsInLabelEE = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEE'),
+)
+
+# convert the uncalibrated rechits from SoA to legacy format
+from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitConvertGPU2CPUFormat_cfi import ecalUncalibRecHitConvertGPU2CPUFormat as _ecalUncalibRecHitConvertGPU2CPUFormat
+_ecalMultiFitUncalibRecHit_gpu = _ecalUncalibRecHitConvertGPU2CPUFormat.clone(
+  recHitsLabelGPUEB = cms.InputTag('ecalMultiFitUncalibRecHitSoA', 'EcalUncalibRecHitsEB'),
+  recHitsLabelGPUEE = cms.InputTag('ecalMultiFitUncalibRecHitSoA', 'EcalUncalibRecHitsEE'),
+)
+gpu.toReplaceWith(ecalMultiFitUncalibRecHit, _ecalMultiFitUncalibRecHit_gpu)
+
+gpu.toReplaceWith(ecalMultiFitUncalibRecHitTask, cms.Task(
+  # ECAL conditions used by the multifit running on GPU
+  ecalPedestalsGPUESProducer,
+  ecalGainRatiosGPUESProducer,
+  ecalPulseShapesGPUESProducer,
+  ecalPulseCovariancesGPUESProducer,
+  ecalSamplesCorrelationGPUESProducer,
+  ecalTimeBiasCorrectionsGPUESProducer,
+  ecalTimeCalibConstantsGPUESProducer,
+  ecalMultifitParametersGPUESProducer,
+  # ECAL multifit running on GPU
+  ecalMultiFitUncalibRecHitGPU,
+  # copy the uncalibrated rechits from GPU to CPU
+  ecalMultiFitUncalibRecHitSoA,
+  # convert the uncalibrated rechits from SoA to legacy format
+  ecalMultiFitUncalibRecHit,
+))
diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_gpu_new_cfi.py b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_gpu_new_cfi.py
new file mode 100644
index 0000000000000..84a0c6f9cbe8a
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_gpu_new_cfi.py
@@ -0,0 +1,83 @@
+import FWCore.ParameterSet.Config as cms
+
+from RecoLocalCalo.EcalRecProducers.ecalPulseShapeParameters_cff import *
+
+ecalMultiFitUncalibRecHitgpu = cms.EDProducer("EcalUncalibRecHitProducerGPUNew",
+    EBdigiCollection = cms.InputTag("ecalDigis","ebDigis"),
+    EEdigiCollection = cms.InputTag("ecalDigis","eeDigis"),
+    EBhitCollection = cms.string("EcalUncalibRecHitsEBgpunew"),
+    EBhitCollection_soa = cms.string("EcalUncalibRecHitsEBgpunew"),
+    EEhitCollection = cms.string('EcalUncalibRecHitsEEgpunew'),
+    EEhitCollection_soa = cms.string('EcalUncalibRecHitsEEgpunew'),
+    algo = cms.string("EcalUncalibRecHitWorkerMultiFitGPUNew"),
+    algoPSet = cms.PSet(
+      # for multifit method
+      EcalPulseShapeParameters = cms.PSet( ecal_pulse_shape_parameters ),
+      activeBXs = cms.vint32(-5,-4,-3,-2,-1,0,1,2,3,4),
+      ampErrorCalculation = cms.bool(True),
+      useLumiInfoRunHeader = cms.bool(True),
+  
+      doPrefitEB = cms.bool(False),
+      doPrefitEE = cms.bool(False),
+      prefitMaxChiSqEB = cms.double(25.),
+      prefitMaxChiSqEE = cms.double(10.),
+      
+      dynamicPedestalsEB = cms.bool(False),
+      dynamicPedestalsEE = cms.bool(False),
+      mitigateBadSamplesEB = cms.bool(False),
+      mitigateBadSamplesEE = cms.bool(False),
+      gainSwitchUseMaxSampleEB = cms.bool(True),
+      gainSwitchUseMaxSampleEE = cms.bool(False),      
+      selectiveBadSampleCriteriaEB = cms.bool(False),
+      selectiveBadSampleCriteriaEE = cms.bool(False),
+      simplifiedNoiseModelForGainSwitch = cms.bool(True),
+      addPedestalUncertaintyEB = cms.double(0.),
+      addPedestalUncertaintyEE = cms.double(0.),
+  
+      # decide which algorithm is used to calculate the jitter
+      timealgo = cms.string("RatioMethod"),
+  
+      # for ratio method
+      EBtimeFitParameters = cms.vdouble(-2.015452e+00, 3.130702e+00, -1.234730e+01, 4.188921e+01, -8.283944e+01, 9.101147e+01, -5.035761e+01, 1.105621e+01),
+      EEtimeFitParameters = cms.vdouble(-2.390548e+00, 3.553628e+00, -1.762341e+01, 6.767538e+01, -1.332130e+02, 1.407432e+02, -7.541106e+01, 1.620277e+01),
+      EBamplitudeFitParameters = cms.vdouble(1.138,1.652),
+      EEamplitudeFitParameters = cms.vdouble(1.890,1.400),
+      EBtimeFitLimits_Lower = cms.double(0.2),
+      EBtimeFitLimits_Upper = cms.double(1.4),
+      EEtimeFitLimits_Lower = cms.double(0.2),
+      EEtimeFitLimits_Upper = cms.double(1.4),
+      # for time error
+      EBtimeConstantTerm= cms.double(.6),
+      EEtimeConstantTerm= cms.double(1.0),
+     
+      # for kOutOfTime flag
+      EBtimeNconst      = cms.double(28.5),
+      EEtimeNconst      = cms.double(31.8),
+      outOfTimeThresholdGain12pEB    = cms.double(5),      # times estimated precision
+      outOfTimeThresholdGain12mEB    = cms.double(5),      # times estimated precision
+      outOfTimeThresholdGain61pEB    = cms.double(5),      # times estimated precision
+      outOfTimeThresholdGain61mEB    = cms.double(5),      # times estimated precision
+      outOfTimeThresholdGain12pEE    = cms.double(1000),   # times estimated precision
+      outOfTimeThresholdGain12mEE    = cms.double(1000),   # times estimated precision
+      outOfTimeThresholdGain61pEE    = cms.double(1000),   # times estimated precision
+      outOfTimeThresholdGain61mEE    = cms.double(1000),   # times estimated precision
+      amplitudeThresholdEB    = cms.double(10),
+      amplitudeThresholdEE    = cms.double(10),
+  
+      ebSpikeThreshold = cms.double(1.042),
+  
+      # these are now taken from DB. Here the MC parameters for backward compatibility
+      ebPulseShape = cms.vdouble( 5.2e-05,-5.26e-05 , 6.66e-05, 0.1168, 0.7575, 1.,  0.8876, 0.6732, 0.4741,  0.3194 ),
+      eePulseShape = cms.vdouble( 5.2e-05,-5.26e-05 , 6.66e-05, 0.1168, 0.7575, 1.,  0.8876, 0.6732, 0.4741,  0.3194 ),   
+  
+      # for kPoorReco flag
+      kPoorRecoFlagEB = cms.bool(True),
+      kPoorRecoFlagEE = cms.bool(False),
+      chi2ThreshEB_ = cms.double(65.0),
+      chi2ThreshEE_ = cms.double(50.0),
+
+      # threads/blocks config
+      threads = cms.vint32(256, 1, 1),
+      runV1 = cms.bool(True),
+   )                                           
+)
diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py b/RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py
new file mode 100644
index 0000000000000..a9b5599fd970f
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py
@@ -0,0 +1,69 @@
+import FWCore.ParameterSet.Config as cms
+
+from RecoLocalCalo.EcalRecAlgos.ecalCleaningAlgo import cleaningAlgoConfig 
+
+# rechit producer
+ecalRecHitGPU = cms.EDProducer("EcalRecHitProducerGPU",
+                               
+    uncalibrecHitsInLabelEB = cms.InputTag("ecalUncalibRecHitProducerGPU","EcalUncalibRecHitsEB"),
+    uncalibrecHitsInLabelEE = cms.InputTag("ecalUncalibRecHitProducerGPU","EcalUncalibRecHitsEE"),
+          
+    recHitsLabelEB = cms.string("EcalRecHitsEB"),
+    recHitsLabelEE = cms.string("EcalRecHitsEE"),
+ 
+    maxNumberHitsEB = cms.uint32(61200),
+    maxNumberHitsEE = cms.uint32(14648),  
+  
+    ## db statuses to be excluded from reconstruction (some will be recovered)
+    ChannelStatusToBeExcluded = cms.vstring(   'kDAC',
+                                               'kNoisy',
+                                               'kNNoisy',
+                                               'kFixedG6',
+                                               'kFixedG1',
+                                               'kFixedG0',
+                                               'kNonRespondingIsolated',
+                                               'kDeadVFE',
+                                               'kDeadFE',
+                                               'kNoDataNoTP',
+                                               #
+                                               # AM should I add them here?????
+                                               # next ones from "flagsMapDBReco"
+                                               # but not defined in "EcalChannelStatusCode.h"
+                                               # but they are defined in "EcalRecHit.h"
+                                               #
+                                               #'kKilled',
+                                               #'kTPSaturated',
+                                               #'kL1SpikeFlag',
+                                               ),
+    
+    ## avoid propagation of dead channels other than after recovery
+    killDeadChannels = cms.bool(True),
+    
+    ## define maximal and minimal values for the laser corrections
+    
+    EBLaserMIN = cms.double(0.01),
+    EELaserMIN = cms.double(0.01),
+
+    EBLaserMAX = cms.double(30.0),
+    EELaserMAX = cms.double(30.0),
+
+    ## reco flags association to DB flag
+    flagsMapDBReco = cms.PSet(
+        kGood  = cms.vstring('kOk','kDAC','kNoLaser','kNoisy'),
+        kNoisy = cms.vstring('kNNoisy','kFixedG6','kFixedG1'),
+        kNeighboursRecovered = cms.vstring('kFixedG0',
+                                           'kNonRespondingIsolated',
+                                           'kDeadVFE'),
+        kTowerRecovered = cms.vstring('kDeadFE'),
+        kDead           = cms.vstring('kNoDataNoTP')
+        ), 
+
+    ## for channel recovery
+    recoverEBIsolatedChannels = cms.bool(False),
+    recoverEEIsolatedChannels = cms.bool(False),
+    recoverEBVFE  = cms.bool(False),
+    recoverEEVFE  = cms.bool(False),
+    recoverEBFE = cms.bool(True),
+    recoverEEFE = cms.bool(True),
+)
+
diff --git a/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py
new file mode 100644
index 0000000000000..a3d04e836f020
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py
@@ -0,0 +1,172 @@
+import FWCore.ParameterSet.Config as cms
+from Configuration.StandardSequences.Eras import eras
+
+process = cms.Process('RECO', eras.Run2_2018)
+
+# import of standard configurations
+process.load('Configuration.StandardSequences.Services_cff')
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff')
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
+
+# Other statements
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, '102X_dataRun2_HLT_v2', '')
+
+
+process.maxEvents = cms.untracked.PSet(
+    input = cms.untracked.int32(100)
+)
+
+# load data using the DAQ source
+import sys, os, inspect
+sys.path.append(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))))
+process.load('sourceFromRawCmggpu_cff')
+
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff")
+process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi")
+process.load("EventFilter.EcalRawToDigi.EcalUnpackerData_cfi")
+process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi")
+
+# load both cpu and gpu plugins
+process.load("RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi")
+
+# for validation of gpu multifit products
+process.load("RecoLocalCalo.EcalRecProducers.ecalCPUUncalibRecHitProducer_cfi")
+process.load("EventFilter.EcalRawToDigi.ecalCPUDigisProducer_cfi")
+
+process.load("EventFilter.EcalRawToDigi.ecalRawToDigiGPU_cfi")
+process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi")
+
+process.load("RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi")
+
+process.load("RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi")
+
+process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi")
+
+# force HLT configuration for ecalMultiFitUncalibRecHit
+process.ecalMultiFitUncalibRecHit.algoPSet = cms.PSet(
+    ebSpikeThreshold = cms.double(1.042),
+    EBtimeFitLimits_Upper = cms.double(1.4),
+    EEtimeFitLimits_Lower = cms.double(0.2),
+    timealgo = cms.string("None"),
+    EBtimeNconst = cms.double(28.5),
+    prefitMaxChiSqEE = cms.double(10.0),
+    outOfTimeThresholdGain12mEB = cms.double(5.0),
+    outOfTimeThresholdGain12mEE = cms.double(1000.0),
+    EEtimeFitParameters = cms.vdouble(-2.390548, 3.553628, -17.62341, 67.67538, -133.213, 140.7432, -75.41106, 16.20277),
+    prefitMaxChiSqEB = cms.double(25.0),
+    simplifiedNoiseModelForGainSwitch = cms.bool(True),
+    EBtimeFitParameters = cms.vdouble(-2.015452, 3.130702, -12.3473, 41.88921, -82.83944, 91.01147, -50.35761, 11.05621),
+    selectiveBadSampleCriteriaEB = cms.bool(False),
+    dynamicPedestalsEB = cms.bool(False),
+    useLumiInfoRunHeader = cms.bool(False),
+    EBamplitudeFitParameters = cms.vdouble(1.138, 1.652),
+    doPrefitEE = cms.bool(False),
+    dynamicPedestalsEE = cms.bool(False),
+    selectiveBadSampleCriteriaEE = cms.bool(False),
+    outOfTimeThresholdGain61pEE = cms.double(1000.0),
+    outOfTimeThresholdGain61pEB = cms.double(5.0),
+    activeBXs = cms.vint32(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4),
+    EcalPulseShapeParameters = cms.PSet(
+        EEPulseShapeTemplate = cms.vdouble(0.116442, 0.756246, 1.0, 0.897182, 0.686831, 0.491506, 0.344111, 0.245731, 0.174115, 0.123361, 0.0874288, 0.061957),
+        EEdigiCollection = cms.string(""),
+        EcalPreMixStage2 = cms.bool(False),
+        EcalPreMixStage1 = cms.bool(False),
+        EBPulseShapeCovariance = cms.vdouble(3.001E-6, 1.233E-5, 0.0, -4.416E-6, -4.571E-6, -3.614E-6, -2.636E-6, -1.286E-6, -8.41E-7, -5.296E-7, 0.0, 0.0, 1.233E-5, 6.154E-5, 0.0, -2.2E-5, -2.309E-5, -1.838E-5, -1.373E-5, -7.334E-6, -5.088E-6, -3.745E-6, -2.428E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.416E-6, -2.2E-5, 0.0, 8.319E-6, 8.545E-6, 6.792E-6, 5.059E-6, 2.678E-6, 1.816E-6, 1.223E-6, 8.245E-7, 5.589E-7, -4.571E-6, -2.309E-5, 0.0, 8.545E-6, 9.182E-6, 7.219E-6, 5.388E-6, 2.853E-6, 1.944E-6, 1.324E-6, 9.083E-7, 6.335E-7, -3.614E-6, -1.838E-5, 0.0, 6.792E-6, 7.219E-6, 6.016E-6, 4.437E-6, 2.385E-6, 1.636E-6, 1.118E-6, 7.754E-7, 5.556E-7, -2.636E-6, -1.373E-5, 0.0, 5.059E-6, 5.388E-6, 4.437E-6, 3.602E-6, 1.917E-6, 1.322E-6, 9.079E-7, 6.529E-7, 4.752E-7, -1.286E-6, -7.334E-6, 0.0, 2.678E-6, 2.853E-6, 2.385E-6, 1.917E-6, 1.375E-6, 9.1E-7, 6.455E-7, 4.693E-7, 3.657E-7, -8.41E-7, -5.088E-6, 0.0, 1.816E-6, 1.944E-6, 1.636E-6, 1.322E-6, 9.1E-7, 9.115E-7, 6.062E-7, 4.436E-7, 3.422E-7, -5.296E-7, -3.745E-6, 0.0, 1.223E-6, 1.324E-6, 1.118E-6, 9.079E-7, 6.455E-7, 6.062E-7, 7.217E-7, 4.862E-7, 3.768E-7, 0.0, -2.428E-6, 0.0, 8.245E-7, 9.083E-7, 7.754E-7, 6.529E-7, 4.693E-7, 4.436E-7, 4.862E-7, 6.509E-7, 4.418E-7, 0.0, 0.0, 0.0, 5.589E-7, 6.335E-7, 5.556E-7, 4.752E-7, 3.657E-7, 3.422E-7, 3.768E-7, 4.418E-7, 6.142E-7),
+        ESdigiCollection = cms.string(""),
+        EBdigiCollection = cms.string(""),
+        EBCorrNoiseMatrixG01 = cms.vdouble(1.0, 0.73354, 0.64442, 0.58851, 0.55425, 0.53082, 0.51916, 0.51097, 0.50732, 0.50409),
+        EBCorrNoiseMatrixG12 = cms.vdouble(1.0, 0.71073, 0.55721, 0.46089, 0.40449, 0.35931, 0.33924, 0.32439, 0.31581, 0.30481),
+        EBCorrNoiseMatrixG06 = cms.vdouble(1.0, 0.70946, 0.58021, 0.49846, 0.45006, 0.41366, 0.39699, 0.38478, 0.37847, 0.37055),
+        EEPulseShapeCovariance = cms.vdouble(3.941E-5, 3.333E-5, 0.0, -1.449E-5, -1.661E-5, -1.424E-5, -1.183E-5, -6.842E-6, -4.915E-6, -3.411E-6, 0.0, 0.0, 3.333E-5, 2.862E-5, 0.0, -1.244E-5, -1.431E-5, -1.233E-5, -1.032E-5, -5.883E-6, -4.154E-6, -2.902E-6, -2.128E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.449E-5, -1.244E-5, 0.0, 5.84E-6, 6.649E-6, 5.72E-6, 4.812E-6, 2.708E-6, 1.869E-6, 1.33E-6, 9.186E-7, 6.446E-7, -1.661E-5, -1.431E-5, 0.0, 6.649E-6, 7.966E-6, 6.898E-6, 5.794E-6, 3.157E-6, 2.184E-6, 1.567E-6, 1.084E-6, 7.575E-7, -1.424E-5, -1.233E-5, 0.0, 5.72E-6, 6.898E-6, 6.341E-6, 5.347E-6, 2.859E-6, 1.991E-6, 1.431E-6, 9.839E-7, 6.886E-7, -1.183E-5, -1.032E-5, 0.0, 4.812E-6, 5.794E-6, 5.347E-6, 4.854E-6, 2.628E-6, 1.809E-6, 1.289E-6, 9.02E-7, 6.146E-7, -6.842E-6, -5.883E-6, 0.0, 2.708E-6, 3.157E-6, 2.859E-6, 2.628E-6, 1.863E-6, 1.296E-6, 8.882E-7, 6.108E-7, 4.283E-7, -4.915E-6, -4.154E-6, 0.0, 1.869E-6, 2.184E-6, 1.991E-6, 1.809E-6, 1.296E-6, 1.217E-6, 8.669E-7, 5.751E-7, 3.882E-7, -3.411E-6, -2.902E-6, 0.0, 1.33E-6, 1.567E-6, 1.431E-6, 1.289E-6, 8.882E-7, 8.669E-7, 9.522E-7, 6.717E-7, 4.293E-7, 0.0, -2.128E-6, 0.0, 9.186E-7, 1.084E-6, 9.839E-7, 9.02E-7, 6.108E-7, 5.751E-7, 6.717E-7, 7.911E-7, 5.493E-7, 0.0, 0.0, 0.0, 6.446E-7, 7.575E-7, 6.886E-7, 6.146E-7, 4.283E-7, 3.882E-7, 4.293E-7, 5.493E-7, 7.027E-7),
+        EBPulseShapeTemplate = cms.vdouble(0.0113979, 0.758151, 1.0, 0.887744, 0.673548, 0.474332, 0.319561, 0.215144, 0.147464, 0.101087, 0.0693181, 0.0475044),
+        EECorrNoiseMatrixG01 = cms.vdouble(1.0, 0.72698, 0.62048, 0.55691, 0.51848, 0.49147, 0.47813, 0.47007, 0.46621, 0.46265),
+        EECorrNoiseMatrixG12 = cms.vdouble(1.0, 0.71373, 0.44825, 0.30152, 0.21609, 0.14786, 0.11772, 0.10165, 0.09465, 0.08098),
+        UseLCcorrection = cms.untracked.bool(True),
+        EECorrNoiseMatrixG06 = cms.vdouble(1.0, 0.71217, 0.47464, 0.34056, 0.26282, 0.20287, 0.17734, 0.16256, 0.15618, 0.14443)
+    ),
+    doPrefitEB = cms.bool(False),
+    addPedestalUncertaintyEE = cms.double(0.0),
+    addPedestalUncertaintyEB = cms.double(0.0),
+    gainSwitchUseMaxSampleEB = cms.bool(True),
+    EEtimeNconst = cms.double(31.8),
+    EEamplitudeFitParameters = cms.vdouble(1.89, 1.4),
+    chi2ThreshEE_ = cms.double(50.0),
+    eePulseShape = cms.vdouble(5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194),
+    outOfTimeThresholdGain12pEB = cms.double(5.0),
+    gainSwitchUseMaxSampleEE = cms.bool(False),
+    mitigateBadSamplesEB = cms.bool(False),
+    outOfTimeThresholdGain12pEE = cms.double(1000.0),
+    ebPulseShape = cms.vdouble(5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194),
+    ampErrorCalculation = cms.bool(False),
+    mitigateBadSamplesEE = cms.bool(False),
+    amplitudeThresholdEB = cms.double(10.0),
+    kPoorRecoFlagEB = cms.bool(True),
+    amplitudeThresholdEE = cms.double(10.0),
+    EBtimeFitLimits_Lower = cms.double(0.2),
+    kPoorRecoFlagEE = cms.bool(False),
+    EEtimeFitLimits_Upper = cms.double(1.4),
+    outOfTimeThresholdGain61mEE = cms.double(1000.0),
+    EEtimeConstantTerm = cms.double(1.0),
+    EBtimeConstantTerm = cms.double(0.6),
+    chi2ThreshEB_ = cms.double(65.0),
+    outOfTimeThresholdGain61mEB = cms.double(5.0)
+)
+
+process.ecalDigis = process.ecalEBunpacker.clone()
+process.ecalDigis.InputLabel = cms.InputTag('rawDataCollector')
+
+process.out = cms.OutputModule("PoolOutputModule",
+    fileName = cms.untracked.string("test.root")
+)
+
+process.finalize = cms.EndPath(process.out)
+
+process.bunchSpacing = cms.Path(
+    process.bunchSpacingProducer
+)
+
+process.digiPath = cms.Path(
+    process.ecalDigis +
+    process.ecalRawToDigiGPU +
+    process.ecalCPUDigisProducer
+)
+
+process.recoPath = cms.Path(
+    process.ecalMultiFitUncalibRecHit +
+    process.ecalUncalibRecHitProducerGPU +
+    process.ecalCPUUncalibRecHitProducer
+)
+
+process.schedule = cms.Schedule(
+    process.bunchSpacing,
+    process.digiPath,
+    process.recoPath,
+    process.finalize
+)
+
+process.options = cms.untracked.PSet(
+    numberOfThreads = cms.untracked.uint32(4),
+    numberOfStreams = cms.untracked.uint32(4),
+    SkipEvent = cms.untracked.vstring('ProductNotFound'),
+    wantSummary = cms.untracked.bool(True)
+)
+
+# report CUDAService messages
+process.MessageLogger.categories.append("CUDAService")
diff --git a/RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py b/RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py
new file mode 100644
index 0000000000000..e993a7573b689
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py
@@ -0,0 +1,151 @@
+import FWCore.ParameterSet.Config as cms
+
+# input: services declared alongside the FedRawDataInputSource defined further down in this file
+FastMonitoringService = cms.Service( "FastMonitoringService",
+    filePerFwkStream = cms.untracked.bool( False ),
+    fastMonIntervals = cms.untracked.uint32( 2 ),
+    sleepTime = cms.untracked.int32( 1 )
+)
+
+EvFDaqDirector = cms.Service( "EvFDaqDirector",
+    runNumber = cms.untracked.uint32( 321177 ),  # same run as source.runNumber and the run321177_* input files
+
+    baseDir = cms.untracked.string( "tmp" ),  # relative path; NOTE(review): presumably resolved against the working directory - confirm
+    buBaseDir = cms.untracked.string( "tmp" ),
+
+    useFileBroker = cms.untracked.bool( False ),
+    fileBrokerKeepAlive = cms.untracked.bool( True ),
+    fileBrokerPort = cms.untracked.string( "8080" ),
+    fileBrokerUseLocalLock = cms.untracked.bool( True ),
+    fuLockPollInterval = cms.untracked.uint32( 2000 ),
+
+    requireTransfersPSet = cms.untracked.bool( False ),
+    selectedTransferMode = cms.untracked.string( "" ),
+    mergingPset = cms.untracked.string( "" ),
+
+    outputAdler32Recheck = cms.untracked.bool( False ),
+)
+
+source = cms.Source( "FedRawDataInputSource",  # input source configured with the explicit fileNames list below
+    runNumber = cms.untracked.uint32( 321177 ),  # same run as EvFDaqDirector.runNumber and the run321177_* files
+    getLSFromFilename = cms.untracked.bool(True),
+    testModeNoBuilderUnit = cms.untracked.bool(False),
+    verifyAdler32 = cms.untracked.bool( True ),
+    verifyChecksum = cms.untracked.bool( True ),
+    useL1EventID = cms.untracked.bool( False ),         # True
+    alwaysStartFromfirstLS = cms.untracked.uint32( 0 ),
+    # NOTE(review): the trailing "# <value>" comments in this block presumably record alternative settings - confirm
+    eventChunkBlock = cms.untracked.uint32( 240 ),      # 32
+    eventChunkSize = cms.untracked.uint32( 240),        # 32
+    maxBufferedFiles = cms.untracked.uint32( 8 ),       #  2
+    numBuffers = cms.untracked.uint32( 8 ),             #  2
+
+    fileListMode = cms.untracked.bool( True ),          # False
+    fileNames = cms.untracked.vstring(  # absolute paths: they resolve only on a host that has /data/patatrack
+        #'/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000004.raw',
+    ),
+)
diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
new file mode 100644
index 0000000000000..c70572ff3b89d
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
@@ -0,0 +1,322 @@
+import FWCore.ParameterSet.Config as cms
+
+from Configuration.StandardSequences.Eras import eras
+#from Configuration.ProcessModifiers.gpu_cff import gpu
+
+process = cms.Process('RECO', eras.Run2_2018)  # process named 'RECO', built with the Run2_2018 era
+
+# import of standard configurations (commented-out loads are kept disabled)
+process.load('Configuration.StandardSequences.Services_cff')
+#process.load('SimGeneral.HepPDTESSource.pythiapdt_cfi')
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
+#process.load('Configuration.EventContent.EventContent_cff')
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff')
+#process.load('Configuration.StandardSequences.RawToDigi_Data_cff')
+#process.load('Configuration.StandardSequences.Reconstruction_Data_cff')
+#process.load('DQMOffline.Configuration.DQMOffline_cff')
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
+
+
+
+
+
+
+# conditions: select the '102X_dataRun2_HLT_v2' global tag
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, '102X_dataRun2_HLT_v2', '')
+
+
+process.maxEvents = cms.untracked.PSet(
+    input = cms.untracked.int32(1000)  # event limit for the job
+)
+
+# load data using the DAQ source
+import sys, os, inspect
+sys.path.append(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))))  # put this file's directory on sys.path for the load below
+process.load('sourceFromRawCmggpu_cff')
+
+#-----------------------------------------
+# CMSSW non-DQM module imports (Hcal local reco, Hcal/Ecal unpackers, bunch spacing)
+#-----------------------------------------
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')  # NOTE(review): already loaded above; this repeat looks redundant
+process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff")
+#process.load("RecoLocalCalo.Configuration.ecalLocalRecoSequence_cff")
+process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi")
+process.load("EventFilter.EcalRawToDigi.EcalUnpackerData_cfi")
+process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi")
+
+# load both cpu and gpu plugins
+# NOTE(review): the path below is presumably where the GPU cfi can be found - confirm
+# ../cfipython/slc7_amd64_gcc700/RecoLocalCalo/EcalRecProducers/ecalUncalibRecHitProducerGPU_cfi.py
+#
+process.load("RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi")
+
+# for validation of gpu multifit products
+process.load("RecoLocalCalo.EcalRecProducers.ecalCPUUncalibRecHitProducer_cfi")
+process.load("EventFilter.EcalRawToDigi.ecalCPUDigisProducer_cfi")
+
+process.load("EventFilter.EcalRawToDigi.ecalRawToDigiGPU_cfi")
+process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi")
+
+#process.ecalUncalibRecHitProducerGPU.kernelsVersion = 0
+#process.ecalUncalibRecHitProducerGPU.kernelMinimizeThreads = cms.vuint32(16, 1, 1)
+
+process.load("RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi")
+
+#process.ecalMultiFitUncalibRecHitgpu.algoPSet.threads = cms.vint32(256, 1, 1)
+
+#from RecoLocalCalo.EcalRecProducers.ecalMultifitParametersGPUESProducer_cfi import ecalMultifitParametersGPUESProducer
+process.load("RecoLocalCalo.EcalRecProducers.ecalMultifitParametersGPUESProducer_cfi")
+
+#
+# NOTE(review): presumably the error seen when the load below is missing - confirm:
+#   No "JobConfigurationGPURecord" record found in the EventSetup.
+#   --->
+#
+process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitParametersGPUESProducer_cfi")
+#ecalRecHitParametersGPUESProducer_cfi.py
+
+
+##
+## force HLT configuration for ecalMultiFitUncalibRecHit
+## (the assignment below replaces the module's whole algoPSet with a new PSet)
+
+process.ecalMultiFitUncalibRecHit.algoPSet = cms.PSet( 
+      ebSpikeThreshold = cms.double( 1.042 ),
+      EBtimeFitLimits_Upper = cms.double( 1.4 ),
+      EEtimeFitLimits_Lower = cms.double( 0.2 ),
+      timealgo = cms.string( "None" ),
+      EBtimeNconst = cms.double( 28.5 ),
+      prefitMaxChiSqEE = cms.double( 10.0 ),
+      outOfTimeThresholdGain12mEB = cms.double( 5.0 ),
+      outOfTimeThresholdGain12mEE = cms.double( 1000.0 ),
+      EEtimeFitParameters = cms.vdouble( -2.390548, 3.553628, -17.62341, 67.67538, -133.213, 140.7432, -75.41106, 16.20277 ),
+      prefitMaxChiSqEB = cms.double( 25.0 ),
+      simplifiedNoiseModelForGainSwitch = cms.bool( True ),
+      EBtimeFitParameters = cms.vdouble( -2.015452, 3.130702, -12.3473, 41.88921, -82.83944, 91.01147, -50.35761, 11.05621 ),
+      selectiveBadSampleCriteriaEB = cms.bool( False ),
+      dynamicPedestalsEB = cms.bool( False ),
+      useLumiInfoRunHeader = cms.bool( False ),
+      EBamplitudeFitParameters = cms.vdouble( 1.138, 1.652 ),
+      doPrefitEE = cms.bool( False ),
+      dynamicPedestalsEE = cms.bool( False ),
+      selectiveBadSampleCriteriaEE = cms.bool( False ),
+      outOfTimeThresholdGain61pEE = cms.double( 1000.0 ),
+      outOfTimeThresholdGain61pEB = cms.double( 5.0 ),
+      activeBXs = cms.vint32( -5, -4, -3, -2, -1, 0, 1, 2, 3, 4 ),  # ten entries, -5 through 4
+      EcalPulseShapeParameters = cms.PSet( 
+        EEPulseShapeTemplate = cms.vdouble( 0.116442, 0.756246, 1.0, 0.897182, 0.686831, 0.491506, 0.344111, 0.245731, 0.174115, 0.123361, 0.0874288, 0.061957 ),
+        EEdigiCollection = cms.string( "" ),
+        EcalPreMixStage2 = cms.bool( False ),
+        EcalPreMixStage1 = cms.bool( False ),
+        EBPulseShapeCovariance = cms.vdouble( 3.001E-6, 1.233E-5, 0.0, -4.416E-6, -4.571E-6, -3.614E-6, -2.636E-6, -1.286E-6, -8.41E-7, -5.296E-7, 0.0, 0.0, 1.233E-5, 6.154E-5, 0.0, -2.2E-5, -2.309E-5, -1.838E-5, -1.373E-5, -7.334E-6, -5.088E-6, -3.745E-6, -2.428E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.416E-6, -2.2E-5, 0.0, 8.319E-6, 8.545E-6, 6.792E-6, 5.059E-6, 2.678E-6, 1.816E-6, 1.223E-6, 8.245E-7, 5.589E-7, -4.571E-6, -2.309E-5, 0.0, 8.545E-6, 9.182E-6, 7.219E-6, 5.388E-6, 2.853E-6, 1.944E-6, 1.324E-6, 9.083E-7, 6.335E-7, -3.614E-6, -1.838E-5, 0.0, 6.792E-6, 7.219E-6, 6.016E-6, 4.437E-6, 2.385E-6, 1.636E-6, 1.118E-6, 7.754E-7, 5.556E-7, -2.636E-6, -1.373E-5, 0.0, 5.059E-6, 5.388E-6, 4.437E-6, 3.602E-6, 1.917E-6, 1.322E-6, 9.079E-7, 6.529E-7, 4.752E-7, -1.286E-6, -7.334E-6, 0.0, 2.678E-6, 2.853E-6, 2.385E-6, 1.917E-6, 1.375E-6, 9.1E-7, 6.455E-7, 4.693E-7, 3.657E-7, -8.41E-7, -5.088E-6, 0.0, 1.816E-6, 1.944E-6, 1.636E-6, 1.322E-6, 9.1E-7, 9.115E-7, 6.062E-7, 4.436E-7, 3.422E-7, -5.296E-7, -3.745E-6, 0.0, 1.223E-6, 1.324E-6, 1.118E-6, 9.079E-7, 6.455E-7, 6.062E-7, 7.217E-7, 4.862E-7, 3.768E-7, 0.0, -2.428E-6, 0.0, 8.245E-7, 9.083E-7, 7.754E-7, 6.529E-7, 4.693E-7, 4.436E-7, 4.862E-7, 6.509E-7, 4.418E-7, 0.0, 0.0, 0.0, 5.589E-7, 6.335E-7, 5.556E-7, 4.752E-7, 3.657E-7, 3.422E-7, 3.768E-7, 4.418E-7, 6.142E-7 ),
+        ESdigiCollection = cms.string( "" ),
+        EBdigiCollection = cms.string( "" ),
+        EBCorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.73354, 0.64442, 0.58851, 0.55425, 0.53082, 0.51916, 0.51097, 0.50732, 0.50409 ),
+        EBCorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71073, 0.55721, 0.46089, 0.40449, 0.35931, 0.33924, 0.32439, 0.31581, 0.30481 ),
+        EBCorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.70946, 0.58021, 0.49846, 0.45006, 0.41366, 0.39699, 0.38478, 0.37847, 0.37055 ),
+        EEPulseShapeCovariance = cms.vdouble( 3.941E-5, 3.333E-5, 0.0, -1.449E-5, -1.661E-5, -1.424E-5, -1.183E-5, -6.842E-6, -4.915E-6, -3.411E-6, 0.0, 0.0, 3.333E-5, 2.862E-5, 0.0, -1.244E-5, -1.431E-5, -1.233E-5, -1.032E-5, -5.883E-6, -4.154E-6, -2.902E-6, -2.128E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.449E-5, -1.244E-5, 0.0, 5.84E-6, 6.649E-6, 5.72E-6, 4.812E-6, 2.708E-6, 1.869E-6, 1.33E-6, 9.186E-7, 6.446E-7, -1.661E-5, -1.431E-5, 0.0, 6.649E-6, 7.966E-6, 6.898E-6, 5.794E-6, 3.157E-6, 2.184E-6, 1.567E-6, 1.084E-6, 7.575E-7, -1.424E-5, -1.233E-5, 0.0, 5.72E-6, 6.898E-6, 6.341E-6, 5.347E-6, 2.859E-6, 1.991E-6, 1.431E-6, 9.839E-7, 6.886E-7, -1.183E-5, -1.032E-5, 0.0, 4.812E-6, 5.794E-6, 5.347E-6, 4.854E-6, 2.628E-6, 1.809E-6, 1.289E-6, 9.02E-7, 6.146E-7, -6.842E-6, -5.883E-6, 0.0, 2.708E-6, 3.157E-6, 2.859E-6, 2.628E-6, 1.863E-6, 1.296E-6, 8.882E-7, 6.108E-7, 4.283E-7, -4.915E-6, -4.154E-6, 0.0, 1.869E-6, 2.184E-6, 1.991E-6, 1.809E-6, 1.296E-6, 1.217E-6, 8.669E-7, 5.751E-7, 3.882E-7, -3.411E-6, -2.902E-6, 0.0, 1.33E-6, 1.567E-6, 1.431E-6, 1.289E-6, 8.882E-7, 8.669E-7, 9.522E-7, 6.717E-7, 4.293E-7, 0.0, -2.128E-6, 0.0, 9.186E-7, 1.084E-6, 9.839E-7, 9.02E-7, 6.108E-7, 5.751E-7, 6.717E-7, 7.911E-7, 5.493E-7, 0.0, 0.0, 0.0, 6.446E-7, 7.575E-7, 6.886E-7, 6.146E-7, 4.283E-7, 3.882E-7, 4.293E-7, 5.493E-7, 7.027E-7 ),
+        EBPulseShapeTemplate = cms.vdouble( 0.0113979, 0.758151, 1.0, 0.887744, 0.673548, 0.474332, 0.319561, 0.215144, 0.147464, 0.101087, 0.0693181, 0.0475044 ),
+        EECorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.72698, 0.62048, 0.55691, 0.51848, 0.49147, 0.47813, 0.47007, 0.46621, 0.46265 ),
+        EECorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71373, 0.44825, 0.30152, 0.21609, 0.14786, 0.11772, 0.10165, 0.09465, 0.08098 ),
+        UseLCcorrection = cms.untracked.bool( True ),
+        EECorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.71217, 0.47464, 0.34056, 0.26282, 0.20287, 0.17734, 0.16256, 0.15618, 0.14443 )
+      ),
+      doPrefitEB = cms.bool( False ),
+      addPedestalUncertaintyEE = cms.double( 0.0 ),
+      addPedestalUncertaintyEB = cms.double( 0.0 ),
+      gainSwitchUseMaxSampleEB = cms.bool( True ),
+      EEtimeNconst = cms.double( 31.8 ),
+      EEamplitudeFitParameters = cms.vdouble( 1.89, 1.4 ),
+      chi2ThreshEE_ = cms.double( 50.0 ),
+      eePulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ),
+      outOfTimeThresholdGain12pEB = cms.double( 5.0 ),
+      gainSwitchUseMaxSampleEE = cms.bool( False ),
+      mitigateBadSamplesEB = cms.bool( False ),
+      outOfTimeThresholdGain12pEE = cms.double( 1000.0 ),
+      ebPulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ),
+      ampErrorCalculation = cms.bool( False ),
+      mitigateBadSamplesEE = cms.bool( False ),
+      amplitudeThresholdEB = cms.double( 10.0 ),
+      kPoorRecoFlagEB = cms.bool( True ),
+      amplitudeThresholdEE = cms.double( 10.0 ),
+      EBtimeFitLimits_Lower = cms.double( 0.2 ),
+      kPoorRecoFlagEE = cms.bool( False ),
+      EEtimeFitLimits_Upper = cms.double( 1.4 ),
+      outOfTimeThresholdGain61mEE = cms.double( 1000.0 ),
+      EEtimeConstantTerm = cms.double( 1.0 ),
+      EBtimeConstantTerm = cms.double( 0.6 ),
+      chi2ThreshEB_ = cms.double( 65.0 ),
+      outOfTimeThresholdGain61mEB = cms.double( 5.0 )
+)     
+      
+##    
+    
+    
+    
+process.load('Configuration.StandardSequences.Reconstruction_cff')
+# NOTE(review): presumably the load above provides process.ecalRecHit, which recoPath uses - confirm
+
+    
+process.load("RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi")
+#process.load("RecoLocalCalo.EcalRecProducers.ecalADCToGeVConstantGPUESProducer_cfi")
+#process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi")
+    
+process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi")
+    
+process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi")
+process.ecalRecHitProducerGPU = process.ecalRecHitGPU.clone()  # recoPath schedules this copy, labelled ecalRecHitProducerGPU
+ 
+ 
+process.load("RecoLocalCalo.EcalRecProducers.ecalCPURecHitProducer_cfi")
+
+ 
+#
+# AM : TEST to see if the number of rechits matches
+#
+#process.ecalRecHit.killDeadChannels = cms.bool(False)
+#
+#process.ecalRecHit.recoverEBFE = cms.bool(False)
+#process.ecalRecHit.recoverEBIsolatedChannels = cms.bool(False)
+#process.ecalRecHit.recoverEBVFE = cms.bool(False)
+##
+#process.ecalRecHit.recoverEEFE = cms.bool(False)
+#process.ecalRecHit.recoverEEIsolatedChannels = cms.bool(False)
+#process.ecalRecHit.recoverEEVFE = cms.bool(False)
+#
+#process.ecalRecHit.skipTimeCalib = cms.bool(True)
+#
+#process.ecalRecHitProducerGPU.killDeadChannels = cms.bool(False)
+#
+#
+#process.ecalRecHitProducerGPU.recoverEBFE = cms.bool(False)
+#process.ecalRecHitProducerGPU.recoverEBIsolatedChannels = cms.bool(False)
+#process.ecalRecHitProducerGPU.recoverEBVFE = cms.bool(False)
+#process.ecalRecHitProducerGPU.recoverEEFE = cms.bool(False)
+#process.ecalRecHitProducerGPU.recoverEEIsolatedChannels = cms.bool(False)
+#process.ecalRecHitProducerGPU.recoverEEVFE = cms.bool(False)
+#
+#
+#
+# TEST
+#
+#process.ecalRecHit.ChannelStatusToBeExcluded = cms.vstring( 
+                                                          #'kDAC', 
+                                                          #'kNoisy', 
+                                                          #'kNNoisy', 
+                                                          #'kFixedG6', 
+                                                          #'kFixedG1', 
+                                                          #'kFixedG0', 
+                                                          #'kNonRespondingIsolated',
+                                                          #'kDeadVFE', 
+                                                          #'kDeadFE', 
+                                                          #'kNoDataNoTP'
+                                                          #)
+#process.ecalRecHitProducerGPU.ChannelStatusToBeExcluded = cms.vstring(
+                                                          #'kDAC', 
+                                                          #'kNoisy', 
+                                                          #'kNNoisy', 
+                                                          #'kFixedG6', 
+                                                          #'kFixedG1', 
+                                                          #'kFixedG0', 
+                                                          #'kNonRespondingIsolated',
+                                                          #'kDeadVFE', 
+                                                          #'kDeadFE', 
+                                                          #'kNoDataNoTP'
+                                                          #)
+#
+#
+
+    #ChannelStatusToBeExcluded = cms.vstring(
+        #'kDAC', 
+        #'kNoisy', 
+        #'kNNoisy', 
+        #'kFixedG6', 
+        #'kFixedG1', 
+        #'kFixedG0', 
+        #'kNonRespondingIsolated', 
+        #'kDeadVFE', 
+        #'kDeadFE', 
+        #'kNoDataNoTP'
+    #),
+
+
+
+#process.hcalDigis.silent = cms.untracked.bool(False)
+#process.hcalDigis.InputLabel = rawTag
+process.ecalDigis = process.ecalEBunpacker.clone()  # register a copy of ecalEBunpacker under the label "ecalDigis"
+process.ecalDigis.InputLabel = cms.InputTag('rawDataCollector')  # the copy takes its input from 'rawDataCollector'
+#process.hbheprerecogpu.processQIE11 = cms.bool(True)
+
+process.out = cms.OutputModule(  # output module writing to testRechit.root
+    "PoolOutputModule",
+    fileName = cms.untracked.string("testRechit.root")
+)
+
+#process.out = cms.OutputModule("AsciiOutputModule",
+#    outputCommands = cms.untracked.vstring(
+#        'keep *_ecalMultiFitUncalibRecHit_*_*', 
+#    ),
+#    verbosity = cms.untracked.uint32(0)
+#)
+process.finalize = cms.EndPath(process.out)  # end path holding only the output module
+
+process.bunchSpacing = cms.Path(  # bunchSpacingProducer runs on a path of its own
+    process.bunchSpacingProducer
+)
+
+process.digiPath = cms.Path(  # digi step: ecalDigis clone, GPU raw-to-digi module, then ecalCPUDigisProducer
+    #process.hcalDigis
+    process.ecalDigis
+    *process.ecalRawToDigiGPU    
+    *process.ecalCPUDigisProducer
+)
+
+process.recoPath = cms.Path(  # ecalMultiFitUncalibRecHit and ecalRecHit modules first, then the GPU producers
+    (process.ecalMultiFitUncalibRecHit+process.ecalDetIdToBeRecovered)
+    #process.ecalMultiFitUncalibRecHit
+    *process.ecalRecHit
+#   gpu: uncalib rechit producer, then rechit producer, each followed by its ecalCPU*Producer
+    *process.ecalUncalibRecHitProducerGPU
+    *process.ecalCPUUncalibRecHitProducer
+    *process.ecalRecHitProducerGPU
+    *process.ecalCPURecHitProducer
+)
+
+process.schedule = cms.Schedule(  # explicit order: bunch spacing, digis, reco, output end path
+    process.bunchSpacing,
+    process.digiPath,
+    process.recoPath,
+#    process.ecalLocalRecoSequence
+    process.finalize
+)
+
+process.options = cms.untracked.PSet(  # job-wide options
+    numberOfThreads = cms.untracked.uint32(4),
+    numberOfStreams = cms.untracked.uint32(4),
+    SkipEvent = cms.untracked.vstring('ProductNotFound'),  # NOTE(review): presumably skips events raising ProductNotFound instead of aborting - confirm
+    wantSummary = cms.untracked.bool(True)
+)
+
+# add the "CUDAService" category to the MessageLogger categories so its messages are reported
+process.MessageLogger.categories.append("CUDAService")
+
+
+#
+#process.DependencyGraph = cms.Service("DependencyGraph")
+
+
diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py
new file mode 100644
index 0000000000000..ffb665d7bc96a
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py
@@ -0,0 +1,233 @@
+import FWCore.ParameterSet.Config as cms
+
+from Configuration.StandardSequences.Eras import eras
+#from Configuration.ProcessModifiers.gpu_cff import gpu
+
+process = cms.Process('RECO', eras.Run2_2018)  # process named 'RECO', created with the Run2_2018 era
+
+# Standard configurations: services, MessageLogger, CUDAService, geometry, magnetic field and global-tag conditions (the commented-out loads are disabled)
+process.load('Configuration.StandardSequences.Services_cff')
+#process.load('SimGeneral.HepPDTESSource.pythiapdt_cfi')
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
+#process.load('Configuration.EventContent.EventContent_cff')
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff')
+#process.load('Configuration.StandardSequences.RawToDigi_Data_cff')
+#process.load('Configuration.StandardSequences.Reconstruction_Data_cff')
+#process.load('DQMOffline.Configuration.DQMOffline_cff')
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
+
+
+
+
+
+
+# Conditions: select the global tag '102X_dataRun2_HLT_v2'
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, '102X_dataRun2_HLT_v2', '')
+
+# Limit the job to 1000 input events (the commented-out line is the alternative limit of 100)
+process.maxEvents = cms.untracked.PSet(
+    #input = cms.untracked.int32(100)
+    input = cms.untracked.int32(1000)
+)
+
+# Load the input source (described by the author as the DAQ source) from the module sourceFromRawCmggpu_cff
+import sys, os, inspect
+sys.path.append(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))))  # put this file's own directory on the import path
+process.load('sourceFromRawCmggpu_cff')
+
+#-----------------------------------------
+# Geometry, HCAL local reco, HCAL/ECAL raw-to-digi and bunch-spacing module imports
+#-----------------------------------------
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')  # also loaded earlier with the standard configurations
+process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff")
+#process.load("RecoLocalCalo.Configuration.ecalLocalRecoSequence_cff")
+process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi")
+process.load("EventFilter.EcalRawToDigi.EcalUnpackerData_cfi")
+process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi")
+
+# Load both the GPU (ecalUncalibRecHitProducerGPU) and the CPU (ecalMultiFitUncalibRecHit) uncalibrated-rechit plugins.
+# Path noted by the author for the GPU producer's cfi file:
+#   ../cfipython/slc7_amd64_gcc700/RecoLocalCalo/EcalRecProducers/ecalUncalibRecHitProducerGPU_cfi.py
+#
+process.load("RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi")
+
+# Used for validation of the GPU multifit products.
+process.load("RecoLocalCalo.EcalRecProducers.ecalCPUUncalibRecHitProducer_cfi")
+# Path noted by the author for this cfi file:
+#   ../cfipython/slc7_amd64_gcc700/RecoLocalCalo/EcalRecProducers/ecalCPUUncalibRecHitProducer_cfi.py
+#
+# ECAL raw-to-digi GPU module and the electronics-mapping GPU ESProducer
+process.load("EventFilter.EcalRawToDigi.ecalRawToDigiGPU_cfi")
+process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi")
+
+#process.ecalUncalibRecHitProducerGPU.kernelsVersion = 0
+#process.ecalUncalibRecHitProducerGPU.kernelMinimizeThreads = cms.vuint32(16, 1, 1)
+#
+# process.ecalUncalibRecHitProducerGPU.shouldRunTimingComputation = cms.bool(False)
+#
+
+# GPU ESProducers: pedestals, gain ratios, pulse shapes, pulse covariances, samples correlation, time bias corrections, time calib constants
+process.load("RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi")
+
+#process.ecalMultiFitUncalibRecHitgpu.algoPSet.threads = cms.vint32(256, 1, 1)
+
+# GPU ESProducer for the multifit parameters
+process.load("RecoLocalCalo.EcalRecProducers.ecalMultifitParametersGPUESProducer_cfi")
+
+
+
+##
+## force HLT configuration for ecalMultiFitUncalibRecHit
+##
+
+process.ecalMultiFitUncalibRecHit.algoPSet = cms.PSet( 
+      ebSpikeThreshold = cms.double( 1.042 ),
+      EBtimeFitLimits_Upper = cms.double( 1.4 ),
+      EEtimeFitLimits_Lower = cms.double( 0.2 ),
+      timealgo = cms.string( "None" ),   # ----> no timing computation for CPU version
+      EBtimeNconst = cms.double( 28.5 ),
+      prefitMaxChiSqEE = cms.double( 10.0 ),
+      outOfTimeThresholdGain12mEB = cms.double( 5.0 ),
+      outOfTimeThresholdGain12mEE = cms.double( 1000.0 ),
+      EEtimeFitParameters = cms.vdouble( -2.390548, 3.553628, -17.62341, 67.67538, -133.213, 140.7432, -75.41106, 16.20277 ),
+      prefitMaxChiSqEB = cms.double( 25.0 ),
+      simplifiedNoiseModelForGainSwitch = cms.bool( True ),
+      EBtimeFitParameters = cms.vdouble( -2.015452, 3.130702, -12.3473, 41.88921, -82.83944, 91.01147, -50.35761, 11.05621 ),
+      selectiveBadSampleCriteriaEB = cms.bool( False ),
+      dynamicPedestalsEB = cms.bool( False ),
+      useLumiInfoRunHeader = cms.bool( False ),
+      EBamplitudeFitParameters = cms.vdouble( 1.138, 1.652 ),
+      doPrefitEE = cms.bool( False ),
+      dynamicPedestalsEE = cms.bool( False ),
+      selectiveBadSampleCriteriaEE = cms.bool( False ),
+      outOfTimeThresholdGain61pEE = cms.double( 1000.0 ),
+      outOfTimeThresholdGain61pEB = cms.double( 5.0 ),
+      activeBXs = cms.vint32( -5, -4, -3, -2, -1, 0, 1, 2, 3, 4 ),
+      EcalPulseShapeParameters = cms.PSet( 
+        EEPulseShapeTemplate = cms.vdouble( 0.116442, 0.756246, 1.0, 0.897182, 0.686831, 0.491506, 0.344111, 0.245731, 0.174115, 0.123361, 0.0874288, 0.061957 ),
+        EEdigiCollection = cms.string( "" ),
+        EcalPreMixStage2 = cms.bool( False ),
+        EcalPreMixStage1 = cms.bool( False ),
+        EBPulseShapeCovariance = cms.vdouble( 3.001E-6, 1.233E-5, 0.0, -4.416E-6, -4.571E-6, -3.614E-6, -2.636E-6, -1.286E-6, -8.41E-7, -5.296E-7, 0.0, 0.0, 1.233E-5, 6.154E-5, 0.0, -2.2E-5, -2.309E-5, -1.838E-5, -1.373E-5, -7.334E-6, -5.088E-6, -3.745E-6, -2.428E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.416E-6, -2.2E-5, 0.0, 8.319E-6, 8.545E-6, 6.792E-6, 5.059E-6, 2.678E-6, 1.816E-6, 1.223E-6, 8.245E-7, 5.589E-7, -4.571E-6, -2.309E-5, 0.0, 8.545E-6, 9.182E-6, 7.219E-6, 5.388E-6, 2.853E-6, 1.944E-6, 1.324E-6, 9.083E-7, 6.335E-7, -3.614E-6, -1.838E-5, 0.0, 6.792E-6, 7.219E-6, 6.016E-6, 4.437E-6, 2.385E-6, 1.636E-6, 1.118E-6, 7.754E-7, 5.556E-7, -2.636E-6, -1.373E-5, 0.0, 5.059E-6, 5.388E-6, 4.437E-6, 3.602E-6, 1.917E-6, 1.322E-6, 9.079E-7, 6.529E-7, 4.752E-7, -1.286E-6, -7.334E-6, 0.0, 2.678E-6, 2.853E-6, 2.385E-6, 1.917E-6, 1.375E-6, 9.1E-7, 6.455E-7, 4.693E-7, 3.657E-7, -8.41E-7, -5.088E-6, 0.0, 1.816E-6, 1.944E-6, 1.636E-6, 1.322E-6, 9.1E-7, 9.115E-7, 6.062E-7, 4.436E-7, 3.422E-7, -5.296E-7, -3.745E-6, 0.0, 1.223E-6, 1.324E-6, 1.118E-6, 9.079E-7, 6.455E-7, 6.062E-7, 7.217E-7, 4.862E-7, 3.768E-7, 0.0, -2.428E-6, 0.0, 8.245E-7, 9.083E-7, 7.754E-7, 6.529E-7, 4.693E-7, 4.436E-7, 4.862E-7, 6.509E-7, 4.418E-7, 0.0, 0.0, 0.0, 5.589E-7, 6.335E-7, 5.556E-7, 4.752E-7, 3.657E-7, 3.422E-7, 3.768E-7, 4.418E-7, 6.142E-7 ),
+        ESdigiCollection = cms.string( "" ),
+        EBdigiCollection = cms.string( "" ),
+        EBCorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.73354, 0.64442, 0.58851, 0.55425, 0.53082, 0.51916, 0.51097, 0.50732, 0.50409 ),
+        EBCorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71073, 0.55721, 0.46089, 0.40449, 0.35931, 0.33924, 0.32439, 0.31581, 0.30481 ),
+        EBCorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.70946, 0.58021, 0.49846, 0.45006, 0.41366, 0.39699, 0.38478, 0.37847, 0.37055 ),
+        EEPulseShapeCovariance = cms.vdouble( 3.941E-5, 3.333E-5, 0.0, -1.449E-5, -1.661E-5, -1.424E-5, -1.183E-5, -6.842E-6, -4.915E-6, -3.411E-6, 0.0, 0.0, 3.333E-5, 2.862E-5, 0.0, -1.244E-5, -1.431E-5, -1.233E-5, -1.032E-5, -5.883E-6, -4.154E-6, -2.902E-6, -2.128E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.449E-5, -1.244E-5, 0.0, 5.84E-6, 6.649E-6, 5.72E-6, 4.812E-6, 2.708E-6, 1.869E-6, 1.33E-6, 9.186E-7, 6.446E-7, -1.661E-5, -1.431E-5, 0.0, 6.649E-6, 7.966E-6, 6.898E-6, 5.794E-6, 3.157E-6, 2.184E-6, 1.567E-6, 1.084E-6, 7.575E-7, -1.424E-5, -1.233E-5, 0.0, 5.72E-6, 6.898E-6, 6.341E-6, 5.347E-6, 2.859E-6, 1.991E-6, 1.431E-6, 9.839E-7, 6.886E-7, -1.183E-5, -1.032E-5, 0.0, 4.812E-6, 5.794E-6, 5.347E-6, 4.854E-6, 2.628E-6, 1.809E-6, 1.289E-6, 9.02E-7, 6.146E-7, -6.842E-6, -5.883E-6, 0.0, 2.708E-6, 3.157E-6, 2.859E-6, 2.628E-6, 1.863E-6, 1.296E-6, 8.882E-7, 6.108E-7, 4.283E-7, -4.915E-6, -4.154E-6, 0.0, 1.869E-6, 2.184E-6, 1.991E-6, 1.809E-6, 1.296E-6, 1.217E-6, 8.669E-7, 5.751E-7, 3.882E-7, -3.411E-6, -2.902E-6, 0.0, 1.33E-6, 1.567E-6, 1.431E-6, 1.289E-6, 8.882E-7, 8.669E-7, 9.522E-7, 6.717E-7, 4.293E-7, 0.0, -2.128E-6, 0.0, 9.186E-7, 1.084E-6, 9.839E-7, 9.02E-7, 6.108E-7, 5.751E-7, 6.717E-7, 7.911E-7, 5.493E-7, 0.0, 0.0, 0.0, 6.446E-7, 7.575E-7, 6.886E-7, 6.146E-7, 4.283E-7, 3.882E-7, 4.293E-7, 5.493E-7, 7.027E-7 ),
+        EBPulseShapeTemplate = cms.vdouble( 0.0113979, 0.758151, 1.0, 0.887744, 0.673548, 0.474332, 0.319561, 0.215144, 0.147464, 0.101087, 0.0693181, 0.0475044 ),
+        EECorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.72698, 0.62048, 0.55691, 0.51848, 0.49147, 0.47813, 0.47007, 0.46621, 0.46265 ),
+        EECorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71373, 0.44825, 0.30152, 0.21609, 0.14786, 0.11772, 0.10165, 0.09465, 0.08098 ),
+        UseLCcorrection = cms.untracked.bool( True ),
+        EECorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.71217, 0.47464, 0.34056, 0.26282, 0.20287, 0.17734, 0.16256, 0.15618, 0.14443 )
+      ),
+      doPrefitEB = cms.bool( False ),
+      addPedestalUncertaintyEE = cms.double( 0.0 ),
+      addPedestalUncertaintyEB = cms.double( 0.0 ),
+      gainSwitchUseMaxSampleEB = cms.bool( True ),
+      EEtimeNconst = cms.double( 31.8 ),
+      EEamplitudeFitParameters = cms.vdouble( 1.89, 1.4 ),
+      chi2ThreshEE_ = cms.double( 50.0 ),
+      eePulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ),
+      outOfTimeThresholdGain12pEB = cms.double( 5.0 ),
+      gainSwitchUseMaxSampleEE = cms.bool( False ),
+      mitigateBadSamplesEB = cms.bool( False ),
+      outOfTimeThresholdGain12pEE = cms.double( 1000.0 ),
+      ebPulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ),
+      ampErrorCalculation = cms.bool( False ),
+      mitigateBadSamplesEE = cms.bool( False ),
+      amplitudeThresholdEB = cms.double( 10.0 ),
+      kPoorRecoFlagEB = cms.bool( True ),
+      amplitudeThresholdEE = cms.double( 10.0 ),
+      EBtimeFitLimits_Lower = cms.double( 0.2 ),
+      kPoorRecoFlagEE = cms.bool( False ),
+      EEtimeFitLimits_Upper = cms.double( 1.4 ),
+      outOfTimeThresholdGain61mEE = cms.double( 1000.0 ),
+      EEtimeConstantTerm = cms.double( 1.0 ),
+      EBtimeConstantTerm = cms.double( 0.6 ),
+      chi2ThreshEB_ = cms.double( 65.0 ),
+      outOfTimeThresholdGain61mEB = cms.double( 5.0 )
+)     
+      
+##    
+    
+    
+    
+#process.load('Configuration.StandardSequences.Reconstruction_cff')
+#process.ecalRecHit
+
+    
+    
+#process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi")
+#process.ecalRecHitGPU
+    
+
+# ECAL digis: register a copy of ecalEBunpacker under the name ecalDigis and point its input at 'rawDataCollector'
+#process.hcalDigis.silent = cms.untracked.bool(False)
+#process.hcalDigis.InputLabel = rawTag
+process.ecalDigis = process.ecalEBunpacker.clone()
+process.ecalDigis.InputLabel = cms.InputTag('rawDataCollector')
+#process.hbheprerecogpu.processQIE11 = cms.bool(True)
+
+process.out = cms.OutputModule(  # output module; it is placed on the EndPath process.finalize
+    "PoolOutputModule",
+    fileName = cms.untracked.string("test_uncalib.root")  # name of the file that is written
+)
+
+#process.out = cms.OutputModule("AsciiOutputModule",
+#    outputCommands = cms.untracked.vstring(
+#        'keep *_ecalMultiFitUncalibRecHit_*_*', 
+#    ),
+#    verbosity = cms.untracked.uint32(0)
+#)
+process.finalize = cms.EndPath(process.out)  # EndPath that runs the output module process.out
+
+process.bunchSpacing = cms.Path(  # path containing only bunchSpacingProducer
+    process.bunchSpacingProducer
+)
+
+process.digiPath = cms.Path(  # digi chain: ecalDigis, then ecalRawToDigiGPU
+    #process.hcalDigis  (disabled)
+    process.ecalDigis
+    *process.ecalRawToDigiGPU    
+)
+
+process.recoPath = cms.Path(  # uncalibrated-rechit chain only; the rechit modules are commented out
+    #(process.ecalMultiFitUncalibRecHit+process.ecalDetIdToBeRecovered)  (disabled)
+    process.ecalMultiFitUncalibRecHit
+    #*process.ecalRecHit  (disabled)
+#   GPU producer, followed by the module loaded "for validation of gpu multifit products"
+    *process.ecalUncalibRecHitProducerGPU
+    *process.ecalCPUUncalibRecHitProducer
+    #*process.ecalRecHitGPU  (disabled)
+)
+
+process.schedule = cms.Schedule(  # explicit schedule: the three paths plus the output EndPath
+    process.bunchSpacing,
+    process.digiPath,
+    process.recoPath,
+#    process.ecalLocalRecoSequence  (disabled)
+    process.finalize
+)
+
+process.options = cms.untracked.PSet(  # framework job options
+    numberOfThreads = cms.untracked.uint32(8),  # number of threads
+    numberOfStreams = cms.untracked.uint32(8),  # number of event streams (set equal to the thread count here)
+    SkipEvent = cms.untracked.vstring('ProductNotFound'),  # exception category for which the event is skipped
+    wantSummary = cms.untracked.bool(True)  # request the framework summary report
+)
+
+# report CUDAService messages: add "CUDAService" to the MessageLogger categories
+process.MessageLogger.categories.append("CUDAService")
+
+