From 3b94f5cc4a121186773199b40dc87f5f4c69f284 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 14 Nov 2018 23:56:26 +0100
Subject: [PATCH 01/34] Synchronise with CMSSW_10_4_0_pre2

---
 .../HcalRawToDigi/plugins/BuildFile.xml       | 27 +++++------
 .../python/hcalGlobalReco_cff.py              |  7 ++-
 .../Configuration/python/hcalLocalReco_cff.py | 47 +++++++++----------
 RecoLocalCalo/HcalRecAlgos/BuildFile.xml      | 34 +++++++-------
 RecoLocalCalo/HcalRecProducers/BuildFile.xml  | 20 ++++----
 5 files changed, 66 insertions(+), 69 deletions(-)
diff --git a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
index ccf6a061119c2..2855afe777098 100644
--- a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
@@ -1,16 +1,15 @@
-<use name="DataFormats/HcalDetId"/>
-<use name="DataFormats/HcalDigi"/>
-<use name="DataFormats/FEDRawData"/>
-<use name="CondFormats/HcalObjects"/>
-<use name="CalibFormats/HcalObjects"/>
-<use name="FWCore/Framework"/>
-<use name="FWCore/MessageLogger"/>
-<use name="boost"/>
-<use name="zlib"/>
-<use name="EventFilter/HcalRawToDigi"/>
-<flags EDM_PLUGIN="1"/>
-<library file="HcalCalibFEDSelector.cc,HcalCalibTypeFilter.cc,HcalDigiToRaw.cc,HcalEmptyEventFilter.cc,HcalHistogramRawToDigi.cc,HcalRawToDigi.cc,modules.cc,HcalDigiToRawuHTR.cc,HcalRawToDigiFake.cc" name="EventFilterHcalRawToDigiPlugins">
+<use   name="DataFormats/HcalDetId"/>
+<use   name="DataFormats/HcalDigi"/>
+<use   name="DataFormats/FEDRawData"/>
+<use   name="CondFormats/HcalObjects"/>
+<use   name="CalibFormats/HcalObjects"/>
+<use   name="FWCore/Framework"/>
+<use   name="FWCore/MessageLogger"/>
+<use   name="boost"/>
+<use   name="zlib"/>
+<use   name="EventFilter/HcalRawToDigi"/>
+<flags   EDM_PLUGIN="1"/>
+<library   file="HcalCalibFEDSelector.cc,HcalCalibTypeFilter.cc,HcalDigiToRaw.cc,HcalEmptyEventFilter.cc,HcalHistogramRawToDigi.cc,HcalRawToDigi.cc,modules.cc,HcalDigiToRawuHTR.cc,HcalRawToDigiFake.cc" name="EventFilterHcalRawToDigiPlugins">
 </library>
-
-<library file="HcalLaserEventFiltProducer2012.cc, HcalLaserEventFilter2012.cc,HcalLaserHFFilter2012.cc,HcalLaserHBHEFilter2012.cc,HcalLaserHBHEHFFilter2012.cc" name="EventFilterHcalRawToDigiFiltersPlugins">
+<library   file="HcalLaserEventFiltProducer2012.cc, HcalLaserEventFilter2012.cc,HcalLaserHFFilter2012.cc,HcalLaserHBHEFilter2012.cc,HcalLaserHBHEHFFilter2012.cc" name="EventFilterHcalRawToDigiFiltersPlugins">
 </library>
diff --git a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
index 70207e36ba654..11e318a72b3e2 100644
--- a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
@@ -1,10 +1,9 @@
 import FWCore.ParameterSet.Config as cms
 
 from RecoLocalCalo.HcalRecProducers.HBHEIsolatedNoiseReflagger_cfi import *
-hcalGlobalRecoTask = cms.Task(hbhereco)
-hcalGlobalRecoSequence = cms.Sequence(hcalGlobalRecoTask)
+hcalGlobalRecoSequence = cms.Sequence(hbhereco)
 
 from RecoLocalCalo.HcalRecProducers.HBHEPhase1Reconstructor_cfi import hbheprereco as _phase1_hbheprereco
 
-from Configuration.Eras.Modifier_run3_HB_cff import run3_HB
-run3_HB.toReplaceWith( hbhereco, _phase1_hbheprereco ) # >=Run3
+from Configuration.Eras.Modifier_phase2_hcal_cff import phase2_hcal
+phase2_hcal.toReplaceWith( hbhereco, _phase1_hbheprereco )
diff --git a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
index 84e7498b7eb2d..d6f1582e56f93 100644
--- a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
@@ -1,61 +1,56 @@
 import FWCore.ParameterSet.Config as cms
 
 from RecoLocalCalo.HcalRecAlgos.hcalRecAlgoESProd_cfi import *
-from RecoLocalCalo.HcalRecAlgos.hcalChannelPropertiesESProd_cfi import *
 hcalOOTPileupESProducer = cms.ESProducer('OOTPileupDBCompatibilityESProducer')
 
 from RecoLocalCalo.HcalRecProducers.HBHEPhase1Reconstructor_cfi import hbheprereco as _phase1_hbheprereco
 hbheprereco = _phase1_hbheprereco.clone(
-    processQIE11 = False,
-    tsFromDB = True,
+    processQIE11 = cms.bool(False),
+    tsFromDB = cms.bool(True),
     pulseShapeParametersQIE8 = dict(
-        TrianglePeakTS = 4,
+        TrianglePeakTS = cms.uint32(4),
     )
 )
 
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_ho_cfi import *
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_hf_cfi import *
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_zdc_cfi import *
-hcalLocalRecoTask = cms.Task(hbheprereco,hfreco,horeco,zdcreco)
-hcalLocalRecoSequence = cms.Sequence(hcalLocalRecoTask)
+hcalLocalRecoSequence = cms.Sequence(hbheprereco+hfreco+horeco+zdcreco)
 
 from RecoLocalCalo.HcalRecProducers.hfprereco_cfi import hfprereco
 from RecoLocalCalo.HcalRecProducers.HFPhase1Reconstructor_cfi import hfreco as _phase1_hfreco
 from RecoLocalCalo.HcalRecProducers.hbheplan1_cfi import hbheplan1
 
-#--- for HCALonly wf
-hcalOnlyLocalRecoTask = cms.Task(hbheprereco,hfprereco,hfreco,horeco)
-
 # copy for cosmics
 _default_hfreco = hfreco.clone()
 
-#--- Phase1 
-_phase1_hcalLocalRecoTask = hcalLocalRecoTask.copy()
-_phase1_hcalLocalRecoTask.add(hfprereco)
+_phase1_hcalLocalRecoSequence = hcalLocalRecoSequence.copy()
+_phase1_hcalLocalRecoSequence.insert(0,hfprereco)
 
 from Configuration.Eras.Modifier_run2_HF_2017_cff import run2_HF_2017
-run2_HF_2017.toReplaceWith( hcalLocalRecoTask, _phase1_hcalLocalRecoTask )
+run2_HF_2017.toReplaceWith( hcalLocalRecoSequence, _phase1_hcalLocalRecoSequence )
 run2_HF_2017.toReplaceWith( hfreco, _phase1_hfreco )
 from Configuration.Eras.Modifier_run2_HCAL_2017_cff import run2_HCAL_2017
 run2_HCAL_2017.toReplaceWith( hbheprereco, _phase1_hbheprereco )
 
-_plan1_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy()
-_plan1_hcalLocalRecoTask.add(hbheplan1)
+_plan1_hcalLocalRecoSequence = _phase1_hcalLocalRecoSequence.copy()
+_plan1_hcalLocalRecoSequence += hbheplan1
 from Configuration.Eras.Modifier_run2_HEPlan1_2017_cff import run2_HEPlan1_2017
-run2_HEPlan1_2017.toReplaceWith(hcalLocalRecoTask, _plan1_hcalLocalRecoTask)
+run2_HEPlan1_2017.toReplaceWith(hcalLocalRecoSequence, _plan1_hcalLocalRecoSequence)
 
 hbhecollapse = hbheplan1.clone()
-_collapse_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy()
-_collapse_hcalLocalRecoTask.add(hbhecollapse)
+_collapse_hcalLocalRecoSequence = _phase1_hcalLocalRecoSequence.copy()
+_collapse_hcalLocalRecoSequence += hbhecollapse
 from Configuration.ProcessModifiers.run2_HECollapse_2018_cff import run2_HECollapse_2018
-run2_HECollapse_2018.toReplaceWith(hcalLocalRecoTask, _collapse_hcalLocalRecoTask)
+run2_HECollapse_2018.toReplaceWith(hcalLocalRecoSequence, _collapse_hcalLocalRecoSequence)
+
+_phase2_hcalLocalRecoSequence = hcalLocalRecoSequence.copy()
+_phase2_hcalLocalRecoSequence.remove(hbheprereco)
+
+from Configuration.Eras.Modifier_phase2_hcal_cff import phase2_hcal
+phase2_hcal.toReplaceWith( hcalLocalRecoSequence, _phase2_hcalLocalRecoSequence )
 
-#--- from >=Run3
-_run3_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy()
-_run3_hcalLocalRecoTask.remove(hbheprereco)
-from Configuration.Eras.Modifier_run3_HB_cff import run3_HB
-run3_HB.toReplaceWith( hcalLocalRecoTask, _run3_hcalLocalRecoTask )
 
-_fastSim_hcalLocalRecoTask = hcalLocalRecoTask.copyAndExclude([zdcreco])
+_fastSim_hcalLocalRecoSequence = hcalLocalRecoSequence.copyAndExclude([zdcreco])
 from Configuration.Eras.Modifier_fastSim_cff import fastSim
-fastSim.toReplaceWith( hcalLocalRecoTask, _fastSim_hcalLocalRecoTask )
+fastSim.toReplaceWith( hcalLocalRecoSequence, _fastSim_hcalLocalRecoSequence )
diff --git a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
index ac94d61e12494..b3b67a7ae8fe4 100644
--- a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
@@ -1,19 +1,19 @@
-<use name="boost"/>
-<use name="eigen"/>
-<use name="clhep"/>
-<use name="DataFormats/HcalDigi"/>
-<use name="DataFormats/HcalRecHit"/>
-<use name="DataFormats/TrackReco"/>
-<use name="CalibFormats/HcalObjects"/>
-<use name="CalibFormats/CaloObjects"/>
-<use name="CalibCalorimetry/HcalAlgos"/>
-<use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
-<use name="FWCore/MessageLogger"/>
-<use name="CondFormats/DataRecord"/>
-<use name="RecoLocalCalo/EcalRecAlgos"/>
-<use name="vdt_headers"/>
-<use name="rootminuit2"/>
+<use   name="boost"/>
+<use   name="eigen"/>
+<use   name="clhep"/>
+<use   name="DataFormats/HcalDigi"/>
+<use   name="DataFormats/HcalRecHit"/>
+<use   name="CalibFormats/HcalObjects"/>
+<use   name="CalibFormats/CaloObjects"/>
+<use   name="CalibCalorimetry/HcalAlgos"/>
+<use   name="RecoMET/METAlgorithms"/>
+<use   name="DataFormats/CaloTowers"/>
+<use   name="FWCore/Framework"/>
+<use   name="FWCore/PluginManager"/>
+<use   name="FWCore/ParameterSet"/>
+<use   name="CondFormats/DataRecord"/>
+<use   name="vdt_headers"/>
+<use   name="rootminuit2"/> 
 <export>
-  <lib name="1"/>
+  <lib   name="1"/>
 </export>
diff --git a/RecoLocalCalo/HcalRecProducers/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
index c3ae589a0c0a7..923620701895a 100644
--- a/RecoLocalCalo/HcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
@@ -1,8 +1,12 @@
-<flags EDM_PLUGIN="1"/>
-<use name="CalibCalorimetry/HcalAlgos"/>
-<use name="CalibFormats/HcalObjects"/>
-<use name="RecoLocalCalo/HcalRecAlgos"/>
-<use name="FWCore/Framework"/>
-<use name="DataFormats/Common"/>
-<use name="Geometry/Records"/>
-<use name="boost"/>
+<flags   EDM_PLUGIN="1"/>
+<use   name="CalibCalorimetry/HcalAlgos"/>
+<use   name="CalibFormats/HcalObjects"/>
+<use   name="RecoLocalCalo/HcalRecAlgos"/>
+<use   name="FWCore/Framework"/>
+<use   name="DataFormats/Common"/>
+<use   name="DataFormats/Math"/>
+<use   name="DataFormats/RecoCandidate"/>
+<use   name="Geometry/Records"/>
+<use   name="Geometry/CaloGeometry"/>
+<use   name="CondFormats/EcalObjects"/>
+<use   name="boost"/>

From 72d50abcf0d3fdc8a1f9902c8b7e9c336da4d978 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 15 May 2019 14:11:42 +0200
Subject: [PATCH 02/34] Synchronise with CMSSW_10_6_0

---
 RecoLocalCalo/HcalRecAlgos/BuildFile.xml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
index b3b67a7ae8fe4..8a8c538fc654e 100644
--- a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
@@ -11,6 +11,7 @@
 <use   name="FWCore/Framework"/>
 <use   name="FWCore/PluginManager"/>
 <use   name="FWCore/ParameterSet"/>
+<use   name="FWCore/MessageLogger"/>
 <use   name="CondFormats/DataRecord"/>
 <use   name="vdt_headers"/>
 <use   name="rootminuit2"/> 

From 58530e9ff6cc09ad65f2c7fb2b57be13bee27db3 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 12 Sep 2019 05:45:55 +0200
Subject: [PATCH 03/34] Synchronise with CMSSW_11_0_0_pre7

---
 .../Configuration/python/hcalLocalReco_cff.py | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
index d6f1582e56f93..057707c80534e 100644
--- a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
@@ -15,7 +15,8 @@
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_ho_cfi import *
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_hf_cfi import *
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_zdc_cfi import *
-hcalLocalRecoSequence = cms.Sequence(hbheprereco+hfreco+horeco+zdcreco)
+hcalLocalRecoTask = cms.Task(hbheprereco,hfreco,horeco,zdcreco)
+hcalLocalRecoSequence = cms.Sequence(hcalLocalRecoTask)
 
 from RecoLocalCalo.HcalRecProducers.hfprereco_cfi import hfprereco
 from RecoLocalCalo.HcalRecProducers.HFPhase1Reconstructor_cfi import hfreco as _phase1_hfreco
@@ -24,33 +25,33 @@
 # copy for cosmics
 _default_hfreco = hfreco.clone()
 
-_phase1_hcalLocalRecoSequence = hcalLocalRecoSequence.copy()
-_phase1_hcalLocalRecoSequence.insert(0,hfprereco)
+_phase1_hcalLocalRecoTask = hcalLocalRecoTask.copy()
+_phase1_hcalLocalRecoTask.add(hfprereco)
 
 from Configuration.Eras.Modifier_run2_HF_2017_cff import run2_HF_2017
-run2_HF_2017.toReplaceWith( hcalLocalRecoSequence, _phase1_hcalLocalRecoSequence )
+run2_HF_2017.toReplaceWith( hcalLocalRecoTask, _phase1_hcalLocalRecoTask )
 run2_HF_2017.toReplaceWith( hfreco, _phase1_hfreco )
 from Configuration.Eras.Modifier_run2_HCAL_2017_cff import run2_HCAL_2017
 run2_HCAL_2017.toReplaceWith( hbheprereco, _phase1_hbheprereco )
 
-_plan1_hcalLocalRecoSequence = _phase1_hcalLocalRecoSequence.copy()
-_plan1_hcalLocalRecoSequence += hbheplan1
+_plan1_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy()
+_plan1_hcalLocalRecoTask.add(hbheplan1)
 from Configuration.Eras.Modifier_run2_HEPlan1_2017_cff import run2_HEPlan1_2017
-run2_HEPlan1_2017.toReplaceWith(hcalLocalRecoSequence, _plan1_hcalLocalRecoSequence)
+run2_HEPlan1_2017.toReplaceWith(hcalLocalRecoTask, _plan1_hcalLocalRecoTask)
 
 hbhecollapse = hbheplan1.clone()
-_collapse_hcalLocalRecoSequence = _phase1_hcalLocalRecoSequence.copy()
-_collapse_hcalLocalRecoSequence += hbhecollapse
+_collapse_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy()
+_collapse_hcalLocalRecoTask.add(hbhecollapse)
 from Configuration.ProcessModifiers.run2_HECollapse_2018_cff import run2_HECollapse_2018
-run2_HECollapse_2018.toReplaceWith(hcalLocalRecoSequence, _collapse_hcalLocalRecoSequence)
+run2_HECollapse_2018.toReplaceWith(hcalLocalRecoTask, _collapse_hcalLocalRecoTask)
 
-_phase2_hcalLocalRecoSequence = hcalLocalRecoSequence.copy()
-_phase2_hcalLocalRecoSequence.remove(hbheprereco)
+_phase2_hcalLocalRecoTask = hcalLocalRecoTask.copy()
+_phase2_hcalLocalRecoTask.remove(hbheprereco)
 
 from Configuration.Eras.Modifier_phase2_hcal_cff import phase2_hcal
-phase2_hcal.toReplaceWith( hcalLocalRecoSequence, _phase2_hcalLocalRecoSequence )
+phase2_hcal.toReplaceWith( hcalLocalRecoTask, _phase2_hcalLocalRecoTask )
 
 
-_fastSim_hcalLocalRecoSequence = hcalLocalRecoSequence.copyAndExclude([zdcreco])
+_fastSim_hcalLocalRecoTask = hcalLocalRecoTask.copyAndExclude([zdcreco])
 from Configuration.Eras.Modifier_fastSim_cff import fastSim
-fastSim.toReplaceWith( hcalLocalRecoSequence, _fastSim_hcalLocalRecoSequence )
+fastSim.toReplaceWith( hcalLocalRecoTask, _fastSim_hcalLocalRecoTask )

From df660a58dab88221648f850386de980cbaf80d32 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 28 Nov 2019 14:15:13 +0100
Subject: [PATCH 04/34] Synchronise with CMSSW_11_0_0_pre12

---
 RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
index 11e318a72b3e2..a94a032602713 100644
--- a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
@@ -1,7 +1,8 @@
 import FWCore.ParameterSet.Config as cms
 
 from RecoLocalCalo.HcalRecProducers.HBHEIsolatedNoiseReflagger_cfi import *
-hcalGlobalRecoSequence = cms.Sequence(hbhereco)
+hcalGlobalRecoTask = cms.Task(hbhereco)
+hcalGlobalRecoSequence = cms.Sequence(hcalGlobalRecoTask)
 
 from RecoLocalCalo.HcalRecProducers.HBHEPhase1Reconstructor_cfi import hbheprereco as _phase1_hbheprereco
 

From 00b22a280bea5b5c0a8c4acb22b23a184de741e4 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 26 Mar 2020 19:09:30 +0100
Subject: [PATCH 05/34] Synchronise with CMSSW_11_1_0_pre5

---
 RecoLocalCalo/HcalRecAlgos/BuildFile.xml     | 5 ++---
 RecoLocalCalo/HcalRecProducers/BuildFile.xml | 4 ----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
index 8a8c538fc654e..6294d952f5056 100644
--- a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
@@ -3,16 +3,15 @@
 <use   name="clhep"/>
 <use   name="DataFormats/HcalDigi"/>
 <use   name="DataFormats/HcalRecHit"/>
+<use   name="DataFormats/TrackReco"/>
 <use   name="CalibFormats/HcalObjects"/>
 <use   name="CalibFormats/CaloObjects"/>
 <use   name="CalibCalorimetry/HcalAlgos"/>
-<use   name="RecoMET/METAlgorithms"/>
-<use   name="DataFormats/CaloTowers"/>
 <use   name="FWCore/Framework"/>
-<use   name="FWCore/PluginManager"/>
 <use   name="FWCore/ParameterSet"/>
 <use   name="FWCore/MessageLogger"/>
 <use   name="CondFormats/DataRecord"/>
+<use   name="RecoLocalCalo/EcalRecAlgos"/>
 <use   name="vdt_headers"/>
 <use   name="rootminuit2"/> 
 <export>
diff --git a/RecoLocalCalo/HcalRecProducers/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
index 923620701895a..935359b3063c9 100644
--- a/RecoLocalCalo/HcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
@@ -4,9 +4,5 @@
 <use   name="RecoLocalCalo/HcalRecAlgos"/>
 <use   name="FWCore/Framework"/>
 <use   name="DataFormats/Common"/>
-<use   name="DataFormats/Math"/>
-<use   name="DataFormats/RecoCandidate"/>
 <use   name="Geometry/Records"/>
-<use   name="Geometry/CaloGeometry"/>
-<use   name="CondFormats/EcalObjects"/>
 <use   name="boost"/>

From b906b1b8bd05440a6dce27c25ecf7d4c9391b598 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 11 May 2020 14:49:56 +0200
Subject: [PATCH 06/34] Synchronise with CMSSW_11_1_0_pre7

---
 .../HcalRawToDigi/plugins/BuildFile.xml       | 27 ++++++++-------
 RecoLocalCalo/HcalRecAlgos/BuildFile.xml      | 34 +++++++++----------
 RecoLocalCalo/HcalRecProducers/BuildFile.xml  | 16 ++++-----
 3 files changed, 39 insertions(+), 38 deletions(-)

diff --git a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
index 2855afe777098..ccf6a061119c2 100644
--- a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
@@ -1,15 +1,16 @@
-<use   name="DataFormats/HcalDetId"/>
-<use   name="DataFormats/HcalDigi"/>
-<use   name="DataFormats/FEDRawData"/>
-<use   name="CondFormats/HcalObjects"/>
-<use   name="CalibFormats/HcalObjects"/>
-<use   name="FWCore/Framework"/>
-<use   name="FWCore/MessageLogger"/>
-<use   name="boost"/>
-<use   name="zlib"/>
-<use   name="EventFilter/HcalRawToDigi"/>
-<flags   EDM_PLUGIN="1"/>
-<library   file="HcalCalibFEDSelector.cc,HcalCalibTypeFilter.cc,HcalDigiToRaw.cc,HcalEmptyEventFilter.cc,HcalHistogramRawToDigi.cc,HcalRawToDigi.cc,modules.cc,HcalDigiToRawuHTR.cc,HcalRawToDigiFake.cc" name="EventFilterHcalRawToDigiPlugins">
+<use name="DataFormats/HcalDetId"/>
+<use name="DataFormats/HcalDigi"/>
+<use name="DataFormats/FEDRawData"/>
+<use name="CondFormats/HcalObjects"/>
+<use name="CalibFormats/HcalObjects"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/MessageLogger"/>
+<use name="boost"/>
+<use name="zlib"/>
+<use name="EventFilter/HcalRawToDigi"/>
+<flags EDM_PLUGIN="1"/>
+<library file="HcalCalibFEDSelector.cc,HcalCalibTypeFilter.cc,HcalDigiToRaw.cc,HcalEmptyEventFilter.cc,HcalHistogramRawToDigi.cc,HcalRawToDigi.cc,modules.cc,HcalDigiToRawuHTR.cc,HcalRawToDigiFake.cc" name="EventFilterHcalRawToDigiPlugins">
 </library>
-<library   file="HcalLaserEventFiltProducer2012.cc, HcalLaserEventFilter2012.cc,HcalLaserHFFilter2012.cc,HcalLaserHBHEFilter2012.cc,HcalLaserHBHEHFFilter2012.cc" name="EventFilterHcalRawToDigiFiltersPlugins">
+
+<library file="HcalLaserEventFiltProducer2012.cc, HcalLaserEventFilter2012.cc,HcalLaserHFFilter2012.cc,HcalLaserHBHEFilter2012.cc,HcalLaserHBHEHFFilter2012.cc" name="EventFilterHcalRawToDigiFiltersPlugins">
 </library>
diff --git a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
index 6294d952f5056..ac94d61e12494 100644
--- a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
@@ -1,19 +1,19 @@
-<use   name="boost"/>
-<use   name="eigen"/>
-<use   name="clhep"/>
-<use   name="DataFormats/HcalDigi"/>
-<use   name="DataFormats/HcalRecHit"/>
-<use   name="DataFormats/TrackReco"/>
-<use   name="CalibFormats/HcalObjects"/>
-<use   name="CalibFormats/CaloObjects"/>
-<use   name="CalibCalorimetry/HcalAlgos"/>
-<use   name="FWCore/Framework"/>
-<use   name="FWCore/ParameterSet"/>
-<use   name="FWCore/MessageLogger"/>
-<use   name="CondFormats/DataRecord"/>
-<use   name="RecoLocalCalo/EcalRecAlgos"/>
-<use   name="vdt_headers"/>
-<use   name="rootminuit2"/> 
+<use name="boost"/>
+<use name="eigen"/>
+<use name="clhep"/>
+<use name="DataFormats/HcalDigi"/>
+<use name="DataFormats/HcalRecHit"/>
+<use name="DataFormats/TrackReco"/>
+<use name="CalibFormats/HcalObjects"/>
+<use name="CalibFormats/CaloObjects"/>
+<use name="CalibCalorimetry/HcalAlgos"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/ParameterSet"/>
+<use name="FWCore/MessageLogger"/>
+<use name="CondFormats/DataRecord"/>
+<use name="RecoLocalCalo/EcalRecAlgos"/>
+<use name="vdt_headers"/>
+<use name="rootminuit2"/>
 <export>
-  <lib   name="1"/>
+  <lib name="1"/>
 </export>
diff --git a/RecoLocalCalo/HcalRecProducers/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
index 935359b3063c9..c3ae589a0c0a7 100644
--- a/RecoLocalCalo/HcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
@@ -1,8 +1,8 @@
-<flags   EDM_PLUGIN="1"/>
-<use   name="CalibCalorimetry/HcalAlgos"/>
-<use   name="CalibFormats/HcalObjects"/>
-<use   name="RecoLocalCalo/HcalRecAlgos"/>
-<use   name="FWCore/Framework"/>
-<use   name="DataFormats/Common"/>
-<use   name="Geometry/Records"/>
-<use   name="boost"/>
+<flags EDM_PLUGIN="1"/>
+<use name="CalibCalorimetry/HcalAlgos"/>
+<use name="CalibFormats/HcalObjects"/>
+<use name="RecoLocalCalo/HcalRecAlgos"/>
+<use name="FWCore/Framework"/>
+<use name="DataFormats/Common"/>
+<use name="Geometry/Records"/>
+<use name="boost"/>

From 815e8feeee3309f3d13ab71b1314daf5483a7969 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 5 Jun 2020 08:48:44 +0200
Subject: [PATCH 07/34] Implement HCAL local reconstruction on GPUs
 (cms-patatrack#468)

Implement full HCAL local reconstruction on GPU:
  - HCAL RAW to DIGI unpacking on GPU;
  - HCAL RecHits reconstruction on GPU, with "Method 0" and "MAHI";
  - corresponding EventSetup modules for providing conditions data on GPU.

Implement various conversions between GPU and CPU collections to allow
the validation of GPU vs CPU results, run the unpacker on CPU and copy
the unpacked data to the GPU, etc.

Co-authored-by: Andrea Bocci <andrea.bocci@cern.ch>
---
 CUDADataFormats/HcalDigi/BuildFile.xml        |   8 +
 .../HcalDigi/interface/DigiCollection.h       | 187 ++++++
 CUDADataFormats/HcalDigi/src/classes.h        |  38 ++
 CUDADataFormats/HcalDigi/src/classes_def.xml  |  60 ++
 CUDADataFormats/HcalRecHitSoA/BuildFile.xml   |   7 +
 .../interface/RecHitCollection.h              |  38 ++
 CUDADataFormats/HcalRecHitSoA/src/classes.h   |  14 +
 .../HcalRecHitSoA/src/classes_def.xml         |  16 +
 EventFilter/HcalRawToDigi/bin/BuildFile.xml   |   8 +
 .../makeHcalRaw2DigiGpuValidationPlots.cpp    | 407 ++++++++++++
 .../HcalRawToDigi/plugins/BuildFile.xml       |  22 +-
 .../HcalRawToDigi/plugins/DeclsForKernels.h   | 148 +++++
 .../HcalRawToDigi/plugins/DecodeGPU.cu        | 613 ++++++++++++++++++
 EventFilter/HcalRawToDigi/plugins/DecodeGPU.h |  23 +
 .../plugins/ElectronicsMappingGPU.cc          |  64 ++
 .../plugins/ElectronicsMappingGPU.h           |  51 ++
 .../plugins/HcalCPUDigisProducer.cc           | 191 ++++++
 .../plugins/HcalDigisProducerGPU.cc           | 272 ++++++++
 .../plugins/HcalESProducerGPUDefs.cc          |  12 +
 .../HcalRawToDigi/plugins/HcalRawToDigiGPU.cc | 206 ++++++
 RecoLocalCalo/HcalRecAlgos/BuildFile.xml      |  24 +-
 RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml  |   2 +
 .../HcalRecAlgos/bin/generateQIEShapes.cc     |  82 +++
 .../HcalRecoParamsWithPulseShapesGPU.h        |  57 ++
 .../src/HcalRecoParamsWithPulseShapesGPU.cc   | 222 +++++++
 RecoLocalCalo/HcalRecProducers/BuildFile.xml  |  16 +-
 .../HcalRecProducers/bin/BuildFile.xml        |   7 +
 .../bin/makeHcalRecHitGpuValidationPlots.cpp  | 282 ++++++++
 .../src/HBHERecHitProducerGPU.cc              | 276 ++++++++
 .../src/HcalCPURecHitsProducer.cc             | 109 ++++
 .../src/HcalESProducersGPUDefs.cc             | 132 ++++
 31 files changed, 3576 insertions(+), 18 deletions(-)
 create mode 100644 CUDADataFormats/HcalDigi/BuildFile.xml
 create mode 100644 CUDADataFormats/HcalDigi/interface/DigiCollection.h
 create mode 100644 CUDADataFormats/HcalDigi/src/classes.h
 create mode 100644 CUDADataFormats/HcalDigi/src/classes_def.xml
 create mode 100644 CUDADataFormats/HcalRecHitSoA/BuildFile.xml
 create mode 100644 CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
 create mode 100644 CUDADataFormats/HcalRecHitSoA/src/classes.h
 create mode 100644 CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
 create mode 100644 EventFilter/HcalRawToDigi/bin/BuildFile.xml
 create mode 100644 EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
 create mode 100644 EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
 create mode 100644 EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
 create mode 100644 EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
 create mode 100644 EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
 create mode 100644 EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
 create mode 100644 EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
 create mode 100644 EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
 create mode 100644 EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
 create mode 100644 EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
 create mode 100644 RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml
 create mode 100644 RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc
 create mode 100644 RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h
 create mode 100644 RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
 create mode 100644 RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml
 create mode 100644 RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp
 create mode 100644 RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
 create mode 100644 RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
 create mode 100644 RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc

diff --git a/CUDADataFormats/HcalDigi/BuildFile.xml b/CUDADataFormats/HcalDigi/BuildFile.xml
new file mode 100644
index 0000000000000..8feae467742c0
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/BuildFile.xml
@@ -0,0 +1,8 @@
+<use name="DataFormats/Common"/>
+<use name="CUDADataFormats/HcalCommon"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="cuda"/>
+
+<export>
+  <lib   name="1"/>
+</export>
diff --git a/CUDADataFormats/HcalDigi/interface/DigiCollection.h b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
new file mode 100644
index 0000000000000..cbb9da410f801
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
@@ -0,0 +1,187 @@
+#ifndef CUDADataFormats_HcalDigi_interface_DigiCollection_h
+#define CUDADataFormats_HcalDigi_interface_DigiCollection_h
+
+#include "CUDADataFormats/HcalCommon/interface/Common.h"
+
+namespace hcal {
+
+  struct Flavor01 {  // one 16-bit word per sample: ADC in bits 0-7, TDC in bits 8-13, soibit in bit 14
+    using adc_type = uint8_t;
+    using tdc_type = uint8_t;
+    using soibit_type = uint8_t;
+
+    static constexpr int WORDS_PER_SAMPLE = 1;  // counted in uint16_t words
+    static constexpr int HEADER_WORDS = 1;  // uint16_t words skipped before the first sample
+
+    static constexpr adc_type adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
+
+    static constexpr tdc_type tdc(uint16_t const* const sample_start) { return (*sample_start >> 8) & 0x3f; }
+
+    static constexpr soibit_type soibit(uint16_t const* const sample_start) { return (*sample_start >> 14) & 0x1; }
+  };
+
+  struct Flavor2 {  // layout constants only; no per-sample field accessors are defined for this flavor here
+    static constexpr int WORDS_PER_SAMPLE = 2;  // counted in uint16_t words
+    static constexpr int HEADER_WORDS = 1;  // uint16_t words skipped before the first sample
+  };
+
+  struct Flavor3 {  // one 16-bit word per sample: ADC bits 0-7, TDC bits 8-9, capid bits 10-11, soibit bit 14
+    using adc_type = uint8_t;
+    using tdc_type = uint8_t;
+    using soibit_type = uint8_t;
+
+    static constexpr int WORDS_PER_SAMPLE = 1;  // counted in uint16_t words
+    static constexpr int HEADER_WORDS = 1;  // uint16_t words skipped before the first sample
+
+    static constexpr adc_type adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
+
+    static constexpr tdc_type tdc(uint16_t const* const sample_start) { return ((*sample_start >> 8) & 0x3); }
+
+    static constexpr soibit_type soibit(uint16_t const* const sample_start) { return ((*sample_start >> 14) & 0x1); }
+
+    static constexpr uint8_t capid(uint16_t const* const sample_start) { return ((*sample_start >> 10) & 0x3); }
+  };
+
+  struct Flavor4 {  // layout constants only; no per-sample field accessors are defined for this flavor here
+    static constexpr int WORDS_PER_SAMPLE = 1;  // counted in uint16_t words
+    static constexpr int HEADER_WORDS = 1;  // uint16_t words skipped before the first sample
+  };
+
+  struct Flavor5 {  // two samples packed into each 16-bit word, one per byte; only an ADC accessor is provided
+    using adc_type = uint8_t;
+
+    static constexpr float WORDS_PER_SAMPLE = 0.5;  // fractional, hence a float where other flavors use int
+    static constexpr int SAMPLES_PER_WORD = 2;
+    static constexpr int HEADER_WORDS = 1;  // uint16_t words skipped before the first sample
+
+    static constexpr adc_type adc(uint16_t const* const sample_start, uint8_t const shifter) {
+      return ((*sample_start >> shifter * 8) & 0x7f);  // shifter 0 -> low byte, 1 -> high byte; 7-bit value
+    }
+  };
+
+  template <typename Flavor>  // generic version: derive the capid of sample number `sample` from the first word
+  constexpr uint8_t capid_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+    auto const capid_first = (*dfstart >> 8) & 0x3;  // bits 8-9 of the first word of the data frame
+    return (capid_first + sample) & 0x3;  // advance by one per sample, wrapping modulo 4
+  }
+
+  template <>  // Flavor3 carries a capid field in every sample word, so it is read directly from that word
+  constexpr uint8_t capid_for_sample<Flavor3>(uint16_t const* const dfstart, uint32_t const sample) {
+    return Flavor3::capid(dfstart + Flavor3::HEADER_WORDS + sample * Flavor3::WORDS_PER_SAMPLE);
+  }
+
+  template <typename Flavor>  // soibit of sample number `sample` in the data frame that starts at dfstart
+  constexpr typename Flavor::soibit_type soibit_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+    return Flavor::soibit(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
+  }
+
+  template <typename Flavor>  // ADC value of sample number `sample` in the data frame that starts at dfstart
+  constexpr typename Flavor::adc_type adc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+    return Flavor::adc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
+  }
+
+  template <typename Flavor>  // TDC value of sample number `sample` in the data frame that starts at dfstart
+  constexpr typename Flavor::tdc_type tdc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+    return Flavor::tdc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
+  }
+
+  template <>
+  constexpr Flavor5::adc_type adc_for_sample<Flavor5>(uint16_t const* const dfstart, uint32_t const sample) {
+    // WORDS_PER_SAMPLE is fractional here: sample >> 1 selects the word, sample % 2 the byte inside it
+    return Flavor5::adc(dfstart + Flavor5::HEADER_WORDS + (sample >> 1), sample % 2);
+  }
+
+  template <typename Flavor>  // uint16_t words occupied by one data frame of nsamples samples, header included
+  constexpr uint32_t compute_stride(uint32_t const nsamples) {
+    return static_cast<uint32_t>(nsamples * Flavor::WORDS_PER_SAMPLE) + Flavor::HEADER_WORDS;  // cast truncates
+  }
+
+  template <typename Flavor>  // number of samples held by a data frame that is nwords uint16_t words long
+  constexpr uint32_t compute_nsamples(uint32_t const nwords) {
+    return (nwords - Flavor::HEADER_WORDS) / Flavor::WORDS_PER_SAMPLE;  // NOTE(review): unsigned, wraps if nwords < header
+  }
+
+  template <>  // Flavor5 multiplies by the integer SAMPLES_PER_WORD instead of dividing by the float 0.5
+  constexpr uint32_t compute_nsamples<Flavor5>(uint32_t const nwords) {
+    return (nwords - Flavor5::HEADER_WORDS) * Flavor5::SAMPLES_PER_WORD;
+  }
+
+  // Storage shared by all flavors: one uint32_t id per entry and `stride` uint16_t data words per entry.
+  template <typename StoragePolicy>
+  struct DigiCollectionBase : public common::AddSize<typename StoragePolicy::TagType> {
+    DigiCollectionBase() = default;
+    DigiCollectionBase(DigiCollectionBase const&) = default;
+    DigiCollectionBase& operator=(DigiCollectionBase const&) = default;
+
+    DigiCollectionBase(DigiCollectionBase&&) = default;
+    DigiCollectionBase& operator=(DigiCollectionBase&&) = default;
+
+    template <typename T = typename StoragePolicy::TagType>  // enabled only when the tag is common::tags::Vec
+    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type resize(std::size_t size) {
+      ids.resize(size);
+      data.resize(size * stride);  // uses the current value of stride
+    }
+
+    template <typename T = typename StoragePolicy::TagType>  // enabled only when the tag is common::tags::Vec
+    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type reserve(std::size_t size) {
+      ids.reserve(size);
+      data.reserve(size * stride);  // uses the current value of stride
+    }
+
+    template <typename T = typename StoragePolicy::TagType>  // enabled only when the tag is common::tags::Vec
+    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type clear() {
+      ids.clear();
+      data.clear();
+    }
+
+    typename StoragePolicy::template StorageSelector<uint32_t>::type ids;
+    typename StoragePolicy::template StorageSelector<uint16_t>::type data;
+    uint32_t stride;  // NOTE(review): no default initializer; must be assigned before resize()/reserve() read it
+  };
+
+  template <typename Flavor, typename StoragePolicy>  // primary template: adds nothing to the base storage
+  struct DigiCollection : public DigiCollectionBase<StoragePolicy> {
+    using DigiCollectionBase<StoragePolicy>::DigiCollectionBase;  // inherit the base class constructors
+  };
+
+  // NOTE: unlike the primary template, this specialization does not inherit the base class constructors
+  template <typename StoragePolicy>
+  struct DigiCollection<Flavor5, StoragePolicy> : public DigiCollectionBase<StoragePolicy> {
+    DigiCollection() = default;
+    //DigiCollection(
+    //        uint32_t *ids, uint16_t *data, uint8_t *presamples,
+    //        uint32_t ndigis, uint32_t stride)
+    //    : DigiCollectionBase(ids, data, ndigis, stride)
+    //    , npresamples{npresamples}
+    //{}
+    DigiCollection(DigiCollection const&) = default;
+    DigiCollection& operator=(DigiCollection const&) = default;
+
+    DigiCollection(DigiCollection&&) = default;
+    DigiCollection& operator=(DigiCollection&&) = default;
+
+    template <typename T = typename StoragePolicy::TagType>  // enabled only when the tag is common::tags::Vec
+    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type resize(std::size_t size) {
+      DigiCollectionBase<StoragePolicy>::resize(size);
+      npresamples.resize(size);  // one npresamples entry per id
+    }
+
+    template <typename T = typename StoragePolicy::TagType>  // enabled only when the tag is common::tags::Vec
+    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type reserve(std::size_t size) {
+      DigiCollectionBase<StoragePolicy>::reserve(size);
+      npresamples.reserve(size);
+    }
+
+    template <typename T = typename StoragePolicy::TagType>  // enabled only when the tag is common::tags::Vec
+    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type clear() {
+      DigiCollectionBase<StoragePolicy>::clear();
+      npresamples.clear();
+    }
+
+    // extra uint8_t storage that only this specialization has; sized together with ids by the methods above
+    typename StoragePolicy::template StorageSelector<uint8_t>::type npresamples;
+  };
+
+}  // namespace hcal
+
+#endif  // CUDADataFormats_HcalDigi_interface_DigiCollection_h
diff --git a/CUDADataFormats/HcalDigi/src/classes.h b/CUDADataFormats/HcalDigi/src/classes.h
new file mode 100644
index 0000000000000..f00f8cf7dbdf4
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/src/classes.h
@@ -0,0 +1,38 @@
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+
+namespace hcal {
+
+  // instantiate what we know will be used: every flavor, first with ViewStoragePolicy
+  template struct DigiCollection<Flavor01, common::ViewStoragePolicy>;
+
+  template struct DigiCollection<Flavor2, common::ViewStoragePolicy>;
+
+  template struct DigiCollection<Flavor3, common::ViewStoragePolicy>;
+
+  template struct DigiCollection<Flavor4, common::ViewStoragePolicy>;
+
+  template struct DigiCollection<Flavor5, common::ViewStoragePolicy>;
+
+  template struct DigiCollection<Flavor01, common::VecStoragePolicy<std::allocator>>;  // then with std::allocator
+
+  template struct DigiCollection<Flavor2, common::VecStoragePolicy<std::allocator>>;
+
+  template struct DigiCollection<Flavor3, common::VecStoragePolicy<std::allocator>>;
+
+  template struct DigiCollection<Flavor4, common::VecStoragePolicy<std::allocator>>;
+
+  template struct DigiCollection<Flavor5, common::VecStoragePolicy<std::allocator>>;
+
+  template struct DigiCollection<Flavor01, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;  // and the CUDA host allocator
+
+  template struct DigiCollection<Flavor2, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
+
+  template struct DigiCollection<Flavor3, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
+
+  template struct DigiCollection<Flavor4, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
+
+  template struct DigiCollection<Flavor5, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
+
+}  // namespace hcal
diff --git a/CUDADataFormats/HcalDigi/src/classes_def.xml b/CUDADataFormats/HcalDigi/src/classes_def.xml
new file mode 100644
index 0000000000000..18c1e5a09fd10
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/src/classes_def.xml
@@ -0,0 +1,60 @@
+<lcgdict>
+    <class name="std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t, 0>>" />
+    <class name="std::vector<uint16_t, cms::cuda::HostAllocator<uint16_t, 0>>" />
+    <class name="std::vector<uint8_t, cms::cuda::HostAllocator<uint8_t, 0>>" />
+            
+    <class name="hcal::DigiCollectionBase<hcal::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollectionBase<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
+
+    <!--
+    <class name="hcal::DigiCollectionBase<hcal::Flavor01, hcal::common::ViewStoragePolicy>" />
+    <class name="hcal::DigiCollectionBase<hcal::Flavor2, hcal::common::ViewStoragePolicy>" />
+    <class name="hcal::DigiCollectionBase<hcal::Flavor3, hcal::common::ViewStoragePolicy>" />
+    <class name="hcal::DigiCollectionBase<hcal::Flavor4, hcal::common::ViewStoragePolicy>" />
+    <class name="hcal::DigiCollectionBase<hcal::Flavor5, hcal::common::ViewStoragePolicy>" />
+    -->
+        
+    <class name="hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor2, hcal::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor4, hcal::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<std::allocator>>" />
+            
+    <class name="hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor2, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor4, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
+
+    <!--
+    <class name="hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>" />
+    <class name="hcal::DigiCollection<hcal::Flavor2, hcal::common::ViewStoragePolicy>" />
+    <class name="hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>" />
+    <class name="hcal::DigiCollection<hcal::Flavor4, hcal::common::ViewStoragePolicy>" />
+    <class name="hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>" />
+    -->
+
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, hcal::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, hcal::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>" persistent="false" />
+            
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, hcal::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, hcal::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>>" persistent="false" />
+
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor2, hcal::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor4, hcal::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<std::allocator>>>" />
+                
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor2, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor4, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
+</lcgdict>
diff --git a/CUDADataFormats/HcalRecHitSoA/BuildFile.xml b/CUDADataFormats/HcalRecHitSoA/BuildFile.xml
new file mode 100644
index 0000000000000..245701de5fdb0
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/BuildFile.xml
@@ -0,0 +1,7 @@
+<use name="DataFormats/Common"/>
+<use name="CUDADataFormats/Common" />
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
+<export>
+  <lib   name="1"/>
+</export>
diff --git a/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
new file mode 100644
index 0000000000000..a17c8a51073be
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
@@ -0,0 +1,38 @@
+#ifndef CUDADataFormats_HcalRecHitCollectionSoA_interface_RecHitCollection_h
+#define CUDADataFormats_HcalRecHitCollectionSoA_interface_RecHitCollection_h
+
+#include <vector>
+
+#include "CUDADataFormats/HcalCommon/interface/Common.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
+namespace hcal {
+
+  template <typename StoragePolicy>  // StoragePolicy picks the container type of every field below
+  struct RecHitCollection : public common::AddSize<typename StoragePolicy::TagType> {
+    RecHitCollection() = default;
+    RecHitCollection(const RecHitCollection&) = default;
+    RecHitCollection& operator=(const RecHitCollection&) = default;
+
+    RecHitCollection(RecHitCollection&&) = default;
+    RecHitCollection& operator=(RecHitCollection&&) = default;
+
+    typename StoragePolicy::template StorageSelector<float>::type energy;  // one separate storage per quantity
+    typename StoragePolicy::template StorageSelector<float>::type chi2;
+    typename StoragePolicy::template StorageSelector<float>::type energyM0;
+    typename StoragePolicy::template StorageSelector<float>::type timeM0;
+    typename StoragePolicy::template StorageSelector<uint32_t>::type did;  // NOTE(review): presumably the detector id - confirm
+
+    template <typename U = typename StoragePolicy::TagType>  // enabled only when TagType is common::tags::Vec
+    typename std::enable_if<std::is_same<U, common::tags::Vec>::value, void>::type resize(size_t size) {
+      energy.resize(size);  // all five storages are resized to the same length
+      chi2.resize(size);
+      energyM0.resize(size);
+      timeM0.resize(size);
+      did.resize(size);
+    }
+  };
+
+}  // namespace hcal
+
+#endif  // CUDADataFormats_HcalRecHitCollectionSoA_interface_RecHitCollection_h
diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes.h b/CUDADataFormats/HcalRecHitSoA/src/classes.h
new file mode 100644
index 0000000000000..91035e8384117
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/src/classes.h
@@ -0,0 +1,14 @@
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+
+namespace hcal {
+
+  // explicit template instantiations, one per storage policy used with RecHitCollection
+  template struct RecHitCollection<common::ViewStoragePolicy>;
+
+  template struct RecHitCollection<common::VecStoragePolicy<std::allocator>>;
+
+  template struct RecHitCollection<common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
+
+}  // namespace hcal
diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
new file mode 100644
index 0000000000000..d747618168c22
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
@@ -0,0 +1,16 @@
+<lcgdict>
+    <class name="std::vector<float, cms::cuda::HostAllocator<float, 0>>" />
+    <class name="std::vector<int, cms::cuda::HostAllocator<int, 0>>" />
+    <class name="std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t, 0>>" />
+
+
+    <class name="hcal::RecHitCollection<hcal::common::VecStoragePolicy<std::allocator>>"/>
+    <class name="hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>"/>
+    <class name="edm::Wrapper<hcal::RecHitCollection<hcal::common::VecStoragePolicy<std::allocator>>>"/>
+    <class name="edm::Wrapper<hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>"/>
+            
+            
+
+    <class name="cms::cuda::Product<hcal::RecHitCollection<hcal::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::RecHitCollection<hcal::common::ViewStoragePolicy>>>" persistent="false" />
+</lcgdict>
diff --git a/EventFilter/HcalRawToDigi/bin/BuildFile.xml b/EventFilter/HcalRawToDigi/bin/BuildFile.xml
new file mode 100644
index 0000000000000..7a24968df89c8
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/bin/BuildFile.xml
@@ -0,0 +1,8 @@
+<bin name="makeHcalRaw2DigiGpuValidationPlots" file="makeHcalRaw2DigiGpuValidationPlots.cpp">
+    <use name="root"/>
+    <use name="rootgraphics"/>
+    <use name="DataFormats/Common"/>
+    <use name="DataFormats/HcalDigi"/>
+    <use name="DataFormats/HcalDetId"/>
+    <use name="cuda"/>
+</bin>
diff --git a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
new file mode 100644
index 0000000000000..ff6a820a3525d
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
@@ -0,0 +1,407 @@
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <TCanvas.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TTree.h>
+#include <TPaveStats.h>
+
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+
+#define CREATE_HIST_1D(varname, nbins, first, last) auto varname = new TH1D(#varname, #varname, nbins, first, last)
+
+#define CREATE_HIST_2D(varname, nbins, first, last) \
+  auto varname = new TH2D(#varname, #varname, nbins, first, last, nbins, first, last)
+
+// Select the HCAL endcap (HE) frames of `coll`: returns a new collection holding every frame
+// whose detector id has subdetId() == HcalEndcap, built with the input's samples/subdetId.
+QIE11DigiCollection filterQIE11(QIE11DigiCollection const& coll) {
+  // size the frames like the input does, as the filter lambdas in main() do
+  QIE11DigiCollection out{coll.samples(), coll.subdetId()};
+  out.reserve(coll.size());
+  for (uint32_t i = 0; i < coll.size(); i++) {
+    auto const df = QIE11DataFrame{coll[i]};
+    auto const id = HcalDetId{df.id()};
+    if (id.subdetId() != HcalEndcap)
+      continue;
+    out.push_back(df);
+  }
+  return out;
+}
+
+// Validate the GPU-unpacked HCAL digis against the legacy CPU digis: for every entry of the
+// input "Events" tree, digis are matched by detector id and compared sample by sample
+// (asserting equal ADC/TDC/capid); CPU and GPU histograms are filled and drawn to plots.pdf.
+//
+// Usage: <exe> <path to input file> <path to output file>
+// Returns non-zero when the arguments are missing or the input cannot be read.
+int main(int argc, char* argv[]) {
+  if (argc < 3) {
+    std::cout << "run with: ./<exe> <path to input file> <path to output file>\n";
+    return 1;
+  }
+
+  // keep only the HE frames of flavor 0 or 1; the output reuses the input's samples/subdetId
+  auto filterf01HE = [](QIE11DigiCollection const& coll) {
+    QIE11DigiCollection out{coll.samples(), coll.subdetId()};
+    out.reserve(coll.size());
+
+    for (uint32_t i = 0; i < coll.size(); i++) {
+      auto const df = QIE11DataFrame{coll[i]};
+      auto const id = HcalDetId{df.id()};
+      if ((df.flavor() == 0 or df.flavor() == 1) and id.subdetId() == HcalEndcap)
+        out.push_back(df);
+    }
+
+    return out;
+  };
+
+  // keep only the HB frames of flavor 3; the output reuses the input's samples/subdetId
+  auto filterf3HB = [](QIE11DigiCollection const& coll) {
+    QIE11DigiCollection out{coll.samples(), coll.subdetId()};
+    out.reserve(coll.size());
+
+    for (uint32_t i = 0; i < coll.size(); i++) {
+      auto const df = QIE11DataFrame{coll[i]};
+      auto const did = HcalDetId{df.id()};
+      if (df.flavor() == 3 and did.subdetId() == HcalBarrel)
+        out.push_back(df);
+    }
+
+    return out;
+  };
+
+  // branches to use; the wrapper pointers are filled by ROOT via SetBranchAddress below
+  using Collectionf01 =
+      hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  using Collectionf5 =
+      hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  using Collectionf3 =
+      hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  edm::Wrapper<Collectionf01>* wgpuf01he = nullptr;
+  edm::Wrapper<Collectionf5>* wgpuf5hb = nullptr;
+  edm::Wrapper<Collectionf3>* wgpuf3hb = nullptr;
+  edm::Wrapper<QIE11DigiCollection>* wcpuf01he = nullptr;
+  edm::Wrapper<HBHEDigiCollection>* wcpuf5hb = nullptr;
+
+  std::string inFileName{argv[1]};
+  std::string outFileName{argv[2]};
+
+  // prep output; NOTE(review): the histograms below are never written explicitly, so they
+  // presumably attach to rfout on creation and are stored by rfout.Write() - confirm
+  TFile rfout{outFileName.c_str(), "recreate"};
+
+  CREATE_HIST_1D(hADCf01HEGPU, 256, 0, 256);
+  CREATE_HIST_1D(hADCf01HECPU, 256, 0, 256);
+  CREATE_HIST_1D(hADCf5HBGPU, 128, 0, 128);
+  CREATE_HIST_1D(hADCf5HBCPU, 128, 0, 128);
+  CREATE_HIST_1D(hADCf3HBGPU, 256, 0, 256);
+  CREATE_HIST_1D(hADCf3HBCPU, 256, 0, 256);
+  CREATE_HIST_1D(hTDCf01HEGPU, 64, 0, 64);
+  CREATE_HIST_1D(hTDCf01HECPU, 64, 0, 64);
+
+  CREATE_HIST_2D(hADCf01HEGPUvsCPU, 256, 0, 256);
+  CREATE_HIST_2D(hADCf3HBGPUvsCPU, 256, 0, 256);
+  CREATE_HIST_2D(hADCf5HBGPUvsCPU, 128, 0, 128);
+  CREATE_HIST_2D(hTDCf01HEGPUvsCPU, 64, 0, 64);
+  CREATE_HIST_2D(hTDCf3HBGPUvsCPU, 4, 0, 4);
+
+  // prep input
+  TFile rfin{inFileName.c_str()};
+  if (rfin.IsZombie()) {
+    std::cerr << "*** failed to open input file " << inFileName << std::endl;
+    return 1;
+  }
+  TTree* rt = dynamic_cast<TTree*>(rfin.Get("Events"));
+  if (rt == nullptr) {
+    std::cerr << "*** no \"Events\" tree found in " << inFileName << std::endl;
+    return 1;
+  }
+  rt->SetBranchAddress("QIE11DataFrameHcalDataFrameContainer_hcalDigis__RECO.", &wcpuf01he);
+  rt->SetBranchAddress("HBHEDataFramesSorted_hcalDigis__RECO.", &wcpuf5hb);
+  rt->SetBranchAddress(
+      "hcalFlavor5hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "f5HBDigis_RECO.",
+      &wgpuf5hb);
+  rt->SetBranchAddress(
+      "hcalFlavor01hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "f01HEDigis_RECO.",
+      &wgpuf01he);
+  rt->SetBranchAddress(
+      "hcalFlavor3hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "f3HBDigis_RECO.",
+      &wgpuf3hb);
+
+  // accumulate
+  auto const nentries = rt->GetEntries();
+  std::cout << ">>> nentries = " << nentries << std::endl;
+  for (int ie = 0; ie < nentries; ++ie) {
+    rt->GetEntry(ie);
+    if (!wgpuf01he || !wgpuf5hb || !wgpuf3hb || !wcpuf01he || !wcpuf5hb) {
+      std::cerr << "*** at least one input branch was not read, cannot compare" << std::endl;
+      return 1;
+    }
+
+    auto const& f01HEProduct = wgpuf01he->bareProduct();
+    auto const& f5HBProduct = wgpuf5hb->bareProduct();
+    auto const& f3HBProduct = wgpuf3hb->bareProduct();
+    auto const& qie11Product = wcpuf01he->bareProduct();
+    auto const qie11Filteredf01 = filterf01HE(qie11Product);
+    auto const qie11Filteredf3 = filterf3HB(qie11Product);
+    auto const& qie8Product = wcpuf5hb->bareProduct();
+
+    auto const ngpuf01he = f01HEProduct.ids.size();
+    auto const ngpuf5hb = f5HBProduct.ids.size();
+    auto const ngpuf3hb = f3HBProduct.ids.size();
+    auto const ncpuf01he = qie11Filteredf01.size();
+    auto const ncpuf5hb = qie8Product.size();
+    auto const ncpuf3hb = qie11Filteredf3.size();
+
+    if (ngpuf01he != ncpuf01he) {
+      std::cerr << "*** mismatch in number of flavor 01 digis for event " << ie << std::endl
+                << ">>> ngpuf01he = " << ngpuf01he << std::endl
+                << ">>> ncpuf01he = " << ncpuf01he << std::endl;
+    }
+
+    // flavor 0/1 (HE): look up every CPU digi in the GPU collection by detector id and
+    // compare ADC, TDC and capid for each sample
+    {
+      auto const& idsgpu = f01HEProduct.ids;
+      auto const& datagpu = f01HEProduct.data;
+
+      for (uint32_t ich = 0; ich < ncpuf01he; ich++) {
+        auto const cpudf = QIE11DataFrame{qie11Filteredf01[ich]};
+        auto const cpuid = cpudf.id();
+        auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid);
+
+        if (iter2idgpu == idsgpu.end()) {
+          std::cerr << "missing " << HcalDetId{cpuid} << std::endl;
+          continue;
+        }
+
+        // FIXME: can fail...
+        assert(*iter2idgpu == cpuid);
+
+        auto const ptrdiff = iter2idgpu - idsgpu.begin();
+        auto const nsamples_gpu = hcal::compute_nsamples<hcal::Flavor01>(f01HEProduct.stride);
+        auto const nsamples_cpu = qie11Filteredf01.samples();
+        assert(static_cast<uint32_t>(nsamples_cpu) == nsamples_gpu);
+
+        uint32_t ichgpu = ptrdiff;
+        uint32_t offset = ichgpu * f01HEProduct.stride;
+        uint16_t const* df_start = datagpu.data() + offset;
+        for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
+          auto const cpuadc = cpudf[sample].adc();
+          auto const gpuadc = hcal::adc_for_sample<hcal::Flavor01>(df_start, sample);
+          auto const cputdc = cpudf[sample].tdc();
+          auto const gputdc = hcal::tdc_for_sample<hcal::Flavor01>(df_start, sample);
+          auto const cpucapid = cpudf[sample].capid();
+          auto const gpucapid = hcal::capid_for_sample<hcal::Flavor01>(df_start, sample);
+
+          hADCf01HEGPU->Fill(gpuadc);
+          hADCf01HECPU->Fill(cpuadc);
+          hTDCf01HEGPU->Fill(gputdc);
+          hTDCf01HECPU->Fill(cputdc);
+          hADCf01HEGPUvsCPU->Fill(cpuadc, gpuadc);
+          hTDCf01HEGPUvsCPU->Fill(cputdc, gputdc);
+
+          // At RAW Decoding level there must not be any mismatches
+          // in the adc values at all!
+          assert(static_cast<uint8_t>(cpuadc) == gpuadc);
+          assert(static_cast<uint8_t>(cputdc) == gputdc);
+          assert(static_cast<uint8_t>(cpucapid) == gpucapid);
+        }
+      }
+    }
+
+    if (ngpuf3hb != ncpuf3hb) {
+      std::cerr << "*** mismatch in number of flavor 3 digis for event " << ie << std::endl
+                << ">>> ngpuf3hb = " << ngpuf3hb << std::endl
+                << ">>> ncpuf3hb = " << ncpuf3hb << std::endl;
+    }
+
+    // flavor 3 (HB): same id-matched comparison, for ADC and TDC
+    // (capid is not compared for this flavor)
+    {
+      auto const& idsgpu = f3HBProduct.ids;
+      auto const& datagpu = f3HBProduct.data;
+
+      for (uint32_t ich = 0; ich < ncpuf3hb; ich++) {
+        auto const cpudf = QIE11DataFrame{qie11Filteredf3[ich]};
+        auto const cpuid = cpudf.id();
+        auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid);
+
+        if (iter2idgpu == idsgpu.end()) {
+          std::cerr << "missing " << HcalDetId{cpuid} << std::endl;
+          continue;
+        }
+
+        // FIXME: can fail...
+        assert(*iter2idgpu == cpuid);
+
+        auto const ptrdiff = iter2idgpu - idsgpu.begin();
+        auto const nsamples_gpu = hcal::compute_nsamples<hcal::Flavor3>(f3HBProduct.stride);
+        auto const nsamples_cpu = qie11Filteredf3.samples();
+        assert(static_cast<uint32_t>(nsamples_cpu) == nsamples_gpu);
+
+        uint32_t ichgpu = ptrdiff;
+        uint32_t offset = ichgpu * f3HBProduct.stride;
+        uint16_t const* df_start = datagpu.data() + offset;
+        for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
+          auto const cpuadc = cpudf[sample].adc();
+          auto const gpuadc = hcal::adc_for_sample<hcal::Flavor3>(df_start, sample);
+          auto const cputdc = cpudf[sample].tdc();
+          auto const gputdc = hcal::tdc_for_sample<hcal::Flavor3>(df_start, sample);
+
+          hADCf3HBGPU->Fill(gpuadc);
+          hADCf3HBCPU->Fill(cpuadc);
+          hADCf3HBGPUvsCPU->Fill(cpuadc, gpuadc);
+          hTDCf3HBGPUvsCPU->Fill(cputdc, gputdc);
+
+          // At RAW Decoding level there must not be any mismatches
+          // in the adc values at all!
+          assert(static_cast<uint8_t>(cpuadc) == gpuadc);
+          assert(static_cast<uint8_t>(cputdc) == gputdc);
+        }
+      }
+    }
+
+    if (ngpuf5hb != ncpuf5hb) {
+      std::cerr << "*** mismatch in number of flavor 5 digis for event " << ie << std::endl
+                << ">>> ngpuf5hb = " << ngpuf5hb << std::endl
+                << ">>> ncpuf5hb = " << ncpuf5hb << std::endl;
+    }
+
+    // flavor 5 (HB, HBHEDigiCollection on the CPU side): id-matched comparison of ADC and capid
+    // (no TDC comparison here)
+    {
+      auto const& idsgpu = f5HBProduct.ids;
+      auto const& datagpu = f5HBProduct.data;
+      for (uint32_t i = 0; i < ncpuf5hb; i++) {
+        auto const cpudf = qie8Product[i];
+        auto const cpuid = cpudf.id().rawId();
+        auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid);
+        if (iter2idgpu == idsgpu.end()) {
+          std::cerr << "missing " << HcalDetId{cpuid} << std::endl;
+          continue;
+        }
+
+        assert(*iter2idgpu == cpuid);
+
+        auto const ptrdiff = iter2idgpu - idsgpu.begin();
+        auto const nsamples_gpu = hcal::compute_nsamples<hcal::Flavor5>(f5HBProduct.stride);
+        auto const nsamples_cpu = cpudf.size();
+        assert(static_cast<uint32_t>(nsamples_cpu) == nsamples_gpu);
+
+        uint32_t offset = ptrdiff * f5HBProduct.stride;
+        uint16_t const* df_start = datagpu.data() + offset;
+        for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
+          auto const cpuadc = cpudf.sample(sample).adc();
+          auto const gpuadc = hcal::adc_for_sample<hcal::Flavor5>(df_start, sample);
+          auto const cpucapid = cpudf.sample(sample).capid();
+          auto const gpucapid = hcal::capid_for_sample<hcal::Flavor01>(df_start, sample);  // NOTE(review): Flavor01 accessor on Flavor5 data - confirm
+
+          hADCf5HBGPU->Fill(gpuadc);
+          hADCf5HBCPU->Fill(cpuadc);
+          hADCf5HBGPUvsCPU->Fill(cpuadc, gpuadc);
+
+          // the must for us at RAW Decoding stage
+          assert(static_cast<hcal::Flavor5::adc_type>(cpuadc) == gpuadc);
+          assert(static_cast<uint8_t>(cpucapid) == gpucapid);
+        }
+      }
+    }
+  }
+
+  // draw the CPU (black) vs GPU (blue) overlays and the 2D GPU-vs-CPU maps on a 3x3 canvas
+  {
+    TCanvas c{"plots", "plots", 4200, 6200};
+    c.Divide(3, 3);
+    c.cd(1);
+    {
+      gPad->SetLogy();
+      hADCf01HECPU->SetLineColor(kBlack);
+      hADCf01HECPU->SetLineWidth(1.);
+      hADCf01HECPU->Draw("");
+      hADCf01HEGPU->SetLineColor(kBlue);
+      hADCf01HEGPU->SetLineWidth(1.);
+      hADCf01HEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hADCf01HEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(2);
+    {
+      gPad->SetLogy();
+      hADCf5HBCPU->SetLineColor(kBlack);
+      hADCf5HBCPU->SetLineWidth(1.);
+      hADCf5HBCPU->Draw("");
+      hADCf5HBGPU->SetLineColor(kBlue);
+      hADCf5HBGPU->SetLineWidth(1.);
+      hADCf5HBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hADCf5HBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(3);
+    {
+      gPad->SetLogy();
+      hADCf3HBCPU->SetLineColor(kBlack);
+      hADCf3HBCPU->SetLineWidth(1.);
+      hADCf3HBCPU->Draw("");
+      hADCf3HBGPU->SetLineColor(kBlue);
+      hADCf3HBGPU->SetLineWidth(1.);
+      hADCf3HBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hADCf3HBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(4);
+    hADCf01HEGPUvsCPU->Draw("colz");
+    c.cd(5);
+    hADCf5HBGPUvsCPU->Draw("colz");
+    c.cd(6);
+    hADCf3HBGPUvsCPU->Draw("colz");
+    c.cd(7);
+    {
+      gPad->SetLogy();
+      hTDCf01HECPU->SetLineColor(kBlack);
+      hTDCf01HECPU->SetLineWidth(1.);
+      hTDCf01HECPU->Draw("");
+      hTDCf01HEGPU->SetLineColor(kBlue);
+      hTDCf01HEGPU->SetLineWidth(1.);
+      hTDCf01HEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hTDCf01HEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(8);
+    hTDCf01HEGPUvsCPU->Draw("colz");
+    c.cd(9);
+    hTDCf3HBGPUvsCPU->Draw("colz");
+
+    c.SaveAs("plots.pdf");
+  }
+
+  rfin.Close();
+  rfout.Write();
+  rfout.Close();
+}
diff --git a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
index ccf6a061119c2..47f7625254fc0 100644
--- a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
@@ -1,16 +1,26 @@
+<use name="boost"/>
+<use name="zlib"/>
+<use name="CalibFormats/HcalObjects"/>
+<use name="CondFormats/HcalObjects"/>
+<use name="DataFormats/FEDRawData"/>
 <use name="DataFormats/HcalDetId"/>
 <use name="DataFormats/HcalDigi"/>
-<use name="DataFormats/FEDRawData"/>
-<use name="CondFormats/HcalObjects"/>
-<use name="CalibFormats/HcalObjects"/>
+<use name="EventFilter/HcalRawToDigi"/>
 <use name="FWCore/Framework"/>
 <use name="FWCore/MessageLogger"/>
-<use name="boost"/>
-<use name="zlib"/>
-<use name="EventFilter/HcalRawToDigi"/>
 <flags EDM_PLUGIN="1"/>
+
 <library file="HcalCalibFEDSelector.cc,HcalCalibTypeFilter.cc,HcalDigiToRaw.cc,HcalEmptyEventFilter.cc,HcalHistogramRawToDigi.cc,HcalRawToDigi.cc,modules.cc,HcalDigiToRawuHTR.cc,HcalRawToDigiFake.cc" name="EventFilterHcalRawToDigiPlugins">
 </library>
 
 <library file="HcalLaserEventFiltProducer2012.cc, HcalLaserEventFilter2012.cc,HcalLaserHFFilter2012.cc,HcalLaserHBHEFilter2012.cc,HcalLaserHBHEHFFilter2012.cc" name="EventFilterHcalRawToDigiFiltersPlugins">
 </library>
+
+<library file="HcalRawToDigiGPU.cc,DecodeGPU.cu,HcalESProducerGPUDefs.cc,ElectronicsMappingGPU.cc,HcalCPUDigisProducer.cc,HcalDigisProducerGPU.cc" name="EventFilterHcalRawToDigiGPUPlugins">
+  <use name="cuda" />
+  <use name="CUDADataFormats/Common" />
+  <use name="CUDADataFormats/HcalCommon"/>
+  <use name="CUDADataFormats/HcalDigi" />
+  <use name="HeterogeneousCore/CUDACore"/>
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+</library>
diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
new file mode 100644
index 0000000000000..a66d7d38248ab
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
@@ -0,0 +1,148 @@
+#ifndef EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
+#define EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+
+#include <vector>
+
+namespace hcal {
+  namespace raw {
+
+    constexpr int32_t empty_event_size = 32;  // NOTE(review): not used in this header; unit presumably bytes - confirm
+    constexpr uint32_t utca_nfeds_max = 50;             // capacity, in FEDs, of the offsets/feds buffers
+    constexpr uint32_t nbytes_per_fed_max = 10 * 1024;  // bytes reserved per FED in the raw data buffers
+
+    // each collection corresponds to a particular flavor with a certain number of
+    // samples per digi; Output* below index the per-collection channel counters
+    constexpr uint32_t numOutputCollections = 3;
+    constexpr uint8_t OutputF01HE = 0;
+    constexpr uint8_t OutputF5HB = 1;
+    constexpr uint8_t OutputF3HB = 2;
+
+    struct ConfigurationParameters {  // sizing of the three output digi collections
+      uint32_t maxChannelsF01HE;  // capacity, in channels, of the flavor 0/1 (HE) collection
+      uint32_t maxChannelsF5HB;   // capacity, in channels, of the flavor 5 (HB) collection
+      uint32_t maxChannelsF3HB;   // capacity, in channels, of the flavor 3 (HB) collection
+      uint32_t nsamplesF01HE;     // samples per digi, passed to compute_stride<Flavor01>
+      uint32_t nsamplesF5HB;      // samples per digi, passed to compute_stride<Flavor5>
+      uint32_t nsamplesF3HB;      // samples per digi, passed to compute_stride<Flavor3>
+    };
+
+    struct InputDataCPU {  // host-side input buffers, all allocated through cms::cuda::HostAllocator
+      std::vector<unsigned char, cms::cuda::HostAllocator<unsigned char>> data;  // raw bytes of the FEDs
+      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> offsets;         // byte offset of each FED in data
+      std::vector<int, cms::cuda::HostAllocator<int>> feds;                      // id of each FED
+
+      void allocate() {  // room for utca_nfeds_max FEDs of nbytes_per_fed_max bytes each
+        data.resize(utca_nfeds_max * sizeof(unsigned char) * nbytes_per_fed_max);
+        offsets.resize(utca_nfeds_max, 0);
+        feds.resize(utca_nfeds_max, 0);
+      }
+    };
+
+    struct OutputDataCPU {  // host-side results copied back from the device
+      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> nchannels;  // number of digis per output collection
+
+      void allocate() { nchannels.resize(numOutputCollections); }
+    };
+
+    struct ScratchDataGPU {
+      // device array with one counter per output collection; its length is
+      // numOutputCollections, a statically known predefined number
+      uint32_t *pChannelsCounters = nullptr;
+
+      void allocate(ConfigurationParameters const &) {  // the configuration is not needed here
+        cudaCheck(cudaMalloc((void **)&pChannelsCounters, sizeof(uint32_t) * numOutputCollections));
+      }
+
+      void deallocate(ConfigurationParameters const &) {
+        if (pChannelsCounters) {  // nothing to free while the pointer is still null
+          cudaCheck(cudaFree(pChannelsCounters));  // NOTE(review): the pointer is not reset to nullptr afterwards
+        }
+      }
+    };
+
+    struct OutputDataGPU {
+      // digi collections filled by the decoding kernel, one per treated flavor;
+      // their device buffers are managed by allocate()/deallocate() below
+      DigiCollection<Flavor01, common::ViewStoragePolicy> digisF01HE;  // qie 11 HE
+      DigiCollection<Flavor5, common::ViewStoragePolicy> digisF5HB;    // qie 8 HB
+      DigiCollection<Flavor3, common::ViewStoragePolicy> digisF3HB;
+
+      // Allocate the device buffers of every collection:
+      //   data         maxChannels * compute_stride<Flavor>(nsamples) 16-bit words
+      //   ids          one 32-bit id per channel
+      //   npresamples  one byte per channel (flavor 5 only)
+      // NOTE(review): buffers from a previous allocate() are not released first.
+      void allocate(ConfigurationParameters const &config) {
+        cudaCheck(
+            cudaMalloc((void **)&digisF01HE.data,
+                       config.maxChannelsF01HE * sizeof(uint16_t) * compute_stride<Flavor01>(config.nsamplesF01HE)));
+        cudaCheck(cudaMalloc((void **)&digisF01HE.ids, sizeof(uint32_t) * config.maxChannelsF01HE));
+
+        cudaCheck(cudaMalloc((void **)&digisF5HB.data,
+                             config.maxChannelsF5HB * sizeof(uint16_t) * compute_stride<Flavor5>(config.nsamplesF5HB)));
+        cudaCheck(cudaMalloc((void **)&digisF5HB.ids, sizeof(uint32_t) * config.maxChannelsF5HB));
+        cudaCheck(cudaMalloc((void **)&digisF5HB.npresamples, sizeof(uint8_t) * config.maxChannelsF5HB));
+
+        cudaCheck(cudaMalloc((void **)&digisF3HB.data,
+                             config.maxChannelsF3HB * sizeof(uint16_t) * compute_stride<Flavor3>(config.nsamplesF3HB)));
+        cudaCheck(cudaMalloc((void **)&digisF3HB.ids, config.maxChannelsF3HB * sizeof(uint32_t)));
+      }
+
+      // Free the buffers of every collection whose data pointer is set; config
+      // is not used.
+      // NOTE(review): ids/npresamples are freed only under the data guard, and
+      // the pointers are not reset to nullptr afterwards.
+      void deallocate(ConfigurationParameters const &config) {
+        if (digisF01HE.data) {
+          cudaCheck(cudaFree(digisF01HE.data));
+          cudaCheck(cudaFree(digisF01HE.ids));
+        }
+
+        if (digisF5HB.data) {
+          cudaCheck(cudaFree(digisF5HB.data));
+          cudaCheck(cudaFree(digisF5HB.ids));
+          cudaCheck(cudaFree(digisF5HB.npresamples));
+        }
+
+        if (digisF3HB.data) {
+          cudaCheck(cudaFree(digisF3HB.data));
+          cudaCheck(cudaFree(digisF3HB.ids));
+        }
+      }
+    };
+
+    struct InputDataGPU {  // device-side counterparts of the InputDataCPU buffers
+      unsigned char *data = nullptr;  // raw bytes of the FEDs
+      uint32_t *offsets = nullptr;    // byte offset of each FED in data
+      int *feds = nullptr;            // id of each FED
+
+      void allocate() {  // same capacities as InputDataCPU::allocate()
+        cudaCheck(cudaMalloc((void **)&data, sizeof(unsigned char) * nbytes_per_fed_max * utca_nfeds_max));
+        cudaCheck(cudaMalloc((void **)&offsets, sizeof(uint32_t) * utca_nfeds_max));
+        cudaCheck(cudaMalloc((void **)&feds, sizeof(int) * utca_nfeds_max));
+      }
+
+      void deallocate() {
+        if (data) {  // offsets and feds are freed under the same guard as data
+          cudaCheck(cudaFree(data));
+          cudaCheck(cudaFree(offsets));
+          cudaCheck(cudaFree(feds));  // NOTE(review): none of the pointers is reset to nullptr afterwards
+        }
+      }
+    };
+
+    struct ConditionsProducts {  // conditions data handed to the decoding, held by reference
+      ElectronicsMappingGPU::Product const &eMappingProduct;  // provides the eid2did / eid2tid tables
+    };
+
+  }  // namespace raw
+}  // namespace hcal
+
+#endif  // EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
new file mode 100644
index 0000000000000..5eaff1d9c699f
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
@@ -0,0 +1,613 @@
+#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h"
+#include "DataFormats/HcalDetId/interface/HcalSubdetector.h"
+#include "DataFormats/HcalDetId/interface/HcalDetId.h"
+
+#include "EventFilter/HcalRawToDigi/plugins/DecodeGPU.h"
+
+#include <cooperative_groups.h>
+using namespace cooperative_groups;
+
+namespace hcal {
+  namespace raw {
+
+    __forceinline__ __device__ char const* get_subdet_str(DetId const& did) {  // subdetector name, for debug printouts
+      switch (did.subdetId()) {
+        case HcalEmpty:
+          return "HcalEmpty";
+          break;
+        case HcalBarrel:
+          return "HcalBarrel";
+          break;
+        case HcalEndcap:
+          return "HcalEndcap";
+          break;
+        case HcalOuter:
+          return "HcalOuter";
+          break;
+        case HcalForward:
+          return "HcalForward";
+          break;
+        case HcalTriggerTower:
+          return "HcalTriggerTower";
+          break;
+        case HcalOther:
+          return "HcalOther";
+          break;
+        default:  // any subdetector id not listed above
+          return "Unknown";
+          break;
+      }
+
+      return "Unknown";  // not reached: every branch of the switch returns
+    }
+
+    __forceinline__ __device__ bool is_channel_header_word(uint16_t const* ptr) {
+      uint8_t bit = (*ptr >> 15) & 0x1;  // most significant bit of the 16-bit word
+      return bit == 1;                   // set means the word is a channel header
+    }
+
+    template <typename T>
+    constexpr bool is_power_of_two(T x) {
+      return (x != 0) && ((x & (x - 1)) == 0);  // x & (x - 1) clears the lowest set bit: zero only if one bit was set
+    }
+
+    // Decode the uHTR payloads of the raw FED buffers into per-flavor digi collections.
+    //
+    // Work decomposition (see the launch in entryPoint):
+    //   - one thread block per FED: blockIdx.x indexes offsets and feds
+    //   - one tile of NTHREADS threads per AMC module: iamc = threadIdx.x / NTHREADS
+    //   - in each iteration the thread of rank r treats the r-th channel found after
+    //     the tile's current position, then the whole tile moves past all of them
+    //
+    // data, offsets, feds  raw bytes, byte offset of each FED in data, FED ids (debug only)
+    // eid2did, eid2tid     lookup tables indexed by HcalElectronicsId::linearIndex()
+    // digis*, ids*         per-flavor outputs; a digi takes compute_stride<Flavor>(nsamples)
+    //                      16-bit words and is written to the slot returned by atomicAdd
+    // npresamplesF5HB      presamples field of the uHTR header, stored per flavor 5 digi
+    // pChannelsCounters    per-collection digi counters; entryPoint zeroes them before launch
+    // nsamples*            configured samples per digi, used to validate the word count
+    // nBytesTotal          total number of bytes in data, gives the size of the last FED
+    //
+    // NOTE(review): output slots are not checked against the allocated capacity, so the
+    // collections must be sized for the largest possible number of channels.
+    template <int NTHREADS>
+    __global__ void kernel_rawdecode_test(unsigned char const* data,
+                                          uint32_t const* offsets,
+                                          int const* feds,
+                                          uint32_t const* eid2did,
+                                          uint32_t const* eid2tid,
+                                          uint16_t* digisF01HE,
+                                          uint32_t* idsF01HE,
+                                          uint16_t* digisF5HB,
+                                          uint32_t* idsF5HB,
+                                          uint8_t* npresamplesF5HB,
+                                          uint16_t* digisF3HB,
+                                          uint32_t* idsF3HB,
+                                          uint32_t* pChannelsCounters,
+                                          uint32_t const nsamplesF01HE,
+                                          uint32_t const nsamplesF5HB,
+                                          uint32_t const nsamplesF3HB,
+                                          uint32_t const nBytesTotal) {
+      // the tile size must be a power of two of at most 32 threads to be used with tiled_partition
+      static_assert(is_power_of_two(NTHREADS) == true && NTHREADS <= 32);
+
+      thread_block_tile<NTHREADS> thread_group = tiled_partition<NTHREADS>(this_thread_block());
+
+      auto const iamc = threadIdx.x / NTHREADS;
+      auto const ifed = blockIdx.x;
+      auto const fed = feds[ifed];
+      auto const offset = offsets[ifed];
+      auto const size = ifed == gridDim.x - 1 ? nBytesTotal - offset : offsets[ifed + 1] - offset;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+      if (ifed > 0 || iamc > 0)
+        return;
+      printf("threadIdx.x = %d rank = %d iamc = %d\n", threadIdx.x, thread_group.thread_rank(), iamc);
+#endif
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      printf("ifed = %d fed = %d offset = %u size = %u\n", ifed, fed, offset, size);
+#endif
+
+      // offset to the right raw buffer
+      uint64_t const* buffer = reinterpret_cast<uint64_t const*>(data + offset);
+
+      //
+      // fed header
+      //
+      auto const fed_header = buffer[0];
+      uint32_t const fed_id = (fed_header >> 8) & 0xfff;
+      uint32_t const bx = (fed_header >> 20) & 0xfff;
+      uint32_t const lv1 = (fed_header >> 32) & 0xffffff;
+      uint8_t const trigger_type = (fed_header >> 56) & 0xf;
+      uint8_t const bid_fed_header = (fed_header >> 60) & 0xf;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      printf("fed = %d fed_id = %u bx = %u lv1 = %u trigger_type = %u bid = %u\n",
+             fed,
+             fed_id,
+             bx,
+             lv1,
+             trigger_type,
+             bid_fed_header);
+#endif
+
+      // amc 13 header
+      auto const amc13word = buffer[1];
+      uint8_t const namc = (amc13word >> 52) & 0xf;
+      uint8_t const amc13version = (amc13word >> 60) & 0xf;
+      uint32_t const amc13OrbitNumber = (amc13word >> 4) & 0xffffffffu;
+
+      // tiles without a matching AMC module in this FED have nothing to do
+      if (iamc >= namc)
+        return;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      printf("fed = %d namc = %u amc13version = %u amc13OrbitNumber = %u\n", fed, namc, amc13version, amc13OrbitNumber);
+#endif
+
+      // sum the sizes (in 64-bit words) of the preceding AMCs to locate this tile's payload
+      uint32_t amcoffset = 0;
+      for (uint8_t ii = 0u; ii < iamc; ii++) {
+        auto const word = buffer[2 + ii];
+        int const amcSize = (word >> 32) & 0xffffff;
+        amcoffset += amcSize;
+      }
+
+      // header word of the AMC module treated by this tile
+      auto const word = buffer[2 + iamc];
+      uint16_t const amcid = word & 0xffff;
+      int const slot = (word >> 16) & 0xf;
+      int const amcBlockNumber = (word >> 20) & 0xff;
+      int const amcSize = (word >> 32) & 0xffffff;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      printf("fed = %d amcid = %u slot = %d amcBlockNumber = %d\n", fed, amcid, slot, amcBlockNumber);
+#endif
+
+      bool const amcmore = ((word >> 61) & 0x1) != 0;
+      bool const amcSegmented = ((word >> 60) & 0x1) != 0;
+      bool const amcLengthOk = ((word >> 62) & 0x1) != 0;
+      bool const amcCROk = ((word >> 56) & 0x1) != 0;
+      bool const amcDataPresent = ((word >> 58) & 0x1) != 0;
+      bool const amcDataValid = ((word >> 56) & 0x1) != 0;  // NOTE(review): reads bit 56 like amcCROk - confirm
+      bool const amcEnabled = ((word >> 59) & 0x1) != 0;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      printf(
+          "fed = %d amcmore = %d amcSegmented = %d, amcLengthOk = %d amcCROk = %d\n>> amcDataPresent = %d amcDataValid "
+          "= %d amcEnabled = %d\n",
+          fed,
+          static_cast<int>(amcmore),
+          static_cast<int>(amcSegmented),
+          static_cast<int>(amcLengthOk),
+          static_cast<int>(amcCROk),
+          static_cast<int>(amcDataPresent),
+          static_cast<int>(amcDataValid),
+          static_cast<int>(amcEnabled));
+#endif
+
+      // get to the payload
+      auto const* payload64 = buffer + 2 + namc + amcoffset;
+      auto const* payload16 = reinterpret_cast<uint16_t const*>(payload64);
+
+      // uhtr header v1 1st 64 bits
+      auto const payload64_w0 = payload64[0];
+      // uhtr n bytes comes from amcSize, according to the cpu version!
+      uint32_t const data_length64 = amcSize;
+      uint16_t bcn = (payload64_w0 >> 20) & 0xfff;
+      uint32_t evn = (payload64_w0 >> 32) & 0xffffff;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      printf("fed = %d data_length64 = %u bcn = %u evn = %u\n", fed, data_length64, bcn, evn);
+#endif
+
+      // uhtr header v1 2nd 64 bits
+      auto const payload64_w1 = payload64[1];
+      uint8_t const uhtrcrate = payload64_w1 & 0xff;
+      uint8_t const uhtrslot = (payload64_w1 >> 8) & 0xf;
+      uint8_t const presamples = (payload64_w1 >> 12) & 0xf;
+      uint16_t const orbitN = (payload64_w1 >> 16) & 0xffff;
+      uint8_t const firmFlavor = (payload64_w1 >> 32) & 0xff;
+      uint8_t const eventType = (payload64_w1 >> 40) & 0xf;
+      uint8_t const payloadFormat = (payload64_w1 >> 44) & 0xf;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      printf(
+          "fed = %d crate = %u slot = %u presamples = %u\n>>> orbitN = %u firmFlavor = %u eventType = %u payloadFormat "
+          "= %u\n",
+          fed,
+          uhtrcrate,
+          uhtrslot,
+          presamples,
+          orbitN,
+          firmFlavor,
+          eventType,
+          payloadFormat);
+#endif
+
+      // this should be filtering out uMNio...
+      // also drop blocks of fewer than 3 words: the unsigned size arithmetic below would wrap
+      if (payloadFormat != 1 || data_length64 < 3)
+        return;
+
+      // skip uhtr header words
+      auto const channelDataSize = data_length64 - 2;        // 2 uhtr header v1 words
+      auto const* channelDataBuffer64Start = payload64 + 2;  // 2 uhtr header v1 words
+      auto const* channelDataBuffer64End = channelDataBuffer64Start + channelDataSize;
+      auto const* ptr = reinterpret_cast<uint16_t const*>(channelDataBuffer64Start);
+      auto const* end = ptr + sizeof(uint64_t) / sizeof(uint16_t) * (channelDataSize - 1);
+      auto const t_rank = thread_group.thread_rank();
+
+      // iterate thru the channel data
+      while (ptr != end) {
+        // this is the starting point for this thread group for this iteration
+        // with respect to this pointer every thread will move forward afterwards
+        auto const* const start_ptr = ptr;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+        thread_group.sync();
+#endif
+
+        // skip to the header word of the right channel for this thread
+        int counter = 0;
+        while (counter < thread_group.thread_rank()) {
+          // just a check for threads that land beyond the end
+          if (ptr == end)
+            break;
+
+          // move ptr one forward past header
+          if (is_channel_header_word(ptr))
+            ++ptr;
+          else {
+            // go to the next channel and do not consider this guy as a
+            // channel
+            while (ptr != end)
+              if (!is_channel_header_word(ptr))
+                ++ptr;
+              else
+                break;
+            continue;
+          }
+
+          // skip
+          while (ptr != end)
+            if (!is_channel_header_word(ptr))
+              ++ptr;
+            else
+              break;
+          counter++;
+        }
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+        thread_group.sync();
+        printf("ptr - start_ptr = %d counter = %d rank = %d\n", static_cast<int>(ptr - start_ptr), counter, t_rank);
+#endif
+
+        // at this point ptr is either end or, if the data are valid, the header word
+        // of the channel to be decoded by this thread; a word that is not treated as
+        // a channel below is skipped by the loop that closes the if block, which
+        // advances ptr to the next channel header word
+
+        // when the end is near, channels will land outside of the [start_ptr, end)
+        // region
+        if (ptr != end) {
+          // for all of the flavors, these 2 guys have the same bit layout
+          uint8_t const flavor = (ptr[0] >> 12) & 0x7;
+          uint8_t const channelid = ptr[0] & 0xff;
+          auto const* const new_channel_start = ptr;
+
+          // flavor dependent stuff
+          switch (flavor) {
+            case 0:
+            case 1: {
+              // treat eid and did
+              uint8_t fiber = (channelid >> 3) & 0x1f;
+              uint8_t fchannel = channelid & 0x7;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = HcalDetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              // remove digis not for HE
+              if (did.subdetId() != HcalEndcap)
+                break;
+
+              // count words (the bound is checked before the word is read)
+              auto const* channel_header_word = ptr++;
+              while (ptr != end && !is_channel_header_word(ptr))
+                ++ptr;
+              auto const* channel_end = ptr;  // set ptr
+              uint32_t const nwords = channel_end - channel_header_word;
+
+              // filter out this digi if nwords does not equal expected
+              auto const expected_words = compute_stride<Flavor01>(nsamplesF01HE);
+              if (nwords != expected_words)
+                break;
+
+              // inc the number of digis of this type
+              auto const pos = atomicAdd(&pChannelsCounters[OutputF01HE], 1);
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+              printf("rank = %d pos = %d\n", thread_group.thread_rank(), pos);
+#endif
+
+              // store to global mem words for this digi
+              idsF01HE[pos] = did.rawId();
+
+              for (uint32_t iword = 0; iword < expected_words; iword++)
+                digisF01HE[pos * expected_words + iword] = channel_header_word[iword];
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("nwords = %u\n", nwords);
+#endif
+
+              break;
+            }
+            case 3: {
+              // treat eid and did
+              uint8_t fiber = (channelid >> 3) & 0x1f;
+              uint8_t fchannel = channelid & 0x7;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = HcalDetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              // remove digis not for HB
+              if (did.subdetId() != HcalBarrel)
+                break;
+
+              // count words (the bound is checked before the word is read)
+              auto const* channel_header_word = ptr++;
+              while (ptr != end && !is_channel_header_word(ptr))
+                ++ptr;
+              auto const* channel_end = ptr;  // set ptr
+              uint32_t const nwords = channel_end - channel_header_word;
+
+              // filter out this digi if nwords does not equal expected
+              auto const expected_words = compute_stride<Flavor3>(nsamplesF3HB);
+              if (nwords != expected_words)
+                break;
+
+              // inc the number of digis of this type
+              auto const pos = atomicAdd(&pChannelsCounters[OutputF3HB], 1);
+
+              // store to global mem words for this digi
+              idsF3HB[pos] = did.rawId();
+              for (uint32_t iword = 0; iword < expected_words; iword++)
+                digisF3HB[pos * expected_words + iword] = channel_header_word[iword];
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("nwords = %u\n", nwords);
+#endif
+
+              break;
+            }
+            case 2: {
+              uint8_t fiber = (channelid >> 3) & 0x1f;
+              uint8_t fchannel = channelid & 0x7;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = DetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              break;
+            }
+            case 4: {
+              uint8_t link = (channelid >> 4) & 0xf;
+              uint8_t tower = channelid & 0xf;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, link, tower, true};
+              auto const did = DetId{eid2tid[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u link = %u tower = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     link,
+                     tower);
+#endif
+
+              break;
+            }
+            case 5: {
+              uint8_t fiber = (channelid >> 2) & 0x3f;
+              uint8_t fchannel = channelid & 0x3;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = HcalDetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              // remove digis not for HB
+              if (did.subdetId() != HcalBarrel)
+                break;
+
+              // count words (the bound is checked before the word is read)
+              auto const* channel_header_word = ptr++;
+              while (ptr != end && !is_channel_header_word(ptr))
+                ++ptr;
+              auto const* channel_end = ptr;  // set ptr
+              uint32_t const nwords = channel_end - channel_header_word;
+
+              // filter out this digi if nwords does not equal expected
+              auto const expected_words = compute_stride<Flavor5>(nsamplesF5HB);
+              if (nwords != expected_words)
+                break;
+
+              // inc the number of digis of this type
+              auto const pos = atomicAdd(&pChannelsCounters[OutputF5HB], 1);
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+              printf("rank = %d pos = %d\n", thread_group.thread_rank(), pos);
+#endif
+
+              // store to global mem words for this digi
+              idsF5HB[pos] = did.rawId();
+              npresamplesF5HB[pos] = presamples;
+              for (uint32_t iword = 0; iword < expected_words; iword++)
+                digisF5HB[pos * expected_words + iword] = channel_header_word[iword];
+
+              break;
+            }
+            case 7: {
+              uint8_t const fiber = (channelid >> 2) & 0x3f;
+              uint8_t const fchannel = channelid & 0x3;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = DetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u\n", eid.rawId(), eid.linearIndex(), did.rawId());
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              break;
+            }
+            default:
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("flavor = %u crate = %u slot = %u channelid = %u\n", flavor, uhtrcrate, uhtrslot, channelid);
+#endif
+              ;
+          }
+
+          // skip to the next word in case
+          // 1) current channel was not treated
+          // 2) we are in the middle of the digi words and not at the end
+          while (new_channel_start == ptr || (ptr != end && !is_channel_header_word(ptr)))
+            ++ptr;
+        }
+
+        // the thread with the last rank (NTHREADS - 1) of the group will have the ptr
+        // pointing to the header word of the next channel or the end
+        int const offset_to_shuffle = ptr - start_ptr;
+
+        // always receive from the last guy in the group
+        auto const offset_for_rank31 = thread_group.shfl(offset_to_shuffle, NTHREADS - 1);
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+        printf("rank = %d offset_to_shuffle = %d offset_for_rank32 = %d\n",
+               thread_group.thread_rank(),
+               offset_to_shuffle,
+               offset_for_rank31);
+#endif
+
+        // update the ptr for all threads of this group
+        // NOTE: relative to the start_ptr that is the same for all threads of
+        // this group
+        ptr = start_ptr + offset_for_rank31;
+      }
+    }
+
+    void entryPoint(InputDataCPU const& inputCPU,
+                    InputDataGPU& inputGPU,
+                    OutputDataGPU& outputGPU,
+                    ScratchDataGPU& scratchGPU,
+                    OutputDataCPU& outputCPU,
+                    ConditionsProducts const& conditions,
+                    ConfigurationParameters const& config,
+                    cudaStream_t cudaStream,
+                    uint32_t const nfedsWithData,  // number of FEDs to decode, used as the grid size
+                    uint32_t const nbytesTotal) {  // number of bytes of inputCPU.data to copy and decode
+      // enqueue on cudaStream the copies of the raw data, offsets and FED ids, and zero the counters
+      cudaCheck(cudaMemcpyAsync(
+          inputGPU.data, inputCPU.data.data(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream));
+      cudaCheck(cudaMemcpyAsync(inputGPU.offsets,
+                                inputCPU.offsets.data(),
+                                nfedsWithData * sizeof(uint32_t),
+                                cudaMemcpyHostToDevice,
+                                cudaStream));
+      cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounters, 0, sizeof(uint32_t) * numOutputCollections, cudaStream));
+      cudaCheck(cudaMemcpyAsync(
+          inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
+
+      // one block per FED and one tile of 32 threads per module; 12 is the max number of modules per crate
+      kernel_rawdecode_test<32><<<nfedsWithData, 12 * 32, 0, cudaStream>>>(inputGPU.data,
+                                                                           inputGPU.offsets,
+                                                                           inputGPU.feds,
+                                                                           conditions.eMappingProduct.eid2did,
+                                                                           conditions.eMappingProduct.eid2tid,
+                                                                           outputGPU.digisF01HE.data,
+                                                                           outputGPU.digisF01HE.ids,
+                                                                           outputGPU.digisF5HB.data,
+                                                                           outputGPU.digisF5HB.ids,
+                                                                           outputGPU.digisF5HB.npresamples,
+                                                                           outputGPU.digisF3HB.data,
+                                                                           outputGPU.digisF3HB.ids,
+                                                                           scratchGPU.pChannelsCounters,
+                                                                           config.nsamplesF01HE,
+                                                                           config.nsamplesF5HB,
+                                                                           config.nsamplesF3HB,
+                                                                           nbytesTotal);
+      cudaCheck(cudaGetLastError());  // report an error of the kernel launch
+
+      cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.data(),  // counters back to the host, only enqueued here as well
+                                scratchGPU.pChannelsCounters,
+                                sizeof(uint32_t) * numOutputCollections,
+                                cudaMemcpyDeviceToHost,
+                                cudaStream));
+    }
+
+  }  // namespace raw
+}  // namespace hcal
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
new file mode 100644
index 0000000000000..97af639b61a5e
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
@@ -0,0 +1,23 @@
+#ifndef EventFilter_HcalRawToDigi_interface_DecodeGPU_h
+#define EventFilter_HcalRawToDigi_interface_DecodeGPU_h
+
+#include "EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h"
+
+namespace hcal {
+  namespace raw {
+
+    void entryPoint(InputDataCPU const&,             // inputCPU: host-side raw data, offsets and FED ids
+                    InputDataGPU&,                   // inputGPU: device buffers receiving the inputs
+                    OutputDataGPU&,                  // outputGPU: device digi collections to fill
+                    ScratchDataGPU&,                 // scratchGPU: device channel counters
+                    OutputDataCPU&,                  // outputCPU: receives the channel counters
+                    ConditionsProducts const&,       // conditions: electronics mapping tables
+                    ConfigurationParameters const&,  // config: samples per digi for every flavor
+                    cudaStream_t cudaStream,
+                    uint32_t const,   // nfedsWithData: number of FEDs to decode
+                    uint32_t const);  // nbytesTotal: number of raw bytes to copy and decode
+
+  }
+}  // namespace hcal
+
+#endif  // EventFilter_HcalRawToDigi_interface_DecodeGPU_h
diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
new file mode 100644
index 0000000000000..13dc8a756a415
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
@@ -0,0 +1,64 @@
+#include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h"
+
+namespace hcal {
+  namespace raw {
+
+    // TODO: 0x3FFFFF * 4B ~= 16MB
+    // tmp solution for linear mapping of eid -> did
+    ElectronicsMappingGPU::ElectronicsMappingGPU(HcalElectronicsMap const& mapping)
+        : eid2tid_(HcalElectronicsId::maxLinearIndex, 0u), eid2did_(HcalElectronicsId::maxLinearIndex, 0u) {
+      // Both host tables are indexed by HcalElectronicsId::linearIndex() and are
+      // zero-filled above, so 0 remains for every electronics id not assigned below.
+      // precision chain: electronics id -> raw id from lookup(); ids flagged as
+      // trigger-chain ids are stored as 0
+      for (auto const& precisionId : mapping.allElectronicsIdPrecision()) {
+        auto const rawDetId = precisionId.isTriggerChainId() ? 0u : mapping.lookup(precisionId).rawId();
+        eid2did_[precisionId.linearIndex()] = rawDetId;
+      }
+
+      // trigger chain: electronics id -> raw id from lookupTrigger(); ids not
+      // flagged as trigger-chain ids are stored as 0
+      for (auto const& triggerId : mapping.allElectronicsIdTrigger()) {
+        auto const rawTrigId = triggerId.isTriggerChainId() ? mapping.lookupTrigger(triggerId).rawId() : 0u;
+        eid2tid_[triggerId.linearIndex()] = rawTrigId;
+      }
+    }
+
+    ElectronicsMappingGPU::Product::~Product() {
+      // free both device tables; these are the pointers cudaMalloc'ed in getProduct()
+      cudaCheck(cudaFree(eid2did));
+      cudaCheck(cudaFree(eid2tid));
+    }
+
+    ElectronicsMappingGPU::Product const& ElectronicsMappingGPU::getProduct(cudaStream_t cudaStream) const {
+      // Callback handed to product_.dataForCurrentDeviceAsync(): it allocates the two
+      // device tables and enqueues the copies of the host tables on the stream it is
+      // given.
+      auto const fillDeviceTables = [this](ElectronicsMappingGPU::Product& devProduct, cudaStream_t stream) {
+        auto const nbytesDid = eid2did_.size() * sizeof(uint32_t);
+        auto const nbytesTid = eid2tid_.size() * sizeof(uint32_t);
+
+        // device allocations
+        cudaCheck(cudaMalloc((void**)&devProduct.eid2did, nbytesDid));
+        cudaCheck(cudaMalloc((void**)&devProduct.eid2tid, nbytesTid));
+
+        // asynchronous host -> device copies; no synchronization is done here
+        cudaCheck(
+            cudaMemcpyAsync(devProduct.eid2did, eid2did_.data(), nbytesDid, cudaMemcpyHostToDevice, stream));
+        cudaCheck(
+            cudaMemcpyAsync(devProduct.eid2tid, eid2tid_.data(), nbytesTid, cudaMemcpyHostToDevice, stream));
+      };
+
+      // the reference obtained from product_ is returned as is
+      return product_.dataForCurrentDeviceAsync(cudaStream, fillDeviceTables);
+    }
+
+  }  // namespace raw
+}  // namespace hcal
+
+TYPELOOKUP_DATA_REG(hcal::raw::ElectronicsMappingGPU);
diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
new file mode 100644
index 0000000000000..cb7090d480faa
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
@@ -0,0 +1,51 @@
+#ifndef EventFilter_HcalRawToDigi_interface_ElectronicsMappingGPU_h
+#define EventFilter_HcalRawToDigi_interface_ElectronicsMappingGPU_h
+
+#include "CondFormats/HcalObjects/interface/HcalElectronicsMap.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+namespace hcal {
+  namespace raw {
+
+    class ElectronicsMappingGPU {
+    public:
+      struct Product {
+        ~Product();
+        // device table: electronics linear index -> trigger raw id (nullptr until allocated)
+        uint32_t *eid2tid = nullptr;
+        // device table: electronics linear index -> detector raw id (nullptr until allocated)
+        uint32_t *eid2did = nullptr;
+      };
+
+#ifndef __CUDACC__
+
+      // build the host-side linear lookup tables from the electronics map
+      ElectronicsMappingGPU(HcalElectronicsMap const &);
+
+      // device memory is released through ~Product when product_ is destroyed
+      ~ElectronicsMappingGPU() = default;
+
+      // get device pointers; they are allocated and filled by this call when needed
+      Product const &getProduct(cudaStream_t) const;
+
+      // fixed label string for this type
+      static std::string name() { return std::string{"hcalElectronicsMappingGPU"}; }
+
+    private:
+      // in the future, we need to arrange so to avoid this copy on the host
+      // if possible
+      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> eid2tid_;
+      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> eid2did_;
+
+      cms::cuda::ESProduct<Product> product_;
+#endif
+    };
+
+  }  // namespace raw
+}  // namespace hcal
+
+#endif  // EventFilter_HcalRawToDigi_interface_ElectronicsMappingGPU_h
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
new file mode 100644
index 0000000000000..013aca56298bb
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
@@ -0,0 +1,191 @@
+#include <iostream>
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+
+class HcalCPUDigisProducer : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HcalCPUDigisProducer(edm::ParameterSet const& ps);
+  ~HcalCPUDigisProducer() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;  // enqueues device -> host copies
+  void produce(edm::Event&, edm::EventSetup const&) override;  // puts the host collections into the event
+
+private:
+  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef01> digisF01HETokenIn_;
+  using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef5> digisF5HBTokenIn_;
+  using IProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef3> digisF3HBTokenIn_;
+
+  using OProductTypef01 =
+      hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OProductTypef01> digisF01HETokenOut_;
+  using OProductTypef5 =
+      hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OProductTypef5> digisF5HBTokenOut_;
+  using OProductTypef3 =
+      hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OProductTypef3> digisF3HBTokenOut_;
+
+  // host collections: sized and filled in acquire(), moved into the event in produce()
+  OProductTypef01 digisf01HE_;
+  OProductTypef5 digisf5HB_;
+  OProductTypef3 digisf3HB_;
+};
+
+void HcalCPUDigisProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription description;
+  // input tags of the device-side digis first, then the instance labels of the host-side outputs
+  description.add<edm::InputTag>("digisLabelF01HEIn", edm::InputTag{"hcalRawToDigiGPU", "f01HEDigisGPU"});
+  description.add<edm::InputTag>("digisLabelF5HBIn", edm::InputTag{"hcalRawToDigiGPU", "f5HBDigisGPU"});
+  description.add<edm::InputTag>("digisLabelF3HBIn", edm::InputTag{"hcalRawToDigiGPU", "f3HBDigisGPU"});
+  description.add<std::string>("digisLabelF01HEOut", "f01HEDigis");
+  description.add<std::string>("digisLabelF5HBOut", "f5HBDigis");
+  description.add<std::string>("digisLabelF3HBOut", "f3HBDigis");
+
+  // register the description under the module's default label
+  confDesc.add(std::string{"hcalCPUDigisProducer"}, description);
+}
+
+HcalCPUDigisProducer::HcalCPUDigisProducer(const edm::ParameterSet& ps)  // registers 3 consumed and 3 produced collections
+    : digisF01HETokenIn_{consumes<IProductTypef01>(ps.getParameter<edm::InputTag>("digisLabelF01HEIn"))},
+      digisF5HBTokenIn_{consumes<IProductTypef5>(ps.getParameter<edm::InputTag>("digisLabelF5HBIn"))},
+      digisF3HBTokenIn_{consumes<IProductTypef3>(ps.getParameter<edm::InputTag>("digisLabelF3HBIn"))},
+      digisF01HETokenOut_{produces<OProductTypef01>(ps.getParameter<std::string>("digisLabelF01HEOut"))},
+      digisF5HBTokenOut_{produces<OProductTypef5>(ps.getParameter<std::string>("digisLabelF5HBOut"))},
+      digisF3HBTokenOut_{produces<OProductTypef3>(ps.getParameter<std::string>("digisLabelF3HBOut"))} {}
+
+HcalCPUDigisProducer::~HcalCPUDigisProducer() = default;  // nothing to release by hand
+
+void HcalCPUDigisProducer::acquire(edm::Event const& event,
+                                   edm::EventSetup const& setup,
+                                   edm::WaitingTaskWithArenaHolder taskHolder) {
+  // fetch the three device products; the CUDA context is built from the F01 product
+  auto const& f01HEProduct = event.get(digisF01HETokenIn_);
+  auto const& f5HBProduct = event.get(digisF5HBTokenIn_);
+  auto const& f3HBProduct = event.get(digisF3HBTokenIn_);
+  cms::cuda::ScopedContextAcquire ctx{f01HEProduct, std::move(taskHolder)};
+  auto const& f01HEDigis = ctx.get(f01HEProduct);
+  auto const& f5HBDigis = ctx.get(f5HBProduct);
+  auto const& f3HBDigis = ctx.get(f3HBProduct);
+
+  // size the host buffers like the device collections: stride is set first, then resize()
+  digisf01HE_.stride = f01HEDigis.stride;
+  digisf5HB_.stride = f5HBDigis.stride;
+  digisf3HB_.stride = f3HBDigis.stride;
+  digisf01HE_.resize(f01HEDigis.size);
+  digisf5HB_.resize(f5HBDigis.size);
+  digisf3HB_.resize(f3HBDigis.size);
+
+  /*
+    idsf01he.resize(f01HEDigis.ndigis);
+    dataf01he.resize(f01HEDigis.ndigis * f01HEDigis.stride);
+    idsf5hb.resize(f5HBDigis.ndigis);
+    npresamplesf5hb.resize(f5HBDigis.ndigis);
+    dataf5hb.resize(f5HBDigis.ndigis * f5HBDigis.stride);
+    stridef01he = f01HEDigis.stride;
+    stridef5hb = f5HBDigis.stride;
+    */
+  // enqueue an async device -> host copy of dest.size() elements on the context's stream
+  auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
+    using vector_type = typename std::remove_reference<decltype(dest)>::type;
+    using type = typename vector_type::value_type;
+    cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream()));
+  };
+
+  lambdaToTransfer(digisf01HE_.data, f01HEDigis.data);
+  lambdaToTransfer(digisf01HE_.ids, f01HEDigis.ids);
+
+  lambdaToTransfer(digisf5HB_.data, f5HBDigis.data);
+  lambdaToTransfer(digisf5HB_.ids, f5HBDigis.ids);
+  lambdaToTransfer(digisf5HB_.npresamples, f5HBDigis.npresamples);
+
+  lambdaToTransfer(digisf3HB_.data, f3HBDigis.data);
+  lambdaToTransfer(digisf3HB_.ids, f3HBDigis.ids);
+
+  /*
+    // enqueue transfers
+    cudaCheck( cudaMemcpyAsync(digisf01.data.data(),
+                               f01HEDigis.data,
+                               dataf01HE.data.size() * sizeof(uint16_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream().id()) );
+    cudaCheck( cudaMemcpyAsync(dataf5hb.data(),
+                               f5HBDigis.data,
+                               dataf5hb.size() * sizeof(uint16_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream().id()) );
+    cudaCheck( cudaMemcpyAsync(idsf01he.data(),
+                               f01HEDigis.ids,
+                               idsf01he.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream().id()) );
+    cudaCheck( cudaMemcpyAsync(idsf5hb.data(),
+                               f5HBDigis.ids,
+                               idsf5hb.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream().id()) );
+    cudaCheck( cudaMemcpyAsync(npresamplesf5hb.data(),
+                               f5HBDigis.npresamples,
+                               npresamplesf5hb.size() * sizeof(uint8_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream.id()) );
+                               */
+}
+
+void HcalCPUDigisProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // move each host collection filled by acquire() into the event, one flavor at a time
+  auto f01Out = std::make_unique<OProductTypef01>(std::move(digisf01HE_));
+  event.put(digisF01HETokenOut_, std::move(f01Out));
+  auto f5Out = std::make_unique<OProductTypef5>(std::move(digisf5HB_));
+  event.put(digisF5HBTokenOut_, std::move(f5Out));
+  auto f3Out = std::make_unique<OProductTypef3>(std::move(digisf3HB_));
+  event.put(digisF3HBTokenOut_, std::move(f3Out));
+
+  // disabled variant below: output as edm::DataFrameContainer collections
+  /*
+    auto f01he = std::make_unique<edm::DataFrameContainer>(
+        stridef01he, HcalEndcap, idsf01he.size());
+    auto f5hb = std::make_unique<edm::DataFrameContainer>(
+        stridef5hb, HcalBarrel, idsf5hb.size());
+    
+    // cast constness away
+    // use pointers to buffers instead of move operator= semantics (or swap)
+    // cause we have different allocators in there...
+    auto *dataf01hetmp = const_cast<uint16_t*>(f01he->data().data());
+    auto *dataf5hbtmp = const_cast<uint16_t*>(f5hb->data().data());
+
+    auto *idsf01hetmp = const_cast<uint32_t*>(f01he->ids().data());
+    auto idsf5hbtmp = const_cast<uint32_t*>(f5hb->ids().data());
+
+    // copy data
+    std::memcpy(dataf01hetmp, dataf01he.data(), dataf01he.size() * sizeof(uint16_t));
+    std::memcpy(dataf5hbtmp, dataf5hb.data(), dataf5hb.size() * sizeof(uint16_t));
+    std::memcpy(idsf01hetmp, idsf01he.data(), idsf01he.size() * sizeof(uint32_t));
+    std::memcpy(idsf5hbtmp, idsf5hb.data(), idsf5hb.size() * sizeof(uint32_t));
+
+    event.put(digisF01HETokenOut_, std::move(f01he));
+    event.put(digisF5HBTokenOut_, std::move(f5hb));
+    */
+}
+
+DEFINE_FWK_MODULE(HcalCPUDigisProducer);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
new file mode 100644
index 0000000000000..0364e3718821f
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -0,0 +1,272 @@
+#include <iostream>
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+
+class HcalDigisProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HcalDigisProducerGPU(edm::ParameterSet const& ps);
+  ~HcalDigisProducerGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;  // repacks digis on the host, enqueues host -> device copies
+  void produce(edm::Event&, edm::EventSetup const&) override;  // publishes the device views
+
+private:
+  // input product tokens
+  edm::EDGetTokenT<HBHEDigiCollection> hbheDigiToken_;
+  edm::EDGetTokenT<QIE11DigiCollection> qie11DigiToken_;
+
+  // type aliases: host collections use vector storage, device collections are views
+  using HostCollectionf01 =
+      hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf01 = hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>;
+  using HostCollectionf5 =
+      hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf5 = hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>;
+  using HostCollectionf3 =
+      hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf3 = hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>;
+
+  // output product tokens
+  using ProductTypef01 = cms::cuda::Product<DeviceCollectionf01>;
+  edm::EDPutTokenT<ProductTypef01> digisF01HEToken_;
+  using ProductTypef5 = cms::cuda::Product<DeviceCollectionf5>;
+  edm::EDPutTokenT<ProductTypef5> digisF5HBToken_;
+  using ProductTypef3 = cms::cuda::Product<DeviceCollectionf3>;
+  edm::EDPutTokenT<ProductTypef3> digisF3HBToken_;
+
+  cms::cuda::ContextState cudaState_;  // passed to the scoped contexts in both acquire() and produce()
+
+  /*
+    hcal::raw::ConfigurationParameters config_;
+    // FIXME move this to use raii
+    hcal::raw::InputDataCPU inputCPU_;
+    hcal::raw::InputDataGPU inputGPU_;
+    hcal::raw::OutputDataGPU outputGPU_;
+    hcal::raw::ScratchDataGPU scratchGPU_;
+    hcal::raw::OutputDataCPU outputCPU_;
+    */
+
+  struct ConfigParameters {
+    uint32_t maxChannelsF01HE, maxChannelsF5HB, maxChannelsF3HB, nsamplesF01HE, nsamplesF5HB, nsamplesF3HB;
+  };
+  ConfigParameters config_;  // filled from the ParameterSet in the constructor
+
+  // host staging collections: cleared and refilled in every acquire()
+  HostCollectionf01 hf01_;
+  HostCollectionf5 hf5_;
+  HostCollectionf3 hf3_;
+
+  // device products
+  // NOTE: this module owns memory of the product on the device
+  DeviceCollectionf01 df01_;
+  DeviceCollectionf5 df5_;
+  DeviceCollectionf3 df3_;
+};
+
+void HcalDigisProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription description;
+
+  // FIXME
+  description.add<edm::InputTag>("hbheDigisLabel", edm::InputTag{"hcalDigis"});
+  description.add<edm::InputTag>("qie11DigiLabel", edm::InputTag{"hcalDigis"});
+  description.add<std::string>("digisLabelF01HE", std::string{"f01HEDigisGPU"});
+  description.add<std::string>("digisLabelF5HB", std::string{"f5HBDigisGPU"});
+  description.add<std::string>("digisLabelF3HB", std::string{"f3HBDigisGPU"});
+  description.add<uint32_t>("maxChannelsF01HE", 10000u);
+  description.add<uint32_t>("maxChannelsF5HB", 10000u);
+  description.add<uint32_t>("maxChannelsF3HB", 10000u);
+  description.add<uint32_t>("nsamplesF01HE", 8);
+  description.add<uint32_t>("nsamplesF5HB", 8);
+  description.add<uint32_t>("nsamplesF3HB", 8);
+
+  // register the description under the module's default label
+  confDesc.add(std::string{"hcalDigisProducerGPU"}, description);
+}
+
+HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
+    : hbheDigiToken_{consumes<HBHEDigiCollection>(ps.getParameter<edm::InputTag>("hbheDigisLabel"))},
+      qie11DigiToken_{consumes<QIE11DigiCollection>(ps.getParameter<edm::InputTag>("qie11DigiLabel"))},
+      digisF01HEToken_{produces<ProductTypef01>(ps.getParameter<std::string>("digisLabelF01HE"))},
+      digisF5HBToken_{produces<ProductTypef5>(ps.getParameter<std::string>("digisLabelF5HB"))},
+      digisF3HBToken_{produces<ProductTypef3>(ps.getParameter<std::string>("digisLabelF3HB"))} {
+  config_.maxChannelsF01HE = ps.getParameter<uint32_t>("maxChannelsF01HE");
+  config_.maxChannelsF5HB = ps.getParameter<uint32_t>("maxChannelsF5HB");
+  config_.maxChannelsF3HB = ps.getParameter<uint32_t>("maxChannelsF3HB");
+  config_.nsamplesF01HE = ps.getParameter<uint32_t>("nsamplesF01HE");
+  config_.nsamplesF5HB = ps.getParameter<uint32_t>("nsamplesF5HB");
+  config_.nsamplesF3HB = ps.getParameter<uint32_t>("nsamplesF3HB");
+
+  // per-flavor stride, i.e. the number of uint16_t words per channel; it scales the
+  // device data buffers below and is stored in the host collections
+  auto const stridef01 = hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
+  auto const stridef5 = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
+  auto const stridef3 = hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB);
+
+  // device buffers, sized for the configured maximum number of channels
+  cudaCheck(cudaMalloc((void**)&df01_.data, config_.maxChannelsF01HE * sizeof(uint16_t) * stridef01));
+  cudaCheck(cudaMalloc((void**)&df01_.ids, config_.maxChannelsF01HE * sizeof(uint32_t)));
+
+  cudaCheck(cudaMalloc((void**)&df5_.data, config_.maxChannelsF5HB * sizeof(uint16_t) * stridef5));
+  cudaCheck(cudaMalloc((void**)&df5_.ids, config_.maxChannelsF5HB * sizeof(uint32_t)));
+  cudaCheck(cudaMalloc((void**)&df5_.npresamples, sizeof(uint8_t) * config_.maxChannelsF5HB));
+
+  cudaCheck(cudaMalloc((void**)&df3_.data, config_.maxChannelsF3HB * sizeof(uint16_t) * stridef3));
+  cudaCheck(cudaMalloc((void**)&df3_.ids, config_.maxChannelsF3HB * sizeof(uint32_t)));
+
+  // host staging collections: same strides, capacity reserved up front
+  hf01_.stride = stridef01;
+  hf5_.stride = stridef5;
+  hf3_.stride = stridef3;
+  hf01_.reserve(config_.maxChannelsF01HE);
+  hf5_.reserve(config_.maxChannelsF5HB);
+  hf3_.reserve(config_.maxChannelsF3HB);
+}
+
+HcalDigisProducerGPU::~HcalDigisProducerGPU() {  // free every device buffer the constructor allocated
+  cudaCheck(cudaFree(df01_.data));
+  cudaCheck(cudaFree(df01_.ids));
+  cudaCheck(cudaFree(df5_.data));
+  cudaCheck(cudaFree(df5_.ids));
+  cudaCheck(cudaFree(df5_.npresamples));
+  cudaCheck(cudaFree(df3_.data));  // the Flavor3 buffers were previously never released
+  cudaCheck(cudaFree(df3_.ids));
+}
+
+void HcalDigisProducerGPU::acquire(edm::Event const& event,
+                                   edm::EventSetup const& setup,
+                                   edm::WaitingTaskWithArenaHolder holder) {
+  // scoped CUDA context for this event; cudaState_ is handed over so produce() can reuse it
+  cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
+
+  hf01_.clear();
+  hf5_.clear();
+  hf3_.clear();
+
+  // event data; NOTE(review): the handles are dereferenced below without an isValid() check
+  edm::Handle<HBHEDigiCollection> hbheDigis;
+  edm::Handle<QIE11DigiCollection> qie11Digis;
+  event.getByToken(hbheDigiToken_, hbheDigis);
+  event.getByToken(qie11DigiToken_, qie11Digis);
+
+  for (auto const& hbhe : *hbheDigis) {
+    auto const id = hbhe.id().rawId();
+    auto const presamples = hbhe.presamples();
+    hf5_.ids.push_back(id);
+    hf5_.npresamples.push_back(presamples);
+    int stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
+    // simple for now: a single header word built from flavor 5 and the capid of sample 0
+    static_assert(hcal::Flavor5::HEADER_WORDS == 1);
+    uint16_t header_word = (1 << 15) | (0x5 << 12) | (0 << 10) | ((hbhe.sample(0).capid() & 0x3) << 8);
+    hf5_.data.push_back(header_word);
+    // each payload word packs two samples: 7-bit adc of sample 2*i (low byte), of sample 2*i+1 (high byte)
+    // NOTE(review): assumes every digi holds at least 2*(stride - HEADER_WORDS) samples - confirm
+    for (int i = 0; i < stride - hcal::Flavor5::HEADER_WORDS; i++) {
+      uint16_t s0 = (0 << 7) | (static_cast<uint8_t>(hbhe.sample(2 * i).adc()) & 0x7f);
+      uint16_t s1 = (0 << 7) | (static_cast<uint8_t>(hbhe.sample(2 * i + 1).adc()) & 0x7f);
+      uint16_t sample = (s1 << 8) | s0;
+      hf5_.data.push_back(sample);
+    }
+  }
+
+  for (unsigned int i = 0; i < qie11Digis->size(); i++) {
+    auto const& digi = QIE11DataFrame{(*qie11Digis)[i]};
+    if (digi.flavor() == 0 or digi.flavor() == 1) {
+      if (digi.detid().subdetId() != HcalEndcap)  // flavors 0/1 are kept for the endcap only
+        continue;
+      auto const id = digi.detid().rawId();
+      hf01_.ids.push_back(id);
+      for (int hw = 0; hw < hcal::Flavor01::HEADER_WORDS; hw++)
+        hf01_.data.push_back((*qie11Digis)[i][hw]);
+      for (int sample = 0; sample < digi.samples(); sample++) {
+        hf01_.data.push_back((*qie11Digis)[i][hcal::Flavor01::HEADER_WORDS + sample]);
+      }
+    } else if (digi.flavor() == 3) {
+      if (digi.detid().subdetId() != HcalBarrel)  // flavor 3 is kept for the barrel only
+        continue;
+      auto const id = digi.detid().rawId();
+      hf3_.ids.push_back(id);
+      for (int hw = 0; hw < hcal::Flavor3::HEADER_WORDS; hw++)
+        hf3_.data.push_back((*qie11Digis)[i][hw]);
+      for (int sample = 0; sample < digi.samples(); sample++) {
+        hf3_.data.push_back((*qie11Digis)[i][hcal::Flavor3::HEADER_WORDS + sample]);
+      }
+    }
+  }
+  // NOTE(review): nothing above limits the digi counts to config_.maxChannels*, the size of the device buffers
+  auto lambdaToTransfer = [&ctx](auto* dest, auto const& src) {
+    using vector_type = typename std::remove_reference<decltype(src)>::type;
+    using type = typename vector_type::value_type;
+    cudaCheck(cudaMemcpyAsync(dest, src.data(), src.size() * sizeof(type), cudaMemcpyHostToDevice, ctx.stream()));
+  };
+
+  lambdaToTransfer(df01_.data, hf01_.data);
+  lambdaToTransfer(df01_.ids, hf01_.ids);
+
+  lambdaToTransfer(df5_.data, hf5_.data);
+  lambdaToTransfer(df5_.ids, hf5_.ids);
+  lambdaToTransfer(df5_.npresamples, hf5_.npresamples);
+
+  lambdaToTransfer(df3_.data, hf3_.data);
+  lambdaToTransfer(df3_.ids, hf3_.ids);
+}
+
+void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};  // built from the state filled in acquire()
+
+  df01_.stride = hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
+  df01_.size = hf01_.ids.size();  // number of digis staged on the host in acquire()
+  df5_.stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
+  df5_.size = hf5_.ids.size();
+  df3_.stride = hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB);
+  df3_.size = hf3_.ids.size();
+
+  ctx.emplace(event, digisF01HEToken_, df01_);  // the member views are passed as lvalues
+  ctx.emplace(event, digisF5HBToken_, df5_);
+  ctx.emplace(event, digisF3HBToken_, df3_);
+  // commented-out variant below refers to outputGPU_/outputCPU_, which this class does not declare
+  /*
+
+#ifdef HCAL_RAWDECODE_CPUDEBUG
+    printf("f01he channels = %u f5hb channesl = %u\n",
+        outputCPU_.nchannels[hcal::raw::OutputF01HE], 
+        outputCPU_.nchannels[hcal::raw::OutputF5HB]);
+#endif
+
+    // FIXME: use sizes of views directly for cuda mem cpy?
+    auto const nchannelsF01HE = outputCPU_.nchannels[hcal::raw::OutputF01HE];
+    auto const nchannelsF5HB = outputCPU_.nchannels[hcal::raw::OutputF5HB];
+    outputGPU_.digisF01HE.size = nchannelsF01HE;
+    outputGPU_.digisF5HB.size = nchannelsF5HB;
+    outputGPU_.digisF01HE.stride = 
+        hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
+    outputGPU_.digisF5HB.stride = 
+        hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
+
+    hcal::DigiCollection<hcal::Flavor01> digisF01HE{outputGPU_.idsF01HE,
+        outputGPU_.digisF01HE, nchannelsF01HE, 
+        hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE)};
+    hcal::DigiCollection<hcal::Flavor5> digisF5HB{outputGPU_.idsF5HB,
+        outputGPU_.digisF5HB, outputGPU_.npresamplesF5HB, nchannelsF5HB, 
+        hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB)};
+
+    ctx.emplace(event, digisF01HEToken_, std::move(outputGPU_.digisF01HE));
+    ctx.emplace(event, digisF5HBToken_, std::move(outputGPU_.digisF5HB));
+
+    */
+}
+
+DEFINE_FWK_MODULE(HcalDigisProducerGPU);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
new file mode 100644
index 0000000000000..04cb786826015
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
@@ -0,0 +1,12 @@
+#include "HcalRawESProducerGPU.h"
+
+#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
+
+#include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
+
+#include <iostream>
+
+using HcalElectronicsMappingGPUESProducer =
+    HcalRawESProducerGPU<hcal::raw::ElectronicsMappingGPU, HcalElectronicsMap, HcalElectronicsMapRcd>;
+// presumably yields ElectronicsMappingGPU built from HcalElectronicsMap in HcalElectronicsMapRcd - see HcalRawESProducerGPU.h
+DEFINE_FWK_EVENTSETUP_MODULE(HcalElectronicsMappingGPUESProducer);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
new file mode 100644
index 0000000000000..d9af852a2889b
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
@@ -0,0 +1,206 @@
+#include <iostream>
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+// algorithm specific
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+//#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+//#include "CUDADataFormats/HcalDigi/interface/DigisCollection.h"
+
+//#include "CondFormats/DataRecord/interface/HcalMappingElectronicsRcd.h"
+//#include "EventFilter/HcalRawToDigi/interface/ElectronicsMappingGPU.h"
+
+#include "EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h"
+#include "EventFilter/HcalRawToDigi/plugins/DecodeGPU.h"
+#include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
+#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
+#include "DataFormats/FEDRawData/interface/FEDNumbering.h"
+
+class HcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HcalRawToDigiGPU(edm::ParameterSet const& ps);
+  ~HcalRawToDigiGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;  // packs FED data, calls hcal::raw::entryPoint
+  void produce(edm::Event&, edm::EventSetup const&) override;  // sets size/stride and publishes the device digis
+
+private:
+  edm::EDGetTokenT<FEDRawDataCollection> rawDataToken_;
+  using ProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>;
+  edm::EDPutTokenT<ProductTypef01> digisF01HEToken_;
+  using ProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>;
+  edm::EDPutTokenT<ProductTypef5> digisF5HBToken_;
+  using ProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>;
+  edm::EDPutTokenT<ProductTypef3> digisF3HBToken_;
+
+  cms::cuda::ContextState cudaState_;  // passed to the scoped contexts in both acquire() and produce()
+
+  std::vector<int> fedsToUnpack_;  // FED ids read from the "FEDs" parameter
+
+  hcal::raw::ConfigurationParameters config_;
+  // FIXME move this to use raii
+  hcal::raw::InputDataCPU inputCPU_;
+  hcal::raw::InputDataGPU inputGPU_;
+  hcal::raw::OutputDataGPU outputGPU_;
+  hcal::raw::ScratchDataGPU scratchGPU_;
+  hcal::raw::OutputDataCPU outputCPU_;
+};
+
+void HcalRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription description;
+
+  description.add<edm::InputTag>("InputLabel", edm::InputTag{"rawDataCollector"});
+  // default FED list: every id from MINHCALuTCAFEDID to MAXHCALuTCAFEDID, inclusive
+  std::vector<int> defaultFeds;
+  for (int fed = FEDNumbering::MINHCALuTCAFEDID; fed <= FEDNumbering::MAXHCALuTCAFEDID; ++fed)
+    defaultFeds.push_back(fed);
+  description.add<std::vector<int>>("FEDs", defaultFeds);
+  description.add<uint32_t>("maxChannelsF01HE", 10000u);
+  description.add<uint32_t>("maxChannelsF5HB", 10000u);
+  description.add<uint32_t>("maxChannelsF3HB", 10000u);
+  description.add<uint32_t>("nsamplesF01HE", 8);
+  description.add<uint32_t>("nsamplesF5HB", 8);
+  description.add<uint32_t>("nsamplesF3HB", 8);
+  description.add<std::string>("digisLabelF5HB", "f5HBDigisGPU");
+  description.add<std::string>("digisLabelF01HE", "f01HEDigisGPU");
+  description.add<std::string>("digisLabelF3HB", "f3HBDigisGPU");
+
+  // register the description under the module's default label
+  confDesc.add(std::string{"hcalRawToDigiGPU"}, description);
+}
+
+HcalRawToDigiGPU::HcalRawToDigiGPU(const edm::ParameterSet& ps)
+    : rawDataToken_{consumes<FEDRawDataCollection>(ps.getParameter<edm::InputTag>("InputLabel"))},
+      digisF01HEToken_{produces<ProductTypef01>(ps.getParameter<std::string>("digisLabelF01HE"))},
+      digisF5HBToken_{produces<ProductTypef5>(ps.getParameter<std::string>("digisLabelF5HB"))},
+      digisF3HBToken_{produces<ProductTypef3>(ps.getParameter<std::string>("digisLabelF3HB"))},
+      fedsToUnpack_{ps.getParameter<std::vector<int>>("FEDs")} {
+  config_.maxChannelsF01HE = ps.getParameter<uint32_t>("maxChannelsF01HE");
+  config_.maxChannelsF5HB = ps.getParameter<uint32_t>("maxChannelsF5HB");
+  config_.maxChannelsF3HB = ps.getParameter<uint32_t>("maxChannelsF3HB");
+  config_.nsamplesF01HE = ps.getParameter<uint32_t>("nsamplesF01HE");
+  config_.nsamplesF5HB = ps.getParameter<uint32_t>("nsamplesF5HB");
+  config_.nsamplesF3HB = ps.getParameter<uint32_t>("nsamplesF3HB");
+  // allocate all buffers once; output and scratch allocations take config_, so it must be filled first
+  inputCPU_.allocate();
+  inputGPU_.allocate();
+  outputGPU_.allocate(config_);
+  scratchGPU_.allocate(config_);
+  outputCPU_.allocate();
+}
+
+HcalRawToDigiGPU::~HcalRawToDigiGPU() {  // NOTE(review): inputCPU_/outputCPU_ have no deallocate() call here - confirm they clean up themselves
+  inputGPU_.deallocate();
+  outputGPU_.deallocate(config_);
+  scratchGPU_.deallocate(config_);
+}
+
+void HcalRawToDigiGPU::acquire(edm::Event const& event,
+                               edm::EventSetup const& setup,
+                               edm::WaitingTaskWithArenaHolder holder) {
+  // scoped CUDA context for this event; cudaState_ is handed over so produce() can reuse it
+  cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
+
+  // conditions: device-side electronics mapping, requested for this context's stream
+  edm::ESHandle<hcal::raw::ElectronicsMappingGPU> eMappingHandle;
+  setup.get<HcalElectronicsMapRcd>().get(eMappingHandle);
+  auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream());
+
+  // bundle up conditions
+  hcal::raw::ConditionsProducts conditions{eMappingProduct};
+
+  // event data
+  edm::Handle<FEDRawDataCollection> rawDataHandle;
+  event.getByToken(rawDataToken_, rawDataHandle);
+
+  // iterate over feds
+  // TODO: another idea
+  //   - loop over all feds to unpack and enqueue cuda memcpy
+  //   - accumulate the sizes
+  //   - after the loop launch cuda memcpy for sizes
+  //   - enqueue the kernel
+  uint32_t currentCummOffset = 0;
+  uint32_t counter = 0;
+  for (auto const& fed : fedsToUnpack_) {
+    auto const& data = rawDataHandle->FEDData(fed);
+    auto const nbytes = data.size();
+
+    // skip empty feds
+    if (nbytes < hcal::raw::empty_event_size)
+      continue;
+
+#ifdef HCAL_RAWDECODE_CPUDEBUG
+    printf("fed = %d nbytes = %zu\n", fed, nbytes);  // nbytes is a size_t, hence %zu
+#endif
+
+    // copy raw data into plain buffer; NOTE(review): no capacity check against inputCPU_.data here
+    std::memcpy(inputCPU_.data.data() + currentCummOffset, data.data(), nbytes);
+    // set the offset in bytes from the start; NOTE(review): counter is not range-checked either
+    inputCPU_.offsets[counter] = currentCummOffset;
+    inputCPU_.feds[counter] = fed;
+
+    // this is the current offset into the vector
+    currentCummOffset += nbytes;
+    ++counter;
+  }
+  // hand over to the decoder: counter = number of FEDs copied, currentCummOffset = total bytes copied
+  hcal::raw::entryPoint(inputCPU_,
+                        inputGPU_,
+                        outputGPU_,
+                        scratchGPU_,
+                        outputCPU_,
+                        conditions,
+                        config_,
+                        ctx.stream(),
+                        counter,
+                        currentCummOffset);
+}
+
+void HcalRawToDigiGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};  // built from the state filled in acquire()
+
+#ifdef HCAL_RAWDECODE_CPUDEBUG
+  printf("f01he channels = %u f5hb channesl = %u\n",
+         outputCPU_.nchannels[hcal::raw::OutputF01HE],
+         outputCPU_.nchannels[hcal::raw::OutputF5HB]);
+#endif
+  // per-collection channel counts are read from outputCPU_.nchannels, which acquire() passed to entryPoint
+  // FIXME: use sizes of views directly for cuda mem cpy?
+  auto const nchannelsF01HE = outputCPU_.nchannels[hcal::raw::OutputF01HE];
+  auto const nchannelsF5HB = outputCPU_.nchannels[hcal::raw::OutputF5HB];
+  auto const nchannelsF3HB = outputCPU_.nchannels[hcal::raw::OutputF3HB];
+  outputGPU_.digisF01HE.size = nchannelsF01HE;
+  outputGPU_.digisF5HB.size = nchannelsF5HB;
+  outputGPU_.digisF3HB.size = nchannelsF3HB;
+  outputGPU_.digisF01HE.stride = hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
+  outputGPU_.digisF5HB.stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
+  outputGPU_.digisF3HB.stride = hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB);
+
+  /*
+    hcal::DigiCollection<hcal::Flavor01> digisF01HE{outputGPU_.idsF01HE,
+        outputGPU_.digisF01HE, nchannelsF01HE, 
+        hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE)};
+    hcal::DigiCollection<hcal::Flavor5> digisF5HB{outputGPU_.idsF5HB,
+        outputGPU_.digisF5HB, outputGPU_.npresamplesF5HB, nchannelsF5HB, 
+        hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB)};
+        */
+  // NOTE(review): outputGPU_ members are std::move'd yet reused every event - presumably views, so the move copies; confirm
+  ctx.emplace(event, digisF01HEToken_, std::move(outputGPU_.digisF01HE));
+  ctx.emplace(event, digisF5HBToken_, std::move(outputGPU_.digisF5HB));
+  ctx.emplace(event, digisF3HBToken_, std::move(outputGPU_.digisF3HB));
+}
+
+DEFINE_FWK_MODULE(HcalRawToDigiGPU);
diff --git a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
index ac94d61e12494..8b00269aa4d9a 100644
--- a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
@@ -1,19 +1,27 @@
 <use name="boost"/>
-<use name="eigen"/>
 <use name="clhep"/>
+<use name="cuda"/>
+<use name="eigen"/>
+<use name="rootminuit2"/>
+<use name="vdt_headers"/>
+<use name="CUDADataFormats/Common" />
+<use name="CUDADataFormats/HcalDigi"/>
+<use name="CUDADataFormats/HcalRecHitSoA" />
+<use name="CalibCalorimetry/HcalAlgos"/>
+<use name="CalibFormats/CaloObjects"/>
+<use name="CalibFormats/HcalObjects"/>
+<use name="CondFormats/DataRecord"/>
 <use name="DataFormats/HcalDigi"/>
 <use name="DataFormats/HcalRecHit"/>
 <use name="DataFormats/TrackReco"/>
-<use name="CalibFormats/HcalObjects"/>
-<use name="CalibFormats/CaloObjects"/>
-<use name="CalibCalorimetry/HcalAlgos"/>
 <use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
 <use name="FWCore/MessageLogger"/>
-<use name="CondFormats/DataRecord"/>
+<use name="FWCore/ParameterSet"/>
+<use name="Geometry/CaloTopology"/>
+<use name="Geometry/HcalCommonData"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="RecoLocalCalo/EcalRecAlgos"/>
-<use name="vdt_headers"/>
-<use name="rootminuit2"/>
 <export>
   <lib name="1"/>
 </export>
diff --git a/RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml
new file mode 100644
index 0000000000000..30f76131622bd
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml
@@ -0,0 +1,2 @@
+<bin name="generateQIEShapes" file="generateQIEShapes.cc">
+</bin>
diff --git a/RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc b/RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc
new file mode 100644
index 0000000000000..39d5195e7438d
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc
@@ -0,0 +1,82 @@
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+//
+// Pregenerate QIE shapes using hardcoded arrays.
+// This is taken directly from CondFormats/HcalObjects/src/HcalQIEData.cc.
+// For the CPU workload, this generation typically runs upon conditions retrieval.
+//
+// For the GPU workload, it is better to put the generated values into constant memory.
+// Either this or just use global memory (for global memory, we need getters...).
+// Choosing constant memory, as these
+// values are statically known and never change. Any change in any case requires
+// recompilation!
+//
+
+const float binMin[32] = {-1, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+                          16, 18, 20, 22, 24, 26, 28, 31, 34, 37, 40, 44, 48, 52, 57, 62};
+// seed values of the first range: binMin feeds the qie8 table, binMin2 the qie11 table (see main)
+const float binMin2[64] = {-0.5,  0.5,   1.5,   2.5,   3.5,   4.5,   5.5,   6.5,   7.5,   8.5,   9.5,
+                           10.5,  11.5,  12.5,  13.5,  14.5,  // 16 bins with width 1x
+                           15.5,  17.5,  19.5,  21.5,  23.5,  25.5,  27.5,  29.5,  31.5,  33.5,  35.5,
+                           37.5,  39.5,  41.5,  43.5,  45.5,  47.5,  49.5,  51.5,  53.5,  // 20 bins with width 2x
+                           55.5,  59.5,  63.5,  67.5,  71.5,  75.5,  79.5,  83.5,  87.5,  91.5,  95.5,
+                           99.5,  103.5, 107.5, 111.5, 115.5, 119.5, 123.5, 127.5, 131.5, 135.5,  // 21 bins with width 4x
+                           139.5, 147.5, 155.5, 163.5, 171.5, 179.5, 187.5};  // 7 bins with width 8x
+// number of bins per range; these match the lengths of binMin and binMin2 respectively
+constexpr uint32_t nbins_qie8 = 32;
+constexpr uint32_t nbins_qie11 = 64;
+
+void dump(std::vector<float> const& vec, std::string const& name) {
+  // Print vec to stdout as the definition of a float array called name, eight values per row.
+  std::stringstream out;
+  out << "float const " << name << "[" << vec.size() << "] = {";
+  for (auto it = vec.begin(); it != vec.end(); ++it) {
+    // every group of eight values starts on a new line
+    if ((it - vec.begin()) % 8 == 0)
+      out << std::endl;
+    out << *it;
+    // all values but the last one are followed by a separator
+    if (it + 1 != vec.end())
+      out << ", ";
+  }
+  out << "};";
+  std::cout << out.str() << std::endl;
+}
+
+void generate(uint32_t const nbins, float const* initValues, std::vector<float>& values) {
+  // Fill values with 4 ranges of nbins bins plus one closing edge; values is grown to 4 * nbins + 1 if shorter.
+  if (values.size() < 4 * nbins + 1)
+    values.resize(4 * nbins + 1);
+  // range 0 is copied verbatim from the nbins seed values in initValues
+  for (uint32_t adc = 0; adc < nbins; adc++)
+    values[adc] = initValues[adc];
+  // ranges 1..3: steps of range 0 scaled by factor^range, each range restarting overlap bins before its offset
+  int const factor = nbins == 32 ? 5 : 8;
+  uint32_t const overlap = nbins == 32 ? 2 : 3;
+  int scale = 1;
+  for (uint32_t range = 1; range < 4; range++) {
+    scale *= factor;
+    auto const index_offset = range * nbins;
+    values[index_offset] = values[index_offset - overlap];
+    for (uint32_t i = 1; i < nbins; i++)
+      values[index_offset + i] = values[index_offset + i - 1] + scale * (values[i] - values[i - 1]);
+  }
+  // closing entry: extend the table by one more step of the last width
+  values[nbins * 4] = 2 * values[nbins * 4 - 1] - values[nbins * 4 - 2];
+}
+
+int main(int argc, char* argv[]) {
+  // Build the qie8 and qie11 tables (4 * nbins + 1 entries each) and
+  // print them to stdout as array definitions, qie8 first.
+  std::vector<float> valuesqie8(nbins_qie8 * 4 + 1);
+  std::vector<float> valuesqie11(nbins_qie11 * 4 + 1);
+  generate(nbins_qie8, binMin, valuesqie8);
+  dump(valuesqie8, std::string{"qie8shape"});
+
+  generate(nbins_qie11, binMin2, valuesqie11);
+  dump(valuesqie11, std::string{"qie11shape"});
+
+  return 0;
+}
diff --git a/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h b/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h
new file mode 100644
index 0000000000000..4c8f9c03d22ef
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h
@@ -0,0 +1,57 @@
+#ifndef RecoLocalCalo_HcalRecAlgos_interface_HcalRecoParamsWithPulseShapesGPU_h
+#define RecoLocalCalo_HcalRecAlgos_interface_HcalRecoParamsWithPulseShapesGPU_h
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+class HcalRecoParams;
+
+//
+// Host-side copy of the HCAL reco params, bundled with precomputed pulse shape tables, plus its device copy.
+// TODO: HcalPulseShapes will need to be used via ESSource.
+// This is a workaround: precompute/store/transfer what's needed only
+class HcalRecoParamsWithPulseShapesGPU {
+public:
+  struct Product {
+    ~Product();  // releases every buffer below with cudaFree
+    uint32_t *param1 = nullptr, *param2 = nullptr;  // one entry per channel
+    uint32_t *ids = nullptr;  // per channel: index of the pulse shape slice in the tables below
+
+    // device copies of the tables read from FitterFuncs::PulseShapeFunctor, one slice per distinct pulse shape
+    float *acc25nsVec = nullptr, *diff25nsItvlVec = nullptr, *accVarLenIdxMinusOneVec = nullptr,
+          *diffVarItvlIdxMinusOneVec = nullptr, *accVarLenIdxZEROVec = nullptr, *diffVarItvlIdxZEROVec = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // rearrange reco params (barrel container first, then endcap) and precompute the pulse shape tables
+  HcalRecoParamsWithPulseShapesGPU(HcalRecoParams const &);
+
+  // will trigger deallocation of Product thru ~Product
+  ~HcalRecoParamsWithPulseShapesGPU() = default;
+
+  // get device pointers; allocation and the async host-to-device copies go through product_ on the given stream
+  Product const &getProduct(cudaStream_t) const;
+
+  // fixed label for this product; NOTE(review): where it is consumed is not visible from here
+  static std::string name() { return std::string{"hcalRecoParamsWithPulseShapesGPU"}; }
+
+private:
+  uint64_t totalChannels_;  // hb + he: size of param1_, param2_ and ids_
+  std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> param1_;
+  std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> param2_;
+  std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> ids_;
+
+  std::vector<float, cms::cuda::HostAllocator<float>> acc25nsVec_;                 // 256 per pulse shape
+  std::vector<float, cms::cuda::HostAllocator<float>> diff25nsItvlVec_;            // 256 per pulse shape
+  std::vector<float, cms::cuda::HostAllocator<float>> accVarLenIdxMinusOneVec_;    // 25 per pulse shape
+  std::vector<float, cms::cuda::HostAllocator<float>> diffVarItvlIdxMinusOneVec_;  // 25 per pulse shape
+  std::vector<float, cms::cuda::HostAllocator<float>> accVarLenIdxZEROVec_;        // 25 per pulse shape
+  std::vector<float, cms::cuda::HostAllocator<float>> diffVarItvlIdxZEROVec_;      // 25 per pulse shape
+
+  cms::cuda::ESProduct<Product> product_;
+#endif
+};
+
+#endif
diff --git a/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
new file mode 100644
index 0000000000000..8ae2cd1a88880
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
@@ -0,0 +1,222 @@
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+
+#include "CondFormats/HcalObjects/interface/HcalRecoParams.h"
+#include "CalibCalorimetry/HcalAlgos/interface/HcalPulseShapes.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include <unordered_map>
+
+// FIXME: add proper getters to conditions
+HcalRecoParamsWithPulseShapesGPU::HcalRecoParamsWithPulseShapesGPU(HcalRecoParams const& recoParams)
+    : totalChannels_{recoParams.getAllContainers()[0].second.size() + recoParams.getAllContainers()[1].second.size()},
+      param1_(totalChannels_),
+      param2_(totalChannels_),
+      ids_(totalChannels_) {
+#ifdef HCAL_MAHI_CPUDEBUG
+  printf("hello from a reco params with pulse shapes\n");
+#endif
+  // Flatten the per-channel params (barrel first, then endcap) and precompute one table slice per pulse shape.
+  auto const containers = recoParams.getAllContainers();
+
+  HcalPulseShapes pulseShapes;
+  std::unordered_map<unsigned int, uint32_t> idCache;  // pulseShapeId -> index of its slice in the value arrays
+
+  // fill in hb (barrel)
+  auto const& barrelValues = containers[0].second;
+  for (uint64_t i = 0; i < barrelValues.size(); ++i) {
+    param1_[i] = barrelValues[i].param1();
+    param2_[i] = barrelValues[i].param2();
+
+    auto const pulseShapeId = barrelValues[i].pulseShapeID();
+    // FIXME: 0 throws upon look up to HcalPulseShapes
+    // although comments state that 0 is reserved,
+    // HcalPulseShapes::getShape throws on 0!
+    if (pulseShapeId == 0) {
+      ids_[i] = 0;
+      continue;
+    }
+    if (auto const iter = idCache.find(pulseShapeId); iter == idCache.end()) {
+      // new guy
+      auto const newId = idCache.size();
+      idCache[pulseShapeId] = newId;
+      // this will be the id
+      ids_[i] = newId;
+
+      // resize value arrays
+      acc25nsVec_.resize(acc25nsVec_.size() + HcalConst::maxPSshapeBin);
+      diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + HcalConst::maxPSshapeBin);
+      accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + HcalConst::nsPerBX);
+      diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + HcalConst::nsPerBX);
+      accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + HcalConst::nsPerBX);
+      diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + HcalConst::nsPerBX);
+
+      // precompute and get values from the functor
+      auto const& pulseShape = pulseShapes.getShape(pulseShapeId);
+      FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, 10};
+      auto const offset256 = newId * HcalConst::maxPSshapeBin;
+      auto const offset25 = newId * HcalConst::nsPerBX;
+      // the slice of shape newId starts right at these offsets, i.e. in the space added by the resize above
+      for (int j = 0; j < HcalConst::maxPSshapeBin; j++) {
+        acc25nsVec_[offset256 + j] = functor.get_acc25nsVec()[j];
+        diff25nsItvlVec_[offset256 + j] = functor.get_diff25nsItvlVec()[j];
+      }
+
+      for (int j = 0; j < HcalConst::nsPerBX; j++) {
+        accVarLenIdxMinusOneVec_[offset25 + j] = functor.get_accVarLenIdxMinusOneVec()[j];
+        diffVarItvlIdxMinusOneVec_[offset25 + j] = functor.get_diffVarItvlIdxMinusOneVec()[j];
+        accVarLenIdxZEROVec_[offset25 + j] = functor.get_accVarLenIdxZEROVec()[j];
+        diffVarItvlIdxZEROVec_[offset25 + j] = functor.get_diffVarItvlIdxZEROVec()[j];
+      }
+    } else {
+      // already recorded this pulse shape, just set id
+      ids_[i] = iter->second;
+    }
+#ifdef HCAL_MAHI_CPUDEBUG
+    if (barrelValues[i].rawId() == DETID_TO_DEBUG) {
+      printf("recoShapeId = %u myid = %u\n", pulseShapeId, ids_[i]);
+    }
+#endif
+  }
+
+  // fill in he (endcap), stored right after the barrel channels
+  auto const& endcapValues = containers[1].second;
+  auto const offset = barrelValues.size();
+  for (uint64_t i = 0; i < endcapValues.size(); ++i) {
+    param1_[i + offset] = endcapValues[i].param1();
+    param2_[i + offset] = endcapValues[i].param2();
+
+    auto const pulseShapeId = endcapValues[i].pulseShapeID();
+    // FIXME: 0 throws upon look up to HcalPulseShapes
+    // although comments state that 0 is reserved,
+    // HcalPulseShapes::getShape throws on 0!
+    if (pulseShapeId == 0) {
+      ids_[i + offset] = 0;
+      continue;
+    }
+    if (auto const iter = idCache.find(pulseShapeId); iter == idCache.end()) {
+      // new guy
+      auto const newId = idCache.size();
+      idCache[pulseShapeId] = newId;
+      // this will be the id
+      ids_[i + offset] = newId;
+
+      // resize value arrays
+      acc25nsVec_.resize(acc25nsVec_.size() + HcalConst::maxPSshapeBin);
+      diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + HcalConst::maxPSshapeBin);
+      accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + HcalConst::nsPerBX);
+      diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + HcalConst::nsPerBX);
+      accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + HcalConst::nsPerBX);
+      diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + HcalConst::nsPerBX);
+
+      // precompute and get values from the functor
+      auto const& pulseShape = pulseShapes.getShape(pulseShapeId);
+      FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, 10};
+      auto const offset256 = newId * HcalConst::maxPSshapeBin;
+      auto const offset25 = newId * HcalConst::nsPerBX;
+      // the slice of shape newId starts right at these offsets, i.e. in the space added by the resize above
+      for (int j = 0; j < HcalConst::maxPSshapeBin; j++) {
+        acc25nsVec_[offset256 + j] = functor.get_acc25nsVec()[j];
+        diff25nsItvlVec_[offset256 + j] = functor.get_diff25nsItvlVec()[j];
+      }
+
+      for (int j = 0; j < HcalConst::nsPerBX; j++) {
+        accVarLenIdxMinusOneVec_[offset25 + j] = functor.get_accVarLenIdxMinusOneVec()[j];
+        diffVarItvlIdxMinusOneVec_[offset25 + j] = functor.get_diffVarItvlIdxMinusOneVec()[j];
+        accVarLenIdxZEROVec_[offset25 + j] = functor.get_accVarLenIdxZEROVec()[j];
+        diffVarItvlIdxZEROVec_[offset25 + j] = functor.get_diffVarItvlIdxZEROVec()[j];
+      }
+    } else {
+      // already recorded this pulse shape, just set id
+      ids_[i + offset] = iter->second;
+    }
+  }
+
+#ifdef HCAL_MAHI_CPUDEBUG
+  for (auto const& p : idCache)
+    printf("recoPulseShapeId = %u id = %u\n", p.first, p.second);
+#endif
+}
+
+HcalRecoParamsWithPulseShapesGPU::Product::~Product() {
+  // release all device buffers; pointers never allocated are still nullptr, for which cudaFree does nothing
+  cudaCheck(cudaFree(param1));
+  cudaCheck(cudaFree(param2));
+  cudaCheck(cudaFree(ids));
+  cudaCheck(cudaFree(acc25nsVec));
+  cudaCheck(cudaFree(diff25nsItvlVec));
+  cudaCheck(cudaFree(accVarLenIdxMinusOneVec));
+  cudaCheck(cudaFree(diffVarItvlIdxMinusOneVec));
+  cudaCheck(cudaFree(accVarLenIdxZEROVec));
+  cudaCheck(cudaFree(diffVarItvlIdxZEROVec));
+}
+
+HcalRecoParamsWithPulseShapesGPU::Product const& HcalRecoParamsWithPulseShapesGPU::getProduct(
+    cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](HcalRecoParamsWithPulseShapesGPU::Product& product, cudaStream_t cudaStream) {
+        // allocate one device buffer per host-side vector, sized to match it
+        cudaCheck(cudaMalloc((void**)&product.param1, this->param1_.size() * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&product.param2, this->param2_.size() * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&product.ids, this->ids_.size() * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&product.acc25nsVec, this->acc25nsVec_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.diff25nsItvlVec, this->diff25nsItvlVec_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.accVarLenIdxMinusOneVec,
+                             this->accVarLenIdxMinusOneVec_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.diffVarItvlIdxMinusOneVec,
+                             this->diffVarItvlIdxMinusOneVec_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.accVarLenIdxZEROVec, this->accVarLenIdxZEROVec_.size() * sizeof(float)));
+        cudaCheck(
+            cudaMalloc((void**)&product.diffVarItvlIdxZEROVec, this->diffVarItvlIdxZEROVec_.size() * sizeof(float)));
+
+        // queue the host-to-device copy of every vector on cudaStream (asynchronous calls)
+        cudaCheck(cudaMemcpyAsync(product.param1,
+                                  this->param1_.data(),
+                                  this->param1_.size() * sizeof(uint32_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.param2,
+                                  this->param2_.data(),
+                                  this->param2_.size() * sizeof(uint32_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(
+            product.ids, this->ids_.data(), this->ids_.size() * sizeof(uint32_t), cudaMemcpyHostToDevice, cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.acc25nsVec,
+                                  this->acc25nsVec_.data(),
+                                  this->acc25nsVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.diff25nsItvlVec,
+                                  this->diff25nsItvlVec_.data(),
+                                  this->diff25nsItvlVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.accVarLenIdxMinusOneVec,
+                                  this->accVarLenIdxMinusOneVec_.data(),
+                                  this->accVarLenIdxMinusOneVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.diffVarItvlIdxMinusOneVec,
+                                  this->diffVarItvlIdxMinusOneVec_.data(),
+                                  this->diffVarItvlIdxMinusOneVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.accVarLenIdxZEROVec,
+                                  this->accVarLenIdxZEROVec_.data(),
+                                  this->accVarLenIdxZEROVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.diffVarItvlIdxZEROVec,
+                                  this->diffVarItvlIdxZEROVec_.data(),
+                                  this->diffVarItvlIdxZEROVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(HcalRecoParamsWithPulseShapesGPU);
diff --git a/RecoLocalCalo/HcalRecProducers/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
index c3ae589a0c0a7..7375c85d61a54 100644
--- a/RecoLocalCalo/HcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
@@ -1,8 +1,16 @@
-<flags EDM_PLUGIN="1"/>
+<use name="boost"/>
+<use name="cuda"/>
+<use name="CUDADataFormats/Common" />
+<use name="CUDADataFormats/HcalCommon"/>
+<use name="CUDADataFormats/HcalDigi"/>
+<use name="CUDADataFormats/HcalRecHitSoA"/>
 <use name="CalibCalorimetry/HcalAlgos"/>
 <use name="CalibFormats/HcalObjects"/>
-<use name="RecoLocalCalo/HcalRecAlgos"/>
-<use name="FWCore/Framework"/>
 <use name="DataFormats/Common"/>
+<use name="FWCore/Framework"/>
+<use name="Geometry/HcalCommonData"/>
 <use name="Geometry/Records"/>
-<use name="boost"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="RecoLocalCalo/HcalRecAlgos"/>
+<flags EDM_PLUGIN="1"/>
diff --git a/RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml
new file mode 100644
index 0000000000000..a804a7fe4b70e
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml
@@ -0,0 +1,7 @@
+<bin name="makeHcalRecHitGpuValidationPlots" file="makeHcalRecHitGpuValidationPlots.cpp">
+    <use name="root"/>
+    <use name="rootgraphics"/>
+    <use name="DataFormats/Common"/>
+    <use name="DataFormats/HcalDetId"/>
+    <use name="CUDADataFormats/HcalRecHitSoA"/>
+</bin>
diff --git a/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp b/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp
new file mode 100644
index 0000000000000..5ef7861f43232
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp
@@ -0,0 +1,282 @@
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <TCanvas.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TTree.h>
+#include <TPaveStats.h>
+
+#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h"
+//#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+
+#define CREATE_HIST_1D(varname, nbins, first, last) auto varname = new TH1D(#varname, #varname, nbins, first, last)
+// 2D variant: the same binning (nbins, first, last) is applied to both axes
+#define CREATE_HIST_2D(varname, nbins, first, last) \
+  auto varname = new TH2D(#varname, #varname, nbins, first, last, nbins, first, last)
+
+int main(int argc, char* argv[]) {
+  if (argc < 3) {
+    std::cout << "run with: ./<exe> <path to input file> <path to output file>\n";
+    return 1;
+  }
+
+  std::string inFileName{argv[1]};
+  std::string outFileName{argv[2]};
+
+  // branch buffers, bound to the input tree with SetBranchAddress below
+  edm::Wrapper<HBHERecHitCollection>* wcpu = nullptr;
+  edm::Wrapper<HBHERecHitCollection>* wgpu = nullptr;
+
+  // prep output
+  TFile rfout{outFileName.c_str(), "recreate"};
+
+  CREATE_HIST_1D(hEnergyM0HBGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyM0HEGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyM0HBCPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyM0HECPU, 1000, 0, 100);
+
+  CREATE_HIST_1D(hEnergyHBGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyHBCPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyHEGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyHECPU, 1000, 0, 100);
+
+  CREATE_HIST_1D(hChi2HBGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hChi2HBCPU, 1000, 0, 100);
+  CREATE_HIST_1D(hChi2HEGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hChi2HECPU, 1000, 0, 100);
+
+  CREATE_HIST_2D(hEnergyHBGPUvsCPU, 1000, 0, 100);
+  CREATE_HIST_2D(hEnergyHEGPUvsCPU, 1000, 0, 100);
+  CREATE_HIST_2D(hChi2HBGPUvsCPU, 1000, 0, 100);
+  CREATE_HIST_2D(hChi2HEGPUvsCPU, 1000, 0, 100);
+
+  CREATE_HIST_2D(hEnergyM0HBGPUvsCPU, 1000, 0, 100);
+  CREATE_HIST_2D(hEnergyM0HEGPUvsCPU, 1000, 0, 100);
+
+  // prep input; Get() yields a null pointer when the file is unreadable or has no such tree
+  TFile rfin{inFileName.c_str()};
+  TTree* rt = (TTree*)rfin.Get("Events");
+  if (rt == nullptr) {
+    std::cerr << "*** no Events tree found in " << inFileName << std::endl;
+    return 1;
+  }
+  rt->SetBranchAddress("HBHERecHitsSorted_hcalCPURecHitsProducer_recHitsLegacyLabelOut_RECO.", &wgpu);
+  rt->SetBranchAddress("HBHERecHitsSorted_hbheprereco__RECO.", &wcpu);
+
+  // accumulate: loop over the CPU rec hits and look up the GPU rec hit with the same detector id
+  auto const nentries = rt->GetEntries();
+  std::cout << ">>> nentries = " << nentries << std::endl;
+  for (int ie = 0; ie < nentries; ++ie) {
+    rt->GetEntry(ie);
+
+    auto const& gpuProduct = wgpu->bareProduct();
+    auto const& cpuProduct = wcpu->bareProduct();
+
+    auto const ncpu = cpuProduct.size();
+    auto const ngpu = gpuProduct.size();
+
+    if (ngpu != ncpu) {
+      std::cerr << "*** mismatch in number of rec hits for event " << ie << std::endl
+                << ">>> ngpu = " << ngpu << std::endl
+                << ">>> ncpu = " << ncpu << std::endl;
+    }
+
+    for (uint32_t ich = 0; ich < ncpu; ich++) {
+      auto const& cpurh = cpuProduct[ich];
+      auto const& did = cpurh.id();
+      auto iter2gpu = gpuProduct.find(did);
+
+      if (iter2gpu == gpuProduct.end()) {
+        std::cerr << "missing " << did << std::endl;
+        continue;
+      }
+
+      assert(iter2gpu->id().rawId() == did.rawId());
+
+      auto const gpu_energy_m0 = iter2gpu->eraw();
+      auto const cpu_energy_m0 = cpurh.eraw();
+      auto const gpu_energy = iter2gpu->energy();
+      auto const cpu_energy = cpurh.energy();
+      auto const gpu_chi2 = iter2gpu->chi2();
+      auto const cpu_chi2 = cpurh.chi2();
+
+      if (did.subdetId() == HcalBarrel) {
+        hEnergyM0HBGPU->Fill(gpu_energy_m0);
+        hEnergyM0HBCPU->Fill(cpu_energy_m0);
+        hEnergyM0HBGPUvsCPU->Fill(cpu_energy_m0, gpu_energy_m0);
+
+        hEnergyHBGPU->Fill(gpu_energy);
+        hEnergyHBCPU->Fill(cpu_energy);
+        hEnergyHBGPUvsCPU->Fill(cpu_energy, gpu_energy);
+        hChi2HBGPU->Fill(gpu_chi2);
+        hChi2HBCPU->Fill(cpu_chi2);
+        hChi2HBGPUvsCPU->Fill(cpu_chi2, gpu_chi2);
+      } else if (did.subdetId() == HcalEndcap) {
+        hEnergyM0HEGPU->Fill(gpu_energy_m0);
+        hEnergyM0HECPU->Fill(cpu_energy_m0);
+        hEnergyM0HEGPUvsCPU->Fill(cpu_energy_m0, gpu_energy_m0);
+
+        hEnergyHEGPU->Fill(gpu_energy);
+        hEnergyHECPU->Fill(cpu_energy);
+        hEnergyHEGPUvsCPU->Fill(cpu_energy, gpu_energy);
+
+        hChi2HEGPU->Fill(gpu_chi2);
+        hChi2HECPU->Fill(cpu_chi2);
+        hChi2HEGPUvsCPU->Fill(cpu_chi2, gpu_chi2);
+      }
+    }
+  }
+
+  // draw: odd pads overlay the CPU (black) and GPU (blue) 1D histograms, even pads show GPU vs CPU in 2D
+  {
+    TCanvas c{"plots", "plots", 4200, 6200};
+    c.Divide(4, 3);
+    c.cd(1);
+    {
+      gPad->SetLogy();
+      hEnergyM0HBCPU->SetLineColor(kBlack);
+      hEnergyM0HBCPU->SetLineWidth(1.);
+      hEnergyM0HBCPU->Draw("");
+      hEnergyM0HBGPU->SetLineColor(kBlue);
+      hEnergyM0HBGPU->SetLineWidth(1.);
+      hEnergyM0HBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergyM0HBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(2);
+    {
+      gPad->SetLogz();
+      hEnergyM0HBGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hEnergyM0HBGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hEnergyM0HBGPUvsCPU->Draw("colz");
+    }
+    c.cd(3);
+    {
+      gPad->SetLogy();
+      hEnergyM0HECPU->SetLineColor(kBlack);
+      hEnergyM0HECPU->SetLineWidth(1.);
+      hEnergyM0HECPU->Draw("");
+      hEnergyM0HEGPU->SetLineColor(kBlue);
+      hEnergyM0HEGPU->SetLineWidth(1.);
+      hEnergyM0HEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergyM0HEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(4);
+    {
+      gPad->SetLogz();
+      hEnergyM0HEGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hEnergyM0HEGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hEnergyM0HEGPUvsCPU->Draw("colz");
+    }
+    c.cd(5);
+    {
+      gPad->SetLogy();
+      hEnergyHBCPU->SetLineColor(kBlack);
+      hEnergyHBCPU->SetLineWidth(1.);
+      hEnergyHBCPU->Draw("");
+      hEnergyHBGPU->SetLineColor(kBlue);
+      hEnergyHBGPU->SetLineWidth(1.);
+      hEnergyHBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergyHBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(6);
+    {
+      gPad->SetLogz();
+      hEnergyHBGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hEnergyHBGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hEnergyHBGPUvsCPU->Draw("colz");
+    }
+    c.cd(7);
+    {
+      gPad->SetLogy();
+      hEnergyHECPU->SetLineColor(kBlack);
+      hEnergyHECPU->SetLineWidth(1.);
+      hEnergyHECPU->Draw("");
+      hEnergyHEGPU->SetLineColor(kBlue);
+      hEnergyHEGPU->SetLineWidth(1.);
+      hEnergyHEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergyHEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(8);
+    {
+      gPad->SetLogz();
+      hEnergyHEGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hEnergyHEGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hEnergyHEGPUvsCPU->Draw("colz");
+    }
+    c.cd(9);
+    {
+      gPad->SetLogy();
+      hChi2HBCPU->SetLineColor(kBlack);
+      hChi2HBCPU->SetLineWidth(1.);
+      hChi2HBCPU->Draw("");
+      hChi2HBGPU->SetLineColor(kBlue);
+      hChi2HBGPU->SetLineWidth(1.);
+      hChi2HBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hChi2HBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(10);
+    {
+      gPad->SetLogz();
+      hChi2HBGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hChi2HBGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hChi2HBGPUvsCPU->Draw("colz");
+    }
+    c.cd(11);
+    {
+      gPad->SetLogy();
+      hChi2HECPU->SetLineColor(kBlack);
+      hChi2HECPU->SetLineWidth(1.);
+      hChi2HECPU->Draw("");
+      hChi2HEGPU->SetLineColor(kBlue);
+      hChi2HEGPU->SetLineWidth(1.);
+      hChi2HEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hChi2HEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(12);
+    {
+      gPad->SetLogz();
+      hChi2HEGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hChi2HEGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hChi2HEGPUvsCPU->Draw("colz");
+    }
+    c.SaveAs("plots.pdf");
+  }
+
+  rfin.Close();
+  rfout.Write();
+  rfout.Close();
+}
diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
new file mode 100644
index 0000000000000..f47f698c6b59a
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -0,0 +1,276 @@
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+#include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+
+#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
+
+//#include "CondFormats/DataRecord/interface/HcalPedestalsRcd.h"
+#include "RecoLocalCalo/HcalRecProducers/src/HcalCombinedRecordsGPU.h"
+
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalLUTCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalPedestalWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIECodersGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRespCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIETypesGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMCharacteristicsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
+
+#include "RecoLocalCalo/HcalRecAlgos/interface/DeclsForKernels.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/MahiGPU.h"
+
+class HBHERecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HBHERecHitProducerGPU(edm::ParameterSet const&);
+  ~HBHERecHitProducerGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef01> digisTokenF01HE_;
+
+  using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef5> digisTokenF5HB_;
+
+  using IProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef3> digisTokenF3HB_;
+
+  using RecHitType = hcal::RecHitCollection<hcal::common::ViewStoragePolicy>;
+  using OProductType = cms::cuda::Product<RecHitType>;
+  edm::EDPutTokenT<OProductType> rechitsM0Token_;
+
+  hcal::mahi::ConfigParameters configParameters_;
+  hcal::mahi::OutputDataGPU outputGPU_;
+  hcal::mahi::ScratchDataGPU scratchGPU_;
+  cms::cuda::ContextState cudaState_;
+};
+
+HBHERecHitProducerGPU::HBHERecHitProducerGPU(edm::ParameterSet const& ps)
+    : digisTokenF01HE_{consumes<IProductTypef01>(ps.getParameter<edm::InputTag>("digisLabelF01HE"))},
+      digisTokenF5HB_{consumes<IProductTypef5>(ps.getParameter<edm::InputTag>("digisLabelF5HB"))},
+      digisTokenF3HB_{consumes<IProductTypef3>(ps.getParameter<edm::InputTag>("digisLabelF3HB"))},
+      rechitsM0Token_{produces<OProductType>(ps.getParameter<std::string>("recHitsLabelM0HBHE"))} {
+  configParameters_.maxChannels = ps.getParameter<uint32_t>("maxChannels");
+  configParameters_.maxTimeSamples = ps.getParameter<uint32_t>("maxTimeSamples");
+  configParameters_.pulseOffsets = ps.getParameter<std::vector<int>>("pulseOffsets");
+  configParameters_.kprep1dChannelsPerBlock = ps.getParameter<uint32_t>("kprep1dChannelsPerBlock");
+  configParameters_.sipmQTSShift = ps.getParameter<int>("sipmQTSShift");
+  configParameters_.sipmQNTStoSum = ps.getParameter<int>("sipmQNTStoSum");
+  configParameters_.firstSampleShift = ps.getParameter<int>("firstSampleShift");
+  configParameters_.useEffectivePedestals = ps.getParameter<bool>("useEffectivePedestals");
+
+  configParameters_.meanTime = ps.getParameter<double>("meanTime");
+  configParameters_.timeSigmaSiPM = ps.getParameter<double>("timeSigmaSiPM");
+  configParameters_.timeSigmaHPD = ps.getParameter<double>("timeSigmaHPD");
+  configParameters_.ts4Thresh = ps.getParameter<double>("ts4Thresh");
+
+  configParameters_.applyTimeSlew = ps.getParameter<bool>("applyTimeSlew");
+  auto const tzeroValues = ps.getParameter<std::vector<double>>("tzeroTimeSlewParameters");
+  auto const slopeValues = ps.getParameter<std::vector<double>>("slopeTimeSlewParameters");
+  auto const tmaxValues = ps.getParameter<std::vector<double>>("tmaxTimeSlewParameters");
+
+  configParameters_.tzeroTimeSlew = tzeroValues[HcalTimeSlew::Medium];
+  configParameters_.slopeTimeSlew = slopeValues[HcalTimeSlew::Medium];
+  configParameters_.tmaxTimeSlew = tmaxValues[HcalTimeSlew::Medium];
+
+  auto threadsMinimize = ps.getParameter<std::vector<uint32_t>>("kernelMinimizeThreads");
+  configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0];
+  configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1];
+  configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2];
+
+  outputGPU_.allocate(configParameters_);
+  scratchGPU_.allocate(configParameters_);
+
+  // FIXME: use default device and default stream
+  cudaCheck(
+      cudaMalloc((void**)&configParameters_.pulseOffsetsDevice, sizeof(int) * configParameters_.pulseOffsets.size()));
+  cudaCheck(cudaMemcpy(configParameters_.pulseOffsetsDevice,
+                       configParameters_.pulseOffsets.data(),
+                       configParameters_.pulseOffsets.size() * sizeof(int),
+                       cudaMemcpyHostToDevice));
+}
+
+HBHERecHitProducerGPU::~HBHERecHitProducerGPU() {
+  outputGPU_.deallocate(configParameters_);
+  scratchGPU_.deallocate(configParameters_);
+
+  cudaCheck(cudaFree(configParameters_.pulseOffsetsDevice));
+}
+
+void HBHERecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& cdesc) {
+  edm::ParameterSetDescription desc;
+  desc.add<uint32_t>("maxChannels", 10000u);
+  desc.add<uint32_t>("maxTimeSamples", 10);
+  desc.add<std::vector<int>>("pulseOffsets", {-3, -2, -1, 0, 1, 2, 3, 4});
+  desc.add<uint32_t>("kprep1dChannelsPerBlock", 32);
+  desc.add<edm::InputTag>("digisLabelF01HE", edm::InputTag{"hcalRawToDigiGPU", "f01HEDigisGPU"});
+  desc.add<edm::InputTag>("digisLabelF5HB", edm::InputTag{"hcalRawToDigiGPU", "f5HBDigisGPU"});
+  desc.add<edm::InputTag>("digisLabelF3HB", edm::InputTag{"hcalRawToDigiGPU", "f3HBDigisGPU"});
+  desc.add<std::string>("recHitsLabelM0HBHE", "recHitsM0HBHE");
+  desc.add<int>("sipmQTSShift", 0);
+  desc.add<int>("sipmQNTStoSum", 3);
+  desc.add<int>("firstSampleShift", 0);
+  desc.add<bool>("useEffectivePedestals", true);
+
+  desc.add<double>("meanTime", 0.f);
+  desc.add<double>("timeSigmaSiPM", 2.5f);
+  desc.add<double>("timeSigmaHPD", 5.0f);
+  desc.add<double>("ts4Thresh", 0.0);
+
+  desc.add<bool>("applyTimeSlew", true);
+  desc.add<std::vector<double>>("tzeroTimeSlewParameters", {23.960177, 11.977461, 9.109694});
+  desc.add<std::vector<double>>("slopeTimeSlewParameters", {-3.178648, -1.5610227, -1.075824});
+  desc.add<std::vector<double>>("tmaxTimeSlewParameters", {16.00, 10.00, 6.25});
+  desc.add<std::vector<uint32_t>>("kernelMinimizeThreads", {16, 1, 1});
+
+  std::string label = "hbheRecHitProducerGPU";
+  cdesc.add(label, desc);
+}
+
+void HBHERecHitProducerGPU::acquire(edm::Event const& event,
+                                    edm::EventSetup const& setup,
+                                    edm::WaitingTaskWithArenaHolder holder) {
+#ifdef HCAL_MAHI_CPUDEBUG
+  auto start = std::chrono::high_resolution_clock::now();
+#endif
+
+  // input + raii
+  auto const& f01HEProduct = event.get(digisTokenF01HE_);
+  auto const& f5HBProduct = event.get(digisTokenF5HB_);
+  auto const& f3HBProduct = event.get(digisTokenF3HB_);
+  cms::cuda::ScopedContextAcquire ctx{f01HEProduct, std::move(holder), cudaState_};
+  auto const& f01HEDigis = ctx.get(f01HEProduct);
+  auto const& f5HBDigis = ctx.get(f5HBProduct);
+  auto const& f3HBDigis = ctx.get(f3HBProduct);
+
+  hcal::mahi::InputDataGPU inputGPU{f01HEDigis, f5HBDigis, f3HBDigis};
+
+  // conditions
+  edm::ESHandle<HcalRecoParamsWithPulseShapesGPU> recoParamsHandle;
+  setup.get<HcalRecoParamsRcd>().get(recoParamsHandle);
+  auto const& recoParamsProduct = recoParamsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalGainWidthsGPU> gainWidthsHandle;
+  setup.get<HcalGainWidthsRcd>().get(gainWidthsHandle);
+  auto const& gainWidthsProduct = gainWidthsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalGainsGPU> gainsHandle;
+  setup.get<HcalGainsRcd>().get(gainsHandle);
+  auto const& gainsProduct = gainsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalLUTCorrsGPU> lutCorrsHandle;
+  setup.get<HcalLUTCorrsRcd>().get(lutCorrsHandle);
+  auto const& lutCorrsProduct = lutCorrsHandle->getProduct(ctx.stream());
+
+  // use only 1 depending on useEffectivePedestals
+  edm::ESHandle<HcalConvertedPedestalWidthsGPU> pedestalWidthsHandle;
+  edm::ESHandle<HcalConvertedEffectivePedestalWidthsGPU> effectivePedestalWidthsHandle;
+  setup.get<HcalConvertedEffectivePedestalWidthsRcd>().get(effectivePedestalWidthsHandle);
+  setup.get<HcalConvertedPedestalWidthsRcd>().get(pedestalWidthsHandle);
+  auto const& pedestalWidthsProduct = pedestalWidthsHandle->getProduct(ctx.stream());
+  auto const& effectivePedestalWidthsProduct = effectivePedestalWidthsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalConvertedPedestalsGPU> pedestalsHandle;
+  setup.get<HcalConvertedPedestalsRcd>().get(pedestalsHandle);
+  auto const& pedestalsProduct = pedestalsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalConvertedEffectivePedestalsGPU> effectivePedestalsHandle;
+  if (configParameters_.useEffectivePedestals)
+    setup.get<HcalConvertedEffectivePedestalsRcd>().get(effectivePedestalsHandle);
+  auto const* effectivePedestalsProduct =
+      configParameters_.useEffectivePedestals ? &effectivePedestalsHandle->getProduct(ctx.stream()) : nullptr;
+
+  edm::ESHandle<HcalQIECodersGPU> qieCodersHandle;
+  setup.get<HcalQIEDataRcd>().get(qieCodersHandle);
+  auto const& qieCodersProduct = qieCodersHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalRespCorrsGPU> respCorrsHandle;
+  setup.get<HcalRespCorrsRcd>().get(respCorrsHandle);
+  auto const& respCorrsProduct = respCorrsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalTimeCorrsGPU> timeCorrsHandle;
+  setup.get<HcalTimeCorrsRcd>().get(timeCorrsHandle);
+  auto const& timeCorrsProduct = timeCorrsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalQIETypesGPU> qieTypesHandle;
+  setup.get<HcalQIETypesRcd>().get(qieTypesHandle);
+  auto const& qieTypesProduct = qieTypesHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalTopology> topologyHandle;
+  setup.get<HcalRecNumberingRecord>().get(topologyHandle);
+  edm::ESHandle<HcalDDDRecConstants> recConstantsHandle;
+  setup.get<HcalRecNumberingRecord>().get(recConstantsHandle);
+
+  edm::ESHandle<HcalSiPMParametersGPU> sipmParametersHandle;
+  setup.get<HcalSiPMParametersRcd>().get(sipmParametersHandle);
+  auto const& sipmParametersProduct = sipmParametersHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalSiPMCharacteristicsGPU> sipmCharacteristicsHandle;
+  setup.get<HcalSiPMCharacteristicsRcd>().get(sipmCharacteristicsHandle);
+  auto const& sipmCharacteristicsProduct = sipmCharacteristicsHandle->getProduct(ctx.stream());
+
+  // bundle up conditions
+  hcal::mahi::ConditionsProducts conditions{gainWidthsProduct,
+                                            gainsProduct,
+                                            lutCorrsProduct,
+                                            pedestalWidthsProduct,
+                                            effectivePedestalWidthsProduct,
+                                            pedestalsProduct,
+                                            qieCodersProduct,
+                                            recoParamsProduct,
+                                            respCorrsProduct,
+                                            timeCorrsProduct,
+                                            qieTypesProduct,
+                                            sipmParametersProduct,
+                                            sipmCharacteristicsProduct,
+                                            effectivePedestalsProduct,
+                                            topologyHandle.product(),
+                                            recConstantsHandle.product(),
+                                            pedestalsHandle->offsetForHashes()};
+
+  hcal::mahi::entryPoint(inputGPU, outputGPU_, conditions, scratchGPU_, configParameters_, ctx.stream());
+
+#ifdef HCAL_MAHI_CPUDEBUG
+  auto end = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+  std::cout << "acquire  duration = " << duration << std::endl;
+#endif
+}
+
+void HBHERecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};
+  ctx.emplace(event, rechitsM0Token_, std::move(outputGPU_.recHits));
+}
+
+DEFINE_FWK_MODULE(HBHERecHitProducerGPU);
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
new file mode 100644
index 0000000000000..23eea5c5aa30b
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
@@ -0,0 +1,109 @@
+#include <iostream>
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h"
+
+class HcalCPURecHitsProducer : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HcalCPURecHitsProducer(edm::ParameterSet const& ps);
+  ~HcalCPURecHitsProducer() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+  using IProductType = cms::cuda::Product<hcal::RecHitCollection<hcal::common::ViewStoragePolicy>>;
+  edm::EDGetTokenT<IProductType> recHitsM0TokenIn_;
+  using OProductType = hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OProductType> recHitsM0TokenOut_;
+  edm::EDPutTokenT<HBHERecHitCollection> recHitsLegacyTokenOut_;
+
+  // to pass from acquire to produce
+  OProductType tmpRecHits_;
+};
+
+void HcalCPURecHitsProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+
+  desc.add<edm::InputTag>("recHitsM0LabelIn", edm::InputTag{"hbheRecHitProducerGPU", "recHitsM0HBHE"});
+  desc.add<std::string>("recHitsM0LabelOut", "recHitsM0HBHE");
+  desc.add<std::string>("recHitsLegacyLabelOut", "recHitsLegacyHBHE");
+
+  std::string label = "hcalCPURecHitsProducer";
+  confDesc.add(label, desc);
+}
+
+HcalCPURecHitsProducer::HcalCPURecHitsProducer(const edm::ParameterSet& ps)
+    : recHitsM0TokenIn_{consumes<IProductType>(ps.getParameter<edm::InputTag>("recHitsM0LabelIn"))},
+      recHitsM0TokenOut_{produces<OProductType>("recHitsM0LabelOut")},
+      recHitsLegacyTokenOut_{produces<HBHERecHitCollection>("recHitsLegacyLabelOut")} {}
+
+HcalCPURecHitsProducer::~HcalCPURecHitsProducer() {}
+
+void HcalCPURecHitsProducer::acquire(edm::Event const& event,
+                                     edm::EventSetup const& setup,
+                                     edm::WaitingTaskWithArenaHolder taskHolder) {
+  // retrieve data/ctx
+  auto const& recHitsProduct = event.get(recHitsM0TokenIn_);
+  cms::cuda::ScopedContextAcquire ctx{recHitsProduct, std::move(taskHolder)};
+  auto const& recHits = ctx.get(recHitsProduct);
+
+  // resize tmp buffers
+  tmpRecHits_.resize(recHits.size);
+
+#ifdef HCAL_MAHI_CPUDEBUG
+  std::cout << "num rec Hits = " << recHits.size << std::endl;
+#endif
+
+  auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
+    using vector_type = typename std::remove_reference<decltype(dest)>::type;
+    using type = typename vector_type::value_type;
+    cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream()));
+  };
+
+  lambdaToTransfer(tmpRecHits_.energy, recHits.energy);
+  lambdaToTransfer(tmpRecHits_.chi2, recHits.chi2);
+  lambdaToTransfer(tmpRecHits_.energyM0, recHits.energyM0);
+  lambdaToTransfer(tmpRecHits_.timeM0, recHits.timeM0);
+  lambdaToTransfer(tmpRecHits_.did, recHits.did);
+}
+
+void HcalCPURecHitsProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // populate the legacy collection
+  auto recHitsLegacy = std::make_unique<HBHERecHitCollection>();
+  // did not set size with ctor as there is no setter for did
+  recHitsLegacy->reserve(tmpRecHits_.did.size());
+  for (uint32_t i = 0; i < tmpRecHits_.did.size(); i++) {
+    recHitsLegacy->emplace_back(HcalDetId{tmpRecHits_.did[i]},
+                                tmpRecHits_.energy[i],
+                                0  // timeRising
+    );
+
+    // update newly pushed guy
+    (*recHitsLegacy)[i].setChiSquared(tmpRecHits_.chi2[i]);
+    (*recHitsLegacy)[i].setRawEnergy(tmpRecHits_.energyM0[i]);
+  }
+
+  // put a legacy format
+  event.put(recHitsLegacyTokenOut_, std::move(recHitsLegacy));
+
+  // put a new format
+  event.put(recHitsM0TokenOut_, std::make_unique<OProductType>(std::move(tmpRecHits_)));
+}
+
+DEFINE_FWK_MODULE(HcalCPURecHitsProducer);
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
new file mode 100644
index 0000000000000..d4767231c60dd
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
@@ -0,0 +1,132 @@
+#include "HcalESProducerGPU.h"
+
+#include "CondFormats/HcalObjects/interface/HcalRecoParams.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestals.h"
+#include "CondFormats/HcalObjects/interface/HcalGains.h"
+#include "CondFormats/HcalObjects/interface/HcalLUTCorrs.h"
+#include "CondFormats/HcalObjects/interface/HcalRespCorrs.h"
+#include "CondFormats/HcalObjects/interface/HcalTimeCorrs.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestalWidths.h"
+#include "CondFormats/HcalObjects/interface/HcalGainWidths.h"
+#include "CondFormats/HcalObjects/interface/HcalQIEData.h"
+#include "CondFormats/HcalObjects/interface/HcalQIETypes.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMParameters.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristics.h"
+
+#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalPedestalsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalPedestalWidthsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
+
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalPedestalsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalLUTCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRespCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalPedestalWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIECodersGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIETypesGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMCharacteristicsGPU.h"
+
+#include <iostream>
+
+using HcalRecoParamsGPUESProducer = HcalESProducerGPU<HcalRecoParamsRcd, HcalRecoParamsGPU, HcalRecoParams>;
+
+using HcalRecoParamsWithPulseShapesGPUESProducer =
+    HcalESProducerGPU<HcalRecoParamsRcd, HcalRecoParamsWithPulseShapesGPU, HcalRecoParams>;
+
+using HcalPedestalsGPUESProducer = HcalESProducerGPU<HcalPedestalsRcd, HcalPedestalsGPU, HcalPedestals>;
+
+using HcalGainsGPUESProducer = HcalESProducerGPU<HcalGainsRcd, HcalGainsGPU, HcalGains>;
+
+using HcalLUTCorrsGPUESProducer = HcalESProducerGPU<HcalLUTCorrsRcd, HcalLUTCorrsGPU, HcalLUTCorrs>;
+
+using HcalRespCorrsGPUESProducer = HcalESProducerGPU<HcalRespCorrsRcd, HcalRespCorrsGPU, HcalRespCorrs>;
+
+using HcalTimeCorrsGPUESProducer = HcalESProducerGPU<HcalTimeCorrsRcd, HcalTimeCorrsGPU, HcalTimeCorrs>;
+
+using HcalPedestalWidthsGPUESProducer =
+    HcalESProducerGPU<HcalPedestalWidthsRcd, HcalPedestalWidthsGPU, HcalPedestalWidths>;
+
+using HcalGainWidthsGPUESProducer = HcalESProducerGPU<HcalGainWidthsRcd, HcalGainWidthsGPU, HcalGainWidths>;
+
+using HcalQIECodersGPUESProducer = HcalESProducerGPU<HcalQIEDataRcd, HcalQIECodersGPU, HcalQIEData>;
+
+using HcalQIETypesGPUESProducer = HcalESProducerGPU<HcalQIETypesRcd, HcalQIETypesGPU, HcalQIETypes>;
+
+using HcalSiPMParametersGPUESProducer =
+    HcalESProducerGPU<HcalSiPMParametersRcd, HcalSiPMParametersGPU, HcalSiPMParameters>;
+
+using HcalSiPMCharacteristicsGPUESProducer =
+    HcalESProducerGPU<HcalSiPMCharacteristicsRcd, HcalSiPMCharacteristicsGPU, HcalSiPMCharacteristics>;
+
+DEFINE_FWK_EVENTSETUP_MODULE(HcalRecoParamsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalRecoParamsWithPulseShapesGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalPedestalsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalGainsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalLUTCorrsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalRespCorrsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalTimeCorrsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalPedestalWidthsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalGainWidthsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalQIECodersGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalQIETypesGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMParametersGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMCharacteristicsGPUESProducer);
+
+#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
+
+#include "RecoLocalCalo/HcalRecProducers/src/HcalCombinedRecordsGPU.h"
+
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
+
+EVENTSETUP_RECORD_REG(HcalConvertedPedestalsRcd);
+EVENTSETUP_RECORD_REG(HcalConvertedPedestalWidthsRcd);
+
+using HcalConvertedPedestalsGPUESProducer = HcalESProducerGPUWithDependencies<HcalConvertedPedestalsRcd,
+                                                                              HcalConvertedPedestalsGPU,
+                                                                              HcalPedestals,
+                                                                              HcalQIEData,
+                                                                              HcalQIETypes>;
+
+using HcalConvertedEffectivePedestalsGPUESProducer =
+    HcalESProducerGPUWithDependencies<HcalConvertedEffectivePedestalsRcd,
+                                      HcalConvertedEffectivePedestalsGPU,
+                                      HcalPedestals,
+                                      HcalQIEData,
+                                      HcalQIETypes>;
+
+using HcalConvertedPedestalWidthsGPUESProducer = HcalESProducerGPUWithDependencies<HcalConvertedPedestalWidthsRcd,
+                                                                                   HcalConvertedPedestalWidthsGPU,
+                                                                                   HcalPedestals,
+                                                                                   HcalPedestalWidths,
+                                                                                   HcalQIEData,
+                                                                                   HcalQIETypes>;
+
+using HcalConvertedEffectivePedestalWidthsGPUESProducer =
+    HcalESProducerGPUWithDependencies<HcalConvertedEffectivePedestalWidthsRcd,
+                                      HcalConvertedEffectivePedestalWidthsGPU,
+                                      HcalPedestals,
+                                      HcalPedestalWidths,
+                                      HcalQIEData,
+                                      HcalQIETypes>;
+
+DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedPedestalsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedEffectivePedestalsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedPedestalWidthsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedEffectivePedestalWidthsGPUESProducer);

From ed51435cd93d95e65bf69600948331b6571e6254 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 5 Jun 2020 08:54:56 +0200
Subject: [PATCH 08/34] Update HCAL local reconstruction on GPUs
 (cms-patatrack#470)

Move common ROOT dictionaries to a dedicated new package,
CUDADataFormats/StdDictionaries.

Remove unnecessary dictionary declarations.

Determine the default module label automatically for templated and
non-templated EDProducers and ESProducers, and remove the "name()"
static method previously used to distinguish their template arguments.

Use Event::emplace instead of Event::put where relevant.

Protect the use of CUDA API calls in module constructors and
destructors, checking that the CUDAService is available and enabled.

Move the definition of EventSetup records to a package/library that does
not define plugins.
---
 CUDADataFormats/HcalDigi/src/classes.h        | 37 +------
 CUDADataFormats/HcalDigi/src/classes_def.xml  | 20 ----
 CUDADataFormats/HcalRecHitSoA/src/classes.h   | 13 +--
 .../HcalRawToDigi/plugins/BuildFile.xml       |  1 +
 .../plugins/ElectronicsMappingGPU.h           |  5 +-
 .../plugins/HcalCPUDigisProducer.cc           | 33 +++----
 .../plugins/HcalDigisProducerGPU.cc           | 68 ++++++-------
 .../plugins/HcalESProducerGPUDefs.cc          |  6 +-
 .../HcalRawToDigi/plugins/HcalRawToDigiGPU.cc | 58 ++++++-----
 .../HcalRecoParamsWithPulseShapesGPU.h        |  3 -
 .../src/HBHERecHitProducerGPU.cc              | 98 +++++++++----------
 .../src/HcalCPURecHitsProducer.cc             | 23 ++---
 .../src/HcalESProducersGPUDefs.cc             |  8 +-
 13 files changed, 138 insertions(+), 235 deletions(-)

diff --git a/CUDADataFormats/HcalDigi/src/classes.h b/CUDADataFormats/HcalDigi/src/classes.h
index f00f8cf7dbdf4..8c4a20318928e 100644
--- a/CUDADataFormats/HcalDigi/src/classes.h
+++ b/CUDADataFormats/HcalDigi/src/classes.h
@@ -1,38 +1,3 @@
-#include "DataFormats/Common/interface/Wrapper.h"
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
-
-namespace hcal {
-
-  // instantiate what we know will be used
-  template struct DigiCollection<Flavor01, common::ViewStoragePolicy>;
-
-  template struct DigiCollection<Flavor2, common::ViewStoragePolicy>;
-
-  template struct DigiCollection<Flavor3, common::ViewStoragePolicy>;
-
-  template struct DigiCollection<Flavor4, common::ViewStoragePolicy>;
-
-  template struct DigiCollection<Flavor5, common::ViewStoragePolicy>;
-
-  template struct DigiCollection<Flavor01, common::VecStoragePolicy<std::allocator>>;
-
-  template struct DigiCollection<Flavor2, common::VecStoragePolicy<std::allocator>>;
-
-  template struct DigiCollection<Flavor3, common::VecStoragePolicy<std::allocator>>;
-
-  template struct DigiCollection<Flavor4, common::VecStoragePolicy<std::allocator>>;
-
-  template struct DigiCollection<Flavor5, common::VecStoragePolicy<std::allocator>>;
-
-  template struct DigiCollection<Flavor01, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
-
-  template struct DigiCollection<Flavor2, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
-
-  template struct DigiCollection<Flavor3, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
-
-  template struct DigiCollection<Flavor4, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
-
-  template struct DigiCollection<Flavor5, common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
-
-}  // namespace hcal
+#include "DataFormats/Common/interface/Wrapper.h"
diff --git a/CUDADataFormats/HcalDigi/src/classes_def.xml b/CUDADataFormats/HcalDigi/src/classes_def.xml
index 18c1e5a09fd10..33e9b28a49b49 100644
--- a/CUDADataFormats/HcalDigi/src/classes_def.xml
+++ b/CUDADataFormats/HcalDigi/src/classes_def.xml
@@ -1,19 +1,7 @@
 <lcgdict>
-    <class name="std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t, 0>>" />
-    <class name="std::vector<uint16_t, cms::cuda::HostAllocator<uint16_t, 0>>" />
-    <class name="std::vector<uint8_t, cms::cuda::HostAllocator<uint8_t, 0>>" />
-            
     <class name="hcal::DigiCollectionBase<hcal::common::VecStoragePolicy<std::allocator>>" />
     <class name="hcal::DigiCollectionBase<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
 
-    <!--
-    <class name="hcal::DigiCollectionBase<hcal::Flavor01, hcal::common::ViewStoragePolicy>" />
-    <class name="hcal::DigiCollectionBase<hcal::Flavor2, hcal::common::ViewStoragePolicy>" />
-    <class name="hcal::DigiCollectionBase<hcal::Flavor3, hcal::common::ViewStoragePolicy>" />
-    <class name="hcal::DigiCollectionBase<hcal::Flavor4, hcal::common::ViewStoragePolicy>" />
-    <class name="hcal::DigiCollectionBase<hcal::Flavor5, hcal::common::ViewStoragePolicy>" />
-    -->
-        
     <class name="hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<std::allocator>>" />
     <class name="hcal::DigiCollection<hcal::Flavor2, hcal::common::VecStoragePolicy<std::allocator>>" />
     <class name="hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<std::allocator>>" />
@@ -26,14 +14,6 @@
     <class name="hcal::DigiCollection<hcal::Flavor4, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
     <class name="hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
 
-    <!--
-    <class name="hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>" />
-    <class name="hcal::DigiCollection<hcal::Flavor2, hcal::common::ViewStoragePolicy>" />
-    <class name="hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>" />
-    <class name="hcal::DigiCollection<hcal::Flavor4, hcal::common::ViewStoragePolicy>" />
-    <class name="hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>" />
-    -->
-
     <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>" persistent="false" />
     <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, hcal::common::ViewStoragePolicy>>" persistent="false" />
     <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>" persistent="false" />
diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes.h b/CUDADataFormats/HcalRecHitSoA/src/classes.h
index 91035e8384117..a13782165c413 100644
--- a/CUDADataFormats/HcalRecHitSoA/src/classes.h
+++ b/CUDADataFormats/HcalRecHitSoA/src/classes.h
@@ -1,14 +1,3 @@
-#include "DataFormats/Common/interface/Wrapper.h"
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
-
-namespace hcal {
-
-  // explicit template instantiations
-  template struct RecHitCollection<common::ViewStoragePolicy>;
-
-  template struct RecHitCollection<common::VecStoragePolicy<std::allocator>>;
-
-  template struct RecHitCollection<common::VecStoragePolicy<CUDAHostAllocatorAlias>>;
-
-}  // namespace hcal
+#include "DataFormats/Common/interface/Wrapper.h"
diff --git a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
index 47f7625254fc0..025ea32125c82 100644
--- a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
@@ -22,5 +22,6 @@
   <use name="CUDADataFormats/HcalCommon"/>
   <use name="CUDADataFormats/HcalDigi" />
   <use name="HeterogeneousCore/CUDACore"/>
+  <use name="HeterogeneousCore/CUDAServices"/>
   <use name="HeterogeneousCore/CUDAUtilities"/>
 </library>
diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
index cb7090d480faa..92ebb4e197072 100644
--- a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
+++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
@@ -4,8 +4,8 @@
 #include "CondFormats/HcalObjects/interface/HcalElectronicsMap.h"
 
 #ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #endif
 
 namespace hcal {
@@ -32,9 +32,6 @@ namespace hcal {
       // get device pointers
       Product const &getProduct(cudaStream_t) const;
 
-      //
-      static std::string name() { return std::string{"hcalElectronicsMappingGPU"}; }
-
     private:
       // in the future, we need to arrange so to avoid this copy on the host
       // if possible
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
index 013aca56298bb..8aad10228021c 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
@@ -1,22 +1,16 @@
 #include <iostream>
 
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
-
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
-
-#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
-#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
-#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 class HcalCPUDigisProducer : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -62,8 +56,7 @@ void HcalCPUDigisProducer::fillDescriptions(edm::ConfigurationDescriptions& conf
   desc.add<std::string>("digisLabelF5HBOut", "f5HBDigis");
   desc.add<std::string>("digisLabelF3HBOut", "f3HBDigis");
 
-  std::string label = "hcalCPUDigisProducer";
-  confDesc.add(label, desc);
+  confDesc.addWithDefaultLabel(desc);
 }
 
 HcalCPUDigisProducer::HcalCPUDigisProducer(const edm::ParameterSet& ps)
@@ -153,13 +146,9 @@ void HcalCPUDigisProducer::acquire(edm::Event const& event,
 }
 
 void HcalCPUDigisProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
-  auto outf01 = std::make_unique<OProductTypef01>(std::move(digisf01HE_));
-  auto outf5 = std::make_unique<OProductTypef5>(std::move(digisf5HB_));
-  auto outf3 = std::make_unique<OProductTypef3>(std::move(digisf3HB_));
-
-  event.put(digisF01HETokenOut_, std::move(outf01));
-  event.put(digisF5HBTokenOut_, std::move(outf5));
-  event.put(digisF3HBTokenOut_, std::move(outf3));
+  event.emplace(digisF01HETokenOut_, std::move(digisf01HE_));
+  event.emplace(digisF5HBTokenOut_, std::move(digisf5HB_));
+  event.emplace(digisF3HBTokenOut_, std::move(digisf3HB_));
 
   // output collections
   /*
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
index 0364e3718821f..ce18d78af7e84 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -1,18 +1,16 @@
 #include <iostream>
 
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
-
-#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
-
-#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 class HcalDigisProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -93,8 +91,7 @@ void HcalDigisProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& conf
   desc.add<uint32_t>("nsamplesF5HB", 8);
   desc.add<uint32_t>("nsamplesF3HB", 8);
 
-  std::string label = "hcalDigisProducerGPU";
-  confDesc.add(label, desc);
+  confDesc.addWithDefaultLabel(desc);
 }
 
 HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
@@ -110,22 +107,20 @@ HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
   config_.nsamplesF5HB = ps.getParameter<uint32_t>("nsamplesF5HB");
   config_.nsamplesF3HB = ps.getParameter<uint32_t>("nsamplesF3HB");
 
-  // allocate on the device
-  cudaCheck(cudaMalloc(
-      (void**)&df01_.data,
-      config_.maxChannelsF01HE * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE)));
-  cudaCheck(cudaMalloc((void**)&df01_.ids, config_.maxChannelsF01HE * sizeof(uint32_t)));
+  // call CUDA API functions only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    // allocate on the device
+    cudaCheck(cudaMalloc((void**)&df01_.data, config_.maxChannelsF01HE * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE)));
+    cudaCheck(cudaMalloc((void**)&df01_.ids, config_.maxChannelsF01HE * sizeof(uint32_t)));
 
-  cudaCheck(cudaMalloc(
-      (void**)&df5_.data,
-      config_.maxChannelsF5HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB)));
-  cudaCheck(cudaMalloc((void**)&df5_.ids, config_.maxChannelsF5HB * sizeof(uint32_t)));
-  cudaCheck(cudaMalloc((void**)&df5_.npresamples, sizeof(uint8_t) * config_.maxChannelsF5HB));
+    cudaCheck(cudaMalloc((void**)&df5_.data, config_.maxChannelsF5HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB)));
+    cudaCheck(cudaMalloc((void**)&df5_.ids, config_.maxChannelsF5HB * sizeof(uint32_t)));
+    cudaCheck(cudaMalloc((void**)&df5_.npresamples, sizeof(uint8_t) * config_.maxChannelsF5HB));
 
-  cudaCheck(cudaMalloc(
-      (void**)&df3_.data,
-      config_.maxChannelsF3HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB)));
-  cudaCheck(cudaMalloc((void**)&df3_.ids, config_.maxChannelsF3HB * sizeof(uint32_t)));
+    cudaCheck(cudaMalloc((void**)&df3_.data, config_.maxChannelsF3HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB)));
+    cudaCheck(cudaMalloc((void**)&df3_.ids, config_.maxChannelsF3HB * sizeof(uint32_t)));
+  }
 
   // preallocate on the host
   hf01_.stride = hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
@@ -137,13 +132,20 @@ HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
 }
 
 HcalDigisProducerGPU::~HcalDigisProducerGPU() {
-  // deallocate on the device
-  cudaCheck(cudaFree(df01_.data));
-  cudaCheck(cudaFree(df01_.ids));
-
-  cudaCheck(cudaFree(df5_.data));
-  cudaCheck(cudaFree(df5_.ids));
-  cudaCheck(cudaFree(df5_.npresamples));
+  // call CUDA API functions only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    // deallocate on the device
+    cudaCheck(cudaFree(df01_.data));
+    cudaCheck(cudaFree(df01_.ids));
+
+    cudaCheck(cudaFree(df5_.data));
+    cudaCheck(cudaFree(df5_.ids));
+    cudaCheck(cudaFree(df5_.npresamples));
+
+    cudaCheck(cudaFree(df3_.data));
+    cudaCheck(cudaFree(df3_.ids));
+  }
 }
 
 void HcalDigisProducerGPU::acquire(edm::Event const& event,
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
index 04cb786826015..910d6a7a7b26f 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
@@ -1,10 +1,8 @@
-#include "HcalRawESProducerGPU.h"
+#include <iostream>
 
 #include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
-
 #include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
-
-#include <iostream>
+#include "HcalRawESProducerGPU.h"
 
 using HcalElectronicsMappingGPUESProducer =
     HcalRawESProducerGPU<hcal::raw::ElectronicsMappingGPU, HcalElectronicsMap, HcalElectronicsMapRcd>;
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
index d9af852a2889b..5c3b92b42cd67 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
@@ -1,31 +1,20 @@
 #include <iostream>
 
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-// algorithm specific
-
+#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
+#include "DataFormats/FEDRawData/interface/FEDNumbering.h"
 #include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
-//#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
-//#include "CUDADataFormats/HcalDigi/interface/DigisCollection.h"
-
-//#include "CondFormats/DataRecord/interface/HcalMappingElectronicsRcd.h"
-//#include "EventFilter/HcalRawToDigi/interface/ElectronicsMappingGPU.h"
-
 #include "EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h"
 #include "EventFilter/HcalRawToDigi/plugins/DecodeGPU.h"
 #include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
-#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
-#include "DataFormats/FEDRawData/interface/FEDNumbering.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 class HcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -95,17 +84,26 @@ HcalRawToDigiGPU::HcalRawToDigiGPU(const edm::ParameterSet& ps)
   config_.nsamplesF5HB = ps.getParameter<uint32_t>("nsamplesF5HB");
   config_.nsamplesF3HB = ps.getParameter<uint32_t>("nsamplesF3HB");
 
-  inputCPU_.allocate();
-  inputGPU_.allocate();
-  outputGPU_.allocate(config_);
-  scratchGPU_.allocate(config_);
-  outputCPU_.allocate();
+  // reserve memory and call CUDA API functions only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    inputCPU_.allocate();
+    outputCPU_.allocate();
+
+    inputGPU_.allocate();
+    outputGPU_.allocate(config_);
+    scratchGPU_.allocate(config_);
+  }
 }
 
 HcalRawToDigiGPU::~HcalRawToDigiGPU() {
-  inputGPU_.deallocate();
-  outputGPU_.deallocate(config_);
-  scratchGPU_.deallocate(config_);
+  // call CUDA API functions only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    inputGPU_.deallocate();
+    outputGPU_.deallocate(config_);
+    scratchGPU_.deallocate(config_);
+  }
 }
 
 void HcalRawToDigiGPU::acquire(edm::Event const& event,
diff --git a/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h b/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h
index 4c8f9c03d22ef..965fb873bcf88 100644
--- a/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h
+++ b/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h
@@ -34,9 +34,6 @@ class HcalRecoParamsWithPulseShapesGPU {
   // get device pointers
   Product const &getProduct(cudaStream_t) const;
 
-  //
-  static std::string name() { return std::string{"hcalRecoParamsWithPulseShapesGPU"}; }
-
 private:
   uint64_t totalChannels_;  // hb + he
   std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> param1_;
diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
index f47f698c6b59a..206982af69faf 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -1,50 +1,42 @@
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-#include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h"
-#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
-
-#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h"
-#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h"
-#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
-
-//#include "CondFormats/DataRecord/interface/HcalPedestalsRcd.h"
-#include "RecoLocalCalo/HcalRecProducers/src/HcalCombinedRecordsGPU.h"
-
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
+#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/DeclsForKernels.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalCombinedRecordsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainWidthsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalLUTCorrsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalPedestalWidthsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIECodersGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRespCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIETypesGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRespCorrsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMCharacteristicsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
-
-#include "RecoLocalCalo/HcalRecAlgos/interface/DeclsForKernels.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/MahiGPU.h"
 
 class HBHERecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
@@ -109,23 +101,30 @@ HBHERecHitProducerGPU::HBHERecHitProducerGPU(edm::ParameterSet const& ps)
   configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1];
   configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2];
 
-  outputGPU_.allocate(configParameters_);
-  scratchGPU_.allocate(configParameters_);
-
-  // FIXME: use default device and default stream
-  cudaCheck(
-      cudaMalloc((void**)&configParameters_.pulseOffsetsDevice, sizeof(int) * configParameters_.pulseOffsets.size()));
-  cudaCheck(cudaMemcpy(configParameters_.pulseOffsetsDevice,
-                       configParameters_.pulseOffsets.data(),
-                       configParameters_.pulseOffsets.size() * sizeof(int),
-                       cudaMemcpyHostToDevice));
+  // call CUDA API functions only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    outputGPU_.allocate(configParameters_);
+    scratchGPU_.allocate(configParameters_);
+
+    // FIXME: use default device and default stream
+    cudaCheck(cudaMalloc((void**)&configParameters_.pulseOffsetsDevice, sizeof(int) * configParameters_.pulseOffsets.size()));
+    cudaCheck(cudaMemcpy(configParameters_.pulseOffsetsDevice,
+                         configParameters_.pulseOffsets.data(),
+                         configParameters_.pulseOffsets.size() * sizeof(int),
+                         cudaMemcpyHostToDevice));
+  }
 }
 
 HBHERecHitProducerGPU::~HBHERecHitProducerGPU() {
-  outputGPU_.deallocate(configParameters_);
-  scratchGPU_.deallocate(configParameters_);
-
-  cudaCheck(cudaFree(configParameters_.pulseOffsetsDevice));
+  // call CUDA API functions only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    outputGPU_.deallocate(configParameters_);
+    scratchGPU_.deallocate(configParameters_);
+
+    cudaCheck(cudaFree(configParameters_.pulseOffsetsDevice));
+  }
 }
 
 void HBHERecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& cdesc) {
@@ -154,8 +153,7 @@ void HBHERecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& cde
   desc.add<std::vector<double>>("tmaxTimeSlewParameters", {16.00, 10.00, 6.25});
   desc.add<std::vector<uint32_t>>("kernelMinimizeThreads", {16, 1, 1});
 
-  std::string label = "hbheRecHitProducerGPU";
-  cdesc.add(label, desc);
+  cdesc.addWithDefaultLabel(desc);
 }
 
 void HBHERecHitProducerGPU::acquire(edm::Event const& event,
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
index 23eea5c5aa30b..2d90bf9f08540 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
@@ -1,20 +1,16 @@
 #include <iostream>
+#include <string>
 
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
-
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
-#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
-#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 class HcalCPURecHitsProducer : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -44,8 +40,7 @@ void HcalCPURecHitsProducer::fillDescriptions(edm::ConfigurationDescriptions& co
   desc.add<std::string>("recHitsM0LabelOut", "recHitsM0HBHE");
   desc.add<std::string>("recHitsLegacyLabelOut", "recHitsLegacyHBHE");
 
-  std::string label = "hcalCPURecHitsProducer";
-  confDesc.add(label, desc);
+  confDesc.addWithDefaultLabel(desc);
 }
 
 HcalCPURecHitsProducer::HcalCPURecHitsProducer(const edm::ParameterSet& ps)
@@ -103,7 +98,7 @@ void HcalCPURecHitsProducer::produce(edm::Event& event, edm::EventSetup const& s
   event.put(recHitsLegacyTokenOut_, std::move(recHitsLegacy));
 
   // put a new format
-  event.put(recHitsM0TokenOut_, std::make_unique<OProductType>(std::move(tmpRecHits_)));
+  event.emplace(recHitsM0TokenOut_, std::move(tmpRecHits_));
 }
 
 DEFINE_FWK_MODULE(HcalCPURecHitsProducer);
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
index d4767231c60dd..26556dc523e85 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
@@ -86,18 +86,12 @@ DEFINE_FWK_EVENTSETUP_MODULE(HcalQIETypesGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMParametersGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMCharacteristicsGPUESProducer);
 
-#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
-
-#include "RecoLocalCalo/HcalRecProducers/src/HcalCombinedRecordsGPU.h"
-
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalCombinedRecordsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalWidthsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
 
-EVENTSETUP_RECORD_REG(HcalConvertedPedestalsRcd);
-EVENTSETUP_RECORD_REG(HcalConvertedPedestalWidthsRcd);
-
 using HcalConvertedPedestalsGPUESProducer = HcalESProducerGPUWithDependencies<HcalConvertedPedestalsRcd,
                                                                               HcalConvertedPedestalsGPU,
                                                                               HcalPedestals,

From b836fc92423dd660a29180c7348a33b2c40535d4 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 15 Jun 2020 12:41:59 +0200
Subject: [PATCH 09/34] Restructure code to work around CUDA build limitations
 (cms-patatrack#483)

Move ECAL and HCAL CUDA code to plugins.
General cleanup: remove unused code, apply clang-format, and tidy up the include directives (reorder them; include plugin-local headers by relative path).
Fix product labels for HCAL rechits on CPU.

Co-authored-by: Andrea Bocci <andrea.bocci@cern.ch>
---
 .../HcalRawToDigi/plugins/DeclsForKernels.h   |    9 +-
 .../plugins/ElectronicsMappingGPU.cc          |    5 +-
 .../plugins/ElectronicsMappingGPU.h           |    6 +-
 .../plugins/HcalESProducerGPUDefs.cc          |    3 +-
 .../HcalRawToDigi/plugins/HcalRawToDigiGPU.cc |    7 +-
 .../HcalRecProducers/src/DeclsForKernels.h    |  136 ++
 .../src/HBHERecHitProducerGPU.cc              |    7 +-
 .../src/HcalCPURecHitsProducer.cc             |    4 +-
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 1830 +++++++++++++++++
 9 files changed, 1987 insertions(+), 20 deletions(-)
 create mode 100644 RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
 create mode 100644 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu

diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
index a66d7d38248ab..606053edb6801 100644
--- a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
+++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
@@ -1,14 +1,13 @@
 #ifndef EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
 #define EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
 
-#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
-#include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
+#include <vector>
 
 #include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-#include <vector>
+#include "ElectronicsMappingGPU.h"
 
 namespace hcal {
   namespace raw {
diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
index 13dc8a756a415..6b7b89cc6ea77 100644
--- a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
@@ -1,9 +1,8 @@
-#include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
-
+#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h"
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h"
+#include "ElectronicsMappingGPU.h"
 
 namespace hcal {
   namespace raw {
diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
index 92ebb4e197072..0f4c12f02a92d 100644
--- a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
+++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
@@ -1,5 +1,5 @@
-#ifndef EventFilter_HcalRawToDigi_interface_ElectronicsMappingGPU_h
-#define EventFilter_HcalRawToDigi_interface_ElectronicsMappingGPU_h
+#ifndef EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h
+#define EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h
 
 #include "CondFormats/HcalObjects/interface/HcalElectronicsMap.h"
 
@@ -45,4 +45,4 @@ namespace hcal {
   }  // namespace raw
 }  // namespace hcal
 
-#endif  // EventFilter_HcalRawToDigi_interface_ElectronicsMappingGPU_h
+#endif  // EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
index 910d6a7a7b26f..aa601d6db06eb 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
@@ -1,7 +1,8 @@
 #include <iostream>
 
 #include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
-#include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
+
+#include "ElectronicsMappingGPU.h"
 #include "HcalRawESProducerGPU.h"
 
 using HcalElectronicsMappingGPUESProducer =
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
index 5c3b92b42cd67..0e3a1a0d3b1e3 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
@@ -3,9 +3,6 @@
 #include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
 #include "DataFormats/FEDRawData/interface/FEDNumbering.h"
 #include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
-#include "EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h"
-#include "EventFilter/HcalRawToDigi/plugins/DecodeGPU.h"
-#include "EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
@@ -16,6 +13,10 @@
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
+#include "DeclsForKernels.h"
+#include "DecodeGPU.h"
+#include "ElectronicsMappingGPU.h"
+
 class HcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
   explicit HcalRawToDigiGPU(edm::ParameterSet const& ps);
diff --git a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
new file mode 100644
index 0000000000000..42f1992bcf119
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
@@ -0,0 +1,136 @@
+#ifndef RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h
+#define RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h
+
+#include <functional>
+#include <optional>
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+#include "CalibCalorimetry/HcalAlgos/interface/HcalTimeSlew.h"
+#include "Geometry/CaloTopology/interface/HcalTopology.h"
+#include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainWidthsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalLUTCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIECodersGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIETypesGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRespCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMCharacteristicsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
+
+namespace hcal {
+  namespace mahi {
+
+    struct ConditionsProducts {  // references/pointers to the conditions products handed to the mahi code
+      HcalGainWidthsGPU::Product const& gainWidths;
+      HcalGainsGPU::Product const& gains;
+      HcalLUTCorrsGPU::Product const& lutCorrs;
+      HcalConvertedPedestalWidthsGPU::Product const& pedestalWidths;
+      HcalConvertedEffectivePedestalWidthsGPU::Product const& effectivePedestalWidths;
+      HcalConvertedPedestalsGPU::Product const& pedestals;
+      HcalQIECodersGPU::Product const& qieCoders;
+      HcalRecoParamsWithPulseShapesGPU::Product const& recoParams;
+      HcalRespCorrsGPU::Product const& respCorrs;
+      HcalTimeCorrsGPU::Product const& timeCorrs;
+      HcalQIETypesGPU::Product const& qieTypes;
+      HcalSiPMParametersGPU::Product const& sipmParameters;
+      HcalSiPMCharacteristicsGPU::Product const& sipmCharacteristics;
+      HcalConvertedPedestalsGPU::Product const* convertedEffectivePedestals;  // NOTE(review): pointer, unlike the members above - presumably optional; confirm
+      HcalTopology const* topology;
+      HcalDDDRecConstants const* recConstants;
+      uint32_t offsetForHashes;  // NOTE(review): presumably the offset added to HE linear indices in MahiGPU.cu; confirm
+    };
+
+    struct ConfigParameters {  // configuration values for the mahi GPU reconstruction
+      uint32_t maxChannels;  // sizes the per-channel buffers in OutputDataGPU/ScratchDataGPU::allocate
+      uint32_t maxTimeSamples;  // sizes the per-sample buffers in ScratchDataGPU::allocate
+      uint32_t kprep1dChannelsPerBlock;
+      int sipmQTSShift;
+      int sipmQNTStoSum;
+      int firstSampleShift;
+      bool useEffectivePedestals;
+
+      float meanTime;
+      float timeSigmaSiPM, timeSigmaHPD;
+      float ts4Thresh;
+
+      std::vector<int> pulseOffsets;
+      int* pulseOffsetsDevice = nullptr;  // device copy of pulseOffsets, filled in the HBHERecHitProducerGPU constructor
+
+      std::array<uint32_t, 3> kernelMinimizeThreads;
+
+      // FIXME:
+      //   - add "getters" to HcalTimeSlew calib formats
+      //   - add ES Producer to consume what is produced above not to replicate.
+      //   which ones to use is hardcoded, therefore no need to send those to the device
+      bool applyTimeSlew;
+      float tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew;
+    };
+
+    struct OutputDataGPU {  // rec hit output arrays, obtained with cudaMalloc and released with cudaFree
+      RecHitCollection<common::ViewStoragePolicy> recHits;
+
+      void allocate(ConfigParameters const& config) {  // every array gets config.maxChannels elements
+        cudaCheck(cudaMalloc((void**)&recHits.energy, config.maxChannels * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&recHits.chi2, config.maxChannels * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&recHits.energyM0, config.maxChannels * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&recHits.timeM0, config.maxChannels * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&recHits.did, config.maxChannels * sizeof(uint32_t)));
+      }
+
+      void deallocate(ConfigParameters const& config) {  // config is not used here; frees the arrays from allocate()
+        cudaCheck(cudaFree(recHits.energy));
+        cudaCheck(cudaFree(recHits.chi2));
+        cudaCheck(cudaFree(recHits.energyM0));
+        cudaCheck(cudaFree(recHits.timeM0));
+        cudaCheck(cudaFree(recHits.did));
+      }
+    };
+
+    struct ScratchDataGPU {  // intermediate device buffers; every pointer is nullptr until allocate() runs
+      float *amplitudes = nullptr, *noiseTerms = nullptr;
+      float *pulseMatrices = nullptr, *pulseMatricesM = nullptr, *pulseMatricesP = nullptr;
+      int8_t* soiSamples = nullptr;
+
+      // TODO: properly allocate for NSAMPLES VS NPULSES
+      void allocate(ConfigParameters const& config) {
+        cudaCheck(cudaMalloc((void**)&amplitudes, sizeof(float) * config.maxChannels * config.maxTimeSamples));
+        cudaCheck(cudaMalloc((void**)&noiseTerms, sizeof(float) * config.maxChannels * config.maxTimeSamples));
+        cudaCheck(cudaMalloc((void**)&pulseMatrices,
+                             sizeof(float) * config.maxChannels * config.maxTimeSamples * config.maxTimeSamples));  // maxTimeSamples^2 floats per channel
+        cudaCheck(cudaMalloc((void**)&pulseMatricesM,
+                             sizeof(float) * config.maxChannels * config.maxTimeSamples * config.maxTimeSamples));
+        cudaCheck(cudaMalloc((void**)&pulseMatricesP,
+                             sizeof(float) * config.maxChannels * config.maxTimeSamples * config.maxTimeSamples));
+        cudaCheck(cudaMalloc((void**)&soiSamples, sizeof(int8_t) * config.maxChannels));  // one int8_t per channel
+      }
+
+      void deallocate(ConfigParameters const& config) {  // config is not used here
+        if (amplitudes) {  // only amplitudes is tested; the other buffers are freed along with it
+          cudaCheck(cudaFree(amplitudes));
+          cudaCheck(cudaFree(noiseTerms));
+          cudaCheck(cudaFree(pulseMatrices));
+          cudaCheck(cudaFree(pulseMatricesM));
+          cudaCheck(cudaFree(pulseMatricesP));
+          cudaCheck(cudaFree(soiSamples));
+        }
+      }
+    };
+
+    struct InputDataGPU {  // references to the three input digi collections, one per digi flavor
+      DigiCollection<Flavor01, common::ViewStoragePolicy> const& f01HEDigis;
+      DigiCollection<Flavor5, common::ViewStoragePolicy> const& f5HBDigis;
+      DigiCollection<Flavor3, common::ViewStoragePolicy> const& f3HBDigis;
+    };
+
+  }  // namespace mahi
+}  // namespace hcal
+
+#endif  // RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h
diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
index 206982af69faf..c78200badce57 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -19,7 +19,6 @@
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/DeclsForKernels.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalCombinedRecordsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
@@ -37,7 +36,8 @@
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMCharacteristicsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/MahiGPU.h"
+
+#include "MahiGPU.h"
 
 class HBHERecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -108,7 +108,8 @@ HBHERecHitProducerGPU::HBHERecHitProducerGPU(edm::ParameterSet const& ps)
     scratchGPU_.allocate(configParameters_);
 
     // FIXME: use default device and default stream
-    cudaCheck(cudaMalloc((void**)&configParameters_.pulseOffsetsDevice, sizeof(int) * configParameters_.pulseOffsets.size()));
+    cudaCheck(
+        cudaMalloc((void**)&configParameters_.pulseOffsetsDevice, sizeof(int) * configParameters_.pulseOffsets.size()));
     cudaCheck(cudaMemcpy(configParameters_.pulseOffsetsDevice,
                          configParameters_.pulseOffsets.data(),
                          configParameters_.pulseOffsets.size() * sizeof(int),
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
index 2d90bf9f08540..db934710f6108 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
@@ -45,8 +45,8 @@ void HcalCPURecHitsProducer::fillDescriptions(edm::ConfigurationDescriptions& co
 
 HcalCPURecHitsProducer::HcalCPURecHitsProducer(const edm::ParameterSet& ps)
     : recHitsM0TokenIn_{consumes<IProductType>(ps.getParameter<edm::InputTag>("recHitsM0LabelIn"))},
-      recHitsM0TokenOut_{produces<OProductType>("recHitsM0LabelOut")},
-      recHitsLegacyTokenOut_{produces<HBHERecHitCollection>("recHitsLegacyLabelOut")} {}
+      recHitsM0TokenOut_{produces<OProductType>(ps.getParameter<std::string>("recHitsM0LabelOut"))},
+      recHitsLegacyTokenOut_{produces<HBHERecHitCollection>(ps.getParameter<std::string>("recHitsLegacyLabelOut"))} {}
 
 HcalCPURecHitsProducer::~HcalCPURecHitsProducer() {}
 
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
new file mode 100644
index 0000000000000..c1f9a62f4421b
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -0,0 +1,1830 @@
+#include <Eigen/Dense>
+
+#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
+
+// nvcc is not able to parse this header (or whatever is included from it)...
+//#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h"
+
+#include "MahiGPU.h"
+
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#define DETID_TO_DEBUG 1125647428
+#endif
+
+namespace hcal {
+  namespace mahi {
+
+    template <int NROWS, int NCOLS>
+    using ColMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::ColMajor>;  // fixed-size float matrix, column-major storage
+
+    template <int NROWS, int NCOLS>
+    using RowMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::RowMajor>;  // fixed-size float matrix, row-major storage
+
+    template <int SIZE, typename T = float>
+    using ColumnVector = Eigen::Matrix<T, SIZE, 1>;  // SIZE x 1 vector, float elements by default
+
+    template <int SIZE, typename T = float>
+    using RowVector = Eigen::Matrix<T, 1, SIZE>;  // 1 x SIZE vector, float elements by default
+
+    // FIXME remove duplication...
+    // these constants are copied from PulseShapeFunctor.h: nvcc was complaining when that header was included
+    constexpr int maxSamples = 10;
+    constexpr int maxPSshapeBin = 256;
+    constexpr int nsPerBX = 25;
+    constexpr float iniTimeShift = 92.5f;
+
+    // Time slew delay, taken from HcalTimeSlew: tzero + slope * log(fC), limited to 0 from below and tmax from above.
+    // The HcalTimeSlew values come in from an ESProducer that takes them
+    // from a python config. see DeclsForKernels for more explanation
+    __forceinline__ __device__ float compute_time_slew_delay(float const fC,
+                                                             float const tzero,
+                                                             float const slope,
+                                                             float const tmax) {
+      auto const rawDelay = tzero + slope * std::log(fC);
+      return rawDelay < 0 ? 0 : (rawDelay > tmax ? tmax : rawDelay);
+    }
+
+    // HcalQIEShapes are basically hardcoded in HcalQIEData.cc, plus some logic to generate
+    // the 128 and 256 value arrays; this table is the one compute_coder_charge reads when qieType == 0
+    __constant__ float const qie8shape[129] = {
+        -1,   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,   14,   16,
+        18,   20,   22,   24,   26,   28,   31,   34,   37,   40,   44,   48,   52,   57,   62,   57,   62,
+        67,   72,   77,   82,   87,   92,   97,   102,  107,  112,  117,  122,  127,  132,  142,  152,  162,
+        172,  182,  192,  202,  217,  232,  247,  262,  282,  302,  322,  347,  372,  347,  372,  397,  422,
+        447,  472,  497,  522,  547,  572,  597,  622,  647,  672,  697,  722,  772,  822,  872,  922,  972,
+        1022, 1072, 1147, 1222, 1297, 1372, 1472, 1572, 1672, 1797, 1922, 1797, 1922, 2047, 2172, 2297, 2422,
+        2547, 2672, 2797, 2922, 3047, 3172, 3297, 3422, 3547, 3672, 3922, 4172, 4422, 4672, 4922, 5172, 5422,
+        5797, 6172, 6547, 6922, 7422, 7922, 8422, 9047, 9672, 10297};
+
+    __constant__ float const qie11shape[257] = {  // the table compute_coder_charge reads when qieType != 0
+        -0.5,    0.5,     1.5,     2.5,     3.5,     4.5,     5.5,     6.5,     7.5,     8.5,     9.5,     10.5,
+        11.5,    12.5,    13.5,    14.5,    15.5,    17.5,    19.5,    21.5,    23.5,    25.5,    27.5,    29.5,
+        31.5,    33.5,    35.5,    37.5,    39.5,    41.5,    43.5,    45.5,    47.5,    49.5,    51.5,    53.5,
+        55.5,    59.5,    63.5,    67.5,    71.5,    75.5,    79.5,    83.5,    87.5,    91.5,    95.5,    99.5,
+        103.5,   107.5,   111.5,   115.5,   119.5,   123.5,   127.5,   131.5,   135.5,   139.5,   147.5,   155.5,
+        163.5,   171.5,   179.5,   187.5,   171.5,   179.5,   187.5,   195.5,   203.5,   211.5,   219.5,   227.5,
+        235.5,   243.5,   251.5,   259.5,   267.5,   275.5,   283.5,   291.5,   299.5,   315.5,   331.5,   347.5,
+        363.5,   379.5,   395.5,   411.5,   427.5,   443.5,   459.5,   475.5,   491.5,   507.5,   523.5,   539.5,
+        555.5,   571.5,   587.5,   603.5,   619.5,   651.5,   683.5,   715.5,   747.5,   779.5,   811.5,   843.5,
+        875.5,   907.5,   939.5,   971.5,   1003.5,  1035.5,  1067.5,  1099.5,  1131.5,  1163.5,  1195.5,  1227.5,
+        1259.5,  1291.5,  1355.5,  1419.5,  1483.5,  1547.5,  1611.5,  1675.5,  1547.5,  1611.5,  1675.5,  1739.5,
+        1803.5,  1867.5,  1931.5,  1995.5,  2059.5,  2123.5,  2187.5,  2251.5,  2315.5,  2379.5,  2443.5,  2507.5,
+        2571.5,  2699.5,  2827.5,  2955.5,  3083.5,  3211.5,  3339.5,  3467.5,  3595.5,  3723.5,  3851.5,  3979.5,
+        4107.5,  4235.5,  4363.5,  4491.5,  4619.5,  4747.5,  4875.5,  5003.5,  5131.5,  5387.5,  5643.5,  5899.5,
+        6155.5,  6411.5,  6667.5,  6923.5,  7179.5,  7435.5,  7691.5,  7947.5,  8203.5,  8459.5,  8715.5,  8971.5,
+        9227.5,  9483.5,  9739.5,  9995.5,  10251.5, 10507.5, 11019.5, 11531.5, 12043.5, 12555.5, 13067.5, 13579.5,
+        12555.5, 13067.5, 13579.5, 14091.5, 14603.5, 15115.5, 15627.5, 16139.5, 16651.5, 17163.5, 17675.5, 18187.5,
+        18699.5, 19211.5, 19723.5, 20235.5, 20747.5, 21771.5, 22795.5, 23819.5, 24843.5, 25867.5, 26891.5, 27915.5,
+        28939.5, 29963.5, 30987.5, 32011.5, 33035.5, 34059.5, 35083.5, 36107.5, 37131.5, 38155.5, 39179.5, 40203.5,
+        41227.5, 43275.5, 45323.5, 47371.5, 49419.5, 51467.5, 53515.5, 55563.5, 57611.5, 59659.5, 61707.5, 63755.5,
+        65803.5, 67851.5, 69899.5, 71947.5, 73995.5, 76043.5, 78091.5, 80139.5, 82187.5, 84235.5, 88331.5, 92427.5,
+        96523.5, 100620,  104716,  108812,  112908};
+
+    // Conditions are transferred once per IOV
+    // Access is performed based on the det id which is converted to a linear index
+    // 2 funcs below are taken from HcalTopology (reimplemented here).
+    // Inputs are constants that are also taken from HcalTopology
+    // but passed to the kernel as arguments using the HcalTopology itself
+    constexpr int32_t IPHI_MAX = 72;  // iphi multiplier used by did2linearIndexHB below
+
+    __forceinline__ __device__ uint32_t did2linearIndexHB(  // raw det id -> linear index used to look up HB conditions
+        uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) {
+      HcalDetId did{didraw};
+      uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1);  // depth varies fastest, then iphi
+      return did.ieta() > 0 ? value + maxDepthHB * IPHI_MAX * (did.ieta() - firstHBRing)  // ieta > 0
+                            : value + maxDepthHB * IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);  // ieta <= 0
+    }
+
+    __forceinline__ __device__ uint32_t did2linearIndexHE(uint32_t const didraw,  // raw det id -> linear index (HE variant)
+                                                          int const maxDepthHE,
+                                                          int const maxPhiHE,  // plays the role IPHI_MAX has in the HB variant
+                                                          int const firstHERing,
+                                                          int const lastHERing,
+                                                          int const nEtaHE) {
+      HcalDetId did{didraw};
+      uint32_t const value = (did.depth() - 1) + maxDepthHE * (did.iphi() - 1);  // depth varies fastest, then iphi
+      return did.ieta() > 0 ? value + maxDepthHE * maxPhiHE * (did.ieta() - firstHERing)  // ieta > 0
+                            : value + maxDepthHE * maxPhiHE * (did.ieta() + lastHERing + nEtaHE);  // ieta <= 0
+    }
+
+    __forceinline__ __device__ uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range) {
+      return capid * 4 + range;  // 4 entries per capid; callers in this file pass a 2-bit range (0..3)
+    }
+
+    __forceinline__ __device__ float compute_reco_correction_factor(float const par1,
+                                                                    float const par2,
+                                                                    float const par3,
+                                                                    float const x) {
+      return par3 * x * x + par2 * x + par1;  // quadratic in x: par1 is the constant term, par3 the x^2 coefficient
+    }
+
+    // compute the charge for one sample as (bin center - offset) / slope, using the qie shape array for the qie type
+    __forceinline__ __device__ float compute_coder_charge(
+        int const qieType, uint8_t const adc, uint8_t const capid, float const* qieOffsets, float const* qieSlopes) {
+      auto const range = qieType == 0 ? (adc >> 5) & 0x3 : (adc >> 6) & 0x3;  // 2-bit range sits above the 5-bit (type 0) or 6-bit mantissa
+      auto const* qieShapeToUse = qieType == 0 ? qie8shape : qie11shape;
+      auto const nbins = qieType == 0 ? 32 : 64;  // number of adc codes per range
+      auto const center = adc % nbins == nbins - 1 ? 0.5 * (3 * qieShapeToUse[adc] - qieShapeToUse[adc - 1])  // last code of a range: extrapolate from the previous entry
+                                                   : 0.5 * (qieShapeToUse[adc] + qieShapeToUse[adc + 1]);  // otherwise midpoint of this entry and the next
+      auto const index = get_qiecoder_index(capid, range);
+      return (center - qieOffsets[index]) / qieSlopes[index];
+    }
+
+    __forceinline__ __device__ float compute_diff_charge_gain(int const qieType,  // charge step between neighbouring adc codes around adc
+                                                              uint8_t adc,
+                                                              uint8_t const capid,
+                                                              float const* qieOffsets,
+                                                              float const* qieSlopes,
+                                                              bool const isqie11) {
+      constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
+      constexpr uint32_t mantissaMaskQIE11 = 0x3f;
+      auto const mantissaMask = isqie11 ? mantissaMaskQIE11 : mantissaMaskQIE8;  // 6 mantissa bits when isqie11, 5 otherwise
+      auto const q = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);  // charge at adc itself
+      auto const mantissa = adc & mantissaMask;
+
+      if (mantissa == 0u || mantissa == mantissaMask - 1u)  // forward difference: q(adc + 1) - q(adc)
+        return compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes) - q;
+      else if (mantissa == 1u || mantissa == mantissaMask)  // backward difference: q(adc) - q(adc - 1)
+        return q - compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
+      else {
+        auto const qup = compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes);
+        auto const qdown = compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
+        auto const upgain = qup - q;
+        auto const downgain = q - qdown;
+        auto const averagegain = (qup - qdown) / 2.f;
+        if (std::abs(upgain - downgain) < 0.01f * averagegain)  // both sides agree to within 1% of their average
+          return averagegain;
+        else {
+          auto const q2up = compute_coder_charge(qieType, adc + 2u, capid, qieOffsets, qieSlopes);
+          auto const q2down = compute_coder_charge(qieType, adc - 2u, capid, qieOffsets, qieSlopes);
+          auto const upgain2 = q2up - qup;
+          auto const downgain2 = qdown - q2down;
+          if (std::abs(upgain2 - upgain) < std::abs(downgain2 - downgain))  // keep the side whose step changes less one code further out
+            return upgain;
+          else
+            return downgain;
+        }
+      }
+    }
+
+    // Assume: same number of samples for HB and HE
+    // TODO: add/validate restrict (will increase #registers in use by the kernel)
+    __global__ void kernel_prep1d_sameNumberOfSamples(float* amplitudes,
+                                                      float* noiseTerms,
+                                                      float* outputEnergy,
+                                                      float* outputChi2,
+                                                      uint16_t const* dataf01HE,
+                                                      uint16_t const* dataf5HB,
+                                                      uint16_t const* dataf3HB,
+                                                      uint32_t const* idsf01HE,
+                                                      uint32_t const* idsf5HB,
+                                                      uint32_t const* idsf3HB,
+                                                      uint32_t const stridef01HE,
+                                                      uint32_t const stridef5HB,
+                                                      uint32_t const stridef3HB,
+                                                      uint32_t const nchannelsf01HE,
+                                                      uint32_t const nchannelsf5HB,
+                                                      uint8_t const* npresamplesf5HB,
+                                                      int8_t* soiSamples,
+                                                      float* method0Energy,
+                                                      float* method0Time,
+                                                      uint32_t* outputdid,
+                                                      uint32_t const nchannels,
+                                                      uint32_t const* recoParam1Values,
+                                                      uint32_t const* recoParam2Values,
+                                                      float const* qieCoderOffsets,
+                                                      float const* qieCoderSlopes,
+                                                      int const* qieTypes,
+                                                      float const* pedestalWidths,
+                                                      float const* effectivePedestalWidths,
+                                                      float const* pedestals,
+                                                      float const* effectivePedestals,
+                                                      bool const useEffectivePedestals,
+                                                      int const* sipmTypeValues,
+                                                      float const* fcByPEValues,
+                                                      float const* parLin1Values,
+                                                      float const* parLin2Values,
+                                                      float const* parLin3Values,
+                                                      float const* gainValues,
+                                                      float const* respCorrectionValues,
+                                                      int const maxDepthHB,
+                                                      int const maxDepthHE,
+                                                      int const maxPhiHE,
+                                                      int const firstHBRing,
+                                                      int const lastHBRing,
+                                                      int const firstHERing,
+                                                      int const lastHERing,
+                                                      int const nEtaHB,
+                                                      int const nEtaHE,
+                                                      int const sipmQTSShift,
+                                                      int const sipmQNTStoSum,
+                                                      int const firstSampleShift,
+                                                      uint32_t const offsetForHashes,
+                                                      float const ts4Thresh) {
+      // indices + runtime constants
+      auto const sample = threadIdx.x;
+      int32_t const nsamplesExpected = blockDim.x;
+      auto const lch = threadIdx.y;
+      auto const gch = lch + blockDim.y * blockIdx.x;
+      auto const nchannels_per_block = blockDim.y;
+      auto const linearThPerBlock = threadIdx.x + threadIdx.y * blockDim.x;
+
+      constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
+      constexpr uint32_t mantissaMaskQIE11 = 0x3f;
+
+      // remove
+      if (gch >= nchannels)
+        return;
+
+      // initialize all output buffers
+      if (sample == 0) {
+        outputdid[gch] = 0;
+        method0Energy[gch] = 0;
+        method0Time[gch] = 0;
+        outputEnergy[gch] = 0;
+        outputChi2[gch] = 0;
+      }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#ifdef HCAL_MAHI_GPUDEBUG_SINGLECHANNEL
+      if (gch > 0)
+        return;
+#endif
+#endif
+
+      // configure shared mem
+      extern __shared__ char smem[];
+      float* shrEnergyM0PerTS = reinterpret_cast<float*>(smem);
+      float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesExpected * nchannels_per_block;
+      float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesExpected * nchannels_per_block;
+      float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block;
+      unsigned long long int* shrMethod0EnergySamplePair =
+          reinterpret_cast<unsigned long long int*>(shrEnergyM0TotalAccum + nchannels_per_block);
+      if (sample == 0) {
+        shrMethod0EnergyAccum[lch] = 0;
+        shrMethod0EnergySamplePair[lch] = __float_as_uint(std::numeric_limits<float>::min());
+        shrEnergyM0TotalAccum[lch] = 0;
+      }
+
+      // offset output
+      auto* amplitudesForChannel = amplitudes + nsamplesExpected * gch;
+      auto* noiseTermsForChannel = noiseTerms + nsamplesExpected * gch;
+      auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
+
+      // get event input quantities
+      auto const stride = gch < nchannelsf01HE ? stridef01HE : (gch < nchannelsf015 ? stridef5HB : stridef3HB);
+      auto const nsamples = gch < nchannelsf01HE ? compute_nsamples<Flavor01>(stride)
+                                                 : (gch < nchannelsf015 ? compute_nsamples<Flavor5>(stride)
+                                                                        : compute_nsamples<Flavor3>(stride));
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      assert(nsamples == nsamplesExpected);
+#endif
+
+      auto const id = gch < nchannelsf01HE
+                          ? idsf01HE[gch]
+                          : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
+      auto const did = HcalDetId{id};
+      auto const adc =
+          gch < nchannelsf01HE
+              ? adc_for_sample<Flavor01>(dataf01HE + stride * gch, sample)
+              : (gch < nchannelsf015 ? adc_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
+                                     : adc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
+      auto const capid =
+          gch < nchannelsf01HE
+              ? capid_for_sample<Flavor01>(dataf01HE + stride * gch, sample)
+              : (gch < nchannelsf015 ? capid_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
+                                     : capid_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
+      if (id != DETID_TO_DEBUG)
+        return;
+#endif
+#endif
+
+      // compute hash for this did
+      auto const hashedId =
+          did.subdetId() == HcalBarrel
+              ? did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
+
+      // conditions based on the hash
+      // FIXME: remove hardcoded values
+      auto const qieType = qieTypes[hashedId] > 0 ? 1 : 0;  // 2 types at this point
+      auto const* qieOffsets = qieCoderOffsets + hashedId * HcalQIECodersGPU::numValuesPerChannel;
+      auto const* qieSlopes = qieCoderSlopes + hashedId * HcalQIECodersGPU::numValuesPerChannel;
+      auto const* pedestalsForChannel = pedestals + hashedId * 4;
+      auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
+                                                 ? effectivePedestalWidths + hashedId * 4
+                                                 : pedestalWidths + hashedId * 4;
+      auto const* gains = gainValues + hashedId * 4;
+      auto const gain = gains[capid];
+      auto const gain0 = gains[0];
+      auto const respCorrection = respCorrectionValues[hashedId];
+      auto const pedestal = pedestalsForChannel[capid];
+      auto const pedestalWidth = pedestalWidthsForChannel[capid];
+      // if needed, only use effective pedestals for f01
+      auto const pedestalToUseForMethod0 = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
+                                               ? effectivePedestals[hashedId * 4 + capid]
+                                               : pedestal;
+      auto const sipmType = sipmTypeValues[hashedId];
+      auto const fcByPE = fcByPEValues[hashedId];
+      auto const recoParam1 = recoParam1Values[hashedId];
+      auto const recoParam2 = recoParam2Values[hashedId];
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      printf("qieType = %d qieOffset0 = %f qieOffset1 = %f qieSlope0 = %f qieSlope1 = %f\n",
+             qieType,
+             qieOffsets[0],
+             qieOffsets[1],
+             qieSlopes[0],
+             qieSlopes[1]);
+#endif
+
+      // compute charge
+      auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
+
+      shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal;
+      if (gch < nchannelsf01HE) {
+        // NOTE: assume that soi is high only for a single guy!
+        //   which must be the case. cpu version does not check for that
+        //   if that is not the case, we will see that with cuda-memcheck
+        auto const soibit = soibit_for_sample<Flavor01>(dataf01HE + stride * gch, sample);
+        if (soibit == 1)
+          soiSamples[gch] = sample;
+      } else if (gch >= nchannelsf015) {
+        auto const soibit = soibit_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample);
+        if (soibit == 1)
+          soiSamples[gch] = sample;
+      }
+      __syncthreads();
+      int32_t const soi = gch < nchannelsf01HE
+                              ? soiSamples[gch]
+                              : (gch < nchannelsf015 ? npresamplesf5HB[gch - nchannelsf01HE] : soiSamples[gch]);
+      //int32_t const soi = gch >= nchannelsf01HE
+      //    ? npresamplesf5HB[gch - nchannelsf01HE]
+      //    : soiSamples[gch];
+      // this is here just to make things uniform...
+      if (gch >= nchannelsf01HE && gch < nchannelsf015 && sample == 0)
+        soiSamples[gch] = npresamplesf5HB[gch - nchannelsf01HE];
+
+      //
+      // compute various quantities (raw charge and tdc stuff)
+      // NOTE: this branch will be divergent only for a single warp that
+      // sits on the boundary when flavor 01 channels end and flavor 5 start
+      //
+      float rawCharge;
+      float tdcTime;
+      auto const dfc = compute_diff_charge_gain(
+          qieType, adc, capid, qieOffsets, qieSlopes, gch < nchannelsf01HE || gch >= nchannelsf015);
+      if (gch >= nchannelsf01HE && gch < nchannelsf015) {
+        // flavor 5
+        rawCharge = charge;
+        tdcTime = HcalSpecialTimes::UNKNOWN_T_NOTDC;
+      } else {
+        // flavor 0 or 1 or 3
+        // conditions needed for sipms
+        auto const parLin1 = parLin1Values[sipmType - 1];
+        auto const parLin2 = parLin2Values[sipmType - 1];
+        auto const parLin3 = parLin3Values[sipmType - 1];
+
+        int const first = std::max(soi + sipmQTSShift, 0);
+        int const last = std::min(soi + sipmQNTStoSum, nsamplesExpected);
+        float sipmq = 0.0f;
+        for (auto ts = first; ts < last; ts++)
+          sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesExpected + ts];
+        auto const effectivePixelsFired = sipmq / fcByPE;
+        auto const factor = compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
+        rawCharge = (charge - pedestal) * factor + pedestal;
+        if (gch < nchannelsf01HE)
+          tdcTime = HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor01>(dataf01HE + stride * gch, sample));
+        else if (gch >= nchannelsf015)
+          tdcTime =
+              HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
+#endif
+      }
+
+      // compute method 0 quantities
+      // TODO: need to apply containment
+      // TODO: need to apply time slew
+      // TODO: for < run 3, apply HBM legacy energy correction
+      auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
+      auto const startSampleTmp = soi + firstSampleShift;
+      auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
+      auto const endSample = startSample + nsamplesToAdd < nsamples ? startSample + nsamplesToAdd : nsamples;
+      // NOTE: gain is a small number < 10^-3, multiply it last
+      auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
+      auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
+      // store to shared mem
+      shrEnergyM0PerTS[lch * nsamplesExpected + sample] = energym0_per_ts;
+      atomicAdd(&shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0);
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      printf(
+          "id = %u sample = %d gch = %d hashedId = %u adc = %u capid = %u\n"
+          "   charge = %f rawCharge = %f dfc = %f pedestal = %f\n"
+          "   gain = %f respCorrection = %f energym0_per_ts = %f\n",
+          id,
+          sample,
+          gch,
+          hashedId,
+          adc,
+          capid,
+          charge,
+          rawCharge,
+          dfc,
+          pedestalToUseForMethod0,
+          gain,
+          respCorrection,
+          energym0_per_ts);
+      printf(
+          "startSample = %d endSample = %d param1 = %u param2 = %u\n", startSample, endSample, recoParam1, recoParam2);
+#endif
+
+      if (sample >= startSample && sample < endSample) {
+        atomicAdd(&shrMethod0EnergyAccum[lch], energym0_per_ts);
+        // pack sample, energy as 64 bit value
+        unsigned long long int old = shrMethod0EnergySamplePair[lch], assumed;
+        unsigned long long int val =
+            (static_cast<unsigned long long int>(sample) << 32) + __float_as_uint(energym0_per_ts);
+        do {
+          assumed = old;
+          // decode energy, sample values
+          int const current_sample = (assumed >> 32) & 0xffffffff;
+          float const current_energy = __uint_as_float(assumed & 0xffffffff);
+          if (energym0_per_ts > current_energy)
+            old = atomicCAS(&shrMethod0EnergySamplePair[lch], assumed, val);
+          else
+            break;
+        } while (assumed != old);
+      }
+      __syncthreads();
+
+      // NOTE: must take soi, as values for that thread are used...
+      if (sample == soi) {
+        auto const method0_energy = shrMethod0EnergyAccum[lch];
+        auto const val = shrMethod0EnergySamplePair[lch];
+        int const max_sample = (val >> 32) & 0xffffffff;
+        float const max_energy = __uint_as_float(val & 0xffffffff);
+        float const max_energy_1 =
+            max_sample < nsamples - 1 ? shrEnergyM0PerTS[lch * nsamplesExpected + max_sample + 1] : 0.f;
+        float const position = nsamplesToAdd < nsamples ? max_sample - soi : max_sample;
+        auto const sum = max_energy + max_energy_1;
+        // FIXME: for full comparison with cpu method 0  timing,
+        // need to correct by slew
+        // requires an accumulator -> more shared mem -> omit here unless
+        // really needed
+        float const time =
+            max_energy > 0.f && max_energy_1 > 0.f ? 25.f * (position + max_energy_1 / sum) : 25.f * position;
+
+        // store method0 quantities to global mem
+        outputdid[gch] = id;
+        method0Energy[gch] = method0_energy;
+        method0Time[gch] = time;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("tsTOT = %f tstrig = %f ts4Thresh = %f\n", shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0, ts4Thresh);
+#endif
+
+        // check as in cpu version if mahi is not needed
+        // FIXME: KNOWN ISSUE: observed a problem when rawCharge and pedestal
+        // are basically equal and generate -0.00000...
+        // needs to be treated properly
+        if (!(shrEnergyM0TotalAccum[lch] > 0 && energym0_per_ts_gain0 >= ts4Thresh)) {
+          // do not need to run mahi minimization
+          //outputEnergy[gch] = 0; energy already inited to 0
+          outputChi2[gch] = -9999.f;
+        }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("method0_energy = %f max_sample = %d max_energy = %f time = %f\n",
+               method0_energy,
+               max_sample,
+               max_energy,
+               time);
+#endif
+      }
+
+      //
+      // preparations for mahi fit
+      //
+      auto const amplitude = rawCharge - pedestalToUseForMethod0;
+      auto const noiseADC = (1. / std::sqrt(12)) * dfc;
+      auto const noisePhoto = amplitude > pedestalWidth ? std::sqrt(amplitude * fcByPE) : 0.f;
+      auto const noiseTerm = noiseADC * noiseADC + noisePhoto * noisePhoto + pedestalWidth * pedestalWidth;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      printf(
+          "charrge(%d) = %f pedestal(%d) = %f dfc(%d) = %f pedestalWidth(%d) = %f noiseADC(%d) = %f noisPhoto(%d) = "
+          "%f\n",
+          sample,
+          rawCharge,
+          sample,
+          pedestalToUseForMethod0,
+          sample,
+          dfc,
+          sample,
+          pedestalWidth,
+          sample,
+          noiseADC,
+          sample,
+          noisePhoto);
+#endif
+
+      // store to global memory
+      amplitudesForChannel[sample] = amplitude;
+      noiseTermsForChannel[sample] = noiseTerm;
+    }
+
+    // TODO: remove what's not needed
+    //
+    // Evaluate the pulse-shape template for time slice `sample + shift`, for a pulse displaced
+    // in time by `pulse_time`.
+    //
+    // The start position is derived from `iniTimeShift + pulse_time` (the slew term is fixed to 0
+    // here) and the result is read from the lookup tables as `acc[...] + factor * diff[...]`:
+    //   - sample + shift == its_start : accVarLenIdx*Vec / diffVarItvlIdx*Vec, using the MinusOne
+    //                                   tables when bin_0_start == -1 and the Zero tables otherwise
+    //   - sample + shift >  its_start : acc25nsVec / diff25nsItvlVec
+    //   - sample + shift <  its_start : 0
+    //
+    // NOTE(review): pulse_time is presumably expressed in ns, as it is mixed with the nsPerBX based
+    // bin arithmetic below - confirm against the callers.
+    __forceinline__ __device__ float compute_pulse_shape_value(float const pulse_time,
+                                                               int const sample,
+                                                               int const shift,
+                                                               float const* acc25nsVec,
+                                                               float const* diff25nsItvlVec,
+                                                               float const* accVarLenIdxMinusOneVec,
+                                                               float const* diffVarItvlIdxMinusOneVec,
+                                                               float const* accVarLenIdxZeroVec,
+                                                               float const* diffVarItvlIdxZeroVec) {
+      // constants
+      constexpr float pulse_height = 1.0f;
+      constexpr float slew = 0.f;
+      constexpr auto ns_per_bx = nsPerBX;
+
+      // FIXME: clean up all the rounding... this is coming from original cpu version
+      float const i_start_float =
+          -iniTimeShift - pulse_time - slew > 0.f ? 0.f : std::abs(-iniTimeShift - pulse_time - slew) + 1.f;
+      int i_start = static_cast<int>(i_start_float);
+      float offset_start = static_cast<float>(i_start) - iniTimeShift - pulse_time - slew;
+      // FIXME: do we need a check for nan???
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (shift == 0)
+        printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start);
+#endif
+
+      // boundary: an offset of exactly 1 is folded back onto the previous bin with offset 0
+      if (offset_start == 1.0f) {
+        offset_start = 0.f;
+        i_start -= 1;
+      }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (shift == 0)
+        printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start);
+#endif
+
+      int const bin_start = static_cast<int>(offset_start);
+      auto const bin_start_up = static_cast<float>(bin_start) + 0.5f;
+      int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start;
+      // time slice in which the pulse starts and the position inside that slice
+      int const its_start = i_start / ns_per_bx;
+      int const distTo25ns_start = nsPerBX - 1 - i_start % ns_per_bx;
+      // NOTE: the 0.5 literal is a double, hence factor and the table arithmetic that uses it are
+      // evaluated in double precision before being narrowed to float
+      auto const factor = offset_start - static_cast<float>(bin_0_start) - 0.5;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (shift == 0) {
+        printf("bin_start = %d bin_0_start = %d its_start = %d distTo25ns_start = %d factor = %f\n",
+               bin_start,
+               bin_0_start,
+               its_start,
+               distTo25ns_start,
+               factor);
+      }
+#endif
+
+      auto const sample_over10ts = sample + shift;
+      float value = 0.0f;
+      if (sample_over10ts == its_start) {
+        // the time slice in which the pulse starts: use the variable-length tables
+        value = bin_0_start == -1
+                    ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start]
+                    : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
+      } else if (sample_over10ts > its_start) {
+        // later time slices: use the tables indexed by the bin counted from the pulse start
+        int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
+        value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
+      }
+      // time slices before the start of the pulse keep value == 0
+      value *= pulse_height;
+      return value;
+    }
+
+    // TODO: need to add an array of offsets for pulses (a la activeBXs...)
+    // Assume for now 8 pulses
+    //
+    // Fill, for every channel, the pulse-shape matrix evaluated at the pulse time t0
+    // (pulseMatrices) and at t0 - deltaT / t0 + deltaT (pulseMatricesM / pulseMatricesP).
+    //
+    // Thread layout, as used by the index computations below:
+    //   threadIdx.x -> sample   (blockDim.x is taken as the number of samples)
+    //   threadIdx.y -> pulse    (blockDim.y is taken as the number of pulses)
+    //   threadIdx.z -> channel inside the block; global channel = threadIdx.z + blockIdx.x * blockDim.z
+    // Threads whose global channel is >= nchannelsTotal return immediately.
+    //
+    // Global channels are laid out as [ f01HE | f5HB | f3HB ]: the detector id is read from
+    // idsf01HE, idsf5HB or idsf3HB accordingly, and deltaT is timeSigmaHPD for the f5HB range and
+    // timeSigmaSiPM for the other two.
+    //
+    // Each thread writes one element per output matrix at
+    //   nsamples * npulses * gch + ipulse * nsamples + sample
+    // The element is the pulse shape at index (sample - pulseOffsets[ipulse]) when that index lies
+    // in [0, nsamples), and 0 otherwise.
+    //
+    // When applyTimeSlew is set, t0 is meanTime plus the time-slew delay computed from the
+    // amplitude of the sample at (soi + pulseOffset); amplitudes <= 1 use the delay for 1.
+    __global__ void kernel_prep_pulseMatrices_sameNumberOfSamples(float* pulseMatrices,
+                                                                  float* pulseMatricesM,
+                                                                  float* pulseMatricesP,
+                                                                  int const* pulseOffsets,
+                                                                  float const* amplitudes,
+                                                                  uint32_t const* idsf01HE,
+                                                                  uint32_t const* idsf5HB,
+                                                                  uint32_t const* idsf3HB,
+                                                                  uint32_t const nchannelsf01HE,
+                                                                  uint32_t const nchannelsf5HB,
+                                                                  uint32_t const nchannelsTotal,
+                                                                  int8_t const* soiSamples,
+                                                                  uint32_t const* recoPulseShapeIds,
+                                                                  float const* acc25nsVecValues,
+                                                                  float const* diff25nsItvlVecValues,
+                                                                  float const* accVarLenIdxMinusOneVecValues,
+                                                                  float const* diffVarItvlIdxMinusOneVecValues,
+                                                                  float const* accVarLenIdxZeroVecValues,
+                                                                  float const* diffVarItvlIdxZeroVecValues,
+                                                                  float const meanTime,
+                                                                  float const timeSigmaSiPM,
+                                                                  float const timeSigmaHPD,
+                                                                  int const maxDepthHB,
+                                                                  int const maxDepthHE,
+                                                                  int const maxPhiHE,
+                                                                  int const firstHBRing,
+                                                                  int const lastHBRing,
+                                                                  int const firstHERing,
+                                                                  int const lastHERing,
+                                                                  int const nEtaHB,
+                                                                  int const nEtaHE,
+                                                                  uint32_t const offsetForHashes,
+                                                                  bool const applyTimeSlew,
+                                                                  float const tzeroTimeSlew,
+                                                                  float const slopeTimeSlew,
+                                                                  float const tmaxTimeSlew) {
+      // indices
+      auto const ipulse = threadIdx.y;
+      auto const npulses = blockDim.y;
+      auto const sample = threadIdx.x;
+      auto const nsamples = blockDim.x;
+      auto const lch = threadIdx.z;
+      auto const gch = lch + blockIdx.x * blockDim.z;
+      auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
+
+      if (gch >= nchannelsTotal)
+        return;
+
+      // conditions
+      auto const id = gch < nchannelsf01HE
+                          ? idsf01HE[gch]
+                          : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
+      // only the f5HB range uses the HPD time sigma
+      auto const deltaT = gch >= nchannelsf01HE && gch < nchannelsf015 ? timeSigmaHPD : timeSigmaSiPM;
+      auto const did = DetId{id};
+      // HE hashes are placed after the HB ones, shifted by offsetForHashes
+      auto const hashedId =
+          did.subdetId() == HcalBarrel
+              ? did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
+      // per pulse-shape lookup tables: maxPSshapeBin (resp. nsPerBX) entries per shape id
+      auto const recoPulseShapeId = recoPulseShapeIds[hashedId];
+      auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * maxPSshapeBin;
+      auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * maxPSshapeBin;
+      auto const* accVarLenIdxMinusOneVec = accVarLenIdxMinusOneVecValues + recoPulseShapeId * nsPerBX;
+      auto const* diffVarItvlIdxMinusOneVec = diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * nsPerBX;
+      auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * nsPerBX;
+      auto const* diffVarItvlIdxZeroVec = diffVarItvlIdxZeroVecValues + recoPulseShapeId * nsPerBX;
+
+      // offset output arrays
+      auto* pulseMatrix = pulseMatrices + nsamples * npulses * gch;
+      auto* pulseMatrixM = pulseMatricesM + nsamples * npulses * gch;
+      auto* pulseMatrixP = pulseMatricesP + nsamples * npulses * gch;
+
+      // amplitude per ipulse
+      int const soi = soiSamples[gch];
+      int const pulseOffset = pulseOffsets[ipulse];
+      auto const amplitude = amplitudes[gch * nsamples + pulseOffset + soi];
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
+      if (id != DETID_TO_DEBUG)
+        return;
+#endif
+#endif
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (sample == 0 && ipulse == 0) {
+        for (int i = 0; i < 8; i++)
+          printf("amplitude(%d) = %f\n", i, amplitudes[gch * nsamples + i]);
+        printf("acc25nsVec and diff25nsItvlVec for recoPulseShapeId = %u\n", recoPulseShapeId);
+        for (int i = 0; i < 256; i++) {
+          printf("acc25nsVec(%d) = %f diff25nsItvlVec(%d) = %f\n", i, acc25nsVec[i], i, diff25nsItvlVec[i]);
+        }
+        printf("accVarLenIdxZEROVec and accVarLenIdxMinusOneVec\n");
+        for (int i = 0; i < 25; i++) {
+          printf("accVarLenIdxZEROVec(%d) = %f accVarLenIdxMinusOneVec(%d) = %f\n",
+                 i,
+                 accVarLenIdxZeroVec[i],
+                 i,
+                 accVarLenIdxMinusOneVec[i]);
+        }
+        printf("diffVarItvlIdxZEROVec and diffVarItvlIdxMinusOneVec\n");
+        for (int i = 0; i < 25; i++) {
+          printf("diffVarItvlIdxZEROVec(%d) = %f diffVarItvlIdxMinusOneVec(%d) = %f\n",
+                 i,
+                 diffVarItvlIdxZeroVec[i],
+                 i,
+                 diffVarItvlIdxMinusOneVec[i]);
+        }
+      }
+#endif
+
+      // pulse time: mean time, optionally corrected for the amplitude dependent time slew
+      auto t0 = meanTime;
+      if (applyTimeSlew) {
+        if (amplitude <= 1.0f)
+          t0 += compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
+        else
+          t0 += compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
+      }
+      auto const t0m = -deltaT + t0;
+      auto const t0p = deltaT + t0;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (sample == 0 && ipulse == 0) {
+        printf("time values: %f %f %f\n", t0, t0m, t0p);
+      }
+
+      if (sample == 0 && ipulse == 0) {
+        for (int i = 0; i < 10; i++) {
+          auto const value = compute_pulse_shape_value(t0,
+                                                       i,
+                                                       0,
+                                                       acc25nsVec,
+                                                       diff25nsItvlVec,
+                                                       accVarLenIdxMinusOneVec,
+                                                       diffVarItvlIdxMinusOneVec,
+                                                       accVarLenIdxZeroVec,
+                                                       diffVarItvlIdxZeroVec);
+          printf("pulse(%d) = %f\n", i, value);
+        }
+        printf("\n");
+        for (int i = 0; i < 10; i++) {
+          auto const value = compute_pulse_shape_value(t0p,
+                                                       i,
+                                                       0,
+                                                       acc25nsVec,
+                                                       diff25nsItvlVec,
+                                                       accVarLenIdxMinusOneVec,
+                                                       diffVarItvlIdxMinusOneVec,
+                                                       accVarLenIdxZeroVec,
+                                                       diffVarItvlIdxZeroVec);
+          printf("pulseP(%d) = %f\n", i, value);
+        }
+        printf("\n");
+        for (int i = 0; i < 10; i++) {
+          auto const value = compute_pulse_shape_value(t0m,
+                                                       i,
+                                                       0,
+                                                       acc25nsVec,
+                                                       diff25nsItvlVec,
+                                                       accVarLenIdxMinusOneVec,
+                                                       diffVarItvlIdxMinusOneVec,
+                                                       accVarLenIdxZeroVec,
+                                                       diffVarItvlIdxZeroVec);
+          printf("pulseM(%d) = %f\n", i, value);
+        }
+      }
+#endif
+
+      // FIXME: shift should be treated properly,
+      // here assume 8 time slices and 8 samples
+      auto const shift = 4 - soi;  // as in cpu version!
+
+      // NOTE: threadIdx.x / blockDim.x are unsigned. Do the subtraction in signed arithmetic:
+      // with an unsigned idx the test `idx >= 0` is always true and the range check would rely
+      // on unsigned wrap-around only.
+      int const idx = static_cast<int>(sample) - pulseOffset;
+      bool const inRange = idx >= 0 && idx < static_cast<int>(nsamples);
+
+      // pulse shape seen by this (sample, pulse) pair for a given pulse time; 0 outside the window
+      auto const shapeAt = [&](float const pulseTime) -> float {
+        return inRange ? compute_pulse_shape_value(pulseTime,
+                                                   idx,
+                                                   shift,
+                                                   acc25nsVec,
+                                                   diff25nsItvlVec,
+                                                   accVarLenIdxMinusOneVec,
+                                                   diffVarItvlIdxMinusOneVec,
+                                                   accVarLenIdxZeroVec,
+                                                   diffVarItvlIdxZeroVec)
+                       : 0.f;
+      };
+      auto const value = shapeAt(t0);
+      auto const value_t0m = shapeAt(t0m);
+      auto const value_t0p = shapeAt(t0p);
+
+      // store to global
+      pulseMatrix[ipulse * nsamples + sample] = value;
+      pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
+      pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
+    }
+
+    // FIXME: provide specialization for Row Major layout
+    //
+    // Non-owning view over the packed lower triangle of a Stride x Stride symmetric matrix.
+    //
+    // The `total` = Stride * (Stride + 1) / 2 entries are stored column after column: column 0
+    // occupies the first Stride slots, column 1 the next Stride - 1, and so on. Element
+    // (row, col) is only meaningful for row >= col; no bounds or ordering check is done.
+    // The `Order` template parameter is not used by this implementation.
+    template <typename T, int Stride, int Order = Eigen::ColMajor>
+    struct MapSymM {
+      using type = T;
+      using base_type = typename std::remove_const<type>::type;
+
+      static constexpr int total = Stride * (Stride + 1) / 2;
+      static constexpr int stride = Stride;
+      T* data;
+
+      __forceinline__ __device__ MapSymM(T* data) : data{data} {}
+
+      // read-only access to element (row, col)
+      __forceinline__ __device__ T const& operator()(int const row, int const col) const {
+        return data[packedIndex(row, col)];
+      }
+
+      // writable access to element (row, col); taking part in overload resolution only when T is
+      // not const-qualified
+      template <typename U = T>
+      __forceinline__ __device__ typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
+      operator()(int const row, int const col) {
+        return data[packedIndex(row, col)];
+      }
+
+    private:
+      // position of (row, col) inside the packed storage
+      __forceinline__ __device__ static int packedIndex(int const row, int const col) {
+        // number of entries stored for columns col .. Stride - 1
+        auto const remaining = (Stride - col) * (Stride - col + 1) / 2;
+        return total - remaining + row - col;
+      }
+    };
+
+    // simple/trivial cholesky decomposition impl
+    //
+    // Fills the lower triangle of L such that M = L * L^T, for the full compile-time size
+    // MatrixType1::stride. Only M(i, j) with i >= j is read. There is no check that M is positive
+    // definite: the square roots are taken as they come.
+    template <typename MatrixType1, typename MatrixType2>
+    __forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
+      using T = typename MatrixType1::base_type;
+
+      // the first diagonal element has nothing to subtract
+      auto const pivot0 = std::sqrt(M(0, 0));
+      L(0, 0) = pivot0;
+
+#pragma unroll
+      for (int row = 1; row < MatrixType1::stride; ++row) {
+        // running sum of squares of the off-diagonal entries of this row
+        T rowSquares{0};
+        for (int col = 0; col < row; ++col) {
+          auto const m_row_col = M(row, col);
+
+          // dot product of the already known parts of rows `row` and `col`
+          T dot{0};
+          for (int k = 0; k < col; ++k)
+            dot += L(row, k) * L(col, k);
+
+          auto const entry = (m_row_col - dot) / L(col, col);
+          L(row, col) = entry;
+          rowSquares += entry * entry;
+        }
+
+        auto const pivot = std::sqrt(M(row, row) - rowSquares);
+        L(row, row) = pivot;
+      }
+    }
+
+    // Cholesky decomposition of the leading N x N part of M: fills the lower triangle of L such
+    // that M = L * L^T for those N rows/columns. Same algorithm as the unrolled variant, with the
+    // size given at run time. Only M(i, j) with i >= j is read; positive definiteness of M is not
+    // checked.
+    template <typename MatrixType1, typename MatrixType2>
+    __forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
+      using T = typename MatrixType1::base_type;
+
+      // the first diagonal element has nothing to subtract
+      auto const pivot0 = std::sqrt(M(0, 0));
+      L(0, 0) = pivot0;
+
+      for (int row = 1; row < N; ++row) {
+        // running sum of squares of the off-diagonal entries of this row
+        T rowSquares{0};
+        for (int col = 0; col < row; ++col) {
+          auto const m_row_col = M(row, col);
+
+          // dot product of the already known parts of rows `row` and `col`
+          T dot{0};
+          for (int k = 0; k < col; ++k)
+            dot += L(row, k) * L(col, k);
+
+          auto const entry = (m_row_col - dot) / L(col, col);
+          L(row, col) = entry;
+          rowSquares += entry * entry;
+        }
+
+        auto const pivot = std::sqrt(M(row, row) - rowSquares);
+        L(row, row) = pivot;
+      }
+    }
+
+    // Cholesky decomposition fused with forward substitution, on a sub-system selected through
+    // `pulseOffsets`.
+    //
+    // Row/column i (0 <= i < N) of the factorised system is row/column pulseOffsets(i) of M, and
+    // its right-hand side is Atb(pulseOffsets(i)). M is always addressed as (max, min) of the two
+    // real indices, i.e. through its lower triangle. On return the first N rows of L hold the
+    // factor and b[0..N) holds the solution of L * b = rhs. M is not checked for positive
+    // definiteness.
+    template <typename MatrixType1, typename MatrixType2, typename VectorType>
+    __forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets(
+        MatrixType1& L,
+        MatrixType2 const& M,
+        float b[MatrixType1::stride],
+        VectorType const& Atb,
+        int const N,
+        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
+      using T = typename MatrixType1::base_type;
+
+      // first row: nothing to subtract yet
+      auto const real0 = pulseOffsets(0);
+      auto const pivot0 = std::sqrt(M(real0, real0));
+      L(0, 0) = pivot0;
+      b[0] = Atb(real0) / pivot0;
+
+      for (int row = 1; row < N; ++row) {
+        auto const rowReal = pulseOffsets(row);
+        // sum of squares of the off-diagonal entries / sum of L(row, col) * b[col]
+        T rowSquares{0};
+        T rhsAccum = 0;
+        auto const rhs = Atb(rowReal);
+        for (int col = 0; col < row; ++col) {
+          auto const colReal = pulseOffsets(col);
+          auto const m_row_col = M(std::max(rowReal, colReal), std::min(rowReal, colReal));
+
+          // dot product of the already known parts of rows `row` and `col`
+          T dot{0};
+          for (int k = 0; k < col; ++k)
+            dot += L(row, k) * L(col, k);
+
+          auto const entry = (m_row_col - dot) / L(col, col);
+          L(row, col) = entry;
+
+          rowSquares += entry * entry;
+          rhsAccum += entry * b[col];
+        }
+
+        auto const pivot = std::sqrt(M(rowReal, rowReal) - rowSquares);
+        L(row, row) = pivot;
+        b[row] = (rhs - rhsAccum) / pivot;
+      }
+    }
+
+    // Extend an existing decomposition / forward substitution by one row.
+    //
+    // Computes row N - 1 of L and the element b[N - 1] only; rows 0 .. N - 2 of L and the elements
+    // b[0 .. N - 2] are read and therefore have to be valid on entry. As in the full computation,
+    // index i of the factorised system maps to pulseOffsets(i) in M and Atb, and M is addressed
+    // as (max, min) of the two real indices.
+    template <typename MatrixType1, typename MatrixType2, typename VectorType>
+    __forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets(
+        MatrixType1& L,
+        MatrixType2 const& M,
+        float b[MatrixType1::stride],
+        VectorType const& Atb,
+        int const N,
+        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
+      using T = typename MatrixType1::base_type;
+
+      // the only row to be computed is the last one
+      auto const row = N - 1;
+      auto const rowReal = pulseOffsets(row);
+      // sum of squares of the off-diagonal entries / sum of L(row, col) * b[col]
+      T rowSquares{0};
+      T rhsAccum = 0;
+      for (int col = 0; col < row; ++col) {
+        auto const colReal = pulseOffsets(col);
+        auto const m_row_col = M(std::max(rowReal, colReal), std::min(rowReal, colReal));
+
+        // dot product of the already known parts of rows `row` and `col`
+        T dot{0};
+        for (int k = 0; k < col; ++k)
+          dot += L(row, k) * L(col, k);
+
+        auto const entry = (m_row_col - dot) / L(col, col);
+        L(row, col) = entry;
+        rowSquares += entry * entry;
+
+        rhsAccum += entry * b[col];
+      }
+
+      auto const pivot = std::sqrt(M(rowReal, rowReal) - rowSquares);
+      L(row, row) = pivot;
+      b[row] = (Atb(rowReal) - rhsAccum) / pivot;
+    }
+
+    // Solve L * A = P by forward substitution, one column of P (`pulseMatrixView`) at a time.
+    //
+    // `matrixL` is only read at (row, col) with row >= col. Every column of P is first copied to a
+    // local array (through __ldg), then reduced in place while the matching column of A is
+    // written.
+    template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
+    __device__ void solve_forward_subst_matrix(MatrixType1& A,
+                                               MatrixType2 const& pulseMatrixView,
+                                               MatrixType3 const& matrixL) {
+      // FIXME: this assumes pulses are on columns and samples on rows
+      constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
+      constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;
+
+#pragma unroll
+      for (int col = 0; col < NPULSES; ++col) {
+        float rhs[NSAMPLES];
+        float lColumn[NSAMPLES];
+
+        // stage the right-hand-side column together with column 0 of the cholesky factor
+#pragma unroll
+        for (int row = 0; row < NSAMPLES; ++row) {
+          rhs[row] = __ldg(&pulseMatrixView.coeffRef(row, col));
+          lColumn[row] = matrixL(row, 0);
+        }
+
+        // first unknown of this column
+        auto solved = rhs[0] / lColumn[0];
+        A(0, col) = solved;
+
+        // remaining unknowns
+#pragma unroll
+        for (int step = 1; step < NSAMPLES; ++step) {
+          // remove the contribution of the previously solved unknown from the rows still open
+#pragma unroll
+          for (int row = step; row < NSAMPLES; ++row)
+            rhs[row] -= solved * lColumn[row];
+
+          // fetch column `step` of the cholesky factor, from the diagonal downwards
+#pragma unroll
+          for (int row = step; row < NSAMPLES; ++row)
+            lColumn[row] = matrixL(row, step);
+
+          // next unknown, stored as A(step, col)
+          solved = rhs[step] / lColumn[step];
+          A(step, col) = solved;
+        }
+      }
+    }
+
+    // Solve L * x = a by forward substitution for a single right-hand side.
+    //
+    // `a` is read from `inputAmplitudesView`, `matrixL` is only read at (row, col) with
+    // row >= col, and the solution x is written to `reg_b`. The input is copied to a local array
+    // first, so it is left untouched.
+    template <typename MatrixType1, typename MatrixType2>
+    __device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
+                                               MatrixType1 inputAmplitudesView,
+                                               MatrixType2 matrixL) {
+      constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime;
+
+      float rhs[NSAMPLES];
+      float lColumn[NSAMPLES];
+
+      // stage the right-hand side together with column 0 of the cholesky factor
+#pragma unroll
+      for (int row = 0; row < NSAMPLES; ++row) {
+        rhs[row] = inputAmplitudesView(row);
+        lColumn[row] = matrixL(row, 0);
+      }
+
+      // first unknown
+      auto solved = rhs[0] / lColumn[0];
+      reg_b[0] = solved;
+
+      // remaining unknowns
+#pragma unroll
+      for (int step = 1; step < NSAMPLES; ++step) {
+        // remove the contribution of the previously solved unknown from the rows still open
+#pragma unroll
+        for (int row = step; row < NSAMPLES; ++row)
+          rhs[row] -= solved * lColumn[row];
+
+        // fetch column `step` of the cholesky factor, from the diagonal downwards
+#pragma unroll
+        for (int row = step; row < NSAMPLES; ++row)
+          lColumn[row] = matrixL(row, step);
+
+        // next unknown, stored as reg_b[step]
+        solved = rhs[step] / lColumn[step];
+        reg_b[step] = solved;
+      }
+    }
+
+    // Fast non-negative least squares on the normal equations (AtA, Atb). TODO: add active bxs
+    template <typename MatrixType, typename VectorType>
+    __device__ void fnnls(MatrixType const& AtA,
+                          VectorType const& Atb,
+                          VectorType& solution,
+                          int& npassive,
+                          ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
+                          MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
+                          double const eps,
+                          int const maxIterations) {
+      // constants
+      constexpr auto NPULSES = VectorType::RowsAtCompileTime;
+
+      // to keep track of where to terminate if converged
+      Eigen::Index w_max_idx_prev = 0;
+      float w_max_prev = 0;
+      auto eps_to_use = eps;
+      bool recompute = false;
+
+      // used throughout
+      VectorType s;
+      float reg_b[NPULSES];
+      // matrixL is storage provided by the caller for the decomposition used in the inner loop;
+      // pulseOffsets is a permutation of pulse indices whose first npassive entries form the passive set
+
+      int iter = 0;
+      while (true) {
+        if (iter > 0 || npassive == 0) {  // the first pass reuses a passive set handed in by the caller, if any
+          auto const nactive = NPULSES - npassive;
+          // exit if there are no more pulses to constrain
+          if (nactive == 0)
+            break;
+
+          // compute the gradient for the active pulses; AtA is only read at (row >= col)
+          //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive);
+          Eigen::Index w_max_idx = 0;
+          float w_max = -std::numeric_limits<float>::max();
+          for (int icol = npassive; icol < NPULSES; icol++) {
+            auto const icol_real = pulseOffsets(icol);
+            auto const atb = Atb(icol_real);
+            float sum = 0;
+#pragma unroll
+            for (int counter = 0; counter < NPULSES; counter++)
+              sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter)
+                                         : AtA(icol_real, counter) * solution(counter);
+
+            auto const w = atb - sum;
+            if (w > w_max) {
+              w_max = w;
+              w_max_idx = icol - npassive;
+            }
+          }
+
+          // check for convergence: gradient below threshold, or the same maximum as in the previous iteration
+          if (w_max < eps_to_use || (w_max_idx == w_max_idx_prev && w_max == w_max_prev))
+            break;
+
+          if (iter >= maxIterations)
+            break;
+
+          w_max_prev = w_max;
+          w_max_idx_prev = w_max_idx;
+
+          // move index to the right part of the vector
+          w_max_idx += npassive;
+
+          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx));
+          ++npassive;
+        }
+
+        // inner loop: solve on the passive set until all of its values are positive
+        while (true) {
+          if (npassive == 0)
+            break;
+
+          //s.head(npassive)
+          //auto const& matrixL =
+          //    AtA.topLeftCorner(npassive, npassive)
+          //        .llt().matrixL();
+          //.solve(Atb.head(npassive));
+          if (recompute || iter == 0)
+            compute_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+          else
+            update_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+
+          // run backward substitution
+          s(npassive - 1) = reg_b[npassive - 1] / matrixL(npassive - 1, npassive - 1);
+          for (int i = npassive - 2; i >= 0; --i) {
+            float total = 0;
+            for (int j = i + 1; j < npassive; j++)
+              total += matrixL(j, i) * s(j);
+
+            s(i) = (reg_b[i] - total) / matrixL(i, i);
+          }
+
+          // done if solution values are all positive
+          if (s.head(npassive).minCoeff() > 0.f) {
+            for (int i = 0; i < npassive; i++) {
+              auto const i_real = pulseOffsets(i);
+              solution(i_real) = s(i);
+            }
+            //solution.head(npassive) = s.head(npassive);
+            recompute = false;
+            break;
+          }
+
+          // there were negative values -> have to recompute the whole decomp
+          recompute = true;
+
+          auto alpha = std::numeric_limits<float>::max();
+          Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
+          for (int i = 0; i < npassive; i++) {
+            if (s[i] <= 0.) {
+              auto const i_real = pulseOffsets(i);
+              auto const ratio = solution[i_real] / (solution[i_real] - s[i]);
+              if (ratio < alpha) {
+                alpha = ratio;
+                alpha_idx = i;
+                alpha_idx_real = i_real;
+              }
+            }
+          }
+
+          // update solution: step from the current solution towards s by the smallest ratio found above
+          for (int i = 0; i < npassive; i++) {
+            auto const i_real = pulseOffsets(i);
+            solution(i_real) += alpha * (s(i) - solution(i_real));
+          }
+          //solution.head(npassive) += alpha *
+          //    (s.head(npassive) - solution.head(npassive));
+          solution[alpha_idx_real] = 0;
+          --npassive;
+
+          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx));
+        }
+
+        // as in cpu: relax the convergence threshold by a factor 10 every 10 iterations
+        ++iter;
+        if (iter % 10 == 0)
+          eps_to_use *= 10;
+      }
+    }
+
+    template <int NSAMPLES, int NPULSES>
+    __forceinline__ __device__ void update_covariance(
+        ColumnVector<NPULSES> const& resultAmplitudesVector,
+        MapSymM<float, NSAMPLES>& covarianceMatrix,  // in/out: only entries with row >= col are updated
+        Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
+        Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
+        Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
+#pragma unroll
+      for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
+        auto const resultAmplitude = resultAmplitudesVector(ipulse);
+        if (resultAmplitude == 0)
+          continue;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("pulse cov array for ibx = %d\n", ipulse);
+#endif
+
+        // preload column ipulse of the three pulse matrices
+        float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
+#pragma unroll
+        for (int counter = 0; counter < NSAMPLES; counter++) {
+          pmcol[counter] = __ldg(&pulseMatrix.coeffRef(counter, ipulse));
+          pmpcol[counter] = __ldg(&pulseMatrixP.coeffRef(counter, ipulse));
+          pmmcol[counter] = __ldg(&pulseMatrixM.coeffRef(counter, ipulse));
+        }
+        // each pulse adds ampl^2 * 0.5 * (dP_row * dP_col + dM_row * dM_col), with dX = pulseMatrixX - pulseMatrix
+        auto const ampl2 = resultAmplitude * resultAmplitude;
+#pragma unroll
+        for (int col = 0; col < NSAMPLES; col++) {
+          auto const valueP_col = pmpcol[col];
+          auto const valueM_col = pmmcol[col];
+          auto const value_col = pmcol[col];
+          auto const tmppcol = valueP_col - value_col;
+          auto const tmpmcol = valueM_col - value_col;
+
+          // diagonal
+          auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
+          covarianceMatrix(col, col) += ampl2 * tmp_value;
+
+// FIXME: understand if this actually gets unrolled
+#pragma unroll
+          for (int row = col + 1; row < NSAMPLES; row++) {
+            float const valueP_row = pmpcol[row];  //pulseMatrixP(j, ipulseReal);
+            float const value_row = pmcol[row];    //pulseMatrix(j, ipulseReal);
+            float const valueM_row = pmmcol[row];  //pulseMatrixM(j, ipulseReal);
+
+            float tmpprow = valueP_row - value_row;
+            float tmpmrow = valueM_row - value_row;
+
+            auto const covValue = 0.5 * (tmppcol * tmpprow + tmpmcol * tmpmrow);
+
+            covarianceMatrix(row, col) += ampl2 * covValue;
+          }
+        }
+      }
+    }
+
+    template <int NSAMPLES, int NPULSES>
+    __global__ void kernel_minimize(float* outputEnergy,
+                                    float* outputChi2,
+                                    float const* __restrict__ inputAmplitudes,
+                                    float const* __restrict__ pulseMatrices,
+                                    float const* __restrict__ pulseMatricesM,
+                                    float const* __restrict__ pulseMatricesP,
+                                    int const* __restrict__ pulseOffsetValues,
+                                    float const* __restrict__ noiseTerms,
+                                    int8_t const* __restrict__ soiSamples,
+                                    float const* __restrict__ pedestalWidths,
+                                    float const* __restrict__ effectivePedestalWidths,
+                                    bool const useEffectivePedestals,
+                                    uint32_t const* __restrict__ idsf01HE,
+                                    uint32_t const* __restrict__ idsf5HB,
+                                    uint32_t const* __restrict__ idsf3HB,
+                                    float const* __restrict__ gainValues,
+                                    float const* __restrict__ respCorrectionValues,
+                                    uint32_t const nchannelsf01HE,
+                                    uint32_t const nchannelsf5HB,
+                                    uint32_t const nchannelsTotal,
+                                    uint32_t const offsetForHashes,
+                                    int const maxDepthHB,
+                                    int const maxDepthHE,
+                                    int const maxPhiHE,
+                                    int const firstHBRing,
+                                    int const lastHBRing,
+                                    int const firstHERing,
+                                    int const lastHERing,
+                                    int const nEtaHB,
+                                    int const nEtaHE) {
+      // can be relaxed if needed - minor updates are needed in that case!
+      static_assert(NPULSES == NSAMPLES);
+
+      // one thread per channel: gch is the global channel index
+      auto const gch = threadIdx.x + blockIdx.x * blockDim.x;
+      auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
+      if (gch >= nchannelsTotal)
+        return;
+
+      // if chi2 is set to -9999 do not run minimization
+      if (outputChi2[gch] == -9999.f)
+        return;
+
+      // configure shared mem: each thread gets one MapSymM<float, NPULSES> block for fnnls and one for AtA
+      extern __shared__ char shrmem[];
+      float* shrMatrixLFnnlsStorage = reinterpret_cast<float*>(shrmem) + MapSymM<float, NPULSES>::total * threadIdx.x;
+      float* shrAtAStorage =
+          reinterpret_cast<float*>(shrmem) + MapSymM<float, NPULSES>::total * (threadIdx.x + blockDim.x);
+
+      // conditions for pedestal widths
+      auto const id = gch < nchannelsf01HE
+                          ? idsf01HE[gch]
+                          : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
+      // channels are laid out as [f01HE | f5HB | f3HB]: gch below nchannelsf01HE indexes idsf01HE,
+      // gch below nchannelsf015 indexes idsf5HB (after subtracting nchannelsf01HE),
+      // anything else indexes idsf3HB (after subtracting nchannelsf015)
+      auto const did = DetId{id};
+      auto const hashedId =
+          did.subdetId() == HcalBarrel
+              ? did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
+
+      auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
+                                                 ? effectivePedestalWidths + hashedId * 4
+                                                 : pedestalWidths + hashedId * 4;
+      auto const averagePedestalWidth2 = 0.25 * (pedestalWidthsForChannel[0] * pedestalWidthsForChannel[0] +
+                                                 pedestalWidthsForChannel[1] * pedestalWidthsForChannel[1] +
+                                                 pedestalWidthsForChannel[2] * pedestalWidthsForChannel[2] +
+                                                 pedestalWidthsForChannel[3] * pedestalWidthsForChannel[3]);
+      auto const* gains = gainValues + hashedId * 4;
+      // FIXME on cpu ts 0 capid was used - does it make any difference
+      auto const gain = gains[0];
+      auto const respCorrection = respCorrectionValues[hashedId];
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
+      if (id != DETID_TO_DEBUG)
+        return;
+#endif
+#endif
+
+      // TODO: provide this properly
+      int const soi = soiSamples[gch];
+      constexpr float deltaChi2Threashold = 1e-3;
+
+      ColumnVector<NPULSES, int> pulseOffsets;
+#pragma unroll
+      for (int i = 0; i < NPULSES; ++i)
+        pulseOffsets(i) = i;
+      //        pulseOffsets(i) = pulseOffsetValues[i] - pulseOffsetValues[0];
+
+      // output amplitudes/weights
+      ColumnVector<NPULSES> resultAmplitudesVector = ColumnVector<NPULSES>::Zero();
+
+      // map views onto this channel's slice of the global input buffers
+      Eigen::Map<const ColumnVector<NSAMPLES>> inputAmplitudesView{inputAmplitudes + gch * NSAMPLES};
+      Eigen::Map<const ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
+      Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{pulseMatricesM +
+                                                                              gch * NSAMPLES * NPULSES};
+      Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{pulseMatricesP +
+                                                                              gch * NSAMPLES * NPULSES};
+      Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{pulseMatrices + gch * NSAMPLES * NPULSES};
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      for (int i = 0; i < NSAMPLES; i++)
+        printf("inputValues(%d) = %f noiseTerms(%d) = %f\n", i, inputAmplitudesView(i), i, noiseTermsView(i));
+      for (int i = 0; i < NSAMPLES; i++) {
+        for (int j = 0; j < NPULSES; j++)
+          printf("%f ", glbPulseMatrixView(i, j));
+        printf("\n");
+      }
+      printf("\n");
+      for (int i = 0; i < NSAMPLES; i++) {
+        for (int j = 0; j < NPULSES; j++)
+          printf("%f ", glbPulseMatrixMView(i, j));
+        printf("\n");
+      }
+      printf("\n");
+      for (int i = 0; i < NSAMPLES; i++) {
+        for (int j = 0; j < NPULSES; j++)
+          printf("%f ", glbPulseMatrixPView(i, j));
+        printf("\n");
+      }
+#endif
+
+      int npassive = 0;
+      float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
+      // TODO: provide constants from configuration
+      for (int iter = 1; iter < 50; iter++) {
+        //float covarianceMatrixStorage[MapSymM<float, NSAMPLES>::total];
+        // NOTE: only works when NSAMPLES == NPULSES
+        // if does not hold -> slightly rearrange shared mem to still reuse
+        // shared memory
+        float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
+        MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
+#pragma unroll
+        for (int counter = 0; counter < MapSymM<float, NSAMPLES>::total; counter++)
+          covarianceMatrixStorage[counter] = averagePedestalWidth2;
+#pragma unroll
+        for (int counter = 0; counter < MapSymM<float, NSAMPLES>::stride; counter++)
+          covarianceMatrix(counter, counter) += __ldg(&noiseTermsView.coeffRef(counter));
+
+        // update covariance matrix
+        update_covariance(
+            resultAmplitudesVector, covarianceMatrix, glbPulseMatrixView, glbPulseMatrixMView, glbPulseMatrixPView);
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("covariance matrix\n");
+        for (int i = 0; i < NSAMPLES; i++) {
+          for (int j = 0; j < NSAMPLES; j++)
+            printf("%f ", covarianceMatrix(i, j));
+          printf("\n");
+        }
+#endif
+
+        // compute Cholesky Decomposition L matrix
+        //matrixDecomposition.compute(covarianceMatrix);
+        //auto const& matrixL = matrixDecomposition.matrixL();
+        float matrixLStorage[MapSymM<float, NSAMPLES>::total];
+        MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
+        compute_decomposition_unrolled(matrixL, covarianceMatrix);
+
+        //
+        // replace eigen
+        //
+        //auto const& A = matrixDecomposition
+        //    .matrixL()
+        //    .solve(pulseMatrixView);
+        ColMajorMatrix<NSAMPLES, NPULSES> A;
+        solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
+
+        //
+        // remove eigen
+        //
+        //auto const& b = matrixL
+        //   .solve(inputAmplitudesView);
+        //
+        float reg_b[NSAMPLES];
+        solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
+
+        // TODO: we do not really need to change these matrices
+        // will be fixed in the optimized version
+        //ColMajorMatrix<NPULSES, NPULSES> AtA = A.transpose() * A;
+        //ColumnVector<NPULSES> Atb = A.transpose() * b;
+        //ColMajorMatrix<NPULSES, NPULSES> AtA;
+        //float AtAStorage[MapSymM<float, NPULSES>::total];
+        MapSymM<float, NPULSES> AtA{shrAtAStorage};
+        ColumnVector<NPULSES> Atb;
+#pragma unroll
+        for (int icol = 0; icol < NPULSES; icol++) {
+          float reg_ai[NSAMPLES];
+
+// load column icol
+#pragma unroll
+          for (int counter = 0; counter < NSAMPLES; counter++)
+            reg_ai[counter] = A(counter, icol);
+
+          // compute diagonal
+          float sum = 0.f;
+#pragma unroll
+          for (int counter = 0; counter < NSAMPLES; counter++)
+            sum += reg_ai[counter] * reg_ai[counter];
+
+          // store
+          AtA(icol, icol) = sum;
+
+// go thru the other columns
+#pragma unroll
+          for (int j = icol + 1; j < NPULSES; j++) {
+            // load column j
+            float reg_aj[NSAMPLES];
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              reg_aj[counter] = A(counter, j);
+
+            // accum
+            float sum = 0.f;
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              sum += reg_aj[counter] * reg_ai[counter];
+
+            // store (only the entries with row > col are written)
+            //AtA(icol, j) = sum;
+            AtA(j, icol) = sum;
+          }
+
+          // Atb accum
+          float sum_atb = 0;
+#pragma unroll
+          for (int counter = 0; counter < NSAMPLES; counter++)
+            sum_atb += reg_ai[counter] * reg_b[counter];
+
+          // store atb
+          Atb(icol) = sum_atb;
+        }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("AtA\n");
+        for (int i = 0; i < NPULSES; i++) {
+          for (int j = 0; j < NPULSES; j++)
+            printf("%f ", AtA(i, j));
+          printf("\n");
+        }
+        printf("Atb\n");
+        for (int i = 0; i < NPULSES; i++)
+          printf("%f ", Atb(i));
+        printf("\n");
+        printf("result Amplitudes before nnls\n");
+        for (int i = 0; i < NPULSES; i++)
+          printf("%f ", resultAmplitudesVector(i));
+        printf("\n");
+#endif
+
+        // for fnnls: reuses the shared memory that held the covariance matrix above
+        MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
+
+        // run fast nnls
+        // FIXME: provide values from config
+        fnnls(AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, 1e-11, 500);
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("result Amplitudes\n");
+        for (int i = 0; i < NPULSES; i++)
+          printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i));
+#endif
+
+        // replace pulseMatrixView * result - inputs
+        // NOTE: accum = pulseMatrix * resultAmplitudes - inputAmplitudes
+        float accum[NSAMPLES];
+        Eigen::Map<ColumnVector<NSAMPLES>> mapAccum{accum};
+        {
+          float results[NPULSES];
+
+// preload results (no permutation is applied here: fnnls stores them at their real pulse index)
+#pragma unroll
+          for (int counter = 0; counter < NPULSES; counter++) {
+            results[counter] = resultAmplitudesVector[counter];
+          }
+
+// load accum
+#pragma unroll
+          for (int counter = 0; counter < NSAMPLES; counter++)
+            accum[counter] = -inputAmplitudesView(counter);
+
+          // iterate
+          for (int icol = 0; icol < NPULSES; icol++) {
+            float pm_col[NSAMPLES];
+
+// preload a column of pulse matrix
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              pm_col[counter] = __ldg(&glbPulseMatrixView.coeffRef(counter, icol));
+
+// accum
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              accum[counter] += results[icol] * pm_col[counter];
+          }
+        }
+
+        // compute chi2 and check that there is no rotation
+        //chi2 = matrixDecomposition
+        //    .matrixL()
+        //    . solve(mapAccum)
+        //            .solve(pulseMatrixView * resultAmplitudesVector - inputAmplitudesView)
+        //    .squaredNorm();
+        {
+          float reg_b_tmp[NSAMPLES];
+          float reg_L[NSAMPLES];
+          float accumSum = 0;
+
+// preload a column and load column 0 of cholesky
+#pragma unroll
+          for (int i = 0; i < NSAMPLES; i++) {
+            reg_b_tmp[i] = mapAccum(i);
+            reg_L[i] = matrixL(i, 0);
+          }
+
+          // compute x0 and store it
+          auto x_prev = reg_b_tmp[0] / reg_L[0];
+          accumSum += x_prev * x_prev;
+
+// iterate
+#pragma unroll
+          for (int iL = 1; iL < NSAMPLES; iL++) {
+// update accum
+#pragma unroll
+            for (int counter = iL; counter < NSAMPLES; counter++)
+              reg_b_tmp[counter] -= x_prev * reg_L[counter];
+
+// load the next column of cholesky
+#pragma unroll
+            for (int counter = iL; counter < NSAMPLES; counter++)
+              reg_L[counter] = matrixL(counter, iL);
+
+            // compute the next x
+            x_prev = reg_b_tmp[iL] / reg_L[iL];
+
+            // accumulate its square (squared norm of the solution)
+            accumSum += x_prev * x_prev;
+          }
+
+          chi2 = accumSum;
+        }
+
+        auto const deltaChi2 = std::abs(chi2 - previous_chi2);
+        if (chi2 == chi2_2itersback && chi2 < previous_chi2)
+          break;  // chi2 alternates between two values: stop on the lower one
+
+        // update
+        chi2_2itersback = previous_chi2;
+        previous_chi2 = chi2;
+
+        // exit condition
+        if (deltaChi2 < deltaChi2Threashold)
+          break;
+      }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      for (int i = 0; i < NPULSES; i++)
+        printf("pulseOffsets(%d) = %d outputAmplitudes(%d) = %f\n", i, pulseOffsets(i), i, resultAmplitudesVector(i));
+      printf("chi2 = %f\n", chi2);
+#endif
+
+      outputChi2[gch] = chi2;
+      auto const idx_for_energy = std::abs(pulseOffsetValues[0]);
+      outputEnergy[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
+      /*
+    #pragma unroll
+    for (int i=0; i<NPULSES; i++)
+        if (pulseOffsets[i] == soi)
+            // NOTE: gain is a number < 10^-3/4, multiply first to avoid stab issues
+            outputEnergy[gch] = (gain*resultAmplitudesVector(i))*respCorrection;
+    */
+    }
+
+    void entryPoint(InputDataGPU const& inputGPU,
+                    OutputDataGPU& outputGPU,
+                    ConditionsProducts const& conditions,
+                    ScratchDataGPU& scratch,
+                    ConfigParameters const& configParameters,
+                    cudaStream_t cudaStream) {
+      auto const totalChannels = inputGPU.f01HEDigis.size + inputGPU.f5HBDigis.size + inputGPU.f3HBDigis.size;
+      // launches on cudaStream, in order: kernel_prep1d_*, kernel_prep_pulseMatrices_*, kernel_minimize<8, 8>
+      // FIXME: may be move this assignment to emphasize this more clearly
+      // FIXME: number of channels for output might change given that
+      //   some channels might be filtered out
+      outputGPU.recHits.size = totalChannels;
+
+      // TODO: this can be lifted by implementing a separate kernel
+      // similar to the default one, but properly handling the diff in #samples
+      // or modifying existing one
+      auto const f01nsamples = compute_nsamples<Flavor01>(inputGPU.f01HEDigis.stride);
+      auto const f5nsamples = compute_nsamples<Flavor5>(inputGPU.f5HBDigis.stride);
+      auto const f3nsamples = compute_nsamples<Flavor3>(inputGPU.f3HBDigis.stride);
+      assert(f01nsamples == f5nsamples && f01nsamples == f3nsamples);
+
+      dim3 threadsPerBlock{f01nsamples, configParameters.kprep1dChannelsPerBlock};
+      int blocks = static_cast<uint32_t>(threadsPerBlock.y) > totalChannels
+                       ? 1
+                       : (totalChannels + threadsPerBlock.y - 1) / threadsPerBlock.y;
+      int nbytesShared =
+          ((2 * f01nsamples + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock;
+      kernel_prep1d_sameNumberOfSamples<<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
+          scratch.amplitudes,
+          scratch.noiseTerms,
+          outputGPU.recHits.energy,
+          outputGPU.recHits.chi2,
+          inputGPU.f01HEDigis.data,
+          inputGPU.f5HBDigis.data,
+          inputGPU.f3HBDigis.data,
+          inputGPU.f01HEDigis.ids,
+          inputGPU.f5HBDigis.ids,
+          inputGPU.f3HBDigis.ids,
+          inputGPU.f01HEDigis.stride,
+          inputGPU.f5HBDigis.stride,
+          inputGPU.f3HBDigis.stride,
+          inputGPU.f01HEDigis.size,
+          inputGPU.f5HBDigis.size,
+          inputGPU.f5HBDigis.npresamples,
+          scratch.soiSamples,
+          outputGPU.recHits.energyM0,
+          outputGPU.recHits.timeM0,
+          outputGPU.recHits.did,
+          totalChannels,
+          conditions.recoParams.param1,
+          conditions.recoParams.param2,
+          conditions.qieCoders.offsets,
+          conditions.qieCoders.slopes,
+          conditions.qieTypes.values,
+          conditions.pedestalWidths.values,
+          conditions.effectivePedestalWidths.values,
+          conditions.pedestals.values,
+          conditions.convertedEffectivePedestals ? conditions.convertedEffectivePedestals->values
+                                                 : conditions.pedestals.values,
+          configParameters.useEffectivePedestals,
+          conditions.sipmParameters.type,
+          conditions.sipmParameters.fcByPE,
+          conditions.sipmCharacteristics.parLin1,
+          conditions.sipmCharacteristics.parLin2,
+          conditions.sipmCharacteristics.parLin3,
+          conditions.gains.values,
+          conditions.respCorrs.values,
+          conditions.topology->maxDepthHB(),
+          conditions.topology->maxDepthHE(),
+          conditions.recConstants->getNPhi(1) > IPHI_MAX ? conditions.recConstants->getNPhi(1) : IPHI_MAX,
+          conditions.topology->firstHBRing(),
+          conditions.topology->lastHBRing(),
+          conditions.topology->firstHERing(),
+          conditions.topology->lastHERing(),
+          conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
+          conditions.topology->firstHERing() > conditions.topology->lastHERing()
+              ? 0
+              : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1),
+          configParameters.sipmQTSShift,
+          configParameters.sipmQNTStoSum,
+          configParameters.firstSampleShift,
+          conditions.offsetForHashes,
+          configParameters.ts4Thresh);
+      cudaCheck(cudaGetLastError());
+
+      // FIXME: 1024 is the max threads per block for gtx1080 - take this from cuda service or something like that
+      uint32_t const channelsPerBlock = 1024 / (f01nsamples * configParameters.pulseOffsets.size());
+      dim3 threadsPerBlock2{f01nsamples, static_cast<uint32_t>(configParameters.pulseOffsets.size()), channelsPerBlock};
+      int blocks2 =
+          threadsPerBlock2.z > totalChannels ? 1 : (totalChannels + threadsPerBlock2.z - 1) / threadsPerBlock2.z;
+
+#ifdef HCAL_MAHI_CPUDEBUG
+      std::cout << "threads: " << threadsPerBlock2.x << " " << threadsPerBlock2.y << "  " << threadsPerBlock2.z
+                << std::endl;
+      std::cout << "blocks: " << blocks2 << std::endl;
+#endif
+
+      kernel_prep_pulseMatrices_sameNumberOfSamples<<<blocks2, threadsPerBlock2, 0, cudaStream>>>(
+          scratch.pulseMatrices,
+          scratch.pulseMatricesM,
+          scratch.pulseMatricesP,
+          configParameters.pulseOffsetsDevice,
+          scratch.amplitudes,
+          inputGPU.f01HEDigis.ids,
+          inputGPU.f5HBDigis.ids,
+          inputGPU.f3HBDigis.ids,
+          inputGPU.f01HEDigis.size,
+          inputGPU.f5HBDigis.size,
+          totalChannels,
+          scratch.soiSamples,
+          conditions.recoParams.ids,
+          conditions.recoParams.acc25nsVec,
+          conditions.recoParams.diff25nsItvlVec,
+          conditions.recoParams.accVarLenIdxMinusOneVec,
+          conditions.recoParams.diffVarItvlIdxMinusOneVec,
+          conditions.recoParams.accVarLenIdxZEROVec,
+          conditions.recoParams.diffVarItvlIdxZEROVec,
+          configParameters.meanTime,
+          configParameters.timeSigmaSiPM,
+          configParameters.timeSigmaHPD,
+          conditions.topology->maxDepthHB(),
+          conditions.topology->maxDepthHE(),
+          conditions.recConstants->getNPhi(1) > IPHI_MAX ? conditions.recConstants->getNPhi(1) : IPHI_MAX,
+          conditions.topology->firstHBRing(),
+          conditions.topology->lastHBRing(),
+          conditions.topology->firstHERing(),
+          conditions.topology->lastHERing(),
+          conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
+          conditions.topology->firstHERing() > conditions.topology->lastHERing()
+              ? 0
+              : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1),
+          conditions.offsetForHashes,
+          configParameters.applyTimeSlew,
+          configParameters.tzeroTimeSlew,
+          configParameters.slopeTimeSlew,
+          configParameters.tmaxTimeSlew);
+      cudaCheck(cudaGetLastError());
+
+      if (f01nsamples == 8 && configParameters.pulseOffsets.size() == 8u) {
+        // FIXME: provide constants from configuration
+        uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
+        uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;
+        auto const nbytesShared = 2 * threadsPerBlock * MapSymM<float, 8>::total * sizeof(float);
+        kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
+            outputGPU.recHits.energy,
+            outputGPU.recHits.chi2,
+            scratch.amplitudes,
+            scratch.pulseMatrices,
+            scratch.pulseMatricesM,
+            scratch.pulseMatricesP,
+            configParameters.pulseOffsetsDevice,
+            scratch.noiseTerms,
+            scratch.soiSamples,
+            conditions.pedestalWidths.values,
+            conditions.effectivePedestalWidths.values,
+            configParameters.useEffectivePedestals,
+            inputGPU.f01HEDigis.ids,
+            inputGPU.f5HBDigis.ids,
+            inputGPU.f3HBDigis.ids,
+            conditions.gains.values,
+            conditions.respCorrs.values,
+            inputGPU.f01HEDigis.size,
+            inputGPU.f5HBDigis.size,
+            totalChannels,
+            conditions.offsetForHashes,
+            conditions.topology->maxDepthHB(),
+            conditions.topology->maxDepthHE(),
+            conditions.recConstants->getNPhi(1) > IPHI_MAX ? conditions.recConstants->getNPhi(1) : IPHI_MAX,
+            conditions.topology->firstHBRing(),
+            conditions.topology->lastHBRing(),
+            conditions.topology->firstHERing(),
+            conditions.topology->lastHERing(),
+            conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
+            conditions.topology->firstHERing() > conditions.topology->lastHERing()
+                ? 0
+                : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1));
+        cudaCheck(cudaGetLastError());
+      } else {
+        throw cms::Exception("Invalid MahiGPU configuration")
+            << "Currently support only 8 pulses and 8 time samples and provided: " << f01nsamples << " samples and "
+            << configParameters.pulseOffsets.size() << " pulses" << std::endl;
+      }
+    }
+
+  }  // namespace mahi
+}  // namespace hcal

From e1381c7cc93e90584787703fd076905da35763c1 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 17 Jun 2020 09:19:48 -0500
Subject: [PATCH 10/34] Apply code formatting (cms-patatrack#486)

---
 .../HcalRawToDigi/plugins/HcalDigisProducerGPU.cc    | 12 +++++++++---
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu        |  1 -
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
index ce18d78af7e84..c7bb27b60fa12 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -111,14 +111,20 @@ HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
   edm::Service<CUDAService> cs;
   if (cs and cs->enabled()) {
     // allocate on the device
-    cudaCheck(cudaMalloc((void**)&df01_.data, config_.maxChannelsF01HE * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE)));
+    cudaCheck(cudaMalloc(
+        (void**)&df01_.data,
+        config_.maxChannelsF01HE * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE)));
     cudaCheck(cudaMalloc((void**)&df01_.ids, config_.maxChannelsF01HE * sizeof(uint32_t)));
 
-    cudaCheck(cudaMalloc((void**)&df5_.data, config_.maxChannelsF5HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB)));
+    cudaCheck(cudaMalloc(
+        (void**)&df5_.data,
+        config_.maxChannelsF5HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB)));
     cudaCheck(cudaMalloc((void**)&df5_.ids, config_.maxChannelsF5HB * sizeof(uint32_t)));
     cudaCheck(cudaMalloc((void**)&df5_.npresamples, sizeof(uint8_t) * config_.maxChannelsF5HB));
 
-    cudaCheck(cudaMalloc((void**)&df3_.data, config_.maxChannelsF3HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB)));
+    cudaCheck(cudaMalloc(
+        (void**)&df3_.data,
+        config_.maxChannelsF3HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB)));
     cudaCheck(cudaMalloc((void**)&df3_.ids, config_.maxChannelsF3HB * sizeof(uint32_t)));
   }
 
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index c1f9a62f4421b..3800ddb3cae70 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -7,7 +7,6 @@
 
 #include "MahiGPU.h"
 
-
 #ifdef HCAL_MAHI_GPUDEBUG
 #define DETID_TO_DEBUG 1125647428
 #endif

From 7ab00f9411d0ad37ea0e40c1d1676455bca87ed2 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 18 Jun 2020 15:32:58 +0200
Subject: [PATCH 11/34] Fix warnings about unused variables in HCAL GPU code
 (cms-patatrack#491)

---
 .../HcalRawToDigi/plugins/DecodeGPU.cu        | 38 +++++++++----------
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 38 +++++++++++--------
 2 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
index 5eaff1d9c699f..1589ec6cb1661 100644
--- a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
@@ -76,9 +76,7 @@ namespace hcal {
 
       auto const iamc = threadIdx.x / NTHREADS;
       auto const ifed = blockIdx.x;
-      auto const fed = feds[ifed];
       auto const offset = offsets[ifed];
-      auto const size = ifed == gridDim.x - 1 ? nBytesTotal - offset : offsets[ifed + 1] - offset;
 
 #ifdef HCAL_RAWDECODE_GPUDEBUG_CG
       if (ifed > 0 || iamc > 0)
@@ -87,12 +85,15 @@ namespace hcal {
 #endif
 
 #ifdef HCAL_RAWDECODE_GPUDEBUG
+      auto const fed = feds[ifed];
+      auto const size = ifed == gridDim.x - 1 ? nBytesTotal - offset : offsets[ifed + 1] - offset;
       printf("ifed = %d fed = %d offset = %u size = %u\n", ifed, fed, offset, size);
 #endif
 
       // offset to the right raw buffer
       uint64_t const* buffer = reinterpret_cast<uint64_t const*>(data + offset);
 
+#ifdef HCAL_RAWDECODE_GPUDEBUG
       //
       // fed header
       //
@@ -103,7 +104,6 @@ namespace hcal {
       uint8_t const trigger_type = (fed_header >> 56) & 0xf;
       uint8_t const bid_fed_header = (fed_header >> 60) & 0xf;
 
-#ifdef HCAL_RAWDECODE_GPUDEBUG
       printf("fed = %d fed_id = %u bx = %u lv1 = %u trigger_type = %u bid = %u\n",
              fed,
              fed_id,
@@ -116,13 +116,12 @@ namespace hcal {
       // amc 13 header
       auto const amc13word = buffer[1];
       uint8_t const namc = (amc13word >> 52) & 0xf;
-      uint8_t const amc13version = (amc13word >> 60) & 0xf;
-      uint32_t const amc13OrbitNumber = (amc13word >> 4) & 0xffffffffu;
-
       if (iamc >= namc)
         return;
 
 #ifdef HCAL_RAWDECODE_GPUDEBUG
+      uint8_t const amc13version = (amc13word >> 60) & 0xf;
+      uint32_t const amc13OrbitNumber = (amc13word >> 4) & 0xffffffffu;
       printf("fed = %d namc = %u amc13version = %u amc13OrbitNumber = %u\n", fed, namc, amc13version, amc13OrbitNumber);
 #endif
 
@@ -134,16 +133,14 @@ namespace hcal {
         amcoffset += amcSize;
       }
 
-      //    for (uint8_t iamc=0u; iamc < namc; ++iamc) {
       auto const word = buffer[2 + iamc];
-      uint16_t const amcid = word & 0xffff;
-      int const slot = (word >> 16) & 0xf;
-      int const amcBlockNumber = (word >> 20) & 0xff;
       int const amcSize = (word >> 32) & 0xffffff;
 
 #ifdef HCAL_RAWDECODE_GPUDEBUG
+      uint16_t const amcid = word & 0xffff;
+      int const slot = (word >> 16) & 0xf;
+      int const amcBlockNumber = (word >> 20) & 0xff;
       printf("fed = %d amcid = %u slot = %d amcBlockNumber = %d\n", fed, amcid, slot, amcBlockNumber);
-#endif
 
       bool const amcmore = ((word >> 61) & 0x1) != 0;
       bool const amcSegmented = ((word >> 60) & 0x1) != 0;
@@ -152,8 +149,6 @@ namespace hcal {
       bool const amcDataPresent = ((word >> 58) & 0x1) != 0;
       bool const amcDataValid = ((word >> 56) & 0x1) != 0;
       bool const amcEnabled = ((word >> 59) & 0x1) != 0;
-
-#ifdef HCAL_RAWDECODE_GPUDEBUG
       printf(
           "fed = %d amcmore = %d amcSegmented = %d, amcLengthOk = %d amcCROk = %d\n>> amcDataPresent = %d amcDataValid "
           "= %d amcEnabled = %d\n",
@@ -169,18 +164,19 @@ namespace hcal {
 
       // get to the payload
       auto const* payload64 = buffer + 2 + namc + amcoffset;
-      auto const* payload16 = reinterpret_cast<uint16_t const*>(payload64);
       //amcoffset += amcSize;
 
+#ifdef HCAL_RAWDECODE_GPUDEBUG
       // uhtr header v1 1st 64 bits
       auto const payload64_w0 = payload64[0];
-      // uhtr n bytes comes from amcSize, according to the cpu version!
       //uint32_t const data_length64 = payload64_w0 & 0xfffff;
+#endif
+      // uhtr n bytes comes from amcSize, according to the cpu version!
       uint32_t const data_length64 = amcSize;
-      uint16_t bcn = (payload64_w0 >> 20) & 0xfff;
-      uint32_t evn = (payload64_w0 >> 32) & 0xffffff;
 
 #ifdef HCAL_RAWDECODE_GPUDEBUG
+      uint16_t bcn = (payload64_w0 >> 20) & 0xfff;
+      uint32_t evn = (payload64_w0 >> 32) & 0xffffff;
       printf("fed = %d data_length64 = %u bcn = %u evn = %u\n", fed, data_length64, bcn, evn);
 #endif
 
@@ -189,12 +185,12 @@ namespace hcal {
       uint8_t const uhtrcrate = payload64_w1 & 0xff;
       uint8_t const uhtrslot = (payload64_w1 >> 8) & 0xf;
       uint8_t const presamples = (payload64_w1 >> 12) & 0xf;
-      uint16_t const orbitN = (payload64_w1 >> 16) & 0xffff;
-      uint8_t const firmFlavor = (payload64_w1 >> 32) & 0xff;
-      uint8_t const eventType = (payload64_w1 >> 40) & 0xf;
       uint8_t const payloadFormat = (payload64_w1 >> 44) & 0xf;
 
 #ifdef HCAL_RAWDECODE_GPUDEBUG
+      uint16_t const orbitN = (payload64_w1 >> 16) & 0xffff;
+      uint8_t const firmFlavor = (payload64_w1 >> 32) & 0xff;
+      uint8_t const eventType = (payload64_w1 >> 40) & 0xf;
       printf(
           "fed = %d crate = %u slot = %u presamples = %u\n>>> orbitN = %u firmFlavor = %u eventType = %u payloadFormat "
           "= %u\n",
@@ -215,7 +211,7 @@ namespace hcal {
       // skip uhtr header words
       auto const channelDataSize = data_length64 - 2;        // 2 uhtr header v1 words
       auto const* channelDataBuffer64Start = payload64 + 2;  // 2 uhtr header v2 wds
-      auto const* channelDataBuffer64End = channelDataBuffer64Start + channelDataSize;
+      //auto const* channelDataBuffer64End = channelDataBuffer64Start + channelDataSize;
       auto const* ptr = reinterpret_cast<uint16_t const*>(channelDataBuffer64Start);
       auto const* end = ptr + sizeof(uint64_t) / sizeof(uint16_t) * (channelDataSize - 1);
       auto const t_rank = thread_group.thread_rank();
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index 3800ddb3cae70..72df5d89815a2 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -28,7 +28,7 @@ namespace hcal {
 
     // FIXME remove duplication...
     // this is from PulesFunctor. nvcc was complaining... if included that header...
-    constexpr int maxSamples = 10;
+    //constexpr int maxSamples = 10;
     constexpr int maxPSshapeBin = 256;
     constexpr int nsPerBX = 25;
     constexpr float iniTimeShift = 92.5f;
@@ -229,9 +229,6 @@ namespace hcal {
       auto const nchannels_per_block = blockDim.y;
       auto const linearThPerBlock = threadIdx.x + threadIdx.y * blockDim.x;
 
-      constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
-      constexpr uint32_t mantissaMaskQIE11 = 0x3f;
-
       // remove
       if (gch >= nchannels)
         return;
@@ -375,13 +372,17 @@ namespace hcal {
       // sits on the boundary when flavor 01 channels end and flavor 5 start
       //
       float rawCharge;
+#ifdef COMPUTE_TDC_TIME
       float tdcTime;
+#endif  // COMPUTE_TDC_TIME
       auto const dfc = compute_diff_charge_gain(
           qieType, adc, capid, qieOffsets, qieSlopes, gch < nchannelsf01HE || gch >= nchannelsf015);
       if (gch >= nchannelsf01HE && gch < nchannelsf015) {
         // flavor 5
         rawCharge = charge;
+#ifdef COMPUTE_TDC_TIME
         tdcTime = HcalSpecialTimes::UNKNOWN_T_NOTDC;
+#endif  // COMPUTE_TDC_TIME
       } else {
         // flavor 0 or 1 or 3
         // conditions needed for sipms
@@ -397,11 +398,13 @@ namespace hcal {
         auto const effectivePixelsFired = sipmq / fcByPE;
         auto const factor = compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
         rawCharge = (charge - pedestal) * factor + pedestal;
+#ifdef COMPUTE_TDC_TIME
         if (gch < nchannelsf01HE)
           tdcTime = HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor01>(dataf01HE + stride * gch, sample));
         else if (gch >= nchannelsf015)
           tdcTime =
               HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
+#endif  // COMPUTE_TDC_TIME
 
 #ifdef HCAL_MAHI_GPUDEBUG
         printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
@@ -454,7 +457,7 @@ namespace hcal {
         do {
           assumed = old;
           // decode energy, sample values
-          int const current_sample = (assumed >> 32) & 0xffffffff;
+          //int const current_sample = (assumed >> 32) & 0xffffffff;
           float const current_energy = __uint_as_float(assumed & 0xffffffff);
           if (energym0_per_ts > current_energy)
             old = atomicCAS(&shrMethod0EnergySamplePair[lch], assumed, val);
@@ -554,8 +557,8 @@ namespace hcal {
       constexpr float pulse_height = 1.0f;
       constexpr float slew = 0.f;
       constexpr auto ns_per_bx = nsPerBX;
-      constexpr auto num_ns = nsPerBX * maxSamples;
-      constexpr auto num_bx = num_ns / ns_per_bx;
+      //constexpr auto num_ns = nsPerBX * maxSamples;
+      //constexpr auto num_bx = num_ns / ns_per_bx;
 
       // FIXME: clean up all the rounding... this is coming from original cpu version
       float const i_start_float =
@@ -786,9 +789,10 @@ namespace hcal {
       // FIXME: shift should be treated properly,
       // here assume 8 time slices and 8 samples
       auto const shift = 4 - soi;  // as in cpu version!
-                                   //    auto const offset = ipulse - soi;
-                                   //    auto const idx = sample - offset;
-      auto const idx = sample - pulseOffset;
+
+      // auto const offset = ipulse - soi;
+      // auto const idx = sample - offset;
+      int32_t const idx = sample - pulseOffset;
       auto const value = idx >= 0 && idx < nsamples ? compute_pulse_shape_value(t0,
                                                                                 idx,
                                                                                 shift,
@@ -1346,8 +1350,10 @@ namespace hcal {
 #endif
 #endif
 
+      /*
       // TODO: provide this properly
       int const soi = soiSamples[gch];
+      */
       constexpr float deltaChi2Threashold = 1e-3;
 
       ColumnVector<NPULSES, int> pulseOffsets;
@@ -1633,12 +1639,12 @@ namespace hcal {
       auto const idx_for_energy = std::abs(pulseOffsetValues[0]);
       outputEnergy[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
       /*
-    #pragma unroll
-    for (int i=0; i<NPULSES; i++)
-        if (pulseOffsets[i] == soi)
-            // NOTE: gain is a number < 10^-3/4, multiply first to avoid stab issues
-            outputEnergy[gch] = (gain*resultAmplitudesVector(i))*respCorrection;
-    */
+      #pragma unroll
+      for (int i=0; i<NPULSES; i++)
+          if (pulseOffsets[i] == soi)
+              // NOTE: gain is a number < 10^-3/4, multiply first to avoid stab issues
+              outputEnergy[gch] = (gain*resultAmplitudesVector(i))*respCorrection;
+      */
     }
 
     void entryPoint(InputDataGPU const& inputGPU,

From aa8fb930163fcae9cd39a48424430be8b5f3c75e Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 18 Jun 2020 08:34:36 -0500
Subject: [PATCH 12/34] Remove duplicate dictionary definitions
 (cms-patatrack#489)

Remove dictionary definitions for classes already defined in CUDADataFormats/StdDictionaries.

Co-authored-by: Andrea Bocci <andrea.bocci@cern.ch>
---
 CUDADataFormats/HcalRecHitSoA/src/classes_def.xml | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
index d747618168c22..f9b6eb4ecc130 100644
--- a/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
+++ b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
@@ -1,15 +1,9 @@
 <lcgdict>
-    <class name="std::vector<float, cms::cuda::HostAllocator<float, 0>>" />
-    <class name="std::vector<int, cms::cuda::HostAllocator<int, 0>>" />
-    <class name="std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t, 0>>" />
-
-
     <class name="hcal::RecHitCollection<hcal::common::VecStoragePolicy<std::allocator>>"/>
-    <class name="hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>"/>
     <class name="edm::Wrapper<hcal::RecHitCollection<hcal::common::VecStoragePolicy<std::allocator>>>"/>
+
+    <class name="hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>"/>
     <class name="edm::Wrapper<hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>"/>
-            
-            
 
     <class name="cms::cuda::Product<hcal::RecHitCollection<hcal::common::ViewStoragePolicy>>" persistent="false" />
     <class name="edm::Wrapper<cms::cuda::Product<hcal::RecHitCollection<hcal::common::ViewStoragePolicy>>>" persistent="false" />

From 32e783113a657aa88f7f42cf0b06a9c6fbb27fd5 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 6 Jun 2020 00:05:23 +0200
Subject: [PATCH 13/34] Backport: add ECAL-only and HCAL-only workflows for MC
 and data (#30350)

Backport #30105: add ECAL-only workflows for data.
Backport #30136: add HCAL-only workflows for MC and data.
---
 RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
index 057707c80534e..ddcab0252afcf 100644
--- a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
@@ -22,6 +22,9 @@
 from RecoLocalCalo.HcalRecProducers.HFPhase1Reconstructor_cfi import hfreco as _phase1_hfreco
 from RecoLocalCalo.HcalRecProducers.hbheplan1_cfi import hbheplan1
 
+#--- for HCALonly wf
+hcalOnlyLocalRecoTask = cms.Task(hbheprereco,hfprereco,hfreco,horeco)
+
 # copy for cosmics
 _default_hfreco = hfreco.clone()
 

From 8a4f7818606fb872560e27a29878a5500c6ac68c Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 11 Jul 2020 00:11:51 +0200
Subject: [PATCH 14/34] Implement HCAL-only workflows on GPU
 (cms-patatrack#505)

Implement the HCAL-only on GPU workflow in runTheMatrix.py, 11634.522.
Add customisations for profiling the HCAL-only workflows.
---
 .../python/customizeHcalOnlyForProfiling.py   | 60 ++++++++++++++++++
 .../Configuration/python/hcalLocalReco_cff.py | 20 ++++++
 .../python/hbheRecHitProducerGPUTask_cff.py   | 63 +++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py
 create mode 100644 RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py

diff --git a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py
new file mode 100644
index 0000000000000..b3a2548791ae5
--- /dev/null
+++ b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py
@@ -0,0 +1,60 @@
+import FWCore.ParameterSet.Config as cms
+
+# Customise the HCAL-only reconstruction to run on GPU
+#
+# Currently, this means:
+#   - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU;
+#   - running the HBHE local reconstruction, including MAHI, on GPU.
+def customizeHcalOnlyForProfilingGPUOnly(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('hbheRecHitProducerGPU')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
+
+
+# Customise the HCAL-only reconstruction to run on GPU, and copy the data to the host
+#
+# Currently, this means:
+#   - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU;
+#   - running the HBHE local reconstruction, including MAHI, on GPU;
+#   - copying the rechits to CPU and converting them to legacy format.
+#
+# (this is equivalent to customizeHcalOnlyForProfiling, as the copy and conversion is done by the same module)
+def customizeHcalOnlyForProfilingGPUWithHostCopy(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('hbheprereco')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
+
+
+# Customise the HCAL-only reconstruction to run on GPU, copy the data to the host, and convert to legacy format
+#
+# Currently, this means:
+#   - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU;
+#   - running the HBHE local reconstruction, including MAHI, on GPU;
+#   - copying the rechits to CPU and converting them to legacy format.
+#
+# The same customisation can be also used on the CPU workflow, running up to the rechits on CPU.
+def customizeHcalOnlyForProfiling(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('hbheprereco')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
diff --git a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
index ddcab0252afcf..2eff3a1eb5e40 100644
--- a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
@@ -48,6 +48,26 @@
 from Configuration.ProcessModifiers.run2_HECollapse_2018_cff import run2_HECollapse_2018
 run2_HECollapse_2018.toReplaceWith(hcalLocalRecoTask, _collapse_hcalLocalRecoTask)
 
+# Run 3 HCAL workflow on GPU
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+from RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPUTask_cff import *
+_hcalLocalRecoTaskGPU = hcalLocalRecoTask.copy()
+_hcalLocalRecoTaskGPU.add(hbheRecHitProducerGPUTask)
+gpu.toReplaceWith(hcalLocalRecoTask, _hcalLocalRecoTaskGPU)
+
+_hcalOnlyLocalRecoTaskGPU = hcalOnlyLocalRecoTask.copy()
+_hcalOnlyLocalRecoTaskGPU.add(hbheRecHitProducerGPUTask)
+gpu.toReplaceWith(hcalOnlyLocalRecoTask, _hcalOnlyLocalRecoTaskGPU)
+
+from RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi import hcalCPURecHitsProducer as _hcalCPURecHitsProducer
+gpu.toReplaceWith(hbheprereco, _hcalCPURecHitsProducer.clone(
+    recHitsM0LabelIn = "hbheRecHitProducerGPU",
+    recHitsM0LabelOut = "",
+    recHitsLegacyLabelOut = ""
+))
+#---
+
 _phase2_hcalLocalRecoTask = hcalLocalRecoTask.copy()
 _phase2_hcalLocalRecoTask.remove(hbheprereco)
 
diff --git a/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py b/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py
new file mode 100644
index 0000000000000..d2d3dac166469
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py
@@ -0,0 +1,63 @@
+import FWCore.ParameterSet.Config as cms
+
+# Run 3 HCAL workflow on GPU
+
+# EventSetup modules used by HBHERecHitProducerGPU
+from RecoLocalCalo.HcalRecProducers.hcalGainsGPUESProducer_cfi import hcalGainsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalGainWidthsGPUESProducer_cfi import hcalGainWidthsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalLUTCorrsGPUESProducer_cfi import hcalLUTCorrsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalsGPUESProducer_cfi import hcalConvertedPedestalsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalsGPUESProducer_cfi import hcalConvertedEffectivePedestalsGPUESProducer
+hcalConvertedEffectivePedestalsGPUESProducer.label0 = "withTopoEff"
+
+from RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalWidthsGPUESProducer_cfi import hcalConvertedPedestalWidthsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalWidthsGPUESProducer_cfi import hcalConvertedEffectivePedestalWidthsGPUESProducer
+hcalConvertedEffectivePedestalWidthsGPUESProducer.label0 = "withTopoEff"
+hcalConvertedEffectivePedestalWidthsGPUESProducer.label1 = "withTopoEff"
+
+from RecoLocalCalo.HcalRecProducers.hcalQIECodersGPUESProducer_cfi import hcalQIECodersGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalRecoParamsWithPulseShapesGPUESProducer_cfi import hcalRecoParamsWithPulseShapesGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalRespCorrsGPUESProducer_cfi import hcalRespCorrsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalTimeCorrsGPUESProducer_cfi import hcalTimeCorrsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalQIETypesGPUESProducer_cfi import hcalQIETypesGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalSiPMParametersGPUESProducer_cfi import hcalSiPMParametersGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalSiPMCharacteristicsGPUESProducer_cfi import hcalSiPMCharacteristicsGPUESProducer
+
+# convert the HBHE digis into SoA format, and copy them from CPU to GPU
+from EventFilter.HcalRawToDigi.hcalDigisProducerGPU_cfi import hcalDigisProducerGPU as _hcalDigisProducerGPU
+hcalDigisGPU = _hcalDigisProducerGPU.clone(
+    digisLabelF01HE = "",
+    digisLabelF5HB = "",
+    digisLabelF3HB = ""
+)
+
+# run the HCAL local reconstruction (MAHI) on GPU
+from RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPU_cfi import hbheRecHitProducerGPU as _hbheRecHitProducerGPU
+hbheRecHitProducerGPU = _hbheRecHitProducerGPU.clone(
+    digisLabelF01HE = "hcalDigisGPU",
+    digisLabelF5HB = "hcalDigisGPU",
+    digisLabelF3HB = "hcalDigisGPU",
+    recHitsLabelM0HBHE = ""
+)
+
+# Tasks and Sequences
+hbheRecHitProducerGPUTask = cms.Task(
+    hcalGainsGPUESProducer,
+    hcalGainWidthsGPUESProducer,
+    hcalLUTCorrsGPUESProducer,
+    hcalConvertedPedestalsGPUESProducer,
+    hcalConvertedEffectivePedestalsGPUESProducer,
+    hcalConvertedPedestalWidthsGPUESProducer,
+    hcalConvertedEffectivePedestalWidthsGPUESProducer,
+    hcalQIECodersGPUESProducer,
+    hcalRecoParamsWithPulseShapesGPUESProducer,
+    hcalRespCorrsGPUESProducer,
+    hcalTimeCorrsGPUESProducer,
+    hcalQIETypesGPUESProducer,
+    hcalSiPMParametersGPUESProducer,
+    hcalSiPMCharacteristicsGPUESProducer,
+    hcalDigisGPU,
+    hbheRecHitProducerGPU
+)
+
+hbheRecHitProducerGPUSequence = cms.Sequence(hbheRecHitProducerGPUTask)

From e4145ebcd295f7c99d8b9f8b3a28ddf07e911e4e Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sun, 12 Jul 2020 14:43:04 +0200
Subject: [PATCH 15/34] Update ECAL and HCAL reconstruction to run on multiple
 GPUs [1/3] (cms-patatrack#502)

Use caching allocators for host and device CUDA memory.
Use dedicated ESProducers to make part of the modules' configuration available on all GPUs.
Rename hcal and hcal::common namespaces to calo::common.
---
 CUDADataFormats/HcalDigi/BuildFile.xml        |   2 +-
 .../HcalDigi/interface/DigiCollection.h       |  16 +-
 CUDADataFormats/HcalDigi/src/classes_def.xml  |  76 +++++----
 .../interface/RecHitCollection.h              |   6 +-
 .../HcalRecHitSoA/src/classes_def.xml         |  15 +-
 .../makeHcalRaw2DigiGpuValidationPlots.cpp    |   6 +-
 .../HcalRawToDigi/plugins/BuildFile.xml       |   2 +-
 .../HcalRawToDigi/plugins/DeclsForKernels.h   | 143 +++++++---------
 .../HcalRawToDigi/plugins/DecodeGPU.cu        |  36 ++---
 .../plugins/HcalCPUDigisProducer.cc           |  97 ++---------
 .../plugins/HcalDigisProducerGPU.cc           | 152 +++++++-----------
 .../HcalRawToDigi/plugins/HcalRawToDigiGPU.cc |  97 ++++++-----
 .../interface/HcalMahiPulseOffsetsGPU.h       |  39 +++++
 .../src/HcalMahiPulseOffsetsGPU.cc            |  36 +++++
 RecoLocalCalo/HcalRecProducers/BuildFile.xml  |   2 +-
 .../bin/makeHcalRecHitGpuValidationPlots.cpp  |   4 +-
 .../HcalRecProducers/src/DeclsForKernels.h    |  74 ++++-----
 .../src/HBHERecHitProducerGPU.cc              |  76 +++++----
 .../src/HcalCPURecHitsProducer.cc             |  16 +-
 .../src/HcalMahiPulseOffsetsGPUESProducer.cc  |  67 ++++++++
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu |  80 ++++-----
 21 files changed, 536 insertions(+), 506 deletions(-)
 create mode 100644 RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h
 create mode 100644 RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc
 create mode 100644 RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc

diff --git a/CUDADataFormats/HcalDigi/BuildFile.xml b/CUDADataFormats/HcalDigi/BuildFile.xml
index 8feae467742c0..fb871f16b69f0 100644
--- a/CUDADataFormats/HcalDigi/BuildFile.xml
+++ b/CUDADataFormats/HcalDigi/BuildFile.xml
@@ -1,5 +1,5 @@
 <use name="DataFormats/Common"/>
-<use name="CUDADataFormats/HcalCommon"/>
+<use name="CUDADataFormats/CaloCommon"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="cuda"/>
 
diff --git a/CUDADataFormats/HcalDigi/interface/DigiCollection.h b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
index cbb9da410f801..f5ae63d0954c6 100644
--- a/CUDADataFormats/HcalDigi/interface/DigiCollection.h
+++ b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
@@ -1,7 +1,7 @@
 #ifndef CUDADataFormats_HcalDigi_interface_DigiCollection_h
 #define CUDADataFormats_HcalDigi_interface_DigiCollection_h
 
-#include "CUDADataFormats/HcalCommon/interface/Common.h"
+#include "CUDADataFormats/CaloCommon/interface/Common.h"
 
 namespace hcal {
 
@@ -108,7 +108,7 @@ namespace hcal {
 
   //
   template <typename StoragePolicy>
-  struct DigiCollectionBase : public common::AddSize<typename StoragePolicy::TagType> {
+  struct DigiCollectionBase : public ::calo::common::AddSize<typename StoragePolicy::TagType> {
     DigiCollectionBase() = default;
     DigiCollectionBase(DigiCollectionBase const&) = default;
     DigiCollectionBase& operator=(DigiCollectionBase const&) = default;
@@ -117,19 +117,19 @@ namespace hcal {
     DigiCollectionBase& operator=(DigiCollectionBase&&) = default;
 
     template <typename T = typename StoragePolicy::TagType>
-    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type resize(std::size_t size) {
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type resize(std::size_t size) {
       ids.resize(size);
       data.resize(size * stride);
     }
 
     template <typename T = typename StoragePolicy::TagType>
-    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type reserve(std::size_t size) {
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type reserve(std::size_t size) {
       ids.reserve(size);
       data.reserve(size * stride);
     }
 
     template <typename T = typename StoragePolicy::TagType>
-    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type clear() {
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type clear() {
       ids.clear();
       data.clear();
     }
@@ -161,19 +161,19 @@ namespace hcal {
     DigiCollection& operator=(DigiCollection&&) = default;
 
     template <typename T = typename StoragePolicy::TagType>
-    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type resize(std::size_t size) {
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type resize(std::size_t size) {
       DigiCollectionBase<StoragePolicy>::resize(size);
       npresamples.resize(size);
     }
 
     template <typename T = typename StoragePolicy::TagType>
-    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type reserve(std::size_t size) {
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type reserve(std::size_t size) {
       DigiCollectionBase<StoragePolicy>::reserve(size);
       npresamples.reserve(size);
     }
 
     template <typename T = typename StoragePolicy::TagType>
-    typename std::enable_if<std::is_same<T, common::tags::Vec>::value, void>::type clear() {
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type clear() {
       DigiCollectionBase<StoragePolicy>::clear();
       npresamples.clear();
     }
diff --git a/CUDADataFormats/HcalDigi/src/classes_def.xml b/CUDADataFormats/HcalDigi/src/classes_def.xml
index 33e9b28a49b49..3291c7f6d22ec 100644
--- a/CUDADataFormats/HcalDigi/src/classes_def.xml
+++ b/CUDADataFormats/HcalDigi/src/classes_def.xml
@@ -1,40 +1,52 @@
 <lcgdict>
-    <class name="hcal::DigiCollectionBase<hcal::common::VecStoragePolicy<std::allocator>>" />
-    <class name="hcal::DigiCollectionBase<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollectionBase<calo::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollectionBase<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
 
-    <class name="hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<std::allocator>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor2, hcal::common::VecStoragePolicy<std::allocator>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<std::allocator>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor4, hcal::common::VecStoragePolicy<std::allocator>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor2, calo::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor4, calo::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<std::allocator>>" />
             
-    <class name="hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor2, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor4, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor2, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor4, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
 
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>" persistent="false" />
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, hcal::common::ViewStoragePolicy>>" persistent="false" />
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>" persistent="false" />
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, hcal::common::ViewStoragePolicy>>" persistent="false" />
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::ViewStoragePolicy>>" persistent="false" />
             
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>>" persistent="false" />
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, hcal::common::ViewStoragePolicy>>>" persistent="false" />
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>>" persistent="false" />
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, hcal::common::ViewStoragePolicy>>>" persistent="false" />
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>" persistent="false" />
+            
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, calo::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, calo::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::ViewStoragePolicy>>>" persistent="false" />
+                
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, calo::common::DevStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, calo::common::DevStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>>" persistent="false" />
 
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<std::allocator>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor2, hcal::common::VecStoragePolicy<std::allocator>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<std::allocator>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor4, hcal::common::VecStoragePolicy<std::allocator>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor2, calo::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor4, calo::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<std::allocator>>>" />
                 
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor2, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor4, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor2, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor4, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
 </lcgdict>
diff --git a/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
index a17c8a51073be..424b2c0813b4c 100644
--- a/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
+++ b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
@@ -3,13 +3,13 @@
 
 #include <vector>
 
-#include "CUDADataFormats/HcalCommon/interface/Common.h"
+#include "CUDADataFormats/CaloCommon/interface/Common.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 
 namespace hcal {
 
   template <typename StoragePolicy>
-  struct RecHitCollection : public common::AddSize<typename StoragePolicy::TagType> {
+  struct RecHitCollection : public ::calo::common::AddSize<typename StoragePolicy::TagType> {
     RecHitCollection() = default;
     RecHitCollection(const RecHitCollection&) = default;
     RecHitCollection& operator=(const RecHitCollection&) = default;
@@ -24,7 +24,7 @@ namespace hcal {
     typename StoragePolicy::template StorageSelector<uint32_t>::type did;
 
     template <typename U = typename StoragePolicy::TagType>
-    typename std::enable_if<std::is_same<U, common::tags::Vec>::value, void>::type resize(size_t size) {
+    typename std::enable_if<std::is_same<U, ::calo::common::tags::Vec>::value, void>::type resize(size_t size) {
       energy.resize(size);
       chi2.resize(size);
       energyM0.resize(size);
diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
index f9b6eb4ecc130..71dd18a7daddb 100644
--- a/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
+++ b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
@@ -1,10 +1,13 @@
 <lcgdict>
-    <class name="hcal::RecHitCollection<hcal::common::VecStoragePolicy<std::allocator>>"/>
-    <class name="edm::Wrapper<hcal::RecHitCollection<hcal::common::VecStoragePolicy<std::allocator>>>"/>
+    <class name="hcal::RecHitCollection<calo::common::VecStoragePolicy<std::allocator>>"/>
+    <class name="edm::Wrapper<hcal::RecHitCollection<calo::common::VecStoragePolicy<std::allocator>>>"/>
 
-    <class name="hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>"/>
-    <class name="edm::Wrapper<hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>>"/>
+    <class name="hcal::RecHitCollection<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>"/>
+    <class name="edm::Wrapper<hcal::RecHitCollection<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>"/>
 
-    <class name="cms::cuda::Product<hcal::RecHitCollection<hcal::common::ViewStoragePolicy>>" persistent="false" />
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::RecHitCollection<hcal::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::RecHitCollection<calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::RecHitCollection<calo::common::ViewStoragePolicy>>>" persistent="false" />
+                
+    <class name="cms::cuda::Product<hcal::RecHitCollection<calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::RecHitCollection<calo::common::DevStoragePolicy>>>" persistent="false" />
 </lcgdict>
diff --git a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
index ff6a820a3525d..fd144ae452363 100644
--- a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
+++ b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
@@ -71,11 +71,11 @@ int main(int argc, char* argv[]) {
 
   // branches to use
   using Collectionf01 =
-      hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+      hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   using Collectionf5 =
-      hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+      hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   using Collectionf3 =
-      hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+      hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   edm::Wrapper<Collectionf01>* wgpuf01he = nullptr;
   edm::Wrapper<Collectionf5>* wgpuf5hb = nullptr;
   edm::Wrapper<Collectionf3>* wgpuf3hb = nullptr;
diff --git a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
index 025ea32125c82..3077a68a665e4 100644
--- a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
@@ -19,7 +19,7 @@
 <library file="HcalRawToDigiGPU.cc,DecodeGPU.cu,HcalESProducerGPUDefs.cc,ElectronicsMappingGPU.cc,HcalCPUDigisProducer.cc,HcalDigisProducerGPU.cc" name="EventFilterHcalRawToDigiGPUPlugins">
   <use name="cuda" />
   <use name="CUDADataFormats/Common" />
-  <use name="CUDADataFormats/HcalCommon"/>
+  <use name="CUDADataFormats/CaloCommon"/>
   <use name="CUDADataFormats/HcalDigi" />
   <use name="HeterogeneousCore/CUDACore"/>
   <use name="HeterogeneousCore/CUDAServices"/>
diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
index 606053edb6801..3808440f1449e 100644
--- a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
+++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
@@ -6,6 +6,8 @@
 #include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 #include "ElectronicsMappingGPU.h"
 
@@ -33,108 +35,75 @@ namespace hcal {
     };
 
     struct InputDataCPU {
-      std::vector<unsigned char, cms::cuda::HostAllocator<unsigned char>> data;
-      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> offsets;
-      std::vector<int, cms::cuda::HostAllocator<int>> feds;
-
-      void allocate() {
-        data.resize(utca_nfeds_max * sizeof(unsigned char) * nbytes_per_fed_max);
-        offsets.resize(utca_nfeds_max, 0);
-        feds.resize(utca_nfeds_max, 0);
-      }
+      cms::cuda::host::unique_ptr<unsigned char[]> data;
+      cms::cuda::host::unique_ptr<uint32_t[]> offsets;
+      cms::cuda::host::unique_ptr<int[]> feds;
     };
 
     struct OutputDataCPU {
-      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> nchannels;
-
-      void allocate() { nchannels.resize(numOutputCollections); }
+      cms::cuda::host::unique_ptr<uint32_t[]> nchannels;
     };
 
     struct ScratchDataGPU {
       // depends on tHE number of output collections
       // that is a statically known predefined number!!!
-      uint32_t *pChannelsCounters = nullptr;
-
-      void allocate(ConfigurationParameters const &) {
-        cudaCheck(cudaMalloc((void **)&pChannelsCounters, sizeof(uint32_t) * numOutputCollections));
-      }
-
-      void deallocate(ConfigurationParameters const &) {
-        if (pChannelsCounters) {
-          cudaCheck(cudaFree(pChannelsCounters));
-        }
-      }
+      cms::cuda::device::unique_ptr<uint32_t[]> pChannelsCounters;
     };
 
     struct OutputDataGPU {
-      DigiCollection<Flavor01, common::ViewStoragePolicy> digisF01HE;
-      DigiCollection<Flavor5, common::ViewStoragePolicy> digisF5HB;
-      DigiCollection<Flavor3, common::ViewStoragePolicy> digisF3HB;
-
-      // qie 11 HE
-      /*
-    uint16_t *digisF01HE = nullptr;
-    uint32_t *idsF01HE = nullptr;
-
-    // qie 8 HB
-    uint16_t *digisF5HB = nullptr;
-    uint32_t *idsF5HB = nullptr;
-    uint8_t *npresamplesF5HB = nullptr
-    */
-
-      void allocate(ConfigurationParameters const &config) {
-        cudaCheck(
-            cudaMalloc((void **)&digisF01HE.data,
-                       config.maxChannelsF01HE * sizeof(uint16_t) * compute_stride<Flavor01>(config.nsamplesF01HE)));
-        cudaCheck(cudaMalloc((void **)&digisF01HE.ids, sizeof(uint32_t) * config.maxChannelsF01HE));
-
-        cudaCheck(cudaMalloc((void **)&digisF5HB.data,
-                             config.maxChannelsF5HB * sizeof(uint16_t) * compute_stride<Flavor5>(config.nsamplesF5HB)));
-        cudaCheck(cudaMalloc((void **)&digisF5HB.ids, sizeof(uint32_t) * config.maxChannelsF5HB));
-        cudaCheck(cudaMalloc((void **)&digisF5HB.npresamples, sizeof(uint8_t) * config.maxChannelsF5HB));
-
-        cudaCheck(cudaMalloc((void **)&digisF3HB.data,
-                             config.maxChannelsF3HB * sizeof(uint16_t) * compute_stride<Flavor3>(config.nsamplesF3HB)));
-        cudaCheck(cudaMalloc((void **)&digisF3HB.ids, config.maxChannelsF3HB * sizeof(uint32_t)));
-      }
-
-      void deallocate(ConfigurationParameters const &config) {
-        if (digisF01HE.data) {
-          cudaCheck(cudaFree(digisF01HE.data));
-          cudaCheck(cudaFree(digisF01HE.ids));
-        }
-
-        if (digisF5HB.data) {
-          cudaCheck(cudaFree(digisF5HB.data));
-          cudaCheck(cudaFree(digisF5HB.ids));
-          cudaCheck(cudaFree(digisF5HB.npresamples));
-        }
-
-        if (digisF3HB.data) {
-          cudaCheck(cudaFree(digisF3HB.data));
-          cudaCheck(cudaFree(digisF3HB.ids));
-        }
+      DigiCollection<Flavor01, ::calo::common::DevStoragePolicy> digisF01HE;
+      DigiCollection<Flavor5, ::calo::common::DevStoragePolicy> digisF5HB;
+      DigiCollection<Flavor3, ::calo::common::DevStoragePolicy> digisF3HB;
+
+      void allocate(ConfigurationParameters const &config, cudaStream_t cudaStream) {
+        digisF01HE.data = cms::cuda::make_device_unique<uint16_t[]>(
+          config.maxChannelsF01HE*compute_stride<Flavor01>(config.nsamplesF01HE),
+          cudaStream
+        );
+        //cudaCheck(
+        //    cudaMalloc((void **)&digisF01HE.data,
+        //               config.maxChannelsF01HE * sizeof(uint16_t) * compute_stride<Flavor01>(config.nsamplesF01HE)));
+        digisF01HE.ids = cms::cuda::make_device_unique<uint32_t[]>(
+          config.maxChannelsF01HE,
+          cudaStream
+        );
+        //cudaCheck(cudaMalloc((void **)&digisF01HE.ids, sizeof(uint32_t) * config.maxChannelsF01HE));
+
+        digisF5HB.data = cms::cuda::make_device_unique<uint16_t[]>(
+          config.maxChannelsF5HB * compute_stride<Flavor5>(config.nsamplesF5HB),
+          cudaStream
+        );
+        //cudaCheck(cudaMalloc((void **)&digisF5HB.data,
+        //                     config.maxChannelsF5HB * sizeof(uint16_t) * compute_stride<Flavor5>(config.nsamplesF5HB)));
+        digisF5HB.ids = cms::cuda::make_device_unique<uint32_t[]>(
+          config.maxChannelsF5HB,
+          cudaStream
+        );
+        //cudaCheck(cudaMalloc((void **)&digisF5HB.ids, sizeof(uint32_t) * config.maxChannelsF5HB));
+        digisF5HB.npresamples = cms::cuda::make_device_unique<uint8_t[]>(
+          config.maxChannelsF5HB,
+          cudaStream
+        );
+        //cudaCheck(cudaMalloc((void **)&digisF5HB.npresamples, sizeof(uint8_t) * config.maxChannelsF5HB));
+
+        digisF3HB.data = cms::cuda::make_device_unique<uint16_t[]>(
+          config.maxChannelsF3HB * compute_stride<Flavor3>(config.nsamplesF3HB),
+          cudaStream
+        );
+        //cudaCheck(cudaMalloc((void **)&digisF3HB.data,
+        //                     config.maxChannelsF3HB * sizeof(uint16_t) * compute_stride<Flavor3>(config.nsamplesF3HB)));
+        digisF3HB.ids = cms::cuda::make_device_unique<uint32_t[]>(
+          config.maxChannelsF3HB,
+          cudaStream
+        );
+        //cudaCheck(cudaMalloc((void **)&digisF3HB.ids, config.maxChannelsF3HB * sizeof(uint32_t)));
       }
     };
 
     struct InputDataGPU {
-      unsigned char *data = nullptr;
-      uint32_t *offsets = nullptr;
-      int *feds = nullptr;
-
-      void allocate() {
-        cudaCheck(cudaMalloc((void **)&data, sizeof(unsigned char) * nbytes_per_fed_max * utca_nfeds_max));
-        cudaCheck(cudaMalloc((void **)&offsets, sizeof(uint32_t) * utca_nfeds_max));
-        cudaCheck(cudaMalloc((void **)&feds, sizeof(int) * utca_nfeds_max));
-      }
-
-      void deallocate() {
-        if (data) {
-          cudaCheck(cudaFree(data));
-          cudaCheck(cudaFree(offsets));
-          cudaCheck(cudaFree(feds));
-        }
-      }
+      cms::cuda::device::unique_ptr<unsigned char[]> data;
+      cms::cuda::device::unique_ptr<uint32_t[]> offsets;
+      cms::cuda::device::unique_ptr<int[]> feds;
     };
 
     struct ConditionsProducts {
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
index 1589ec6cb1661..5011b7b74fac3 100644
--- a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
@@ -568,38 +568,38 @@ namespace hcal {
                     uint32_t const nbytesTotal) {
       // transfer
       cudaCheck(cudaMemcpyAsync(
-          inputGPU.data, inputCPU.data.data(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream));
-      cudaCheck(cudaMemcpyAsync(inputGPU.offsets,
-                                inputCPU.offsets.data(),
+          inputGPU.data.get(), inputCPU.data.get(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream));
+      cudaCheck(cudaMemcpyAsync(inputGPU.offsets.get(),
+                                inputCPU.offsets.get(),
                                 nfedsWithData * sizeof(uint32_t),
                                 cudaMemcpyHostToDevice,
                                 cudaStream));
-      cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounters, 0, sizeof(uint32_t) * numOutputCollections, cudaStream));
+      cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounters.get(), 0, sizeof(uint32_t) * numOutputCollections, cudaStream));
       cudaCheck(cudaMemcpyAsync(
-          inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
+          inputGPU.feds.get(), inputCPU.feds.get(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
 
       // 12 is the max number of modules per crate
-      kernel_rawdecode_test<32><<<nfedsWithData, 12 * 32, 0, cudaStream>>>(inputGPU.data,
-                                                                           inputGPU.offsets,
-                                                                           inputGPU.feds,
+      kernel_rawdecode_test<32><<<nfedsWithData, 12 * 32, 0, cudaStream>>>(inputGPU.data.get(),
+                                                                           inputGPU.offsets.get(),
+                                                                           inputGPU.feds.get(),
                                                                            conditions.eMappingProduct.eid2did,
                                                                            conditions.eMappingProduct.eid2tid,
-                                                                           outputGPU.digisF01HE.data,
-                                                                           outputGPU.digisF01HE.ids,
-                                                                           outputGPU.digisF5HB.data,
-                                                                           outputGPU.digisF5HB.ids,
-                                                                           outputGPU.digisF5HB.npresamples,
-                                                                           outputGPU.digisF3HB.data,
-                                                                           outputGPU.digisF3HB.ids,
-                                                                           scratchGPU.pChannelsCounters,
+                                                                           outputGPU.digisF01HE.data.get(),
+                                                                           outputGPU.digisF01HE.ids.get(),
+                                                                           outputGPU.digisF5HB.data.get(),
+                                                                           outputGPU.digisF5HB.ids.get(),
+                                                                           outputGPU.digisF5HB.npresamples.get(),
+                                                                           outputGPU.digisF3HB.data.get(),
+                                                                           outputGPU.digisF3HB.ids.get(),
+                                                                           scratchGPU.pChannelsCounters.get(),
                                                                            config.nsamplesF01HE,
                                                                            config.nsamplesF5HB,
                                                                            config.nsamplesF3HB,
                                                                            nbytesTotal);
       cudaCheck(cudaGetLastError());
 
-      cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.data(),
-                                scratchGPU.pChannelsCounters,
+      cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.get(),
+                                scratchGPU.pChannelsCounters.get(),
                                 sizeof(uint32_t) * numOutputCollections,
                                 cudaMemcpyDeviceToHost,
                                 cudaStream));
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
index 8aad10228021c..a45d5d44adcd2 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
@@ -23,21 +23,21 @@ class HcalCPUDigisProducer : public edm::stream::EDProducer<edm::ExternalWork> {
   void produce(edm::Event&, edm::EventSetup const&) override;
 
 private:
-  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>;
+  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductTypef01> digisF01HETokenIn_;
-  using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>;
+  using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductTypef5> digisF5HBTokenIn_;
-  using IProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>;
+  using IProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductTypef3> digisF3HBTokenIn_;
 
   using OProductTypef01 =
-      hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+      hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   edm::EDPutTokenT<OProductTypef01> digisF01HETokenOut_;
   using OProductTypef5 =
-      hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+      hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   edm::EDPutTokenT<OProductTypef5> digisF5HBTokenOut_;
   using OProductTypef3 =
-      hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+      hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   edm::EDPutTokenT<OProductTypef3> digisF3HBTokenOut_;
 
   // needed to pass data from acquire to produce
@@ -89,92 +89,29 @@ void HcalCPUDigisProducer::acquire(edm::Event const& event,
   digisf5HB_.resize(f5HBDigis.size);
   digisf3HB_.resize(f3HBDigis.size);
 
-  /*
-    idsf01he.resize(f01HEDigis.ndigis);
-    dataf01he.resize(f01HEDigis.ndigis * f01HEDigis.stride);
-    idsf5hb.resize(f5HBDigis.ndigis);
-    npresamplesf5hb.resize(f5HBDigis.ndigis);
-    dataf5hb.resize(f5HBDigis.ndigis * f5HBDigis.stride);
-    stridef01he = f01HEDigis.stride;
-    stridef5hb = f5HBDigis.stride;
-    */
-
   auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
     using vector_type = typename std::remove_reference<decltype(dest)>::type;
     using type = typename vector_type::value_type;
+    using src_data_type = typename std::remove_pointer<decltype(src)>::type;  // element type behind src
+    static_assert(std::is_same<src_data_type, type>::value, "Dest and Src data types do not match");
     cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream()));
   };
 
-  lambdaToTransfer(digisf01HE_.data, f01HEDigis.data);
-  lambdaToTransfer(digisf01HE_.ids, f01HEDigis.ids);
-
-  lambdaToTransfer(digisf5HB_.data, f5HBDigis.data);
-  lambdaToTransfer(digisf5HB_.ids, f5HBDigis.ids);
-  lambdaToTransfer(digisf5HB_.npresamples, f5HBDigis.npresamples);
-
-  lambdaToTransfer(digisf3HB_.data, f3HBDigis.data);
-  lambdaToTransfer(digisf3HB_.ids, f3HBDigis.ids);
-
-  /*
-    // enqeue transfers
-    cudaCheck( cudaMemcpyAsync(digisf01.data.data(),
-                               f01HEDigis.data,
-                               dataf01HE.data.size() * sizeof(uint16_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream().id()) );
-    cudaCheck( cudaMemcpyAsync(dataf5hb.data(),
-                               f5HBDigis.data,
-                               dataf5hb.size() * sizeof(uint16_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream().id()) );
-    cudaCheck( cudaMemcpyAsync(idsf01he.data(),
-                               f01HEDigis.ids,
-                               idsf01he.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream().id()) );
-    cudaCheck( cudaMemcpyAsync(idsf5hb.data(),
-                               f5HBDigis.ids,
-                               idsf5hb.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream().id()) );
-    cudaCheck( cudaMemcpyAsync(npresamplesf5hb.data(),
-                               f5HBDigis.npresamples,
-                               npresamplesf5hb.size() * sizeof(uint8_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream.id()) );
-                               */
+  lambdaToTransfer(digisf01HE_.data, f01HEDigis.data.get());
+  lambdaToTransfer(digisf01HE_.ids, f01HEDigis.ids.get());
+
+  lambdaToTransfer(digisf5HB_.data, f5HBDigis.data.get());
+  lambdaToTransfer(digisf5HB_.ids, f5HBDigis.ids.get());
+  lambdaToTransfer(digisf5HB_.npresamples, f5HBDigis.npresamples.get());
+
+  lambdaToTransfer(digisf3HB_.data, f3HBDigis.data.get());
+  lambdaToTransfer(digisf3HB_.ids, f3HBDigis.ids.get());
 }
 
 void HcalCPUDigisProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
   event.emplace(digisF01HETokenOut_, std::move(digisf01HE_));
   event.emplace(digisF5HBTokenOut_, std::move(digisf5HB_));
   event.emplace(digisF3HBTokenOut_, std::move(digisf3HB_));
-
-  // output collections
-  /*
-    auto f01he = std::make_unique<edm::DataFrameContainer>(
-        stridef01he, HcalEndcap, idsf01he.size());
-    auto f5hb = std::make_unique<edm::DataFrameContainer>(
-        stridef5hb, HcalBarrel, idsf5hb.size());
-    
-    // cast constness away
-    // use pointers to buffers instead of move operator= semantics (or swap)
-    // cause we have different allocators in there...
-    auto *dataf01hetmp = const_cast<uint16_t*>(f01he->data().data());
-    auto *dataf5hbtmp = const_cast<uint16_t*>(f5hb->data().data());
-
-    auto *idsf01hetmp = const_cast<uint32_t*>(f01he->ids().data());
-    auto idsf5hbtmp = const_cast<uint32_t*>(f5hb->ids().data());
-
-    // copy data
-    std::memcpy(dataf01hetmp, dataf01he.data(), dataf01he.size() * sizeof(uint16_t));
-    std::memcpy(dataf5hbtmp, dataf5hb.data(), dataf5hb.size() * sizeof(uint16_t));
-    std::memcpy(idsf01hetmp, idsf01he.data(), idsf01he.size() * sizeof(uint32_t));
-    std::memcpy(idsf5hbtmp, idsf5hb.data(), idsf5hb.size() * sizeof(uint32_t));
-
-    event.put(digisF01HETokenOut_, std::move(f01he));
-    event.put(digisF5HBTokenOut_, std::move(f5hb));
-    */
 }
 
 DEFINE_FWK_MODULE(HcalCPUDigisProducer);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
index c7bb27b60fa12..69546a7a118bc 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -29,14 +29,14 @@ class HcalDigisProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 
   // type aliases
   using HostCollectionf01 =
-      hcal::DigiCollection<hcal::Flavor01, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
-  using DeviceCollectionf01 = hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>;
+      hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf01 = hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>;
   using HostCollectionf5 =
-      hcal::DigiCollection<hcal::Flavor5, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
-  using DeviceCollectionf5 = hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>;
+      hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf5 = hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>;
   using HostCollectionf3 =
-      hcal::DigiCollection<hcal::Flavor3, hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
-  using DeviceCollectionf3 = hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>;
+      hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf3 = hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>;
 
   // output product tokens
   using ProductTypef01 = cms::cuda::Product<DeviceCollectionf01>;
@@ -48,16 +48,6 @@ class HcalDigisProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 
   cms::cuda::ContextState cudaState_;
 
-  /*
-    hcal::raw::ConfigurationParameters config_;
-    // FIXME move this to use raii
-    hcal::raw::InputDataCPU inputCPU_;
-    hcal::raw::InputDataGPU inputGPU_;
-    hcal::raw::OutputDataGPU outputGPU_;
-    hcal::raw::ScratchDataGPU scratchGPU_;
-    hcal::raw::OutputDataCPU outputCPU_;
-    */
-
   struct ConfigParameters {
     uint32_t maxChannelsF01HE, maxChannelsF5HB, maxChannelsF3HB, nsamplesF01HE, nsamplesF5HB, nsamplesF3HB;
   };
@@ -68,8 +58,7 @@ class HcalDigisProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
   HostCollectionf5 hf5_;
   HostCollectionf3 hf3_;
 
-  // device products
-  // NOTE: this module owns memory of the product on the device
+  // device products: product owns memory (i.e. not the module)
   DeviceCollectionf01 df01_;
   DeviceCollectionf5 df5_;
   DeviceCollectionf3 df3_;
@@ -107,27 +96,6 @@ HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
   config_.nsamplesF5HB = ps.getParameter<uint32_t>("nsamplesF5HB");
   config_.nsamplesF3HB = ps.getParameter<uint32_t>("nsamplesF3HB");
 
-  // call CUDA API functions only if CUDA is available
-  edm::Service<CUDAService> cs;
-  if (cs and cs->enabled()) {
-    // allocate on the device
-    cudaCheck(cudaMalloc(
-        (void**)&df01_.data,
-        config_.maxChannelsF01HE * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE)));
-    cudaCheck(cudaMalloc((void**)&df01_.ids, config_.maxChannelsF01HE * sizeof(uint32_t)));
-
-    cudaCheck(cudaMalloc(
-        (void**)&df5_.data,
-        config_.maxChannelsF5HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB)));
-    cudaCheck(cudaMalloc((void**)&df5_.ids, config_.maxChannelsF5HB * sizeof(uint32_t)));
-    cudaCheck(cudaMalloc((void**)&df5_.npresamples, sizeof(uint8_t) * config_.maxChannelsF5HB));
-
-    cudaCheck(cudaMalloc(
-        (void**)&df3_.data,
-        config_.maxChannelsF3HB * sizeof(uint16_t) * hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB)));
-    cudaCheck(cudaMalloc((void**)&df3_.ids, config_.maxChannelsF3HB * sizeof(uint32_t)));
-  }
-
   // preallocate on the host
   hf01_.stride = hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
   hf5_.stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
@@ -137,22 +105,7 @@ HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
   hf3_.reserve(config_.maxChannelsF3HB);
 }
 
-HcalDigisProducerGPU::~HcalDigisProducerGPU() {
-  // call CUDA API functions only if CUDA is available
-  edm::Service<CUDAService> cs;
-  if (cs and cs->enabled()) {
-    // deallocate on the device
-    cudaCheck(cudaFree(df01_.data));
-    cudaCheck(cudaFree(df01_.ids));
-
-    cudaCheck(cudaFree(df5_.data));
-    cudaCheck(cudaFree(df5_.ids));
-    cudaCheck(cudaFree(df5_.npresamples));
-
-    cudaCheck(cudaFree(df3_.data));
-    cudaCheck(cudaFree(df3_.ids));
-  }
-}
+HcalDigisProducerGPU::~HcalDigisProducerGPU() {}
 
 void HcalDigisProducerGPU::acquire(edm::Event const& event,
                                    edm::EventSetup const& setup,
@@ -170,6 +123,43 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
   event.getByToken(hbheDigiToken_, hbheDigis);
   event.getByToken(qie11DigiToken_, qie11Digis);
 
+  // flavor 0/1 get device blobs
+  df01_.data = cms::cuda::make_device_unique<uint16_t[]>(
+    config_.maxChannelsF01HE * hcal::compute_stride<hcal::Flavor01>(
+      config_.nsamplesF01HE),
+    ctx.stream()
+  );
+  df01_.ids = cms::cuda::make_device_unique<uint32_t[]>(
+    config_.maxChannelsF01HE,
+    ctx.stream()
+  );
+
+  // flavor3 get device blobs
+  df3_.data  = cms::cuda::make_device_unique<uint16_t[]>(
+    config_.maxChannelsF3HB * hcal::compute_stride<hcal::Flavor3>(
+      config_.nsamplesF3HB),
+    ctx.stream()
+  );
+  df3_.ids = cms::cuda::make_device_unique<uint32_t[]>(
+    config_.maxChannelsF3HB,
+    ctx.stream()
+  );
+
+  // flavor5 get device blobs
+  df5_.data = cms::cuda::make_device_unique<uint16_t[]>(
+    config_.maxChannelsF5HB * hcal::compute_stride<hcal::Flavor5>(
+      config_.nsamplesF5HB),
+    ctx.stream()
+  );
+  df5_.ids = cms::cuda::make_device_unique<uint32_t[]>(
+    config_.maxChannelsF5HB,
+    ctx.stream()
+  );
+  df5_.npresamples = cms::cuda::make_device_unique<uint8_t[]>(
+    config_.maxChannelsF5HB,
+    ctx.stream()
+  );
+
   for (auto const& hbhe : *hbheDigis) {
     auto const id = hbhe.id().rawId();
     auto const presamples = hbhe.presamples();
@@ -218,18 +208,20 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
   auto lambdaToTransfer = [&ctx](auto* dest, auto const& src) {
     using vector_type = typename std::remove_reference<decltype(src)>::type;
     using type = typename vector_type::value_type;
+    using dest_data_type = typename std::remove_pointer<decltype(dest)>::type;  // element type behind dest
+    static_assert(std::is_same<dest_data_type, type>::value, "Dest and Src data types do not match");
     cudaCheck(cudaMemcpyAsync(dest, src.data(), src.size() * sizeof(type), cudaMemcpyHostToDevice, ctx.stream()));
   };
 
-  lambdaToTransfer(df01_.data, hf01_.data);
-  lambdaToTransfer(df01_.ids, hf01_.ids);
+  lambdaToTransfer(df01_.data.get(), hf01_.data);
+  lambdaToTransfer(df01_.ids.get(), hf01_.ids);
 
-  lambdaToTransfer(df5_.data, hf5_.data);
-  lambdaToTransfer(df5_.ids, hf5_.ids);
-  lambdaToTransfer(df5_.npresamples, hf5_.npresamples);
+  lambdaToTransfer(df5_.data.get(), hf5_.data);
+  lambdaToTransfer(df5_.ids.get(), hf5_.ids);
+  lambdaToTransfer(df5_.npresamples.get(), hf5_.npresamples);
 
-  lambdaToTransfer(df3_.data, hf3_.data);
-  lambdaToTransfer(df3_.ids, hf3_.ids);
+  lambdaToTransfer(df3_.data.get(), hf3_.data);
+  lambdaToTransfer(df3_.ids.get(), hf3_.ids);
 }
 
 void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
@@ -242,39 +234,9 @@ void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& set
   df3_.stride = hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB);
   df3_.size = hf3_.ids.size();
 
-  ctx.emplace(event, digisF01HEToken_, df01_);
-  ctx.emplace(event, digisF5HBToken_, df5_);
-  ctx.emplace(event, digisF3HBToken_, df3_);
-
-  /*
-
-#ifdef HCAL_RAWDECODE_CPUDEBUG
-    printf("f01he channels = %u f5hb channesl = %u\n",
-        outputCPU_.nchannels[hcal::raw::OutputF01HE], 
-        outputCPU_.nchannels[hcal::raw::OutputF5HB]);
-#endif
-
-    // FIXME: use sizes of views directly for cuda mem cpy?
-    auto const nchannelsF01HE = outputCPU_.nchannels[hcal::raw::OutputF01HE];
-    auto const nchannelsF5HB = outputCPU_.nchannels[hcal::raw::OutputF5HB];
-    outputGPU_.digisF01HE.size = nchannelsF01HE;
-    outputGPU_.digisF5HB.size = nchannelsF5HB;
-    outputGPU_.digisF01HE.stride = 
-        hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
-    outputGPU_.digisF5HB.stride = 
-        hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
-
-    hcal::DigiCollection<hcal::Flavor01> digisF01HE{outputGPU_.idsF01HE,
-        outputGPU_.digisF01HE, nchannelsF01HE, 
-        hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE)};
-    hcal::DigiCollection<hcal::Flavor5> digisF5HB{outputGPU_.idsF5HB,
-        outputGPU_.digisF5HB, outputGPU_.npresamplesF5HB, nchannelsF5HB, 
-        hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB)};
-
-    ctx.emplace(event, digisF01HEToken_, std::move(outputGPU_.digisF01HE));
-    ctx.emplace(event, digisF5HBToken_, std::move(outputGPU_.digisF5HB));
-
-    */
+  ctx.emplace(event, digisF01HEToken_, std::move(df01_));
+  ctx.emplace(event, digisF5HBToken_, std::move(df5_));
+  ctx.emplace(event, digisF3HBToken_, std::move(df3_));
 }
 
 DEFINE_FWK_MODULE(HcalDigisProducerGPU);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
index 0e3a1a0d3b1e3..ab479a36202b0 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
@@ -12,6 +12,7 @@
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 
 #include "DeclsForKernels.h"
 #include "DecodeGPU.h"
@@ -29,11 +30,11 @@ class HcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 
 private:
   edm::EDGetTokenT<FEDRawDataCollection> rawDataToken_;
-  using ProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>;
+  using ProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>;
   edm::EDPutTokenT<ProductTypef01> digisF01HEToken_;
-  using ProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>;
+  using ProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>;
   edm::EDPutTokenT<ProductTypef5> digisF5HBToken_;
-  using ProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>;
+  using ProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>;
   edm::EDPutTokenT<ProductTypef3> digisF3HBToken_;
 
   cms::cuda::ContextState cudaState_;
@@ -41,11 +42,7 @@ class HcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {
   std::vector<int> fedsToUnpack_;
 
   hcal::raw::ConfigurationParameters config_;
-  // FIXME move this to use raii
-  hcal::raw::InputDataCPU inputCPU_;
-  hcal::raw::InputDataGPU inputGPU_;
   hcal::raw::OutputDataGPU outputGPU_;
-  hcal::raw::ScratchDataGPU scratchGPU_;
   hcal::raw::OutputDataCPU outputCPU_;
 };
 
@@ -84,27 +81,9 @@ HcalRawToDigiGPU::HcalRawToDigiGPU(const edm::ParameterSet& ps)
   config_.nsamplesF01HE = ps.getParameter<uint32_t>("nsamplesF01HE");
   config_.nsamplesF5HB = ps.getParameter<uint32_t>("nsamplesF5HB");
   config_.nsamplesF3HB = ps.getParameter<uint32_t>("nsamplesF3HB");
-
-  // reserve memory and call CUDA API functions only if CUDA is available
-  edm::Service<CUDAService> cs;
-  if (cs and cs->enabled()) {
-    inputCPU_.allocate();
-    outputCPU_.allocate();
-
-    inputGPU_.allocate();
-    outputGPU_.allocate(config_);
-    scratchGPU_.allocate(config_);
-  }
 }
 
 HcalRawToDigiGPU::~HcalRawToDigiGPU() {
-  // call CUDA API functions only if CUDA is available
-  edm::Service<CUDAService> cs;
-  if (cs and cs->enabled()) {
-    inputGPU_.deallocate();
-    outputGPU_.deallocate(config_);
-    scratchGPU_.deallocate(config_);
-  }
 }
 
 void HcalRawToDigiGPU::acquire(edm::Event const& event,
@@ -125,6 +104,50 @@ void HcalRawToDigiGPU::acquire(edm::Event const& event,
   edm::Handle<FEDRawDataCollection> rawDataHandle;
   event.getByToken(rawDataToken_, rawDataHandle);
 
+  // scratch
+  hcal::raw::ScratchDataGPU scratchGPU = {
+    cms::cuda::make_device_unique<uint32_t[]>(
+      hcal::raw::numOutputCollections,
+      ctx.stream())
+  };
+
+  // input cpu data
+  hcal::raw::InputDataCPU inputCPU = {
+    cms::cuda::make_host_unique<unsigned char[]>(
+      hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max,
+      ctx.stream()),
+    cms::cuda::make_host_unique<uint32_t[]>(
+      hcal::raw::utca_nfeds_max,
+      ctx.stream()),
+    cms::cuda::make_host_unique<int[]>(
+      hcal::raw::utca_nfeds_max,
+      ctx.stream())
+  };
+
+  // input data gpu
+  hcal::raw::InputDataGPU inputGPU = {
+    cms::cuda::make_device_unique<unsigned char[]>(
+      hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max,
+      ctx.stream()),
+    cms::cuda::make_device_unique<uint32_t[]>(
+      hcal::raw::utca_nfeds_max,
+      ctx.stream()),
+    cms::cuda::make_device_unique<int[]>(
+      hcal::raw::utca_nfeds_max,
+      ctx.stream())
+  };
+
+  // output cpu
+  outputCPU_ = {
+    cms::cuda::make_host_unique<uint32_t[]>(
+      hcal::raw::numOutputCollections,
+      ctx.stream()
+    )
+  };
+
+  // output gpu
+  outputGPU_.allocate(config_, ctx.stream());
+
   // iterate over feds
   // TODO: another idea
   //   - loop over all feds to unpack and enqueue cuda memcpy
@@ -146,20 +169,20 @@ void HcalRawToDigiGPU::acquire(edm::Event const& event,
 #endif
 
     // copy raw data into plain buffer
-    std::memcpy(inputCPU_.data.data() + currentCummOffset, data.data(), nbytes);
+    std::memcpy(inputCPU.data.get() + currentCummOffset, data.data(), nbytes);
     // set the offset in bytes from the start
-    inputCPU_.offsets[counter] = currentCummOffset;
-    inputCPU_.feds[counter] = fed;
+    inputCPU.offsets[counter] = currentCummOffset;
+    inputCPU.feds[counter] = fed;
 
     // this is the current offset into the vector
     currentCummOffset += nbytes;
     ++counter;
   }
 
-  hcal::raw::entryPoint(inputCPU_,
-                        inputGPU_,
+  hcal::raw::entryPoint(inputCPU,
+                        inputGPU,
                         outputGPU_,
-                        scratchGPU_,
+                        scratchGPU,
                         outputCPU_,
                         conditions,
                         config_,
@@ -188,18 +211,12 @@ void HcalRawToDigiGPU::produce(edm::Event& event, edm::EventSetup const& setup)
   outputGPU_.digisF5HB.stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
   outputGPU_.digisF3HB.stride = hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB);
 
-  /*
-    hcal::DigiCollection<hcal::Flavor01> digisF01HE{outputGPU_.idsF01HE,
-        outputGPU_.digisF01HE, nchannelsF01HE, 
-        hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE)};
-    hcal::DigiCollection<hcal::Flavor5> digisF5HB{outputGPU_.idsF5HB,
-        outputGPU_.digisF5HB, outputGPU_.npresamplesF5HB, nchannelsF5HB, 
-        hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB)};
-        */
-
   ctx.emplace(event, digisF01HEToken_, std::move(outputGPU_.digisF01HE));
   ctx.emplace(event, digisF5HBToken_, std::move(outputGPU_.digisF5HB));
   ctx.emplace(event, digisF3HBToken_, std::move(outputGPU_.digisF3HB));
+  
+  // reset ptrs that are carried as members
+  outputCPU_.nchannels.reset();
 }
 
 DEFINE_FWK_MODULE(HcalRawToDigiGPU);
diff --git a/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h b/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h
new file mode 100644
index 0000000000000..2481a80711d33
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_HcalRecAlgos_interface_HcalMahiPulseOffsetsGPU_h
+#define RecoLocalCalo_HcalRecAlgos_interface_HcalMahiPulseOffsetsGPU_h
+
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+
+#ifndef __CUDACC__
+#include <vector>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+// Holds the pulse offsets read from a ParameterSet and hands out a copy of them in device memory.
+class HcalMahiPulseOffsetsGPU {
+public:
+  // Device-side view of the offsets; the destructor passes `values` to cudaFree.
+  struct Product {
+    ~Product();
+    // device pointer set by getProduct(); defaults to null so ~Product never frees an indeterminate pointer
+    int* values = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // copies the "pulseOffsets" parameter into values_
+  HcalMahiPulseOffsetsGPU(edm::ParameterSet const&);
+
+  // destroying product_ triggers the device deallocation through ~Product
+  ~HcalMahiPulseOffsetsGPU() = default;
+
+  // host-side copy of the offsets
+  std::vector<int, cms::cuda::HostAllocator<int>> const& getValues() const { return values_; }
+
+  // returns the product carrying the device pointer; the argument is the CUDA stream used for the transfer
+  Product const& getProduct(cudaStream_t) const;
+
+private:
+  std::vector<int, cms::cuda::HostAllocator<int>> values_;
+
+  cms::cuda::ESProduct<Product> product_;
+#endif
+};
+
+#endif
diff --git a/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc b/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc
new file mode 100644
index 0000000000000..3f5cdbe5f15ca
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc
@@ -0,0 +1,39 @@
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// Copy the "pulseOffsets" parameter into the host-side vector.
+// FIXME: add proper getters to conditions
+HcalMahiPulseOffsetsGPU::HcalMahiPulseOffsetsGPU(edm::ParameterSet const& ps) {
+  auto const& values = ps.getParameter<std::vector<int>>("pulseOffsets");
+  // assign() sizes and fills the vector in one step and does not rely on <algorithm>
+  values_.assign(values.begin(), values.end());
+}
+
+// Free the device buffer that getProduct() allocated.
+HcalMahiPulseOffsetsGPU::Product::~Product() {
+  cudaCheck(cudaFree(values));
+}
+
+// Return the product for the current device. The lambda allocates the device buffer and enqueues the
+// host-to-device copy on cudaStream; presumably ESProduct runs it only while the product is unfilled - TODO confirm.
+HcalMahiPulseOffsetsGPU::Product const& HcalMahiPulseOffsetsGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](HcalMahiPulseOffsetsGPU::Product& product, cudaStream_t cudaStream) {
+        // allocate room for all offsets on the device
+        cudaCheck(cudaMalloc((void**)&product.values, this->values_.size() * sizeof(int)));
+
+        // enqueue the host-to-device copy on the given stream
+        cudaCheck(cudaMemcpyAsync(product.values,
+                                  this->values_.data(),
+                                  this->values_.size() * sizeof(int),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(HcalMahiPulseOffsetsGPU);
diff --git a/RecoLocalCalo/HcalRecProducers/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
index 7375c85d61a54..524c2d64dd5d7 100644
--- a/RecoLocalCalo/HcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
@@ -1,7 +1,7 @@
 <use name="boost"/>
 <use name="cuda"/>
 <use name="CUDADataFormats/Common" />
-<use name="CUDADataFormats/HcalCommon"/>
+<use name="CUDADataFormats/CaloCommon"/>
 <use name="CUDADataFormats/HcalDigi"/>
 <use name="CUDADataFormats/HcalRecHitSoA"/>
 <use name="CalibCalorimetry/HcalAlgos"/>
diff --git a/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp b/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp
index 5ef7861f43232..fe6aabf928aca 100644
--- a/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp
+++ b/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp
@@ -30,7 +30,7 @@ int main(int argc, char* argv[]) {
   // branches to use
   edm::Wrapper<HBHERecHitCollection>* wcpu = nullptr;
   edm::Wrapper<HBHERecHitCollection>* wgpu = nullptr;
-  //    edm::Wrapper<hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>> *wgpu=nullptr;
+  //    edm::Wrapper<hcal::RecHitCollection<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>> *wgpu=nullptr;
 
   // prep output
   TFile rfout{outFileName.c_str(), "recreate"};
@@ -61,7 +61,7 @@ int main(int argc, char* argv[]) {
   // prep input
   TFile rfin{inFileName.c_str()};
   TTree* rt = (TTree*)rfin.Get("Events");
-  rt->SetBranchAddress("HBHERecHitsSorted_hcalCPURecHitsProducer_recHitsLegacyLabelOut_RECO.", &wgpu);
+  rt->SetBranchAddress("HBHERecHitsSorted_hcalCPURecHitsProducer_recHitsLegacyHBHE_RECO.", &wgpu);
   //    rt->SetBranchAddress("hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalRecHitCollection_hcalCPURecHitsProducer_recHitsM0LabelOut_RECO.", &wgpu);
   rt->SetBranchAddress("HBHERecHitsSorted_hbheprereco__RECO.", &wcpu);
 
diff --git a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
index 42f1992bcf119..1486e27557285 100644
--- a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
+++ b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
@@ -24,6 +24,10 @@
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMCharacteristicsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 namespace hcal {
   namespace mahi {
@@ -46,6 +50,8 @@ namespace hcal {
       HcalTopology const* topology;
       HcalDDDRecConstants const* recConstants;
       uint32_t offsetForHashes;
+      HcalMahiPulseOffsetsGPU::Product const& pulseOffsets;
+      std::vector<int, cms::cuda::HostAllocator<int>> const& pulseOffsetsHost;
     };
 
     struct ConfigParameters {
@@ -62,7 +68,8 @@ namespace hcal {
       float ts4Thresh;
 
       std::vector<int> pulseOffsets;
-      int* pulseOffsetsDevice = nullptr;
+      // FIXME: remove pulseOffsets - they now come from an ESProducer
+      //int* pulseOffsetsDevice = nullptr;
 
       std::array<uint32_t, 3> kernelMinimizeThreads;
 
@@ -75,59 +82,32 @@ namespace hcal {
     };
 
     struct OutputDataGPU {
-      RecHitCollection<common::ViewStoragePolicy> recHits;
-
-      void allocate(ConfigParameters const& config) {
-        cudaCheck(cudaMalloc((void**)&recHits.energy, config.maxChannels * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&recHits.chi2, config.maxChannels * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&recHits.energyM0, config.maxChannels * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&recHits.timeM0, config.maxChannels * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&recHits.did, config.maxChannels * sizeof(uint32_t)));
-      }
-
-      void deallocate(ConfigParameters const& config) {
-        cudaCheck(cudaFree(recHits.energy));
-        cudaCheck(cudaFree(recHits.chi2));
-        cudaCheck(cudaFree(recHits.energyM0));
-        cudaCheck(cudaFree(recHits.timeM0));
-        cudaCheck(cudaFree(recHits.did));
+      RecHitCollection<::calo::common::DevStoragePolicy> recHits;
+
+      void allocate(ConfigParameters const& config, cudaStream_t cudaStream) {
+        recHits.energy = cms::cuda::make_device_unique<float[]>(
+          config.maxChannels, cudaStream);
+        recHits.chi2 = cms::cuda::make_device_unique<float[]>(
+          config.maxChannels, cudaStream);
+        recHits.energyM0 = cms::cuda::make_device_unique<float[]>(
+          config.maxChannels, cudaStream);
+        recHits.timeM0 = cms::cuda::make_device_unique<float[]>(
+          config.maxChannels, cudaStream);
+        recHits.did = cms::cuda::make_device_unique<uint32_t[]>(
+          config.maxChannels, cudaStream);
       }
     };
 
     struct ScratchDataGPU {
-      float *amplitudes = nullptr, *noiseTerms = nullptr;
-      float *pulseMatrices = nullptr, *pulseMatricesM = nullptr, *pulseMatricesP = nullptr;
-      int8_t* soiSamples = nullptr;
-
-      // TODO: properly allocate for NSAMPLES VS NPULSES
-      void allocate(ConfigParameters const& config) {
-        cudaCheck(cudaMalloc((void**)&amplitudes, sizeof(float) * config.maxChannels * config.maxTimeSamples));
-        cudaCheck(cudaMalloc((void**)&noiseTerms, sizeof(float) * config.maxChannels * config.maxTimeSamples));
-        cudaCheck(cudaMalloc((void**)&pulseMatrices,
-                             sizeof(float) * config.maxChannels * config.maxTimeSamples * config.maxTimeSamples));
-        cudaCheck(cudaMalloc((void**)&pulseMatricesM,
-                             sizeof(float) * config.maxChannels * config.maxTimeSamples * config.maxTimeSamples));
-        cudaCheck(cudaMalloc((void**)&pulseMatricesP,
-                             sizeof(float) * config.maxChannels * config.maxTimeSamples * config.maxTimeSamples));
-        cudaCheck(cudaMalloc((void**)&soiSamples, sizeof(int8_t) * config.maxChannels));
-      }
-
-      void deallocate(ConfigParameters const& config) {
-        if (amplitudes) {
-          cudaCheck(cudaFree(amplitudes));
-          cudaCheck(cudaFree(noiseTerms));
-          cudaCheck(cudaFree(pulseMatrices));
-          cudaCheck(cudaFree(pulseMatricesM));
-          cudaCheck(cudaFree(pulseMatricesP));
-          cudaCheck(cudaFree(soiSamples));
-        }
-      }
+      cms::cuda::device::unique_ptr<float[]> amplitudes, noiseTerms,
+          pulseMatrices, pulseMatricesM, pulseMatricesP;
+      cms::cuda::device::unique_ptr<int8_t[]> soiSamples;
     };
 
     struct InputDataGPU {
-      DigiCollection<Flavor01, common::ViewStoragePolicy> const& f01HEDigis;
-      DigiCollection<Flavor5, common::ViewStoragePolicy> const& f5HBDigis;
-      DigiCollection<Flavor3, common::ViewStoragePolicy> const& f3HBDigis;
+      DigiCollection<Flavor01, ::calo::common::DevStoragePolicy> const& f01HEDigis;
+      DigiCollection<Flavor5, ::calo::common::DevStoragePolicy> const& f5HBDigis;
+      DigiCollection<Flavor3, ::calo::common::DevStoragePolicy> const& f3HBDigis;
     };
 
   }  // namespace mahi
diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
index c78200badce57..e0294d68f0edd 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -37,6 +37,9 @@
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
 
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
+#include "HcalMahiPulseOffsetsGPURecord.h"
+
 #include "MahiGPU.h"
 
 class HBHERecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
@@ -49,22 +52,21 @@ class HBHERecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork>
   void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
   void produce(edm::Event&, edm::EventSetup const&) override;
 
-  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, hcal::common::ViewStoragePolicy>>;
+  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductTypef01> digisTokenF01HE_;
 
-  using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, hcal::common::ViewStoragePolicy>>;
+  using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductTypef5> digisTokenF5HB_;
 
-  using IProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, hcal::common::ViewStoragePolicy>>;
+  using IProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductTypef3> digisTokenF3HB_;
 
-  using RecHitType = hcal::RecHitCollection<hcal::common::ViewStoragePolicy>;
+  using RecHitType = hcal::RecHitCollection<calo::common::DevStoragePolicy>;
   using OProductType = cms::cuda::Product<RecHitType>;
   edm::EDPutTokenT<OProductType> rechitsM0Token_;
 
   hcal::mahi::ConfigParameters configParameters_;
   hcal::mahi::OutputDataGPU outputGPU_;
-  hcal::mahi::ScratchDataGPU scratchGPU_;
   cms::cuda::ContextState cudaState_;
 };
 
@@ -75,7 +77,6 @@ HBHERecHitProducerGPU::HBHERecHitProducerGPU(edm::ParameterSet const& ps)
       rechitsM0Token_{produces<OProductType>(ps.getParameter<std::string>("recHitsLabelM0HBHE"))} {
   configParameters_.maxChannels = ps.getParameter<uint32_t>("maxChannels");
   configParameters_.maxTimeSamples = ps.getParameter<uint32_t>("maxTimeSamples");
-  configParameters_.pulseOffsets = ps.getParameter<std::vector<int>>("pulseOffsets");
   configParameters_.kprep1dChannelsPerBlock = ps.getParameter<uint32_t>("kprep1dChannelsPerBlock");
   configParameters_.sipmQTSShift = ps.getParameter<int>("sipmQTSShift");
   configParameters_.sipmQNTStoSum = ps.getParameter<int>("sipmQNTStoSum");
@@ -100,39 +101,14 @@ HBHERecHitProducerGPU::HBHERecHitProducerGPU(edm::ParameterSet const& ps)
   configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0];
   configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1];
   configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2];
-
-  // call CUDA API functions only if CUDA is available
-  edm::Service<CUDAService> cs;
-  if (cs and cs->enabled()) {
-    outputGPU_.allocate(configParameters_);
-    scratchGPU_.allocate(configParameters_);
-
-    // FIXME: use default device and default stream
-    cudaCheck(
-        cudaMalloc((void**)&configParameters_.pulseOffsetsDevice, sizeof(int) * configParameters_.pulseOffsets.size()));
-    cudaCheck(cudaMemcpy(configParameters_.pulseOffsetsDevice,
-                         configParameters_.pulseOffsets.data(),
-                         configParameters_.pulseOffsets.size() * sizeof(int),
-                         cudaMemcpyHostToDevice));
-  }
 }
 
-HBHERecHitProducerGPU::~HBHERecHitProducerGPU() {
-  // call CUDA API functions only if CUDA is available
-  edm::Service<CUDAService> cs;
-  if (cs and cs->enabled()) {
-    outputGPU_.deallocate(configParameters_);
-    scratchGPU_.deallocate(configParameters_);
-
-    cudaCheck(cudaFree(configParameters_.pulseOffsetsDevice));
-  }
-}
+HBHERecHitProducerGPU::~HBHERecHitProducerGPU() {}
 
 void HBHERecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& cdesc) {
   edm::ParameterSetDescription desc;
   desc.add<uint32_t>("maxChannels", 10000u);
   desc.add<uint32_t>("maxTimeSamples", 10);
-  desc.add<std::vector<int>>("pulseOffsets", {-3, -2, -1, 0, 1, 2, 3, 4});
   desc.add<uint32_t>("kprep1dChannelsPerBlock", 32);
   desc.add<edm::InputTag>("digisLabelF01HE", edm::InputTag{"hcalRawToDigiGPU", "f01HEDigisGPU"});
   desc.add<edm::InputTag>("digisLabelF5HB", edm::InputTag{"hcalRawToDigiGPU", "f5HBDigisGPU"});
@@ -239,6 +215,10 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
   setup.get<HcalSiPMCharacteristicsRcd>().get(sipmCharacteristicsHandle);
   auto const& sipmCharacteristicsProduct = sipmCharacteristicsHandle->getProduct(ctx.stream());
 
+  edm::ESHandle<HcalMahiPulseOffsetsGPU> pulseOffsetsHandle;
+  setup.get<HcalMahiPulseOffsetsGPURecord>().get(pulseOffsetsHandle);
+  auto const& pulseOffsetsProduct = pulseOffsetsHandle->getProduct(ctx.stream());
+
   // bundle up conditions
   hcal::mahi::ConditionsProducts conditions{gainWidthsProduct,
                                             gainsProduct,
@@ -256,9 +236,35 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
                                             effectivePedestalsProduct,
                                             topologyHandle.product(),
                                             recConstantsHandle.product(),
-                                            pedestalsHandle->offsetForHashes()};
-
-  hcal::mahi::entryPoint(inputGPU, outputGPU_, conditions, scratchGPU_, configParameters_, ctx.stream());
+                                            pedestalsHandle->offsetForHashes(),
+                                            pulseOffsetsProduct,
+                                            pulseOffsetsHandle->getValues()};
+
+  // scratch mem on device
+  hcal::mahi::ScratchDataGPU scratchGPU = {
+    cms::cuda::make_device_unique<float[]>(
+      configParameters_.maxChannels * configParameters_.maxTimeSamples,
+      ctx.stream()
+    ),
+    cms::cuda::make_device_unique<float[]>(
+      configParameters_.maxChannels * configParameters_.maxTimeSamples, ctx.stream()),
+    cms::cuda::make_device_unique<float[]>(
+      configParameters_.maxChannels*configParameters_.maxTimeSamples*configParameters_.maxTimeSamples, 
+      ctx.stream()),
+    cms::cuda::make_device_unique<float[]>(
+      configParameters_.maxChannels*configParameters_.maxTimeSamples*configParameters_.maxTimeSamples, 
+      ctx.stream()),
+    cms::cuda::make_device_unique<float[]>(
+      configParameters_.maxChannels*configParameters_.maxTimeSamples*configParameters_.maxTimeSamples, 
+      ctx.stream()),
+    cms::cuda::make_device_unique<int8_t[]>(
+      configParameters_.maxChannels, ctx.stream()),
+  };
+
+  // output dev mem
+  outputGPU_.allocate(configParameters_, ctx.stream());
+
+  hcal::mahi::entryPoint(inputGPU, outputGPU_, conditions, scratchGPU, configParameters_, ctx.stream());
 
 #ifdef HCAL_MAHI_CPUDEBUG
   auto end = std::chrono::high_resolution_clock::now();
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
index db934710f6108..714ec8b7de5af 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
@@ -23,9 +23,9 @@ class HcalCPURecHitsProducer : public edm::stream::EDProducer<edm::ExternalWork>
   void produce(edm::Event&, edm::EventSetup const&) override;
 
 private:
-  using IProductType = cms::cuda::Product<hcal::RecHitCollection<hcal::common::ViewStoragePolicy>>;
+  using IProductType = cms::cuda::Product<hcal::RecHitCollection<calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductType> recHitsM0TokenIn_;
-  using OProductType = hcal::RecHitCollection<hcal::common::VecStoragePolicy<hcal::CUDAHostAllocatorAlias>>;
+  using OProductType = hcal::RecHitCollection<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   edm::EDPutTokenT<OProductType> recHitsM0TokenOut_;
   edm::EDPutTokenT<HBHERecHitCollection> recHitsLegacyTokenOut_;
 
@@ -67,15 +67,17 @@ void HcalCPURecHitsProducer::acquire(edm::Event const& event,
 
   auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
     using vector_type = typename std::remove_reference<decltype(dest)>::type;
+    using src_data_type = typename std::remove_pointer<decltype(src)>::type;
     using type = typename vector_type::value_type;
+    static_assert(std::is_same<src_data_type, type>::value && "Dest and Src data types do not match");
     cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream()));
   };
 
-  lambdaToTransfer(tmpRecHits_.energy, recHits.energy);
-  lambdaToTransfer(tmpRecHits_.chi2, recHits.chi2);
-  lambdaToTransfer(tmpRecHits_.energyM0, recHits.energyM0);
-  lambdaToTransfer(tmpRecHits_.timeM0, recHits.timeM0);
-  lambdaToTransfer(tmpRecHits_.did, recHits.did);
+  lambdaToTransfer(tmpRecHits_.energy, recHits.energy.get());
+  lambdaToTransfer(tmpRecHits_.chi2, recHits.chi2.get());
+  lambdaToTransfer(tmpRecHits_.energyM0, recHits.energyM0.get());
+  lambdaToTransfer(tmpRecHits_.timeM0, recHits.timeM0.get());
+  lambdaToTransfer(tmpRecHits_.did, recHits.did.get());
 }
 
 void HcalCPURecHitsProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
new file mode 100644
index 0000000000000..9500f62ea869f
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
@@ -0,0 +1,67 @@
+#include <array>
+#include <iostream>
+#include <tuple>
+#include <utility>
+
+#include "FWCore/Framework/interface/ESProducer.h"
+#include "FWCore/Framework/interface/ESProductHost.h"
+#include "FWCore/Framework/interface/ESTransientHandle.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/ModuleFactory.h"
+#include "FWCore/Framework/interface/EventSetupRecordIntervalFinder.h"
+#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
+#include "HcalMahiPulseOffsetsGPURecord.h"
+
+#include "FWCore/Framework/interface/SourceFactory.h"
+
+class HcalMahiPulseOffsetsGPUESProducer 
+        : public edm::ESProducer, public edm::EventSetupRecordIntervalFinder {
+public:
+    HcalMahiPulseOffsetsGPUESProducer(edm::ParameterSet const&);
+    ~HcalMahiPulseOffsetsGPUESProducer() override = default;
+
+    static void fillDescriptions(edm::ConfigurationDescriptions&);
+    std::unique_ptr<HcalMahiPulseOffsetsGPU> produce(HcalMahiPulseOffsetsGPURecord const&);
+
+protected:
+    void setIntervalFor(const edm::eventsetup::EventSetupRecordKey&,
+                        const edm::IOVSyncValue&,
+                        edm::ValidityInterval&) override;
+
+private:
+    edm::ParameterSet const& pset_;
+};
+
+HcalMahiPulseOffsetsGPUESProducer::HcalMahiPulseOffsetsGPUESProducer(
+        edm::ParameterSet const& pset) : pset_{pset}
+{
+    setWhatProduced(this);
+    findingRecord<HcalMahiPulseOffsetsGPURecord>();
+}
+
+void HcalMahiPulseOffsetsGPUESProducer::setIntervalFor(
+        const edm::eventsetup::EventSetupRecordKey& iKey,
+        const edm::IOVSyncValue& iTime,
+        edm::ValidityInterval& oInterval) {
+    oInterval = edm::ValidityInterval(
+        edm::IOVSyncValue::beginOfTime(), edm::IOVSyncValue::endOfTime());
+}
+
+void HcalMahiPulseOffsetsGPUESProducer::fillDescriptions(
+        edm::ConfigurationDescriptions& desc) {
+    edm::ParameterSetDescription d;
+    d.add<std::vector<int>>("pulseOffsets", {-3, -2, -1, 0, 1, 2, 3, 4});
+    desc.addWithDefaultLabel(d);
+}
+
+std::unique_ptr<HcalMahiPulseOffsetsGPU> HcalMahiPulseOffsetsGPUESProducer::produce(
+        HcalMahiPulseOffsetsGPURecord const&) {
+    return std::make_unique<HcalMahiPulseOffsetsGPU>(pset_);
+}
+
+DEFINE_FWK_EVENTSETUP_SOURCE(HcalMahiPulseOffsetsGPUESProducer);
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index 72df5d89815a2..05ddb83ae540a 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -1675,26 +1675,26 @@ namespace hcal {
       int nbytesShared =
           ((2 * f01nsamples + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock;
       kernel_prep1d_sameNumberOfSamples<<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
-          scratch.amplitudes,
-          scratch.noiseTerms,
-          outputGPU.recHits.energy,
-          outputGPU.recHits.chi2,
-          inputGPU.f01HEDigis.data,
-          inputGPU.f5HBDigis.data,
-          inputGPU.f3HBDigis.data,
-          inputGPU.f01HEDigis.ids,
-          inputGPU.f5HBDigis.ids,
-          inputGPU.f3HBDigis.ids,
+          scratch.amplitudes.get(),
+          scratch.noiseTerms.get(),
+          outputGPU.recHits.energy.get(),
+          outputGPU.recHits.chi2.get(),
+          inputGPU.f01HEDigis.data.get(),
+          inputGPU.f5HBDigis.data.get(),
+          inputGPU.f3HBDigis.data.get(),
+          inputGPU.f01HEDigis.ids.get(),
+          inputGPU.f5HBDigis.ids.get(),
+          inputGPU.f3HBDigis.ids.get(),
           inputGPU.f01HEDigis.stride,
           inputGPU.f5HBDigis.stride,
           inputGPU.f3HBDigis.stride,
           inputGPU.f01HEDigis.size,
           inputGPU.f5HBDigis.size,
-          inputGPU.f5HBDigis.npresamples,
-          scratch.soiSamples,
-          outputGPU.recHits.energyM0,
-          outputGPU.recHits.timeM0,
-          outputGPU.recHits.did,
+          inputGPU.f5HBDigis.npresamples.get(),
+          scratch.soiSamples.get(),
+          outputGPU.recHits.energyM0.get(),
+          outputGPU.recHits.timeM0.get(),
+          outputGPU.recHits.did.get(),
           totalChannels,
           conditions.recoParams.param1,
           conditions.recoParams.param2,
@@ -1734,8 +1734,8 @@ namespace hcal {
 
       // 1024 is the max threads per block for gtx1080
       // FIXME: take this from cuda service or something like that
-      uint32_t const channelsPerBlock = 1024 / (f01nsamples * configParameters.pulseOffsets.size());
-      dim3 threadsPerBlock2{f01nsamples, static_cast<uint32_t>(configParameters.pulseOffsets.size()), channelsPerBlock};
+      uint32_t const channelsPerBlock = 1024 / (f01nsamples * conditions.pulseOffsetsHost.size());
+      dim3 threadsPerBlock2{f01nsamples, static_cast<uint32_t>(conditions.pulseOffsetsHost.size()), channelsPerBlock};
       int blocks2 =
           threadsPerBlock2.z > totalChannels ? 1 : (totalChannels + threadsPerBlock2.z - 1) / threadsPerBlock2.z;
 
@@ -1746,18 +1746,18 @@ namespace hcal {
 #endif
 
       kernel_prep_pulseMatrices_sameNumberOfSamples<<<blocks2, threadsPerBlock2, 0, cudaStream>>>(
-          scratch.pulseMatrices,
-          scratch.pulseMatricesM,
-          scratch.pulseMatricesP,
-          configParameters.pulseOffsetsDevice,
-          scratch.amplitudes,
-          inputGPU.f01HEDigis.ids,
-          inputGPU.f5HBDigis.ids,
-          inputGPU.f3HBDigis.ids,
+          scratch.pulseMatrices.get(),
+          scratch.pulseMatricesM.get(),
+          scratch.pulseMatricesP.get(),
+          conditions.pulseOffsets.values,
+          scratch.amplitudes.get(),
+          inputGPU.f01HEDigis.ids.get(),
+          inputGPU.f5HBDigis.ids.get(),
+          inputGPU.f3HBDigis.ids.get(),
           inputGPU.f01HEDigis.size,
           inputGPU.f5HBDigis.size,
           totalChannels,
-          scratch.soiSamples,
+          scratch.soiSamples.get(),
           conditions.recoParams.ids,
           conditions.recoParams.acc25nsVec,
           conditions.recoParams.diff25nsItvlVec,
@@ -1786,27 +1786,27 @@ namespace hcal {
           configParameters.tmaxTimeSlew);
       cudaCheck(cudaGetLastError());
 
-      if (f01nsamples == 8 && configParameters.pulseOffsets.size() == 8u) {
+      if (f01nsamples == 8 && conditions.pulseOffsetsHost.size() == 8u) {
         // FIXME: provide constants from configuration
         uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
         uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;
         auto const nbytesShared = 2 * threadsPerBlock * MapSymM<float, 8>::total * sizeof(float);
         kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
-            outputGPU.recHits.energy,
-            outputGPU.recHits.chi2,
-            scratch.amplitudes,
-            scratch.pulseMatrices,
-            scratch.pulseMatricesM,
-            scratch.pulseMatricesP,
-            configParameters.pulseOffsetsDevice,
-            scratch.noiseTerms,
-            scratch.soiSamples,
+            outputGPU.recHits.energy.get(),
+            outputGPU.recHits.chi2.get(),
+            scratch.amplitudes.get(),
+            scratch.pulseMatrices.get(),
+            scratch.pulseMatricesM.get(),
+            scratch.pulseMatricesP.get(),
+            conditions.pulseOffsets.values,
+            scratch.noiseTerms.get(),
+            scratch.soiSamples.get(),
             conditions.pedestalWidths.values,
             conditions.effectivePedestalWidths.values,
             configParameters.useEffectivePedestals,
-            inputGPU.f01HEDigis.ids,
-            inputGPU.f5HBDigis.ids,
-            inputGPU.f3HBDigis.ids,
+            inputGPU.f01HEDigis.ids.get(),
+            inputGPU.f5HBDigis.ids.get(),
+            inputGPU.f3HBDigis.ids.get(),
             conditions.gains.values,
             conditions.respCorrs.values,
             inputGPU.f01HEDigis.size,
@@ -1827,7 +1827,7 @@ namespace hcal {
       } else {
         throw cms::Exception("Invalid MahiGPU configuration")
             << "Currently support only 8 pulses and 8 time samples and provided: " << f01nsamples << " samples and "
-            << configParameters.pulseOffsets.size() << " pulses" << std::endl;
+            << conditions.pulseOffsetsHost.size() << " pulses" << std::endl;
       }
     }
 

From 288b5df485b4fb4b4d7dd9d84604d33b09dfcf47 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sun, 12 Jul 2020 14:44:53 +0200
Subject: [PATCH 16/34] Update ECAL and HCAL reconstruction to run on multiple
 GPUs [2/3] (cms-patatrack#508)

Add missing ESProducers for ECAL and HCAL GPU modules: add to the
offline workflows and to the HLT customisations the ESProducers required
to complement the configuration of the ECAL and HCAL GPU modules.
---
 .../HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py    | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py b/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py
index d2d3dac166469..d938653d5a15e 100644
--- a/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py
+++ b/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py
@@ -22,6 +22,7 @@
 from RecoLocalCalo.HcalRecProducers.hcalQIETypesGPUESProducer_cfi import hcalQIETypesGPUESProducer
 from RecoLocalCalo.HcalRecProducers.hcalSiPMParametersGPUESProducer_cfi import hcalSiPMParametersGPUESProducer
 from RecoLocalCalo.HcalRecProducers.hcalSiPMCharacteristicsGPUESProducer_cfi import hcalSiPMCharacteristicsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalMahiPulseOffsetsGPUESProducer_cfi import hcalMahiPulseOffsetsGPUESProducer
 
 # convert the HBHE digis into SoA format, and copy them from CPU to GPU
 from EventFilter.HcalRawToDigi.hcalDigisProducerGPU_cfi import hcalDigisProducerGPU as _hcalDigisProducerGPU
@@ -56,6 +57,7 @@
     hcalQIETypesGPUESProducer,
     hcalSiPMParametersGPUESProducer,
     hcalSiPMCharacteristicsGPUESProducer,
+    hcalMahiPulseOffsetsGPUESProducer,
     hcalDigisGPU,
     hbheRecHitProducerGPU
 )

From e4406a36acc7140a0fec51ccf4a49f18ee736b40 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sun, 12 Jul 2020 15:09:00 +0200
Subject: [PATCH 17/34] Apply code formatting

---
 .../HcalRawToDigi/plugins/DeclsForKernels.h   | 32 +++---------
 .../HcalRawToDigi/plugins/DecodeGPU.cu        | 10 ++--
 .../plugins/HcalDigisProducerGPU.cc           | 37 +++-----------
 .../HcalRawToDigi/plugins/HcalRawToDigiGPU.cc | 46 +++++------------
 .../interface/HcalMahiPulseOffsetsGPU.h       |  4 +-
 .../src/HcalMahiPulseOffsetsGPU.cc            | 13 +++--
 .../HcalRecProducers/src/DeclsForKernels.h    | 18 +++----
 .../src/HBHERecHitProducerGPU.cc              | 31 ++++++------
 .../src/HcalMahiPulseOffsetsGPUESProducer.cc  | 50 ++++++++-----------
 9 files changed, 83 insertions(+), 158 deletions(-)

diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
index 3808440f1449e..9eb0670f60d59 100644
--- a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
+++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
@@ -57,45 +57,27 @@ namespace hcal {
 
       void allocate(ConfigurationParameters const &config, cudaStream_t cudaStream) {
         digisF01HE.data = cms::cuda::make_device_unique<uint16_t[]>(
-          config.maxChannelsF01HE*compute_stride<Flavor01>(config.nsamplesF01HE),
-          cudaStream
-        );
+            config.maxChannelsF01HE * compute_stride<Flavor01>(config.nsamplesF01HE), cudaStream);
         //cudaCheck(
         //    cudaMalloc((void **)&digisF01HE.data,
         //               config.maxChannelsF01HE * sizeof(uint16_t) * compute_stride<Flavor01>(config.nsamplesF01HE)));
-        digisF01HE.ids = cms::cuda::make_device_unique<uint32_t[]>(
-          config.maxChannelsF01HE,
-          cudaStream
-        );
+        digisF01HE.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF01HE, cudaStream);
         //cudaCheck(cudaMalloc((void **)&digisF01HE.ids, sizeof(uint32_t) * config.maxChannelsF01HE));
 
         digisF5HB.data = cms::cuda::make_device_unique<uint16_t[]>(
-          config.maxChannelsF5HB * compute_stride<Flavor5>(config.nsamplesF5HB),
-          cudaStream
-        );
+            config.maxChannelsF5HB * compute_stride<Flavor5>(config.nsamplesF5HB), cudaStream);
         //cudaCheck(cudaMalloc((void **)&digisF5HB.data,
         //                     config.maxChannelsF5HB * sizeof(uint16_t) * compute_stride<Flavor5>(config.nsamplesF5HB)));
-        digisF5HB.ids = cms::cuda::make_device_unique<uint32_t[]>(
-          config.maxChannelsF5HB,
-          cudaStream
-        );
+        digisF5HB.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF5HB, cudaStream);
         //cudaCheck(cudaMalloc((void **)&digisF5HB.ids, sizeof(uint32_t) * config.maxChannelsF5HB));
-        digisF5HB.npresamples = cms::cuda::make_device_unique<uint8_t[]>(
-          config.maxChannelsF5HB,
-          cudaStream
-        );
+        digisF5HB.npresamples = cms::cuda::make_device_unique<uint8_t[]>(config.maxChannelsF5HB, cudaStream);
         //cudaCheck(cudaMalloc((void **)&digisF5HB.npresamples, sizeof(uint8_t) * config.maxChannelsF5HB));
 
         digisF3HB.data = cms::cuda::make_device_unique<uint16_t[]>(
-          config.maxChannelsF3HB * compute_stride<Flavor3>(config.nsamplesF3HB),
-          cudaStream
-        );
+            config.maxChannelsF3HB * compute_stride<Flavor3>(config.nsamplesF3HB), cudaStream);
         //cudaCheck(cudaMalloc((void **)&digisF3HB.data,
         //                     config.maxChannelsF3HB * sizeof(uint16_t) * compute_stride<Flavor3>(config.nsamplesF3HB)));
-        digisF3HB.ids = cms::cuda::make_device_unique<uint32_t[]>(
-          config.maxChannelsF3HB,
-          cudaStream
-        );
+        digisF3HB.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF3HB, cudaStream);
         //cudaCheck(cudaMalloc((void **)&digisF3HB.ids, config.maxChannelsF3HB * sizeof(uint32_t)));
       }
     };
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
index 5011b7b74fac3..ab1f7134277be 100644
--- a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
@@ -567,14 +567,18 @@ namespace hcal {
                     uint32_t const nfedsWithData,
                     uint32_t const nbytesTotal) {
       // transfer
-      cudaCheck(cudaMemcpyAsync(
-          inputGPU.data.get(), inputCPU.data.get(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream));
+      cudaCheck(cudaMemcpyAsync(inputGPU.data.get(),
+                                inputCPU.data.get(),
+                                nbytesTotal * sizeof(unsigned char),
+                                cudaMemcpyHostToDevice,
+                                cudaStream));
       cudaCheck(cudaMemcpyAsync(inputGPU.offsets.get(),
                                 inputCPU.offsets.get(),
                                 nfedsWithData * sizeof(uint32_t),
                                 cudaMemcpyHostToDevice,
                                 cudaStream));
-      cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounters.get(), 0, sizeof(uint32_t) * numOutputCollections, cudaStream));
+      cudaCheck(
+          cudaMemsetAsync(scratchGPU.pChannelsCounters.get(), 0, sizeof(uint32_t) * numOutputCollections, cudaStream));
       cudaCheck(cudaMemcpyAsync(
           inputGPU.feds.get(), inputCPU.feds.get(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
 
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
index 69546a7a118bc..d49e2ad366817 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -125,40 +125,19 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
 
   // flavor 0/1 get devie blobs
   df01_.data = cms::cuda::make_device_unique<uint16_t[]>(
-    config_.maxChannelsF01HE * hcal::compute_stride<hcal::Flavor01>(
-      config_.nsamplesF01HE),
-    ctx.stream()
-  );
-  df01_.ids = cms::cuda::make_device_unique<uint32_t[]>(
-    config_.maxChannelsF01HE,
-    ctx.stream()
-  );
+      config_.maxChannelsF01HE * hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE), ctx.stream());
+  df01_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF01HE, ctx.stream());
 
   // flavor3 get device blobs
-  df3_.data  = cms::cuda::make_device_unique<uint16_t[]>(
-    config_.maxChannelsF3HB * hcal::compute_stride<hcal::Flavor3>(
-      config_.nsamplesF3HB),
-    ctx.stream()
-  );
-  df3_.ids = cms::cuda::make_device_unique<uint32_t[]>(
-    config_.maxChannelsF3HB,
-    ctx.stream()
-  );
+  df3_.data = cms::cuda::make_device_unique<uint16_t[]>(
+      config_.maxChannelsF3HB * hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB), ctx.stream());
+  df3_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF3HB, ctx.stream());
 
   // flavor5 get device blobs
   df5_.data = cms::cuda::make_device_unique<uint16_t[]>(
-    config_.maxChannelsF5HB * hcal::compute_stride<hcal::Flavor5>(
-      config_.nsamplesF5HB),
-    ctx.stream()
-  );
-  df5_.ids = cms::cuda::make_device_unique<uint32_t[]>(
-    config_.maxChannelsF5HB,
-    ctx.stream()
-  );
-  df5_.npresamples = cms::cuda::make_device_unique<uint8_t[]>(
-    config_.maxChannelsF5HB,
-    ctx.stream()
-  );
+      config_.maxChannelsF5HB * hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB), ctx.stream());
+  df5_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF5HB, ctx.stream());
+  df5_.npresamples = cms::cuda::make_device_unique<uint8_t[]>(config_.maxChannelsF5HB, ctx.stream());
 
   for (auto const& hbhe : *hbheDigis) {
     auto const id = hbhe.id().rawId();
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
index ab479a36202b0..f1b5ef6885a04 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
@@ -83,8 +83,7 @@ HcalRawToDigiGPU::HcalRawToDigiGPU(const edm::ParameterSet& ps)
   config_.nsamplesF3HB = ps.getParameter<uint32_t>("nsamplesF3HB");
 }
 
-HcalRawToDigiGPU::~HcalRawToDigiGPU() {
-}
+HcalRawToDigiGPU::~HcalRawToDigiGPU() {}
 
 void HcalRawToDigiGPU::acquire(edm::Event const& event,
                                edm::EventSetup const& setup,
@@ -106,44 +105,23 @@ void HcalRawToDigiGPU::acquire(edm::Event const& event,
 
   // scratch
   hcal::raw::ScratchDataGPU scratchGPU = {
-    cms::cuda::make_device_unique<uint32_t[]>(
-      hcal::raw::numOutputCollections,
-      ctx.stream())
-  };
+      cms::cuda::make_device_unique<uint32_t[]>(hcal::raw::numOutputCollections, ctx.stream())};
 
   // input cpu data
-  hcal::raw::InputDataCPU inputCPU = {
-    cms::cuda::make_host_unique<unsigned char[]>(
-      hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max,
-      ctx.stream()),
-    cms::cuda::make_host_unique<uint32_t[]>(
-      hcal::raw::utca_nfeds_max,
-      ctx.stream()),
-    cms::cuda::make_host_unique<int[]>(
-      hcal::raw::utca_nfeds_max,
-      ctx.stream())
-  };
+  hcal::raw::InputDataCPU inputCPU = {cms::cuda::make_host_unique<unsigned char[]>(
+                                          hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max, ctx.stream()),
+                                      cms::cuda::make_host_unique<uint32_t[]>(hcal::raw::utca_nfeds_max, ctx.stream()),
+                                      cms::cuda::make_host_unique<int[]>(hcal::raw::utca_nfeds_max, ctx.stream())};
 
   // input data gpu
   hcal::raw::InputDataGPU inputGPU = {
-    cms::cuda::make_device_unique<unsigned char[]>(
-      hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max,
-      ctx.stream()),
-    cms::cuda::make_device_unique<uint32_t[]>(
-      hcal::raw::utca_nfeds_max,
-      ctx.stream()),
-    cms::cuda::make_device_unique<int[]>(
-      hcal::raw::utca_nfeds_max,
-      ctx.stream())
-  };
+      cms::cuda::make_device_unique<unsigned char[]>(hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max,
+                                                     ctx.stream()),
+      cms::cuda::make_device_unique<uint32_t[]>(hcal::raw::utca_nfeds_max, ctx.stream()),
+      cms::cuda::make_device_unique<int[]>(hcal::raw::utca_nfeds_max, ctx.stream())};
 
   // output cpu
-  outputCPU_ = {
-    cms::cuda::make_host_unique<uint32_t[]>(
-      hcal::raw::numOutputCollections,
-      ctx.stream()
-    )
-  };
+  outputCPU_ = {cms::cuda::make_host_unique<uint32_t[]>(hcal::raw::numOutputCollections, ctx.stream())};
 
   // output gpu
   outputGPU_.allocate(config_, ctx.stream());
@@ -214,7 +192,7 @@ void HcalRawToDigiGPU::produce(edm::Event& event, edm::EventSetup const& setup)
   ctx.emplace(event, digisF01HEToken_, std::move(outputGPU_.digisF01HE));
   ctx.emplace(event, digisF5HBToken_, std::move(outputGPU_.digisF5HB));
   ctx.emplace(event, digisF3HBToken_, std::move(outputGPU_.digisF3HB));
-  
+
   // reset ptrs that are carried as members
   outputCPU_.nchannels.reset();
 }
diff --git a/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h b/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h
index 2481a80711d33..98ce9c0b660f1 100644
--- a/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h
+++ b/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h
@@ -22,9 +22,7 @@ class HcalMahiPulseOffsetsGPU {
   // will trigger deallocation of Product thru ~Product
   ~HcalMahiPulseOffsetsGPU() = default;
 
-  std::vector<int, cms::cuda::HostAllocator<int>> const& getValues() const {
-      return values_;
-  }
+  std::vector<int, cms::cuda::HostAllocator<int>> const& getValues() const { return values_; }
 
   // get device pointers
   Product const& getProduct(cudaStream_t) const;
diff --git a/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc b/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc
index 3f5cdbe5f15ca..005a77932f6df 100644
--- a/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc
+++ b/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc
@@ -4,11 +4,10 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 // FIXME: add proper getters to conditions
-HcalMahiPulseOffsetsGPU::HcalMahiPulseOffsetsGPU(edm::ParameterSet const& ps) 
-{
-    auto const& values = ps.getParameter<std::vector<int>>("pulseOffsets");
-    values_.resize(values.size());
-    std::copy(values.begin(), values.end(), values_.begin());
+HcalMahiPulseOffsetsGPU::HcalMahiPulseOffsetsGPU(edm::ParameterSet const& ps) {
+  auto const& values = ps.getParameter<std::vector<int>>("pulseOffsets");
+  values_.resize(values.size());
+  std::copy(values.begin(), values.end(), values_.begin());
 }
 
 HcalMahiPulseOffsetsGPU::Product::~Product() {
@@ -17,8 +16,8 @@ HcalMahiPulseOffsetsGPU::Product::~Product() {
 }
 
 HcalMahiPulseOffsetsGPU::Product const& HcalMahiPulseOffsetsGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product =
-      product_.dataForCurrentDeviceAsync(cudaStream, [this](HcalMahiPulseOffsetsGPU::Product& product, cudaStream_t cudaStream) {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](HcalMahiPulseOffsetsGPU::Product& product, cudaStream_t cudaStream) {
         // malloc
         cudaCheck(cudaMalloc((void**)&product.values, this->values_.size() * sizeof(int)));
 
diff --git a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
index 1486e27557285..38503cec7e76f 100644
--- a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
+++ b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
@@ -85,22 +85,16 @@ namespace hcal {
       RecHitCollection<::calo::common::DevStoragePolicy> recHits;
 
       void allocate(ConfigParameters const& config, cudaStream_t cudaStream) {
-        recHits.energy = cms::cuda::make_device_unique<float[]>(
-          config.maxChannels, cudaStream);
-        recHits.chi2 = cms::cuda::make_device_unique<float[]>(
-          config.maxChannels, cudaStream);
-        recHits.energyM0 = cms::cuda::make_device_unique<float[]>(
-          config.maxChannels, cudaStream);
-        recHits.timeM0 = cms::cuda::make_device_unique<float[]>(
-          config.maxChannels, cudaStream);
-        recHits.did = cms::cuda::make_device_unique<uint32_t[]>(
-          config.maxChannels, cudaStream);
+        recHits.energy = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
+        recHits.chi2 = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
+        recHits.energyM0 = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
+        recHits.timeM0 = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
+        recHits.did = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannels, cudaStream);
       }
     };
 
     struct ScratchDataGPU {
-      cms::cuda::device::unique_ptr<float[]> amplitudes, noiseTerms,
-          pulseMatrices, pulseMatricesM, pulseMatricesP;
+      cms::cuda::device::unique_ptr<float[]> amplitudes, noiseTerms, pulseMatrices, pulseMatricesM, pulseMatricesP;
       cms::cuda::device::unique_ptr<int8_t[]> soiSamples;
     };
 
diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
index e0294d68f0edd..ea51d54822e11 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -242,23 +242,20 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
 
   // scratch mem on device
   hcal::mahi::ScratchDataGPU scratchGPU = {
-    cms::cuda::make_device_unique<float[]>(
-      configParameters_.maxChannels * configParameters_.maxTimeSamples,
-      ctx.stream()
-    ),
-    cms::cuda::make_device_unique<float[]>(
-      configParameters_.maxChannels * configParameters_.maxTimeSamples, ctx.stream()),
-    cms::cuda::make_device_unique<float[]>(
-      configParameters_.maxChannels*configParameters_.maxTimeSamples*configParameters_.maxTimeSamples, 
-      ctx.stream()),
-    cms::cuda::make_device_unique<float[]>(
-      configParameters_.maxChannels*configParameters_.maxTimeSamples*configParameters_.maxTimeSamples, 
-      ctx.stream()),
-    cms::cuda::make_device_unique<float[]>(
-      configParameters_.maxChannels*configParameters_.maxTimeSamples*configParameters_.maxTimeSamples, 
-      ctx.stream()),
-    cms::cuda::make_device_unique<int8_t[]>(
-      configParameters_.maxChannels, ctx.stream()),
+      cms::cuda::make_device_unique<float[]>(configParameters_.maxChannels * configParameters_.maxTimeSamples,
+                                             ctx.stream()),
+      cms::cuda::make_device_unique<float[]>(configParameters_.maxChannels * configParameters_.maxTimeSamples,
+                                             ctx.stream()),
+      cms::cuda::make_device_unique<float[]>(
+          configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples,
+          ctx.stream()),
+      cms::cuda::make_device_unique<float[]>(
+          configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples,
+          ctx.stream()),
+      cms::cuda::make_device_unique<float[]>(
+          configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples,
+          ctx.stream()),
+      cms::cuda::make_device_unique<int8_t[]>(configParameters_.maxChannels, ctx.stream()),
   };
 
   // output dev mem
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
index 9500f62ea869f..c31781078d711 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
@@ -19,49 +19,43 @@
 
 #include "FWCore/Framework/interface/SourceFactory.h"
 
-class HcalMahiPulseOffsetsGPUESProducer 
-        : public edm::ESProducer, public edm::EventSetupRecordIntervalFinder {
+class HcalMahiPulseOffsetsGPUESProducer : public edm::ESProducer, public edm::EventSetupRecordIntervalFinder {
 public:
-    HcalMahiPulseOffsetsGPUESProducer(edm::ParameterSet const&);
-    ~HcalMahiPulseOffsetsGPUESProducer() override = default;
+  HcalMahiPulseOffsetsGPUESProducer(edm::ParameterSet const&);
+  ~HcalMahiPulseOffsetsGPUESProducer() override = default;
 
-    static void fillDescriptions(edm::ConfigurationDescriptions&);
-    std::unique_ptr<HcalMahiPulseOffsetsGPU> produce(HcalMahiPulseOffsetsGPURecord const&);
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+  std::unique_ptr<HcalMahiPulseOffsetsGPU> produce(HcalMahiPulseOffsetsGPURecord const&);
 
 protected:
-    void setIntervalFor(const edm::eventsetup::EventSetupRecordKey&,
-                        const edm::IOVSyncValue&,
-                        edm::ValidityInterval&) override;
+  void setIntervalFor(const edm::eventsetup::EventSetupRecordKey&,
+                      const edm::IOVSyncValue&,
+                      edm::ValidityInterval&) override;
 
 private:
-    edm::ParameterSet const& pset_;
+  edm::ParameterSet const& pset_;
 };
 
-HcalMahiPulseOffsetsGPUESProducer::HcalMahiPulseOffsetsGPUESProducer(
-        edm::ParameterSet const& pset) : pset_{pset}
-{
-    setWhatProduced(this);
-    findingRecord<HcalMahiPulseOffsetsGPURecord>();
+HcalMahiPulseOffsetsGPUESProducer::HcalMahiPulseOffsetsGPUESProducer(edm::ParameterSet const& pset) : pset_{pset} {
+  setWhatProduced(this);
+  findingRecord<HcalMahiPulseOffsetsGPURecord>();
 }
 
-void HcalMahiPulseOffsetsGPUESProducer::setIntervalFor(
-        const edm::eventsetup::EventSetupRecordKey& iKey,
-        const edm::IOVSyncValue& iTime,
-        edm::ValidityInterval& oInterval) {
-    oInterval = edm::ValidityInterval(
-        edm::IOVSyncValue::beginOfTime(), edm::IOVSyncValue::endOfTime());
+void HcalMahiPulseOffsetsGPUESProducer::setIntervalFor(const edm::eventsetup::EventSetupRecordKey& iKey,
+                                                       const edm::IOVSyncValue& iTime,
+                                                       edm::ValidityInterval& oInterval) {
+  oInterval = edm::ValidityInterval(edm::IOVSyncValue::beginOfTime(), edm::IOVSyncValue::endOfTime());
 }
 
-void HcalMahiPulseOffsetsGPUESProducer::fillDescriptions(
-        edm::ConfigurationDescriptions& desc) {
-    edm::ParameterSetDescription d;
-    d.add<std::vector<int>>("pulseOffsets", {-3, -2, -1, 0, 1, 2, 3, 4});
-    desc.addWithDefaultLabel(d);
+void HcalMahiPulseOffsetsGPUESProducer::fillDescriptions(edm::ConfigurationDescriptions& desc) {
+  edm::ParameterSetDescription d;
+  d.add<std::vector<int>>("pulseOffsets", {-3, -2, -1, 0, 1, 2, 3, 4});
+  desc.addWithDefaultLabel(d);
 }
 
 std::unique_ptr<HcalMahiPulseOffsetsGPU> HcalMahiPulseOffsetsGPUESProducer::produce(
-        HcalMahiPulseOffsetsGPURecord const&) {
-    return std::make_unique<HcalMahiPulseOffsetsGPU>(pset_);
+    HcalMahiPulseOffsetsGPURecord const&) {
+  return std::make_unique<HcalMahiPulseOffsetsGPU>(pset_);
 }
 
 DEFINE_FWK_EVENTSETUP_SOURCE(HcalMahiPulseOffsetsGPUESProducer);

From f95560a5c33bf663ea9d0a76e33f7d57782ea264 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sun, 12 Jul 2020 23:24:43 +0200
Subject: [PATCH 18/34] Synchronise with CMSSW_11_2_0_pre2

---
 .../python/hcalGlobalReco_cff.py              | 15 +++++-
 .../Configuration/python/hcalLocalReco_cff.py | 53 ++++++++++---------
 2 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
index a94a032602713..fbb4c53f9f28b 100644
--- a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
@@ -4,7 +4,18 @@
 hcalGlobalRecoTask = cms.Task(hbhereco)
 hcalGlobalRecoSequence = cms.Sequence(hcalGlobalRecoTask)
 
+#--- for Run 3 and later
+from Configuration.Eras.Modifier_run3_HB_cff import run3_HB
+
 from RecoLocalCalo.HcalRecProducers.HBHEPhase1Reconstructor_cfi import hbheprereco as _phase1_hbheprereco
+run3_HB.toReplaceWith(hbhereco, _phase1_hbheprereco)
+
+#--- for Run 3 on GPU
+from Configuration.ProcessModifiers.gpu_cff import gpu
 
-from Configuration.Eras.Modifier_phase2_hcal_cff import phase2_hcal
-phase2_hcal.toReplaceWith( hbhereco, _phase1_hbheprereco )
+from RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi import hcalCPURecHitsProducer as _hcalCPURecHitsProducer
+gpu.toReplaceWith(hbhereco, _hcalCPURecHitsProducer.clone(
+    recHitsM0LabelIn = "hbheRecHitProducerGPU",
+    recHitsM0LabelOut = "",
+    recHitsLegacyLabelOut = ""
+))
diff --git a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
index 2eff3a1eb5e40..a7bdce3b916af 100644
--- a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
@@ -1,33 +1,32 @@
 import FWCore.ParameterSet.Config as cms
 
 from RecoLocalCalo.HcalRecAlgos.hcalRecAlgoESProd_cfi import *
+from RecoLocalCalo.HcalRecAlgos.hcalChannelPropertiesESProd_cfi import *
 hcalOOTPileupESProducer = cms.ESProducer('OOTPileupDBCompatibilityESProducer')
 
 from RecoLocalCalo.HcalRecProducers.HBHEPhase1Reconstructor_cfi import hbheprereco as _phase1_hbheprereco
 hbheprereco = _phase1_hbheprereco.clone(
-    processQIE11 = cms.bool(False),
-    tsFromDB = cms.bool(True),
+    processQIE11 = False,
+    tsFromDB = True,
     pulseShapeParametersQIE8 = dict(
-        TrianglePeakTS = cms.uint32(4),
+        TrianglePeakTS = 4,
     )
 )
 
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_ho_cfi import *
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_hf_cfi import *
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_zdc_cfi import *
-hcalLocalRecoTask = cms.Task(hbheprereco,hfreco,horeco,zdcreco)
+hcalLocalRecoTask = cms.Task(hbheprereco, hfreco, horeco, zdcreco)
 hcalLocalRecoSequence = cms.Sequence(hcalLocalRecoTask)
 
 from RecoLocalCalo.HcalRecProducers.hfprereco_cfi import hfprereco
 from RecoLocalCalo.HcalRecProducers.HFPhase1Reconstructor_cfi import hfreco as _phase1_hfreco
 from RecoLocalCalo.HcalRecProducers.hbheplan1_cfi import hbheplan1
 
-#--- for HCALonly wf
-hcalOnlyLocalRecoTask = cms.Task(hbheprereco,hfprereco,hfreco,horeco)
-
-# copy for cosmics
+#--- for cosmics
 _default_hfreco = hfreco.clone()
 
+#--- for Phase 1
 _phase1_hcalLocalRecoTask = hcalLocalRecoTask.copy()
 _phase1_hcalLocalRecoTask.add(hfprereco)
 
@@ -35,7 +34,7 @@
 run2_HF_2017.toReplaceWith( hcalLocalRecoTask, _phase1_hcalLocalRecoTask )
 run2_HF_2017.toReplaceWith( hfreco, _phase1_hfreco )
 from Configuration.Eras.Modifier_run2_HCAL_2017_cff import run2_HCAL_2017
-run2_HCAL_2017.toReplaceWith( hbheprereco, _phase1_hbheprereco )
+run2_HCAL_2017.toReplaceWith(hbheprereco, _phase1_hbheprereco)
 
 _plan1_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy()
 _plan1_hcalLocalRecoTask.add(hbheplan1)
@@ -48,17 +47,30 @@
 from Configuration.ProcessModifiers.run2_HECollapse_2018_cff import run2_HECollapse_2018
 run2_HECollapse_2018.toReplaceWith(hcalLocalRecoTask, _collapse_hcalLocalRecoTask)
 
-# Run 3 HCAL workflow on GPU
+#--- for Run 3 and later
+_run3_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy()
+_run3_hcalLocalRecoTask.remove(hbheprereco)
+from Configuration.Eras.Modifier_run3_HB_cff import run3_HB
+run3_HB.toReplaceWith(hcalLocalRecoTask, _run3_hcalLocalRecoTask)
+
+#--- for Run 3 on GPU
 from Configuration.ProcessModifiers.gpu_cff import gpu
 
 from RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPUTask_cff import *
-_hcalLocalRecoTaskGPU = hcalLocalRecoTask.copy()
-_hcalLocalRecoTaskGPU.add(hbheRecHitProducerGPUTask)
-gpu.toReplaceWith(hcalLocalRecoTask, _hcalLocalRecoTaskGPU)
+_run3_hcalLocalRecoGPUTask = _run3_hcalLocalRecoTask.copy()
+_run3_hcalLocalRecoGPUTask.add(hbheRecHitProducerGPUTask)
+gpu.toReplaceWith(hcalLocalRecoTask, _run3_hcalLocalRecoGPUTask)
+
+#--- HCAL-only workflow for Run 3
+# FIXME rename `hbheprereco` to `hbhereco` and use it from hcalGlobalRecoTask
+hcalOnlyLocalRecoTask = cms.Task(hbheprereco, hfprereco, hfreco, horeco)
+
+#--- HCAL-only workflow for Run 3 on GPU
+from Configuration.ProcessModifiers.gpu_cff import gpu
 
-_hcalOnlyLocalRecoTaskGPU = hcalOnlyLocalRecoTask.copy()
-_hcalOnlyLocalRecoTaskGPU.add(hbheRecHitProducerGPUTask)
-gpu.toReplaceWith(hcalOnlyLocalRecoTask, _hcalOnlyLocalRecoTaskGPU)
+_hcalOnlyLocalRecoGPUTask = hcalOnlyLocalRecoTask.copy()
+_hcalOnlyLocalRecoGPUTask.add(hbheRecHitProducerGPUTask)
+gpu.toReplaceWith(hcalOnlyLocalRecoTask, _hcalOnlyLocalRecoGPUTask)
 
 from RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi import hcalCPURecHitsProducer as _hcalCPURecHitsProducer
 gpu.toReplaceWith(hbheprereco, _hcalCPURecHitsProducer.clone(
@@ -66,15 +78,8 @@
     recHitsM0LabelOut = "",
     recHitsLegacyLabelOut = ""
 ))
-#---
-
-_phase2_hcalLocalRecoTask = hcalLocalRecoTask.copy()
-_phase2_hcalLocalRecoTask.remove(hbheprereco)
-
-from Configuration.Eras.Modifier_phase2_hcal_cff import phase2_hcal
-phase2_hcal.toReplaceWith( hcalLocalRecoTask, _phase2_hcalLocalRecoTask )
-
 
+#--- for FastSim
 _fastSim_hcalLocalRecoTask = hcalLocalRecoTask.copyAndExclude([zdcreco])
 from Configuration.Eras.Modifier_fastSim_cff import fastSim
 fastSim.toReplaceWith( hcalLocalRecoTask, _fastSim_hcalLocalRecoTask )

From 4858247e7e022d890930a5269fe7747608f511be Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 30 Jul 2020 21:21:01 +0200
Subject: [PATCH 19/34] Refactor common ECAL and HCAL code (cms-patatrack#523)

Move duplicated Eigen code to a common file, and use it for both ECAL and HCAL.
Move HCAL general reconstruction code from the hcal::multifit to the hcal::reconstruction namespace.
---
 .../HcalRecProducers/src/KernelHelpers.h      | 236 +++++++
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 588 ++----------------
 2 files changed, 301 insertions(+), 523 deletions(-)
 create mode 100644 RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h

diff --git a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
new file mode 100644
index 0000000000000..72f369d99060e
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
@@ -0,0 +1,236 @@
+#ifndef RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
+#define RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
+
+#include "DeclsForKernels.h"
+#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
+
+// nvcc is not able to parse this header (or whatever is included from it)
+//#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h"
+
+namespace hcal {
+  namespace reconstruction {
+
+
+    constexpr int32_t IPHI_MAX = 72;
+
+
+    // this is from HcalTimeSlew: delay = tzero + slope * log(fC), clamped to [0, tmax].
+    // The HcalTimeSlew values come in from an ESProducer that takes them
+    // from a python config; see DeclsForKernels for more explanation.
+    __forceinline__ __device__ float compute_time_slew_delay(float const fC,
+                                                             float const tzero,
+                                                             float const slope,
+                                                             float const tmax) {
+      auto const rawDelay = tzero + slope * std::log(fC);  // logarithmic in fC
+      return rawDelay < 0 ? 0 : (rawDelay > tmax ? tmax : rawDelay);  // clamp to [0, tmax]
+    }
+
+    // HcalQIEShapes are hardcoded in HcalQIEData.cc basically
+    // + some logic to generate 128 and 256 value arrays...
+    __constant__ float const qie8shape[129] = {
+        -1,   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,   14,   16,
+        18,   20,   22,   24,   26,   28,   31,   34,   37,   40,   44,   48,   52,   57,   62,   57,   62,
+        67,   72,   77,   82,   87,   92,   97,   102,  107,  112,  117,  122,  127,  132,  142,  152,  162,
+        172,  182,  192,  202,  217,  232,  247,  262,  282,  302,  322,  347,  372,  347,  372,  397,  422,
+        447,  472,  497,  522,  547,  572,  597,  622,  647,  672,  697,  722,  772,  822,  872,  922,  972,
+        1022, 1072, 1147, 1222, 1297, 1372, 1472, 1572, 1672, 1797, 1922, 1797, 1922, 2047, 2172, 2297, 2422,
+        2547, 2672, 2797, 2922, 3047, 3172, 3297, 3422, 3547, 3672, 3922, 4172, 4422, 4672, 4922, 5172, 5422,
+        5797, 6172, 6547, 6922, 7422, 7922, 8422, 9047, 9672, 10297};
+
+    __constant__ float const qie11shape[257] = {
+        -0.5,    0.5,     1.5,     2.5,     3.5,     4.5,     5.5,     6.5,     7.5,     8.5,     9.5,     10.5,
+        11.5,    12.5,    13.5,    14.5,    15.5,    17.5,    19.5,    21.5,    23.5,    25.5,    27.5,    29.5,
+        31.5,    33.5,    35.5,    37.5,    39.5,    41.5,    43.5,    45.5,    47.5,    49.5,    51.5,    53.5,
+        55.5,    59.5,    63.5,    67.5,    71.5,    75.5,    79.5,    83.5,    87.5,    91.5,    95.5,    99.5,
+        103.5,   107.5,   111.5,   115.5,   119.5,   123.5,   127.5,   131.5,   135.5,   139.5,   147.5,   155.5,
+        163.5,   171.5,   179.5,   187.5,   171.5,   179.5,   187.5,   195.5,   203.5,   211.5,   219.5,   227.5,
+        235.5,   243.5,   251.5,   259.5,   267.5,   275.5,   283.5,   291.5,   299.5,   315.5,   331.5,   347.5,
+        363.5,   379.5,   395.5,   411.5,   427.5,   443.5,   459.5,   475.5,   491.5,   507.5,   523.5,   539.5,
+        555.5,   571.5,   587.5,   603.5,   619.5,   651.5,   683.5,   715.5,   747.5,   779.5,   811.5,   843.5,
+        875.5,   907.5,   939.5,   971.5,   1003.5,  1035.5,  1067.5,  1099.5,  1131.5,  1163.5,  1195.5,  1227.5,
+        1259.5,  1291.5,  1355.5,  1419.5,  1483.5,  1547.5,  1611.5,  1675.5,  1547.5,  1611.5,  1675.5,  1739.5,
+        1803.5,  1867.5,  1931.5,  1995.5,  2059.5,  2123.5,  2187.5,  2251.5,  2315.5,  2379.5,  2443.5,  2507.5,
+        2571.5,  2699.5,  2827.5,  2955.5,  3083.5,  3211.5,  3339.5,  3467.5,  3595.5,  3723.5,  3851.5,  3979.5,
+        4107.5,  4235.5,  4363.5,  4491.5,  4619.5,  4747.5,  4875.5,  5003.5,  5131.5,  5387.5,  5643.5,  5899.5,
+        6155.5,  6411.5,  6667.5,  6923.5,  7179.5,  7435.5,  7691.5,  7947.5,  8203.5,  8459.5,  8715.5,  8971.5,
+        9227.5,  9483.5,  9739.5,  9995.5,  10251.5, 10507.5, 11019.5, 11531.5, 12043.5, 12555.5, 13067.5, 13579.5,
+        12555.5, 13067.5, 13579.5, 14091.5, 14603.5, 15115.5, 15627.5, 16139.5, 16651.5, 17163.5, 17675.5, 18187.5,
+        18699.5, 19211.5, 19723.5, 20235.5, 20747.5, 21771.5, 22795.5, 23819.5, 24843.5, 25867.5, 26891.5, 27915.5,
+        28939.5, 29963.5, 30987.5, 32011.5, 33035.5, 34059.5, 35083.5, 36107.5, 37131.5, 38155.5, 39179.5, 40203.5,
+        41227.5, 43275.5, 45323.5, 47371.5, 49419.5, 51467.5, 53515.5, 55563.5, 57611.5, 59659.5, 61707.5, 63755.5,
+        65803.5, 67851.5, 69899.5, 71947.5, 73995.5, 76043.5, 78091.5, 80139.5, 82187.5, 84235.5, 88331.5, 92427.5,
+        96523.5, 100620,  104716,  108812,  112908};
+
+    // Conditions are transferred once per IOV
+    // Access is performed based on the det id which is converted to a linear index
+    // 2 funcs below are taken from HcalTopology (reimplemented here).
+    // Inputs are constants that are also taken from HcalTopology
+    // but passed to the kernel as arguments using the HcalTopology itself
+    //    constexpr int32_t IPHI_MAX = 72;
+
+    __forceinline__ __device__ uint32_t did2linearIndexHB(
+        uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) {
+      HcalDetId did{didraw};
+      uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1);
+      return did.ieta() > 0 ? value + maxDepthHB * hcal::reconstruction::IPHI_MAX * (did.ieta() - firstHBRing)
+	: value + maxDepthHB * hcal::reconstruction::IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);
+    }
+
+    __forceinline__ __device__ uint32_t did2linearIndexHE(uint32_t const didraw,
+                                                          int const maxDepthHE,
+                                                          int const maxPhiHE,
+                                                          int const firstHERing,
+                                                          int const lastHERing,
+                                                          int const nEtaHE) {
+      HcalDetId did{didraw};
+      uint32_t const value = (did.depth() - 1) + maxDepthHE * (did.iphi() - 1);
+      return did.ieta() > 0 ? value + maxDepthHE * maxPhiHE * (did.ieta() - firstHERing)
+                            : value + maxDepthHE * maxPhiHE * (did.ieta() + lastHERing + nEtaHE);
+    }
+
+    __forceinline__ __device__ uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range) {
+      return capid * 4 + range;  // flat index: 4 consecutive slots per capid, one per range
+    }
+
+    __forceinline__ __device__ float compute_reco_correction_factor(float const par1,
+                                                                    float const par2,
+                                                                    float const par3,
+                                                                    float const x) {
+      return par3 * x * x + par2 * x + par1;  // quadratic in x: par1 + par2 * x + par3 * x^2
+    }
+
+    // charge from adc: bin center taken from the qie shape for this qie type, then (center - offset) / slope
+    __forceinline__ __device__ float compute_coder_charge(
+        int const qieType, uint8_t const adc, uint8_t const capid, float const* qieOffsets, float const* qieSlopes) {
+      auto const range = qieType == 0 ? (adc >> 5) & 0x3 : (adc >> 6) & 0x3;  // 2 range bits above the mantissa
+      auto const* qieShapeToUse = qieType == 0 ? qie8shape : qie11shape;  // NOTE(review): qie8shape has 129 entries; assumes adc < 128 for qieType 0 - confirm
+      auto const nbins = qieType == 0 ? 32 : 64;  // bins per range
+      auto const center = adc % nbins == nbins - 1 ? 0.5 * (3 * qieShapeToUse[adc] - qieShapeToUse[adc - 1])  // last bin of a range: extrapolate
+                                                   : 0.5 * (qieShapeToUse[adc] + qieShapeToUse[adc + 1]);  // otherwise: midpoint of [adc, adc + 1]
+      auto const index = get_qiecoder_index(capid, range);  // offsets and slopes are looked up per (capid, range)
+      return (center - qieOffsets[index]) / qieSlopes[index];
+    }
+
+    __forceinline__ __device__ float compute_diff_charge_gain(int const qieType,
+                                                              uint8_t adc,
+                                                              uint8_t const capid,
+                                                              float const* qieOffsets,
+                                                              float const* qieSlopes,
+                                                              bool const isqie11) {
+      constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
+      constexpr uint32_t mantissaMaskQIE11 = 0x3f;
+      auto const mantissaMask = isqie11 ? mantissaMaskQIE11 : mantissaMaskQIE8;
+      auto const q = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
+      auto const mantissa = adc & mantissaMask;
+
+      if (mantissa == 0u || mantissa == mantissaMask - 1u)
+        return compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes) - q;
+      else if (mantissa == 1u || mantissa == mantissaMask)
+        return q - compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
+      else {
+        auto const qup = compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes);
+        auto const qdown = compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
+        auto const upgain = qup - q;
+        auto const downgain = q - qdown;
+        auto const averagegain = (qup - qdown) / 2.f;
+        if (std::abs(upgain - downgain) < 0.01f * averagegain)
+          return averagegain;
+        else {
+          auto const q2up = compute_coder_charge(qieType, adc + 2u, capid, qieOffsets, qieSlopes);
+          auto const q2down = compute_coder_charge(qieType, adc - 2u, capid, qieOffsets, qieSlopes);
+          auto const upgain2 = q2up - qup;
+          auto const downgain2 = qdown - q2down;
+          if (std::abs(upgain2 - upgain) < std::abs(downgain2 - downgain))
+            return upgain;
+          else
+            return downgain;
+        }
+      }
+    }
+
+    // FIXME remove duplication...
+    // this is from PulseShapeFunctor; nvcc was complaining when that header was included...
+    //constexpr int maxSamples = 10;
+    constexpr int maxPSshapeBin = 256;
+    constexpr int nsPerBX = 25;
+    constexpr float iniTimeShift = 92.5f;
+
+
+    // TODO: remove what's not needed
+    __forceinline__ __device__ float compute_pulse_shape_value(float const pulse_time,
+                                                               int const sample,
+                                                               int const shift,
+                                                               float const* acc25nsVec,
+                                                               float const* diff25nsItvlVec,
+                                                               float const* accVarLenIdxMinusOneVec,
+                                                               float const* diffVarItvlIdxMinusOneVec,
+                                                               float const* accVarLenIdxZeroVec,
+                                                               float const* diffVarItvlIdxZeroVec) {
+      // constants
+      constexpr float pulse_height = 1.0f;
+      constexpr float slew = 0.f;
+      constexpr auto ns_per_bx = nsPerBX;
+      //constexpr auto num_ns = nsPerBX * maxSamples;
+      //constexpr auto num_bx = num_ns / ns_per_bx;
+
+      // FIXME: clean up all the rounding... this is coming from original cpu version
+      float const i_start_float =
+	-iniTimeShift - pulse_time - slew > 0.f ? 0.f : std::abs(-iniTimeShift - pulse_time - slew) + 1.f;
+      int i_start = static_cast<int>(i_start_float);
+      float offset_start = static_cast<float>(i_start) - iniTimeShift - pulse_time - slew;
+      // FIXME: do we need a check for nan???
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (shift == 0)
+        printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start);
+#endif
+
+      // boundary
+      if (offset_start == 1.0f) {
+        offset_start = 0.f;
+        i_start -= 1;
+      }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (shift == 0)
+        printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start);
+#endif
+
+      int const bin_start = static_cast<int>(offset_start);
+      auto const bin_start_up = static_cast<float>(bin_start) + 0.5f;
+      int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start;
+      int const its_start = i_start / ns_per_bx;
+      int const distTo25ns_start = nsPerBX - 1 - i_start % ns_per_bx;
+      auto const factor = offset_start - static_cast<float>(bin_0_start) - 0.5;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (shift == 0) {
+        printf("bin_start = %d bin_0_start = %d its_start = %d distTo25ns_start = %d factor = %f\n",
+               bin_start,
+               bin_0_start,
+               its_start,
+               distTo25ns_start,
+               factor);
+      }
+#endif
+
+      auto const sample_over10ts = sample + shift;
+      float value = 0.0f;
+      if (sample_over10ts == its_start) {
+        value = bin_0_start == -1
+                    ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start]
+	  : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
+      } else if (sample_over10ts > its_start) {
+        int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
+        value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
+      }
+      value *= pulse_height;
+      return value;
+    }
+
+
+
+  }  // namespace reconstruction
+}  // namespace hcal
+
+
+#endif  // RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index 05ddb83ae540a..08fc726c4c691 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -1,6 +1,7 @@
 #include <Eigen/Dense>
 
 #include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
+#include "DataFormats/Math/interface/EigenComputations.h"
 
 // nvcc not able to parse this guy (whatever is inlcuded from it)....
 //#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h"
@@ -14,159 +15,6 @@
 namespace hcal {
   namespace mahi {
 
-    template <int NROWS, int NCOLS>
-    using ColMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::ColMajor>;
-
-    template <int NROWS, int NCOLS>
-    using RowMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::RowMajor>;
-
-    template <int SIZE, typename T = float>
-    using ColumnVector = Eigen::Matrix<T, SIZE, 1>;
-
-    template <int SIZE, typename T = float>
-    using RowVector = Eigen::Matrix<T, 1, SIZE>;
-
-    // FIXME remove duplication...
-    // this is from PulesFunctor. nvcc was complaining... if included that header...
-    //constexpr int maxSamples = 10;
-    constexpr int maxPSshapeBin = 256;
-    constexpr int nsPerBX = 25;
-    constexpr float iniTimeShift = 92.5f;
-
-    // this is from HcalTimeSlew.
-    // HcalTimeSlew are values that come in from ESProducer that takes them
-    // from a python config. see DeclsForKernels for more explanation
-    __forceinline__ __device__ float compute_time_slew_delay(float const fC,
-                                                             float const tzero,
-                                                             float const slope,
-                                                             float const tmax) {
-      auto const rawDelay = tzero + slope * std::log(fC);
-      return rawDelay < 0 ? 0 : (rawDelay > tmax ? tmax : rawDelay);
-    }
-
-    // HcalQIEShapes are hardcoded in HcalQIEData.cc basically
-    // + some logic to generate 128 and 256 value arrays...
-    __constant__ float const qie8shape[129] = {
-        -1,   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,   14,   16,
-        18,   20,   22,   24,   26,   28,   31,   34,   37,   40,   44,   48,   52,   57,   62,   57,   62,
-        67,   72,   77,   82,   87,   92,   97,   102,  107,  112,  117,  122,  127,  132,  142,  152,  162,
-        172,  182,  192,  202,  217,  232,  247,  262,  282,  302,  322,  347,  372,  347,  372,  397,  422,
-        447,  472,  497,  522,  547,  572,  597,  622,  647,  672,  697,  722,  772,  822,  872,  922,  972,
-        1022, 1072, 1147, 1222, 1297, 1372, 1472, 1572, 1672, 1797, 1922, 1797, 1922, 2047, 2172, 2297, 2422,
-        2547, 2672, 2797, 2922, 3047, 3172, 3297, 3422, 3547, 3672, 3922, 4172, 4422, 4672, 4922, 5172, 5422,
-        5797, 6172, 6547, 6922, 7422, 7922, 8422, 9047, 9672, 10297};
-
-    __constant__ float const qie11shape[257] = {
-        -0.5,    0.5,     1.5,     2.5,     3.5,     4.5,     5.5,     6.5,     7.5,     8.5,     9.5,     10.5,
-        11.5,    12.5,    13.5,    14.5,    15.5,    17.5,    19.5,    21.5,    23.5,    25.5,    27.5,    29.5,
-        31.5,    33.5,    35.5,    37.5,    39.5,    41.5,    43.5,    45.5,    47.5,    49.5,    51.5,    53.5,
-        55.5,    59.5,    63.5,    67.5,    71.5,    75.5,    79.5,    83.5,    87.5,    91.5,    95.5,    99.5,
-        103.5,   107.5,   111.5,   115.5,   119.5,   123.5,   127.5,   131.5,   135.5,   139.5,   147.5,   155.5,
-        163.5,   171.5,   179.5,   187.5,   171.5,   179.5,   187.5,   195.5,   203.5,   211.5,   219.5,   227.5,
-        235.5,   243.5,   251.5,   259.5,   267.5,   275.5,   283.5,   291.5,   299.5,   315.5,   331.5,   347.5,
-        363.5,   379.5,   395.5,   411.5,   427.5,   443.5,   459.5,   475.5,   491.5,   507.5,   523.5,   539.5,
-        555.5,   571.5,   587.5,   603.5,   619.5,   651.5,   683.5,   715.5,   747.5,   779.5,   811.5,   843.5,
-        875.5,   907.5,   939.5,   971.5,   1003.5,  1035.5,  1067.5,  1099.5,  1131.5,  1163.5,  1195.5,  1227.5,
-        1259.5,  1291.5,  1355.5,  1419.5,  1483.5,  1547.5,  1611.5,  1675.5,  1547.5,  1611.5,  1675.5,  1739.5,
-        1803.5,  1867.5,  1931.5,  1995.5,  2059.5,  2123.5,  2187.5,  2251.5,  2315.5,  2379.5,  2443.5,  2507.5,
-        2571.5,  2699.5,  2827.5,  2955.5,  3083.5,  3211.5,  3339.5,  3467.5,  3595.5,  3723.5,  3851.5,  3979.5,
-        4107.5,  4235.5,  4363.5,  4491.5,  4619.5,  4747.5,  4875.5,  5003.5,  5131.5,  5387.5,  5643.5,  5899.5,
-        6155.5,  6411.5,  6667.5,  6923.5,  7179.5,  7435.5,  7691.5,  7947.5,  8203.5,  8459.5,  8715.5,  8971.5,
-        9227.5,  9483.5,  9739.5,  9995.5,  10251.5, 10507.5, 11019.5, 11531.5, 12043.5, 12555.5, 13067.5, 13579.5,
-        12555.5, 13067.5, 13579.5, 14091.5, 14603.5, 15115.5, 15627.5, 16139.5, 16651.5, 17163.5, 17675.5, 18187.5,
-        18699.5, 19211.5, 19723.5, 20235.5, 20747.5, 21771.5, 22795.5, 23819.5, 24843.5, 25867.5, 26891.5, 27915.5,
-        28939.5, 29963.5, 30987.5, 32011.5, 33035.5, 34059.5, 35083.5, 36107.5, 37131.5, 38155.5, 39179.5, 40203.5,
-        41227.5, 43275.5, 45323.5, 47371.5, 49419.5, 51467.5, 53515.5, 55563.5, 57611.5, 59659.5, 61707.5, 63755.5,
-        65803.5, 67851.5, 69899.5, 71947.5, 73995.5, 76043.5, 78091.5, 80139.5, 82187.5, 84235.5, 88331.5, 92427.5,
-        96523.5, 100620,  104716,  108812,  112908};
-
-    // Conditions are transferred once per IOV
-    // Access is performed based on the det id which is converted to a linear index
-    // 2 funcs below are taken from HcalTopology (reimplemented here).
-    // Inputs are constants that are also taken from HcalTopology
-    // but passed to the kernel as arguments using the HclaTopology itself
-    constexpr int32_t IPHI_MAX = 72;
-
-    __forceinline__ __device__ uint32_t did2linearIndexHB(
-        uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) {
-      HcalDetId did{didraw};
-      uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1);
-      return did.ieta() > 0 ? value + maxDepthHB * IPHI_MAX * (did.ieta() - firstHBRing)
-                            : value + maxDepthHB * IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);
-    }
-
-    __forceinline__ __device__ uint32_t did2linearIndexHE(uint32_t const didraw,
-                                                          int const maxDepthHE,
-                                                          int const maxPhiHE,
-                                                          int const firstHERing,
-                                                          int const lastHERing,
-                                                          int const nEtaHE) {
-      HcalDetId did{didraw};
-      uint32_t const value = (did.depth() - 1) + maxDepthHE * (did.iphi() - 1);
-      return did.ieta() > 0 ? value + maxDepthHE * maxPhiHE * (did.ieta() - firstHERing)
-                            : value + maxDepthHE * maxPhiHE * (did.ieta() + lastHERing + nEtaHE);
-    }
-
-    __forceinline__ __device__ uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range) {
-      return capid * 4 + range;
-    }
-
-    __forceinline__ __device__ float compute_reco_correction_factor(float const par1,
-                                                                    float const par2,
-                                                                    float const par3,
-                                                                    float const x) {
-      return par3 * x * x + par2 * x + par1;
-    }
-
-    // compute the charge using the adc, qie type and the appropriate qie shape array
-    __forceinline__ __device__ float compute_coder_charge(
-        int const qieType, uint8_t const adc, uint8_t const capid, float const* qieOffsets, float const* qieSlopes) {
-      auto const range = qieType == 0 ? (adc >> 5) & 0x3 : (adc >> 6) & 0x3;
-      auto const* qieShapeToUse = qieType == 0 ? qie8shape : qie11shape;
-      auto const nbins = qieType == 0 ? 32 : 64;
-      auto const center = adc % nbins == nbins - 1 ? 0.5 * (3 * qieShapeToUse[adc] - qieShapeToUse[adc - 1])
-                                                   : 0.5 * (qieShapeToUse[adc] + qieShapeToUse[adc + 1]);
-      auto const index = get_qiecoder_index(capid, range);
-      return (center - qieOffsets[index]) / qieSlopes[index];
-    }
-
-    __forceinline__ __device__ float compute_diff_charge_gain(int const qieType,
-                                                              uint8_t adc,
-                                                              uint8_t const capid,
-                                                              float const* qieOffsets,
-                                                              float const* qieSlopes,
-                                                              bool const isqie11) {
-      constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
-      constexpr uint32_t mantissaMaskQIE11 = 0x3f;
-      auto const mantissaMask = isqie11 ? mantissaMaskQIE11 : mantissaMaskQIE8;
-      auto const q = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
-      auto const mantissa = adc & mantissaMask;
-
-      if (mantissa == 0u || mantissa == mantissaMask - 1u)
-        return compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes) - q;
-      else if (mantissa == 1u || mantissa == mantissaMask)
-        return q - compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
-      else {
-        auto const qup = compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes);
-        auto const qdown = compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
-        auto const upgain = qup - q;
-        auto const downgain = q - qdown;
-        auto const averagegain = (qup - qdown) / 2.f;
-        if (std::abs(upgain - downgain) < 0.01f * averagegain)
-          return averagegain;
-        else {
-          auto const q2up = compute_coder_charge(qieType, adc + 2u, capid, qieOffsets, qieSlopes);
-          auto const q2down = compute_coder_charge(qieType, adc - 2u, capid, qieOffsets, qieSlopes);
-          auto const upgain2 = q2up - qup;
-          auto const downgain2 = qdown - q2down;
-          if (std::abs(upgain2 - upgain) < std::abs(downgain2 - downgain))
-            return upgain;
-          else
-            return downgain;
-        }
-      }
-    }
-
     // Assume: same number of samples for HB and HE
     // TODO: add/validate restrict (will increase #registers in use by the kernel)
     __global__ void kernel_prep1d_sameNumberOfSamples(float* amplitudes,
@@ -303,8 +151,8 @@ namespace hcal {
       // compute hash for this did
       auto const hashedId =
           did.subdetId() == HcalBarrel
-              ? did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
-              : did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
+              ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
 
       // conditions based on the hash
       // FIXME: remove hardcoded values
@@ -340,7 +188,7 @@ namespace hcal {
 #endif
 
       // compute charge
-      auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
+      auto const charge = hcal::reconstruction::compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
 
       shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal;
       if (gch < nchannelsf01HE) {
@@ -375,7 +223,7 @@ namespace hcal {
 #ifdef COMPUTE_TDC_TIME
       float tdcTime;
 #endif  // COMPUTE_TDC_TIME
-      auto const dfc = compute_diff_charge_gain(
+      auto const dfc = hcal::reconstruction::compute_diff_charge_gain(
           qieType, adc, capid, qieOffsets, qieSlopes, gch < nchannelsf01HE || gch >= nchannelsf015);
       if (gch >= nchannelsf01HE && gch < nchannelsf015) {
         // flavor 5
@@ -396,7 +244,7 @@ namespace hcal {
         for (auto ts = first; ts < last; ts++)
           sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesExpected + ts];
         auto const effectivePixelsFired = sipmq / fcByPE;
-        auto const factor = compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
+        auto const factor = hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
         rawCharge = (charge - pedestal) * factor + pedestal;
 #ifdef COMPUTE_TDC_TIME
         if (gch < nchannelsf01HE)
@@ -543,76 +391,6 @@ namespace hcal {
       noiseTermsForChannel[sample] = noiseTerm;
     }
 
-    // TODO: remove what's not needed
-    __forceinline__ __device__ float compute_pulse_shape_value(float const pulse_time,
-                                                               int const sample,
-                                                               int const shift,
-                                                               float const* acc25nsVec,
-                                                               float const* diff25nsItvlVec,
-                                                               float const* accVarLenIdxMinusOneVec,
-                                                               float const* diffVarItvlIdxMinusOneVec,
-                                                               float const* accVarLenIdxZeroVec,
-                                                               float const* diffVarItvlIdxZeroVec) {
-      // constants
-      constexpr float pulse_height = 1.0f;
-      constexpr float slew = 0.f;
-      constexpr auto ns_per_bx = nsPerBX;
-      //constexpr auto num_ns = nsPerBX * maxSamples;
-      //constexpr auto num_bx = num_ns / ns_per_bx;
-
-      // FIXME: clean up all the rounding... this is coming from original cpu version
-      float const i_start_float =
-          -iniTimeShift - pulse_time - slew > 0.f ? 0.f : std::abs(-iniTimeShift - pulse_time - slew) + 1.f;
-      int i_start = static_cast<int>(i_start_float);
-      float offset_start = static_cast<float>(i_start) - iniTimeShift - pulse_time - slew;
-      // FIXME: do we need a check for nan???
-#ifdef HCAL_MAHI_GPUDEBUG
-      if (shift == 0)
-        printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start);
-#endif
-
-      // boundary
-      if (offset_start == 1.0f) {
-        offset_start = 0.f;
-        i_start -= 1;
-      }
-
-#ifdef HCAL_MAHI_GPUDEBUG
-      if (shift == 0)
-        printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start);
-#endif
-
-      int const bin_start = static_cast<int>(offset_start);
-      auto const bin_start_up = static_cast<float>(bin_start) + 0.5f;
-      int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start;
-      int const its_start = i_start / ns_per_bx;
-      int const distTo25ns_start = nsPerBX - 1 - i_start % ns_per_bx;
-      auto const factor = offset_start - static_cast<float>(bin_0_start) - 0.5;
-
-#ifdef HCAL_MAHI_GPUDEBUG
-      if (shift == 0) {
-        printf("bin_start = %d bin_0_start = %d its_start = %d distTo25ns_start = %d factor = %f\n",
-               bin_start,
-               bin_0_start,
-               its_start,
-               distTo25ns_start,
-               factor);
-      }
-#endif
-
-      auto const sample_over10ts = sample + shift;
-      float value = 0.0f;
-      if (sample_over10ts == its_start) {
-        value = bin_0_start == -1
-                    ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start]
-                    : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
-      } else if (sample_over10ts > its_start) {
-        int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
-        value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
-      }
-      value *= pulse_height;
-      return value;
-    }
 
     // TODO: need to add an array of offsets for pulses (a la activeBXs...)
     // Assume for now 8 pulses
@@ -675,15 +453,15 @@ namespace hcal {
       auto const did = DetId{id};
       auto const hashedId =
           did.subdetId() == HcalBarrel
-              ? did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
-              : did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
+              ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
       auto const recoPulseShapeId = recoPulseShapeIds[hashedId];
-      auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * maxPSshapeBin;
-      auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * maxPSshapeBin;
-      auto const* accVarLenIdxMinusOneVec = accVarLenIdxMinusOneVecValues + recoPulseShapeId * nsPerBX;
-      auto const* diffVarItvlIdxMinusOneVec = diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * nsPerBX;
-      auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * nsPerBX;
-      auto const* diffVarItvlIdxZeroVec = diffVarItvlIdxZeroVecValues + recoPulseShapeId * nsPerBX;
+      auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * hcal::reconstruction::maxPSshapeBin;
+      auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * hcal::reconstruction::maxPSshapeBin;
+      auto const* accVarLenIdxMinusOneVec = accVarLenIdxMinusOneVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
+      auto const* diffVarItvlIdxMinusOneVec = diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
+      auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
+      auto const* diffVarItvlIdxZeroVec = diffVarItvlIdxZeroVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
 
       // offset output arrays
       auto* pulseMatrix = pulseMatrices + nsamples * npulses * gch;
@@ -732,9 +510,9 @@ namespace hcal {
       auto t0 = meanTime;
       if (applyTimeSlew) {
         if (amplitude <= 1.0f)
-          t0 += compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
+          t0 += hcal::reconstruction::compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
         else
-          t0 += compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
+          t0 += hcal::reconstruction::compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
       }
       auto const t0m = -deltaT + t0;
       auto const t0p = deltaT + t0;
@@ -746,7 +524,7 @@ namespace hcal {
 
       if (sample == 0 && ipulse == 0) {
         for (int i = 0; i < 10; i++) {
-          auto const value = compute_pulse_shape_value(t0,
+          auto const value = hcal::reconstruction::compute_pulse_shape_value(t0,
                                                        i,
                                                        0,
                                                        acc25nsVec,
@@ -759,7 +537,7 @@ namespace hcal {
         }
         printf("\n");
         for (int i = 0; i < 10; i++) {
-          auto const value = compute_pulse_shape_value(t0p,
+          auto const value = hcal::reconstruction::compute_pulse_shape_value(t0p,
                                                        i,
                                                        0,
                                                        acc25nsVec,
@@ -772,7 +550,7 @@ namespace hcal {
         }
         printf("\n");
         for (int i = 0; i < 10; i++) {
-          auto const value = compute_pulse_shape_value(t0m,
+          auto const value = hcal::reconstruction::compute_pulse_shape_value(t0m,
                                                        i,
                                                        0,
                                                        acc25nsVec,
@@ -793,7 +571,7 @@ namespace hcal {
       // auto const offset = ipulse - soi;
       // auto const idx = sample - offset;
       int32_t const idx = sample - pulseOffset;
-      auto const value = idx >= 0 && idx < nsamples ? compute_pulse_shape_value(t0,
+      auto const value = idx >= 0 && idx < nsamples ? hcal::reconstruction::compute_pulse_shape_value(t0,
                                                                                 idx,
                                                                                 shift,
                                                                                 acc25nsVec,
@@ -803,7 +581,7 @@ namespace hcal {
                                                                                 accVarLenIdxZeroVec,
                                                                                 diffVarItvlIdxZeroVec)
                                                     : 0;
-      auto const value_t0m = idx >= 0 && idx < nsamples ? compute_pulse_shape_value(t0m,
+      auto const value_t0m = idx >= 0 && idx < nsamples ? hcal::reconstruction::compute_pulse_shape_value(t0m,
                                                                                     idx,
                                                                                     shift,
                                                                                     acc25nsVec,
@@ -813,7 +591,7 @@ namespace hcal {
                                                                                     accVarLenIdxZeroVec,
                                                                                     diffVarItvlIdxZeroVec)
                                                         : 0;
-      auto const value_t0p = idx >= 0 && idx < nsamples ? compute_pulse_shape_value(t0p,
+      auto const value_t0p = idx >= 0 && idx < nsamples ? hcal::reconstruction::compute_pulse_shape_value(t0p,
                                                                                     idx,
                                                                                     shift,
                                                                                     acc25nsVec,
@@ -831,242 +609,6 @@ namespace hcal {
       pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
     }
 
-    // FIXME: provide specialization for Row Major layout
-    template <typename T, int Stride, int Order = Eigen::ColMajor>
-    struct MapSymM {
-      using type = T;
-      using base_type = typename std::remove_const<type>::type;
-
-      static constexpr int total = Stride * (Stride + 1) / 2;
-      static constexpr int stride = Stride;
-      T* data;
-
-      __forceinline__ __device__ MapSymM(T* data) : data{data} {}
-
-      __forceinline__ __device__ T const& operator()(int const row, int const col) const {
-        auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
-        auto const index = total - tmp + row - col;
-        return data[index];
-      }
-
-      template <typename U = T>
-      __forceinline__ __device__ typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
-      operator()(int const row, int const col) {
-        auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
-        auto const index = total - tmp + row - col;
-        return data[index];
-      }
-    };
-
-    // simple/trivial cholesky decomposition impl
-    template <typename MatrixType1, typename MatrixType2>
-    __forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
-      auto const sqrtm_0_0 = std::sqrt(M(0, 0));
-      L(0, 0) = sqrtm_0_0;
-      using T = typename MatrixType1::base_type;
-
-#pragma unroll
-      for (int i = 1; i < MatrixType1::stride; i++) {
-        T sumsq{0};
-        for (int j = 0; j < i; j++) {
-          T sumsq2{0};
-          auto const m_i_j = M(i, j);
-          for (int k = 0; k < j; ++k)
-            sumsq2 += L(i, k) * L(j, k);
-
-          auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
-          L(i, j) = value_i_j;
-
-          sumsq += value_i_j * value_i_j;
-        }
-
-        auto const l_i_i = std::sqrt(M(i, i) - sumsq);
-        L(i, i) = l_i_i;
-      }
-    }
-
-    template <typename MatrixType1, typename MatrixType2>
-    __forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
-      auto const sqrtm_0_0 = std::sqrt(M(0, 0));
-      L(0, 0) = sqrtm_0_0;
-      using T = typename MatrixType1::base_type;
-
-      for (int i = 1; i < N; i++) {
-        T sumsq{0};
-        for (int j = 0; j < i; j++) {
-          T sumsq2{0};
-          auto const m_i_j = M(i, j);
-          for (int k = 0; k < j; ++k)
-            sumsq2 += L(i, k) * L(j, k);
-
-          auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
-          L(i, j) = value_i_j;
-
-          sumsq += value_i_j * value_i_j;
-        }
-
-        auto const l_i_i = std::sqrt(M(i, i) - sumsq);
-        L(i, i) = l_i_i;
-      }
-    }
-
-    template <typename MatrixType1, typename MatrixType2, typename VectorType>
-    __forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets(
-        MatrixType1& L,
-        MatrixType2 const& M,
-        float b[MatrixType1::stride],
-        VectorType const& Atb,
-        int const N,
-        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
-      auto const real_0 = pulseOffsets(0);
-      auto const sqrtm_0_0 = std::sqrt(M(real_0, real_0));
-      L(0, 0) = sqrtm_0_0;
-      using T = typename MatrixType1::base_type;
-      b[0] = Atb(real_0) / sqrtm_0_0;
-
-      for (int i = 1; i < N; i++) {
-        auto const i_real = pulseOffsets(i);
-        T sumsq{0};
-        T total = 0;
-        auto const atb = Atb(i_real);
-        for (int j = 0; j < i; j++) {
-          auto const j_real = pulseOffsets(j);
-          T sumsq2{0};
-          auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
-          for (int k = 0; k < j; ++k)
-            sumsq2 += L(i, k) * L(j, k);
-
-          auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
-          L(i, j) = value_i_j;
-
-          sumsq += value_i_j * value_i_j;
-          total += value_i_j * b[j];
-        }
-
-        auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
-        L(i, i) = l_i_i;
-        b[i] = (atb - total) / l_i_i;
-      }
-    }
-
-    template <typename MatrixType1, typename MatrixType2, typename VectorType>
-    __forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets(
-        MatrixType1& L,
-        MatrixType2 const& M,
-        float b[MatrixType1::stride],
-        VectorType const& Atb,
-        int const N,
-        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
-      using T = typename MatrixType1::base_type;
-      auto const i = N - 1;
-      auto const i_real = pulseOffsets(i);
-      T sumsq{0};
-      T total = 0;
-      for (int j = 0; j < i; j++) {
-        auto const j_real = pulseOffsets(j);
-        T sumsq2{0};
-        auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
-        for (int k = 0; k < j; ++k)
-          sumsq2 += L(i, k) * L(j, k);
-
-        auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
-        L(i, j) = value_i_j;
-        sumsq += value_i_j * value_i_j;
-
-        total += value_i_j * b[j];
-      }
-
-      auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
-      L(i, i) = l_i_i;
-      b[i] = (Atb(i_real) - total) / l_i_i;
-    }
-
-    template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
-    __device__ void solve_forward_subst_matrix(MatrixType1& A,
-                                               MatrixType2 const& pulseMatrixView,
-                                               MatrixType3 const& matrixL) {
-      // FIXME: this assumes pulses are on columns and samples on rows
-      constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
-      constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;
-
-#pragma unroll
-      for (int icol = 0; icol < NPULSES; icol++) {
-        float reg_b[NSAMPLES];
-        float reg_L[NSAMPLES];
-
-// preload a column and load column 0 of cholesky
-#pragma unroll
-        for (int i = 0; i < NSAMPLES; i++) {
-          reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol));
-          reg_L[i] = matrixL(i, 0);
-        }
-
-        // compute x0 and store it
-        auto x_prev = reg_b[0] / reg_L[0];
-        A(0, icol) = x_prev;
-
-// iterate
-#pragma unroll
-        for (int iL = 1; iL < NSAMPLES; iL++) {
-// update accum
-#pragma unroll
-          for (int counter = iL; counter < NSAMPLES; counter++)
-            reg_b[counter] -= x_prev * reg_L[counter];
-
-// load the next column of cholesky
-#pragma unroll
-          for (int counter = iL; counter < NSAMPLES; counter++)
-            reg_L[counter] = matrixL(counter, iL);
-
-          // compute the next x for M(iL, icol)
-          x_prev = reg_b[iL] / reg_L[iL];
-
-          // store the result value
-          A(iL, icol) = x_prev;
-        }
-      }
-    }
-
-    template <typename MatrixType1, typename MatrixType2>
-    __device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
-                                               MatrixType1 inputAmplitudesView,
-                                               MatrixType2 matrixL) {
-      constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime;
-
-      float reg_b_tmp[NSAMPLES];
-      float reg_L[NSAMPLES];
-
-// preload a column and load column 0 of cholesky
-#pragma unroll
-      for (int i = 0; i < NSAMPLES; i++) {
-        reg_b_tmp[i] = inputAmplitudesView(i);
-        reg_L[i] = matrixL(i, 0);
-      }
-
-      // compute x0 and store it
-      auto x_prev = reg_b_tmp[0] / reg_L[0];
-      reg_b[0] = x_prev;
-
-// iterate
-#pragma unroll
-      for (int iL = 1; iL < NSAMPLES; iL++) {
-// update accum
-#pragma unroll
-        for (int counter = iL; counter < NSAMPLES; counter++)
-          reg_b_tmp[counter] -= x_prev * reg_L[counter];
-
-// load the next column of cholesky
-#pragma unroll
-        for (int counter = iL; counter < NSAMPLES; counter++)
-          reg_L[counter] = matrixL(counter, iL);
-
-        // compute the next x for M(iL, icol)
-        x_prev = reg_b_tmp[iL] / reg_L[iL];
-
-        // store the result value
-        reg_b[iL] = x_prev;
-      }
-    }
 
     // TODO: add active bxs
     template <typename MatrixType, typename VectorType>
@@ -1074,8 +616,8 @@ namespace hcal {
                           VectorType const& Atb,
                           VectorType& solution,
                           int& npassive,
-                          ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
-                          MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
+                          calo::multifit::ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
+                          calo::multifit::MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
                           double const eps,
                           int const maxIterations) {
       // constants
@@ -1103,7 +645,7 @@ namespace hcal {
 
           // compute the gradient
           //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive);
-          Eigen::Index w_max_idx;
+          Eigen::Index w_max_idx;
           float w_max = -std::numeric_limits<float>::max();
           for (int icol = npassive; icol < NPULSES; icol++) {
             auto const icol_real = pulseOffsets(icol);
@@ -1112,7 +654,7 @@ namespace hcal {
 #pragma unroll
             for (int counter = 0; counter < NPULSES; counter++)
               sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter)
-                                         : AtA(icol_real, counter) * solution(counter);
+                                         : AtA(icol_real, counter) * solution(counter);
 
             auto const w = atb - sum;
             if (w > w_max) {
@@ -1134,7 +676,7 @@ namespace hcal {
           // move index to the right part of the vector
           w_max_idx += npassive;
 
-          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx));
+	  Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx));
           ++npassive;
         }
 
@@ -1149,9 +691,9 @@ namespace hcal {
           //        .llt().matrixL();
           //.solve(Atb.head(npassive));
           if (recompute || iter == 0)
-            compute_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+	    calo::multifit::compute_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
           else
-            update_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+	    calo::multifit::update_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
 
           // run backward substituion
           s(npassive - 1) = reg_b[npassive - 1] / matrixL(npassive - 1, npassive - 1);
@@ -1178,7 +720,7 @@ namespace hcal {
           recompute = true;
 
           auto alpha = std::numeric_limits<float>::max();
-          Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
+	  Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
           for (int i = 0; i < npassive; i++) {
             if (s[i] <= 0.) {
               auto const i_real = pulseOffsets(i);
@@ -1201,7 +743,7 @@ namespace hcal {
           solution[alpha_idx_real] = 0;
           --npassive;
 
-          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx));
+	  Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx));
         }
 
         // as in cpu
@@ -1213,11 +755,11 @@ namespace hcal {
 
     template <int NSAMPLES, int NPULSES>
     __forceinline__ __device__ void update_covariance(
-        ColumnVector<NPULSES> const& resultAmplitudesVector,
-        MapSymM<float, NSAMPLES>& covarianceMatrix,
-        Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
-        Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
-        Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
+        calo::multifit::ColumnVector<NPULSES> const& resultAmplitudesVector,
+        calo::multifit::MapSymM<float, NSAMPLES>& covarianceMatrix,
+        Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
+        Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
+        Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
 #pragma unroll
       for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
         auto const resultAmplitude = resultAmplitudesVector(ipulse);
@@ -1314,9 +856,9 @@ namespace hcal {
 
       // configure shared mem
       extern __shared__ char shrmem[];
-      float* shrMatrixLFnnlsStorage = reinterpret_cast<float*>(shrmem) + MapSymM<float, NPULSES>::total * threadIdx.x;
+      float* shrMatrixLFnnlsStorage = reinterpret_cast<float*>(shrmem) + calo::multifit::MapSymM<float, NPULSES>::total * threadIdx.x;
       float* shrAtAStorage =
-          reinterpret_cast<float*>(shrmem) + MapSymM<float, NPULSES>::total * (threadIdx.x + blockDim.x);
+	reinterpret_cast<float*>(shrmem) + calo::multifit::MapSymM<float, NPULSES>::total * (threadIdx.x + blockDim.x);
 
       // conditions for pedestal widths
       auto const id = gch < nchannelsf01HE
@@ -1328,8 +870,8 @@ namespace hcal {
       auto const did = DetId{id};
       auto const hashedId =
           did.subdetId() == HcalBarrel
-              ? did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
-              : did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
+	? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+	: hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
 
       auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
                                                  ? effectivePedestalWidths + hashedId * 4
@@ -1356,23 +898,23 @@ namespace hcal {
       */
       constexpr float deltaChi2Threashold = 1e-3;
 
-      ColumnVector<NPULSES, int> pulseOffsets;
+      calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
 #pragma unroll
       for (int i = 0; i < NPULSES; ++i)
         pulseOffsets(i) = i;
       //        pulseOffsets(i) = pulseOffsetValues[i] - pulseOffsetValues[0];
 
       // output amplitudes/weights
-      ColumnVector<NPULSES> resultAmplitudesVector = ColumnVector<NPULSES>::Zero();
+      calo::multifit::ColumnVector<NPULSES> resultAmplitudesVector = calo::multifit::ColumnVector<NPULSES>::Zero();
 
       // map views
-      Eigen::Map<const ColumnVector<NSAMPLES>> inputAmplitudesView{inputAmplitudes + gch * NSAMPLES};
-      Eigen::Map<const ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
-      Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{pulseMatricesM +
+      Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> inputAmplitudesView{inputAmplitudes + gch * NSAMPLES};
+      Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
+      Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{pulseMatricesM +
                                                                               gch * NSAMPLES * NPULSES};
-      Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{pulseMatricesP +
+      Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{pulseMatricesP +
                                                                               gch * NSAMPLES * NPULSES};
-      Eigen::Map<const ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{pulseMatrices + gch * NSAMPLES * NPULSES};
+      Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{pulseMatrices + gch * NSAMPLES * NPULSES};
 
 #ifdef HCAL_MAHI_GPUDEBUG
       for (int i = 0; i < NSAMPLES; i++)
@@ -1405,12 +947,12 @@ namespace hcal {
         // if does not hold -> slightly rearrange shared mem to still reuse
         // shared memory
         float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
-        MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
+	calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
 #pragma unroll
-        for (int counter = 0; counter < MapSymM<float, NSAMPLES>::total; counter++)
+        for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::total; counter++)
           covarianceMatrixStorage[counter] = averagePedestalWidth2;
 #pragma unroll
-        for (int counter = 0; counter < MapSymM<float, NSAMPLES>::stride; counter++)
+        for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::stride; counter++)
           covarianceMatrix(counter, counter) += __ldg(&noiseTermsView.coeffRef(counter));
 
         // update covariance matrix
@@ -1429,9 +971,9 @@ namespace hcal {
         // compute Cholesky Decomposition L matrix
         //matrixDecomposition.compute(covarianceMatrix);
         //auto const& matrixL = matrixDecomposition.matrixL();
-        float matrixLStorage[MapSymM<float, NSAMPLES>::total];
-        MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
-        compute_decomposition_unrolled(matrixL, covarianceMatrix);
+        float matrixLStorage[calo::multifit::MapSymM<float, NSAMPLES>::total];
+	calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
+	calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
 
         //
         // replace eigen
@@ -1439,8 +981,8 @@ namespace hcal {
         //auto const& A = matrixDecomposition
         //    .matrixL()
         //    .solve(pulseMatrixView);
-        ColMajorMatrix<NSAMPLES, NPULSES> A;
-        solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
+	calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
+	calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
 
         //
         // remove eigen
@@ -1449,7 +991,7 @@ namespace hcal {
         //   .solve(inputAmplitudesView);
         //
         float reg_b[NSAMPLES];
-        solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
+	calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
 
         // TODO: we do not really need to change these matrcies
         // will be fixed in the optimized version
@@ -1457,8 +999,8 @@ namespace hcal {
         //ColumnVector<NPULSES> Atb = A.transpose() * b;
         //ColMajorMatrix<NPULSES, NPULSES> AtA;
         //float AtAStorage[MapSymM<float, NPULSES>::total];
-        MapSymM<float, NPULSES> AtA{shrAtAStorage};
-        ColumnVector<NPULSES> Atb;
+	calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
+	calo::multifit::ColumnVector<NPULSES> Atb;
 #pragma unroll
         for (int icol = 0; icol < NPULSES; icol++) {
           float reg_ai[NSAMPLES];
@@ -1525,11 +1067,11 @@ namespace hcal {
 #endif
 
         // for fnnls
-        MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
+	calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
 
         // run fast nnls
         // FIXME: provide values from config
-        fnnls(AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, 1e-11, 500);
+	fnnls(AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, 1e-11, 500);
 
 #ifdef HCAL_MAHI_GPUDEBUG
         printf("result Amplitudes\n");
@@ -1540,7 +1082,7 @@ namespace hcal {
         // replace pulseMatrixView * result - inputs
         // NOTE:
         float accum[NSAMPLES];
-        Eigen::Map<ColumnVector<NSAMPLES>> mapAccum{accum};
+        Eigen::Map<calo::multifit::ColumnVector<NSAMPLES>> mapAccum{accum};
         {
           float results[NPULSES];
 
@@ -1716,7 +1258,7 @@ namespace hcal {
           conditions.respCorrs.values,
           conditions.topology->maxDepthHB(),
           conditions.topology->maxDepthHE(),
-          conditions.recConstants->getNPhi(1) > IPHI_MAX ? conditions.recConstants->getNPhi(1) : IPHI_MAX,
+          conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1) : hcal::reconstruction::IPHI_MAX,
           conditions.topology->firstHBRing(),
           conditions.topology->lastHBRing(),
           conditions.topology->firstHERing(),
@@ -1770,7 +1312,7 @@ namespace hcal {
           configParameters.timeSigmaHPD,
           conditions.topology->maxDepthHB(),
           conditions.topology->maxDepthHE(),
-          conditions.recConstants->getNPhi(1) > IPHI_MAX ? conditions.recConstants->getNPhi(1) : IPHI_MAX,
+          conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1) : hcal::reconstruction::IPHI_MAX,
           conditions.topology->firstHBRing(),
           conditions.topology->lastHBRing(),
           conditions.topology->firstHERing(),
@@ -1790,7 +1332,7 @@ namespace hcal {
         // FIXME: provide constants from configuration
         uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
         uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;
-        auto const nbytesShared = 2 * threadsPerBlock * MapSymM<float, 8>::total * sizeof(float);
+        auto const nbytesShared = 2 * threadsPerBlock * calo::multifit::MapSymM<float, 8>::total * sizeof(float);
         kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
             outputGPU.recHits.energy.get(),
             outputGPU.recHits.chi2.get(),
@@ -1815,7 +1357,7 @@ namespace hcal {
             conditions.offsetForHashes,
             conditions.topology->maxDepthHB(),
             conditions.topology->maxDepthHE(),
-            conditions.recConstants->getNPhi(1) > IPHI_MAX ? conditions.recConstants->getNPhi(1) : IPHI_MAX,
+            conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1) : hcal::reconstruction::IPHI_MAX,
             conditions.topology->firstHBRing(),
             conditions.topology->lastHBRing(),
             conditions.topology->firstHERing(),

From 79af505b8a7918fdb26414afc0ead9de84bacf0d Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 31 Jul 2020 11:29:39 +0200
Subject: [PATCH 20/34] Apply code formatting (cms-patatrack#526)

---
 .../HcalRecProducers/src/KernelHelpers.h      |  14 +-
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 199 ++++++++++--------
 2 files changed, 111 insertions(+), 102 deletions(-)

diff --git a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
index 72f369d99060e..b0447b1600b9b 100644
--- a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
+++ b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
@@ -10,10 +10,8 @@
 namespace hcal {
   namespace reconstruction {
 
-
     constexpr int32_t IPHI_MAX = 72;
 
-
     // this is from HcalTimeSlew.
     // HcalTimeSlew are values that come in from ESProducer that takes them
     // from a python config. see DeclsForKernels for more explanation
@@ -73,7 +71,7 @@ namespace hcal {
       HcalDetId did{didraw};
       uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1);
       return did.ieta() > 0 ? value + maxDepthHB * hcal::reconstruction::IPHI_MAX * (did.ieta() - firstHBRing)
-	: value + maxDepthHB * hcal::reconstruction::IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);
+                            : value + maxDepthHB * hcal::reconstruction::IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);
     }
 
     __forceinline__ __device__ uint32_t did2linearIndexHE(uint32_t const didraw,
@@ -150,12 +148,11 @@ namespace hcal {
 
     // FIXME remove duplication...
     // this is from PulesFunctor. nvcc was complaining... if included that header...
-    //constexpr int maxSamples = 10;                                                                                                                                                
+    //constexpr int maxSamples = 10;
     constexpr int maxPSshapeBin = 256;
     constexpr int nsPerBX = 25;
     constexpr float iniTimeShift = 92.5f;
 
-
     // TODO: remove what's not needed
     __forceinline__ __device__ float compute_pulse_shape_value(float const pulse_time,
                                                                int const sample,
@@ -175,7 +172,7 @@ namespace hcal {
 
       // FIXME: clean up all the rounding... this is coming from original cpu version
       float const i_start_float =
-	-iniTimeShift - pulse_time - slew > 0.f ? 0.f : std::abs(-iniTimeShift - pulse_time - slew) + 1.f;
+          -iniTimeShift - pulse_time - slew > 0.f ? 0.f : std::abs(-iniTimeShift - pulse_time - slew) + 1.f;
       int i_start = static_cast<int>(i_start_float);
       float offset_start = static_cast<float>(i_start) - iniTimeShift - pulse_time - slew;
       // FIXME: do we need a check for nan???
@@ -218,7 +215,7 @@ namespace hcal {
       if (sample_over10ts == its_start) {
         value = bin_0_start == -1
                     ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start]
-	  : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
+                    : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
       } else if (sample_over10ts > its_start) {
         int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
         value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
@@ -227,10 +224,7 @@ namespace hcal {
       return value;
     }
 
-
-
   }  // namespace reconstruction
 }  // namespace hcal
 
-
 #endif  // RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index 08fc726c4c691..7789b86a50958 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -151,8 +151,9 @@ namespace hcal {
       // compute hash for this did
       auto const hashedId =
           did.subdetId() == HcalBarrel
-	? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
-	: hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
+              ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
+                    offsetForHashes;
 
       // conditions based on the hash
       // FIXME: remove hardcoded values
@@ -244,7 +245,8 @@ namespace hcal {
         for (auto ts = first; ts < last; ts++)
           sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesExpected + ts];
         auto const effectivePixelsFired = sipmq / fcByPE;
-        auto const factor = hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
+        auto const factor =
+            hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
         rawCharge = (charge - pedestal) * factor + pedestal;
 #ifdef COMPUTE_TDC_TIME
         if (gch < nchannelsf01HE)
@@ -391,7 +393,6 @@ namespace hcal {
       noiseTermsForChannel[sample] = noiseTerm;
     }
 
-
     // TODO: need to add an array of offsets for pulses (a la activeBXs...)
     // Assume for now 8 pulses
     __global__ void kernel_prep_pulseMatrices_sameNumberOfSamples(float* pulseMatrices,
@@ -453,15 +454,19 @@ namespace hcal {
       auto const did = DetId{id};
       auto const hashedId =
           did.subdetId() == HcalBarrel
-							 ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
-							 : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
+              ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
+                    offsetForHashes;
       auto const recoPulseShapeId = recoPulseShapeIds[hashedId];
       auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * hcal::reconstruction::maxPSshapeBin;
       auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * hcal::reconstruction::maxPSshapeBin;
-      auto const* accVarLenIdxMinusOneVec = accVarLenIdxMinusOneVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
-      auto const* diffVarItvlIdxMinusOneVec = diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
+      auto const* accVarLenIdxMinusOneVec =
+          accVarLenIdxMinusOneVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
+      auto const* diffVarItvlIdxMinusOneVec =
+          diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
       auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
-      auto const* diffVarItvlIdxZeroVec = diffVarItvlIdxZeroVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
+      auto const* diffVarItvlIdxZeroVec =
+          diffVarItvlIdxZeroVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
 
       // offset output arrays
       auto* pulseMatrix = pulseMatrices + nsamples * npulses * gch;
@@ -525,40 +530,40 @@ namespace hcal {
       if (sample == 0 && ipulse == 0) {
         for (int i = 0; i < 10; i++) {
           auto const value = hcal::reconstruction::compute_pulse_shape_value(t0,
-                                                       i,
-                                                       0,
-                                                       acc25nsVec,
-                                                       diff25nsItvlVec,
-                                                       accVarLenIdxMinusOneVec,
-                                                       diffVarItvlIdxMinusOneVec,
-                                                       accVarLenIdxZeroVec,
-                                                       diffVarItvlIdxZeroVec);
+                                                                             i,
+                                                                             0,
+                                                                             acc25nsVec,
+                                                                             diff25nsItvlVec,
+                                                                             accVarLenIdxMinusOneVec,
+                                                                             diffVarItvlIdxMinusOneVec,
+                                                                             accVarLenIdxZeroVec,
+                                                                             diffVarItvlIdxZeroVec);
           printf("pulse(%d) = %f\n", i, value);
         }
         printf("\n");
         for (int i = 0; i < 10; i++) {
           auto const value = hcal::reconstruction::compute_pulse_shape_value(t0p,
-                                                       i,
-                                                       0,
-                                                       acc25nsVec,
-                                                       diff25nsItvlVec,
-                                                       accVarLenIdxMinusOneVec,
-                                                       diffVarItvlIdxMinusOneVec,
-                                                       accVarLenIdxZeroVec,
-                                                       diffVarItvlIdxZeroVec);
+                                                                             i,
+                                                                             0,
+                                                                             acc25nsVec,
+                                                                             diff25nsItvlVec,
+                                                                             accVarLenIdxMinusOneVec,
+                                                                             diffVarItvlIdxMinusOneVec,
+                                                                             accVarLenIdxZeroVec,
+                                                                             diffVarItvlIdxZeroVec);
           printf("pulseP(%d) = %f\n", i, value);
         }
         printf("\n");
         for (int i = 0; i < 10; i++) {
           auto const value = hcal::reconstruction::compute_pulse_shape_value(t0m,
-                                                       i,
-                                                       0,
-                                                       acc25nsVec,
-                                                       diff25nsItvlVec,
-                                                       accVarLenIdxMinusOneVec,
-                                                       diffVarItvlIdxMinusOneVec,
-                                                       accVarLenIdxZeroVec,
-                                                       diffVarItvlIdxZeroVec);
+                                                                             i,
+                                                                             0,
+                                                                             acc25nsVec,
+                                                                             diff25nsItvlVec,
+                                                                             accVarLenIdxMinusOneVec,
+                                                                             diffVarItvlIdxMinusOneVec,
+                                                                             accVarLenIdxZeroVec,
+                                                                             diffVarItvlIdxZeroVec);
           printf("pulseM(%d) = %f\n", i, value);
         }
       }
@@ -571,36 +576,39 @@ namespace hcal {
       // auto const offset = ipulse - soi;
       // auto const idx = sample - offset;
       int32_t const idx = sample - pulseOffset;
-      auto const value = idx >= 0 && idx < nsamples ? hcal::reconstruction::compute_pulse_shape_value(t0,
-                                                                                idx,
-                                                                                shift,
-                                                                                acc25nsVec,
-                                                                                diff25nsItvlVec,
-                                                                                accVarLenIdxMinusOneVec,
-                                                                                diffVarItvlIdxMinusOneVec,
-                                                                                accVarLenIdxZeroVec,
-                                                                                diffVarItvlIdxZeroVec)
-                                                    : 0;
-      auto const value_t0m = idx >= 0 && idx < nsamples ? hcal::reconstruction::compute_pulse_shape_value(t0m,
-                                                                                    idx,
-                                                                                    shift,
-                                                                                    acc25nsVec,
-                                                                                    diff25nsItvlVec,
-                                                                                    accVarLenIdxMinusOneVec,
-                                                                                    diffVarItvlIdxMinusOneVec,
-                                                                                    accVarLenIdxZeroVec,
-                                                                                    diffVarItvlIdxZeroVec)
-                                                        : 0;
-      auto const value_t0p = idx >= 0 && idx < nsamples ? hcal::reconstruction::compute_pulse_shape_value(t0p,
-                                                                                    idx,
-                                                                                    shift,
-                                                                                    acc25nsVec,
-                                                                                    diff25nsItvlVec,
-                                                                                    accVarLenIdxMinusOneVec,
-                                                                                    diffVarItvlIdxMinusOneVec,
-                                                                                    accVarLenIdxZeroVec,
-                                                                                    diffVarItvlIdxZeroVec)
-                                                        : 0;
+      auto const value = idx >= 0 && idx < nsamples
+                             ? hcal::reconstruction::compute_pulse_shape_value(t0,
+                                                                               idx,
+                                                                               shift,
+                                                                               acc25nsVec,
+                                                                               diff25nsItvlVec,
+                                                                               accVarLenIdxMinusOneVec,
+                                                                               diffVarItvlIdxMinusOneVec,
+                                                                               accVarLenIdxZeroVec,
+                                                                               diffVarItvlIdxZeroVec)
+                             : 0;
+      auto const value_t0m = idx >= 0 && idx < nsamples
+                                 ? hcal::reconstruction::compute_pulse_shape_value(t0m,
+                                                                                   idx,
+                                                                                   shift,
+                                                                                   acc25nsVec,
+                                                                                   diff25nsItvlVec,
+                                                                                   accVarLenIdxMinusOneVec,
+                                                                                   diffVarItvlIdxMinusOneVec,
+                                                                                   accVarLenIdxZeroVec,
+                                                                                   diffVarItvlIdxZeroVec)
+                                 : 0;
+      auto const value_t0p = idx >= 0 && idx < nsamples
+                                 ? hcal::reconstruction::compute_pulse_shape_value(t0p,
+                                                                                   idx,
+                                                                                   shift,
+                                                                                   acc25nsVec,
+                                                                                   diff25nsItvlVec,
+                                                                                   accVarLenIdxMinusOneVec,
+                                                                                   diffVarItvlIdxMinusOneVec,
+                                                                                   accVarLenIdxZeroVec,
+                                                                                   diffVarItvlIdxZeroVec)
+                                 : 0;
 
       // store to global
       pulseMatrix[ipulse * nsamples + sample] = value;
@@ -609,7 +617,6 @@ namespace hcal {
       pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
     }
 
-
     // TODO: add active bxs
     template <typename MatrixType, typename VectorType>
     __device__ void fnnls(MatrixType const& AtA,
@@ -645,7 +652,7 @@ namespace hcal {
 
           // compute the gradient
           //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive);
-	  Eigen::Index w_max_idx;
+          Eigen::Index w_max_idx;
           float w_max = -std::numeric_limits<float>::max();
           for (int icol = npassive; icol < NPULSES; icol++) {
             auto const icol_real = pulseOffsets(icol);
@@ -654,7 +661,7 @@ namespace hcal {
 #pragma unroll
             for (int counter = 0; counter < NPULSES; counter++)
               sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter)
-		: AtA(icol_real, counter) * solution(counter);
+                                         : AtA(icol_real, counter) * solution(counter);
 
             auto const w = atb - sum;
             if (w > w_max) {
@@ -676,7 +683,7 @@ namespace hcal {
           // move index to the right part of the vector
           w_max_idx += npassive;
 
-	  Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx));
+          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx));
           ++npassive;
         }
 
@@ -691,9 +698,11 @@ namespace hcal {
           //        .llt().matrixL();
           //.solve(Atb.head(npassive));
           if (recompute || iter == 0)
-	    calo::multifit::compute_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+            calo::multifit::compute_decomposition_forwardsubst_with_offsets(
+                matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
           else
-	    calo::multifit::update_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+            calo::multifit::update_decomposition_forwardsubst_with_offsets(
+                matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
 
           // run backward substituion
           s(npassive - 1) = reg_b[npassive - 1] / matrixL(npassive - 1, npassive - 1);
@@ -720,7 +729,7 @@ namespace hcal {
           recompute = true;
 
           auto alpha = std::numeric_limits<float>::max();
-	  Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
+          Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
           for (int i = 0; i < npassive; i++) {
             if (s[i] <= 0.) {
               auto const i_real = pulseOffsets(i);
@@ -743,7 +752,7 @@ namespace hcal {
           solution[alpha_idx_real] = 0;
           --npassive;
 
-	  Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx));
+          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx));
         }
 
         // as in cpu
@@ -856,9 +865,10 @@ namespace hcal {
 
       // configure shared mem
       extern __shared__ char shrmem[];
-      float* shrMatrixLFnnlsStorage = reinterpret_cast<float*>(shrmem) + calo::multifit::MapSymM<float, NPULSES>::total * threadIdx.x;
-      float* shrAtAStorage =
-	reinterpret_cast<float*>(shrmem) + calo::multifit::MapSymM<float, NPULSES>::total * (threadIdx.x + blockDim.x);
+      float* shrMatrixLFnnlsStorage =
+          reinterpret_cast<float*>(shrmem) + calo::multifit::MapSymM<float, NPULSES>::total * threadIdx.x;
+      float* shrAtAStorage = reinterpret_cast<float*>(shrmem) +
+                             calo::multifit::MapSymM<float, NPULSES>::total * (threadIdx.x + blockDim.x);
 
       // conditions for pedestal widths
       auto const id = gch < nchannelsf01HE
@@ -870,8 +880,9 @@ namespace hcal {
       auto const did = DetId{id};
       auto const hashedId =
           did.subdetId() == HcalBarrel
-	? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
-	: hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + offsetForHashes;
+              ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
+                    offsetForHashes;
 
       auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
                                                  ? effectivePedestalWidths + hashedId * 4
@@ -911,10 +922,11 @@ namespace hcal {
       Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> inputAmplitudesView{inputAmplitudes + gch * NSAMPLES};
       Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
       Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{pulseMatricesM +
-                                                                              gch * NSAMPLES * NPULSES};
+                                                                                              gch * NSAMPLES * NPULSES};
       Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{pulseMatricesP +
-                                                                              gch * NSAMPLES * NPULSES};
-      Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{pulseMatrices + gch * NSAMPLES * NPULSES};
+                                                                                              gch * NSAMPLES * NPULSES};
+      Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{pulseMatrices +
+                                                                                             gch * NSAMPLES * NPULSES};
 
 #ifdef HCAL_MAHI_GPUDEBUG
       for (int i = 0; i < NSAMPLES; i++)
@@ -947,7 +959,7 @@ namespace hcal {
         // if does not hold -> slightly rearrange shared mem to still reuse
         // shared memory
         float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
-	calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
+        calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
 #pragma unroll
         for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::total; counter++)
           covarianceMatrixStorage[counter] = averagePedestalWidth2;
@@ -972,8 +984,8 @@ namespace hcal {
         //matrixDecomposition.compute(covarianceMatrix);
         //auto const& matrixL = matrixDecomposition.matrixL();
         float matrixLStorage[calo::multifit::MapSymM<float, NSAMPLES>::total];
-	calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
-	calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
+        calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
+        calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
 
         //
         // replace eigen
@@ -981,8 +993,8 @@ namespace hcal {
         //auto const& A = matrixDecomposition
         //    .matrixL()
         //    .solve(pulseMatrixView);
-	calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
-	calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
+        calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
+        calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
 
         //
         // remove eigen
@@ -991,7 +1003,7 @@ namespace hcal {
         //   .solve(inputAmplitudesView);
         //
         float reg_b[NSAMPLES];
-	calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
+        calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
 
         // TODO: we do not really need to change these matrcies
         // will be fixed in the optimized version
@@ -999,8 +1011,8 @@ namespace hcal {
         //ColumnVector<NPULSES> Atb = A.transpose() * b;
         //ColMajorMatrix<NPULSES, NPULSES> AtA;
         //float AtAStorage[MapSymM<float, NPULSES>::total];
-	calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
-	calo::multifit::ColumnVector<NPULSES> Atb;
+        calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
+        calo::multifit::ColumnVector<NPULSES> Atb;
 #pragma unroll
         for (int icol = 0; icol < NPULSES; icol++) {
           float reg_ai[NSAMPLES];
@@ -1067,11 +1079,11 @@ namespace hcal {
 #endif
 
         // for fnnls
-	calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
+        calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
 
         // run fast nnls
         // FIXME: provide values from config
-	fnnls(AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, 1e-11, 500);
+        fnnls(AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, 1e-11, 500);
 
 #ifdef HCAL_MAHI_GPUDEBUG
         printf("result Amplitudes\n");
@@ -1258,7 +1270,8 @@ namespace hcal {
           conditions.respCorrs.values,
           conditions.topology->maxDepthHB(),
           conditions.topology->maxDepthHE(),
-          conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1) : hcal::reconstruction::IPHI_MAX,
+          conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
+                                                                               : hcal::reconstruction::IPHI_MAX,
           conditions.topology->firstHBRing(),
           conditions.topology->lastHBRing(),
           conditions.topology->firstHERing(),
@@ -1312,7 +1325,8 @@ namespace hcal {
           configParameters.timeSigmaHPD,
           conditions.topology->maxDepthHB(),
           conditions.topology->maxDepthHE(),
-          conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1) : hcal::reconstruction::IPHI_MAX,
+          conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
+                                                                               : hcal::reconstruction::IPHI_MAX,
           conditions.topology->firstHBRing(),
           conditions.topology->lastHBRing(),
           conditions.topology->firstHERing(),
@@ -1357,7 +1371,8 @@ namespace hcal {
             conditions.offsetForHashes,
             conditions.topology->maxDepthHB(),
             conditions.topology->maxDepthHE(),
-            conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1) : hcal::reconstruction::IPHI_MAX,
+            conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
+                                                                                 : hcal::reconstruction::IPHI_MAX,
             conditions.topology->firstHBRing(),
             conditions.topology->lastHBRing(),
             conditions.topology->firstHERing(),

From 41c6b90e27789aed6c78e0c8bdbe9c9476d95d83 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 8 Aug 2020 17:29:51 +0200
Subject: [PATCH 21/34] Use up to 10 time samples for the HBHE digis in Run 3
 MC  (cms-patatrack#531)

---
 .../HcalDigi/interface/DigiCollection.h       |  2 +-
 .../makeHcalRaw2DigiGpuValidationPlots.cpp    |  6 +-
 .../plugins/HcalDigisProducerGPU.cc           | 92 +++++++++++--------
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 76 ++++++++-------
 4 files changed, 99 insertions(+), 77 deletions(-)

diff --git a/CUDADataFormats/HcalDigi/interface/DigiCollection.h b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
index f5ae63d0954c6..10350b52d4c52 100644
--- a/CUDADataFormats/HcalDigi/interface/DigiCollection.h
+++ b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
@@ -136,7 +136,7 @@ namespace hcal {
 
     typename StoragePolicy::template StorageSelector<uint32_t>::type ids;
     typename StoragePolicy::template StorageSelector<uint16_t>::type data;
-    uint32_t stride;
+    uint32_t stride{0};
   };
 
   template <typename Flavor, typename StoragePolicy>
diff --git a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
index fd144ae452363..94a43892e08b6 100644
--- a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
+++ b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
@@ -130,15 +130,15 @@ int main(int argc, char* argv[]) {
   rt->SetBranchAddress("QIE11DataFrameHcalDataFrameContainer_hcalDigis__RECO.", &wcpuf01he);
   rt->SetBranchAddress("HBHEDataFramesSorted_hcalDigis__RECO.", &wcpuf5hb);
   rt->SetBranchAddress(
-      "hcalFlavor5hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "hcalFlavor5calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
       "f5HBDigis_RECO.",
       &wgpuf5hb);
   rt->SetBranchAddress(
-      "hcalFlavor01hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "hcalFlavor01calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
       "f01HEDigis_RECO.",
       &wgpuf01he);
   rt->SetBranchAddress(
-      "hcalFlavor3hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "hcalFlavor3calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
       "f3HBDigis_RECO.",
       &wgpuf3hb);
 
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
index d49e2ad366817..d5b6d42d0ed2c 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -15,7 +15,7 @@
 class HcalDigisProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
   explicit HcalDigisProducerGPU(edm::ParameterSet const& ps);
-  ~HcalDigisProducerGPU() override;
+  ~HcalDigisProducerGPU() override = default;
   static void fillDescriptions(edm::ConfigurationDescriptions&);
 
 private:
@@ -49,11 +49,11 @@ class HcalDigisProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
   cms::cuda::ContextState cudaState_;
 
   struct ConfigParameters {
-    uint32_t maxChannelsF01HE, maxChannelsF5HB, maxChannelsF3HB, nsamplesF01HE, nsamplesF5HB, nsamplesF3HB;
+    uint32_t maxChannelsF01HE, maxChannelsF5HB, maxChannelsF3HB;
   };
   ConfigParameters config_;
 
-  // tmp on the host
+  // per event host buffers
   HostCollectionf01 hf01_;
   HostCollectionf5 hf5_;
   HostCollectionf3 hf3_;
@@ -76,9 +76,6 @@ void HcalDigisProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& conf
   desc.add<uint32_t>("maxChannelsF01HE", 10000u);
   desc.add<uint32_t>("maxChannelsF5HB", 10000u);
   desc.add<uint32_t>("maxChannelsF3HB", 10000u);
-  desc.add<uint32_t>("nsamplesF01HE", 8);
-  desc.add<uint32_t>("nsamplesF5HB", 8);
-  desc.add<uint32_t>("nsamplesF3HB", 8);
 
   confDesc.addWithDefaultLabel(desc);
 }
@@ -92,27 +89,24 @@ HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
   config_.maxChannelsF01HE = ps.getParameter<uint32_t>("maxChannelsF01HE");
   config_.maxChannelsF5HB = ps.getParameter<uint32_t>("maxChannelsF5HB");
   config_.maxChannelsF3HB = ps.getParameter<uint32_t>("maxChannelsF3HB");
-  config_.nsamplesF01HE = ps.getParameter<uint32_t>("nsamplesF01HE");
-  config_.nsamplesF5HB = ps.getParameter<uint32_t>("nsamplesF5HB");
-  config_.nsamplesF3HB = ps.getParameter<uint32_t>("nsamplesF3HB");
-
-  // preallocate on the host
-  hf01_.stride = hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
-  hf5_.stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
-  hf3_.stride = hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB);
+
+  // this is a preallocation for the max statically known number of time samples
+  // actual stride/nsamples will be inferred from data
+  hf5_.stride = hcal::compute_stride<hcal::Flavor5>(HBHEDataFrame::MAXSAMPLES);
+  hf01_.stride = hcal::compute_stride<hcal::Flavor01>(QIE11DigiCollection::MAXSAMPLES);
+  hf3_.stride = hcal::compute_stride<hcal::Flavor3>(QIE11DigiCollection::MAXSAMPLES);
   hf01_.reserve(config_.maxChannelsF01HE);
   hf5_.reserve(config_.maxChannelsF5HB);
   hf3_.reserve(config_.maxChannelsF3HB);
 }
 
-HcalDigisProducerGPU::~HcalDigisProducerGPU() {}
-
 void HcalDigisProducerGPU::acquire(edm::Event const& event,
                                    edm::EventSetup const& setup,
                                    edm::WaitingTaskWithArenaHolder holder) {
   // raii
   cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
 
+  // clear host buffers
   hf01_.clear();
   hf5_.clear();
   hf3_.clear();
@@ -123,35 +117,54 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
   event.getByToken(hbheDigiToken_, hbheDigis);
   event.getByToken(qie11DigiToken_, qie11Digis);
 
-  // flavor 0/1 get devie blobs
-  df01_.data = cms::cuda::make_device_unique<uint16_t[]>(
-      config_.maxChannelsF01HE * hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE), ctx.stream());
-  df01_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF01HE, ctx.stream());
 
-  // flavor3 get device blobs
-  df3_.data = cms::cuda::make_device_unique<uint16_t[]>(
-      config_.maxChannelsF3HB * hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB), ctx.stream());
-  df3_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF3HB, ctx.stream());
+  // init f5 collection
+  if (hbheDigis->size() > 0) {
+    auto const nsamples = (*hbheDigis)[0].size();
+    auto const stride = hcal::compute_stride<hcal::Flavor5>(nsamples);
+    hf5_.stride = stride;
+  
+    // flavor5 get device blobs
+    df5_.stride = stride;
+    df5_.data = cms::cuda::make_device_unique<uint16_t[]>(
+      config_.maxChannelsF5HB * stride, ctx.stream());
+    df5_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF5HB, ctx.stream());
+    df5_.npresamples = cms::cuda::make_device_unique<uint8_t[]>(config_.maxChannelsF5HB, ctx.stream());
+  }
 
-  // flavor5 get device blobs
-  df5_.data = cms::cuda::make_device_unique<uint16_t[]>(
-      config_.maxChannelsF5HB * hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB), ctx.stream());
-  df5_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF5HB, ctx.stream());
-  df5_.npresamples = cms::cuda::make_device_unique<uint8_t[]>(config_.maxChannelsF5HB, ctx.stream());
+  if (qie11Digis->size() > 0) {
+    auto const nsamples = qie11Digis->samples();
+    auto const stride01 = hcal::compute_stride<hcal::Flavor01>(nsamples);
+    auto const stride3 = hcal::compute_stride<hcal::Flavor3>(nsamples);
+
+    hf01_.stride = stride01;
+    hf3_.stride = stride3;
+  
+    // flavor 0/1 get device blobs
+    df01_.stride = stride01;
+    df01_.data = cms::cuda::make_device_unique<uint16_t[]>(
+      config_.maxChannelsF01HE * stride01, ctx.stream());
+    df01_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF01HE, ctx.stream());
+
+    // flavor3 get device blobs
+    df3_.stride = stride3;
+    df3_.data = cms::cuda::make_device_unique<uint16_t[]>(
+      config_.maxChannelsF3HB * stride3, ctx.stream());
+    df3_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF3HB, ctx.stream());
+  }
 
   for (auto const& hbhe : *hbheDigis) {
     auto const id = hbhe.id().rawId();
     auto const presamples = hbhe.presamples();
     hf5_.ids.push_back(id);
     hf5_.npresamples.push_back(presamples);
-    int stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
+    auto const stride = hcal::compute_stride<hcal::Flavor5>(hbhe.size());
+    assert(stride == hf5_.stride && "strides must be the same for every single digi of the collection");
     // simple for now...
     static_assert(hcal::Flavor5::HEADER_WORDS == 1);
     uint16_t header_word = (1 << 15) | (0x5 << 12) | (0 << 10) | ((hbhe.sample(0).capid() & 0x3) << 8);
     hf5_.data.push_back(header_word);
-    //for (int i=0; i<hcal::Flavor5::HEADER_WORDS; i++)
-    //    hf5_.data.push_back(0);
-    for (int i = 0; i < stride - hcal::Flavor5::HEADER_WORDS; i++) {
+    for (unsigned int i = 0; i < stride - hcal::Flavor5::HEADER_WORDS; i++) {
       uint16_t s0 = (0 << 7) | (static_cast<uint8_t>(hbhe.sample(2 * i).adc()) & 0x7f);
       uint16_t s1 = (0 << 7) | (static_cast<uint8_t>(hbhe.sample(2 * i + 1).adc()) & 0x7f);
       uint16_t sample = (s1 << 8) | s0;
@@ -161,6 +174,7 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
 
   for (unsigned int i = 0; i < qie11Digis->size(); i++) {
     auto const& digi = QIE11DataFrame{(*qie11Digis)[i]};
+    assert(digi.samples() == qie11Digis->samples() && "collection nsamples must equal per digi samples");
     if (digi.flavor() == 0 or digi.flavor() == 1) {
       if (digi.detid().subdetId() != HcalEndcap)
         continue;
@@ -185,6 +199,7 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
   }
 
   auto lambdaToTransfer = [&ctx](auto* dest, auto const& src) {
+    if (src.size() == 0) return;
     using vector_type = typename std::remove_reference<decltype(src)>::type;
     using type = typename vector_type::value_type;
     using dest_data_type = typename std::remove_pointer<decltype(dest)>::type;
@@ -201,17 +216,14 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
 
   lambdaToTransfer(df3_.data.get(), hf3_.data);
   lambdaToTransfer(df3_.ids.get(), hf3_.ids);
-}
-
-void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
-  cms::cuda::ScopedContextProduce ctx{cudaState_};
 
-  df01_.stride = hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
   df01_.size = hf01_.ids.size();
-  df5_.stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
   df5_.size = hf5_.ids.size();
-  df3_.stride = hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB);
   df3_.size = hf3_.ids.size();
+}
+
+void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};
 
   ctx.emplace(event, digisF01HEToken_, std::move(df01_));
   ctx.emplace(event, digisF5HBToken_, std::move(df5_));
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index 7789b86a50958..8796e6c6396fa 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -68,10 +68,12 @@ namespace hcal {
                                                       int const sipmQNTStoSum,
                                                       int const firstSampleShift,
                                                       uint32_t const offsetForHashes,
-                                                      float const ts4Thresh) {
+                                                      float const ts4Thresh,
+                                                      int const startingSample) {
       // indices + runtime constants
-      auto const sample = threadIdx.x;
-      int32_t const nsamplesExpected = blockDim.x;
+      auto const sample = threadIdx.x + startingSample;
+      auto const sampleWithinWindow = threadIdx.x;
+      int32_t const nsamplesForCompute = blockDim.x;
       auto const lch = threadIdx.y;
       auto const gch = lch + blockDim.y * blockIdx.x;
       auto const nchannels_per_block = blockDim.y;
@@ -82,7 +84,7 @@ namespace hcal {
         return;
 
       // initialize all output buffers
-      if (sample == 0) {
+      if (sampleWithinWindow == 0) {
         outputdid[gch] = 0;
         method0Energy[gch] = 0;
         method0Time[gch] = 0;
@@ -100,20 +102,20 @@ namespace hcal {
       // configure shared mem
       extern __shared__ char smem[];
       float* shrEnergyM0PerTS = reinterpret_cast<float*>(smem);
-      float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesExpected * nchannels_per_block;
-      float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesExpected * nchannels_per_block;
+      float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block;
+      float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block;
       float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block;
       unsigned long long int* shrMethod0EnergySamplePair =
           reinterpret_cast<unsigned long long int*>(shrEnergyM0TotalAccum + nchannels_per_block);
-      if (sample == 0) {
+      if (sampleWithinWindow == 0) {
         shrMethod0EnergyAccum[lch] = 0;
         shrMethod0EnergySamplePair[lch] = __float_as_uint(std::numeric_limits<float>::min());
         shrEnergyM0TotalAccum[lch] = 0;
       }
 
       // offset output
-      auto* amplitudesForChannel = amplitudes + nsamplesExpected * gch;
-      auto* noiseTermsForChannel = noiseTerms + nsamplesExpected * gch;
+      auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch;
+      auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch;
       auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
 
       // get event input quantities
@@ -123,7 +125,7 @@ namespace hcal {
                                                                         : compute_nsamples<Flavor3>(stride));
 
 #ifdef HCAL_MAHI_GPUDEBUG
-      assert(nsamples == nsamplesExpected);
+      assert(nsamples == nsamplesForCompute || nsamples-startingSample==nsamplesForCompute);
 #endif
 
       auto const id = gch < nchannelsf01HE
@@ -198,11 +200,11 @@ namespace hcal {
         //   if that is not the case, we will see that with cuda mmecheck
         auto const soibit = soibit_for_sample<Flavor01>(dataf01HE + stride * gch, sample);
         if (soibit == 1)
-          soiSamples[gch] = sample;
+          soiSamples[gch] = sampleWithinWindow;
       } else if (gch >= nchannelsf015) {
         auto const soibit = soibit_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample);
         if (soibit == 1)
-          soiSamples[gch] = sample;
+          soiSamples[gch] = sampleWithinWindow;
       }
       __syncthreads();
       int32_t const soi = gch < nchannelsf01HE
@@ -212,7 +214,7 @@ namespace hcal {
       //    ? npresamplesf5HB[gch - nchannelsf01HE]
       //    : soiSamples[gch];
       // this is here just to make things uniform...
-      if (gch >= nchannelsf01HE && gch < nchannelsf015 && sample == 0)
+      if (gch >= nchannelsf01HE && gch < nchannelsf015 && sampleWithinWindow == 0)
         soiSamples[gch] = npresamplesf5HB[gch - nchannelsf01HE];
 
       //
@@ -240,10 +242,10 @@ namespace hcal {
         auto const parLin3 = parLin3Values[sipmType - 1];
 
         int const first = std::max(soi + sipmQTSShift, 0);
-        int const last = std::min(soi + sipmQNTStoSum, nsamplesExpected);
+        int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
         float sipmq = 0.0f;
         for (auto ts = first; ts < last; ts++)
-          sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesExpected + ts];
+          sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesForCompute + ts];
         auto const effectivePixelsFired = sipmq / fcByPE;
         auto const factor =
             hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
@@ -268,12 +270,12 @@ namespace hcal {
       auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
       auto const startSampleTmp = soi + firstSampleShift;
       auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
-      auto const endSample = startSample + nsamplesToAdd < nsamples ? startSample + nsamplesToAdd : nsamples;
+      auto const endSample = startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
       // NOTE: gain is a small number < 10^-3, multiply it last
       auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
       auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
       // store to shared mem
-      shrEnergyM0PerTS[lch * nsamplesExpected + sample] = energym0_per_ts;
+      shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts;
       atomicAdd(&shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0);
 
 #ifdef HCAL_MAHI_GPUDEBUG
@@ -298,12 +300,12 @@ namespace hcal {
           "startSample = %d endSample = %d param1 = %u param2 = %u\n", startSample, endSample, recoParam1, recoParam2);
 #endif
 
-      if (sample >= startSample && sample < endSample) {
+      if (sampleWithinWindow >= startSample && sampleWithinWindow < endSample) {
         atomicAdd(&shrMethod0EnergyAccum[lch], energym0_per_ts);
         // pack sample, energy as 64 bit value
         unsigned long long int old = shrMethod0EnergySamplePair[lch], assumed;
         unsigned long long int val =
-            (static_cast<unsigned long long int>(sample) << 32) + __float_as_uint(energym0_per_ts);
+            (static_cast<unsigned long long int>(sampleWithinWindow) << 32) + __float_as_uint(energym0_per_ts);
         do {
           assumed = old;
           // decode energy, sample values
@@ -318,14 +320,14 @@ namespace hcal {
       __syncthreads();
 
       // NOTE: must take soi, as values for that thread are used...
-      if (sample == soi) {
+      if (sampleWithinWindow == soi) {
         auto const method0_energy = shrMethod0EnergyAccum[lch];
         auto const val = shrMethod0EnergySamplePair[lch];
         int const max_sample = (val >> 32) & 0xffffffff;
         float const max_energy = __uint_as_float(val & 0xffffffff);
         float const max_energy_1 =
-            max_sample < nsamples - 1 ? shrEnergyM0PerTS[lch * nsamplesExpected + max_sample + 1] : 0.f;
-        float const position = nsamplesToAdd < nsamples ? max_sample - soi : max_sample;
+            max_sample < nsamplesForCompute - 1 ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1] : 0.f;
+        float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample;
         auto const sum = max_energy + max_energy_1;
         // FIXME: for full comparison with cpu method 0  timing,
         // need to correct by slew
@@ -389,8 +391,8 @@ namespace hcal {
 #endif
 
       // store to global memory
-      amplitudesForChannel[sample] = amplitude;
-      noiseTermsForChannel[sample] = noiseTerm;
+      amplitudesForChannel[sampleWithinWindow] = amplitude;
+      noiseTermsForChannel[sampleWithinWindow] = noiseTerm;
     }
 
     // TODO: need to add an array of offsets for pulses (a la activeBXs...)
@@ -1215,19 +1217,25 @@ namespace hcal {
       outputGPU.recHits.size = totalChannels;
 
       // TODO: this can be lifted by implementing a separate kernel
-      // similar to the default one, but properly handling the diff in #samples
+      // similar to the default one, but properly handling the diff in #samples
       // or modifying existing one
       auto const f01nsamples = compute_nsamples<Flavor01>(inputGPU.f01HEDigis.stride);
       auto const f5nsamples = compute_nsamples<Flavor5>(inputGPU.f5HBDigis.stride);
       auto const f3nsamples = compute_nsamples<Flavor3>(inputGPU.f3HBDigis.stride);
-      assert(f01nsamples == f5nsamples && f01nsamples == f3nsamples);
-
-      dim3 threadsPerBlock{f01nsamples, configParameters.kprep1dChannelsPerBlock};
+      int constexpr windowSize = 8;
+      int const startingSample = f01nsamples - windowSize;
+      assert(startingSample==0 || startingSample==2);
+      if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f5HBDigis.stride> 0)
+          assert(f01nsamples == f5nsamples);
+      if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f3HBDigis.stride > 0)
+          assert(f01nsamples == f3nsamples);
+
+      dim3 threadsPerBlock{windowSize, configParameters.kprep1dChannelsPerBlock};
       int blocks = static_cast<uint32_t>(threadsPerBlock.y) > totalChannels
                        ? 1
                        : (totalChannels + threadsPerBlock.y - 1) / threadsPerBlock.y;
       int nbytesShared =
-          ((2 * f01nsamples + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock;
+          ((2 * windowSize + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock;
       kernel_prep1d_sameNumberOfSamples<<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
           scratch.amplitudes.get(),
           scratch.noiseTerms.get(),
@@ -1284,13 +1292,14 @@ namespace hcal {
           configParameters.sipmQNTStoSum,
           configParameters.firstSampleShift,
           conditions.offsetForHashes,
-          configParameters.ts4Thresh);
+          configParameters.ts4Thresh,
+          startingSample);
       cudaCheck(cudaGetLastError());
 
       // 1024 is the max threads per block for gtx1080
       // FIXME: take this from cuda service or something like that
-      uint32_t const channelsPerBlock = 1024 / (f01nsamples * conditions.pulseOffsetsHost.size());
-      dim3 threadsPerBlock2{f01nsamples, static_cast<uint32_t>(conditions.pulseOffsetsHost.size()), channelsPerBlock};
+      uint32_t const channelsPerBlock = 1024 / (windowSize * conditions.pulseOffsetsHost.size());
+      dim3 threadsPerBlock2{windowSize, static_cast<uint32_t>(conditions.pulseOffsetsHost.size()), channelsPerBlock};
       int blocks2 =
           threadsPerBlock2.z > totalChannels ? 1 : (totalChannels + threadsPerBlock2.z - 1) / threadsPerBlock2.z;
 
@@ -1342,7 +1351,8 @@ namespace hcal {
           configParameters.tmaxTimeSlew);
       cudaCheck(cudaGetLastError());
 
-      if (f01nsamples == 8 && conditions.pulseOffsetsHost.size() == 8u) {
+      // number of samples is checked in above assert
+      if (conditions.pulseOffsetsHost.size() == 8u) {
         // FIXME: provide constants from configuration
         uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
         uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;

From d4c41369af7120b597b14ea1368a24b8f6b10769 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 10 Aug 2020 18:41:50 +0200
Subject: [PATCH 22/34] Apply code checks and code format (cms-patatrack#532)

---
 .../plugins/HcalDigisProducerGPU.cc           | 21 ++++++++-----------
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 13 ++++++------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
index d5b6d42d0ed2c..b841e05f5938c 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -117,39 +117,35 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
   event.getByToken(hbheDigiToken_, hbheDigis);
   event.getByToken(qie11DigiToken_, qie11Digis);
 
-
   // init f5 collection
-  if (hbheDigis->size() > 0) {
+  if (not hbheDigis->empty()) {
     auto const nsamples = (*hbheDigis)[0].size();
     auto const stride = hcal::compute_stride<hcal::Flavor5>(nsamples);
     hf5_.stride = stride;
-  
+
     // flavor5 get device blobs
     df5_.stride = stride;
-    df5_.data = cms::cuda::make_device_unique<uint16_t[]>(
-      config_.maxChannelsF5HB * stride, ctx.stream());
+    df5_.data = cms::cuda::make_device_unique<uint16_t[]>(config_.maxChannelsF5HB * stride, ctx.stream());
     df5_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF5HB, ctx.stream());
     df5_.npresamples = cms::cuda::make_device_unique<uint8_t[]>(config_.maxChannelsF5HB, ctx.stream());
   }
 
-  if (qie11Digis->size() > 0) {
+  if (not qie11Digis->empty()) {
     auto const nsamples = qie11Digis->samples();
     auto const stride01 = hcal::compute_stride<hcal::Flavor01>(nsamples);
     auto const stride3 = hcal::compute_stride<hcal::Flavor3>(nsamples);
 
     hf01_.stride = stride01;
     hf3_.stride = stride3;
-  
+
     // flavor 0/1 get devie blobs
     df01_.stride = stride01;
-    df01_.data = cms::cuda::make_device_unique<uint16_t[]>(
-      config_.maxChannelsF01HE * stride01, ctx.stream());
+    df01_.data = cms::cuda::make_device_unique<uint16_t[]>(config_.maxChannelsF01HE * stride01, ctx.stream());
     df01_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF01HE, ctx.stream());
 
     // flavor3 get device blobs
     df3_.stride = stride3;
-    df3_.data = cms::cuda::make_device_unique<uint16_t[]>(
-      config_.maxChannelsF3HB * stride3, ctx.stream());
+    df3_.data = cms::cuda::make_device_unique<uint16_t[]>(config_.maxChannelsF3HB * stride3, ctx.stream());
     df3_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF3HB, ctx.stream());
   }
 
@@ -199,7 +195,8 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
   }
 
   auto lambdaToTransfer = [&ctx](auto* dest, auto const& src) {
-    if (src.size() == 0) return;
+    if (src.empty())
+      return;
     using vector_type = typename std::remove_reference<decltype(src)>::type;
     using type = typename vector_type::value_type;
     using dest_data_type = typename std::remove_pointer<decltype(dest)>::type;
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index 8796e6c6396fa..f4f19511899c8 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -125,7 +125,7 @@ namespace hcal {
                                                                         : compute_nsamples<Flavor3>(stride));
 
 #ifdef HCAL_MAHI_GPUDEBUG
-      assert(nsamples == nsamplesForCompute || nsamples-startingSample==nsampelsForCompute);
+      assert(nsamples == nsamplesForCompute || nsamples - startingSample == nsampelsForCompute);
 #endif
 
       auto const id = gch < nchannelsf01HE
@@ -270,7 +270,8 @@ namespace hcal {
       auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
       auto const startSampleTmp = soi + firstSampleShift;
       auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
-      auto const endSample = startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
+      auto const endSample =
+          startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
       // NOTE: gain is a small number < 10^-3, multiply it last
       auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
       auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
@@ -1224,11 +1225,11 @@ namespace hcal {
       auto const f3nsamples = compute_nsamples<Flavor3>(inputGPU.f3HBDigis.stride);
       int constexpr windowSize = 8;
       int const startingSample = f01nsamples - windowSize;
-      assert(startingSample==0 || startingSample==2);
-      if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f5HBDigis.stride> 0)
-          assert(f01nsamples == f5nsamples);
+      assert(startingSample == 0 || startingSample == 2);
+      if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f5HBDigis.stride > 0)
+        assert(f01nsamples == f5nsamples);
       if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f3HBDigis.stride > 0)
-          assert(f01nsamples == f3nsamples);
+        assert(f01nsamples == f3nsamples);
 
       dim3 threadsPerBlock{windowSize, configParameters.kprep1dChannelsPerBlock};
       int blocks = static_cast<uint32_t>(threadsPerBlock.y) > totalChannels

From 92a8ce488e0829670a13177cd120e328c97943e6 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 26 Aug 2020 21:43:02 +0200
Subject: [PATCH 23/34] HcalDigisProducerGPU: preallocate pinned host memory
 only if CUDA is available (cms-patatrack#543)

---
 .../HcalRawToDigi/plugins/HcalDigisProducerGPU.cc   | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
index b841e05f5938c..fdf9e6e704874 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -92,12 +92,17 @@ HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
 
   // this is a preallocation for the max statically known number of time samples
   // actual stride/nsamples will be inferred from data
-  hf5_.stride = hcal::compute_stride<hcal::Flavor5>(HBHEDataFrame::MAXSAMPLES);
   hf01_.stride = hcal::compute_stride<hcal::Flavor01>(QIE11DigiCollection::MAXSAMPLES);
+  hf5_.stride = hcal::compute_stride<hcal::Flavor5>(HBHEDataFrame::MAXSAMPLES);
   hf3_.stride = hcal::compute_stride<hcal::Flavor3>(QIE11DigiCollection::MAXSAMPLES);
-  hf01_.reserve(config_.maxChannelsF01HE);
-  hf5_.reserve(config_.maxChannelsF5HB);
-  hf3_.reserve(config_.maxChannelsF3HB);
+
+  // preallocate pinned host memory only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    hf01_.reserve(config_.maxChannelsF01HE);
+    hf5_.reserve(config_.maxChannelsF5HB);
+    hf3_.reserve(config_.maxChannelsF3HB);
+  }
 }
 
 void HcalDigisProducerGPU::acquire(edm::Event const& event,

From b7c931920afbd3ccf33493db7f204173ab723f89 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 20 Oct 2020 09:25:14 +0200
Subject: [PATCH 24/34] Move multifit/MAHI common code to
 DataFormats/CaloRecHit (cms-patatrack#557)

Move multifit/MAHI common code to DataFormats/CaloRecHit/interface/MultifitComputations.h.
Improve naming and description of fnnls parameters.
Use Eigen preprocessor symbols instead of explicit CUDA keywords, and CUDA preprocessor symbols to protect CUDA-only functions.

Co-authored-by: Andrea Bocci <andrea.bocci@cern.ch>
---
 .../HcalRecProducers/src/KernelHelpers.h      |   4 +-
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 150 +-----------------
 2 files changed, 4 insertions(+), 150 deletions(-)

diff --git a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
index b0447b1600b9b..1fd5cb0fc387a 100644
--- a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
+++ b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
@@ -10,8 +10,6 @@
 namespace hcal {
   namespace reconstruction {
 
-    constexpr int32_t IPHI_MAX = 72;
-
     // this is from HcalTimeSlew.
     // HcalTimeSlew are values that come in from ESProducer that takes them
     // from a python config. see DeclsForKernels for more explanation
@@ -64,7 +62,7 @@ namespace hcal {
     // 2 funcs below are taken from HcalTopology (reimplemented here).
     // Inputs are constants that are also taken from HcalTopology
     // but passed to the kernel as arguments using the HclaTopology itself
-    //    constexpr int32_t IPHI_MAX = 72;
+    constexpr int32_t IPHI_MAX = 72;
 
     __forceinline__ __device__ uint32_t did2linearIndexHB(
         uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) {
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index f4f19511899c8..5b3ad6693f27e 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -1,7 +1,7 @@
 #include <Eigen/Dense>
 
 #include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
-#include "DataFormats/Math/interface/EigenComputations.h"
+#include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
 
 // nvcc not able to parse this guy (whatever is inlcuded from it)....
 //#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h"
@@ -620,151 +620,6 @@ namespace hcal {
       pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
     }
 
-    // TODO: add active bxs
-    template <typename MatrixType, typename VectorType>
-    __device__ void fnnls(MatrixType const& AtA,
-                          VectorType const& Atb,
-                          VectorType& solution,
-                          int& npassive,
-                          calo::multifit::ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
-                          calo::multifit::MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
-                          double const eps,
-                          int const maxIterations) {
-      // constants
-      constexpr auto NPULSES = VectorType::RowsAtCompileTime;
-
-      // to keep track of where to terminate if converged
-      Eigen::Index w_max_idx_prev = 0;
-      float w_max_prev = 0;
-      auto eps_to_use = eps;
-      bool recompute = false;
-
-      // used throughout
-      VectorType s;
-      float reg_b[NPULSES];
-      //float matrixLStorage[MapSymM<float, NPULSES>::total];
-      //MapSymM<float, NPULSES> matrixL{matrixLStorage};
-
-      int iter = 0;
-      while (true) {
-        if (iter > 0 || npassive == 0) {
-          auto const nactive = NPULSES - npassive;
-          // exit if there are no more pulses to constrain
-          if (nactive == 0)
-            break;
-
-          // compute the gradient
-          //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive);
-          Eigen::Index w_max_idx;
-          float w_max = -std::numeric_limits<float>::max();
-          for (int icol = npassive; icol < NPULSES; icol++) {
-            auto const icol_real = pulseOffsets(icol);
-            auto const atb = Atb(icol_real);
-            float sum = 0;
-#pragma unroll
-            for (int counter = 0; counter < NPULSES; counter++)
-              sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter)
-                                         : AtA(icol_real, counter) * solution(counter);
-
-            auto const w = atb - sum;
-            if (w > w_max) {
-              w_max = w;
-              w_max_idx = icol - npassive;
-            }
-          }
-
-          // check for convergence
-          if (w_max < eps_to_use || w_max_idx == w_max_idx_prev && w_max == w_max_prev)
-            break;
-
-          if (iter >= maxIterations)
-            break;
-
-          w_max_prev = w_max;
-          w_max_idx_prev = w_max_idx;
-
-          // move index to the right part of the vector
-          w_max_idx += npassive;
-
-          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx));
-          ++npassive;
-        }
-
-        // inner loop
-        while (true) {
-          if (npassive == 0)
-            break;
-
-          //s.head(npassive)
-          //auto const& matrixL =
-          //    AtA.topLeftCorner(npassive, npassive)
-          //        .llt().matrixL();
-          //.solve(Atb.head(npassive));
-          if (recompute || iter == 0)
-            calo::multifit::compute_decomposition_forwardsubst_with_offsets(
-                matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
-          else
-            calo::multifit::update_decomposition_forwardsubst_with_offsets(
-                matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
-
-          // run backward substituion
-          s(npassive - 1) = reg_b[npassive - 1] / matrixL(npassive - 1, npassive - 1);
-          for (int i = npassive - 2; i >= 0; --i) {
-            float total = 0;
-            for (int j = i + 1; j < npassive; j++)
-              total += matrixL(j, i) * s(j);
-
-            s(i) = (reg_b[i] - total) / matrixL(i, i);
-          }
-
-          // done if solution values are all positive
-          if (s.head(npassive).minCoeff() > 0.f) {
-            for (int i = 0; i < npassive; i++) {
-              auto const i_real = pulseOffsets(i);
-              solution(i_real) = s(i);
-            }
-            //solution.head(npassive) = s.head(npassive);
-            recompute = false;
-            break;
-          }
-
-          // there were negative values -> have to recompute the whole decomp
-          recompute = true;
-
-          auto alpha = std::numeric_limits<float>::max();
-          Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
-          for (int i = 0; i < npassive; i++) {
-            if (s[i] <= 0.) {
-              auto const i_real = pulseOffsets(i);
-              auto const ratio = solution[i_real] / (solution[i_real] - s[i]);
-              if (ratio < alpha) {
-                alpha = ratio;
-                alpha_idx = i;
-                alpha_idx_real = i_real;
-              }
-            }
-          }
-
-          // upadte solution
-          for (int i = 0; i < npassive; i++) {
-            auto const i_real = pulseOffsets(i);
-            solution(i_real) += alpha * (s(i) - solution(i_real));
-          }
-          //solution.head(npassive) += alpha *
-          //    (s.head(npassive) - solution.head(npassive));
-          solution[alpha_idx_real] = 0;
-          --npassive;
-
-          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx));
-        }
-
-        // as in cpu
-        ++iter;
-        if (iter % 10 == 0)
-          eps_to_use *= 10;
-      }
-    }
-
     template <int NSAMPLES, int NPULSES>
     __forceinline__ __device__ void update_covariance(
         calo::multifit::ColumnVector<NPULSES> const& resultAmplitudesVector,
@@ -1086,7 +941,8 @@ namespace hcal {
 
         // run fast nnls
         // FIXME: provide values from config
-        fnnls(AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, 1e-11, 500);
+        calo::multifit::fnnls(
+            AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, 1e-11, 500, 10, 10);
 
 #ifdef HCAL_MAHI_GPUDEBUG
         printf("result Amplitudes\n");

From e9a3c6d98a40e55c8fe1647b638196c4fe7fa76a Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 6 Nov 2020 22:43:55 +0100
Subject: [PATCH 25/34] Address HCAL review comments regarding
 CondFormats/HcalObjects (cms-patatrack#565)

Move conditions from RecoLocalCalo/HcalRecAlgos to CondFormats/HcalObjects.
Rename HCAL digi Flavor01 to Flavor1, and drop unused Flavor2 and Flavor4.
Update include paths and HCAL namespaces.
---
 .../HcalDigi/interface/DigiCollection.h       | 22 +++-----
 CUDADataFormats/HcalDigi/src/classes_def.xml  | 32 +++--------
 .../makeHcalRaw2DigiGpuValidationPlots.cpp    | 14 ++---
 .../HcalRawToDigi/plugins/DeclsForKernels.h   |  6 +--
 .../HcalRawToDigi/plugins/DecodeGPU.cu        |  2 +-
 EventFilter/HcalRawToDigi/plugins/DecodeGPU.h |  2 +-
 .../plugins/HcalCPUDigisProducer.cc           |  4 +-
 .../plugins/HcalDigisProducerGPU.cc           | 12 ++---
 .../HcalRawToDigi/plugins/HcalRawToDigiGPU.cc |  4 +-
 RecoLocalCalo/HcalRecAlgos/BuildFile.xml      |  1 -
 .../HcalRecProducers/src/DeclsForKernels.h    | 54 ++++++++++++-------
 .../src/HBHERecHitProducerGPU.cc              | 51 ++++--------------
 .../src/HcalESProducersGPUDefs.cc             | 36 ++++++-------
 .../HcalRecProducers/src/KernelHelpers.h      | 14 ++---
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 45 ++++++++++------
 .../HcalRecProducers/src/SimpleAlgoGPU.h      | 19 +++++++
 16 files changed, 148 insertions(+), 170 deletions(-)
 create mode 100644 RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h

diff --git a/CUDADataFormats/HcalDigi/interface/DigiCollection.h b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
index 10350b52d4c52..af4f76b775ec1 100644
--- a/CUDADataFormats/HcalDigi/interface/DigiCollection.h
+++ b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
@@ -5,48 +5,38 @@
 
 namespace hcal {
 
-  struct Flavor01 {
+  // FLAVOR_HE_QIE11 = 1; Phase1 upgrade
+  struct Flavor1 {
     using adc_type = uint8_t;
     using tdc_type = uint8_t;
     using soibit_type = uint8_t;
 
     static constexpr int WORDS_PER_SAMPLE = 1;
+    static constexpr int SAMPLES_PER_WORD = 1;
     static constexpr int HEADER_WORDS = 1;
 
     static constexpr adc_type adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
-
     static constexpr tdc_type tdc(uint16_t const* const sample_start) { return (*sample_start >> 8) & 0x3f; }
-
     static constexpr soibit_type soibit(uint16_t const* const sample_start) { return (*sample_start >> 14) & 0x1; }
   };
 
-  struct Flavor2 {
-    static constexpr int WORDS_PER_SAMPLE = 2;
-    static constexpr int HEADER_WORDS = 1;
-  };
-
+  // FLAVOR_HB_QIE11 = 3; Phase1 upgrade
   struct Flavor3 {
     using adc_type = uint8_t;
     using tdc_type = uint8_t;
     using soibit_type = uint8_t;
 
     static constexpr int WORDS_PER_SAMPLE = 1;
+    static constexpr int SAMPLES_PER_WORD = 1;
     static constexpr int HEADER_WORDS = 1;
 
     static constexpr adc_type adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
-
     static constexpr tdc_type tdc(uint16_t const* const sample_start) { return ((*sample_start >> 8) & 0x3); }
-
     static constexpr soibit_type soibit(uint16_t const* const sample_start) { return ((*sample_start >> 14) & 0x1); }
-
     static constexpr uint8_t capid(uint16_t const* const sample_start) { return ((*sample_start >> 10) & 0x3); }
   };
 
-  struct Flavor4 {
-    static constexpr int WORDS_PER_SAMPLE = 1;
-    static constexpr int HEADER_WORDS = 1;
-  };
-
+  // FLAVOR_HB_QIE10 = 5; Phase0
   struct Flavor5 {
     using adc_type = uint8_t;
 
diff --git a/CUDADataFormats/HcalDigi/src/classes_def.xml b/CUDADataFormats/HcalDigi/src/classes_def.xml
index 3291c7f6d22ec..71997eb59ba61 100644
--- a/CUDADataFormats/HcalDigi/src/classes_def.xml
+++ b/CUDADataFormats/HcalDigi/src/classes_def.xml
@@ -2,51 +2,35 @@
     <class name="hcal::DigiCollectionBase<calo::common::VecStoragePolicy<std::allocator>>" />
     <class name="hcal::DigiCollectionBase<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
 
-    <class name="hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<std::allocator>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor2, calo::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<std::allocator>>" />
     <class name="hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<std::allocator>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor4, calo::common::VecStoragePolicy<std::allocator>>" />
     <class name="hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<std::allocator>>" />
             
-    <class name="hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor2, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
     <class name="hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
-    <class name="hcal::DigiCollection<hcal::Flavor4, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
     <class name="hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
 
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::ViewStoragePolicy>>" persistent="false" />
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::ViewStoragePolicy>>" persistent="false" />
     <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::ViewStoragePolicy>>" persistent="false" />
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, calo::common::ViewStoragePolicy>>" persistent="false" />
     <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::ViewStoragePolicy>>" persistent="false" />
             
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>" persistent="false" />
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>" persistent="false" />
     <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>" persistent="false" />
-    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, calo::common::DevStoragePolicy>>" persistent="false" />
     <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>" persistent="false" />
             
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::ViewStoragePolicy>>>" persistent="false" />
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, calo::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::ViewStoragePolicy>>>" persistent="false" />
     <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::ViewStoragePolicy>>>" persistent="false" />
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, calo::common::ViewStoragePolicy>>>" persistent="false" />
     <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::ViewStoragePolicy>>>" persistent="false" />
                 
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>>" persistent="false" />
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor2, calo::common::DevStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>>" persistent="false" />
     <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>>" persistent="false" />
-    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor4, calo::common::DevStoragePolicy>>>" persistent="false" />
     <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>>" persistent="false" />
 
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<std::allocator>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor2, calo::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<std::allocator>>>" />
     <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<std::allocator>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor4, calo::common::VecStoragePolicy<std::allocator>>>" />
     <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<std::allocator>>>" />
                 
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor2, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
     <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
-    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor4, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
     <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
 </lcgdict>
diff --git a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
index 94a43892e08b6..591da39b704f6 100644
--- a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
+++ b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
@@ -71,7 +71,7 @@ int main(int argc, char* argv[]) {
 
   // branches to use
   using Collectionf01 =
-      hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+      hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   using Collectionf5 =
       hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   using Collectionf3 =
@@ -134,7 +134,7 @@ int main(int argc, char* argv[]) {
       "f5HBDigis_RECO.",
       &wgpuf5hb);
   rt->SetBranchAddress(
-      "hcalFlavor01calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "hcalFlavor1calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
       "f01HEDigis_RECO.",
       &wgpuf01he);
   rt->SetBranchAddress(
@@ -194,7 +194,7 @@ int main(int argc, char* argv[]) {
         assert(*iter2idgpu == cpuid);
 
         auto const ptrdiff = iter2idgpu - idsgpu.begin();
-        auto const nsamples_gpu = hcal::compute_nsamples<hcal::Flavor01>(f01HEProduct.stride);
+        auto const nsamples_gpu = hcal::compute_nsamples<hcal::Flavor1>(f01HEProduct.stride);
         auto const nsamples_cpu = qie11Filteredf01.samples();
         assert(static_cast<uint32_t>(nsamples_cpu) == nsamples_gpu);
 
@@ -203,11 +203,11 @@ int main(int argc, char* argv[]) {
         uint16_t const* df_start = datagpu.data() + offset;
         for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
           auto const cpuadc = cpudf[sample].adc();
-          auto const gpuadc = hcal::adc_for_sample<hcal::Flavor01>(df_start, sample);
+          auto const gpuadc = hcal::adc_for_sample<hcal::Flavor1>(df_start, sample);
           auto const cputdc = cpudf[sample].tdc();
-          auto const gputdc = hcal::tdc_for_sample<hcal::Flavor01>(df_start, sample);
+          auto const gputdc = hcal::tdc_for_sample<hcal::Flavor1>(df_start, sample);
           auto const cpucapid = cpudf[sample].capid();
-          auto const gpucapid = hcal::capid_for_sample<hcal::Flavor01>(df_start, sample);
+          auto const gpucapid = hcal::capid_for_sample<hcal::Flavor1>(df_start, sample);
 
           hADCf01HEGPU->Fill(gpuadc);
           hADCf01HECPU->Fill(cpuadc);
@@ -306,7 +306,7 @@ int main(int argc, char* argv[]) {
           auto const cpuadc = cpudf.sample(sample).adc();
           auto const gpuadc = hcal::adc_for_sample<hcal::Flavor5>(df_start, sample);
           auto const cpucapid = cpudf.sample(sample).capid();
-          auto const gpucapid = hcal::capid_for_sample<hcal::Flavor01>(df_start, sample);
+          auto const gpucapid = hcal::capid_for_sample<hcal::Flavor1>(df_start, sample);
 
           hADCf5HBGPU->Fill(gpuadc);
           hADCf5HBCPU->Fill(cpuadc);
diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
index 9eb0670f60d59..9f89e3807402c 100644
--- a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
+++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
@@ -51,16 +51,16 @@ namespace hcal {
     };
 
     struct OutputDataGPU {
-      DigiCollection<Flavor01, ::calo::common::DevStoragePolicy> digisF01HE;
+      DigiCollection<Flavor1, ::calo::common::DevStoragePolicy> digisF01HE;
       DigiCollection<Flavor5, ::calo::common::DevStoragePolicy> digisF5HB;
       DigiCollection<Flavor3, ::calo::common::DevStoragePolicy> digisF3HB;
 
       void allocate(ConfigurationParameters const &config, cudaStream_t cudaStream) {
         digisF01HE.data = cms::cuda::make_device_unique<uint16_t[]>(
-            config.maxChannelsF01HE * compute_stride<Flavor01>(config.nsamplesF01HE), cudaStream);
+            config.maxChannelsF01HE * compute_stride<Flavor1>(config.nsamplesF01HE), cudaStream);
         //cudaCheck(
         //    cudaMalloc((void **)&digisF01HE.data,
-        //               config.maxChannelsF01HE * sizeof(uint16_t) * compute_stride<Flavor01>(config.nsamplesF01HE)));
+        //               config.maxChannelsF01HE * sizeof(uint16_t) * compute_stride<Flavor1>(config.nsamplesF01HE)));
         digisF01HE.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF01HE, cudaStream);
         //cudaCheck(cudaMalloc((void **)&digisF01HE.ids, sizeof(uint32_t) * config.maxChannelsF01HE));
 
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
index ab1f7134277be..97b4c4f07f497 100644
--- a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
@@ -316,7 +316,7 @@ namespace hcal {
               uint32_t const nwords = channel_end - channel_header_word;
 
               // filter out this digi if nwords does not equal expected
-              auto const expected_words = compute_stride<Flavor01>(nsamplesF01HE);
+              auto const expected_words = compute_stride<Flavor1>(nsamplesF01HE);
               if (nwords != expected_words)
                 break;
 
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
index 97af639b61a5e..3d5e4eec32269 100644
--- a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
@@ -1,7 +1,7 @@
 #ifndef EventFilter_HcalRawToDigi_interface_DecodeGPU_h
 #define EventFilter_HcalRawToDigi_interface_DecodeGPU_h
 
-#include "EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h"
+#include "DeclsForKernels.h"
 
 namespace hcal {
   namespace raw {
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
index a45d5d44adcd2..c2b67a10afaff 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
@@ -23,7 +23,7 @@ class HcalCPUDigisProducer : public edm::stream::EDProducer<edm::ExternalWork> {
   void produce(edm::Event&, edm::EventSetup const&) override;
 
 private:
-  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>;
+  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductTypef01> digisF01HETokenIn_;
   using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductTypef5> digisF5HBTokenIn_;
@@ -31,7 +31,7 @@ class HcalCPUDigisProducer : public edm::stream::EDProducer<edm::ExternalWork> {
   edm::EDGetTokenT<IProductTypef3> digisF3HBTokenIn_;
 
   using OProductTypef01 =
-      hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+      hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   edm::EDPutTokenT<OProductTypef01> digisF01HETokenOut_;
   using OProductTypef5 =
       hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
index fdf9e6e704874..9ca33340f7036 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -29,8 +29,8 @@ class HcalDigisProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 
   // type aliases
   using HostCollectionf01 =
-      hcal::DigiCollection<hcal::Flavor01, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
-  using DeviceCollectionf01 = hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>;
+      hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf01 = hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>;
   using HostCollectionf5 =
       hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
   using DeviceCollectionf5 = hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>;
@@ -92,7 +92,7 @@ HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
 
   // this is a preallocation for the max statically known number of time samples
   // actual stride/nsamples will be inferred from data
-  hf01_.stride = hcal::compute_stride<hcal::Flavor01>(QIE11DigiCollection::MAXSAMPLES);
+  hf01_.stride = hcal::compute_stride<hcal::Flavor1>(QIE11DigiCollection::MAXSAMPLES);
   hf5_.stride = hcal::compute_stride<hcal::Flavor5>(HBHEDataFrame::MAXSAMPLES);
   hf3_.stride = hcal::compute_stride<hcal::Flavor3>(QIE11DigiCollection::MAXSAMPLES);
 
@@ -137,7 +137,7 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
 
   if (not qie11Digis->empty()) {
     auto const nsamples = qie11Digis->samples();
-    auto const stride01 = hcal::compute_stride<hcal::Flavor01>(nsamples);
+    auto const stride01 = hcal::compute_stride<hcal::Flavor1>(nsamples);
     auto const stride3 = hcal::compute_stride<hcal::Flavor3>(nsamples);
 
     hf01_.stride = stride01;
@@ -181,10 +181,10 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event,
         continue;
       auto const id = digi.detid().rawId();
       hf01_.ids.push_back(id);
-      for (int hw = 0; hw < hcal::Flavor01::HEADER_WORDS; hw++)
+      for (int hw = 0; hw < hcal::Flavor1::HEADER_WORDS; hw++)
         hf01_.data.push_back((*qie11Digis)[i][hw]);
       for (int sample = 0; sample < digi.samples(); sample++) {
-        hf01_.data.push_back((*qie11Digis)[i][hcal::Flavor01::HEADER_WORDS + sample]);
+        hf01_.data.push_back((*qie11Digis)[i][hcal::Flavor1::HEADER_WORDS + sample]);
       }
     } else if (digi.flavor() == 3) {
       if (digi.detid().subdetId() != HcalBarrel)
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
index f1b5ef6885a04..7e8388a5f4d2f 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
@@ -30,7 +30,7 @@ class HcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 
 private:
   edm::EDGetTokenT<FEDRawDataCollection> rawDataToken_;
-  using ProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>;
+  using ProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>;
   edm::EDPutTokenT<ProductTypef01> digisF01HEToken_;
   using ProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>;
   edm::EDPutTokenT<ProductTypef5> digisF5HBToken_;
@@ -185,7 +185,7 @@ void HcalRawToDigiGPU::produce(edm::Event& event, edm::EventSetup const& setup)
   outputGPU_.digisF01HE.size = nchannelsF01HE;
   outputGPU_.digisF5HB.size = nchannelsF5HB;
   outputGPU_.digisF3HB.size = nchannelsF3HB;
-  outputGPU_.digisF01HE.stride = hcal::compute_stride<hcal::Flavor01>(config_.nsamplesF01HE);
+  outputGPU_.digisF01HE.stride = hcal::compute_stride<hcal::Flavor1>(config_.nsamplesF01HE);
   outputGPU_.digisF5HB.stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
   outputGPU_.digisF3HB.stride = hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB);
 
diff --git a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
index 8b00269aa4d9a..2c2ee20aff7d5 100644
--- a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
@@ -10,7 +10,6 @@
 <use name="CalibCalorimetry/HcalAlgos"/>
 <use name="CalibFormats/CaloObjects"/>
 <use name="CalibFormats/HcalObjects"/>
-<use name="CondFormats/DataRecord"/>
 <use name="DataFormats/HcalDigi"/>
 <use name="DataFormats/HcalRecHit"/>
 <use name="DataFormats/TrackReco"/>
diff --git a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
index 38503cec7e76f..b472ab1cec087 100644
--- a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
+++ b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
@@ -10,27 +10,43 @@
 #include "Geometry/CaloTopology/interface/HcalTopology.h"
 #include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalLUTCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIECodersGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIETypesGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRespCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMCharacteristicsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
-
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
+#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGainsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalLUTCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIECodersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIETypesGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristicsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h"
+
+#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
+#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
+
+#include "CondFormats/HcalObjects/interface/HcalCombinedRecordsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h"
+//#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalWidthsGPU.h"
+//#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalsGPU.h"
+
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
+#include "HcalMahiPulseOffsetsGPURecord.h"
+
 namespace hcal {
-  namespace mahi {
+  namespace reconstruction {
 
     struct ConditionsProducts {
       HcalGainWidthsGPU::Product const& gainWidths;
@@ -99,12 +115,12 @@ namespace hcal {
     };
 
     struct InputDataGPU {
-      DigiCollection<Flavor01, ::calo::common::DevStoragePolicy> const& f01HEDigis;
+      DigiCollection<Flavor1, ::calo::common::DevStoragePolicy> const& f01HEDigis;
       DigiCollection<Flavor5, ::calo::common::DevStoragePolicy> const& f5HBDigis;
       DigiCollection<Flavor3, ::calo::common::DevStoragePolicy> const& f3HBDigis;
     };
 
-  }  // namespace mahi
+  }  // namespace reconstruction
 }  // namespace hcal
 
 #endif  // RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h
diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
index ea51d54822e11..e66534682f0a2 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -1,46 +1,13 @@
-#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h"
-#include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h"
-#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
-#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
-#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
 #include "FWCore/Framework/interface/stream/EDProducer.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/ServiceRegistry/interface/Service.h"
-#include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalCombinedRecordsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalLUTCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalPedestalWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIECodersGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIETypesGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRespCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMCharacteristicsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
-
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
-#include "HcalMahiPulseOffsetsGPURecord.h"
-
-#include "MahiGPU.h"
+
+#include "SimpleAlgoGPU.h"
 
 class HBHERecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -52,7 +19,7 @@ class HBHERecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork>
   void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
   void produce(edm::Event&, edm::EventSetup const&) override;
 
-  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor01, calo::common::DevStoragePolicy>>;
+  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>;
   edm::EDGetTokenT<IProductTypef01> digisTokenF01HE_;
 
   using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>;
@@ -65,8 +32,8 @@ class HBHERecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork>
   using OProductType = cms::cuda::Product<RecHitType>;
   edm::EDPutTokenT<OProductType> rechitsM0Token_;
 
-  hcal::mahi::ConfigParameters configParameters_;
-  hcal::mahi::OutputDataGPU outputGPU_;
+  hcal::reconstruction::ConfigParameters configParameters_;
+  hcal::reconstruction::OutputDataGPU outputGPU_;
   cms::cuda::ContextState cudaState_;
 };
 
@@ -149,7 +116,7 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
   auto const& f5HBDigis = ctx.get(f5HBProduct);
   auto const& f3HBDigis = ctx.get(f3HBProduct);
 
-  hcal::mahi::InputDataGPU inputGPU{f01HEDigis, f5HBDigis, f3HBDigis};
+  hcal::reconstruction::InputDataGPU inputGPU{f01HEDigis, f5HBDigis, f3HBDigis};
 
   // conditions
   edm::ESHandle<HcalRecoParamsWithPulseShapesGPU> recoParamsHandle;
@@ -220,7 +187,7 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
   auto const& pulseOffsetsProduct = pulseOffsetsHandle->getProduct(ctx.stream());
 
   // bundle up conditions
-  hcal::mahi::ConditionsProducts conditions{gainWidthsProduct,
+  hcal::reconstruction::ConditionsProducts conditions{gainWidthsProduct,
                                             gainsProduct,
                                             lutCorrsProduct,
                                             pedestalWidthsProduct,
@@ -241,7 +208,7 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
                                             pulseOffsetsHandle->getValues()};
 
   // scratch mem on device
-  hcal::mahi::ScratchDataGPU scratchGPU = {
+  hcal::reconstruction::ScratchDataGPU scratchGPU = {
       cms::cuda::make_device_unique<float[]>(configParameters_.maxChannels * configParameters_.maxTimeSamples,
                                              ctx.stream()),
       cms::cuda::make_device_unique<float[]>(configParameters_.maxChannels * configParameters_.maxTimeSamples,
@@ -261,7 +228,7 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
   // output dev mem
   outputGPU_.allocate(configParameters_, ctx.stream());
 
-  hcal::mahi::entryPoint(inputGPU, outputGPU_, conditions, scratchGPU, configParameters_, ctx.stream());
+  hcal::reconstruction::entryPoint(inputGPU, outputGPU_, conditions, scratchGPU, configParameters_, ctx.stream());
 
 #ifdef HCAL_MAHI_CPUDEBUG
   auto end = std::chrono::high_resolution_clock::now();
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
index 26556dc523e85..cc9341ca4bb97 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
@@ -26,21 +26,19 @@
 #include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
 #include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
 
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalPedestalsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalLUTCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRespCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalTimeCorrsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalPedestalWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalGainWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIECodersGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalQIETypesGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMParametersGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalSiPMCharacteristicsGPU.h"
-
-#include <iostream>
+#include "CondFormats/HcalObjects/interface/HcalPedestalsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGainsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalLUTCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIECodersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIETypesGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristicsGPU.h"
 
 using HcalRecoParamsGPUESProducer = HcalESProducerGPU<HcalRecoParamsRcd, HcalRecoParamsGPU, HcalRecoParams>;
 
@@ -86,11 +84,11 @@ DEFINE_FWK_EVENTSETUP_MODULE(HcalQIETypesGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMParametersGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMCharacteristicsGPUESProducer);
 
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalCombinedRecordsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedPedestalWidthsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalCombinedRecordsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
 
 using HcalConvertedPedestalsGPUESProducer = HcalESProducerGPUWithDependencies<HcalConvertedPedestalsRcd,
                                                                               HcalConvertedPedestalsGPU,
diff --git a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
index 1fd5cb0fc387a..5caeef3a96c2e 100644
--- a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
+++ b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
@@ -2,10 +2,6 @@
 #define RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
 
 #include "DeclsForKernels.h"
-#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
-
-// nvcc not able to parse this guy (whatever is inlcuded from it)....
-//#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h"
 
 namespace hcal {
   namespace reconstruction {
@@ -107,6 +103,9 @@ namespace hcal {
       return (center - qieOffsets[index]) / qieSlopes[index];
     }
 
+    // this is from
+    //  https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc#L140
+
     __forceinline__ __device__ float compute_diff_charge_gain(int const qieType,
                                                               uint8_t adc,
                                                               uint8_t const capid,
@@ -145,8 +144,7 @@ namespace hcal {
     }
 
     // FIXME remove duplication...
-    // this is from PulesFunctor. nvcc was complaining... if included that header...
-    //constexpr int maxSamples = 10;
+    // these constants are copied from RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h, because nvcc complains when that header is included
     constexpr int maxPSshapeBin = 256;
     constexpr int nsPerBX = 25;
     constexpr float iniTimeShift = 92.5f;
@@ -162,11 +160,8 @@ namespace hcal {
                                                                float const* accVarLenIdxZeroVec,
                                                                float const* diffVarItvlIdxZeroVec) {
       // constants
-      constexpr float pulse_height = 1.0f;
       constexpr float slew = 0.f;
       constexpr auto ns_per_bx = nsPerBX;
-      //constexpr auto num_ns = nsPerBX * maxSamples;
-      //constexpr auto num_bx = num_ns / ns_per_bx;
 
       // FIXME: clean up all the rounding... this is coming from original cpu version
       float const i_start_float =
@@ -218,7 +213,6 @@ namespace hcal {
         int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
         value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
       }
-      value *= pulse_height;
       return value;
     }
 
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index 5b3ad6693f27e..f6e9924d64916 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -1,12 +1,12 @@
 #include <Eigen/Dense>
 
-#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
 
 // nvcc not able to parse this guy (whatever is inlcuded from it)....
 //#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h"
 
-#include "MahiGPU.h"
+#include "SimpleAlgoGPU.h"
+#include "KernelHelpers.h"
 
 #ifdef HCAL_MAHI_GPUDEBUG
 #define DETID_TO_DEBUG 1125647428
@@ -15,6 +15,13 @@
 namespace hcal {
   namespace mahi {
 
+    // TODO: provide constants from configuration
+    // from RecoLocalCalo/HcalRecProducers/python/HBHEMahiParameters_cfi.py
+    constexpr int nMaxItersMin = 50;
+    constexpr int nMaxItersNNLS = 500;
+    constexpr double nnlsThresh = 1e-11;
+    constexpr float deltaChi2Threashold = 1e-3;
+
     // Assume: same number of samples for HB and HE
     // TODO: add/validate restrict (will increase #registers in use by the kernel)
     __global__ void kernel_prep1d_sameNumberOfSamples(float* amplitudes,
@@ -120,7 +127,7 @@ namespace hcal {
 
       // get event input quantities
       auto const stride = gch < nchannelsf01HE ? stridef01HE : (gch < nchannelsf015 ? stridef5HB : stridef3HB);
-      auto const nsamples = gch < nchannelsf01HE ? compute_nsamples<Flavor01>(stride)
+      auto const nsamples = gch < nchannelsf01HE ? compute_nsamples<Flavor1>(stride)
                                                  : (gch < nchannelsf015 ? compute_nsamples<Flavor5>(stride)
                                                                         : compute_nsamples<Flavor3>(stride));
 
@@ -134,12 +141,12 @@ namespace hcal {
       auto const did = HcalDetId{id};
       auto const adc =
           gch < nchannelsf01HE
-              ? adc_for_sample<Flavor01>(dataf01HE + stride * gch, sample)
+              ? adc_for_sample<Flavor1>(dataf01HE + stride * gch, sample)
               : (gch < nchannelsf015 ? adc_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
                                      : adc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
       auto const capid =
           gch < nchannelsf01HE
-              ? capid_for_sample<Flavor01>(dataf01HE + stride * gch, sample)
+              ? capid_for_sample<Flavor1>(dataf01HE + stride * gch, sample)
               : (gch < nchannelsf015 ? capid_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
                                      : capid_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
 
@@ -198,7 +205,7 @@ namespace hcal {
         // NOTE: assume that soi is high only for a single guy!
         //   which must be the case. cpu version does not check for that
         //   if that is not the case, we will see that with cuda mmecheck
-        auto const soibit = soibit_for_sample<Flavor01>(dataf01HE + stride * gch, sample);
+        auto const soibit = soibit_for_sample<Flavor1>(dataf01HE + stride * gch, sample);
         if (soibit == 1)
           soiSamples[gch] = sampleWithinWindow;
       } else if (gch >= nchannelsf015) {
@@ -252,7 +259,7 @@ namespace hcal {
         rawCharge = (charge - pedestal) * factor + pedestal;
 #ifdef COMPUTE_TDC_TIME
         if (gch < nchannelsf01HE)
-          tdcTime = HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor01>(dataf01HE + stride * gch, sample));
+          tdcTime = HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor1>(dataf01HE + stride * gch, sample));
         else if (gch >= nchannelsf015)
           tdcTime =
               HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
@@ -711,6 +718,7 @@ namespace hcal {
       // can be relaxed if needed - minor updates are needed in that case!
       static_assert(NPULSES == NSAMPLES);
 
+
       // indices
       auto const gch = threadIdx.x + blockIdx.x * blockDim.x;
       auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
@@ -765,8 +773,6 @@ namespace hcal {
       // TODO: provide this properly
       int const soi = soiSamples[gch];
       */
-      constexpr float deltaChi2Threashold = 1e-3;
-
       calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
 #pragma unroll
       for (int i = 0; i < NPULSES; ++i)
@@ -808,10 +814,10 @@ namespace hcal {
       }
 #endif
 
+
       int npassive = 0;
       float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
-      // TOOD: provide constants from configuration
-      for (int iter = 1; iter < 50; iter++) {
+      for (int iter = 1; iter < nMaxItersMin; iter++) {
         //float covarianceMatrixStorage[MapSymM<float, NSAMPLES>::total];
         // NOTE: only works when NSAMPLES == NPULSES
         // if does not hold -> slightly rearrange shared mem to still reuse
@@ -940,9 +946,8 @@ namespace hcal {
         calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
 
         // run fast nnls
-        // FIXME: provide values from config
         calo::multifit::fnnls(
-            AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, 1e-11, 500, 10, 10);
+			      AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, nnlsThresh, nMaxItersNNLS, 10, 10);
 
 #ifdef HCAL_MAHI_GPUDEBUG
         printf("result Amplitudes\n");
@@ -1060,6 +1065,12 @@ namespace hcal {
       */
     }
 
+  }
+}
+
+namespace hcal {
+  namespace reconstruction {
+
     void entryPoint(InputDataGPU const& inputGPU,
                     OutputDataGPU& outputGPU,
                     ConditionsProducts const& conditions,
@@ -1076,7 +1087,7 @@ namespace hcal {
       // TODO: this can be lifted by implementing a separate kernel
       // similar to the default one, but properly handling the diff in #sample
       // or modifying existing one
-      auto const f01nsamples = compute_nsamples<Flavor01>(inputGPU.f01HEDigis.stride);
+      auto const f01nsamples = compute_nsamples<Flavor1>(inputGPU.f01HEDigis.stride);
       auto const f5nsamples = compute_nsamples<Flavor5>(inputGPU.f5HBDigis.stride);
       auto const f3nsamples = compute_nsamples<Flavor3>(inputGPU.f3HBDigis.stride);
       int constexpr windowSize = 8;
@@ -1093,7 +1104,7 @@ namespace hcal {
                        : (totalChannels + threadsPerBlock.y - 1) / threadsPerBlock.y;
       int nbytesShared =
           ((2 * windowSize + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock;
-      kernel_prep1d_sameNumberOfSamples<<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
+      hcal::mahi::kernel_prep1d_sameNumberOfSamples<<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
           scratch.amplitudes.get(),
           scratch.noiseTerms.get(),
           outputGPU.recHits.energy.get(),
@@ -1166,7 +1177,7 @@ namespace hcal {
       std::cout << "blocks: " << blocks2 << std::endl;
 #endif
 
-      kernel_prep_pulseMatrices_sameNumberOfSamples<<<blocks2, threadsPerBlock2, 0, cudaStream>>>(
+      hcal::mahi::kernel_prep_pulseMatrices_sameNumberOfSamples<<<blocks2, threadsPerBlock2, 0, cudaStream>>>(
           scratch.pulseMatrices.get(),
           scratch.pulseMatricesM.get(),
           scratch.pulseMatricesP.get(),
@@ -1214,7 +1225,7 @@ namespace hcal {
         uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
         uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;
         auto const nbytesShared = 2 * threadsPerBlock * calo::multifit::MapSymM<float, 8>::total * sizeof(float);
-        kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
+	hcal::mahi::kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
             outputGPU.recHits.energy.get(),
             outputGPU.recHits.chi2.get(),
             scratch.amplitudes.get(),
diff --git a/RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h b/RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h
new file mode 100644
index 0000000000000..c0bb499b517a7
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h
@@ -0,0 +1,19 @@
+#ifndef RecoLocalCalo_HcalRecProducers_src_SimpleAlgoGPU_h
+#define RecoLocalCalo_HcalRecProducers_src_SimpleAlgoGPU_h
+
+#include "DeclsForKernels.h"
+
+namespace hcal {
+  namespace reconstruction {
+
+    void entryPoint(InputDataGPU const&,
+                    OutputDataGPU&,
+                    ConditionsProducts const&,
+                    ScratchDataGPU&,
+                    ConfigParameters const&,
+                    cudaStream_t);
+
+  }
+}  // namespace hcal
+
+#endif  // RecoLocalCalo_HcalRecProducers_src_SimpleAlgoGPU_h

From 8c9b251ff092835ec3e4838af5411b1980e262f5 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 6 Nov 2020 23:27:34 +0100
Subject: [PATCH 26/34] Reduce code duplication in CPU and GPU modules
 (cms-patatrack#566)

Move HCAL constants to a separate file, and update CPU and GPU code accordingly.

Add an infinite-IOV record for GPU modules configuration: as an interim approach,
some modules are using the EventSetup approach to copy complex configurations to
the GPUs; the new "JobConfigurationGPURecord" should be used for those, to make
it easier both to highlight the intent, and clean up the client code when a
better solution is found.

Replace ECAL and HCAL job configuration records with JobConfigurationGPURecord.
---
 .../src/HcalRecoParamsWithPulseShapesGPU.cc   | 40 +++++++++----------
 .../HcalRecProducers/src/DeclsForKernels.h    | 39 ++++++++----------
 .../src/HBHERecHitProducerGPU.cc              |  3 +-
 .../src/HcalMahiPulseOffsetsGPUESProducer.cc  | 14 +++----
 .../HcalRecProducers/src/KernelHelpers.h      | 19 ++++-----
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 26 +++++-------
 6 files changed, 63 insertions(+), 78 deletions(-)

diff --git a/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
index 8ae2cd1a88880..804ea328c74d4 100644
--- a/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
+++ b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
@@ -46,25 +46,25 @@ HcalRecoParamsWithPulseShapesGPU::HcalRecoParamsWithPulseShapesGPU(HcalRecoParam
       ids_[i] = newId;
 
       // resize value arrays
-      acc25nsVec_.resize(acc25nsVec_.size() + HcalConst::maxPSshapeBin);
-      diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + HcalConst::maxPSshapeBin);
-      accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + HcalConst::nsPerBX);
-      diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + HcalConst::nsPerBX);
-      accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + HcalConst::nsPerBX);
-      diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + HcalConst::nsPerBX);
+      acc25nsVec_.resize(acc25nsVec_.size() + hcal::constants::maxPSshapeBin);
+      diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + hcal::constants::maxPSshapeBin);
+      accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + hcal::constants::nsPerBX);
+      diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + hcal::constants::nsPerBX);
+      accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + hcal::constants::nsPerBX);
+      diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + hcal::constants::nsPerBX);
 
       // precompute and get values from the functor
       auto const& pulseShape = pulseShapes.getShape(pulseShapeId);
       FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, 10};
-      auto const offset256 = newId * HcalConst::maxPSshapeBin;
-      auto const offset25 = newId * HcalConst::nsPerBX;
+      auto const offset256 = newId * hcal::constants::maxPSshapeBin;
+      auto const offset25 = newId * hcal::constants::nsPerBX;
       auto const numShapes = newId;
-      for (int i = 0; i < HcalConst::maxPSshapeBin; i++) {
+      for (int i = 0; i < hcal::constants::maxPSshapeBin; i++) {
         acc25nsVec_[offset256 * numShapes + i] = functor.get_acc25nsVec()[i];
         diff25nsItvlVec_[offset256 * numShapes + i] = functor.get_diff25nsItvlVec()[i];
       }
 
-      for (int i = 0; i < HcalConst::nsPerBX; i++) {
+      for (int i = 0; i < hcal::constants::nsPerBX; i++) {
         accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.get_accVarLenIdxMinusOneVec()[i];
         diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.get_diffVarItvlIdxMinusOneVec()[i];
         accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.get_accVarLenIdxZEROVec()[i];
@@ -104,25 +104,25 @@ HcalRecoParamsWithPulseShapesGPU::HcalRecoParamsWithPulseShapesGPU(HcalRecoParam
       ids_[i + offset] = newId;
 
       // resize value arrays
-      acc25nsVec_.resize(acc25nsVec_.size() + HcalConst::maxPSshapeBin);
-      diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + HcalConst::maxPSshapeBin);
-      accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + HcalConst::nsPerBX);
-      diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + HcalConst::nsPerBX);
-      accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + HcalConst::nsPerBX);
-      diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + HcalConst::nsPerBX);
+      acc25nsVec_.resize(acc25nsVec_.size() + hcal::constants::maxPSshapeBin);
+      diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + hcal::constants::maxPSshapeBin);
+      accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + hcal::constants::nsPerBX);
+      diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + hcal::constants::nsPerBX);
+      accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + hcal::constants::nsPerBX);
+      diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + hcal::constants::nsPerBX);
 
       // precompute and get values from the functor
       auto const& pulseShape = pulseShapes.getShape(pulseShapeId);
       FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, 10};
-      auto const offset256 = newId * HcalConst::maxPSshapeBin;
-      auto const offset25 = newId * HcalConst::nsPerBX;
+      auto const offset256 = newId * hcal::constants::maxPSshapeBin;
+      auto const offset25 = newId * hcal::constants::nsPerBX;
       auto const numShapes = newId;
-      for (int i = 0; i < HcalConst::maxPSshapeBin; i++) {
+      for (int i = 0; i < hcal::constants::maxPSshapeBin; i++) {
         acc25nsVec_[offset256 * numShapes + i] = functor.get_acc25nsVec()[i];
         diff25nsItvlVec_[offset256 * numShapes + i] = functor.get_diff25nsItvlVec()[i];
       }
 
-      for (int i = 0; i < HcalConst::nsPerBX; i++) {
+      for (int i = 0; i < hcal::constants::nsPerBX; i++) {
         accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.get_accVarLenIdxMinusOneVec()[i];
         diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.get_diffVarItvlIdxMinusOneVec()[i];
         accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.get_accVarLenIdxZEROVec()[i];
diff --git a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
index b472ab1cec087..ede0150e15f0a 100644
--- a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
+++ b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
@@ -7,23 +7,6 @@
 #include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
 #include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
 #include "CalibCalorimetry/HcalAlgos/interface/HcalTimeSlew.h"
-#include "Geometry/CaloTopology/interface/HcalTopology.h"
-#include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalGainsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalLUTCorrsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalQIECodersGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalQIETypesGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristicsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h"
-
 #include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
@@ -34,16 +17,26 @@
 #include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
 #include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
-
 #include "CondFormats/HcalObjects/interface/HcalCombinedRecordsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h"
-//#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalWidthsGPU.h"
-//#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalsGPU.h"
-
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGainsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalLUTCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIECodersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIETypesGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristicsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h"
+#include "Geometry/CaloTopology/interface/HcalTopology.h"
+#include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
-#include "HcalMahiPulseOffsetsGPURecord.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
 
 namespace hcal {
   namespace reconstruction {
diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
index e66534682f0a2..5005fd383a0ed 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -4,6 +4,7 @@
 #include "FWCore/Framework/interface/stream/EDProducer.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 
@@ -183,7 +184,7 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
   auto const& sipmCharacteristicsProduct = sipmCharacteristicsHandle->getProduct(ctx.stream());
 
   edm::ESHandle<HcalMahiPulseOffsetsGPU> pulseOffsetsHandle;
-  setup.get<HcalMahiPulseOffsetsGPURecord>().get(pulseOffsetsHandle);
+  setup.get<JobConfigurationGPURecord>().get(pulseOffsetsHandle);
   auto const& pulseOffsetsProduct = pulseOffsetsHandle->getProduct(ctx.stream());
 
   // bundle up conditions
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
index c31781078d711..7acb8a95cc29c 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
@@ -6,18 +6,16 @@
 #include "FWCore/Framework/interface/ESProducer.h"
 #include "FWCore/Framework/interface/ESProductHost.h"
 #include "FWCore/Framework/interface/ESTransientHandle.h"
+#include "FWCore/Framework/interface/EventSetupRecordIntervalFinder.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
 #include "FWCore/Framework/interface/ModuleFactory.h"
-#include "FWCore/Framework/interface/EventSetupRecordIntervalFinder.h"
+#include "FWCore/Framework/interface/SourceFactory.h"
 #include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Utilities/interface/ReusableObjectHolder.h"
 #include "FWCore/Utilities/interface/typelookup.h"
-
+#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
-#include "HcalMahiPulseOffsetsGPURecord.h"
-
-#include "FWCore/Framework/interface/SourceFactory.h"
 
 class HcalMahiPulseOffsetsGPUESProducer : public edm::ESProducer, public edm::EventSetupRecordIntervalFinder {
 public:
@@ -25,7 +23,7 @@ class HcalMahiPulseOffsetsGPUESProducer : public edm::ESProducer, public edm::Ev
   ~HcalMahiPulseOffsetsGPUESProducer() override = default;
 
   static void fillDescriptions(edm::ConfigurationDescriptions&);
-  std::unique_ptr<HcalMahiPulseOffsetsGPU> produce(HcalMahiPulseOffsetsGPURecord const&);
+  std::unique_ptr<HcalMahiPulseOffsetsGPU> produce(JobConfigurationGPURecord const&);
 
 protected:
   void setIntervalFor(const edm::eventsetup::EventSetupRecordKey&,
@@ -38,7 +36,7 @@ class HcalMahiPulseOffsetsGPUESProducer : public edm::ESProducer, public edm::Ev
 
 HcalMahiPulseOffsetsGPUESProducer::HcalMahiPulseOffsetsGPUESProducer(edm::ParameterSet const& pset) : pset_{pset} {
   setWhatProduced(this);
-  findingRecord<HcalMahiPulseOffsetsGPURecord>();
+  findingRecord<JobConfigurationGPURecord>();
 }
 
 void HcalMahiPulseOffsetsGPUESProducer::setIntervalFor(const edm::eventsetup::EventSetupRecordKey& iKey,
@@ -54,7 +52,7 @@ void HcalMahiPulseOffsetsGPUESProducer::fillDescriptions(edm::ConfigurationDescr
 }
 
 std::unique_ptr<HcalMahiPulseOffsetsGPU> HcalMahiPulseOffsetsGPUESProducer::produce(
-    HcalMahiPulseOffsetsGPURecord const&) {
+    JobConfigurationGPURecord const&) {
   return std::make_unique<HcalMahiPulseOffsetsGPU>(pset_);
 }
 
diff --git a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
index 5caeef3a96c2e..af705e1f8dd3a 100644
--- a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
+++ b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
@@ -1,6 +1,8 @@
 #ifndef RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
 #define RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
 
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
+
 #include "DeclsForKernels.h"
 
 namespace hcal {
@@ -143,12 +145,6 @@ namespace hcal {
       }
     }
 
-    // FIXME remove duplication...
-    // this is from RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h nvcc was complaining... if included that header...
-    constexpr int maxPSshapeBin = 256;
-    constexpr int nsPerBX = 25;
-    constexpr float iniTimeShift = 92.5f;
-
     // TODO: remove what's not needed
     __forceinline__ __device__ float compute_pulse_shape_value(float const pulse_time,
                                                                int const sample,
@@ -161,13 +157,14 @@ namespace hcal {
                                                                float const* diffVarItvlIdxZeroVec) {
       // constants
       constexpr float slew = 0.f;
-      constexpr auto ns_per_bx = nsPerBX;
+      constexpr auto ns_per_bx = hcal::constants::nsPerBX;
 
       // FIXME: clean up all the rounding... this is coming from original cpu version
-      float const i_start_float =
-          -iniTimeShift - pulse_time - slew > 0.f ? 0.f : std::abs(-iniTimeShift - pulse_time - slew) + 1.f;
+      float const i_start_float = -hcal::constants::iniTimeShift - pulse_time - slew > 0.f
+                                      ? 0.f
+                                      : std::abs(-hcal::constants::iniTimeShift - pulse_time - slew) + 1.f;
       int i_start = static_cast<int>(i_start_float);
-      float offset_start = static_cast<float>(i_start) - iniTimeShift - pulse_time - slew;
+      float offset_start = static_cast<float>(i_start) - hcal::constants::iniTimeShift - pulse_time - slew;
       // FIXME: do we need a check for nan???
 #ifdef HCAL_MAHI_GPUDEBUG
       if (shift == 0)
@@ -189,7 +186,7 @@ namespace hcal {
       auto const bin_start_up = static_cast<float>(bin_start) + 0.5f;
       int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start;
       int const its_start = i_start / ns_per_bx;
-      int const distTo25ns_start = nsPerBX - 1 - i_start % ns_per_bx;
+      int const distTo25ns_start = hcal::constants::nsPerBX - 1 - i_start % ns_per_bx;
       auto const factor = offset_start - static_cast<float>(bin_0_start) - 0.5;
 
 #ifdef HCAL_MAHI_GPUDEBUG
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index f6e9924d64916..d01d58b2c753b 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -468,15 +468,13 @@ namespace hcal {
               : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
                     offsetForHashes;
       auto const recoPulseShapeId = recoPulseShapeIds[hashedId];
-      auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * hcal::reconstruction::maxPSshapeBin;
-      auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * hcal::reconstruction::maxPSshapeBin;
-      auto const* accVarLenIdxMinusOneVec =
-          accVarLenIdxMinusOneVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
+      auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin;
+      auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin;
+      auto const* accVarLenIdxMinusOneVec = accVarLenIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
       auto const* diffVarItvlIdxMinusOneVec =
-          diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
-      auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
-      auto const* diffVarItvlIdxZeroVec =
-          diffVarItvlIdxZeroVecValues + recoPulseShapeId * hcal::reconstruction::nsPerBX;
+          diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
+      auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
+      auto const* diffVarItvlIdxZeroVec = diffVarItvlIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
 
       // offset output arrays
       auto* pulseMatrix = pulseMatrices + nsamples * npulses * gch;
@@ -718,7 +716,6 @@ namespace hcal {
       // can be relaxed if needed - minor updates are needed in that case!
       static_assert(NPULSES == NSAMPLES);
 
-
       // indices
       auto const gch = threadIdx.x + blockIdx.x * blockDim.x;
       auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
@@ -814,7 +811,6 @@ namespace hcal {
       }
 #endif
 
-
       int npassive = 0;
       float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
       for (int iter = 1; iter < nMaxItersMin; iter++) {
@@ -947,7 +943,7 @@ namespace hcal {
 
         // run fast nnls
         calo::multifit::fnnls(
-			      AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, nnlsThresh, nMaxItersNNLS, 10, 10);
+            AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, nnlsThresh, nMaxItersNNLS, 10, 10);
 
 #ifdef HCAL_MAHI_GPUDEBUG
         printf("result Amplitudes\n");
@@ -1065,8 +1061,8 @@ namespace hcal {
       */
     }
 
-  }
-}
+  }  // namespace mahi
+}  // namespace hcal
 
 namespace hcal {
   namespace reconstruction {
@@ -1225,7 +1221,7 @@ namespace hcal {
         uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
         uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;
         auto const nbytesShared = 2 * threadsPerBlock * calo::multifit::MapSymM<float, 8>::total * sizeof(float);
-	hcal::mahi::kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
+        hcal::mahi::kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
             outputGPU.recHits.energy.get(),
             outputGPU.recHits.chi2.get(),
             scratch.amplitudes.get(),
@@ -1266,5 +1262,5 @@ namespace hcal {
       }
     }
 
-  }  // namespace mahi
+  }  // namespace reconstruction
 }  // namespace hcal

From 0d0c22ab78a64e649061747c68d3166170e621cf Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 10 Nov 2020 23:14:53 +0100
Subject: [PATCH 27/34] Address more HCAL review comments (cms-patatrack#568)

Further simplify HCAL raw data template specialisations.
Clean up commented out code.
---
 .../HcalDigi/interface/DigiCollection.h       | 49 ++++++----------
 .../makeHcalRaw2DigiGpuValidationPlots.cpp    | 27 +--------
 .../HcalRawToDigi/plugins/DeclsForKernels.h   | 16 +-----
 .../HcalRawToDigi/plugins/DecodeGPU.cu        | 38 +++----------
 .../HcalRecProducers/src/DeclsForKernels.h    |  8 +--
 .../src/HcalESProducersGPUDefs.cc             | 56 +++++++++----------
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu |  8 +--
 7 files changed, 62 insertions(+), 140 deletions(-)

diff --git a/CUDADataFormats/HcalDigi/interface/DigiCollection.h b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
index af4f76b775ec1..e2f4bf0848e94 100644
--- a/CUDADataFormats/HcalDigi/interface/DigiCollection.h
+++ b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
@@ -7,44 +7,34 @@ namespace hcal {
 
   // FLAVOR_HE_QIE11 = 1; Phase1 upgrade
   struct Flavor1 {
-    using adc_type = uint8_t;
-    using tdc_type = uint8_t;
-    using soibit_type = uint8_t;
-
     static constexpr int WORDS_PER_SAMPLE = 1;
     static constexpr int SAMPLES_PER_WORD = 1;
     static constexpr int HEADER_WORDS = 1;
 
-    static constexpr adc_type adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
-    static constexpr tdc_type tdc(uint16_t const* const sample_start) { return (*sample_start >> 8) & 0x3f; }
-    static constexpr soibit_type soibit(uint16_t const* const sample_start) { return (*sample_start >> 14) & 0x1; }
+    static constexpr uint8_t adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
+    static constexpr uint8_t tdc(uint16_t const* const sample_start) { return (*sample_start >> 8) & 0x3f; }
+    static constexpr uint8_t soibit(uint16_t const* const sample_start) { return (*sample_start >> 14) & 0x1; }
   };
 
   // FLAVOR_HB_QIE11 = 3; Phase1 upgrade
   struct Flavor3 {
-    using adc_type = uint8_t;
-    using tdc_type = uint8_t;
-    using soibit_type = uint8_t;
-
     static constexpr int WORDS_PER_SAMPLE = 1;
     static constexpr int SAMPLES_PER_WORD = 1;
     static constexpr int HEADER_WORDS = 1;
 
-    static constexpr adc_type adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
-    static constexpr tdc_type tdc(uint16_t const* const sample_start) { return ((*sample_start >> 8) & 0x3); }
-    static constexpr soibit_type soibit(uint16_t const* const sample_start) { return ((*sample_start >> 14) & 0x1); }
+    static constexpr uint8_t adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
+    static constexpr uint8_t tdc(uint16_t const* const sample_start) { return ((*sample_start >> 8) & 0x3); }
+    static constexpr uint8_t soibit(uint16_t const* const sample_start) { return ((*sample_start >> 14) & 0x1); }
     static constexpr uint8_t capid(uint16_t const* const sample_start) { return ((*sample_start >> 10) & 0x3); }
   };
 
   // FLAVOR_HB_QIE10 = 5; Phase0
   struct Flavor5 {
-    using adc_type = uint8_t;
-
     static constexpr float WORDS_PER_SAMPLE = 0.5;
     static constexpr int SAMPLES_PER_WORD = 2;
     static constexpr int HEADER_WORDS = 1;
 
-    static constexpr adc_type adc(uint16_t const* const sample_start, uint8_t const shifter) {
+    static constexpr uint8_t adc(uint16_t const* const sample_start, uint8_t const shifter) {
       return ((*sample_start >> shifter * 8) & 0x7f);
     }
   };
@@ -61,22 +51,22 @@ namespace hcal {
   }
 
   template <typename Flavor>
-  constexpr typename Flavor::soibit_type soibit_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+  constexpr uint8_t soibit_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
     return Flavor::soibit(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
   }
 
   template <typename Flavor>
-  constexpr typename Flavor::adc_type adc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+  constexpr uint8_t adc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
     return Flavor::adc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
   }
 
   template <typename Flavor>
-  constexpr typename Flavor::tdc_type tdc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+  constexpr uint8_t tdc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
     return Flavor::tdc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
   }
 
   template <>
-  constexpr Flavor5::adc_type adc_for_sample<Flavor5>(uint16_t const* const dfstart, uint32_t const sample) {
+  constexpr uint8_t adc_for_sample<Flavor5>(uint16_t const* const dfstart, uint32_t const sample) {
     // avoid using WORDS_PER_SAMPLE and simply shift
     return Flavor5::adc(dfstart + Flavor5::HEADER_WORDS + (sample >> 1), sample % 2);
   }
@@ -88,12 +78,10 @@ namespace hcal {
 
   template <typename Flavor>
   constexpr uint32_t compute_nsamples(uint32_t const nwords) {
-    return (nwords - Flavor::HEADER_WORDS) / Flavor::WORDS_PER_SAMPLE;
-  }
-
-  template <>
-  constexpr uint32_t compute_nsamples<Flavor5>(uint32_t const nwords) {
-    return (nwords - Flavor5::HEADER_WORDS) * Flavor5::SAMPLES_PER_WORD;
+    if constexpr (Flavor::SAMPLES_PER_WORD >= 1)
+      return (nwords - Flavor::HEADER_WORDS) * Flavor::SAMPLES_PER_WORD;
+    else
+      return (nwords - Flavor::HEADER_WORDS) / Flavor::WORDS_PER_SAMPLE;
   }
 
   //
@@ -138,12 +126,7 @@ namespace hcal {
   template <typename StoragePolicy>
   struct DigiCollection<Flavor5, StoragePolicy> : public DigiCollectionBase<StoragePolicy> {
     DigiCollection() = default;
-    //DigiCollection(
-    //        uint32_t *ids, uint16_t *data, uint8_t *presamples,
-    //        uint32_t ndigis, uint32_t stride)
-    //    : DigiCollectionBase(ids, data, ndigis, stride)
-    //    , npresamples{npresamples}
-    //{}
+
     DigiCollection(DigiCollection const&) = default;
     DigiCollection& operator=(DigiCollection const&) = default;
 
diff --git a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
index 591da39b704f6..039c38dd9df16 100644
--- a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
+++ b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
@@ -7,12 +7,12 @@
 #include <TFile.h>
 #include <TH1D.h>
 #include <TH2D.h>
-#include <TTree.h>
 #include <TPaveStats.h>
+#include <TTree.h>
 
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 #include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
-#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
 
 #define CREATE_HIST_1D(varname, nbins, first, last) auto varname = new TH1D(#varname, #varname, nbins, first, last)
 
@@ -103,27 +103,6 @@ int main(int argc, char* argv[]) {
   CREATE_HIST_2D(hTDCf01HEGPUvsCPU, 64, 0, 64);
   CREATE_HIST_2D(hTDCf3HBGPUvsCPU, 4, 0, 4);
 
-  /*
-    auto hADCEBGPU = new TH1D("hADCEBGPU", "hADCEBGPU", nbins, 0, last);
-    auto hADCEBCPU = new TH1D("hADCEBCPU", "hADCEBCPU", nbins, 0, last);
-    auto hADCEEGPU = new TH1D("hADCEEGPU", "hADCEEGPU", nbins, 0, last);
-    auto hADCEECPU = new TH1D("hADCEECPU", "hADCEECPU", nbins, 0, last);
-
-    auto hGainEBGPU = new TH1D("hGainEBGPU", "hGainEBGPU", 4, 0, 4);
-    auto hGainEBCPU = new TH1D("hGainEBCPU", "hGainEBCPU", 4, 0, 4);
-    auto hGainEEGPU = new TH1D("hGainEEGPU", "hGainEEGPU", 4, 0, 4);
-    auto hGainEECPU = new TH1D("hGainEECPU", "hGainEECPU", 4, 0, 4);
-
-    auto hADCEBGPUvsCPU = new TH2D("hADCEBGPUvsCPU", "hADCEBGPUvsCPU",
-        nbins, 0, last, nbins, 0, last);
-    auto hADCEEGPUvsCPU = new TH2D("hADCEEGPUvsCPU", "hADCEEGPUvsCPU",
-        nbins, 0, last, nbins, 0, last);
-    auto hGainEBGPUvsCPU = new TH2D("hGainEBGPUvsCPU", "hGainEBGPUvsCPU",
-        4, 0, 4, 4, 0, 4);
-    auto hGainEEGPUvsCPU = new TH2D("hGainEEGPUvsCPU", "hGainEEGPUvsCPU",
-        4, 0, 4, 4, 0, 4);
-        */
-
   // prep input
   TFile rfin{inFileName.c_str()};
   TTree* rt = (TTree*)rfin.Get("Events");
@@ -313,7 +292,7 @@ int main(int argc, char* argv[]) {
           hADCf5HBGPUvsCPU->Fill(cpuadc, gpuadc);
 
           // the must for us at RAW Decoding stage
-          assert(static_cast<hcal::Flavor5::adc_type>(cpuadc) == gpuadc);
+          assert(static_cast<uint8_t>(cpuadc) == gpuadc);
           assert(static_cast<uint8_t>(cpucapid) == gpucapid);
         }
       }
diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
index 9f89e3807402c..9903b77efb341 100644
--- a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
+++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
@@ -5,7 +5,6 @@
 
 #include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
@@ -45,8 +44,8 @@ namespace hcal {
     };
 
     struct ScratchDataGPU {
-      // depends on tHE number of output collections
-      // that is a statically known predefined number!!!
+      // depends on the number of output collections
+      // that is a statically known predefined number
       cms::cuda::device::unique_ptr<uint32_t[]> pChannelsCounters;
     };
 
@@ -58,27 +57,16 @@ namespace hcal {
       void allocate(ConfigurationParameters const &config, cudaStream_t cudaStream) {
         digisF01HE.data = cms::cuda::make_device_unique<uint16_t[]>(
             config.maxChannelsF01HE * compute_stride<Flavor1>(config.nsamplesF01HE), cudaStream);
-        //cudaCheck(
-        //    cudaMalloc((void **)&digisF01HE.data,
-        //               config.maxChannelsF01HE * sizeof(uint16_t) * compute_stride<Flavor1>(config.nsamplesF01HE)));
         digisF01HE.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF01HE, cudaStream);
-        //cudaCheck(cudaMalloc((void **)&digisF01HE.ids, sizeof(uint32_t) * config.maxChannelsF01HE));
 
         digisF5HB.data = cms::cuda::make_device_unique<uint16_t[]>(
             config.maxChannelsF5HB * compute_stride<Flavor5>(config.nsamplesF5HB), cudaStream);
-        //cudaCheck(cudaMalloc((void **)&digisF5HB.data,
-        //                     config.maxChannelsF5HB * sizeof(uint16_t) * compute_stride<Flavor5>(config.nsamplesF5HB)));
         digisF5HB.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF5HB, cudaStream);
-        //cudaCheck(cudaMalloc((void **)&digisF5HB.ids, sizeof(uint32_t) * config.maxChannelsF5HB));
         digisF5HB.npresamples = cms::cuda::make_device_unique<uint8_t[]>(config.maxChannelsF5HB, cudaStream);
-        //cudaCheck(cudaMalloc((void **)&digisF5HB.npresamples, sizeof(uint8_t) * config.maxChannelsF5HB));
 
         digisF3HB.data = cms::cuda::make_device_unique<uint16_t[]>(
             config.maxChannelsF3HB * compute_stride<Flavor3>(config.nsamplesF3HB), cudaStream);
-        //cudaCheck(cudaMalloc((void **)&digisF3HB.data,
-        //                     config.maxChannelsF3HB * sizeof(uint16_t) * compute_stride<Flavor3>(config.nsamplesF3HB)));
         digisF3HB.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF3HB, cudaStream);
-        //cudaCheck(cudaMalloc((void **)&digisF3HB.ids, config.maxChannelsF3HB * sizeof(uint32_t)));
       }
     };
 
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
index 97b4c4f07f497..4f2ca85861b30 100644
--- a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
@@ -164,12 +164,10 @@ namespace hcal {
 
       // get to the payload
       auto const* payload64 = buffer + 2 + namc + amcoffset;
-      //amcoffset += amcSize;
 
 #ifdef HCAL_RAWDECODE_GPUDEBUG
       // uhtr header v1 1st 64 bits
       auto const payload64_w0 = payload64[0];
-      //uint32_t const data_length64 = payload64_w0 & 0xfffff;
 #endif
       // uhtr n bytes comes from amcSize, according to the cpu version!
       uint32_t const data_length64 = amcSize;
@@ -211,12 +209,11 @@ namespace hcal {
       // skip uhtr header words
       auto const channelDataSize = data_length64 - 2;        // 2 uhtr header v1 words
       auto const* channelDataBuffer64Start = payload64 + 2;  // 2 uhtr header v2 wds
-      //auto const* channelDataBuffer64End = channelDataBuffer64Start + channelDataSize;
       auto const* ptr = reinterpret_cast<uint16_t const*>(channelDataBuffer64Start);
       auto const* end = ptr + sizeof(uint64_t) / sizeof(uint16_t) * (channelDataSize - 1);
       auto const t_rank = thread_group.thread_rank();
 
-      // iterate thru the channel data
+      // iterate through the channel data
       while (ptr != end) {
         // this is the starting point for this thread group for this iteration
         // with respect to this pointer every thread will move forward afterwards
@@ -237,8 +234,7 @@ namespace hcal {
           if (is_channel_header_word(ptr))
             ++ptr;
           else {
-            // go to the next channel and do not consider this guy as a
-            // channel
+            // go to the next channel and do not consider this guy as a channel
             while (ptr != end)
               if (!is_channel_header_word(ptr))
                 ++ptr;
@@ -261,18 +257,7 @@ namespace hcal {
         printf("ptr - start_ptr = %d counter = %d rank = %d\n", static_cast<int>(ptr - start_ptr), counter, t_rank);
 #endif
 
-        // assume that if all is valid, ptr points
-        // to the header word of the channel to be decoded
-        // skip to the next channel header word if above assumption
-        // does not hold
-        //uint8_t const fw_lastbit = (*ptr >> 15) & 0x1;
-        //if (fw_lastbit != 1) {
-        //    ptr++;
-        //    continue;
-        //}
-
-        // when the end is near, channels will land outside of the [start_ptr, end)
-        // region
+        // when the end is near, channels will land outside of the [start_ptr, end) region
         if (ptr != end) {
           // for all of the flavors, these 2 guys have the same bit layout
           uint8_t const flavor = (ptr[0] >> 12) & 0x7;
@@ -469,9 +454,6 @@ namespace hcal {
               uint32_t const nwords = channel_end - channel_header_word;
 
               // filter out this digi if nwords does not equal expected
-              //uint32_t const expected_words =
-              //    nsamplesF5HB * Flavor5::WORDS_PER_SAMPLE +
-              //    Flavor5::HEADER_WORDS;
               auto const expected_words = compute_stride<Flavor5>(nsamplesF5HB);
               if (nwords != expected_words)
                 break;
@@ -497,16 +479,14 @@ namespace hcal {
               HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
               auto const did = DetId{eid2did[eid.linearIndex()]};
 
-              /*
-                if (eid.rawId() >= HcalElectronicsId::maxLinearIndex) {
+              /* uncomment to check the linear index validity
+              if (eid.rawId() >= HcalElectronicsId::maxLinearIndex) {
 #ifdef HCAL_RAWDECODE_GPUDEBUG
-                    printf("*** rawid = %u has no known det id***\n",
-                        eid.rawId());
+                  printf("*** rawid = %u has no known det id***\n", eid.rawId());
 #endif
-                    break;
-                }
-                */
-              //auto const did = DetId{eid2did[eid.rawId()]};
+                  break;
+              }
+              */
 
 #ifdef HCAL_RAWDECODE_GPUDEBUG
               printf("erawId = %u linearIndex = %u drawid = %u\n", eid.rawId(), eid.linearIndex(), did.rawId());
diff --git a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
index ede0150e15f0a..49929bf8fe59c 100644
--- a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
+++ b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
@@ -76,10 +76,6 @@ namespace hcal {
       float timeSigmaSiPM, timeSigmaHPD;
       float ts4Thresh;
 
-      std::vector<int> pulseOffsets;
-      // FIXME remove pulseOffsets - they come from esproduce now
-      //int* pulseOffsetsDevice = nullptr;
-
       std::array<uint32_t, 3> kernelMinimizeThreads;
 
       // FIXME:
@@ -113,7 +109,7 @@ namespace hcal {
       DigiCollection<Flavor3, ::calo::common::DevStoragePolicy> const& f3HBDigis;
     };
 
-  }  // namespace mahi
-}  // namespace reconstruction
+  }  // namespace reconstruction
+}  // namespace hcal
 
 #endif  // RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
index cc9341ca4bb97..22cd08e903701 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
@@ -1,44 +1,42 @@
-#include "HcalESProducerGPU.h"
-
-#include "CondFormats/HcalObjects/interface/HcalRecoParams.h"
-#include "CondFormats/HcalObjects/interface/HcalPedestals.h"
-#include "CondFormats/HcalObjects/interface/HcalGains.h"
-#include "CondFormats/HcalObjects/interface/HcalLUTCorrs.h"
-#include "CondFormats/HcalObjects/interface/HcalRespCorrs.h"
-#include "CondFormats/HcalObjects/interface/HcalTimeCorrs.h"
-#include "CondFormats/HcalObjects/interface/HcalPedestalWidths.h"
-#include "CondFormats/HcalObjects/interface/HcalGainWidths.h"
-#include "CondFormats/HcalObjects/interface/HcalQIEData.h"
-#include "CondFormats/HcalObjects/interface/HcalQIETypes.h"
-#include "CondFormats/HcalObjects/interface/HcalSiPMParameters.h"
-#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristics.h"
-
-#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalPedestalsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalPedestalWidthsRcd.h"
-#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalPedestalsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h"
 #include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h"
-#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
-
-#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h"
-#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalPedestalsGPU.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
+#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
+#include "CondFormats/HcalObjects/interface/HcalGainWidths.h"
+#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGains.h"
 #include "CondFormats/HcalObjects/interface/HcalGainsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalLUTCorrs.h"
 #include "CondFormats/HcalObjects/interface/HcalLUTCorrsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestalWidths.h"
 #include "CondFormats/HcalObjects/interface/HcalPedestalWidthsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestals.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestalsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalQIECodersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIEData.h"
+#include "CondFormats/HcalObjects/interface/HcalQIETypes.h"
 #include "CondFormats/HcalObjects/interface/HcalQIETypesGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRecoParams.h"
+#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRespCorrs.h"
+#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristics.h"
 #include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristicsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMParameters.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalTimeCorrs.h"
+#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+
+#include "HcalESProducerGPU.h"
 
 using HcalRecoParamsGPUESProducer = HcalESProducerGPU<HcalRecoParamsRcd, HcalRecoParamsGPU, HcalRecoParams>;
 
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index d01d58b2c753b..f113bb8354b66 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -2,8 +2,9 @@
 
 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
 
-// nvcc not able to parse this guy (whatever is inlcuded from it)....
-//#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h"
+// TODO reuse some of the HCAL constats from
+//#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
+// ?
 
 #include "SimpleAlgoGPU.h"
 #include "KernelHelpers.h"
@@ -737,9 +738,6 @@ namespace hcal {
       auto const id = gch < nchannelsf01HE
                           ? idsf01HE[gch]
                           : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
-      //auto const id = gch >= nchannelsf01HE
-      //    ? idsf5HB[gch - nchannelsf01HE]
-      //    : idsf01HE[gch];
       auto const did = DetId{id};
       auto const hashedId =
           did.subdetId() == HcalBarrel

From bc425a2a705f9bf87928ebe9aeeac05999269712 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 12 Nov 2020 23:25:26 +0100
Subject: [PATCH 28/34] Move common ESProducer templates to
 ConvertingESProducer(WithDependencies)T (cms-patatrack#569)

Move a few similar implementations of templated ESProducers
  - EcalESProducerGPU
  - EcalRawESProducerGPU
  - HcalESProducerGPU
  - HcalESProducerGPUWithDependencies
  - HcalRawESProducerGPU
to a common implementation under HeterogeneousCore/CUDACore/ .

Adapt all client code accordingly.

Do not use transient handles, so that ESProducers do not take references
to transient memory objects.
---
 .../plugins/HcalESProducerGPUDefs.cc          |  7 +-
 .../src/HcalESProducersGPUDefs.cc             | 98 +++++++++----------
 2 files changed, 51 insertions(+), 54 deletions(-)

diff --git a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
index aa601d6db06eb..749a98e990755 100644
--- a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
+++ b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
@@ -1,11 +1,10 @@
-#include <iostream>
-
 #include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h"
 
 #include "ElectronicsMappingGPU.h"
-#include "HcalRawESProducerGPU.h"
 
 using HcalElectronicsMappingGPUESProducer =
-    HcalRawESProducerGPU<hcal::raw::ElectronicsMappingGPU, HcalElectronicsMap, HcalElectronicsMapRcd>;
+    ConvertingESProducerT<HcalElectronicsMapRcd, hcal::raw::ElectronicsMappingGPU, HcalElectronicsMap>;
 
 DEFINE_FWK_EVENTSETUP_MODULE(HcalElectronicsMappingGPUESProducer);
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
index 22cd08e903701..f92b943c38d57 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
@@ -10,6 +10,11 @@
 #include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
 #include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
+#include "CondFormats/HcalObjects/interface/HcalCombinedRecordsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalGainWidths.h"
 #include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalGains.h"
@@ -34,39 +39,67 @@
 #include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalTimeCorrs.h"
 #include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h"
+#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h"
+#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h"
 #include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
 
-#include "HcalESProducerGPU.h"
-
-using HcalRecoParamsGPUESProducer = HcalESProducerGPU<HcalRecoParamsRcd, HcalRecoParamsGPU, HcalRecoParams>;
+using HcalRecoParamsGPUESProducer = ConvertingESProducerT<HcalRecoParamsRcd, HcalRecoParamsGPU, HcalRecoParams>;
 
 using HcalRecoParamsWithPulseShapesGPUESProducer =
-    HcalESProducerGPU<HcalRecoParamsRcd, HcalRecoParamsWithPulseShapesGPU, HcalRecoParams>;
+    ConvertingESProducerT<HcalRecoParamsRcd, HcalRecoParamsWithPulseShapesGPU, HcalRecoParams>;
 
-using HcalPedestalsGPUESProducer = HcalESProducerGPU<HcalPedestalsRcd, HcalPedestalsGPU, HcalPedestals>;
+using HcalPedestalsGPUESProducer = ConvertingESProducerT<HcalPedestalsRcd, HcalPedestalsGPU, HcalPedestals>;
 
-using HcalGainsGPUESProducer = HcalESProducerGPU<HcalGainsRcd, HcalGainsGPU, HcalGains>;
+using HcalGainsGPUESProducer = ConvertingESProducerT<HcalGainsRcd, HcalGainsGPU, HcalGains>;
 
-using HcalLUTCorrsGPUESProducer = HcalESProducerGPU<HcalLUTCorrsRcd, HcalLUTCorrsGPU, HcalLUTCorrs>;
+using HcalLUTCorrsGPUESProducer = ConvertingESProducerT<HcalLUTCorrsRcd, HcalLUTCorrsGPU, HcalLUTCorrs>;
 
-using HcalRespCorrsGPUESProducer = HcalESProducerGPU<HcalRespCorrsRcd, HcalRespCorrsGPU, HcalRespCorrs>;
+using HcalRespCorrsGPUESProducer = ConvertingESProducerT<HcalRespCorrsRcd, HcalRespCorrsGPU, HcalRespCorrs>;
 
-using HcalTimeCorrsGPUESProducer = HcalESProducerGPU<HcalTimeCorrsRcd, HcalTimeCorrsGPU, HcalTimeCorrs>;
+using HcalTimeCorrsGPUESProducer = ConvertingESProducerT<HcalTimeCorrsRcd, HcalTimeCorrsGPU, HcalTimeCorrs>;
 
 using HcalPedestalWidthsGPUESProducer =
-    HcalESProducerGPU<HcalPedestalWidthsRcd, HcalPedestalWidthsGPU, HcalPedestalWidths>;
+    ConvertingESProducerT<HcalPedestalWidthsRcd, HcalPedestalWidthsGPU, HcalPedestalWidths>;
 
-using HcalGainWidthsGPUESProducer = HcalESProducerGPU<HcalGainWidthsRcd, HcalGainWidthsGPU, HcalGainWidths>;
+using HcalGainWidthsGPUESProducer = ConvertingESProducerT<HcalGainWidthsRcd, HcalGainWidthsGPU, HcalGainWidths>;
 
-using HcalQIECodersGPUESProducer = HcalESProducerGPU<HcalQIEDataRcd, HcalQIECodersGPU, HcalQIEData>;
+using HcalQIECodersGPUESProducer = ConvertingESProducerT<HcalQIEDataRcd, HcalQIECodersGPU, HcalQIEData>;
 
-using HcalQIETypesGPUESProducer = HcalESProducerGPU<HcalQIETypesRcd, HcalQIETypesGPU, HcalQIETypes>;
+using HcalQIETypesGPUESProducer = ConvertingESProducerT<HcalQIETypesRcd, HcalQIETypesGPU, HcalQIETypes>;
 
 using HcalSiPMParametersGPUESProducer =
-    HcalESProducerGPU<HcalSiPMParametersRcd, HcalSiPMParametersGPU, HcalSiPMParameters>;
+    ConvertingESProducerT<HcalSiPMParametersRcd, HcalSiPMParametersGPU, HcalSiPMParameters>;
 
 using HcalSiPMCharacteristicsGPUESProducer =
-    HcalESProducerGPU<HcalSiPMCharacteristicsRcd, HcalSiPMCharacteristicsGPU, HcalSiPMCharacteristics>;
+    ConvertingESProducerT<HcalSiPMCharacteristicsRcd, HcalSiPMCharacteristicsGPU, HcalSiPMCharacteristics>;
+
+using HcalConvertedPedestalsGPUESProducer = ConvertingESProducerWithDependenciesT<HcalConvertedPedestalsRcd,
+                                                                                  HcalConvertedPedestalsGPU,
+                                                                                  HcalPedestals,
+                                                                                  HcalQIEData,
+                                                                                  HcalQIETypes>;
+
+using HcalConvertedEffectivePedestalsGPUESProducer =
+    ConvertingESProducerWithDependenciesT<HcalConvertedEffectivePedestalsRcd,
+                                          HcalConvertedEffectivePedestalsGPU,
+                                          HcalPedestals,
+                                          HcalQIEData,
+                                          HcalQIETypes>;
+
+using HcalConvertedPedestalWidthsGPUESProducer = ConvertingESProducerWithDependenciesT<HcalConvertedPedestalWidthsRcd,
+                                                                                       HcalConvertedPedestalWidthsGPU,
+                                                                                       HcalPedestals,
+                                                                                       HcalPedestalWidths,
+                                                                                       HcalQIEData,
+                                                                                       HcalQIETypes>;
+
+using HcalConvertedEffectivePedestalWidthsGPUESProducer =
+    ConvertingESProducerWithDependenciesT<HcalConvertedEffectivePedestalWidthsRcd,
+                                          HcalConvertedEffectivePedestalWidthsGPU,
+                                          HcalPedestals,
+                                          HcalPedestalWidths,
+                                          HcalQIEData,
+                                          HcalQIETypes>;
 
 DEFINE_FWK_EVENTSETUP_MODULE(HcalRecoParamsGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalRecoParamsWithPulseShapesGPUESProducer);
@@ -81,41 +114,6 @@ DEFINE_FWK_EVENTSETUP_MODULE(HcalQIECodersGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalQIETypesGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMParametersGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMCharacteristicsGPUESProducer);
-
-#include "CondFormats/HcalObjects/interface/HcalCombinedRecordsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalWidthsGPU.h"
-#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
-
-using HcalConvertedPedestalsGPUESProducer = HcalESProducerGPUWithDependencies<HcalConvertedPedestalsRcd,
-                                                                              HcalConvertedPedestalsGPU,
-                                                                              HcalPedestals,
-                                                                              HcalQIEData,
-                                                                              HcalQIETypes>;
-
-using HcalConvertedEffectivePedestalsGPUESProducer =
-    HcalESProducerGPUWithDependencies<HcalConvertedEffectivePedestalsRcd,
-                                      HcalConvertedEffectivePedestalsGPU,
-                                      HcalPedestals,
-                                      HcalQIEData,
-                                      HcalQIETypes>;
-
-using HcalConvertedPedestalWidthsGPUESProducer = HcalESProducerGPUWithDependencies<HcalConvertedPedestalWidthsRcd,
-                                                                                   HcalConvertedPedestalWidthsGPU,
-                                                                                   HcalPedestals,
-                                                                                   HcalPedestalWidths,
-                                                                                   HcalQIEData,
-                                                                                   HcalQIETypes>;
-
-using HcalConvertedEffectivePedestalWidthsGPUESProducer =
-    HcalESProducerGPUWithDependencies<HcalConvertedEffectivePedestalWidthsRcd,
-                                      HcalConvertedEffectivePedestalWidthsGPU,
-                                      HcalPedestals,
-                                      HcalPedestalWidths,
-                                      HcalQIEData,
-                                      HcalQIETypes>;
-
 DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedPedestalsGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedEffectivePedestalsGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedPedestalWidthsGPUESProducer);

From 831b90f6a56ad889a5af80d6c9f85f34a4836467 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sun, 15 Nov 2020 21:37:48 +0100
Subject: [PATCH 29/34] Apply clang-format style formatting

---
 .../src/HBHERecHitProducerGPU.cc              | 36 +++++++++----------
 .../src/HcalMahiPulseOffsetsGPUESProducer.cc  |  3 +-
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
index 5005fd383a0ed..e8a9901e63803 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -189,24 +189,24 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
 
   // bundle up conditions
   hcal::reconstruction::ConditionsProducts conditions{gainWidthsProduct,
-                                            gainsProduct,
-                                            lutCorrsProduct,
-                                            pedestalWidthsProduct,
-                                            effectivePedestalWidthsProduct,
-                                            pedestalsProduct,
-                                            qieCodersProduct,
-                                            recoParamsProduct,
-                                            respCorrsProduct,
-                                            timeCorrsProduct,
-                                            qieTypesProduct,
-                                            sipmParametersProduct,
-                                            sipmCharacteristicsProduct,
-                                            effectivePedestalsProduct,
-                                            topologyHandle.product(),
-                                            recConstantsHandle.product(),
-                                            pedestalsHandle->offsetForHashes(),
-                                            pulseOffsetsProduct,
-                                            pulseOffsetsHandle->getValues()};
+                                                      gainsProduct,
+                                                      lutCorrsProduct,
+                                                      pedestalWidthsProduct,
+                                                      effectivePedestalWidthsProduct,
+                                                      pedestalsProduct,
+                                                      qieCodersProduct,
+                                                      recoParamsProduct,
+                                                      respCorrsProduct,
+                                                      timeCorrsProduct,
+                                                      qieTypesProduct,
+                                                      sipmParametersProduct,
+                                                      sipmCharacteristicsProduct,
+                                                      effectivePedestalsProduct,
+                                                      topologyHandle.product(),
+                                                      recConstantsHandle.product(),
+                                                      pedestalsHandle->offsetForHashes(),
+                                                      pulseOffsetsProduct,
+                                                      pulseOffsetsHandle->getValues()};
 
   // scratch mem on device
   hcal::reconstruction::ScratchDataGPU scratchGPU = {
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
index 7acb8a95cc29c..0862e0a861d5d 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
@@ -51,8 +51,7 @@ void HcalMahiPulseOffsetsGPUESProducer::fillDescriptions(edm::ConfigurationDescr
   desc.addWithDefaultLabel(d);
 }
 
-std::unique_ptr<HcalMahiPulseOffsetsGPU> HcalMahiPulseOffsetsGPUESProducer::produce(
-    JobConfigurationGPURecord const&) {
+std::unique_ptr<HcalMahiPulseOffsetsGPU> HcalMahiPulseOffsetsGPUESProducer::produce(JobConfigurationGPURecord const&) {
   return std::make_unique<HcalMahiPulseOffsetsGPU>(pset_);
 }
 

From bbd4cca8d5a72a497e34c3a1d0d3705ec5234aac Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 9 Nov 2020 14:44:58 +0100
Subject: [PATCH 30/34] Refactor ECAL and HCAL chi2 code (cms-patatrack#567)

Factor out the chi2 computation from the ECAL multifit and HCAL MAHI code,
and move it to MultifitComputations.
---
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 180 +++++++-----------
 1 file changed, 67 insertions(+), 113 deletions(-)

diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index f113bb8354b66..540dab7ad27fd 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -2,6 +2,8 @@
 
 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
 
+// needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
+#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
 // TODO reuse some of the HCAL constats from
 //#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
 // ?
@@ -23,6 +25,46 @@ namespace hcal {
     constexpr double nnlsThresh = 1e-11;
     constexpr float deltaChi2Threashold = 1e-3;
 
+    // from RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc
+    __forceinline__ __device__ float get_raw_charge(double const charge,
+                                                    double const pedestal,
+                                                    float const* shrChargeMinusPedestal,
+                                                    float const* parLin1Values,
+                                                    float const* parLin2Values,
+                                                    float const* parLin3Values,
+                                                    int32_t const nsamplesForCompute,
+                                                    int32_t const soi,
+                                                    int const sipmQTSShift,
+                                                    int const sipmQNTStoSum,
+                                                    int const sipmType,
+                                                    float const fcByPE,
+                                                    bool const isqie11) {
+      float rawCharge;
+
+      if (!isqie11)
+        rawCharge = charge;
+      else {
+        auto const parLin1 = parLin1Values[sipmType - 1];
+        auto const parLin2 = parLin2Values[sipmType - 1];
+        auto const parLin3 = parLin3Values[sipmType - 1];
+
+        int const first = std::max(soi + sipmQTSShift, 0);
+        int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
+        float sipmq = 0.0f;
+        for (auto ts = first; ts < last; ts++)
+          sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesForCompute + ts];
+        auto const effectivePixelsFired = sipmq / fcByPE;
+        auto const factor =
+            hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
+        rawCharge = (charge - pedestal) * factor + pedestal;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
+#endif
+      }
+      return rawCharge;
+    }
+
     // Assume: same number of samples for HB and HE
     // TODO: add/validate restrict (will increase #registers in use by the kernel)
     __global__ void kernel_prep1d_sameNumberOfSamples(float* amplitudes,
@@ -133,7 +175,7 @@ namespace hcal {
                                                                         : compute_nsamples<Flavor3>(stride));
 
 #ifdef HCAL_MAHI_GPUDEBUG
-      assert(nsamples == nsamplesForCompute || nsamples - startingSample == nsampelsForCompute);
+      assert(nsamples == nsamplesForCompute || nsamples - startingSample == nsamplesForCompute);
 #endif
 
       auto const id = gch < nchannelsf01HE
@@ -230,46 +272,35 @@ namespace hcal {
       // NOTE: this branch will be divergent only for a single warp that
       // sits on the boundary when flavor 01 channels end and flavor 5 start
       //
-      float rawCharge;
-#ifdef COMPUTE_TDC_TIME
-      float tdcTime;
-#endif  // COMPUTE_TDC_TIME
+      float const rawCharge = get_raw_charge(charge,
+                                             pedestal,
+                                             shrChargeMinusPedestal,
+                                             parLin1Values,
+                                             parLin2Values,
+                                             parLin3Values,
+                                             nsamplesForCompute,
+                                             soi,
+                                             sipmQTSShift,
+                                             sipmQNTStoSum,
+                                             sipmType,
+                                             fcByPE,
+                                             gch < nchannelsf01HE || gch >= nchannelsf015);
+
       auto const dfc = hcal::reconstruction::compute_diff_charge_gain(
           qieType, adc, capid, qieOffsets, qieSlopes, gch < nchannelsf01HE || gch >= nchannelsf015);
-      if (gch >= nchannelsf01HE && gch < nchannelsf015) {
-        // flavor 5
-        rawCharge = charge;
+
 #ifdef COMPUTE_TDC_TIME
+      float tdcTime;
+      if (gch >= nchannelsf01HE && gch < nchannelsf015) {
         tdcTime = HcalSpecialTimes::UNKNOWN_T_NOTDC;
-#endif  // COMPUTE_TDC_TIME
       } else {
-        // flavor 0 or 1 or 3
-        // conditions needed for sipms
-        auto const parLin1 = parLin1Values[sipmType - 1];
-        auto const parLin2 = parLin2Values[sipmType - 1];
-        auto const parLin3 = parLin3Values[sipmType - 1];
-
-        int const first = std::max(soi + sipmQTSShift, 0);
-        int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
-        float sipmq = 0.0f;
-        for (auto ts = first; ts < last; ts++)
-          sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesForCompute + ts];
-        auto const effectivePixelsFired = sipmq / fcByPE;
-        auto const factor =
-            hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
-        rawCharge = (charge - pedestal) * factor + pedestal;
-#ifdef COMPUTE_TDC_TIME
         if (gch < nchannelsf01HE)
           tdcTime = HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor1>(dataf01HE + stride * gch, sample));
         else if (gch >= nchannelsf015)
           tdcTime =
               HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
-#endif  // COMPUTE_TDC_TIME
-
-#ifdef HCAL_MAHI_GPUDEBUG
-        printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
-#endif
       }
+#endif  // COMPUTE_TDC_TIME
 
       // compute method 0 quantities
       // TODO: need to apply containment
@@ -358,7 +389,7 @@ namespace hcal {
         // FIXME: KNOWN ISSUE: observed a problem when rawCharge and pedestal
         // are basically equal and generate -0.00000...
         // needs to be treated properly
-        if (!(shrEnergyM0TotalAccum[lch] > 0 && energym0_per_ts_gain0 >= ts4Thresh)) {
+        if (!(shrEnergyM0TotalAccum[lch] > 0 && energym0_per_ts_gain0 > ts4Thresh)) {
           // do not need to run mahi minimization
           //outputEnergy[gch] = 0; energy already inited to 0
           outputChi2[gch] = -9999.f;
@@ -378,8 +409,8 @@ namespace hcal {
       //
       auto const amplitude = rawCharge - pedestalToUseForMethod0;
       auto const noiseADC = (1. / std::sqrt(12)) * dfc;
-      auto const noisePhoto = amplitude > pedestalWidth ? std::sqrt(amplitude * fcByPE) : 0.f;
-      auto const noiseTerm = noiseADC * noiseADC + noisePhoto * noisePhoto + pedestalWidth * pedestalWidth;
+      auto const noisePhotoSq = amplitude > pedestalWidth ? (amplitude * fcByPE) : 0.f;
+      auto const noiseTerm = noiseADC * noiseADC + noisePhotoSq + pedestalWidth * pedestalWidth;
 
 #ifdef HCAL_MAHI_GPUDEBUG
       printf(
@@ -396,7 +427,7 @@ namespace hcal {
           sample,
           noiseADC,
           sample,
-          noisePhoto);
+          noisePhotoSq);
 #endif
 
       // store to global memory
@@ -640,7 +671,7 @@ namespace hcal {
           continue;
 
 #ifdef HCAL_MAHI_GPUDEBUG
-        printf("pulse cov array for ibx = %d and offset %d\n", ipulse, offset);
+        printf("pulse cov array for ibx = %d\n", ipulse);
 #endif
 
         // preload a column
@@ -949,84 +980,7 @@ namespace hcal {
           printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i));
 #endif
 
-        // replace pulseMatrixView * result - inputs
-        // NOTE:
-        float accum[NSAMPLES];
-        Eigen::Map<calo::multifit::ColumnVector<NSAMPLES>> mapAccum{accum};
-        {
-          float results[NPULSES];
-
-// preload results and permute according to the pulse offsets
-#pragma unroll
-          for (int counter = 0; counter < NPULSES; counter++) {
-            results[counter] = resultAmplitudesVector[counter];
-          }
-
-// load accum
-#pragma unroll
-          for (int counter = 0; counter < NSAMPLES; counter++)
-            accum[counter] = -inputAmplitudesView(counter);
-
-          // iterate
-          for (int icol = 0; icol < NPULSES; icol++) {
-            float pm_col[NSAMPLES];
-
-// preload a column of pulse matrix
-#pragma unroll
-            for (int counter = 0; counter < NSAMPLES; counter++)
-              pm_col[counter] = __ldg(&glbPulseMatrixView.coeffRef(counter, icol));
-
-// accum
-#pragma unroll
-            for (int counter = 0; counter < NSAMPLES; counter++)
-              accum[counter] += results[icol] * pm_col[counter];
-          }
-        }
-
-        // compute chi2 and check that there is no rotation
-        //chi2 = matrixDecomposition
-        //    .matrixL()
-        //    . solve(mapAccum)
-        //            .solve(pulseMatrixView * resultAmplitudesVector - inputAmplitudesView)
-        //    .squaredNorm();
-        {
-          float reg_b_tmp[NSAMPLES];
-          float reg_L[NSAMPLES];
-          float accumSum = 0;
-
-// preload a column and load column 0 of cholesky
-#pragma unroll
-          for (int i = 0; i < NSAMPLES; i++) {
-            reg_b_tmp[i] = mapAccum(i);
-            reg_L[i] = matrixL(i, 0);
-          }
-
-          // compute x0 and store it
-          auto x_prev = reg_b_tmp[0] / reg_L[0];
-          accumSum += x_prev * x_prev;
-
-// iterate
-#pragma unroll
-          for (int iL = 1; iL < NSAMPLES; iL++) {
-// update accum
-#pragma unroll
-            for (int counter = iL; counter < NSAMPLES; counter++)
-              reg_b_tmp[counter] -= x_prev * reg_L[counter];
-
-// load the next column of cholesky
-#pragma unroll
-            for (int counter = iL; counter < NSAMPLES; counter++)
-              reg_L[counter] = matrixL(counter, iL);
-
-            // compute the next x for M(iL, icol)
-            x_prev = reg_b_tmp[iL] / reg_L[iL];
-
-            // store the result value
-            accumSum += x_prev * x_prev;
-          }
-
-          chi2 = accumSum;
-        }
+        calo::multifit::calculateChiSq(matrixL, glbPulseMatrixView, resultAmplitudesVector, inputAmplitudesView, chi2);
 
         auto const deltaChi2 = std::abs(chi2 - previous_chi2);
         if (chi2 == chi2_2itersback && chi2 < previous_chi2)

From 7d16cb3b7e6ae27333f6f34d8b058b7583ea85d2 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 16 Nov 2020 11:40:11 +0100
Subject: [PATCH 31/34] Move the HCALGPUAnalyzer to
 RecoLocalCalo/HcalRecProducers/ (cms-patatrack#572)

Update the configuration to
  - read the input data from root files, that are available also offline
  - automatically pick up the global tag for the current release
---
 RecoLocalCalo/HcalRecProducers/BuildFile.xml  |  11 +-
 .../HcalRecProducers/src/HCALGPUAnalyzer.cc   | 312 ++++++++++++++++++
 .../test/make_GPUvsCPU_HCAL_plots.py          |  28 ++
 .../test/make_GPUvsCPU_HCAL_rechits.py        | 152 +++++++++
 4 files changed, 502 insertions(+), 1 deletion(-)
 create mode 100644 RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc
 create mode 100644 RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py
 create mode 100644 RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py

diff --git a/RecoLocalCalo/HcalRecProducers/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
index 524c2d64dd5d7..e34037ecbcbb5 100644
--- a/RecoLocalCalo/HcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
@@ -1,16 +1,25 @@
 <use name="boost"/>
 <use name="cuda"/>
-<use name="CUDADataFormats/Common" />
 <use name="CUDADataFormats/CaloCommon"/>
+<use name="CUDADataFormats/Common"/>
 <use name="CUDADataFormats/HcalDigi"/>
 <use name="CUDADataFormats/HcalRecHitSoA"/>
 <use name="CalibCalorimetry/HcalAlgos"/>
 <use name="CalibFormats/HcalObjects"/>
+<use name="CommonTools/UtilAlgos"/>
 <use name="DataFormats/Common"/>
+<use name="DataFormats/HcalDetId"/>
+<use name="DataFormats/HcalRecHit"/>
 <use name="FWCore/Framework"/>
+<use name="FWCore/ParameterSet"/>
+<use name="FWCore/PluginManager"/>
+<use name="FWCore/ServiceRegistry"/>
+<use name="Geometry/CaloGeometry"/>
 <use name="Geometry/HcalCommonData"/>
 <use name="Geometry/Records"/>
 <use name="HeterogeneousCore/CUDACore"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="RecoLocalCalo/HcalRecAlgos"/>
+<use name="SimCalorimetry/CaloSimAlgos"/>
+<use name="SimCalorimetry/HcalSimAlgos"/>
 <flags EDM_PLUGIN="1"/>
diff --git a/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc b/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc
new file mode 100644
index 0000000000000..89642d4a7a85d
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc
@@ -0,0 +1,312 @@
+// -*- C++ -*-
+//
+// Package:    RecoLocalCalo/HcalRecProducers
+// Class:      HCALGPUAnalyzer
+//
+/**\class HCALGPUAnalyzer HCALGPUAnalyzer.cc RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc
+
+ Description: Fills histograms comparing HBHE rechits from two collections (CPU vs GPU reconstruction)
+
+ Implementation:
+     Rechits are matched by HcalDetId; energy, raw (M0) energy and time are histogrammed via TFileService
+*/
+//
+// Original Author:  Mariarosaria D'Alfonso
+//         Created:  Mon, 17 Dec 2018 16:22:58 GMT
+//
+//
+
+// system include files
+#include <memory>
+#include <string>
+#include <map>
+#include <iostream>
+using namespace std;
+
+// user include files
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/one/EDAnalyzer.h"
+
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "CommonTools/UtilAlgos/interface/TFileService.h"
+
+#include "DataFormats/HcalRecHit/interface/HBHERecHit.h"
+#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h"
+#include "DataFormats/HcalDetId/interface/HcalDetId.h"
+
+#include "SimDataFormats/CaloHit/interface/PCaloHit.h"
+#include "SimDataFormats/CaloHit/interface/PCaloHitContainer.h"
+
+#include "SimCalorimetry/HcalSimAlgos/interface/HcalSimParameterMap.h"
+
+#include "TH2F.h"
+
+//
+// class declaration
+//
+
+class HCALGPUAnalyzer : public edm::one::EDAnalyzer<edm::one::SharedResources> {
+public:
+  explicit HCALGPUAnalyzer(const edm::ParameterSet &);
+  ~HCALGPUAnalyzer();
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+  virtual void beginJob() override;
+  virtual void analyze(const edm::Event &, const edm::EventSetup &) override;
+  virtual void endJob() override;
+
+  // ----------member data ---------------------------
+  //  void ClearVariables();
+
+  // some variables for storing information
+  double Method0Energy, Method0EnergyGPU;
+  double RecHitEnergy, RecHitEnergyGPU;
+  double RecHitTime, RecHitTimeGPU;
+  double iEta, iEtaGPU;
+  double iPhi, iPhiGPU;
+  int depth, depthGPU;
+
+  TH2F *hEnergy_2dMahi;
+  TH2F *hEnergy_2dM0;
+  TH2F *hTime_2dMahi;
+
+  TH2F *Unmatched;
+  TH2F *Matched;
+  TH1F *hEnergy_cpu;
+  TH1F *hEnergy_gpu;
+  TH1F *hEnergy_cpugpu;
+  TH1F *hEnergy_cpugpu_rel;
+  TH1F *hEnergyM0_cpu;
+  TH1F *hEnergyM0_gpu;
+  TH1F *hTime_cpu;
+  TH1F *hTime_gpu;
+
+  // create the output file
+  edm::Service<TFileService> FileService;
+  // create the token to retrieve hit information
+  edm::EDGetTokenT<HBHERecHitCollection> hRhToken;
+  edm::EDGetTokenT<HBHERecHitCollection> hRhTokenGPU;
+};
+
+//
+// constants, enums and typedefs
+//
+
+//
+// static data member definitions
+//
+
+//
+// constructors and destructor
+//
+HCALGPUAnalyzer::HCALGPUAnalyzer(const edm::ParameterSet &iConfig) {
+  usesResource("TFileService");
+  // each input collection has its own untracked parameter, so overriding one does not alias the other
+  hRhToken = consumes<HBHERecHitCollection>(iConfig.getUntrackedParameter<string>("HBHERecHits", "hbheprereco"));
+  hRhTokenGPU = consumes<HBHERecHitCollection>(
+      iConfig.getUntrackedParameter<string>("HBHERecHitsGPU", "hcalCPURecHitsProducer:recHitsLegacyHBHE"));
+
+  // 2D histograms filled once per matched rechit pair: CPU value on x, GPU value on y
+
+  hEnergy_2dM0 = FileService->make<TH2F>("hEnergy_2dM0", "hEnergy_2dM0", 1000, 0., 100., 1000, 0., 100.);
+  hEnergy_2dM0->GetXaxis()->SetTitle("Cpu M0 Energy");
+  hEnergy_2dM0->GetYaxis()->SetTitle("GPU M0 Energy");
+
+  hEnergy_2dMahi = FileService->make<TH2F>("hEnergy_2dMahi", "hEnergy_2dMahi", 1000, 0., 100., 1000, 0., 100.);
+  hEnergy_2dMahi->GetXaxis()->SetTitle("CPU Energy");
+  hEnergy_2dMahi->GetYaxis()->SetTitle("GPU Energy");
+
+  hTime_2dMahi = FileService->make<TH2F>("hTime_2dMahi", "hTime_2dMahi", 250, -12.5, 12.5, 250, -12.5, 12.5);
+  hTime_2dMahi->GetXaxis()->SetTitle("Mahi Time CPU");
+  hTime_2dMahi->GetYaxis()->SetTitle("Mahi Time GPU");
+
+  // 1D energy spectra, booked separately for the CPU and GPU collections
+
+  hEnergyM0_cpu = FileService->make<TH1F>("hEnergyM0_cpu", "hEnergyM0_cpu", 100, 0., 100.);
+  hEnergyM0_cpu->GetXaxis()->SetTitle("CPU Energy");
+
+  hEnergy_cpu = FileService->make<TH1F>("hEnergy_cpu", "hEnergy_cpu", 50, 0., 50.);
+  hEnergy_cpu->GetXaxis()->SetTitle("CPU Energy");
+
+  hEnergy_gpu = FileService->make<TH1F>("hEnergy_gpu", "hEnergy_gpu", 50, 0., 50.);
+  hEnergy_gpu->GetXaxis()->SetTitle("GPU Energy");
+
+  // 1D GPU - CPU energy differences, absolute and relative to the CPU energy
+
+  hEnergy_cpugpu = FileService->make<TH1F>("hEnergy_cpugpu", "hEnergy_cpugpu", 500, -2.5, 2.5);
+  hEnergy_cpugpu->GetXaxis()->SetTitle("GPU Energy - CPU Energy [GeV]");
+  hEnergy_cpugpu->GetYaxis()->SetTitle("# RecHits");
+
+  hEnergy_cpugpu_rel =
+      FileService->make<TH1F>("hEnergy_cpugpu_rel", "hEnergy_cpugpu_rel ( E > 0.005 GeV)", 500, -2.5, 2.5);
+  hEnergy_cpugpu_rel->GetXaxis()->SetTitle("(GPU Energy - CPU Energy) / CPU energy");
+  hEnergy_cpugpu_rel->GetYaxis()->SetTitle("# RecHits");
+
+  // 1D time spectra and (ieta, iphi) maps of matched / unmatched rechits
+
+  hTime_cpu = FileService->make<TH1F>("hTime_cpu", "hTime_cpu", 50, -25., 25.);
+  hTime_cpu->GetXaxis()->SetTitle("CPU Time");
+
+  hTime_gpu = FileService->make<TH1F>("hTime_gpu", "hTime_gpu", 50, -25., 25.);
+  hTime_gpu->GetXaxis()->SetTitle("GPU Time");
+
+  Unmatched = FileService->make<TH2F>("Unmatched", "Unmatched (eta,phi)", 100, -50., 50., 85, 0., 85.);
+  Matched = FileService->make<TH2F>("Matched", "Matched (eta,phi)", 100, -50., 50., 85, 0., 85.);
+
+  // NOTE: hEnergyM0_gpu is declared in the class but is not booked here
+}
+
+HCALGPUAnalyzer::~HCALGPUAnalyzer() {
+  // do anything here that needs to be done at destruction time
+  // (e.g. close files, deallocate resources etc.)
+}
+
+//
+// member functions
+//
+
+// ------------ method called for each event  ------------
+void HCALGPUAnalyzer::analyze(const edm::Event &iEvent, const edm::EventSetup &iSetup) {
+  using namespace edm;
+
+  // Read events
+  Handle<HBHERecHitCollection> hRecHits;
+  iEvent.getByToken(hRhToken, hRecHits);
+
+  Handle<HBHERecHitCollection> hRecHitsGPU;
+  iEvent.getByToken(hRhTokenGPU, hRecHitsGPU);
+
+  // Loop over all rechits in one event
+  for (int i = 0; i < (int)hRecHits->size(); i++) {
+    // get ID information for the reconstructed hit
+    HcalDetId detID_rh = (*hRecHits)[i].id().rawId();
+
+    // ID information can get us detector coordinates
+    depth = (*hRecHits)[i].id().depth();
+    iEta = detID_rh.ieta();
+    iPhi = detID_rh.iphi();
+
+    // get some variables
+    Method0Energy = (*hRecHits)[i].eraw();
+    RecHitEnergy = (*hRecHits)[i].energy();
+    RecHitTime = (*hRecHits)[i].time();
+
+    hEnergy_cpu->Fill(RecHitEnergy);
+    hTime_cpu->Fill(RecHitTime);
+
+    /*
+     cout << "Run " << i << ": ";
+     cout << "Method0Energy: " << Method0Energy;
+     cout << "RecHitEnergy: " << RecHitEnergy;
+     cout << "depth: " << depth;
+     cout << "iEta: " << iEta;
+     cout << "iPhi: " << iPhi;
+     cout << "RecHitTime" << RecHitTime;
+     */
+  }
+
+  for (int i = 0; i < (int)hRecHitsGPU->size(); i++) {
+    // get ID information for the reconstructed hit
+    HcalDetId detID_rh = (*hRecHitsGPU)[i].id().rawId();
+
+    // ID information can get us detector coordinates
+    depthGPU = (*hRecHitsGPU)[i].id().depth();
+    iEtaGPU = detID_rh.ieta();
+    iPhiGPU = detID_rh.iphi();
+
+    // get some variables
+    Method0EnergyGPU = (*hRecHitsGPU)[i].eraw();
+    RecHitEnergyGPU = (*hRecHitsGPU)[i].energy();
+    RecHitTimeGPU = (*hRecHitsGPU)[i].time();
+
+    hEnergy_gpu->Fill(RecHitEnergyGPU);
+    hTime_gpu->Fill(RecHitTimeGPU);
+
+    /*
+     cout << "Run " << i << ": ";
+     cout << "Method0Energy: " << Method0EnergyGPU;
+     cout << "RecHitEnergy: " << RecHitEnergyGPU;
+     cout << "depth: " << depthGPU;
+     cout << "iEta: " << iEtaGPU;
+     cout << "iPhi: " << iPhiGPU;
+     cout << "RecHitTime" << RecHitTimeGPU;
+     */
+  }
+
+  // Loop over all rechits in one event
+  for (int i = 0; i < (int)hRecHits->size(); i++) {
+    HcalDetId detID_rh = (*hRecHits)[i].id().rawId();
+
+    bool unmatched = true;
+    //     cout << "--------------------------------------------------------" << endl;
+
+    for (int j = 0; j < (int)hRecHitsGPU->size(); j++) {
+      HcalDetId detID_gpu = (*hRecHitsGPU)[j].id().rawId();
+
+      if ((detID_rh == detID_gpu)) {
+        /*
+	 cout << "Mtime(cpu)" << (*hRecHits)[i].time() << endl; 
+	 cout << "     Mtime(gpu)" << (*hRecHitsGPU)[j].time() << endl;
+
+	 cout << "M0E(cpu)" << (*hRecHits)[i].eraw() << endl; 
+	 cout << "     M0E(gpu)" << (*hRecHitsGPU)[j].eraw() << endl;
+	 */
+
+        auto relValue = ((*hRecHitsGPU)[j].energy() - (*hRecHits)[i].energy()) / (*hRecHits)[i].energy();
+
+        hEnergy_2dM0->Fill((*hRecHits)[i].eraw(), (*hRecHitsGPU)[j].eraw());
+        hEnergy_2dMahi->Fill((*hRecHits)[i].energy(), (*hRecHitsGPU)[j].energy());
+        hEnergy_cpugpu->Fill((*hRecHitsGPU)[j].energy() - (*hRecHits)[i].energy());
+        if ((*hRecHits)[i].energy() > 0.005)
+          hEnergy_cpugpu_rel->Fill(relValue);
+        hTime_2dMahi->Fill((*hRecHits)[i].time(), (*hRecHitsGPU)[j].time());
+
+        /*
+	 if((relValue < - 0.9) and ((*hRecHits)[i].energy()>0.005)) {
+	   cout << "----------------------------------"<< endl;
+	   cout << " detID = " << detID_rh.rawId() << endl;
+	   cout << "ME(cpu)" << (*hRecHits)[i].energy() << endl; 
+	   cout << "     ME(gpu)" << (*hRecHitsGPU)[j].energy() << endl;
+	 }
+	 */
+
+        Matched->Fill(detID_rh.ieta(), detID_rh.iphi());
+
+        unmatched = false;
+      }
+    }
+
+    ///
+
+    if (unmatched) {
+      Unmatched->Fill(detID_rh.ieta(), detID_rh.iphi());
+      //       cout << "   recHit not matched ="  << detID_rh << "  E(raw)=" << (*hRecHits)[i].eraw() << " E=" << (*hRecHits)[i].energy() << endl;
+    }
+  }
+}
+
+// ------------ method called once each job just before starting event loop  ------------
+void HCALGPUAnalyzer::beginJob() {}
+
+// ------------ method called once each job just after ending the event loop  ------------
+void HCALGPUAnalyzer::endJob() {}
+
+// ------------ method fills 'descriptions' with the allowed parameters for the module  ------------
+void HCALGPUAnalyzer::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+  //The following says we do not know what parameters are allowed so do no validation
+  // Please change this to state exactly what you do use, even if it is no parameters
+  edm::ParameterSetDescription desc;
+  desc.setUnknown();
+  descriptions.addDefault(desc);
+}
+
+//define this as a plug-in
+DEFINE_FWK_MODULE(HCALGPUAnalyzer);
diff --git a/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py
new file mode 100644
index 0000000000000..2b97efc2f2d8c
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py
@@ -0,0 +1,28 @@
+import FWCore.ParameterSet.Config as cms
+
+process = cms.Process("PLOT")
+
+process.load("FWCore.MessageService.MessageLogger_cfi")
+process.options = cms.untracked.PSet(
+    wantSummary = cms.untracked.bool(False)
+)
+
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load("Configuration.StandardSequences.FrontierConditions_GlobalTag_cff")
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:run2_hlt_relval', '')
+
+process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(-1) )
+process.MessageLogger.cerr.FwkReport.reportEvery = 500
+
+process.source = cms.Source("PoolSource",
+    fileNames = cms.untracked.vstring('file:GPUvsCPU_HCAL_rechits.root')
+)
+
+process.comparisonPlots = cms.EDAnalyzer('HCALGPUAnalyzer')
+
+process.TFileService = cms.Service('TFileService',
+    fileName = cms.string('GPUvsCPU_HCAL_plots.root')
+)
+
+process.path = cms.Path(process.comparisonPlots)
diff --git a/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py
new file mode 100644
index 0000000000000..32d4104a842ef
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py
@@ -0,0 +1,152 @@
+import FWCore.ParameterSet.Config as cms
+
+from Configuration.StandardSequences.Eras import eras
+#from Configuration.ProcessModifiers.gpu_cff import gpu
+
+process = cms.Process('RECOgpu', eras.Run2_2018)
+
+# import of standard configurations
+process.load('Configuration.StandardSequences.Services_cff')
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
+
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:run2_hlt_relval', '')
+
+process.maxEvents = cms.untracked.PSet(
+    input = cms.untracked.int32(1000)
+)
+
+#-----------------------------------------
+# INPUT
+#-----------------------------------------
+
+process.source = cms.Source("PoolSource",
+  fileNames = cms.untracked.vstring('/store/data/Run2018D/EphemeralHLTPhysics1/RAW/v1/000/323/775/00000/A27DFA33-8FCB-BE42-A2D2-1A396EEE2B6E.root')
+)
+
+process.hltGetRaw = cms.EDAnalyzer( "HLTGetRaw",
+    RawDataCollection = cms.InputTag( "rawDataCollector" )
+)
+
+process.input = cms.Path( process.hltGetRaw )
+
+#-----------------------------------------
+# CMSSW/Hcal non-DQM Related Module import
+#-----------------------------------------
+
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff")
+process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi")
+process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi")
+
+process.hcalDigis.InputLabel = cms.InputTag("rawDataCollector")
+
+#-----------------------------------------
+# CMSSW/Hcal GPU related files
+#-----------------------------------------
+
+process.load("RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPUTask_cff")
+process.load("RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi")
+process.hcalCPURecHitsProducer.recHitsM0LabelIn = cms.InputTag("hbheRecHitProducerGPU","")
+process.hcalCPURecHitsProducer.recHitsM0LabelOut = cms.string("")
+
+#-----------------------------------------
+# Temporary customization (things not implemented on the GPU)
+#-----------------------------------------
+
+## the one below is taken directly from the DB, regard M0
+#process.hbheprereco.algorithm.correctForPhaseContainment = cms.bool(False)
+
+## do always 8 pulse
+process.hbheprereco.algorithm.chiSqSwitch = cms.double(-1)
+
+## to match hard coded setting (will be fixed on CPU)
+process.hbheprereco.algorithm.nMaxItersMin = cms.int32(50)
+
+#-----------------------------------------
+# Final Customization for Run3
+#-----------------------------------------
+
+# we will not run arrival Time at HLT
+process.hbheprereco.algorithm.calculateArrivalTime = cms.bool(False)
+
+## we do not need this
+process.hbheprereco.algorithm.applyLegacyHBMCorrection = cms.bool(False)
+
+# we only run Mahi at HLT
+process.hbheprereco.algorithm.useM3 = cms.bool(False)
+
+# we will not have the HPD noise flags in Run3, as everything will be SiPM
+process.hbheprereco.setLegacyFlagsQIE8 = cms.bool(False)
+process.hbheprereco.setNegativeFlagsQIE8 = cms.bool(False)
+process.hbheprereco.setNoiseFlagsQIE8 = cms.bool(False)
+process.hbheprereco.setPulseShapeFlagsQIE8 = cms.bool(False)
+
+# for testing M0 only
+##process.hbheprereco.algorithm.useMahi = cms.bool(False)
+
+#-----------------------------------------
+# OUTPUT
+#-----------------------------------------
+
+#process.out = cms.OutputModule("AsciiOutputModule",
+#    outputCommands = cms.untracked.vstring(
+#        'keep *_*_*_*', 
+#    ),
+#    verbosity = cms.untracked.uint32(0)
+#)
+
+process.out = cms.OutputModule("PoolOutputModule",
+    fileName = cms.untracked.string("GPUvsCPU_HCAL_rechits.root")
+)
+
+#---------------
+
+process.finalize = cms.EndPath(process.out)
+
+process.bunchSpacing = cms.Path(
+    process.bunchSpacingProducer
+)
+
+#-----------------------------------------
+# gpu test
+#-----------------------------------------
+
+process.digiPathCPU = cms.Path(
+    process.hcalDigis 
+)
+
+process.recoPathCPU = cms.Path(
+     process.hbheprereco
+)
+
+#---------------
+
+## hcalCPUDigisProducer <-- this converts the GPU digis to CPU (for DQM)
+process.recoPathGPU = cms.Path(
+    process.hbheRecHitProducerGPUSequence
+    * process.hcalCPURecHitsProducer
+)
+
+#---------------
+
+process.schedule = cms.Schedule(
+    process.input,
+    process.digiPathCPU,
+    process.recoPathCPU,
+    process.recoPathGPU,
+    process.finalize
+)
+
+process.options = cms.untracked.PSet(
+    numberOfThreads = cms.untracked.uint32(8),
+    numberOfStreams = cms.untracked.uint32(8),
+    SkipEvent = cms.untracked.vstring('ProductNotFound'),
+    wantSummary = cms.untracked.bool(True)
+)
+
+# report CUDAService messages
+process.MessageLogger.cerr.FwkReport.reportEvery = 100
+process.MessageLogger.categories.append("CUDAService")

From 3740b04b1b1dc9be437a7abe571227d0a5652685 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 23 Nov 2020 00:03:41 +0100
Subject: [PATCH 32/34] Synchronise GPU code with CPU updates
 (cms-patatrack#576)

Update GPU code following #32146.
---
 .../src/HcalRecoParamsWithPulseShapesGPU.cc   | 28 +++++++++----------
 .../HcalRecProducers/src/KernelHelpers.h      |  1 +
 RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu | 19 ++++++++-----
 3 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
index 804ea328c74d4..b42621b98908e 100644
--- a/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
+++ b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
@@ -55,20 +55,20 @@ HcalRecoParamsWithPulseShapesGPU::HcalRecoParamsWithPulseShapesGPU(HcalRecoParam
 
       // precompute and get values from the functor
       auto const& pulseShape = pulseShapes.getShape(pulseShapeId);
-      FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, 10};
+      FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, hcal::constants::maxSamples};
       auto const offset256 = newId * hcal::constants::maxPSshapeBin;
       auto const offset25 = newId * hcal::constants::nsPerBX;
       auto const numShapes = newId;
       for (int i = 0; i < hcal::constants::maxPSshapeBin; i++) {
-        acc25nsVec_[offset256 * numShapes + i] = functor.get_acc25nsVec()[i];
-        diff25nsItvlVec_[offset256 * numShapes + i] = functor.get_diff25nsItvlVec()[i];
+        acc25nsVec_[offset256 * numShapes + i] = functor.acc25nsVec()[i];
+        diff25nsItvlVec_[offset256 * numShapes + i] = functor.diff25nsItvlVec()[i];
       }
 
       for (int i = 0; i < hcal::constants::nsPerBX; i++) {
-        accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.get_accVarLenIdxMinusOneVec()[i];
-        diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.get_diffVarItvlIdxMinusOneVec()[i];
-        accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.get_accVarLenIdxZEROVec()[i];
-        diffVarItvlIdxZEROVec_[offset25 * numShapes + i] = functor.get_diffVarItvlIdxZEROVec()[i];
+        accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.accVarLenIdxMinusOneVec()[i];
+        diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxMinusOneVec()[i];
+        accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.accVarLenIdxZEROVec()[i];
+        diffVarItvlIdxZEROVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxZEROVec()[i];
       }
     } else {
       // already recorded this pulse shape, just set id
@@ -113,20 +113,20 @@ HcalRecoParamsWithPulseShapesGPU::HcalRecoParamsWithPulseShapesGPU(HcalRecoParam
 
       // precompute and get values from the functor
       auto const& pulseShape = pulseShapes.getShape(pulseShapeId);
-      FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, 10};
+      FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, hcal::constants::maxSamples};
       auto const offset256 = newId * hcal::constants::maxPSshapeBin;
       auto const offset25 = newId * hcal::constants::nsPerBX;
       auto const numShapes = newId;
       for (int i = 0; i < hcal::constants::maxPSshapeBin; i++) {
-        acc25nsVec_[offset256 * numShapes + i] = functor.get_acc25nsVec()[i];
-        diff25nsItvlVec_[offset256 * numShapes + i] = functor.get_diff25nsItvlVec()[i];
+        acc25nsVec_[offset256 * numShapes + i] = functor.acc25nsVec()[i];
+        diff25nsItvlVec_[offset256 * numShapes + i] = functor.diff25nsItvlVec()[i];
       }
 
       for (int i = 0; i < hcal::constants::nsPerBX; i++) {
-        accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.get_accVarLenIdxMinusOneVec()[i];
-        diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.get_diffVarItvlIdxMinusOneVec()[i];
-        accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.get_accVarLenIdxZEROVec()[i];
-        diffVarItvlIdxZEROVec_[offset25 * numShapes + i] = functor.get_diffVarItvlIdxZEROVec()[i];
+        accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.accVarLenIdxMinusOneVec()[i];
+        diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxMinusOneVec()[i];
+        accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.accVarLenIdxZEROVec()[i];
+        diffVarItvlIdxZEROVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxZEROVec()[i];
       }
     } else {
       // already recorded this pulse shape, just set id
diff --git a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
index af705e1f8dd3a..ade221b2c4870 100644
--- a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
+++ b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
@@ -146,6 +146,7 @@ namespace hcal {
     }
 
     // TODO: remove what's not needed
+    // originally from RecoLocalCalo/HcalRecAlgos/src/PulseShapeFunctor.cc
     __forceinline__ __device__ float compute_pulse_shape_value(float const pulse_time,
                                                                int const sample,
                                                                int const shift,
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index 540dab7ad27fd..a1fc79b41eca6 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -568,7 +568,7 @@ namespace hcal {
       }
 
       if (sample == 0 && ipulse == 0) {
-        for (int i = 0; i < 10; i++) {
+        for (int i = 0; i < hcal::constants::maxSamples; i++) {
           auto const value = hcal::reconstruction::compute_pulse_shape_value(t0,
                                                                              i,
                                                                              0,
@@ -581,7 +581,7 @@ namespace hcal {
           printf("pulse(%d) = %f\n", i, value);
         }
         printf("\n");
-        for (int i = 0; i < 10; i++) {
+        for (int i = 0; i < hcal::constants::maxSamples; i++) {
           auto const value = hcal::reconstruction::compute_pulse_shape_value(t0p,
                                                                              i,
                                                                              0,
@@ -594,7 +594,7 @@ namespace hcal {
           printf("pulseP(%d) = %f\n", i, value);
         }
         printf("\n");
-        for (int i = 0; i < 10; i++) {
+        for (int i = 0; i < hcal::constants::maxSamples; i++) {
           auto const value = hcal::reconstruction::compute_pulse_shape_value(t0m,
                                                                              i,
                                                                              0,
@@ -651,10 +651,15 @@ namespace hcal {
                                  : 0;
 
       // store to global
-      pulseMatrix[ipulse * nsamples + sample] = value;
-      ;
-      pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
-      pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
+      if (amplitude > 0.f) {
+        pulseMatrix[ipulse * nsamples + sample] = value;
+        pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
+        pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
+      } else {
+        pulseMatrix[ipulse * nsamples + sample] = 0.f;
+        pulseMatrixM[ipulse * nsamples + sample] = 0.f;
+        pulseMatrixP[ipulse * nsamples + sample] = 0.f;
+      }
     }
 
     template <int NSAMPLES, int NPULSES>

From 80696645e304cedafd6f34af78ff6b61370e2bd7 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 24 Nov 2020 18:10:07 +0100
Subject: [PATCH 33/34] Update GPU HCAL conditions framework
 (cms-patatrack#574)

Remove duplicate GPU-related HCAL conditions records, and simplify the package dependencies by moving the remaining records to CondFormats/DataRecord.

Improve the handling of the GPU conditions payloads:
  - use cms::cuda::device::unique_ptr to automatically deallocate the memory;
  - use edm::propagate_const_array to ensure that the conditions data are not accidentally modified by client code;
  - use cms::cuda::copyAsync(...) to simplify the copy of the conditions to the device.
---
 RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h        | 2 +-
 RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc | 4 ++--
 .../HcalRecProducers/src/HcalESProducersGPUDefs.cc          | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
index 49929bf8fe59c..807c42b057fa3 100644
--- a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
+++ b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
@@ -7,6 +7,7 @@
 #include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
 #include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
 #include "CalibCalorimetry/HcalAlgos/interface/HcalTimeSlew.h"
+#include "CondFormats/DataRecord/interface/HcalCombinedRecordsGPU.h"
 #include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
@@ -17,7 +18,6 @@
 #include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
 #include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
-#include "CondFormats/HcalObjects/interface/HcalCombinedRecordsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
index e8a9901e63803..af5398e49fa8f 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -139,7 +139,7 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
   // use only 1 depending on useEffectivePedestals
   edm::ESHandle<HcalConvertedPedestalWidthsGPU> pedestalWidthsHandle;
   edm::ESHandle<HcalConvertedEffectivePedestalWidthsGPU> effectivePedestalWidthsHandle;
-  setup.get<HcalConvertedEffectivePedestalWidthsRcd>().get(effectivePedestalWidthsHandle);
+  setup.get<HcalConvertedPedestalWidthsRcd>().get(effectivePedestalWidthsHandle);
   setup.get<HcalConvertedPedestalWidthsRcd>().get(pedestalWidthsHandle);
   auto const& pedestalWidthsProduct = pedestalWidthsHandle->getProduct(ctx.stream());
   auto const& effectivePedestalWidthsProduct = effectivePedestalWidthsHandle->getProduct(ctx.stream());
@@ -150,7 +150,7 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
 
   edm::ESHandle<HcalConvertedEffectivePedestalsGPU> effectivePedestalsHandle;
   if (configParameters_.useEffectivePedestals)
-    setup.get<HcalConvertedEffectivePedestalsRcd>().get(effectivePedestalsHandle);
+    setup.get<HcalConvertedPedestalsRcd>().get(effectivePedestalsHandle);
   auto const* effectivePedestalsProduct =
       configParameters_.useEffectivePedestals ? &effectivePedestalsHandle->getProduct(ctx.stream()) : nullptr;
 
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
index f92b943c38d57..2fc6cc0d19002 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
@@ -1,3 +1,4 @@
+#include "CondFormats/DataRecord/interface/HcalCombinedRecordsGPU.h"
 #include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
@@ -10,7 +11,6 @@
 #include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
 #include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
 #include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
-#include "CondFormats/HcalObjects/interface/HcalCombinedRecordsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h"
 #include "CondFormats/HcalObjects/interface/HcalConvertedPedestalWidthsGPU.h"
@@ -80,7 +80,7 @@ using HcalConvertedPedestalsGPUESProducer = ConvertingESProducerWithDependencies
                                                                                   HcalQIETypes>;
 
 using HcalConvertedEffectivePedestalsGPUESProducer =
-    ConvertingESProducerWithDependenciesT<HcalConvertedEffectivePedestalsRcd,
+    ConvertingESProducerWithDependenciesT<HcalConvertedPedestalsRcd,
                                           HcalConvertedEffectivePedestalsGPU,
                                           HcalPedestals,
                                           HcalQIEData,
@@ -94,7 +94,7 @@ using HcalConvertedPedestalWidthsGPUESProducer = ConvertingESProducerWithDepende
                                                                                        HcalQIETypes>;
 
 using HcalConvertedEffectivePedestalWidthsGPUESProducer =
-    ConvertingESProducerWithDependenciesT<HcalConvertedEffectivePedestalWidthsRcd,
+    ConvertingESProducerWithDependenciesT<HcalConvertedPedestalWidthsRcd,
                                           HcalConvertedEffectivePedestalWidthsGPU,
                                           HcalPedestals,
                                           HcalPedestalWidths,

From 8e6a6a3788aa588e219b9285bc6a66f41bd255ae Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 26 Nov 2020 00:09:12 +0100
Subject: [PATCH 34/34] Apply code formatting

---
 .../HcalRecProducers/src/HCALGPUAnalyzer.cc         | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc b/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc
index 89642d4a7a85d..ba3c9de696c47 100644
--- a/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc
+++ b/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc
@@ -54,14 +54,14 @@ using namespace std;
 class HCALGPUAnalyzer : public edm::one::EDAnalyzer<edm::one::SharedResources> {
 public:
   explicit HCALGPUAnalyzer(const edm::ParameterSet &);
-  ~HCALGPUAnalyzer();
+  ~HCALGPUAnalyzer() override = default;
 
   static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
 
 private:
-  virtual void beginJob() override;
-  virtual void analyze(const edm::Event &, const edm::EventSetup &) override;
-  virtual void endJob() override;
+  void beginJob() override;
+  void analyze(const edm::Event &, const edm::EventSetup &) override;
+  void endJob() override;
 
   // ----------member data ---------------------------
   //  void ClearVariables();
@@ -164,11 +164,6 @@ HCALGPUAnalyzer::HCALGPUAnalyzer(const edm::ParameterSet &iConfig) {
   //now do what ever initialization is needed
 }
 
-HCALGPUAnalyzer::~HCALGPUAnalyzer() {
-  // do anything here that needs to be done at desctruction time
-  // (e.g. close files, deallocate resources etc.)
-}
-
 //
 // member functions
 //