From 34102eeeacecf7d6ec1a67e2852cc10c4dbcc5cb Mon Sep 17 00:00:00 2001 From: adriano Date: Thu, 18 Sep 2025 08:16:00 +0200 Subject: [PATCH 1/2] Better handling of shared memory for pixel clustering and digi morphing --- .../interface/SimplePixelTopology.h | 8 +++++ .../plugins/alpaka/PixelClustering.h | 36 ++++++++++++------- .../plugins/alpaka/SiPixelRawToCluster.cc | 9 ++++- .../alpaka/SiPixelRawToClusterKernel.dev.cc | 17 ++++----- 4 files changed, 49 insertions(+), 21 deletions(-) diff --git a/Geometry/CommonTopologies/interface/SimplePixelTopology.h b/Geometry/CommonTopologies/interface/SimplePixelTopology.h index d775302a993f5..5737577045c8b 100644 --- a/Geometry/CommonTopologies/interface/SimplePixelTopology.h +++ b/Geometry/CommonTopologies/interface/SimplePixelTopology.h @@ -377,6 +377,9 @@ namespace pixelTopology { static constexpr uint16_t last_barrel_detIndex = 864; static constexpr uint32_t maxPixInModule = 6000; + static constexpr uint32_t maxPixInModuleForMorphing = maxPixInModule; + static constexpr uint32_t maxIterClustering = 16; + static constexpr uint32_t maxNumClustersPerModules = phase2PixelTopology::maxNumClustersPerModules; static constexpr uint32_t maxHitsInModule = phase2PixelTopology::maxNumClustersPerModules; @@ -471,6 +474,9 @@ namespace pixelTopology { static constexpr uint16_t last_barrel_detIndex = 1184; static constexpr uint32_t maxPixInModule = 6000; + static constexpr uint32_t maxPixInModuleForMorphing = maxPixInModule * 7/5; + static constexpr uint32_t maxIterClustering = 24; + static constexpr uint32_t maxNumClustersPerModules = phase1PixelTopology::maxNumClustersPerModules; static constexpr uint32_t maxHitsInModule = phase1PixelTopology::maxNumClustersPerModules; @@ -582,6 +588,8 @@ namespace pixelTopology { static constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; static constexpr uint32_t maxPixInModule = 10000; + static constexpr uint32_t maxPixInModuleForMorphing = maxPixInModule * 11/10; + static constexpr uint32_t maxIterClustering = 32; static constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 4; // TODO need to think a better way to avoid this duplication diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h index 33d062bad4feb..abd350da97a6f 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h @@ -17,7 +17,7 @@ #include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/warpsize.h" -//#define GPU_DEBUG +// #define GPU_DEBUG // TODO move to HeterogeneousCore/AlpakaInterface or upstream to alpaka template @@ -174,12 +174,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { template struct FindClus { // assume that we can cover the whole module with up to 16 blockDimension-wide iterations - static constexpr uint32_t maxIterGPU = 16; - - // this must be larger than maxPixInModule / maxIterGPU, and should be a multiple of the warp size + + // this must be larger than maxPixInModule / maxIterClustering, and should be a multiple of the warp size static constexpr uint32_t maxElementsPerBlock = - cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / maxIterGPU, 128); - + cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / TrackerTraits::maxIterClustering, 64); + static constexpr uint32_t maxElementsPerBlockMorph = + cms::alpakatools::round_up_by(TrackerTraits::maxPixInModuleForMorphing / TrackerTraits::maxIterClustering, 64); + static_assert(maxElementsPerBlockMorph>=maxElementsPerBlock); + ALPAKA_FN_ACC void operator()(Acc1D const& acc, SiPixelDigisSoAView digi_view, SiPixelDigisSoAView fakes_view, @@ -261,7 +263,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { using Hist = cms::alpakatools::HistoContainer; constexpr int warpSize = cms::alpakatools::warpSize; @@ -568,15 +570,25 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { #endif [[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv(acc)[0u]; - // assume that we can cover the whole module with up to maxIterGPU blockDimension-wide iterations - ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < maxIterGPU); + // assume that we can cover the whole module with up to maxIterClustering blockDimension-wide iterations + ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < TrackerTraits::maxIterClustering); // number of elements per thread - constexpr uint32_t maxElements = - cms::alpakatools::requires_single_thread_per_block_v ? maxElementsPerBlock : 1; + const uint32_t maxElements = + cms::alpakatools::requires_single_thread_per_block_v ? (enableDigiMorphing ? maxElementsPerBlockMorph : maxElementsPerBlock) : 1; + + +#ifdef GPU_DEBUG + const auto nthreads = alpaka::getWorkDiv(acc)[0u]; + if (nthreads > maxElements) + printf("This is WRONG: nthreads > maxElements : %d > %d\n",nthreads,maxElements); + else if (thisModuleId % 500 == 1) + printf("This is OK: nthreads <= maxElements : %d <= %d\n",nthreads,maxElements); +#endif + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0u] <= maxElements)); - constexpr unsigned int maxIter = maxIterGPU * maxElements; + const unsigned int maxIter = TrackerTraits::maxIterClustering * maxElements; // nearest neighbours (nn) constexpr int maxNeighbours = 8; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToCluster.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToCluster.cc index 6f204a142ca6b..ceb6991822bfe 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToCluster.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToCluster.cc @@ -210,6 +210,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { digiMorphingConfig_.applyDigiMorphing = iConfig.getParameter("DoDigiMorphing"); digiMorphingConfig_.maxFakesInModule = iConfig.getParameter("MaxFakesInModule"); + if (digiMorphingConfig_.maxFakesInModule > TrackerTraits::maxPixInModuleForMorphing - TrackerTraits::maxPixInModule) { + throw cms::Exception("Configuration") << "[SiPixelDigiMorphing]:" + << " maxFakesInModule should be <= " << TrackerTraits::maxPixInModuleForMorphing - TrackerTraits::maxPixInModule + << " (TrackerTraits::maxPixInModuleForMorphing - TrackerTraits::maxPixInModule)" + << " while " << digiMorphingConfig_.maxFakesInModule << " was provided at config level.\n"; + } + // regions if (!iConfig.getParameter("Regions").getParameterNames().empty()) { regions_ = std::make_unique(iConfig, consumesCollector()); @@ -233,7 +240,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { desc.add("VCaltoElectronOffset", -60.f); desc.add("VCaltoElectronOffset_L1", -670.f); desc.add("DoDigiMorphing", false); - desc.add("MaxFakesInModule", TrackerTraits::maxPixInModule * 2 / 5); + desc.add("MaxFakesInModule", TrackerTraits::maxPixInModuleForMorphing - TrackerTraits::maxPixInModule); desc.add("InputLabel", edm::InputTag("rawDataCollector")); { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index 1a500bb2c7883..cfdd533ecacf9 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -291,7 +291,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // Kernel to perform Raw to Digi conversion template struct RawToDigi_kernel { - ALPAKA_FN_ACC void operator()(Acc1D const &acc, + ALPAKA_FN_ACC void operator()(Acc1D const &acc, const SiPixelMappingSoAConstView &cablingMap, const unsigned char *modToUnp, const uint32_t wordCounter, @@ -570,27 +570,28 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { { const int blocks = 64; - const auto elementsPerBlockFindClus = FindClus::maxElementsPerBlock; - const auto workDivMaxNumModules = cms::alpakatools::make_workdiv(blocks, elementsPerBlockFindClus); + + const auto elementsPerBlockFindClus = digiMorphingConfig.applyDigiMorphing ? FindClus::maxElementsPerBlockMorph : FindClus::maxElementsPerBlock; + const auto workDivFindClus = cms::alpakatools::make_workdiv(blocks, elementsPerBlockFindClus); // allocate a transient collection for the fake pixels recovered by the digi morphing algorithm auto fakes_d = SiPixelDigisSoACollection(blocks * digiMorphingConfig.maxFakesInModule, queue); - #ifdef GPU_DEBUG - std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus + alpaka::wait(queue); + std::cout << "FindClus kernel launch with " << blocks << " blocks of " << elementsPerBlockFindClus << " threadsPerBlockOrElementsPerThread\n"; #endif // Use device buffer created by producer and the module count stored in digiMorphingConfig alpaka::exec(queue, - workDivMaxNumModules, + workDivFindClus, FindClus{}, digis_d->view(), fakes_d.view(), digiMorphingConfig.applyDigiMorphing, morphingModulesDevice, digiMorphingConfig.numMorphingModules, - digiMorphingConfig.maxFakesInModule, + digiMorphingConfig.maxFakesInModule, clusters_d->view(), wordCounter); #ifdef GPU_DEBUG @@ -766,5 +767,5 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template class SiPixelRawToClusterKernel; } // namespace pixelDetails - + } // namespace ALPAKA_ACCELERATOR_NAMESPACE From a487e5e24ecda7bef832abc151862340ed8d1a1b Mon Sep 17 00:00:00 2001 From: adriano Date: Mon, 29 Sep 2025 12:45:44 +0200 Subject: [PATCH 2/2] Code formats, maxPixInModuleForMorphing as no. fake pixels and minor fixes --- .../interface/SimplePixelTopology.h | 6 +-- .../plugins/alpaka/PixelClustering.h | 40 ++++++++++--------- .../plugins/alpaka/SiPixelRawToCluster.cc | 15 +++---- .../alpaka/SiPixelRawToClusterKernel.dev.cc | 10 +++-- 4 files changed, 38 insertions(+), 33 deletions(-) diff --git a/Geometry/CommonTopologies/interface/SimplePixelTopology.h b/Geometry/CommonTopologies/interface/SimplePixelTopology.h index 5737577045c8b..c68ef516b2cbf 100644 --- a/Geometry/CommonTopologies/interface/SimplePixelTopology.h +++ b/Geometry/CommonTopologies/interface/SimplePixelTopology.h @@ -377,7 +377,7 @@ namespace pixelTopology { static constexpr uint16_t last_barrel_detIndex = 864; static constexpr uint32_t maxPixInModule = 6000; - static constexpr uint32_t maxPixInModuleForMorphing = maxPixInModule; + static constexpr uint32_t maxPixInModuleForMorphing = 0; static constexpr uint32_t maxIterClustering = 16; static constexpr uint32_t maxNumClustersPerModules = phase2PixelTopology::maxNumClustersPerModules; @@ -474,7 +474,7 @@ namespace pixelTopology { static constexpr uint16_t last_barrel_detIndex = 1184; static constexpr uint32_t maxPixInModule = 6000; - static constexpr uint32_t maxPixInModuleForMorphing = maxPixInModule * 7/5; + static constexpr uint32_t maxPixInModuleForMorphing = maxPixInModule * 2 / 5; static constexpr uint32_t maxIterClustering = 24; static constexpr uint32_t maxNumClustersPerModules = phase1PixelTopology::maxNumClustersPerModules; @@ -588,7 +588,7 @@ namespace pixelTopology { static constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; static constexpr uint32_t maxPixInModule = 10000; - static constexpr uint32_t maxPixInModuleForMorphing = maxPixInModule * 11/10; + static constexpr uint32_t maxPixInModuleForMorphing = maxPixInModule * 1 / 10; static constexpr uint32_t maxIterClustering = 32; static constexpr uint32_t maxNumOfActiveDoublets = diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h index abd350da97a6f..33481ba71621e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h @@ -17,7 +17,7 @@ #include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/warpsize.h" -// #define GPU_DEBUG +//#define GPU_DEBUG // TODO move to HeterogeneousCore/AlpakaInterface or upstream to alpaka template @@ -174,14 +174,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { template struct FindClus { // assume that we can cover the whole module with up to 16 blockDimension-wide iterations - + // this must be larger than maxPixInModule / maxIterClustering, and should be a multiple of the warp size static constexpr uint32_t maxElementsPerBlock = cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / TrackerTraits::maxIterClustering, 64); - static constexpr uint32_t maxElementsPerBlockMorph = - cms::alpakatools::round_up_by(TrackerTraits::maxPixInModuleForMorphing / TrackerTraits::maxIterClustering, 64); - static_assert(maxElementsPerBlockMorph>=maxElementsPerBlock); - + static constexpr uint32_t maxElementsPerBlockMorph = cms::alpakatools::round_up_by( + (TrackerTraits::maxPixInModule + TrackerTraits::maxPixInModuleForMorphing) / TrackerTraits::maxIterClustering, + 64); + static_assert(maxElementsPerBlockMorph >= maxElementsPerBlock); + ALPAKA_FN_ACC void operator()(Acc1D const& acc, SiPixelDigisSoAView digi_view, SiPixelDigisSoAView fakes_view, @@ -261,11 +262,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { } } - using Hist = cms::alpakatools::HistoContainer; + using Hist = + cms::alpakatools::HistoContainer; constexpr int warpSize = cms::alpakatools::warpSize; auto& hist = alpaka::declareSharedVar(acc); auto& ws = alpaka::declareSharedVar(acc); @@ -574,16 +576,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < TrackerTraits::maxIterClustering); // number of elements per thread - const uint32_t maxElements = - cms::alpakatools::requires_single_thread_per_block_v ? (enableDigiMorphing ? maxElementsPerBlockMorph : maxElementsPerBlock) : 1; - - + const uint32_t maxElements = cms::alpakatools::requires_single_thread_per_block_v + ? (enableDigiMorphing ? maxElementsPerBlockMorph : maxElementsPerBlock) + : 1; + #ifdef GPU_DEBUG - const auto nthreads = alpaka::getWorkDiv(acc)[0u]; - if (nthreads > maxElements) - printf("This is WRONG: nthreads > maxElements : %d > %d\n",nthreads,maxElements); + const auto nElementsPerThread = alpaka::getWorkDiv(acc)[0u]; + if (nElementsPerThread > maxElements) + printf("This is WRONG: nElementsPerThread > maxElements : %d > %d\n", nElementsPerThread, maxElements); else if (thisModuleId % 500 == 1) - printf("This is OK: nthreads <= maxElements : %d <= %d\n",nthreads,maxElements); + printf("This is OK: nElementsPerThread <= maxElements : %d <= %d\n", nElementsPerThread, maxElements); #endif ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0u] <= maxElements)); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToCluster.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToCluster.cc index ceb6991822bfe..14cec6d1b70df 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToCluster.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToCluster.cc @@ -210,13 +210,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { digiMorphingConfig_.applyDigiMorphing = iConfig.getParameter("DoDigiMorphing"); digiMorphingConfig_.maxFakesInModule = iConfig.getParameter("MaxFakesInModule"); - if (digiMorphingConfig_.maxFakesInModule > TrackerTraits::maxPixInModuleForMorphing - TrackerTraits::maxPixInModule) { - throw cms::Exception("Configuration") << "[SiPixelDigiMorphing]:" - << " maxFakesInModule should be <= " << TrackerTraits::maxPixInModuleForMorphing - TrackerTraits::maxPixInModule - << " (TrackerTraits::maxPixInModuleForMorphing - TrackerTraits::maxPixInModule)" - << " while " << digiMorphingConfig_.maxFakesInModule << " was provided at config level.\n"; + if (digiMorphingConfig_.maxFakesInModule > TrackerTraits::maxPixInModuleForMorphing) { + throw cms::Exception("Configuration") + << "[SiPixelDigiMorphing]:" + << " maxFakesInModule should be <= " << TrackerTraits::maxPixInModuleForMorphing + << " (TrackerTraits::maxPixInModuleForMorphing)" + << " while " << digiMorphingConfig_.maxFakesInModule << " was provided at config level.\n"; } - + // regions if (!iConfig.getParameter("Regions").getParameterNames().empty()) { regions_ = std::make_unique(iConfig, consumesCollector()); @@ -240,7 +241,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { desc.add("VCaltoElectronOffset", -60.f); desc.add("VCaltoElectronOffset_L1", -670.f); desc.add("DoDigiMorphing", false); - desc.add("MaxFakesInModule", TrackerTraits::maxPixInModuleForMorphing - TrackerTraits::maxPixInModule); + desc.add("MaxFakesInModule", TrackerTraits::maxPixInModuleForMorphing); desc.add("InputLabel", edm::InputTag("rawDataCollector")); { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index cfdd533ecacf9..b6aa5870c8f13 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -291,7 +291,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // Kernel to perform Raw to Digi conversion template struct RawToDigi_kernel { - ALPAKA_FN_ACC void operator()(Acc1D const &acc, + ALPAKA_FN_ACC void operator()(Acc1D const &acc, const SiPixelMappingSoAConstView &cablingMap, const unsigned char *modToUnp, const uint32_t wordCounter, @@ -571,7 +571,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { { const int blocks = 64; - const auto elementsPerBlockFindClus = digiMorphingConfig.applyDigiMorphing ? FindClus::maxElementsPerBlockMorph : FindClus::maxElementsPerBlock; + const auto elementsPerBlockFindClus = digiMorphingConfig.applyDigiMorphing + ? FindClus::maxElementsPerBlockMorph + : FindClus::maxElementsPerBlock; const auto workDivFindClus = cms::alpakatools::make_workdiv(blocks, elementsPerBlockFindClus); // allocate a transient collection for the fake pixels recovered by the digi morphing algorithm @@ -591,7 +593,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { digiMorphingConfig.applyDigiMorphing, morphingModulesDevice, digiMorphingConfig.numMorphingModules, - digiMorphingConfig.maxFakesInModule, + digiMorphingConfig.maxFakesInModule, clusters_d->view(), wordCounter); #ifdef GPU_DEBUG @@ -767,5 +769,5 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template class SiPixelRawToClusterKernel; } // namespace pixelDetails - + } // namespace ALPAKA_ACCELERATOR_NAMESPACE