diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 3f2a551bc320f..8cfa5c9f5ffde 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -92,6 +92,11 @@ namespace cms {
       return cms::cuda::make_host_unique<T>(stream);
     }
 
+    template <typename T>
+    static auto make_unique(size_t size, cudaStream_t stream) {
+      return cms::cuda::make_host_unique<T>(size, stream);
+    }
+
     template <typename T>
     static auto make_host_unique(cudaStream_t stream) {
       return cms::cuda::make_host_unique<T>(stream);
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
index 70d00ae584279..78406cd241473 100644
--- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
@@ -12,7 +12,10 @@ class SiPixelDigisCUDASOAView {
 public:
   friend class SiPixelDigisCUDA;
-  friend class SiPixelRecHitSoAFromLegacy;
+
+  template <typename TrackerTraits>
+  friend class SiPixelRecHitSoAFromLegacyT;
+
   enum class StorageLocation {
     kCLUS = 0,
     kPDIGI = 2,
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
index 3ee5af80353dd..f9e9b3a37c63f 100644
--- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
+++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -3,7 +3,9 @@
 
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 
-using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
+template <typename TrackerTraits>
+using PixelTrackHeterogeneousT = HeterogeneousSoA<TrackSoAHeterogeneousT<TrackerTraits>>;
 
-#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
\ No newline at end of file
+#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
index 356ea3eddeb7f..b5b1df0d5118a 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
@@ -5,12 +5,13 @@
 #include <string>
 
 #include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
-#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
-
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h"
 
 namespace pixelTrack {
+
   enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality };
   constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)};
   const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"};
@@ -18,20 +19,24 @@ namespace pixelTrack {
     auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName;
     return static_cast<Quality>(qp);
   }
+
 }  // namespace pixelTrack
 
-template <int32_t S>
+template <typename TrackerTraits>
 class TrackSoAHeterogeneousT {
 public:
+  static constexpr int32_t S = TrackerTraits::maxNumberOfTuples;
+  static constexpr int32_t H = TrackerTraits::maxHitsOnTrack;  // Average hits rather than max?
   static constexpr int32_t stride() { return S; }
 
+  using hindex_type = uint32_t;  //TrackerTraits::hindex_type ?
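// --- illustrative sketch, not part of the diff -------------------------------
// The change above swaps the value parameter `template <int32_t S>` for a policy
// class: every capacity is now derived from TrackerTraits at compile time, so one
// template instantiation per detector phase replaces the old runtime switches.
// `ToyTraits`/`ToySoA` are invented names; the constant mimics the Phase1
// maxNumber() value (32 * 1024) quoted elsewhere in this diff.
#include <cstdint>

struct ToyTraits {  // stand-in for pixelTopology::Phase1
  static constexpr int32_t maxNumberOfTuples = 32 * 1024;
  static constexpr int32_t maxHitsOnTrack = 10;
};

template <typename TrackerTraits>
class ToySoA {
public:
  static constexpr int32_t S = TrackerTraits::maxNumberOfTuples;  // capacity fixed per geometry
  static constexpr int32_t stride() { return S; }
};

static_assert(ToySoA<ToyTraits>::stride() == 32 * 1024);  // resolved entirely at compile time
// -----------------------------------------------------------------------------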
+ using Quality = pixelTrack::Quality; - using hindex_type = uint32_t; - using HitContainer = cms::cuda::OneToManyAssoc; + using HitContainer = cms::cuda::OneToManyAssoc; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... -private: +protected: eigenSoA::ScalarSoA quality_; public: @@ -56,9 +61,9 @@ class TrackSoAHeterogeneousT { // layers are in order and we assume tracks are either forward or backward auto pdet = detIndices.begin(i); int nl = 1; - auto ol = phase1PixelTopology::getLayer(*pdet); + auto ol = pixelTopology::getLayer(*pdet); for (; pdet < detIndices.end(i); ++pdet) { - auto il = phase1PixelTopology::getLayer(*pdet); + auto il = pixelTopology::getLayer(*pdet); if (il != ol) ++nl; ol = il; @@ -90,17 +95,100 @@ class TrackSoAHeterogeneousT { namespace pixelTrack { -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumber() { return 2 * 1024; } -#else - // tested on MC events with 55-75 pileup events - constexpr uint32_t maxNumber() { return 32 * 1024; } -#endif + template + using TrackSoAT = TrackSoAHeterogeneousT; + + template + using HitContainerT = typename TrackSoAHeterogeneousT::HitContainer; + + //Used only to ease classes definitions + using TrackSoAPhase1 = TrackSoAHeterogeneousT; + using TrackSoAPhase2 = TrackSoAHeterogeneousT; + + template + struct QualityCutsT {}; + + template + struct QualityCutsT> { + // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) + float chi2Coeff[4]; + float chi2MaxPt; // GeV + float chi2Scale; + + struct Region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + Region triplet; + Region quadruplet; + + __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, + int nHits, + int it) const { + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nHits > 3) ? 
quadruplet : triplet; + return (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and + (std::abs(tracks->zip(it)) < region.maxZip); + } - using TrackSoA = TrackSoAHeterogeneousT; - using TrajectoryState = TrajectoryStateSoAT; - using HitContainer = TrackSoA::HitContainer; + __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, + int it) const { + auto roughLog = [](float x) { + // max diff [0.5,12] at 1.25 0.16143 + // average diff 0.0662998 + union IF { + uint32_t i; + float f; + }; + IF z; + z.f = x; + uint32_t lsb = 1 < 21; + z.i += lsb; + z.i >>= 21; + auto f = z.i & 3; + int ex = int(z.i >> 2) - 127; + + // log2(1+0.25*f) + // averaged over bins + const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; + return float(ex) + frac[f]; + }; + + float pt = std::min(tracks->pt(it), chi2MaxPt); + float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); + if (tracks->chi2(it) >= chi2Cut) { +#ifdef NTUPLE_FIT_DEBUG + printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks->pt(it), tracks->eta(it), tracks->chi2(it)); +#endif + return true; + } + return false; + } + }; + + template + struct QualityCutsT> { + float maxChi2; + float minPt; + float maxTip; + float maxZip; + + __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, + int nHits, + int it) const { + return (std::abs(tracks->tip(it)) < maxTip) and (tracks->pt(it) > minPt) and (std::abs(tracks->zip(it)) < maxZip); + } + __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, + int it) const { + return tracks->chi2(it) >= maxChi2; + } + }; } // namespace pixelTrack diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 9c80ae91baf29..5216c19dded65 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,6 +1,15 @@ - - - - + + + + + + + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index 8ce37f280ac6c..ad78daa8354e2 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -4,10 +4,10 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" -template -class TrackingRecHit2DHeterogeneous { -public: +namespace { enum class Storage32 { kXLocal = 0, kYLocal = 1, @@ -28,37 +28,45 @@ class TrackingRecHit2DHeterogeneous { kXSize = 2, kYSize = 3, }; +} // namespace + +template +class TrackingRecHit2DHeterogeneousT { +public: + template + friend class TrackingRecHit2DHostT; template using unique_ptr = typename Traits::template unique_ptr; - using PhiBinner = TrackingRecHit2DSOAView::PhiBinner; + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; + using PhiBinner = typename TrackingRecHit2DSOAView::PhiBinner; + using AverageGeometry = typename TrackingRecHit2DSOAView::AverageGeometry; - TrackingRecHit2DHeterogeneous() = default; + TrackingRecHit2DHeterogeneousT() = default; - explicit TrackingRecHit2DHeterogeneous( - uint32_t nHits, - bool isPhase2, - int32_t 
offsetBPIX2, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneous const* input = nullptr); + explicit TrackingRecHit2DHeterogeneousT(uint32_t nHits, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream = nullptr); - explicit TrackingRecHit2DHeterogeneous( - float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr); - ~TrackingRecHit2DHeterogeneous() = default; + explicit TrackingRecHit2DHeterogeneousT(cms::cuda::host::unique_ptr& store32, + cms::cuda::host::unique_ptr& store16, + uint32_t* modules, + int nHits, + cudaStream_t stream = nullptr); + ~TrackingRecHit2DHeterogeneousT() = default; - TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete; - TrackingRecHit2DHeterogeneous& operator=(const TrackingRecHit2DHeterogeneous&) = delete; - TrackingRecHit2DHeterogeneous(TrackingRecHit2DHeterogeneous&&) = default; - TrackingRecHit2DHeterogeneous& operator=(TrackingRecHit2DHeterogeneous&&) = default; + TrackingRecHit2DHeterogeneousT(const TrackingRecHit2DHeterogeneousT&) = delete; + TrackingRecHit2DHeterogeneousT& operator=(const TrackingRecHit2DHeterogeneousT&) = delete; + TrackingRecHit2DHeterogeneousT(TrackingRecHit2DHeterogeneousT&&) = default; + TrackingRecHit2DHeterogeneousT& operator=(TrackingRecHit2DHeterogeneousT&&) = default; TrackingRecHit2DSOAView* view() { return m_view.get(); } TrackingRecHit2DSOAView const* view() const { return m_view.get(); } auto nHits() const { return m_nHits; } - auto nMaxModules() const { return m_nMaxModules; } auto offsetBPIX2() const { return m_offsetBPIX2; } auto hitsModuleStart() const { return m_hitsModuleStart; } @@ -74,10 +82,7 @@ class TrackingRecHit2DHeterogeneous { cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; - // needs specialization for Host - void copyFromGPU(TrackingRecHit2DHeterogeneous const* input, cudaStream_t stream); - -private: +protected: static constexpr uint32_t n16 = 4; // number of elements in m_store16 static constexpr uint32_t n32 = 10; // number of elements in m_store32 static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious @@ -85,8 +90,8 @@ class TrackingRecHit2DHeterogeneous { unique_ptr m_store16; //! unique_ptr m_store32; //! - unique_ptr m_PhiBinnerStore; //! - unique_ptr m_AverageGeometryStore; //! + unique_ptr m_PhiBinnerStore; //! + unique_ptr m_AverageGeometryStore; //! unique_ptr m_view; //! @@ -95,39 +100,86 @@ class TrackingRecHit2DHeterogeneous { uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! - uint32_t m_nMaxModules; // needed as kernel params... 
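// --- illustrative sketch, not part of the diff -------------------------------
// How the n32 float columns behind m_store32 are addressed: one flat allocation
// of n32 * nHits floats, an enum naming each column, and a small lambda that
// returns the start of a column (the constructors below do exactly this through
// their get32 helper). Plain std::unique_ptr stands in for the Traits-provided
// unique_ptr here; the column names are abbreviated.
#include <cstdint>
#include <memory>

enum class Col : uint32_t { kXLocal = 0, kYLocal = 1, kXerror = 2 };

int main() {
  constexpr uint32_t n32 = 10;  // number of float columns, as in the class above
  const uint32_t nHits = 200;
  auto store32 = std::make_unique<float[]>(n32 * nHits);  // single flat SoA buffer
  auto get32 = [&](Col i) { return store32.get() + static_cast<uint32_t>(i) * nHits; };
  float* xl = get32(Col::kXLocal);  // a column "view": no copy, no extra allocation
  xl[0] = 0.5f;
  return xl[0] == 0.5f ? 0 : 1;
}
// -----------------------------------------------------------------------------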
PhiBinner* m_phiBinner; - PhiBinner::index_type* m_phiBinnerStorage; + typename PhiBinner::index_type* m_phiBinnerStorage; uint32_t* m_hitsLayerStart; int16_t* m_iphi; }; -using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; +//Inherit and overload only what we need to overload, remember to use this-> +//GPU +template +class TrackingRecHit2DGPUT : public TrackingRecHit2DHeterogeneousT { +public: + using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; + + cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; +}; + +//CPU +template +class TrackingRecHit2DCPUT : public TrackingRecHit2DHeterogeneousT { +public: + using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; + + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; +}; + +//HOST +template +class TrackingRecHit2DHostT : public TrackingRecHit2DHeterogeneousT { +public: + ~TrackingRecHit2DHostT() = default; + TrackingRecHit2DHostT() = default; + + explicit TrackingRecHit2DHostT(uint32_t nHits, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream = nullptr) + : TrackingRecHit2DHeterogeneousT( + nHits, offsetBPIX2, cpeParams, hitsModuleStart, stream) {} + + explicit TrackingRecHit2DHostT(cms::cuda::host::unique_ptr& store32, + cms::cuda::host::unique_ptr& store16, + uint32_t* modules, + int nHits, + cudaStream_t stream = nullptr) + : TrackingRecHit2DHeterogeneousT( + store32, store16, modules, nHits, stream) {} + + explicit TrackingRecHit2DHostT(uint32_t nHits, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream, + TrackingRecHit2DHeterogeneousT const* input); +}; #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -template -TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( +template +TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( uint32_t nHits, - bool isPhase2, int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneous const* input) + cudaStream_t stream) : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) { - auto view = Traits::template make_host_unique(stream); + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + auto view = Traits::template make_host_unique(stream); view->m_nHits = nHits; - view->m_nMaxModules = m_nMaxModules; m_view = Traits::template make_unique(stream); // leave it on host and pass it by value? 
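// --- illustrative sketch, not part of the diff -------------------------------
// Why the comment above insists on `this->`: members inherited from a base class
// that depends on a template parameter are invisible to unqualified lookup, so
// the derived *ToHostAsync overloads must spell this->m_store32, this->nHits(),
// and so on. Minimal reproduction of the rule:
template <typename T>
struct Base {
  int m_nHits = 42;
};

template <typename T>
struct Derived : Base<T> {
  constexpr int hits() const {
    // return m_nHits;     // would not compile: the name lives in a dependent base
    return this->m_nHits;  // ok: lookup is deferred to instantiation time
  }
};

static_assert(Derived<float>{}.hits() == 42, "dependent-base member reached via this->");
// -----------------------------------------------------------------------------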
- m_AverageGeometryStore = Traits::template make_unique(stream); + m_AverageGeometryStore = Traits::template make_unique(stream); view->m_averageGeometry = m_AverageGeometryStore.get(); view->m_cpeParams = cpeParams; view->m_hitsModuleStart = hitsModuleStart; @@ -148,29 +200,20 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( // this will break 1to1 correspondence with cluster and module locality // so unless proven VERY inefficient we keep it ordered as generated - // host copy is "reduced" (to be reviewed at some point) - if constexpr (std::is_same_v) { - // it has to compile for ALL cases - copyFromGPU(input, stream); - } else { - assert(input == nullptr); - - auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; - - m_store16 = Traits::template make_unique(nHits * n16, stream); - m_store32 = Traits::template make_unique(nHits * n32 + nL + 1, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - } + m_store16 = Traits::template make_unique(nHits * n16, stream); + m_store32 = Traits::template make_unique(nHits * n32 + TrackerTraits::numberOfLayers + 1, stream); + m_PhiBinnerStore = Traits::template make_unique(stream); - static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type)); + static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); + static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == + sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; // copy all the pointers m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); + reinterpret_cast(get32(Storage32::kPhiStorage)); view->m_xl = get32(Storage32::kXLocal); view->m_yl = get32(Storage32::kYLocal); @@ -178,23 +221,20 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( view->m_yerr = get32(Storage32::kYerror); view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - if constexpr (!std::is_same_v) { - assert(input == nullptr); - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); + view->m_xg = get32(Storage32::kXGlobal); + view->m_yg = get32(Storage32::kYGlobal); + view->m_zg = get32(Storage32::kZGlobal); + view->m_rg = get32(Storage32::kRGlobal); - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); + auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; + m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - view->m_detInd = get16(Storage16::kDetId); + view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); + view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); + view->m_detInd = get16(Storage16::kDetId); - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(Storage32::kLayers)); - } + m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); + m_hitsLayerStart = view->m_hitsLayerStart = 
reinterpret_cast(get32(Storage32::kLayers)); // transfer view if constexpr (std::is_same_v) { @@ -204,10 +244,67 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( } } +template +TrackingRecHit2DHostT::TrackingRecHit2DHostT( + uint32_t nHits, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream, + TrackingRecHit2DHeterogeneousT const* input) { + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; + + this->m_nHits = nHits; + this->m_offsetBPIX2 = offsetBPIX2; + this->m_hitsModuleStart = hitsModuleStart; + + auto view = cms::cuda::make_host_unique(stream); + + view->m_nHits = nHits; + this->m_view = + cms::cuda::make_host_unique(stream); // leave it on host and pass it by value? + this->m_AverageGeometryStore = cms::cuda::make_host_unique(stream); + view->m_averageGeometry = this->m_AverageGeometryStore.get(); + view->m_cpeParams = cpeParams; + view->m_hitsModuleStart = hitsModuleStart; + + // if empy do not bother + if (0 == nHits) { + this->m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version + return; + } + + this->m_store32 = cms::cuda::make_host_unique(5 * input->nHits(), stream); + cms::cuda::copyAsync(this->m_store32, input->m_store32, 5 * input->nHits(), stream); + + static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); + static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == + sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); + + auto get32 = [&](Storage32 i) { return this->m_store32.get() + static_cast(i) * nHits; }; + + // copy all the pointers + this->m_phiBinner = view->m_phiBinner = this->m_PhiBinnerStore.get(); + this->m_phiBinnerStorage = view->m_phiBinnerStorage = + reinterpret_cast(get32(Storage32::kPhiStorage)); + + view->m_xl = get32(Storage32::kXLocal); + view->m_yl = get32(Storage32::kYLocal); + view->m_xerr = get32(Storage32::kXerror); + view->m_yerr = get32(Storage32::kYerror); + view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); + + this->m_view = std::move(view); +} + //this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases -template -TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( - float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream) +template +TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( + cms::cuda::host::unique_ptr& store32, + cms::cuda::host::unique_ptr& store16, + uint32_t* modules, + int nHits, + cudaStream_t stream) : m_nHits(nHits), m_hitsModuleStart(modules) { auto view = Traits::template make_host_unique(stream); @@ -226,19 +323,20 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( m_store16 = Traits::template make_unique(nHits * n16, stream); m_store32 = Traits::template make_unique(nHits * n32, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - m_AverageGeometryStore = Traits::template make_unique(stream); + m_PhiBinnerStore = Traits::template make_unique(stream); + m_AverageGeometryStore = Traits::template make_unique(stream); view->m_averageGeometry = m_AverageGeometryStore.get(); view->m_hitsModuleStart = m_hitsModuleStart; //store transfer if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_store16, store16, stream); - cms::cuda::copyAsync(m_store32, store32, stream); + cms::cuda::copyAsync(m_store16, store16, static_cast(n16 * nHits), stream); + cms::cuda::copyAsync(m_store32, store32, static_cast(n32 * nHits), 
stream); + } else { - std::copy(store32, store32 + nHits * n32, m_store32.get()); // want to copy it - std::copy(store16, store16 + nHits * n16, m_store16.get()); + std::copy(store32.get(), store32.get() + nHits * n32, m_store32.get()); // want to copy it + std::copy(store16.get(), store16.get() + nHits * n16, m_store16.get()); } //getters @@ -258,7 +356,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); + reinterpret_cast(get32(Storage32::kPhiStorage)); //Store 16 view->m_detInd = get16(Storage16::kDetId); @@ -274,4 +372,13 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( } } -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h +//Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. +using TrackingRecHit2DGPUPhase1 = TrackingRecHit2DGPUT; +using TrackingRecHit2DCPUPhase1 = TrackingRecHit2DCPUT; +using TrackingRecHit2DHostPhase1 = TrackingRecHit2DHostT; + +using TrackingRecHit2DGPUPhase2 = TrackingRecHit2DGPUT; +using TrackingRecHit2DCPUPhase2 = TrackingRecHit2DCPUT; +using TrackingRecHit2DHostPhase2 = TrackingRecHit2DHostT; + +#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneousT_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h index f252ca94d2296..8fd2bc54cfad7 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h @@ -1,17 +1,20 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h +#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h +#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" #include "CUDADataFormats/Common/interface/HostProduct.h" // a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction -class TrackingRecHit2DReduced { +template +class TrackingRecHit2DReducedT { + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; + public: using HLPstorage = HostProduct; using HIDstorage = HostProduct; template - TrackingRecHit2DReduced(UP32&& istore32, UP16&& istore16, int nhits) + TrackingRecHit2DReducedT(UP32&& istore32, UP16&& istore16, int nhits) : m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) { auto get32 = [&](int i) { return const_cast(m_store32.get()) + i * nhits; }; @@ -26,15 +29,15 @@ class TrackingRecHit2DReduced { } // view only! 
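// --- illustrative sketch, not part of the diff -------------------------------
// The store transfer above relies on `if constexpr` so that a single templated
// constructor body serves both products: the GPU specialization takes the
// cms::cuda::copyAsync branch, while the host one compiles down to std::copy.
// GPUTraits/HostTraits below are stand-in tags for the real traits classes.
#include <algorithm>
#include <type_traits>

struct GPUTraits {};
struct HostTraits {};

template <typename Traits>
void transferStore(const float* src, float* dst, int n) {
  if constexpr (std::is_same_v<Traits, GPUTraits>) {
    // the real code issues cms::cuda::copyAsync(dst, src, n, stream) here
    // (asynchronous on a CUDA stream); silence unused warnings in this toy
    (void)src, (void)dst, (void)n;
  } else {
    std::copy(src, src + n, dst);  // host product: synchronous element copy
  }
}
// -----------------------------------------------------------------------------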
- TrackingRecHit2DReduced(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {} + TrackingRecHit2DReducedT(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {} - TrackingRecHit2DReduced() = default; - ~TrackingRecHit2DReduced() = default; + TrackingRecHit2DReducedT() = default; + ~TrackingRecHit2DReducedT() = default; - TrackingRecHit2DReduced(const TrackingRecHit2DReduced&) = delete; - TrackingRecHit2DReduced& operator=(const TrackingRecHit2DReduced&) = delete; - TrackingRecHit2DReduced(TrackingRecHit2DReduced&&) = default; - TrackingRecHit2DReduced& operator=(TrackingRecHit2DReduced&&) = default; + TrackingRecHit2DReducedT(const TrackingRecHit2DReducedT&) = delete; + TrackingRecHit2DReducedT& operator=(const TrackingRecHit2DReducedT&) = delete; + TrackingRecHit2DReducedT(TrackingRecHit2DReducedT&&) = default; + TrackingRecHit2DReducedT& operator=(TrackingRecHit2DReducedT&&) = default; TrackingRecHit2DSOAView& view() { return m_view; } TrackingRecHit2DSOAView const& view() const { return m_view; } @@ -50,4 +53,7 @@ class TrackingRecHit2DReduced { int m_nHits; }; +using TrackingRecHit2DReducedPhase1 = TrackingRecHit2DReducedT; +using TrackingRecHit2DReducedPhase2 = TrackingRecHit2DReducedT; + #endif diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h index 39ee136189955..59b7cb1337fdf 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h @@ -10,27 +10,34 @@ #include "CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h" namespace pixelCPEforGPU { - struct ParamsOnGPU; + template + struct ParamsOnGPUT; } -class TrackingRecHit2DSOAView { +template +class TrackingRecHit2DSOAViewT { public: using Status = SiPixelHitStatus; static_assert(sizeof(Status) == sizeof(uint8_t)); - using hindex_type = uint32_t; // if above is <=2^32 - - using PhiBinner = cms::cuda:: - HistoContainer; //28 for phase2 geometry - - using AverageGeometry = pixelTopology::AverageGeometry; - + using hindex_type = typename TrackerTraits::hindex_type; + using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry + using AverageGeometry = pixelTopology::AverageGeometryT; + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + + template + friend class TrackingRecHit2DHeterogeneousT; template - friend class TrackingRecHit2DHeterogeneous; - friend class TrackingRecHit2DReduced; + friend class TrackingRecHit2DHostT; + // template + // friend class TrackingRecHit2DReducedT; __device__ __forceinline__ uint32_t nHits() const { return m_nHits; } - __device__ __forceinline__ uint32_t nMaxModules() const { return m_nMaxModules; } __device__ __forceinline__ float& xLocal(int i) { return m_xl[i]; } __device__ __forceinline__ float xLocal(int i) const { return __ldg(m_xl + i); } @@ -75,7 +82,7 @@ class TrackingRecHit2DSOAView { __device__ __forceinline__ uint16_t& detectorIndex(int i) { return m_detInd[i]; } __device__ __forceinline__ uint16_t detectorIndex(int i) const { return __ldg(m_detInd + i); } - __device__ __forceinline__ pixelCPEforGPU::ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } + __device__ __forceinline__ ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } __device__ __forceinline__ uint32_t hitsModuleStart(int i) const { return __ldg(m_hitsModuleStart + i); } @@ -88,6 +95,9 @@ class TrackingRecHit2DSOAView { __device__ 
__forceinline__ AverageGeometry& averageGeometry() { return *m_averageGeometry; } __device__ __forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } + __device__ __forceinline__ bool clusterCut(int i, int o, bool debug = false) const { return false; } + __device__ __forceinline__ bool zSizeCut(int i, int o, bool debug = false) const { return false; } + private: // local coord float *m_xl, *m_yl; @@ -106,17 +116,16 @@ class TrackingRecHit2DSOAView { // supporting objects // m_averageGeometry is corrected for beam spot, not sure where to host it otherwise - AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous - pixelCPEforGPU::ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned - uint32_t const* m_hitsModuleStart; // forwarded from clusters + AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous + ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned + uint32_t const* m_hitsModuleStart; // forwarded from clusters uint32_t* m_hitsLayerStart; PhiBinner* m_phiBinner; - PhiBinner::index_type* m_phiBinnerStorage; + typename PhiBinner::index_type* m_phiBinnerStorage; uint32_t m_nHits; - uint32_t m_nMaxModules; }; #endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc index fc6a05ba9ed3e..05c3eba3d8bde 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc @@ -4,38 +4,46 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(5 * nHits(), stream); - cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream); +template +cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::localCoordToHostAsync( + cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(5 * this->nHits(), stream); + cms::cuda::copyAsync(ret, this->m_store32, 5 * this->nHits(), stream); return ret; } -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(n32) * nHits(), stream); - cms::cuda::copyAsync(ret, m_store32, static_cast(n32) * nHits(), stream); +template +cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store32ToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(static_cast(this->n32) * this->nHits(), stream); + cms::cuda::copyAsync(ret, this->m_store32, static_cast(this->n32) * this->nHits(), stream); return ret; } -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(n16) * nHits(), stream); - cms::cuda::copyAsync(ret, m_store16, static_cast(n16) * nHits(), stream); +template +cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store16ToHostAsync( + cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(static_cast(this->n16) * this->nHits(), stream); + cms::cuda::copyAsync(ret, this->m_store16, static_cast(this->n16) * this->nHits(), stream); return ret; } -template <> -cms::cuda::host::unique_ptr 
TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nMaxModules() + 1, stream); - cudaCheck( - cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream)); +template +cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::hitsModuleStartToHostAsync( + cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(TrackerTraits::numberOfModules + 1, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), + this->m_hitsModuleStart, + sizeof(uint32_t) * (TrackerTraits::numberOfModules + 1), + cudaMemcpyDefault, + stream)); return ret; } -// the only specialization needed -template <> -void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) { - assert(input); - m_store32 = input->localCoordToHostAsync(stream); -} +template class TrackingRecHit2DGPUT; +template class TrackingRecHit2DGPUT; + +template class TrackingRecHit2DCPUT; +template class TrackingRecHit2DCPUT; + +template class TrackingRecHit2DHostT; +template class TrackingRecHit2DHostT; diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h index abecfb38797de..b9a20695712e3 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes.h +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -1,9 +1,9 @@ -#ifndef CUDADataFormats_SiPixelCluster_src_classes_h -#define CUDADataFormats_SiPixelCluster_src_classes_h +#ifndef CUDADataFormats_TrackingRecHit_src_classes_h +#define CUDADataFormats_TrackingRecHit_src_classes_h #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h" #include "DataFormats/Common/interface/Wrapper.h" -#endif // CUDADataFormats_SiPixelCluster_src_classes_h +#endif // CUDADataFormats_TrackingRecHit_src_classes_h diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index f633d77c48ef7..4287860ee8495 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -1,10 +1,22 @@ - - - - - - - - + + + + + + + + + + + + + + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml index ce49c46fffba0..f064563aa7051 100644 --- a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml @@ -1,4 +1,5 @@ + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h new file mode 100644 index 0000000000000..b2da57c2471ae --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h @@ -0,0 +1,26 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +namespace testTrackingRecHit2D { + + template + __global__ void fill(TrackingRecHit2DSOAViewT* phits) { + assert(phits); + auto& hits = *phits; + assert(hits.nHits() == 200); + + int i = threadIdx.x; + if (i > 200) + return; + } + + template + __global__ void verify(TrackingRecHit2DSOAViewT const* phits) { + assert(phits); + auto const& hits = *phits; + assert(hits.nHits() == 200); + + int i = threadIdx.x; + if (i > 200) + return; + } +} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp 
b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp index 8aca68e294469..0d910273933dc 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp @@ -6,9 +6,9 @@ namespace testTrackingRecHit2D { - void runKernels(TrackingRecHit2DSOAView* hits); - -} + template + void runKernels(TrackingRecHit2DSOAViewT* hits); +} // namespace testTrackingRecHit2D int main() { cms::cudatest::requireDevices(); @@ -19,23 +19,21 @@ int main() { auto nHits = 200; // inner scope to deallocate memory before destroying the stream { - TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhit.view()); + TrackingRecHit2DGPUT tkhit(nHits, 0, nullptr, nullptr, stream); + testTrackingRecHit2D::runKernels(tkhit.view()); - TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhitPhase2.view()); + TrackingRecHit2DGPUT tkhitPhase2(nHits, 0, nullptr, nullptr, stream); + testTrackingRecHit2D::runKernels(tkhitPhase2.view()); - TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, stream, &tkhit); + TrackingRecHit2DHostT tkhitH(nHits, 0, nullptr, nullptr, stream, &tkhit); cudaStreamSynchronize(stream); assert(tkhitH.view()); assert(tkhitH.view()->nHits() == unsigned(nHits)); - assert(tkhitH.view()->nMaxModules() == phase1PixelTopology::numberOfModules); - TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, stream, &tkhit); + TrackingRecHit2DHostT tkhitHPhase2(nHits, 0, nullptr, nullptr, stream, &tkhitPhase2); cudaStreamSynchronize(stream); assert(tkhitHPhase2.view()); assert(tkhitHPhase2.view()->nHits() == unsigned(nHits)); - assert(tkhitHPhase2.view()->nMaxModules() == phase2PixelTopology::numberOfModules); } cudaCheck(cudaStreamDestroy(stream)); diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu index 06bd599d074f9..e902ea971edf3 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu @@ -1,31 +1,15 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "TrackingRecHit2DCUDAImpl_t.h" namespace testTrackingRecHit2D { - __global__ void fill(TrackingRecHit2DSOAView* phits) { - assert(phits); - auto& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - return; - } - - __global__ void verify(TrackingRecHit2DSOAView const* phits) { - assert(phits); - auto const& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - return; - } - - void runKernels(TrackingRecHit2DSOAView* hits) { + template + void runKernels(TrackingRecHit2DSOAViewT* hits) { assert(hits); - fill<<<1, 1024>>>(hits); - verify<<<1, 1024>>>(hits); + fill<<<1, 1024>>>(hits); + verify<<<1, 1024>>>(hits); } + template void runKernels(TrackingRecHit2DSOAViewT* hits); + template void runKernels(TrackingRecHit2DSOAViewT* hits); } // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h index e31b87f30fa11..95106050f3d7a 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h +++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h @@ -8,7 +8,7 @@ // These vertices are clusterized and fitted only along the beam line (z) // to obtain their global coordinate the beam spot position shall 
be added (eventually correcting for the beam angle as well) struct ZVertexSoA { - static constexpr uint32_t MAXTRACKS = 32 * 1024; + static constexpr uint32_t MAXTRACKS = 128 * 1024; static constexpr uint32_t MAXVTX = 1024; int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc index fbd5a41d4a898..71abb95dbb4d1 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelPhase1CompareRecHitsSoA // Class: SiPixelPhase1CompareRecHitsSoA // -/**\class SiPixelPhase1CompareRecHitsSoA SiPixelPhase1CompareRecHitsSoA.cc +/**\class SiPixelPhase1CompareRecHitsSoA SiPixelPhase1CompareRecHitsSoA.cc */ // // Author: Suvankar Roy Chowdhury, Alessandro Rossi @@ -29,6 +29,9 @@ class SiPixelPhase1CompareRecHitsSoA : public DQMEDAnalyzer { public: + using HitSoA = TrackingRecHit2DSOAViewT; + using HitsOnCPU = TrackingRecHit2DCPUT; + explicit SiPixelPhase1CompareRecHitsSoA(const edm::ParameterSet&); ~SiPixelPhase1CompareRecHitsSoA() override = default; void dqmBeginRun(const edm::Run&, const edm::EventSetup&) override; @@ -39,8 +42,8 @@ class SiPixelPhase1CompareRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; - const edm::EDGetTokenT tokenSoAHitsGPU_; + const edm::EDGetTokenT tokenSoAHitsCPU_; + const edm::EDGetTokenT tokenSoAHitsGPU_; const std::string topFolderName_; const float mind2cut_; static constexpr uint32_t invalidHit_ = std::numeric_limits::max(); @@ -77,8 +80,8 @@ class SiPixelPhase1CompareRecHitsSoA : public DQMEDAnalyzer { SiPixelPhase1CompareRecHitsSoA::SiPixelPhase1CompareRecHitsSoA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), topoToken_(esConsumes()), - tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), - tokenSoAHitsGPU_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), + tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), + tokenSoAHitsGPU_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), mind2cut_(iConfig.getParameter("minD2cut")) {} // @@ -106,10 +109,11 @@ void SiPixelPhase1CompareRecHitsSoA::analyze(const edm::Event& iEvent, const edm out << "the comparison will not run."; return; } + auto const& rhsoaCPU = *rhsoaHandleCPU; - const TrackingRecHit2DSOAView* soa2dCPU = rhsoaCPU.view(); + const HitSoA* soa2dCPU = rhsoaCPU.view(); auto const& rhsoaGPU = *rhsoaHandleGPU; - const TrackingRecHit2DSOAView* soa2dGPU = rhsoaGPU.view(); + const HitSoA* soa2dGPU = rhsoaGPU.view(); uint32_t nHitsCPU = soa2dCPU->nHits(); uint32_t nHitsGPU = soa2dGPU->nHits(); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc index 7b12f694d4e8c..915c2ac1399f5 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelPhase1CompareTrackSoA // Class: SiPixelPhase1CompareTrackSoA // -/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc +/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc 
*/ // // Author: Suvankar Roy Chowdhury @@ -64,6 +64,8 @@ namespace { class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { public: + using PixelTrackSoAPhase1 = PixelTrackHeterogeneousT; + explicit SiPixelPhase1CompareTrackSoA(const edm::ParameterSet&); ~SiPixelPhase1CompareTrackSoA() override = default; void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; @@ -71,8 +73,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoATrackCPU_; - const edm::EDGetTokenT tokenSoATrackGPU_; + const edm::EDGetTokenT tokenSoATrackCPU_; + const edm::EDGetTokenT tokenSoATrackGPU_; const std::string topFolderName_; const bool useQualityCut_; const pixelTrack::Quality minQuality_; @@ -113,8 +115,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1CompareTrackSoA::SiPixelPhase1CompareTrackSoA(const edm::ParameterSet& iConfig) - : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), - tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), + : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), + tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), useQualityCut_(iConfig.getParameter("useQualityCut")), minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))), diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc index 4559e57d1482c..231186f88e53f 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc @@ -3,7 +3,7 @@ // Package: SiPixelPhase1MonitorRecHitsSoA // Class: SiPixelPhase1MonitorRecHitsSoA // -/**\class SiPixelPhase1MonitorRecHitsSoA SiPixelPhase1MonitorRecHitsSoA.cc +/**\class SiPixelPhase1MonitorRecHitsSoA SiPixelPhase1MonitorRecHitsSoA.cc */ // // Author: Suvankar Roy Chowdhury, Alessandro Rossi @@ -30,6 +30,9 @@ class SiPixelPhase1MonitorRecHitsSoA : public DQMEDAnalyzer { public: + using HitSoA = TrackingRecHit2DSOAViewT; + using HitsOnCPU = TrackingRecHit2DCPUT; + explicit SiPixelPhase1MonitorRecHitsSoA(const edm::ParameterSet&); ~SiPixelPhase1MonitorRecHitsSoA() override = default; void dqmBeginRun(const edm::Run&, const edm::EventSetup&) override; @@ -40,7 +43,7 @@ class SiPixelPhase1MonitorRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; + const edm::EDGetTokenT tokenSoAHitsCPU_; const std::string topFolderName_; const TrackerGeometry* tkGeom_ = nullptr; const TrackerTopology* tTopo_ = nullptr; @@ -74,7 +77,7 @@ class SiPixelPhase1MonitorRecHitsSoA : public DQMEDAnalyzer { SiPixelPhase1MonitorRecHitsSoA::SiPixelPhase1MonitorRecHitsSoA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), topoToken_(esConsumes()), - tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrc"))), + tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrc"))), topFolderName_(iConfig.getParameter("TopFolderName")) {} // // Begin Run @@ -94,7 +97,7 @@ void SiPixelPhase1MonitorRecHitsSoA::analyze(const edm::Event& iEvent, const edm return; } auto const& rhsoa = *rhsoaHandle; - const TrackingRecHit2DSOAView* soa2d = 
rhsoa.view(); + const HitSoA* soa2d = rhsoa.view(); uint32_t nHits_ = soa2d->nHits(); hnHits->Fill(nHits_); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc index 622895ba07bcc..5d2545b6cdc9f 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc @@ -3,7 +3,7 @@ // Package: SiPixelPhase1MonitorTrackSoA // Class: SiPixelPhase1MonitorTrackSoA // -/**\class SiPixelPhase1MonitorTrackSoA SiPixelPhase1MonitorTrackSoA.cc +/**\class SiPixelPhase1MonitorTrackSoA SiPixelPhase1MonitorTrackSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -27,6 +27,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { public: + using PixelTrackHeterogeneousPhase1 = PixelTrackHeterogeneousT; explicit SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet&); ~SiPixelPhase1MonitorTrackSoA() override = default; void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; @@ -34,7 +35,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoATrack_; std::string topFolderName_; bool useQualityCut_; pixelTrack::Quality minQuality_; @@ -62,7 +63,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorTrackSoA::SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet& iConfig) { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); //"SiPixelHeterogeneous/PixelTrackSoA"; useQualityCut_ = iConfig.getParameter("useQualityCut"); minQuality_ = pixelTrack::qualityByName(iConfig.getParameter("minQuality")); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc index af6c240a69172..6324cee4372d8 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc @@ -3,7 +3,7 @@ // Package: SiPixelPhase1MonitorVertexSoA // Class: SiPixelPhase1MonitorVertexSoA // -/**\class SiPixelPhase1MonitorVertexSoA SiPixelPhase1MonitorVertexSoA.cc +/**\class SiPixelPhase1MonitorVertexSoA SiPixelPhase1MonitorVertexSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -81,11 +81,13 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: dxdz = bs.dxdz(); dydz = bs.dydz(); } + for (int iv = 0; iv < nVertices; iv++) { auto si = vsoa.sortInd[iv]; auto z = vsoa.zv[si]; auto x = x0 + dxdz * z; auto y = y0 + dydz * z; + z += z0; hx->Fill(x); hy->Fill(y); diff --git a/Geometry/CommonTopologies/interface/SimplePixelTopology.h b/Geometry/CommonTopologies/interface/SimplePixelTopology.h index d91d9b40e89ce..c991d09666297 100644 --- a/Geometry/CommonTopologies/interface/SimplePixelTopology.h +++ b/Geometry/CommonTopologies/interface/SimplePixelTopology.h @@ -4,104 +4,48 @@ #include #include #include +#include "FWCore/Utilities/interface/HostDeviceConstant.h" namespace pixelTopology { - template - constexpr auto map_to_array_helper(Function f, std::index_sequence) - -> std::array, sizeof...(Indices)> { - return {{f(Indices)...}}; - } - - template - constexpr 
auto map_to_array(Function f) -> std::array, N> { - return map_to_array_helper(f, std::make_index_sequence{}); - } constexpr auto maxNumberOfLadders = 160; constexpr uint32_t maxLayers = 28; - struct AverageGeometry { + template + struct AverageGeometryT { // - float ladderZ[maxNumberOfLadders]; - float ladderX[maxNumberOfLadders]; - float ladderY[maxNumberOfLadders]; - float ladderR[maxNumberOfLadders]; - float ladderMinZ[maxNumberOfLadders]; - float ladderMaxZ[maxNumberOfLadders]; + float ladderZ[TrackerTraits::numberOfLaddersInBarrel]; + float ladderX[TrackerTraits::numberOfLaddersInBarrel]; + float ladderY[TrackerTraits::numberOfLaddersInBarrel]; + float ladderR[TrackerTraits::numberOfLaddersInBarrel]; + float ladderMinZ[TrackerTraits::numberOfLaddersInBarrel]; + float ladderMaxZ[TrackerTraits::numberOfLaddersInBarrel]; float endCapZ[2]; // just for pos and neg Layer1 }; - constexpr inline uint16_t localY(uint16_t py, uint16_t n) { - auto roc = py / n; - auto shift = 2 * roc; - auto yInRoc = py - n * roc; - if (yInRoc > 0) - shift += 1; - return py + shift; - } - -} // namespace pixelTopology - -namespace phase1PixelTopology { - - constexpr uint16_t numberOfModulesInBarrel = 1184; - constexpr uint16_t numberOfModulesInLadder = 8; - constexpr uint16_t numberOfLaddersInBarrel = numberOfModulesInBarrel / numberOfModulesInLadder; - - constexpr uint16_t numRowsInRoc = 80; - constexpr uint16_t numColsInRoc = 52; - constexpr uint16_t lastRowInRoc = numRowsInRoc - 1; - constexpr uint16_t lastColInRoc = numColsInRoc - 1; + constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); + constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); + constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); + constexpr int16_t phi0p09 = 900; - constexpr uint16_t numRowsInModule = 2 * numRowsInRoc; - constexpr uint16_t numColsInModule = 8 * numColsInRoc; - constexpr uint16_t lastRowInModule = numRowsInModule - 1; - constexpr uint16_t lastColInModule = numColsInModule - 1; - - constexpr int16_t xOffset = -81; - constexpr int16_t yOffset = -54 * 4; - - constexpr uint16_t pixelThickness = 285; - constexpr uint16_t pixelPitchX = 100; - constexpr uint16_t pixelPitchY = 150; - - constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule); + template + constexpr auto map_to_array_helper(Function f, std::index_sequence) + -> std::array, sizeof...(Indices)> { + return {{f(Indices)...}}; + } - constexpr uint32_t numberOfModules = 1856; - constexpr uint32_t numberOfLayers = 10; -#ifdef __CUDA_ARCH__ - __device__ -#endif - constexpr uint32_t layerStart[numberOfLayers + 1] = {0, - 96, - 320, - 672, // barrel - 1184, - 1296, - 1408, // positive endcap - 1520, - 1632, - 1744, // negative endcap - numberOfModules}; - constexpr char const* layerName[numberOfLayers] = { - "BL1", - "BL2", - "BL3", - "BL4", // barrel - "E+1", - "E+2", - "E+3", // positive endcap - "E-1", - "E-2", - "E-3" // negative endcap - }; + template + constexpr auto map_to_array(Function f) -> std::array, N> { + return map_to_array_helper(f, std::make_index_sequence{}); + } + template constexpr uint16_t findMaxModuleStride() { bool go = true; int n = 2; while (go) { - for (uint8_t i = 1; i < std::size(layerStart); ++i) { - if (layerStart[i] % n != 0) { + for (uint8_t i = 1; i < TrackerTraits::numberOfLayers + 1; ++i) { + if (TrackerTraits::layerStart[i] % n != 0) { go = false; break; } @@ -113,48 +57,62 @@ namespace phase1PixelTopology { return n / 2; } - constexpr 
uint16_t maxModuleStride = findMaxModuleStride(); + template + constexpr uint16_t maxModuleStride = findMaxModuleStride(); + template constexpr uint8_t findLayer(uint32_t detId, uint8_t sl = 0) { - for (uint8_t i = sl; i < std::size(layerStart); ++i) - if (detId < layerStart[i + 1]) + for (uint8_t i = sl; i < TrackerTraits::numberOfLayers + 1; ++i) + if (detId < TrackerTraits::layerStart[i + 1]) return i; - return std::size(layerStart); + return TrackerTraits::numberOfLayers + 1; } + template constexpr uint8_t findLayerFromCompact(uint32_t detId) { - detId *= maxModuleStride; - for (uint8_t i = 0; i < std::size(layerStart); ++i) - if (detId < layerStart[i + 1]) + detId *= maxModuleStride; + for (uint8_t i = 0; i < TrackerTraits::numberOfLayers + 1; ++i) + if (detId < TrackerTraits::layerStart[i + 1]) return i; - return std::size(layerStart); + return TrackerTraits::numberOfLayers + 1; } - constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; + template + constexpr uint32_t layerIndexSize = TrackerTraits::numberOfModules / maxModuleStride; + + template #ifdef __CUDA_ARCH__ __device__ #endif - constexpr std::array - layer = pixelTopology::map_to_array(findLayerFromCompact); + constexpr std::array> + layer = map_to_array>(findLayerFromCompact); + template constexpr uint8_t getLayer(uint32_t detId) { - return phase1PixelTopology::layer[detId / phase1PixelTopology::maxModuleStride]; + return layer[detId / maxModuleStride]; } + template constexpr bool validateLayerIndex() { bool res = true; - for (auto i = 0U; i < numberOfModules; ++i) { - auto j = i / maxModuleStride; - res &= (layer[j] < numberOfLayers); - res &= (i >= layerStart[layer[j]]); - res &= (i < layerStart[layer[j] + 1]); + for (auto i = 0U; i < TrackerTraits::numberOfModules; ++i) { + auto j = i / maxModuleStride; + res &= (layer[j] < TrackerTraits::numberOfLayers); + res &= (i >= TrackerTraits::layerStart[layer[j]]); + res &= (i < TrackerTraits::layerStart[layer[j] + 1]); } return res; } - static_assert(validateLayerIndex(), "layer from detIndex algo is buggy"); + template +#ifdef __CUDA_ARCH__ + __device__ +#endif + constexpr inline uint32_t + layerStart(uint32_t i) { + return TrackerTraits::layerStart[i]; + } - // this is for the ROC n<512 (upgrade 1024) constexpr inline uint16_t divu52(uint16_t n) { n = n >> 2; uint16_t q = (n >> 1) + (n >> 4); @@ -163,128 +121,397 @@ namespace phase1PixelTopology { uint16_t r = n - q * 13; return q + ((r + 3) >> 4); } +} // namespace pixelTopology - constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); } +namespace phase1PixelTopology { - constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); } + using pixelTopology::phi0p05; + using pixelTopology::phi0p06; + using pixelTopology::phi0p07; + + constexpr uint32_t numberOfLayers = 28; + constexpr int nPairs = 13 + 2 + 4; + constexpr uint16_t numberOfModules = 1856; + + constexpr uint32_t max_ladder_bpx0 = 12; + constexpr uint32_t first_ladder_bpx0 = 0; + constexpr float module_length_bpx0 = 6.7f; + constexpr float module_tolerance_bpx0 = 0.4f; // projection to cylinder is inaccurate on BPIX1 + constexpr uint32_t max_ladder_bpx4 = 64; + constexpr uint32_t first_ladder_bpx4 = 84; + constexpr float radius_even_ladder = 15.815f; + constexpr float radius_odd_ladder = 16.146f; + constexpr float module_length_bpx4 = 6.7f; + constexpr float module_tolerance_bpx4 = 0.2f; + constexpr float barrel_z_length = 26.f; + constexpr float forward_z_begin = 32.f; + + 
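// --- illustrative sketch, not part of the diff -------------------------------
// The map_to_array trick used above builds the compact detId -> layer lookup
// table entirely at compile time via std::index_sequence. Toy version with an
// invented 3-layer, 12-module geometry (names prefixed "toy" are not real):
#include <array>
#include <cstdint>
#include <utility>

constexpr uint32_t toyLayerStart[4] = {0, 4, 8, 12};

constexpr uint8_t toyFindLayer(uint32_t compactId) {
  for (uint8_t i = 0; i < 3; ++i)
    if (compactId < toyLayerStart[i + 1])
      return i;
  return 3;
}

template <typename F, std::size_t... Is>
constexpr auto toyHelper(F f, std::index_sequence<Is...>) -> std::array<uint8_t, sizeof...(Is)> {
  return {{f(Is)...}};  // one call per index, all evaluated by the compiler
}

template <std::size_t N, typename F>
constexpr auto toyMapToArray(F f) {
  return toyHelper(f, std::make_index_sequence<N>{});
}

constexpr auto toyLayer = toyMapToArray<12>(toyFindLayer);
static_assert(toyLayer[5] == 1, "module 5 sits in layer 1: table is fully constexpr");
// -----------------------------------------------------------------------------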
HOST_DEVICE_CONSTANT uint8_t layerPairs[2 * nPairs] = { + 0, 1, 0, 4, 0, 7, // BPIX1 (3) + 1, 2, 1, 4, 1, 7, // BPIX2 (6) + 4, 5, 7, 8, // FPIX1 (8) + 2, 3, 2, 4, 2, 7, 5, 6, 8, 9, // BPIX3 & FPIX2 (13) + 0, 2, 1, 3, // Jumping Barrel (15) + 0, 5, 0, 8, // Jumping Forward (BPIX1,FPIX2) + 4, 6, 7, 9 // Jumping Forward (19) + }; - constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; } + HOST_DEVICE_CONSTANT int16_t phicuts[nPairs]{phi0p05, + phi0p07, + phi0p07, + phi0p05, + phi0p06, + phi0p06, + phi0p05, + phi0p05, + phi0p06, + phi0p06, + phi0p06, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05}; + HOST_DEVICE_CONSTANT float minz[nPairs] = { + -20., 0., -30., -22., 10., -30., -70., -70., -22., 15., -30, -70., -70., -20., -22., 0, -30., -70., -70.}; + HOST_DEVICE_CONSTANT float maxz[nPairs] = { + 20., 30., 0., 22., 30., -10., 70., 70., 22., 30., -15., 70., 70., 20., 22., 30., 0., 70., 70.}; + HOST_DEVICE_CONSTANT float maxr[nPairs] = { + 20., 9., 9., 20., 7., 7., 5., 5., 20., 6., 6., 5., 5., 20., 20., 9., 9., 9., 9.}; + + static constexpr uint32_t layerStart[numberOfLayers + 1] = {0, + 96, + 320, + 672, // barrel + 1184, + 1296, + 1408, // positive endcap + 1520, + 1632, + 1744, // negative endcap + numberOfModules}; +} // namespace phase1PixelTopology - constexpr inline uint16_t toRocY(uint16_t py) { - auto roc = divu52(py); - return py - 52 * roc; - } +namespace phase2PixelTopology { - constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); } + using pixelTopology::phi0p05; + using pixelTopology::phi0p06; + using pixelTopology::phi0p07; + using pixelTopology::phi0p09; - constexpr inline bool isBigPixY(uint16_t py) { - auto ly = toRocY(py); - return (ly == 0) | (ly == lastColInRoc); - } + constexpr uint32_t numberOfLayers = 28; + constexpr int nPairs = 23 + 6 + 14 + 8 + 4; // include far forward layer pairs + constexpr uint16_t numberOfModules = 3892; - constexpr inline uint16_t localX(uint16_t px) { - auto shift = 0; - if (px > lastRowInRoc) - shift += 1; - if (px > numRowsInRoc) - shift += 1; - return px + shift; - } + HOST_DEVICE_CONSTANT uint8_t layerPairs[2 * nPairs] = { - constexpr inline uint16_t localY(uint16_t py) { - auto roc = divu52(py); - auto shift = 2 * roc; - auto yInRoc = py - 52 * roc; - if (yInRoc > 0) - shift += 1; - return py + shift; - } + 0, 1, 0, 4, 0, 16, //BPIX1 (3) + 1, 2, 1, 4, 1, 16, //BPIX2 (6) + 2, 3, 2, 4, 2, 16, //BPIX3 & Forward (9) -} // namespace phase1PixelTopology + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, //POS (16) + 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, //NEG (23) -namespace phase2PixelTopology { + 0, 2, 0, 5, 0, 17, 0, 6, 0, 18, // BPIX1 Jump (28) + 1, 3, 1, 5, 1, 17, 1, 6, 1, 18, // BPIX2 Jump (33) - constexpr uint32_t numberOfModulesInBarrel = 756; - constexpr uint32_t numberOfModulesInLadder = 9; - constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / numberOfModulesInLadder; - - constexpr uint32_t numberOfModules = 3892; - constexpr uint8_t numberOfLayers = 28; - - constexpr uint32_t layerStart[numberOfLayers + 1] = {0, - 108, - 324, - 504, //Barrel - 756, - 864, - 972, - 1080, - 1188, - 1296, - 1404, - 1512, - 1620, - 1796, - 1972, - 2148, //Fp - 2324, - 2432, - 2540, - 2648, - 2756, - 2864, - 2972, - 3080, - 3188, - 3364, - 3540, - 3716, //Np - numberOfModules}; + 11, 12, 12, 13, 13, 14, 14, 15, //Late POS (37) + 23, 24, 24, 25, 25, 26, 26, 27, //Late NEG (41) - constexpr uint16_t 
findMaxModuleStride() { - bool go = true; - int n = 2; - while (go) { - for (uint8_t i = 1; i < numberOfLayers + 1; ++i) { - if (layerStart[i] % n != 0) { - go = false; - break; - } - } - if (!go) - break; - n *= 2; - } - return n / 2; - } + 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, //POS Jump (48) + 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, //NEG Jump (55) + }; + HOST_DEVICE_CONSTANT uint32_t layerStart[numberOfLayers + 1] = {0, + 108, + 324, + 504, //Barrel + 756, + 864, + 972, + 1080, + 1188, + 1296, + 1404, + 1512, + 1620, + 1796, + 1972, + 2148, //Fp + 2324, + 2432, + 2540, + 2648, + 2756, + 2864, + 2972, + 3080, + 3188, + 3364, + 3540, + 3716, //Np + numberOfModules}; + + HOST_DEVICE_CONSTANT int16_t phicuts[nPairs]{ + phi0p05, phi0p05, phi0p05, phi0p06, phi0p07, phi0p07, phi0p06, phi0p07, phi0p07, phi0p05, phi0p05, + phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, + phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p07, phi0p07, phi0p07, phi0p07, + phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, + phi0p07, phi0p07, phi0p07, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05}; + + HOST_DEVICE_CONSTANT float minz[nPairs] = { + -16.0, 4.0, -22.0, -17.0, 6.0, -22.0, -18.0, 11.0, -22.0, 23.0, 30.0, 39.0, 50.0, 65.0, + 82.0, 109.0, -28.0, -35.0, -44.0, -55.0, -70.0, -87.0, -113.0, -16., 7.0, -22.0, 11.0, -22.0, + -17.0, 9.0, -22.0, 13.0, -22.0, 137.0, 173.0, 199.0, 229.0, -142.0, -177.0, -203.0, -233.0, 23.0, + 30.0, 39.0, 50.0, 65.0, 82.0, 109.0, -28.0, -35.0, -44.0, -55.0, -70.0, -87.0, -113.0}; + + HOST_DEVICE_CONSTANT float maxz[nPairs] = { + + 17.0, 22.0, -4.0, 17.0, 22.0, -6.0, 18.0, 22.0, -11.0, 28.0, 35.0, 44.0, 55.0, 70.0, + 87.0, 113.0, -23.0, -30.0, -39.0, -50.0, -65.0, -82.0, -109.0, 17.0, 22.0, -7.0, 22.0, -10.0, + 17.0, 22.0, -9.0, 22.0, -13.0, 142.0, 177.0, 203.0, 233.0, -137.0, -173.0, -199.0, -229.0, 28.0, + 35.0, 44.0, 55.0, 70.0, 87.0, 113.0, -23.0, -30.0, -39.0, -50.0, -65.0, -82.0, -109.0}; + + HOST_DEVICE_CONSTANT float maxr[nPairs] = {5.0, 5.0, 5.0, 7.0, 8.0, 8.0, 7.0, 7.0, 7.0, 6.0, 6.0, 6.0, 6.0, 5.0, + 6.0, 5.0, 6.0, 6.0, 6.0, 6.0, 5.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 8.0, 8.0, 8.0, 8.0, 6.0, 5.0, 5.0, 5.0, 6.0, 5.0, 5.0, 5.0, 9.0, + 9.0, 9.0, 8.0, 8.0, 8.0, 11.0, 9.0, 9.0, 9.0, 8.0, 8.0, 8.0, 11.0}; +} // namespace phase2PixelTopology + +namespace pixelTopology { - constexpr uint16_t maxModuleStride = findMaxModuleStride(); + struct Phase2 { + // types + using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct + using tindex_type = uint32_t; // for tuples + using cindex_type = uint32_t; // for cells - constexpr uint8_t findLayerFromCompact(uint32_t detId) { - detId *= maxModuleStride; - for (uint8_t i = 0; i < numberOfLayers + 1; ++i) - if (detId < layerStart[i + 1]) - return i; - return numberOfLayers + 1; - } + static constexpr uint32_t maxCellNeighbors = 64; + static constexpr uint32_t maxCellTracks = 302; + static constexpr uint32_t maxHitsOnTrack = 15; + static constexpr uint32_t maxHitsOnTrackForFullFit = 6; + static constexpr uint32_t avgHitsPerTrack = 9; + static constexpr uint32_t maxCellsPerHit = 256; + static constexpr uint32_t avgTracksPerHit = 10; + static constexpr uint32_t maxNumberOfTuples = 256 * 1024; + static constexpr uint32_t maxHitsForContainers = avgHitsPerTrack * maxNumberOfTuples; + static constexpr uint32_t maxNumberOfDoublets = 5 * 512 * 1024; + 
static constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; + static constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; + static constexpr uint32_t maxDepth = 12; + static constexpr uint32_t numberOfLayers = 28; - constexpr uint16_t layerIndexSize = numberOfModules / maxModuleStride; - constexpr std::array layer = - pixelTopology::map_to_array(findLayerFromCompact); + static constexpr uint32_t maxSizeCluster = 2047; - constexpr bool validateLayerIndex() { - bool res = true; - for (auto i = 0U; i < numberOfModules; ++i) { - auto j = i / maxModuleStride; - res &= (layer[j] < numberOfLayers); - res &= (i >= layerStart[layer[j]]); - res &= (i < layerStart[layer[j] + 1]); + static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y + static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; + + static constexpr uint32_t last_bpix1_detIndex = 108; + static constexpr uint32_t last_bpix2_detIndex = 324; + static constexpr uint32_t last_barrel_detIndex = 504; + + static constexpr uint32_t maxPixInModule = 6000; + + static constexpr float moduleLength = 4.345f; + static constexpr float endcapCorrection = 0.0f; + + static constexpr float xerr_barrel_l1_def = 0.00035f; + static constexpr float yerr_barrel_l1_def = 0.00125f; + static constexpr float xerr_barrel_ln_def = 0.00035f; + static constexpr float yerr_barrel_ln_def = 0.00125f; + static constexpr float xerr_endcap_def = 0.00060f; + static constexpr float yerr_endcap_def = 0.00180f; + + static constexpr float bigPixXCorrection = 0.0f; + static constexpr float bigPixYCorrection = 0.0f; + + static constexpr float dzdrFact = 8 * 0.0285 / 0.015; // from dz/dr to "DY" + static constexpr float z0Cut = 7.5f; + static constexpr float doubletHardPt = 0.8f; + + static constexpr int minYsizeB1 = 25; + static constexpr int minYsizeB2 = 15; + + static constexpr int nPairsMinimal = 33; + static constexpr int nPairsFarForwards = nPairsMinimal + 8; // include barrel "jumping" layer pairs + static constexpr int nPairs = phase2PixelTopology::nPairs; // include far forward layer pairs + + static constexpr int maxDYsize12 = 12; + static constexpr int maxDYsize = 10; + static constexpr int maxDYPred = 20; + + static constexpr uint16_t numberOfModules = 3892; + + static constexpr uint16_t clusterBinning = 1024; + static constexpr uint16_t clusterBits = 10; + + static constexpr uint16_t numberOfModulesInBarrel = 756; + static constexpr uint16_t numberOfModulesInLadder = 9; + static constexpr uint16_t numberOfLaddersInBarrel = numberOfModulesInBarrel / numberOfModulesInLadder; + + static constexpr uint16_t firstEndcapPos = 4; + static constexpr uint16_t firstEndcapNeg = 16; + + static constexpr int16_t xOffset = -1e4; //not used actually, to suppress static analyzer warnings + + static constexpr char const *nameModifier = "Phase2"; + + static constexpr uint32_t const *layerStart = phase2PixelTopology::layerStart; + static constexpr float const *minz = phase2PixelTopology::minz; + static constexpr float const *maxz = phase2PixelTopology::maxz; + static constexpr float const *maxr = phase2PixelTopology::maxr; + + static constexpr uint8_t const *layerPairs = phase2PixelTopology::layerPairs; + static constexpr int16_t const *phicuts = phase2PixelTopology::phicuts; + + static constexpr inline bool isBigPixX(uint16_t px) { return false; } + static constexpr inline bool isBigPixY(uint16_t py) { return false; } + + static constexpr inline uint16_t localX(uint16_t px) { return px; } + static constexpr inline uint16_t 
localY(uint16_t py) { return py; } + }; + + struct Phase1 { + // types + using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct + using tindex_type = uint16_t; // for tuples + using cindex_type = uint32_t; // for cells + + static constexpr uint32_t maxCellNeighbors = 36; + static constexpr uint32_t maxCellTracks = 48; + static constexpr uint32_t maxHitsOnTrack = 10; + static constexpr uint32_t maxHitsOnTrackForFullFit = 6; + static constexpr uint32_t avgHitsPerTrack = 4; + static constexpr uint32_t maxCellsPerHit = 256; + static constexpr uint32_t avgTracksPerHit = 6; + static constexpr uint32_t maxNumberOfTuples = 32 * 1024; + static constexpr uint32_t maxHitsForContainers = avgHitsPerTrack * maxNumberOfTuples; + static constexpr uint32_t maxNumberOfDoublets = 512 * 1024; + static constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; + static constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; + static constexpr uint32_t maxDepth = 6; + static constexpr uint32_t numberOfLayers = 10; + + static constexpr uint32_t maxSizeCluster = 1023; + + static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y + static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; + + static constexpr uint32_t last_bpix1_detIndex = 96; + static constexpr uint32_t last_bpix2_detIndex = 320; + static constexpr uint32_t last_barrel_detIndex = 1184; + + static constexpr uint32_t maxPixInModule = 6000; + + static constexpr float moduleLength = 6.7f; + static constexpr float endcapCorrection = 1.5f; + + static constexpr float xerr_barrel_l1_def = 0.00200f; + static constexpr float yerr_barrel_l1_def = 0.00210f; + static constexpr float xerr_barrel_ln_def = 0.00200f; + static constexpr float yerr_barrel_ln_def = 0.00210f; + static constexpr float xerr_endcap_def = 0.0020f; + static constexpr float yerr_endcap_def = 0.00210f; + + static constexpr float bigPixXCorrection = 1.0f; + static constexpr float bigPixYCorrection = 8.0f; + + static constexpr float dzdrFact = 8 * 0.0285 / 0.015; // from dz/dr to "DY" + static constexpr float z0Cut = 12.f; + static constexpr float doubletHardPt = 0.5f; + + static constexpr int minYsizeB1 = 36; + static constexpr int minYsizeB2 = 28; + + static constexpr int nPairsForQuadruplets = 13; // quadruplets require hits in all layers + static constexpr int nPairsForTriplets = nPairsForQuadruplets + 2; // include barrel "jumping" layer pairs + static constexpr int nPairs = nPairsForTriplets + 4; // include forward "jumping" layer pairs + + static constexpr int maxDYsize12 = 28; + static constexpr int maxDYsize = 20; + static constexpr int maxDYPred = 20; + + static constexpr uint16_t numberOfModules = 1856; + + static constexpr uint16_t numRowsInRoc = 80; + static constexpr uint16_t numColsInRoc = 52; + static constexpr uint16_t lastRowInRoc = numRowsInRoc - 1; + static constexpr uint16_t lastColInRoc = numColsInRoc - 1; + + static constexpr uint16_t numRowsInModule = 2 * numRowsInRoc; + static constexpr uint16_t numColsInModule = 8 * numColsInRoc; + static constexpr uint16_t lastRowInModule = numRowsInModule - 1; + static constexpr uint16_t lastColInModule = numColsInModule - 1; + + static constexpr uint16_t clusterBinning = numColsInModule + 2; + static constexpr uint16_t clusterBits = 9; + + static constexpr uint16_t numberOfModulesInBarrel = 1184; + static constexpr uint16_t numberOfModulesInLadder = 8; + static constexpr uint16_t numberOfLaddersInBarrel = numberOfModulesInBarrel / numberOfModulesInLadder; 
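Because the two topologies are plain structs of static constexpr members, a variant only has to override the constants that differ, as the commented-out HIonPhase1 sketch near the end of this namespace hints. A minimal illustration (hypothetical, not part of the patch; the value mirrors that commented-out line):

#include <cstdint>

// The shadowing constant wins for templated code instantiated with the derived
// type; every other constant and helper is inherited from Phase1 unchanged.
struct HIonPhase1 : public pixelTopology::Phase1 {
  static constexpr uint32_t maxNumberOfDoublets = 3 * 1024 * 1024;  // larger doublet budget for the much busier heavy-ion events
};

A kernel templated on TrackerTraits then picks up the larger budget simply by being instantiated with HIonPhase1 instead of Phase1.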
+
+    static constexpr uint16_t firstEndcapPos = 4;
+    static constexpr uint16_t firstEndcapNeg = 7;
+
+    static constexpr int16_t xOffset = -81;
+
+    static constexpr char const *nameModifier = "";
+
+    static constexpr uint32_t const *layerStart = phase1PixelTopology::layerStart;
+    static constexpr float const *minz = phase1PixelTopology::minz;
+    static constexpr float const *maxz = phase1PixelTopology::maxz;
+    static constexpr float const *maxr = phase1PixelTopology::maxr;
+
+    static constexpr uint8_t const *layerPairs = phase1PixelTopology::layerPairs;
+    static constexpr int16_t const *phicuts = phase1PixelTopology::phicuts;
+
+    static constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); }
+
+    static constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); }
+
+    static constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; }
+
+    static constexpr inline uint16_t toRocY(uint16_t py) {
+      auto roc = divu52(py);
+      return py - 52 * roc;
    }
-      res &= (layer[j] < numberOfLayers);
-      res &= (i >= layerStart[layer[j]]);
-      res &= (i < layerStart[layer[j] + 1]);
+    static constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); }
+    static constexpr inline bool isBigPixY(uint16_t py) {
+      auto ly = toRocY(py);
+      return (ly == 0) | (ly == lastColInRoc);
    }
-}  // namespace phase2PixelTopology
+    static constexpr inline uint16_t localX(uint16_t px) {
+      auto shift = 0;
+      if (px > lastRowInRoc)
+        shift += 1;
+      if (px > numRowsInRoc)
+        shift += 1;
+      return px + shift;
+    }
+
+    static constexpr inline uint16_t localY(uint16_t py) {
+      auto roc = divu52(py);
+      auto shift = 2 * roc;
+      auto yInRoc = py - 52 * roc;
+      if (yInRoc > 0)
+        shift += 1;
+      return py + shift;
+    }
+  };
+
+  template <typename T>
+  using isPhase1Topology = typename std::enable_if<std::is_base_of<Phase1, T>::value>::type;
+
+  template <typename T>
+  using isPhase2Topology = typename std::enable_if<std::is_base_of<Phase2, T>::value>::type;
+
+  // struct HIonPhase1 : public Phase1 {
+  //   static constexpr uint32_t maxNumberOfDoublets = 3 * 1024 * 1024;};
+
}  // namespace pixelTopology

#endif  // Geometry_CommonTopologies_SimplePixelTopology_h
diff --git a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu
index ed906de004bcf..cfb6784a6c1fb 100644
--- a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu
+++ b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu
@@ -10,6 +10,7 @@
 namespace {
   // original code from CMSSW_4_4
+  using namespace pixelTopology;
   std::tuple localXori(int mpx) {
     const float m_pitchx = 1.f;
@@ -127,14 +128,14 @@
}  // namespace
constexpr void testLayer() {
-  for (auto i = 0U; i < phase1PixelTopology::numberOfModules; ++i) {
-    uint32_t layer = phase1PixelTopology::getLayer(i);
-    uint32_t tLayer = phase1PixelTopology::findLayer(i);
+  for (auto i = 0U; i < Phase1::numberOfModules; ++i) {
+    uint32_t layer = getLayer<Phase1>(i);
+    uint32_t tLayer = findLayer<Phase1>(i);
     assert(tLayer == layer);
-    //std::cout << "module " << i << ": " << "layer " << layer << ", \"" << phase1PixelTopology::layerName[layer] << "\", [" << phase1PixelTopology::layerStart[layer] << ", " << phase1PixelTopology::layerStart[layer+1] << ")" << std::endl;
-    assert(layer < phase1PixelTopology::numberOfLayers);
-    assert(i >= phase1PixelTopology::layerStart[layer]);
-    assert(i < phase1PixelTopology::layerStart[layer + 1]);
+
+    assert(layer < Phase1::numberOfLayers);
+    assert(i >= Phase1::layerStart[layer]);
+    assert(i < Phase1::layerStart[layer + 1]);
   }
}
@@ -145,8 +146,8 @@ int main() {
   for (uint16_t ix = 0; ix < 80 * 2; ++ix) {
     auto ori = localXori(ix);
-    auto xl = phase1PixelTopology::localX(ix);
-    auto bp = phase1PixelTopology::isBigPixX(ix);
+    auto xl = Phase1::localX(ix);
+    auto bp = Phase1::isBigPixX(ix);
     if (std::get<0>(ori) != xl)
       std::cout << "Error " << std::get<0>(ori) << "!=" << xl << std::endl;
     assert(std::get<1>(ori) == bp);
@@ -154,21 +155,20 @@ int main() {
   for (uint16_t iy = 0; iy < 52 * 8; ++iy) {
     auto ori = localYori(iy);
-    auto yl = phase1PixelTopology::localY(iy);
-    auto bp = phase1PixelTopology::isBigPixY(iy);
+    auto yl = Phase1::localY(iy);
+    auto bp = Phase1::isBigPixY(iy);
     if (std::get<0>(ori) != yl)
       std::cout << "Error " << std::get<0>(ori) << "!=" << yl << std::endl;
     assert(std::get<1>(ori) == bp);
   }
-  for (auto i = 0U; i < phase1PixelTopology::numberOfLayers; ++i) {
-    std::cout << "layer " << i << ", \"" << phase1PixelTopology::layerName[i] << "\", ["
-              << phase1PixelTopology::layerStart[i] << ", " << phase1PixelTopology::layerStart[i + 1] << ") "
-              << phase1PixelTopology::layerStart[i + 1] - phase1PixelTopology::layerStart[i] << std::endl;
+  for (auto i = 0U; i < Phase1::numberOfLayers; ++i) {
+    std::cout << "layer " << i << ", [" << Phase1::layerStart[i] << ", " << Phase1::layerStart[i + 1] << ") "
+              << Phase1::layerStart[i + 1] - Phase1::layerStart[i] << std::endl;
   }
-  std::cout << "maxModuleStide layerIndexSize " << phase1PixelTopology::maxModuleStride << ' '
-            << phase1PixelTopology::layerIndexSize << std::endl;
+  std::cout << "maxModuleStride layerIndexSize " << maxModuleStride<Phase1> << ' '
+            << layerIndexSize<Phase1> << std::endl;
   testLayer();
diff --git a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py
index 53b007a50b775..b778daa63677f 100644
--- a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py
+++ b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py
@@ -210,6 +210,34 @@ def customiseForOffline(process):
     return process

+# Customization for the TrackerTraits templates, enabling Phase2 in the Inner Tracker reconstruction (#38761)
+def customizeHLTfor38761(process):
+
+    for producer in producers_by_type(process, "SiPixelRecHitSoAFromLegacy"):
+        if hasattr(producer, "isPhase2"):
+            delattr(producer, "isPhase2")
+    for producer in producers_by_type(process, "SiPixelDigisClustersFromSoA"):
+        if hasattr(producer, "isPhase2"):
+            delattr(producer, "isPhase2")
+
+    if 'hltSiPixelRecHitsSoA' in process.__dict__:
+        process.hltSiPixelRecHitsSoA.cpu = cms.EDAlias(hltSiPixelRecHitsFromLegacy = cms.VPSet(
+            cms.PSet(
+                type = cms.string('pixelTopologyPhase1TrackingRecHit2DCPUT')
+            ),
+            cms.PSet(
+                type = cms.string('uintAsHostProduct')
+            )))
+
+    for producer in esproducers_by_type(process, "PixelCPEFastESProducer"):
+        if hasattr(producer, "isPhase2"):
+            delattr(producer, "isPhase2")
+    for producer in esproducers_by_type(process, "PixelCPEGenericESProducer"):
+        if hasattr(producer, "Upgrade"):
+            setattr(producer, "isPhase2", getattr(producer, "Upgrade"))
+            delattr(producer, "Upgrade")
+
+    return process

# CMSSW version specific customizations
def customizeHLTforCMSSW(process, menuType="GRun"):
@@ -218,5 +246,9 @@ def customizeHLTforCMSSW(process, menuType="GRun"):

     # add call to action function in proper order: newest last!
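Stepping back to the localX/localY checks in phase1PixelTopology_t.cu above: the mapping inserts gaps for the double-width pixels at the ROC borders, which is what the localXori/localYori reference code from CMSSW_4_4 recomputes. A standalone restatement with a few spot checks (assumed to mirror Phase1::localY; divu52 in the header is just a shift-and-add division by 52):

#include <cstdint>

// Standalone restatement of the column mapping (assumption: matches
// Phase1::localY above), used only for these spot checks.
constexpr uint16_t localYref(uint16_t py) {
  uint16_t roc = py / 52;    // which ROC this column belongs to
  uint16_t shift = 2 * roc;  // two extra slots per crossed ROC boundary...
  if (py - 52 * roc > 0)
    ++shift;                 // ...plus one once past the first column inside the ROC
  return py + shift;
}

static_assert(localYref(0) == 0);
static_assert(localYref(51) == 52);    // last column of ROC 0 is shifted once
static_assert(localYref(52) == 54);    // first column of ROC 1 skips the two big-pixel slots
static_assert(localYref(415) == 430);  // last column of the module (8 ROCs of 52 columns)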
# process = customiseFor12718(process) + + process = customizeHLTfor38761(process) + + return process diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.cc index 10cd09502cf9a..363c4b7635b70 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.cc @@ -294,7 +294,7 @@ void PixelThresholdClusterizer::copy_to_buffer(DigiIterator begin, DigiIterator if (adc < 100) adc = 100; // put all negative pixel charges into the 100 elec bin - /* This is semi-random good number. The exact number (in place of 100) is irrelevant from the point + /* This is semi-random good number. The exact number (in place of 100) is irrelevant from the point of view of the final cluster charge since these are typically >= 20000. */ @@ -444,7 +444,7 @@ SiPixelCluster PixelThresholdClusterizer::make_cluster(const SiPixelCluster::Pix /* this is not possible as dead and noisy pixel cannot make it into a seed... if ( doMissCalibrate && - (theSiPixelGainCalibrationService_->isDead(theDetid,pix.col(),pix.row()) || + (theSiPixelGainCalibrationService_->isDead(theDetid,pix.col(),pix.row()) || theSiPixelGainCalibrationService_->isNoisy(theDetid,pix.col(),pix.row())) ) { std::cout << "IMPOSSIBLE" << std::endl; @@ -489,15 +489,15 @@ SiPixelCluster PixelThresholdClusterizer::make_cluster(const SiPixelCluster::Pix } /* //Commenting out the addition of dead pixels to the cluster until further testing -- dfehling 06/09 - //Check on the bounds of the module; this is to keep the isDead and isNoisy modules from returning errors - else if(r>= 0 && c >= 0 && (r <= (theNumOfRows-1.)) && (c <= (theNumOfCols-1.))){ + //Check on the bounds of the module; this is to keep the isDead and isNoisy modules from returning errors + else if(r>= 0 && c >= 0 && (r <= (theNumOfRows-1.)) && (c <= (theNumOfCols-1.))){ //Check for dead/noisy pixels check that the buffer is not -1 (already considered). Check whether we want to split clusters separated by dead pixels or not. if((theSiPixelGainCalibrationService_->isDead(theDetid,c,r) || theSiPixelGainCalibrationService_->isNoisy(theDetid,c,r)) && theBuffer(r,c) != 1){ - - //If a pixel is dead or noisy, check to see if we want to split the clusters or not. + + //If a pixel is dead or noisy, check to see if we want to split the clusters or not. //Push it into a dead pixel stack in case we want to split the clusters. Otherwise add it to the cluster. //If we are splitting the clusters, we will iterate over the dead pixel stack later. 
- + SiPixelCluster::PixelPos newpix(r,c); if(!doSplitClusters){ @@ -505,10 +505,10 @@ SiPixelCluster PixelThresholdClusterizer::make_cluster(const SiPixelCluster::Pix else if(doSplitClusters){ dead_pixel_stack.push(newpix); dead_flag = true;} - + theBuffer.set_adc(newpix, 1); - } - + } + } */ } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.h b/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.h index 77ac94bb2b7e0..77cc0e6491fcd 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.h @@ -110,8 +110,10 @@ class dso_hidden PixelThresholdClusterizer : public PixelClusterizerBase { const double theElectronPerADCGain; // ADC to electrons conversion - const bool doPhase2Calibration; // The ADC --> electrons calibration is for phase-2 tracker - const bool dropDuplicates; // Enabling dropping duplicate pixels + const bool doPhase2Calibration; // The ADC --> electrons calibration is for phase-2 tracker + + const bool dropDuplicates; // Enabling dropping duplicate pixels + const int thePhase2ReadoutMode; // Readout mode of the phase-2 IT digitizer const double thePhase2DigiBaseline; // Threshold above which digis are measured in the phase-2 IT const int thePhase2KinkADC; // ADC count at which the kink in the dual slop kicks in diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc index d36c345ecf02a..538e0356630a0 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc @@ -21,10 +21,11 @@ #include "PixelClusterizerBase.h" #include "SiPixelClusterThresholds.h" -class SiPixelDigisClustersFromSoA : public edm::global::EDProducer<> { +template +class SiPixelDigisClustersFromSoAT : public edm::global::EDProducer<> { public: - explicit SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig); - ~SiPixelDigisClustersFromSoA() override = default; + explicit SiPixelDigisClustersFromSoAT(const edm::ParameterSet& iConfig); + ~SiPixelDigisClustersFromSoAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -42,38 +43,42 @@ class SiPixelDigisClustersFromSoA : public edm::global::EDProducer<> { const bool produceDigis_; const bool storeDigis_; - const bool isPhase2_; }; -SiPixelDigisClustersFromSoA::SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig) +template +SiPixelDigisClustersFromSoAT::SiPixelDigisClustersFromSoAT(const edm::ParameterSet& iConfig) : topoToken_(esConsumes()), digiGetToken_(consumes(iConfig.getParameter("src"))), clusterPutToken_(produces()), clusterThresholds_{iConfig.getParameter("clusterThreshold_layer1"), iConfig.getParameter("clusterThreshold_otherLayers")}, produceDigis_(iConfig.getParameter("produceDigis")), - storeDigis_(iConfig.getParameter("produceDigis") & iConfig.getParameter("storeDigis")), - isPhase2_(iConfig.getParameter("isPhase2")) { + storeDigis_(iConfig.getParameter("produceDigis") & iConfig.getParameter("storeDigis")) { if (produceDigis_) digiPutToken_ = produces>(); } -void SiPixelDigisClustersFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void SiPixelDigisClustersFromSoAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("src", 
edm::InputTag("siPixelDigisSoA")); desc.add("clusterThreshold_layer1", kSiPixelClusterThresholdsDefaultPhase1.layer1); desc.add("clusterThreshold_otherLayers", kSiPixelClusterThresholdsDefaultPhase1.otherLayers); desc.add("produceDigis", true); desc.add("storeDigis", true); - desc.add("isPhase2", false); + descriptions.addWithDefaultLabel(desc); } -void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +template +void SiPixelDigisClustersFromSoAT::produce(edm::StreamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { const auto& digis = iEvent.get(digiGetToken_); const uint32_t nDigis = digis.size(); const auto& ttopo = iSetup.getData(topoToken_); - auto maxModules = isPhase2_ ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + constexpr auto maxModules = TrackerTraits::numberOfModules; + std::unique_ptr> collection; if (produceDigis_) collection = std::make_unique>(); @@ -117,7 +122,7 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con for (int32_t ic = 0; ic < nclus + 1; ++ic) { auto const& acluster = aclusters[ic]; // in any case we cannot go out of sync with gpu... - if (acluster.charge < clusterThreshold and !isPhase2_) + if (!std::is_base_of::value and acluster.charge < clusterThreshold) edm::LogWarning("SiPixelDigisClustersFromSoA") << "cluster below charge Threshold " << "Layer/DetId/clusId " << layer << '/' << detId << '/' << ic << " size/charge " << acluster.isize << '/' << acluster.charge; @@ -200,4 +205,9 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con iEvent.put(clusterPutToken_, std::move(outputClusters)); } +using SiPixelDigisClustersFromSoA = SiPixelDigisClustersFromSoAT; DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoA); +using SiPixelDigisClustersFromSoAPhase1 = SiPixelDigisClustersFromSoAT; +DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoAPhase1); +using SiPixelDigisClustersFromSoAPhase2 = SiPixelDigisClustersFromSoAT; +DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoAPhase2); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 48dfa98839d36..bc9be260deb20 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -443,15 +443,15 @@ namespace pixelgpudetails { } // end of Raw to Digi kernel - template + template __global__ void fillHitsModuleStart(uint32_t const *__restrict__ clusInModule, uint32_t *__restrict__ moduleStart, uint32_t const *__restrict__ nModules, uint32_t *__restrict__ nModules_Clusters) { - constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; - constexpr int startBPIX2 = isPhase2 ? 
phase2PixelTopology::layerStart[1] : phase1PixelTopology::layerStart[1]; + constexpr int nMaxModules = TrackerTraits::numberOfModules; + constexpr int startBPIX2 = TrackerTraits::layerStart[1]; - assert(nMaxModules < phase2PixelTopology::numberOfModules); + assert(nMaxModules < TrackerTraits::numberOfModules); assert(startBPIX2 < nMaxModules); assert(nMaxModules < 4096); // easy to extend at least till 32*1024 assert(nMaxModules > 1024); @@ -466,7 +466,8 @@ namespace pixelgpudetails { moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), clusInModule[i]); } - __shared__ uint32_t ws[64]; + constexpr bool isPhase2 = std::is_base_of::value; + __shared__ uint32_t ws[32]; cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws); constexpr int lastModules = isPhase2 ? 1024 : nMaxModules - 1024; cms::cuda::blockPrefixScan(moduleStart + 1024 + 1, moduleStart + 1024 + 1, lastModules, ws); @@ -510,10 +511,8 @@ namespace pixelgpudetails { assert(moduleStart[maxH + 1] >= moduleStart[maxH]); assert(moduleStart[nMaxModules] >= moduleStart[maxH + 1]); - constexpr int startFP1 = - isPhase2 ? phase2PixelTopology::numberOfModulesInBarrel : phase1PixelTopology::numberOfModulesInBarrel; - constexpr int startLastFwd = isPhase2 ? phase2PixelTopology::layerStart[phase2PixelTopology::numberOfLayers] - : phase1PixelTopology::layerStart[phase1PixelTopology::numberOfLayers]; + constexpr int startFP1 = TrackerTraits::numberOfModulesInBarrel; + constexpr int startLastFwd = TrackerTraits::layerStart[TrackerTraits::numberOfLayers]; for (int i = first, iend = nMaxModules + 1; i < iend; i += blockDim.x) { if (0 != i) assert(moduleStart[i] >= moduleStart[i - i]); @@ -540,6 +539,7 @@ namespace pixelgpudetails { bool includeErrors, bool debug, cudaStream_t stream) { + using pixelTopology::Phase1; // we're not opting for calling this function in case of early events assert(wordCounter != 0); nDigis = wordCounter; @@ -553,7 +553,7 @@ namespace pixelgpudetails { if (includeErrors) { digiErrors_d = SiPixelDigiErrorsCUDA(wordCounter, std::move(errors), stream); } - clusters_d = SiPixelClustersCUDA(phase1PixelTopology::numberOfModules, stream); + clusters_d = SiPixelClustersCUDA(Phase1::numberOfModules, stream); // Begin Raw2Digi block { @@ -618,8 +618,7 @@ namespace pixelgpudetails { // clusterizer ... 
using namespace gpuClustering; int threadsPerBlock = 256; - int blocks = (std::max(int(wordCounter), int(phase1PixelTopology::numberOfModules)) + threadsPerBlock - 1) / - threadsPerBlock; + int blocks = (std::max(int(wordCounter), int(Phase1::numberOfModules)) + threadsPerBlock - 1) / threadsPerBlock; if (isRun2) gpuCalibPixel::calibDigis<<>>(digis_d.view().moduleInd(), @@ -652,7 +651,7 @@ namespace pixelgpudetails { << " threads\n"; #endif - countModules<<>>( + countModules<<>>( digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -662,29 +661,30 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), - wordCounter); + findClus<<>>(digis_d.view().rawIdArr(), + digis_d.view().moduleInd(), + digis_d.view().xx(), + digis_d.view().yy(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.view().clus(), + wordCounter); + cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaStreamSynchronize(stream)); #endif // apply charge cut - clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), - wordCounter); + clusterChargeCut<<>>(clusterThresholds, + digis_d.view().moduleInd(), + digis_d.view().adc(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.view().clus(), + wordCounter); cudaCheck(cudaGetLastError()); @@ -694,7 +694,7 @@ namespace pixelgpudetails { // synchronization/ExternalWork auto nModules_Clusters_d = cms::cuda::make_device_unique(3, stream); // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>( + fillHitsModuleStart<<<1, 1024, 0, stream>>>( clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); // copy to host @@ -719,6 +719,7 @@ namespace pixelgpudetails { const uint32_t numDigis, cudaStream_t stream) { using namespace gpuClustering; + using pixelTopology::Phase2; nDigis = numDigis; digis_d = SiPixelDigisCUDA(numDigis, stream); @@ -732,7 +733,7 @@ namespace pixelgpudetails { cudaCheck( cudaMemcpyAsync(digis_d.view().rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); - clusters_d = SiPixelClustersCUDA(phase2PixelTopology::numberOfModules, stream); + clusters_d = SiPixelClustersCUDA(Phase2::numberOfModules, stream); nModules_Clusters_h = cms::cuda::make_host_unique(2, stream); @@ -750,13 +751,10 @@ namespace pixelgpudetails { #ifdef GPU_DEBUG cudaCheck(cudaStreamSynchronize(stream)); -#endif - -#ifdef GPU_DEBUG std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - countModules<<>>( + countModules<<>>( digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), numDigis); cudaCheck(cudaGetLastError()); @@ -765,37 +763,49 @@ namespace pixelgpudetails { &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); threadsPerBlock = 256; - blocks = phase2PixelTopology::numberOfModules; - - findClus<<>>(digis_d.view().rawIdArr(), - 
digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), - numDigis); + blocks = Phase2::numberOfModules; + +#ifdef GPU_DEBUG + cudaCheck(cudaStreamSynchronize(stream)); + std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; +#endif + findClus<<>>(digis_d.view().rawIdArr(), + digis_d.view().moduleInd(), + digis_d.view().xx(), + digis_d.view().yy(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.view().clus(), + numDigis); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaStreamSynchronize(stream)); + std::cout << "CUDA clusterChargeCut kernel launch with " << blocks << " blocks of " << threadsPerBlock + << " threads\n"; #endif // apply charge cut - clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), - numDigis); + clusterChargeCut<<>>(clusterThresholds, + digis_d.view().moduleInd(), + digis_d.view().adc(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.view().clus(), + numDigis); cudaCheck(cudaGetLastError()); auto nModules_Clusters_d = cms::cuda::make_device_unique(3, stream); // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>( + +#ifdef GPU_DEBUG + cudaCheck(cudaStreamSynchronize(stream)); + std::cout << "CUDA fillHitsModuleStart kernel launch \n"; +#endif + + fillHitsModuleStart<<<1, 1024, 0, stream>>>( clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); nModules_Clusters_h = cms::cuda::make_host_unique(3, stream); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h index d46fe76f3e81d..75e8389513b68 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h @@ -23,9 +23,9 @@ namespace gpuCalibPixel { constexpr float VCaltoElectronOffset_L1 = -670; // L1: -670 +- 220 constexpr int VCalChargeThreshold = 100; //for phase2 - constexpr float ElectronPerADCGain = 600; + constexpr float ElectronPerADCGain = 1500; constexpr int8_t Phase2ReadoutMode = 3; - constexpr uint16_t Phase2DigiBaseline = 1500; + constexpr uint16_t Phase2DigiBaseline = 1000; constexpr uint8_t Phase2KinkADC = 8; template @@ -104,21 +104,20 @@ namespace gpuCalibPixel { adc_int = int(adc_int * ElectronPerADCGain); else { if (adc_int < Phase2KinkADC) - adc_int = int((adc_int - 0.5) * ElectronPerADCGain); + adc_int = int((adc_int + 0.5) * ElectronPerADCGain); else { constexpr int8_t dspp = (Phase2ReadoutMode < 10 ? Phase2ReadoutMode : 10); constexpr int8_t ds = int8_t(dspp <= 1 ? 
1 : (dspp - 1) * (dspp - 1)); - adc_int -= (Phase2KinkADC - 1); + adc_int -= Phase2KinkADC; adc_int *= ds; - adc_int += (Phase2KinkADC - 1); + adc_int += Phase2KinkADC; adc_int = ((adc_int + 0.5 * ds) * ElectronPerADCGain); } adc_int += int(Phase2DigiBaseline); } - adc[i] = std::min(adc_int, int(std::numeric_limits::max())); } } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h index a7dd8ac3752c2..fced5675e5c29 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -14,7 +14,7 @@ namespace gpuClustering { - template + template __global__ void clusterChargeCut( SiPixelClusterThresholds clusterThresholds, // charge cut on cluster in electrons (for layer 1 and for other layers) @@ -29,9 +29,8 @@ namespace gpuClustering { __shared__ uint8_t ok[maxNumClustersPerModules]; __shared__ uint16_t newclusId[maxNumClustersPerModules]; - constexpr int startBPIX2 = isPhase2 ? phase2PixelTopology::layerStart[1] : phase1PixelTopology::layerStart[1]; - [[maybe_unused]] constexpr int nMaxModules = - isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + constexpr int startBPIX2 = TrackerTraits::layerStart[1]; + [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules; assert(nMaxModules < maxNumModules); assert(startBPIX2 < nMaxModules); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index ed3510e4918f8..675eae8938236 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -10,6 +10,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +//#define GPU_DEBUG + namespace gpuClustering { // Phase-1 pixel modules @@ -65,14 +67,15 @@ namespace gpuClustering { __device__ uint32_t gMaxHit = 0; #endif - template + template __global__ void countModules(uint16_t const* __restrict__ id, uint32_t* __restrict__ moduleStart, int32_t* __restrict__ clusterId, int numElements) { int first = blockDim.x * blockIdx.x + threadIdx.x; - [[maybe_unused]] constexpr int nMaxModules = - isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + + [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules; + assert(nMaxModules < maxNumModules); for (int i = first; i < numElements; i += gridDim.x * blockDim.x) { clusterId[i] = i; @@ -89,7 +92,7 @@ namespace gpuClustering { } } - template + template __global__ void findClus(uint32_t* __restrict__ rawIdArr, uint16_t* __restrict__ id, // module id of each pixel uint16_t const* __restrict__ x, // local coordinates of each pixel @@ -101,6 +104,7 @@ namespace gpuClustering { int numElements) { // status is only used for Phase-1, but it cannot be declared conditionally only if isPhase2 is false; // to minimize the impact on Phase-2 reconstruction it is declared with a very small size. + constexpr bool isPhase2 = std::is_base_of::value; constexpr const uint32_t pixelStatusSize = isPhase2 ? 
1 : pixelStatus::size;
     __shared__ uint32_t status[pixelStatusSize];  // packed words array used to store the PixelStatus of each pixel
     __shared__ int msize;

     auto firstModule = blockIdx.x;
     auto endModule = moduleStart[0];

-    [[maybe_unused]] constexpr int nMaxModules =
-        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
-    assert(nMaxModules < maxNumModules);
+    assert(TrackerTraits::numberOfModules < maxNumModules);

     for (auto module = firstModule; module < endModule; module += gridDim.x) {
       auto firstPixel = moduleStart[1 + module];
       auto thisModuleId = id[firstPixel];
-      assert(thisModuleId < nMaxModules);
+      assert(thisModuleId < TrackerTraits::numberOfModules);

#ifdef GPU_DEBUG
       if (thisModuleId % 100 == 1)
@@ -141,9 +143,10 @@
       //init hist  (ymax=416 < 512 : 9bits)
       //6000 max pixels required for HI operations with no measurable impact on pp performance
-      constexpr uint32_t maxPixInModule = 6000;
-      constexpr auto nbins = isPhase2 ? 1024 : phase1PixelTopology::numColsInModule + 2;  //2+2;
-      constexpr auto nbits = isPhase2 ? 10 : 9;                                           //2+2;
+      constexpr uint32_t maxPixInModule = TrackerTraits::maxPixInModule;
+      constexpr auto nbins = TrackerTraits::clusterBinning;
+      constexpr auto nbits = TrackerTraits::clusterBits;
+
       using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
       __shared__ Hist hist;
       __shared__ typename Hist::Counter ws[32];
diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizer_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizer_cfi.py
index 91c5cecb848bf..2126235c353bb 100644
--- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizer_cfi.py
+++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizer_cfi.py
@@ -16,15 +16,15 @@
  ClusterThreshold_L1 = 2000
)

-# Run3, changes in the gain calibration scheme
+# Run3, changes in the gain calibration scheme
#from Configuration.Eras.Era_Run3_cff import Run3
#Run3.toModify(siPixelClusters,
from Configuration.Eras.Modifier_run3_common_cff import run3_common
run3_common.toModify(siPixelClusters,
                     VCaltoElectronGain = 1,  # all gains=1, pedestals=0
-                     VCaltoElectronGain_L1 = 1,
-                     VCaltoElectronOffset = 0,
-                     VCaltoElectronOffset_L1 = 0,
+                     VCaltoElectronGain_L1 = 1,
+                     VCaltoElectronOffset = 0,
+                     VCaltoElectronOffset_L1 = 0,
                     ClusterThreshold_L1 = 4000
)

@@ -45,4 +45,7 @@
(premix_stage2 & phase2_tracker).toModify(siPixelClusters,
  src = "mixData:Pixel"
)
-
+from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit
+(phase2_tracker & pixelNtupletFit).toModify(siPixelClusters,  # at the moment duplicate dropping is not implemented in Phase2
+    DropDuplicates = False
+)
diff --git a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
index 2916f5f8d037b..21b641cae9819 100644
--- a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
+++ b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
@@ -25,6 +25,8 @@
# convert the pixel digis (except errors) and clusters to the legacy format
from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoA_cfi import siPixelDigisClustersFromSoA as _siPixelDigisClustersFromSoA
+from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoAPhase2_cfi import siPixelDigisClustersFromSoAPhase2 as _siPixelDigisClustersFromSoAPhase2
+
siPixelDigisClustersPreSplitting = 
_siPixelDigisClustersFromSoA.clone() run3_common.toModify(siPixelDigisClustersPreSplitting, @@ -32,7 +34,7 @@ from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker -(gpu & ~phase2_tracker).toReplaceWith(siPixelClustersPreSplittingTask, cms.Task( +gpu.toReplaceWith(siPixelClustersPreSplittingTask, cms.Task( # conditions used *only* by the modules running on GPU siPixelROCsStatusAndMappingWrapperESProducer, siPixelGainCalibrationForHLTGPU, @@ -55,12 +57,13 @@ src = "siPixelClustersPreSplittingCUDA" ) -phase2_tracker.toModify(siPixelDigisClustersPreSplitting, +phase2_tracker.toReplaceWith(siPixelDigisClustersPreSplitting, _siPixelDigisClustersFromSoAPhase2.clone( clusterThreshold_layer1 = 4000, clusterThreshold_otherLayers = 4000, src = "siPixelDigisPhase2SoA", #produceDigis = False - ) + )) + (gpu & phase2_tracker).toReplaceWith(siPixelClustersPreSplittingTask, cms.Task( # reconstruct the pixel clusters on the gpu from copied digis siPixelClustersPreSplittingCUDA, diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index 8f997722f35ab..6aff7aa15196e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -20,12 +20,15 @@ #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClusterThresholds.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + int main(void) { #ifdef __CUDACC__ cms::cudatest::requireDevices(); #endif // __CUDACC__ using namespace gpuClustering; + using pixelTopology::Phase1; constexpr int numElements = 256 * maxNumModules; constexpr SiPixelClusterThresholds clusterThresholds(kSiPixelClusterThresholdsDefaultPhase1); @@ -257,7 +260,7 @@ int main(void) { << " threads\n"; cms::cuda::launch( - countModules, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n); + countModules, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n); blocksPerGrid = maxNumModules; //nModules; @@ -265,7 +268,7 @@ int main(void) { << " threads\n"; cudaCheck(cudaMemset(d_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t))); - cms::cuda::launch(findClus, + cms::cuda::launch(findClus, {blocksPerGrid, threadsPerBlock}, d_raw.get(), d_id.get(), @@ -292,7 +295,7 @@ int main(void) { if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0)) std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl; - cms::cuda::launch(clusterChargeCut, + cms::cuda::launch(clusterChargeCut, {blocksPerGrid, threadsPerBlock}, clusterThresholds, d_id.get(), @@ -306,17 +309,18 @@ int main(void) { cudaDeviceSynchronize(); #else // __CUDACC__ h_moduleStart[0] = nModules; - countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n); + countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n); memset(h_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t)); - findClus(h_raw.get(), - h_id.get(), - h_x.get(), - h_y.get(), - h_moduleStart.get(), - h_clusInModule.get(), - h_moduleId.get(), - h_clus.get(), - n); + + findClus(h_raw.get(), + h_id.get(), + h_x.get(), + h_y.get(), + h_moduleStart.get(), + h_clusInModule.get(), + h_moduleId.get(), + h_clus.get(), + n); nModules = h_moduleStart[0]; auto nclus = h_clusInModule.get(); @@ -331,14 +335,14 @@ int main(void) { if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0)) std::cout << "ERROR!!!!! 
wrong number of cluster found" << std::endl; - clusterChargeCut(clusterThresholds, - h_id.get(), - h_adc.get(), - h_moduleStart.get(), - h_clusInModule.get(), - h_moduleId.get(), - h_clus.get(), - n); + clusterChargeCut(clusterThresholds, + h_id.get(), + h_adc.get(), + h_moduleStart.get(), + h_clusInModule.get(), + h_moduleId.get(), + h_clus.get(), + n); #endif // __CUDACC__ std::cout << "found " << nModules << " Modules active" << std::endl; diff --git a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h index b373e8d0c7ec1..15c24dfefb420 100644 --- a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h +++ b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h @@ -9,8 +9,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEGenericBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" class MagneticField; +template class PixelCPEFast final : public PixelCPEGenericBase { public: PixelCPEFast(edm::ParameterSet const &conf, @@ -27,11 +29,13 @@ class PixelCPEFast final : public PixelCPEGenericBase { // The return value can only be used safely in kernels launched on // the same cudaStream, or after cudaStreamSynchronize. - const pixelCPEforGPU::ParamsOnGPU *getGPUProductAsync(cudaStream_t cudaStream) const; + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + using LayerGeometry = pixelCPEforGPU::LayerGeometryT; + using AverageGeometry = pixelTopology::AverageGeometryT; - pixelCPEforGPU::ParamsOnGPU const &getCPUProduct() const { return cpuData_; } + const ParamsOnGPU *getGPUProductAsync(cudaStream_t cudaStream) const; - bool isPhase2() const { return isPhase2_; }; + ParamsOnGPU const &getCPUProduct() const { return cpuData_; } private: LocalPoint localPosition(DetParam const &theDetParam, ClusterParam &theClusterParam) const override; @@ -45,17 +49,15 @@ class PixelCPEFast final : public PixelCPEGenericBase { // allocate this with posix malloc to be compatible with the cpu workflow std::vector detParamsGPU_; pixelCPEforGPU::CommonParams commonParamsGPU_; - pixelCPEforGPU::LayerGeometry layerGeometry_; - pixelCPEforGPU::AverageGeometry averageGeometry_; - pixelCPEforGPU::ParamsOnGPU cpuData_; - - bool isPhase2_; + LayerGeometry layerGeometry_; + AverageGeometry averageGeometry_; + ParamsOnGPU cpuData_; struct GPUData { ~GPUData(); // not needed if not used on CPU... 
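The GPUData members just below follow the usual two-level copy: paramsOnGPU_h is a host-side mirror whose pointer members (m_commonParams, m_detParams, ...) already point at device buffers, and paramsOnGPU_d is a device-resident copy of that mirror so kernels can take a single ParamsOnGPU const*. A generic sketch of the pattern (hypothetical names, plain CUDA rather than cms::cuda::ESProduct, error checking omitted):

#include <cuda_runtime.h>

// Host mirror whose pointer members refer to device memory (hypothetical).
struct Params {
  const float* detParams;  // device buffer
  int nDet;
};

// Upload the payload first, then the struct-of-pointers itself; kernels can
// then dereference a single Params const* argument.
Params* uploadParams(const float* hostDetParams, int nDet, cudaStream_t stream) {
  Params h;
  h.nDet = nDet;
  float* d = nullptr;
  cudaMalloc(&d, nDet * sizeof(float));
  cudaMemcpyAsync(d, hostDetParams, nDet * sizeof(float), cudaMemcpyHostToDevice, stream);
  h.detParams = d;
  Params* dParams = nullptr;
  cudaMalloc(&dParams, sizeof(Params));
  // small synchronous copy, so the local mirror is safe to drop immediately
  cudaMemcpy(dParams, &h, sizeof(Params), cudaMemcpyHostToDevice);
  return dParams;  // dereference only in kernels ordered after the copies
}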
- pixelCPEforGPU::ParamsOnGPU paramsOnGPU_h; - pixelCPEforGPU::ParamsOnGPU *paramsOnGPU_d = nullptr; // copy of the above on the Device + ParamsOnGPU paramsOnGPU_h; + ParamsOnGPU *paramsOnGPU_d = nullptr; // copy of the above on the Device }; cms::cuda::ESProduct gpuData_; diff --git a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h index 063a761b9d1d8..e7c8ad5554f36 100644 --- a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h +++ b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h @@ -34,7 +34,6 @@ namespace pixelCPEforGPU { float thePitchX; float thePitchY; - bool isPhase2; uint16_t maxModuleStride; uint8_t numberOfLaddersInBarrel; }; @@ -71,15 +70,21 @@ namespace pixelCPEforGPU { Frame frame; }; - using pixelTopology::AverageGeometry; - - struct LayerGeometry { - uint32_t layerStart[phase2PixelTopology::numberOfLayers + 1]; - uint8_t layer[phase2PixelTopology::layerIndexSize]; + template + struct LayerGeometryT { + uint32_t layerStart[TrackerTopology::numberOfLayers + 1]; + uint8_t layer[pixelTopology::layerIndexSize]; uint16_t maxModuleStride; }; - struct ParamsOnGPU { + // using LayerGeometry = LayerGeometryT; + // using LayerGeometryPhase2 = LayerGeometryT; + + template + struct ParamsOnGPUT { + using LayerGeometry = LayerGeometryT; + using AverageGeometry = pixelTopology::AverageGeometryT; + CommonParams const* m_commonParams; DetParams const* m_detParams; LayerGeometry const* m_layerGeometry; @@ -202,10 +207,12 @@ namespace pixelCPEforGPU { return 0.5f * (qdiff / qsum) * w_eff; } + template constexpr inline void position(CommonParams const& __restrict__ comParams, DetParams const& __restrict__ detParams, ClusParams& cp, uint32_t ic) { + constexpr int maxSize = TrackerTraits::maxSizeCluster; //--- Upper Right corner of Lower Left pixel -- in measurement frame uint16_t llx = cp.minRow[ic] + 1; uint16_t lly = cp.minCol[ic] + 1; @@ -215,56 +222,52 @@ namespace pixelCPEforGPU { uint16_t ury = cp.maxCol[ic]; uint16_t llxl = llx, llyl = lly, urxl = urx, uryl = ury; - if (!comParams.isPhase2) //only in Phase1 - { - llxl = phase1PixelTopology::localX(llx); - llyl = phase1PixelTopology::localY(lly); - urxl = phase1PixelTopology::localX(urx); - uryl = phase1PixelTopology::localY(ury); - } + + llxl = TrackerTraits::localX(llx); + llyl = TrackerTraits::localY(lly); + urxl = TrackerTraits::localX(urx); + uryl = TrackerTraits::localY(ury); auto mx = llxl + urxl; auto my = llyl + uryl; - auto xsize = int(urxl) + 2 - int(llxl); - auto ysize = int(uryl) + 2 - int(llyl); + int xsize = int(urxl) + 2 - int(llxl); + int ysize = int(uryl) + 2 - int(llyl); assert(xsize >= 0); // 0 if bixpix... 
assert(ysize >= 0); - if (!comParams.isPhase2) //Phase 1 big pixels - { - if (phase1PixelTopology::isBigPixX(cp.minRow[ic])) - ++xsize; - if (phase1PixelTopology::isBigPixX(cp.maxRow[ic])) - ++xsize; - if (phase1PixelTopology::isBigPixY(cp.minCol[ic])) - ++ysize; - if (phase1PixelTopology::isBigPixY(cp.maxCol[ic])) - ++ysize; - } + if (TrackerTraits::isBigPixX(cp.minRow[ic])) + ++xsize; + if (TrackerTraits::isBigPixX(cp.maxRow[ic])) + ++xsize; + if (TrackerTraits::isBigPixY(cp.minCol[ic])) + ++ysize; + if (TrackerTraits::isBigPixY(cp.maxCol[ic])) + ++ysize; int unbalanceX = 8.f * std::abs(float(cp.q_f_X[ic] - cp.q_l_X[ic])) / float(cp.q_f_X[ic] + cp.q_l_X[ic]); int unbalanceY = 8.f * std::abs(float(cp.q_f_Y[ic] - cp.q_l_Y[ic])) / float(cp.q_f_Y[ic] + cp.q_l_Y[ic]); + xsize = 8 * xsize - unbalanceX; ysize = 8 * ysize - unbalanceY; - cp.xsize[ic] = std::min(xsize, comParams.isPhase2 ? 2047 : 1023); - cp.ysize[ic] = std::min(ysize, comParams.isPhase2 ? 2047 : 1023); + cp.xsize[ic] = std::min(xsize, maxSize); + cp.ysize[ic] = std::min(ysize, maxSize); - if (cp.minRow[ic] == 0 || cp.maxRow[ic] == phase1PixelTopology::lastRowInModule) + if (cp.minRow[ic] == 0 || cp.maxRow[ic] == uint32_t(detParams.nRows - 1)) cp.xsize[ic] = -cp.xsize[ic]; - if (cp.minCol[ic] == 0 || cp.maxCol[ic] == phase1PixelTopology::lastColInModule) + + if (cp.minCol[ic] == 0 || cp.maxCol[ic] == uint32_t(detParams.nCols - 1)) cp.ysize[ic] = -cp.ysize[ic]; // apply the lorentz offset correction float xoff = 0.5f * float(detParams.nRows) * comParams.thePitchX; float yoff = 0.5f * float(detParams.nCols) * comParams.thePitchY; - if (!comParams.isPhase2) //correction for bigpixels for phase1 - { - xoff = xoff + comParams.thePitchX; - yoff = yoff + 8.0f * comParams.thePitchY; - } + //correction for bigpixels for phase1 + xoff = xoff + TrackerTraits::bigPixXCorrection * comParams.thePitchX; + yoff = yoff + TrackerTraits::bigPixYCorrection * comParams.thePitchY; + // apply the lorentz offset correction auto xPos = detParams.shiftX + (comParams.thePitchX * 0.5f * float(mx)) - xoff; auto yPos = detParams.shiftY + (comParams.thePitchY * 0.5f * float(my)) - yoff; @@ -284,8 +287,8 @@ namespace pixelCPEforGPU { thickness, cotalpha, comParams.thePitchX, - comParams.isPhase2 ? false : phase1PixelTopology::isBigPixX(cp.minRow[ic]), - comParams.isPhase2 ? false : phase1PixelTopology::isBigPixX(cp.maxRow[ic])); + TrackerTraits::isBigPixX(cp.minRow[ic]), + TrackerTraits::isBigPixX(cp.maxRow[ic])); auto ycorr = correction(cp.maxCol[ic] - cp.minCol[ic], cp.q_f_Y[ic], @@ -296,13 +299,14 @@ namespace pixelCPEforGPU { thickness, cotbeta, comParams.thePitchY, - comParams.isPhase2 ? false : phase1PixelTopology::isBigPixY(cp.minCol[ic]), - comParams.isPhase2 ? false : phase1PixelTopology::isBigPixY(cp.maxCol[ic])); + TrackerTraits::isBigPixY(cp.minCol[ic]), + TrackerTraits::isBigPixY(cp.maxCol[ic])); cp.xpos[ic] = xPos + xcorr; cp.ypos[ic] = yPos + ycorr; } + template constexpr inline void errorFromSize(CommonParams const& __restrict__ comParams, DetParams const& __restrict__ detParams, ClusParams& cp, @@ -312,17 +316,14 @@ namespace pixelCPEforGPU { cp.yerr[ic] = 0.0085; // FIXME these are errors form Run1 - - bool isPhase2 = comParams.isPhase2; - // FIXME these are errors form Run1 - float xerr_barrel_l1_def = isPhase2 ? 0.00035 : 0.00200; // 0.01030; - float yerr_barrel_l1_def = isPhase2 ? 0.00125 : 0.00210; - float xerr_barrel_ln_def = isPhase2 ? 0.00035 : 0.00200; // 0.01030; - float yerr_barrel_ln_def = isPhase2 ? 
0.00125 : 0.00210; - float xerr_endcap_def = isPhase2 ? 0.00060 : 0.0020; - float yerr_endcap_def = isPhase2 ? 0.00180 : 0.00210; - - constexpr float xerr_barrel_l1[] = {0.00115, 0.00120, 0.00088}; + float xerr_barrel_l1_def = TrackerTraits::xerr_barrel_l1_def; + float yerr_barrel_l1_def = TrackerTraits::yerr_barrel_l1_def; + float xerr_barrel_ln_def = TrackerTraits::xerr_barrel_ln_def; + float yerr_barrel_ln_def = TrackerTraits::yerr_barrel_ln_def; + float xerr_endcap_def = TrackerTraits::xerr_endcap_def; + float yerr_endcap_def = TrackerTraits::yerr_endcap_def; + + constexpr float xerr_barrel_l1[] = {0.00115, 0.00120, 0.00088}; //TODO MOVE THESE SOMEWHERE ELSE constexpr float yerr_barrel_l1[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240}; constexpr float xerr_barrel_ln[] = {0.00115, 0.00120, 0.00088}; @@ -339,52 +340,31 @@ namespace pixelCPEforGPU { bool isEdgeY = cp.ysize[ic] < 1; // is one and big? - bool isBig1X = isPhase2 ? false : ((0 == sx) && phase1PixelTopology::isBigPixX(cp.minRow[ic])); - bool isBig1Y = isPhase2 ? false : ((0 == sy) && phase1PixelTopology::isBigPixY(cp.minCol[ic])); - - if (!isPhase2) { - if (!isEdgeX && !isBig1X) { - if (not detParams.isBarrel) { - cp.xerr[ic] = sx < std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def; - } else if (detParams.layer == 1) { - cp.xerr[ic] = sx < std::size(xerr_barrel_l1) ? xerr_barrel_l1[sx] : xerr_barrel_l1_def; - } else { - cp.xerr[ic] = sx < std::size(xerr_barrel_ln) ? xerr_barrel_ln[sx] : xerr_barrel_ln_def; - } - } - - if (!isEdgeY && !isBig1Y) { - if (not detParams.isBarrel) { - cp.yerr[ic] = sy < std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def; - } else if (detParams.layer == 1) { - cp.yerr[ic] = sy < std::size(yerr_barrel_l1) ? yerr_barrel_l1[sy] : yerr_barrel_l1_def; - } else { - cp.yerr[ic] = sy < std::size(yerr_barrel_ln) ? yerr_barrel_ln[sy] : yerr_barrel_ln_def; - } - } - } else { - if (!isEdgeX) { - if (not detParams.isBarrel) { - cp.xerr[ic] = sx < std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def; - } else if (detParams.layer == 1) { - cp.xerr[ic] = sx < std::size(xerr_barrel_l1) ? xerr_barrel_l1[sx] : xerr_barrel_l1_def; - } else { - cp.xerr[ic] = sx < std::size(xerr_barrel_ln) ? xerr_barrel_ln[sx] : xerr_barrel_ln_def; - } + bool isBig1X = ((0 == sx) && TrackerTraits::isBigPixX(cp.minRow[ic])); + bool isBig1Y = ((0 == sy) && TrackerTraits::isBigPixY(cp.minCol[ic])); + + if (!isEdgeX && !isBig1X) { + if (not detParams.isBarrel) { + cp.xerr[ic] = sx < std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def; + } else if (detParams.layer == 1) { + cp.xerr[ic] = sx < std::size(xerr_barrel_l1) ? xerr_barrel_l1[sx] : xerr_barrel_l1_def; + } else { + cp.xerr[ic] = sx < std::size(xerr_barrel_ln) ? xerr_barrel_ln[sx] : xerr_barrel_ln_def; } + } - if (!isEdgeY) { - if (not detParams.isBarrel) { - cp.yerr[ic] = sy < std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def; - } else if (detParams.layer == 1) { - cp.yerr[ic] = sy < std::size(yerr_barrel_l1) ? yerr_barrel_l1[sy] : yerr_barrel_l1_def; - } else { - cp.yerr[ic] = sy < std::size(yerr_barrel_ln) ? yerr_barrel_ln[sy] : yerr_barrel_ln_def; - } + if (!isEdgeY && !isBig1Y) { + if (not detParams.isBarrel) { + cp.yerr[ic] = sy < std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def; + } else if (detParams.layer == 1) { + cp.yerr[ic] = sy < std::size(yerr_barrel_l1) ? yerr_barrel_l1[sy] : yerr_barrel_l1_def; + } else { + cp.yerr[ic] = sy < std::size(yerr_barrel_ln) ? 
yerr_barrel_ln[sy] : yerr_barrel_ln_def; } } } + template constexpr inline void errorFromDB(CommonParams const& __restrict__ comParams, DetParams const& __restrict__ detParams, ClusParams& cp, @@ -402,8 +382,8 @@ namespace pixelCPEforGPU { // is one and big? bool isOneX = (0 == sx); bool isOneY = (0 == sy); - bool isBigX = comParams.isPhase2 ? false : phase1PixelTopology::isBigPixX(cp.minRow[ic]); - bool isBigY = comParams.isPhase2 ? false : phase1PixelTopology::isBigPixY(cp.minCol[ic]); + bool isBigX = TrackerTraits::isBigPixX(cp.minRow[ic]); + bool isBigY = TrackerTraits::isBigPixY(cp.minCol[ic]); auto ch = cp.charge[ic]; auto bin = 0; @@ -421,14 +401,14 @@ namespace pixelCPEforGPU { cp.status[ic].isOneY = isOneY; cp.status[ic].isBigY = (isOneY & isBigY) | isEdgeY; - auto xoff = -float(phase1PixelTopology::xOffset) * comParams.thePitchX; + auto xoff = -float(TrackerTraits::xOffset) * comParams.thePitchX; int low_value = 0; int high_value = CPEFastParametrisation::kNumErrorBins - 1; int bin_value = float(CPEFastParametrisation::kNumErrorBins) * (cp.xpos[ic] + xoff) / (2 * xoff); // return estimated bin value truncated to [0, 15] int jx = std::clamp(bin_value, low_value, high_value); - auto toCM = [](uint8_t x) { return float(x) * 1.e-4; }; + auto toCM = [](uint8_t x) { return float(x) * 1.e-4f; }; if (not isEdgeX) { cp.xerr[ic] = isOneX ? toCM(isBigX ? detParams.sx2 : detParams.sigmax1[jx]) @@ -441,6 +421,15 @@ namespace pixelCPEforGPU { } } + //for Phase2 -> fallback to error from size + template <> + constexpr inline void errorFromDB(CommonParams const& __restrict__ comParams, + DetParams const& __restrict__ detParams, + ClusParams& cp, + uint32_t ic) { + errorFromSize(comParams, detParams, cp, ic); + } + } // namespace pixelCPEforGPU #endif // RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc b/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc index cd08eac535372..6044f2a5b9ad4 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc @@ -16,9 +16,10 @@ #include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" -class PixelCPEFastESProducer : public edm::ESProducer { +template +class PixelCPEFastESProducerT : public edm::ESProducer { public: - PixelCPEFastESProducer(const edm::ParameterSet& p); + PixelCPEFastESProducerT(const edm::ParameterSet& p); std::unique_ptr produce(const TkPixelCPERecord&); static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -36,7 +37,8 @@ class PixelCPEFastESProducer : public edm::ESProducer { using namespace edm; -PixelCPEFastESProducer::PixelCPEFastESProducer(const edm::ParameterSet& p) : pset_(p) { +template +PixelCPEFastESProducerT::PixelCPEFastESProducerT(const edm::ParameterSet& p) : pset_(p) { auto const& myname = p.getParameter("ComponentName"); auto const& magname = p.getParameter("MagneticFieldRecord"); useErrorsFromTemplates_ = p.getParameter("UseErrorsFromTemplates"); @@ -52,7 +54,9 @@ PixelCPEFastESProducer::PixelCPEFastESProducer(const edm::ParameterSet& p) : pse } } -std::unique_ptr PixelCPEFastESProducer::produce(const TkPixelCPERecord& iRecord) { +template +std::unique_ptr PixelCPEFastESProducerT::produce( + const TkPixelCPERecord& iRecord) { // add the new la width object const SiPixelLorentzAngle* lorentzAngleWidthProduct = nullptr; lorentzAngleWidthProduct = 
&iRecord.get(lorentzAngleWidthToken_); @@ -65,23 +69,24 @@ std::unique_ptr PixelCPEFastESProducer::produce( //} else { //std::cout<<" pass an empty GenError pointer"<(pset_, - &iRecord.get(magfieldToken_), - iRecord.get(pDDToken_), - iRecord.get(hTTToken_), - &iRecord.get(lorentzAngleToken_), - genErrorDBObjectProduct, - lorentzAngleWidthProduct); + return std::make_unique>(pset_, + &iRecord.get(magfieldToken_), + iRecord.get(pDDToken_), + iRecord.get(hTTToken_), + &iRecord.get(lorentzAngleToken_), + genErrorDBObjectProduct, + lorentzAngleWidthProduct); } -void PixelCPEFastESProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void PixelCPEFastESProducerT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; // from PixelCPEBase PixelCPEBase::fillPSetDescription(desc); // from PixelCPEFast - PixelCPEFast::fillPSetDescription(desc); + PixelCPEFast::fillPSetDescription(desc); // used by PixelCPEFast desc.add("EdgeClusterErrorX", 50.0); @@ -89,11 +94,17 @@ void PixelCPEFastESProducer::fillDescriptions(edm::ConfigurationDescriptions& de desc.add("UseErrorsFromTemplates", true); desc.add("TruncatePixelCharge", true); - // specific to PixelCPEFastESProducer - desc.add("ComponentName", "PixelCPEFast"); + std::string name = "PixelCPEFast"; + name += TrackerTraits::nameModifier; + desc.add("ComponentName", name); desc.add("MagneticFieldRecord", edm::ESInputTag()); - descriptions.add("PixelCPEFastESProducer", desc); + descriptions.addWithDefaultLabel(desc); } +using PixelCPEFastESProducer = PixelCPEFastESProducerT; DEFINE_FWK_EVENTSETUP_MODULE(PixelCPEFastESProducer); +using PixelCPEFastESProducerPhase1 = PixelCPEFastESProducerT; +DEFINE_FWK_EVENTSETUP_MODULE(PixelCPEFastESProducerPhase1); +using PixelCPEFastESProducerPhase2 = PixelCPEFastESProducerT; +DEFINE_FWK_EVENTSETUP_MODULE(PixelCPEFastESProducerPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 135254fa6e9f2..cb5b4b2f2c387 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -12,21 +12,28 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" +// #define GPU_DEBUG 1 namespace { + template __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, uint32_t* hitsLayerStart) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto m = - cpeParams->commonParams().isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; + constexpr auto m = TrackerTraits::numberOfLayers; assert(0 == hitsModuleStart[0]); if (i <= m) { hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; #ifdef GPU_DEBUG - printf("LayerStart %d/%d at module %d: %d\n", i, m, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); + int old = i == 0 ? 
0 : hitsModuleStart[cpeParams->layerGeometry().layerStart[i - 1]]; + printf("LayerStart %d/%d at module %d: %d - %d\n", + i, + m, + cpeParams->layerGeometry().layerStart[i], + hitsLayerStart[i], + hitsLayerStart[i] - old); #endif } } @@ -34,18 +41,18 @@ namespace { namespace pixelgpudetails { - TrackingRecHit2DGPU PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - bool isPhase2, - cudaStream_t stream) const { + template + TrackingRecHit2DGPUT PixelRecHitGPUKernel::makeHitsAsync( + SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + cudaStream_t stream) const { + using namespace gpuPixelRecHits; auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPU hits_d( - nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); - assert(hits_d.nMaxModules() == isPhase2 ? phase2PixelTopology::numberOfModules - : phase1PixelTopology::numberOfModules); + TrackingRecHit2DGPUT hits_d( + nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); int activeModulesWithDigis = digis_d.nModules(); // protect from empty events @@ -54,9 +61,10 @@ namespace pixelgpudetails { int blocks = activeModulesWithDigis; #ifdef GPU_DEBUG + std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif - gpuPixelRecHits::getHits<<>>( + getHits<<>>( cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -65,9 +73,10 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + setHitsLayerStart + <<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - auto nLayers = isPhase2 ? 
phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; + constexpr auto nLayers = TrackerTraits::numberOfLayers; cms::cuda::fillManyFromVector(hits_d.phiBinner(), nLayers, hits_d.iphi(), @@ -87,4 +96,6 @@ namespace pixelgpudetails { return hits_d; } + template class PixelRecHitGPUKernel; + template class PixelRecHitGPUKernel; } // namespace pixelgpudetails diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 8289c8db7f2f4..0a3c2b647f22e 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -9,9 +9,11 @@ #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" - +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +//#define GPU_DEBUG 1 namespace pixelgpudetails { + template class PixelRecHitGPUKernel { public: PixelRecHitGPUKernel() = default; @@ -22,13 +24,15 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - TrackingRecHit2DGPU makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - bool isPhase2, - cudaStream_t stream) const; + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + + TrackingRecHit2DGPUT makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + ParamsOnGPU const* cpeParams, + cudaStream_t stream) const; }; + } // namespace pixelgpudetails #endif // RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHitGPUKernel_h diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 8112e9ebd19c8..b23fa7dcc11ed 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -20,13 +20,15 @@ #include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "PixelRecHitGPUKernel.h" -class SiPixelRecHitCUDA : public edm::global::EDProducer<> { +template +class SiPixelRecHitCUDAT : public edm::global::EDProducer<> { public: - explicit SiPixelRecHitCUDA(const edm::ParameterSet& iConfig); - ~SiPixelRecHitCUDA() override = default; + explicit SiPixelRecHitCUDAT(const edm::ParameterSet& iConfig); + ~SiPixelRecHitCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -37,31 +39,40 @@ class SiPixelRecHitCUDA : public edm::global::EDProducer<> { const edm::EDGetTokenT> tBeamSpot; const edm::EDGetTokenT> token_; const edm::EDGetTokenT> tokenDigi_; - const edm::EDPutTokenT> tokenHit_; + const edm::EDPutTokenT>> tokenHit_; - const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; + const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; -SiPixelRecHitCUDA::SiPixelRecHitCUDA(const edm::ParameterSet& iConfig) +template +SiPixelRecHitCUDAT::SiPixelRecHitCUDAT(const edm::ParameterSet& iConfig) : 
cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), token_(consumes>(iConfig.getParameter("src"))), tokenDigi_(consumes>(iConfig.getParameter("src"))), - tokenHit_(produces>()) {} + tokenHit_(produces>>()) {} -void SiPixelRecHitCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void SiPixelRecHitCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpotCUDA")); desc.add("src", edm::InputTag("siPixelClustersPreSplittingCUDA")); - desc.add("CPE", "PixelCPEFast"); - descriptions.add("siPixelRecHitCUDA", desc); + + std::string cpe = "PixelCPEFast"; + cpe += TrackerTraits::nameModifier; + desc.add("CPE", cpe); + + descriptions.addWithDefaultLabel(desc); } -void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { - PixelCPEFast const* fcpe = dynamic_cast(&es.getData(cpeToken_)); +template +void SiPixelRecHitCUDAT::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& es) const { + PixelCPEFast const* fcpe = dynamic_cast*>(&es.getData(cpeToken_)); if (not fcpe) { - throw cms::Exception("Configuration") << "SiPixelRecHitSoAFromLegacy can only use a CPE of type PixelCPEFast"; + throw cms::Exception("Configuration") << "SiPixelRecHitCUDA can only use a CPE of type PixelCPEFast"; } edm::Handle> hclusters; @@ -80,8 +91,14 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons ctx.emplace(iEvent, tokenHit_, - gpuAlgo_.makeHitsAsync( - digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), fcpe->isPhase2(), ctx.stream())); + gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), ctx.stream())); } +using SiPixelRecHitCUDA = SiPixelRecHitCUDAT; DEFINE_FWK_MODULE(SiPixelRecHitCUDA); + +using SiPixelRecHitCUDAPhase1 = SiPixelRecHitCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitCUDAPhase1); + +using SiPixelRecHitCUDAPhase2 = SiPixelRecHitCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitCUDAPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 7ff2da5552e6d..1428efe06a1d1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -24,14 +24,16 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -class SiPixelRecHitFromCUDA : public edm::stream::EDProducer { +template +class SiPixelRecHitFromCUDAT : public edm::stream::EDProducer { public: - explicit SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig); - ~SiPixelRecHitFromCUDA() override = default; + explicit SiPixelRecHitFromCUDAT(const edm::ParameterSet& iConfig); + ~SiPixelRecHitFromCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); using HMSstorage = HostProduct; + using HitsOnGPU = TrackingRecHit2DGPUT; private: void acquire(edm::Event const& iEvent, @@ -40,64 +42,70 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; const edm::ESGetToken geomToken_; - const edm::EDGetTokenT> hitsToken_; // CUDA hits - const edm::EDGetTokenT clusterToken_; // legacy clusters - const edm::EDPutTokenT rechitsPutToken_; // 
legacy rechits + const edm::EDGetTokenT> hitsToken_; // CUDA hits + const edm::EDGetTokenT clusterToken_; // legacy clusters + const edm::EDPutTokenT rechitsPutToken_; // legacy rechits const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; - uint32_t nMaxModules_; cms::cuda::host::unique_ptr store32_; cms::cuda::host::unique_ptr hitsModuleStart_; }; -SiPixelRecHitFromCUDA::SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig) +template +SiPixelRecHitFromCUDAT::SiPixelRecHitFromCUDAT(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), - hitsToken_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), + hitsToken_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), clusterToken_(consumes(iConfig.getParameter("src"))), rechitsPutToken_(produces()), hostPutToken_(produces()) {} -void SiPixelRecHitFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void SiPixelRecHitFromCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); + descriptions.addWithDefaultLabel(desc); } -void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); +template +void SiPixelRecHitFromCUDAT::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); nHits_ = inputData.nHits(); - nMaxModules_ = inputData.nMaxModules(); LogDebug("SiPixelRecHitFromCUDA") << "converting " << nHits_ << " Hits"; if (0 == nHits_) return; store32_ = inputData.localCoordToHostAsync(ctx.stream()); + hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } -void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) { +template +void SiPixelRecHitFromCUDAT::produce(edm::Event& iEvent, edm::EventSetup const& es) { // allocate a buffer for the indices of the clusters - auto hmsp = std::make_unique(nMaxModules_ + 1); + constexpr auto nMaxModules = TrackerTraits::numberOfModules; + auto hmsp = std::make_unique(nMaxModules + 1); SiPixelRecHitCollection output; - output.reserve(nMaxModules_, nHits_); + output.reserve(nMaxModules, nHits_); if (0 == nHits_) { iEvent.emplace(rechitsPutToken_, std::move(output)); iEvent.emplace(hostPutToken_, std::move(hmsp)); return; } - output.reserve(nMaxModules_, nHits_); + output.reserve(nMaxModules, nHits_); - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); + std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules + 1, hmsp.get()); // wrap the buffer in a HostProduct, and move it to the Event, without reallocating the buffer or affecting hitsModuleStart iEvent.emplace(hostPutToken_, std::move(hmsp)); @@ -130,7 +138,7 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e assert(lc > fc); LogDebug("SiPixelRecHitFromCUDA") << "in det " << gind << ": conv " << nhits << " hits from " << dsv.size() - << " legacy clusters" << ' ' << fc << ',' << lc; + << " legacy clusters" << ' ' << fc << ',' << lc << "\n"; if (nhits > maxHitsInModule) 
edm::LogWarning("SiPixelRecHitFromCUDA") << fmt::sprintf( "Too many clusters %d in module %d. Only the first %d hits will be converted", nhits, gind, maxHitsInModule); @@ -185,4 +193,11 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e iEvent.emplace(rechitsPutToken_, std::move(output)); } +using SiPixelRecHitFromCUDA = SiPixelRecHitFromCUDAT; DEFINE_FWK_MODULE(SiPixelRecHitFromCUDA); + +using SiPixelRecHitFromCUDAPhase1 = SiPixelRecHitFromCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitFromCUDAPhase1); + +using SiPixelRecHitFromCUDAPhase2 = SiPixelRecHitFromCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitFromCUDAPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index 7532470ebd3d4..8bcb218255548 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -24,13 +24,15 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer { +template +class SiPixelRecHitSoAFromCUDAT : public edm::stream::EDProducer { public: - explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig); - ~SiPixelRecHitSoAFromCUDA() override = default; + explicit SiPixelRecHitSoAFromCUDAT(const edm::ParameterSet& iConfig); + ~SiPixelRecHitSoAFromCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); using HMSstorage = HostProduct; + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; private: void acquire(edm::Event const& iEvent, @@ -38,34 +40,36 @@ class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer> hitsTokenGPU_; // CUDA hits - const edm::EDPutTokenT hitsPutTokenCPU_; + const edm::EDGetTokenT>> hitsTokenGPU_; // CUDA hits + const edm::EDPutTokenT> hitsPutTokenCPU_; const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; - uint32_t nMaxModules_; cms::cuda::host::unique_ptr store32_; cms::cuda::host::unique_ptr store16_; cms::cuda::host::unique_ptr hitsModuleStart_; }; -SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig) - : hitsTokenGPU_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), - hitsPutTokenCPU_(produces()), +template +SiPixelRecHitSoAFromCUDAT::SiPixelRecHitSoAFromCUDAT(const edm::ParameterSet& iConfig) + : hitsTokenGPU_(consumes(iConfig.getParameter("pixelRecHitSrc"))), + hitsPutTokenCPU_(produces>()), hostPutToken_(produces()) {} -void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void SiPixelRecHitSoAFromCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); + descriptions.addWithDefaultLabel(desc); } -void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsTokenGPU_); +template +void SiPixelRecHitSoAFromCUDAT::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + cms::cuda::Product> const& inputDataWrapped = iEvent.get(hitsTokenGPU_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto 
const& inputData = ctx.get(inputDataWrapped); @@ -74,20 +78,27 @@ void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent, if (0 == nHits_) return; - nMaxModules_ = inputData.nMaxModules(); store32_ = inputData.store32ToHostAsync(ctx.stream()); store16_ = inputData.store16ToHostAsync(ctx.stream()); hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } -void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) { - auto hmsp = std::make_unique(nMaxModules_ + 1); +template +void SiPixelRecHitSoAFromCUDAT::produce(edm::Event& iEvent, edm::EventSetup const& es) { + auto hmsp = std::make_unique(TrackerTraits::numberOfModules + 1); if (nHits_ > 0) - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); + std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + TrackerTraits::numberOfModules + 1, hmsp.get()); iEvent.emplace(hostPutToken_, std::move(hmsp)); - iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_); + iEvent.emplace(hitsPutTokenCPU_, store32_, store16_, hitsModuleStart_.get(), nHits_); } +using SiPixelRecHitSoAFromCUDA = SiPixelRecHitSoAFromCUDAT; DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA); + +using SiPixelRecHitSoAFromCUDAPhase1 = SiPixelRecHitSoAFromCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDAPhase1); + +using SiPixelRecHitSoAFromCUDAPhase2 = SiPixelRecHitSoAFromCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDAPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index d23ecec66fea0..1edc7870f4800 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -27,10 +27,11 @@ #include "gpuPixelRecHits.h" -class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { +template +class SiPixelRecHitSoAFromLegacyT : public edm::global::EDProducer<> { public: - explicit SiPixelRecHitSoAFromLegacy(const edm::ParameterSet& iConfig); - ~SiPixelRecHitSoAFromLegacy() override = default; + explicit SiPixelRecHitSoAFromLegacyT(const edm::ParameterSet& iConfig); + ~SiPixelRecHitSoAFromLegacyT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -44,39 +45,44 @@ class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { const edm::ESGetToken cpeToken_; const edm::EDGetTokenT bsGetToken_; const edm::EDGetTokenT clusterToken_; // Legacy Clusters - const edm::EDPutTokenT tokenHit_; + const edm::EDPutTokenT> tokenHit_; const edm::EDPutTokenT tokenModuleStart_; const bool convert2Legacy_; - const bool isPhase2_; }; -SiPixelRecHitSoAFromLegacy::SiPixelRecHitSoAFromLegacy(const edm::ParameterSet& iConfig) +template +SiPixelRecHitSoAFromLegacyT::SiPixelRecHitSoAFromLegacyT(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), bsGetToken_{consumes(iConfig.getParameter("beamSpot"))}, clusterToken_{consumes(iConfig.getParameter("src"))}, - tokenHit_{produces()}, + tokenHit_{produces>()}, tokenModuleStart_{produces()}, - convert2Legacy_(iConfig.getParameter("convertToLegacy")), - isPhase2_(iConfig.getParameter("isPhase2")) { + convert2Legacy_(iConfig.getParameter("convertToLegacy")) { if (convert2Legacy_) produces(); } -void SiPixelRecHitSoAFromLegacy::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { 
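// For reference, the plugin-registration idiom used above for
// SiPixelRecHitSoAFromCUDA and repeated for every module in this PR: each
// topology instantiation gets its own type alias, so the plugin factory sees a
// plain identifier and the configuration can select the phase by module name.
// A condensed sketch with a stand-in producer name:
//
//   template <typename TrackerTraits>
//   class MyProducerT : public edm::global::EDProducer<> { /* ... */ };
//
//   using MyProducerPhase1 = MyProducerT<pixelTopology::Phase1>;
//   DEFINE_FWK_MODULE(MyProducerPhase1);
//
//   using MyProducerPhase2 = MyProducerT<pixelTopology::Phase2>;
//   DEFINE_FWK_MODULE(MyProducerPhase2);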
+template +void SiPixelRecHitSoAFromLegacyT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); - desc.add("CPE", "PixelCPEFast"); + std::string cpeName = "PixelCPEFast"; + cpeName += TrackerTraits::nameModifier; + desc.add("CPE", cpeName); desc.add("convertToLegacy", false); - desc.add("isPhase2", false); + descriptions.addWithDefaultLabel(desc); } -void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { +template +void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& es) const { const TrackerGeometry* geom_ = &es.getData(geomToken_); - PixelCPEFast const* fcpe = dynamic_cast(&es.getData(cpeToken_)); + PixelCPEFast const* fcpe = dynamic_cast*>(&es.getData(cpeToken_)); if (not fcpe) { throw cms::Exception("Configuration") << "SiPixelRecHitSoAFromLegacy can only use a CPE of type PixelCPEFast"; } @@ -93,14 +99,11 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv iEvent.getByToken(clusterToken_, hclusters); auto const& input = *hclusters; - const int nMaxModules = isPhase2_ ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; - const int startBPIX2 = isPhase2_ ? phase2PixelTopology::layerStart[1] : phase1PixelTopology::layerStart[1]; - - assert(nMaxModules < gpuClustering::maxNumModules); - assert(startBPIX2 < nMaxModules); + constexpr int maxModules = TrackerTraits::numberOfModules; + constexpr int startBPIX2 = pixelTopology::layerStart(1); // allocate a buffer for the indices of the clusters - auto hmsp = std::make_unique(nMaxModules + 1); + auto hmsp = std::make_unique(maxModules + 1); // hitsModuleStart is a non-owning pointer to the buffer auto hitsModuleStart = hmsp.get(); // wrap the buffer in a HostProduct @@ -141,24 +144,23 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto gind = genericDet->index(); - assert(gind < nMaxModules); + assert(gind < maxModules); auto const nclus = dsv.size(); clusInModule_[gind] = nclus; numberOfClusters += nclus; } hitsModuleStart[0] = 0; - for (int i = 1, n = nMaxModules + 1; i < n; ++i) + for (int i = 1, n = maxModules + 1; i < n; ++i) hitsModuleStart[i] = hitsModuleStart[i - 1] + clusInModule_[i - 1]; - assert(numberOfClusters == int(hitsModuleStart[nMaxModules])); + assert(numberOfClusters == int(hitsModuleStart[maxModules])); // output SoA // element 96 is the start of BPIX2 (i.e. 
the number of clusters in BPIX1) - auto output = std::make_unique( - numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); - assert(output->nMaxModules() == uint32_t(nMaxModules)); + auto output = std::make_unique>( + numberOfClusters, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); if (0 == numberOfClusters) { iEvent.put(std::move(output)); @@ -168,7 +170,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv } if (convert2Legacy_) - legacyOutput->reserve(nMaxModules, numberOfClusters); + legacyOutput->reserve(maxModules, numberOfClusters); int numberOfDetUnits = 0; int numberOfHits = 0; @@ -178,7 +180,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto const gind = genericDet->index(); - assert(gind < nMaxModules); + assert(gind < maxModules); const PixelGeomDetUnit* pixDet = dynamic_cast(genericDet); assert(pixDet); auto const nclus = dsv.size(); @@ -249,6 +251,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv if (ih >= maxHitsInModule) break; + assert(ih < clusterRef.size()); LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h)); LocalError le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h)); @@ -262,7 +265,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv assert(numberOfHits == numberOfClusters); // fill data structure to support CA - const auto nLayers = isPhase2_ ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; + constexpr auto nLayers = TrackerTraits::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; LogDebug("SiPixelRecHitSoAFromLegacy") @@ -279,10 +282,18 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv output->phiBinnerStorage()); LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " - << numberOfDetUnits << " Dets"; + << numberOfDetUnits << " Dets" + << "\n"; iEvent.put(std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); } +using SiPixelRecHitSoAFromLegacy = SiPixelRecHitSoAFromLegacyT; DEFINE_FWK_MODULE(SiPixelRecHitSoAFromLegacy); + +using SiPixelRecHitSoAFromLegacyPhase1 = SiPixelRecHitSoAFromLegacyT; +DEFINE_FWK_MODULE(SiPixelRecHitSoAFromLegacyPhase1); + +using SiPixelRecHitSoAFromLegacyPhase2 = SiPixelRecHitSoAFromLegacyT; +DEFINE_FWK_MODULE(SiPixelRecHitSoAFromLegacyPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 5b862b2cf63b9..f0798cc74a975 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -13,30 +13,31 @@ #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +//#define GPU_DEBUG 1 namespace gpuPixelRecHits { - __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, + template + __global__ void getHits(pixelCPEforGPU::ParamsOnGPUT const* __restrict__ cpeParams, BeamSpotPOD const* __restrict__ bs, SiPixelDigisCUDASOAView const digis, int numElements, SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - 
TrackingRecHit2DSOAView* phits) {
+                          TrackingRecHit2DSOAViewT<TrackerTraits>* phits) {
    // FIXME
    // the compiler seems NOT to optimize loads from views (even in a simple test case)
    // The whole gymnastics here of copying (or not) is a pure heuristic exercise that seems to produce the fastest code with the above signature
    // not using views (passing a gazillion of array pointers) seems to produce the fastest code (but it is harder to maintain)
+   assert(phits);
    assert(cpeParams);
    auto& hits = *phits;
    auto const& clusters = *pclusters;
-   auto isPhase2 = cpeParams->commonParams().isPhase2;

    // copy average geometry corrected by beamspot. FIXME (move it somewhere else???)
    if (0 == blockIdx.x) {
      auto& agc = hits.averageGeometry();
      auto const& ag = cpeParams->averageGeometry();
-     auto nLadders =
-         isPhase2 ? phase2PixelTopology::numberOfLaddersInBarrel : phase1PixelTopology::numberOfLaddersInBarrel;
+     auto nLadders = TrackerTraits::numberOfLaddersInBarrel;

      for (int il = threadIdx.x, nl = nLadders; il < nl; il += blockDim.x) {
        agc.ladderZ[il] = ag.ladderZ[il] - bs->z;
@@ -68,19 +69,20 @@ namespace gpuPixelRecHits {
    if (0 == nclus)
      return;

-#ifdef GPU_DEBUG
-    if (threadIdx.x == 0) {
-      auto k = clusters.moduleStart(1 + blockIdx.x);
-      while (digis.moduleInd(k) == invalidModuleId)
-        ++k;
-      assert(digis.moduleInd(k) == me);
-    }
-#endif
+// #ifdef GPU_DEBUG
+//     if (threadIdx.x == 0) {
+//       auto k = clusters.moduleStart(1 + blockIdx.x);
+//       while (digis.moduleInd(k) == invalidModuleId)
+//         ++k;
+//       assert(digis.moduleInd(k) == me);
+//     }
+// #endif

#ifdef GPU_DEBUG
    if (me % 100 == 1)
      if (threadIdx.x == 0)
        printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me));
#endif
+
    for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) {
      int nClusInIter = std::min(MaxHitsInIter, endClus - startClus);
      int lastClus = startClus + nClusInIter;
@@ -168,11 +170,9 @@ namespace gpuPixelRecHits {
        assert(h < hits.nHits());
        assert(h < clusters.clusModuleStart(me + 1));

-       pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
-       if (!isPhase2)
-         pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
-       else
-         pixelCPEforGPU::errorFromSize(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+       pixelCPEforGPU::position<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+
+       pixelCPEforGPU::errorFromDB<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

        // store it
        hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]);
diff --git a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py
index 5fa4e0ffaf68c..cf48b22d02a10 100644
--- a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py
+++ b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py
@@ -10,12 +10,11 @@
 # 2.
Pixel Generic CPE # from RecoLocalTracker.SiPixelRecHits.PixelCPEGeneric_cfi import * -from RecoLocalTracker.SiPixelRecHits.PixelCPEFastESProducer_cfi import * +from RecoLocalTracker.SiPixelRecHits.pixelCPEFastESProducer_cfi import pixelCPEFastESProducer as PixelCPEFastESProducer +#from RecoLocalTracker.SiPixelRecHits.pixelCPEFastESProducerPhase1_cfi import pixelCPEFastESProducerPhase1 as PixelCPEFastESProducerPhase1 +from RecoLocalTracker.SiPixelRecHits.pixelCPEFastESProducerPhase2_cfi import pixelCPEFastESProducerPhase2 as PixelCPEFastESProducerPhase2 # # 3. ESProducer for the Magnetic-field dependent template records # from CalibTracker.SiPixelESProducers.SiPixelTemplateDBObjectESProducer_cfi import * from CalibTracker.SiPixelESProducers.SiPixel2DTemplateDBObjectESProducer_cfi import * - -from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker -phase2_tracker.toModify(PixelCPEFastESProducer, isPhase2 = True) diff --git a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEGeneric_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEGeneric_cfi.py index 43a61651c53a3..370eabae2b06d 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEGeneric_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEGeneric_cfi.py @@ -11,13 +11,13 @@ # customize the Pixel CPE generic producer for phase2 from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker phase2_tracker.toModify(PixelCPEGenericESProducer, - UseErrorsFromTemplates = True, - LoadTemplatesFromDB = True, + UseErrorsFromTemplates = True, + LoadTemplatesFromDB = True, NoTemplateErrorsWhenNoTrkAngles = True, TruncatePixelCharge = False, IrradiationBiasCorrection = False, # set IBC off DoCosmics = False, - Upgrade = True, # use 'upgrade' version of hardcoded CPE errors + isPhase2 = True, # use 'Phase2' version of hardcoded CPE errors xerr_barrel_ln = [0.00025, 0.00030, 0.00035, 0.00035], xerr_barrel_ln_def = 0.00035, yerr_barrel_ln = [0.00210, 0.00115, 0.00125], diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index 4af0238682abb..ec3e068bca422 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -16,16 +16,15 @@ ) ) -# convert the pixel rechits from legacy to SoA format -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy as _siPixelRecHitsPreSplittingSoA -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromCUDA_cfi import siPixelRecHitSoAFromCUDA as _siPixelRecHitSoAFromCUDA - -siPixelRecHitsPreSplittingCPU = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True) - # phase 2 tracker modifier from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker -phase2_tracker.toModify(siPixelRecHitsPreSplittingCPU, - isPhase2 = True) + +# convert the pixel rechits from legacy to SoA format on CPU +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacyPhase1_cfi import siPixelRecHitSoAFromLegacyPhase1 as _siPixelRecHitsPreSplittingSoA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacyPhase2_cfi import siPixelRecHitSoAFromLegacyPhase2 as _siPixelRecHitsPreSplittingSoAPhase2 + +siPixelRecHitsPreSplittingCPU = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True) +phase2_tracker.toReplaceWith(siPixelRecHitsPreSplittingCPU, _siPixelRecHitsPreSplittingSoAPhase2.clone(convertToLegacy=True, CPE = cms.string('PixelCPEFastPhase2'))) # modifier 
used to prompt patatrack pixel tracks reconstruction on cpu from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit @@ -45,25 +44,44 @@ # reconstruct the pixel rechits on the gpu from RecoLocalTracker.SiPixelRecHits.siPixelRecHitCUDA_cfi import siPixelRecHitCUDA as _siPixelRecHitCUDA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitCUDAPhase2_cfi import siPixelRecHitCUDAPhase2 as _siPixelRecHitCUDAPhase2 siPixelRecHitsPreSplittingCUDA = _siPixelRecHitCUDA.clone( beamSpot = "offlineBeamSpotToCUDA" ) - -# transfer the pixel rechits to the host and convert them from SoA -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDA_cfi import siPixelRecHitFromCUDA as _siPixelRecHitFromCUDA +phase2_tracker.toReplaceWith(siPixelRecHitsPreSplittingCUDA,_siPixelRecHitCUDAPhase2.clone( + beamSpot = "offlineBeamSpotToCUDA" +)) #this is an alias for the SoA on GPU or CPU to be used for DQM siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")), + cms.PSet(type = cms.string("pixelTopologyPhase1TrackingRecHit2DCPUT")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) -(gpu & pixelNtupletFit).toModify(siPixelRecHitsPreSplittingSoA,cuda = _siPixelRecHitSoAFromCUDA.clone()) +phase2_tracker.toModify(siPixelRecHitsPreSplittingSoA, +cpu = cms.EDAlias( + siPixelRecHitsPreSplittingCPU = cms.VPSet( + cms.PSet(type = cms.string("pixelTopologyPhase2TrackingRecHit2DCPUT")), + cms.PSet(type = cms.string("uintAsHostProduct")) + ))) + +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromCUDAPhase1_cfi import siPixelRecHitSoAFromCUDAPhase1 as _siPixelRecHitSoAFromCUDA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromCUDAPhase2_cfi import siPixelRecHitSoAFromCUDAPhase2 as _siPixelRecHitSoAFromCUDAPhase2 + +(gpu & pixelNtupletFit).toModify(siPixelRecHitsPreSplittingSoA, cuda = _siPixelRecHitSoAFromCUDA.clone()) +(gpu & pixelNtupletFit & phase2_tracker).toModify(siPixelRecHitsPreSplittingSoA, cuda = _siPixelRecHitSoAFromCUDAPhase2.clone()) + +# transfer the pixel rechits to the host and convert them from SoA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDAPhase1_cfi import siPixelRecHitFromCUDAPhase1 as _siPixelRecHitFromCUDA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDAPhase2_cfi import siPixelRecHitFromCUDAPhase2 as _siPixelRecHitFromCUDAPhase2 (gpu & pixelNtupletFit).toModify(siPixelRecHitsPreSplitting, cuda = _siPixelRecHitFromCUDA.clone()) +(gpu & pixelNtupletFit & phase2_tracker).toModify(siPixelRecHitsPreSplitting, cuda = _siPixelRecHitFromCUDAPhase2.clone()) + + pixelNtupletFit.toReplaceWith(siPixelRecHitsPreSplittingTask, cms.Task( cms.Task( @@ -76,6 +94,9 @@ ) ) + +#(gpu & pixelNtupletFit & phase2_tracker).toReplaceWith(siPixelRecHitsPreSplitting , cuda = _siPixelRecHitFromCUDAPhase2.clone()) + (gpu & pixelNtupletFit).toReplaceWith(siPixelRecHitsPreSplittingTask, cms.Task( # reconstruct the pixel rechits on the gpu or on the cpu # (normally only one of the two is run because only one is consumed from later stages) diff --git a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc index 4b26153cc72c1..9e30bfe50a1ce 100644 --- a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc +++ b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc @@ -20,13 +20,14 @@ namespace { //----------------------------------------------------------------------------- //! 
The constructor. //----------------------------------------------------------------------------- -PixelCPEFast::PixelCPEFast(edm::ParameterSet const& conf, - const MagneticField* mag, - const TrackerGeometry& geom, - const TrackerTopology& ttopo, - const SiPixelLorentzAngle* lorentzAngle, - const SiPixelGenErrorDBObject* genErrorDBObject, - const SiPixelLorentzAngle* lorentzAngleWidth) +template +PixelCPEFast::PixelCPEFast(edm::ParameterSet const& conf, + const MagneticField* mag, + const TrackerGeometry& geom, + const TrackerTopology& ttopo, + const SiPixelLorentzAngle* lorentzAngle, + const SiPixelGenErrorDBObject* genErrorDBObject, + const SiPixelLorentzAngle* lorentzAngleWidth) : PixelCPEGenericBase(conf, mag, geom, ttopo, lorentzAngle, genErrorDBObject, lorentzAngleWidth) { // Use errors from templates or from GenError if (useErrorsFromTemplates_) { @@ -36,8 +37,6 @@ PixelCPEFast::PixelCPEFast(edm::ParameterSet const& conf, << (*genErrorDBObject_).version(); } - isPhase2_ = conf.getParameter("isPhase2"); - fillParamsForGpu(); cpuData_ = { @@ -48,18 +47,23 @@ PixelCPEFast::PixelCPEFast(edm::ParameterSet const& conf, }; } -const pixelCPEforGPU::ParamsOnGPU* PixelCPEFast::getGPUProductAsync(cudaStream_t cudaStream) const { +template +const pixelCPEforGPU::ParamsOnGPUT* PixelCPEFast::getGPUProductAsync( + cudaStream_t cudaStream) const { + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + using LayerGeometry = pixelCPEforGPU::LayerGeometryT; + using AverageGeometry = pixelTopology::AverageGeometryT; + const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) { // and now copy to device... cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_commonParams, sizeof(pixelCPEforGPU::CommonParams))); cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_detParams, this->detParamsGPU_.size() * sizeof(pixelCPEforGPU::DetParams))); - cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry))); - cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry))); - cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_d, sizeof(pixelCPEforGPU::ParamsOnGPU))); - cudaCheck(cudaMemcpyAsync( - data.paramsOnGPU_d, &data.paramsOnGPU_h, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream)); + cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_averageGeometry, sizeof(AverageGeometry))); + cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_layerGeometry, sizeof(LayerGeometry))); + cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_d, sizeof(ParamsOnGPU))); + cudaCheck(cudaMemcpyAsync(data.paramsOnGPU_d, &data.paramsOnGPU_h, sizeof(ParamsOnGPU), cudaMemcpyDefault, stream)); cudaCheck(cudaMemcpyAsync((void*)data.paramsOnGPU_h.m_commonParams, &this->commonParamsGPU_, sizeof(pixelCPEforGPU::CommonParams), @@ -67,12 +71,12 @@ const pixelCPEforGPU::ParamsOnGPU* PixelCPEFast::getGPUProductAsync(cudaStream_t stream)); cudaCheck(cudaMemcpyAsync((void*)data.paramsOnGPU_h.m_averageGeometry, &this->averageGeometry_, - sizeof(pixelCPEforGPU::AverageGeometry), + sizeof(AverageGeometry), cudaMemcpyDefault, stream)); cudaCheck(cudaMemcpyAsync((void*)data.paramsOnGPU_h.m_layerGeometry, &this->layerGeometry_, - sizeof(pixelCPEforGPU::LayerGeometry), + sizeof(LayerGeometry), cudaMemcpyDefault, stream)); cudaCheck(cudaMemcpyAsync((void*)data.paramsOnGPU_h.m_detParams, @@ -84,7 +88,8 @@ const pixelCPEforGPU::ParamsOnGPU* PixelCPEFast::getGPUProductAsync(cudaStream_t return data.paramsOnGPU_d; } 
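// The allocation dance above follows one pattern: the host keeps a mirror of
// ParamsOnGPU whose pointer members already hold device addresses, and the
// mirror itself is then copied to the device. A condensed sketch of the idiom,
// with stand-in names (Params, uploadParams) and no error checking; the real
// code keeps the mirror alive in the cached ESProduct rather than on the stack:

struct Params {
  const float* coeffs;  // device pointer, dereferenced only in kernels
};

Params* uploadParams(const float* hostCoeffs, int nCoeffs, cudaStream_t stream) {
  Params hostMirror;  // host object holding device pointers
  Params* deviceCopy = nullptr;
  cudaMalloc((void**)&hostMirror.coeffs, nCoeffs * sizeof(float));
  cudaMalloc((void**)&deviceCopy, sizeof(Params));
  // the pointer values are final right after cudaMalloc, so the mirror can be
  // shipped before its pointees are filled; the stream orders the copies
  cudaMemcpyAsync(deviceCopy, &hostMirror, sizeof(Params), cudaMemcpyDefault, stream);
  cudaMemcpyAsync((void*)hostMirror.coeffs, hostCoeffs, nCoeffs * sizeof(float), cudaMemcpyDefault, stream);
  return deviceCopy;
}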
-void PixelCPEFast::fillParamsForGpu() {
+template <typename TrackerTraits>
+void PixelCPEFast<TrackerTraits>::fillParamsForGpu() {
   //
   // this code executes only once per job, computation inefficiency is not an issue
   // many code blocks are repeated: better keep the computation local and self consistent as blocks may in future move around, be deleted ...
   //
@@ -95,15 +100,13 @@ void PixelCPEFast::fillParamsForGpu() {
   commonParamsGPU_.thePitchX = m_DetParams[0].thePitchX;
   commonParamsGPU_.thePitchY = m_DetParams[0].thePitchY;

-  commonParamsGPU_.numberOfLaddersInBarrel =
-      isPhase2_ ? phase2PixelTopology::numberOfLaddersInBarrel : phase1PixelTopology::numberOfLaddersInBarrel;
-  commonParamsGPU_.isPhase2 = isPhase2_;
+  commonParamsGPU_.numberOfLaddersInBarrel = TrackerTraits::numberOfLaddersInBarrel;

   LogDebug("PixelCPEFast") << "pitch & thickness " << commonParamsGPU_.thePitchX << ' ' << commonParamsGPU_.thePitchY
                            << " " << commonParamsGPU_.theThicknessB << ' ' << commonParamsGPU_.theThicknessE;

   // zero average geometry
-  memset(&averageGeometry_, 0, sizeof(pixelCPEforGPU::AverageGeometry));
+  memset(&averageGeometry_, 0, sizeof(pixelTopology::AverageGeometryT<TrackerTraits>));

   uint32_t oldLayer = 0;
   uint32_t oldLadder = 0;
@@ -118,22 +121,12 @@ void PixelCPEFast::fillParamsForGpu() {
     auto& p = m_DetParams[i];
     auto& g = detParamsGPU_[i];

-    if (!isPhase2_) {
-      g.nRowsRoc = phase1PixelTopology::numRowsInRoc;
-      g.nColsRoc = phase1PixelTopology::numColsInRoc;
-      g.nRows = phase1PixelTopology::numRowsInModule;
-      g.nCols = phase1PixelTopology::numColsInModule;
-
-      g.numPixsInModule = g.nRows * g.nCols;
+    g.nRowsRoc = p.theDet->specificTopology().rowsperroc();
+    g.nColsRoc = p.theDet->specificTopology().colsperroc();
+    g.nRows = p.theDet->specificTopology().rocsX() * g.nRowsRoc;
+    g.nCols = p.theDet->specificTopology().rocsY() * g.nColsRoc;

-    } else {
-      g.nRowsRoc = p.theDet->specificTopology().rowsperroc();
-      g.nColsRoc = p.theDet->specificTopology().colsperroc();
-      g.nRows = p.theDet->specificTopology().rocsX() * g.nRowsRoc;
-      g.nCols = p.theDet->specificTopology().rocsY() * g.nColsRoc;
-
-      g.numPixsInModule = g.nRows * g.nCols;
-    }
+    g.numPixsInModule = g.nRows * g.nCols;

     assert(p.theDet->index() == int(i));
     assert(commonParamsGPU_.thePitchY == p.thePitchY);
@@ -164,7 +157,7 @@ void PixelCPEFast::fillParamsForGpu() {
       rl = 0;
       zl = 0;
       pl = 0;
-      miz = isPhase2_ ? 500 : 90;
+      miz = 500;
       mxz = 0;
       nl++;
     }
@@ -213,10 +206,7 @@ void PixelCPEFast::fillParamsForGpu() {
       cp.cotalpha = gvx * gvz;
       cp.cotbeta = gvy * gvz;

-      if (!isPhase2_)
-        errorFromTemplates(p, cp, 20000.);
-      else
-        cp.qBin_ = 0.f;
+      errorFromTemplates(p, cp, 20000.);
     }

 #ifdef EDM_ML_DEBUG
@@ -234,8 +224,11 @@ void PixelCPEFast::fillParamsForGpu() {
     g.sy1 = std::max(21, toMicron(cp.sy1));  // for some angles sy1 is very small
     g.sy2 = std::max(55, toMicron(cp.sy2));  // sometimes sy2 is smaller than others (due to angle?)
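// As a consistency check of the offset formula introduced just below: with the
// Phase1 numbers (g.nRows = 160 and bigPixXCorrection = 1) one gets
//   moduleOffsetX = -(0.5 * 160 + 1) = -81,
// which reproduces the hard-coded phase1PixelTopology::xOffset = -81 that the
// removed code multiplied by the pitch.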
-  // sample xerr as function of position
-  auto const xoff = float(phase1PixelTopology::xOffset) * commonParamsGPU_.thePitchX;
+  // sample xerr as function of position
+  // moduleOffsetX is the definition of TrackerTraits::xOffset;
+  // it needs to be calculated here because for Phase2 the modules are not uniform
+  float moduleOffsetX = -(0.5f * float(g.nRows) + TrackerTraits::bigPixXCorrection);
+  auto const xoff = moduleOffsetX * commonParamsGPU_.thePitchX;

   for (int ix = 0; ix < CPEFastParametrisation::kNumErrorBins; ++ix) {
     auto x = xoff * (1.f - (0.5f + float(ix)) / 8.f);
@@ -252,7 +245,10 @@ }
 #ifdef EDM_ML_DEBUG
   // sample yerr as function of position
-  auto const yoff = float(phase1PixelTopology::yOffset) * commonParamsGPU_.thePitchY;
+  // moduleOffsetY is the definition of the former TrackerTraits::yOffset (now removed)
+  float moduleOffsetY = 0.5f * float(g.nCols) + TrackerTraits::bigPixYCorrection;
+  auto const yoff = -moduleOffsetY * commonParamsGPU_.thePitchY;
+
   for (int ix = 0; ix < CPEFastParametrisation::kNumErrorBins; ++ix) {
     auto y = yoff * (1.f - (0.5f + float(ix)) / 8.f);
     auto gvx = p.theOrigin.x() + 40.f * commonParamsGPU_.thePitchY;
@@ -320,14 +316,14 @@ }
   }
   }  // loop over det

-  const int numberOfModulesInLadder =
-      isPhase2_ ? int(phase2PixelTopology::numberOfModulesInLadder) : int(phase1PixelTopology::numberOfModulesInLadder);
-  const int numberOfModulesInBarrel =
-      isPhase2_ ? int(phase2PixelTopology::numberOfModulesInBarrel) : int(phase1PixelTopology::numberOfModulesInBarrel);
-  const int numberOfLaddersInBarrel = commonParamsGPU_.numberOfLaddersInBarrel;
+  constexpr int numberOfModulesInLadder = TrackerTraits::numberOfModulesInLadder;
+  constexpr int numberOfLaddersInBarrel = TrackerTraits::numberOfLaddersInBarrel;
+  constexpr int numberOfModulesInBarrel = TrackerTraits::numberOfModulesInBarrel;
+
+  constexpr float ladderFactor = 1.f / float(numberOfModulesInLadder);

-  const int firstEndcapPos = 4, firstEndcapNeg = isPhase2_ ? 16 : 7;
-  const float ladderFactor = 1.f / float(numberOfModulesInLadder);
+  constexpr int firstEndcapPos = TrackerTraits::firstEndcapPos;
+  constexpr int firstEndcapNeg = TrackerTraits::firstEndcapNeg;

   // compute ladder barycenter (only in global z) for the barrel
   //
@@ -347,44 +343,25 @@ }
   assert(il + 1 == int(numberOfLaddersInBarrel));

   // add half_module and tolerance
-  const float module_length = isPhase2_ ? 4.345f : 6.7f;
+  constexpr float moduleLength = TrackerTraits::moduleLength;
   constexpr float module_tolerance = 0.2f;
   for (int il = 0, nl = numberOfLaddersInBarrel; il < nl; ++il) {
-    aveGeom.ladderMinZ[il] -= (0.5f * module_length - module_tolerance);
-    aveGeom.ladderMaxZ[il] += (0.5f * module_length - module_tolerance);
+    aveGeom.ladderMinZ[il] -= (0.5f * moduleLength - module_tolerance);
+    aveGeom.ladderMaxZ[il] += (0.5f * moduleLength - module_tolerance);
   }

   // compute "max z" for first layer in endcap (should we restrict to the outermost ring?)
- if (!isPhase2_) { - for (auto im = phase1PixelTopology::layerStart[firstEndcapPos]; - im < phase1PixelTopology::layerStart[firstEndcapPos + 1]; - ++im) { - auto const& g = detParamsGPU_[im]; - aveGeom.endCapZ[0] = std::max(aveGeom.endCapZ[0], g.frame.z()); - } - for (auto im = phase1PixelTopology::layerStart[firstEndcapNeg]; - im < phase1PixelTopology::layerStart[firstEndcapNeg + 1]; - ++im) { - auto const& g = detParamsGPU_[im]; - aveGeom.endCapZ[1] = std::min(aveGeom.endCapZ[1], g.frame.z()); - } - // correct for outer ring being closer - aveGeom.endCapZ[0] -= 1.5f; - aveGeom.endCapZ[1] += 1.5f; - } else { - for (auto im = phase2PixelTopology::layerStart[firstEndcapPos]; - im < phase2PixelTopology::layerStart[firstEndcapPos + 1]; - ++im) { - auto const& g = detParamsGPU_[im]; - aveGeom.endCapZ[0] = std::max(aveGeom.endCapZ[0], g.frame.z()); - } - for (auto im = phase2PixelTopology::layerStart[firstEndcapNeg]; - im < phase2PixelTopology::layerStart[firstEndcapNeg + 1]; - ++im) { - auto const& g = detParamsGPU_[im]; - aveGeom.endCapZ[1] = std::min(aveGeom.endCapZ[1], g.frame.z()); - } + for (auto im = TrackerTraits::layerStart[firstEndcapPos]; im < TrackerTraits::layerStart[firstEndcapPos + 1]; ++im) { + auto const& g = detParamsGPU_[im]; + aveGeom.endCapZ[0] = std::max(aveGeom.endCapZ[0], g.frame.z()); + } + for (auto im = TrackerTraits::layerStart[firstEndcapNeg]; im < TrackerTraits::layerStart[firstEndcapNeg + 1]; ++im) { + auto const& g = detParamsGPU_[im]; + aveGeom.endCapZ[1] = std::min(aveGeom.endCapZ[1], g.frame.z()); } + // correct for outer ring being closer + aveGeom.endCapZ[0] -= TrackerTraits::endcapCorrection; + aveGeom.endCapZ[1] += TrackerTraits::endcapCorrection; #ifdef EDM_ML_DEBUG for (int jl = 0, nl = numberOfLaddersInBarrel; jl < nl; ++jl) { LogDebug("PixelCPEFast") << jl << ':' << aveGeom.ladderR[jl] << '/' @@ -397,19 +374,16 @@ void PixelCPEFast::fillParamsForGpu() { #endif // EDM_ML_DEBUG // fill Layer and ladders geometry - memset(&layerGeometry_, 0, sizeof(pixelCPEforGPU::LayerGeometry)); - if (!isPhase2_) { - memcpy(layerGeometry_.layerStart, phase1PixelTopology::layerStart, sizeof(phase1PixelTopology::layerStart)); - memcpy(layerGeometry_.layer, phase1PixelTopology::layer.data(), phase1PixelTopology::layer.size()); - layerGeometry_.maxModuleStride = phase1PixelTopology::maxModuleStride; - } else { - memcpy(layerGeometry_.layerStart, phase2PixelTopology::layerStart, sizeof(phase2PixelTopology::layerStart)); - memcpy(layerGeometry_.layer, phase2PixelTopology::layer.data(), phase2PixelTopology::layer.size()); - layerGeometry_.maxModuleStride = phase2PixelTopology::maxModuleStride; - } + memset(&layerGeometry_, 0, sizeof(pixelCPEforGPU::LayerGeometryT)); + memcpy(layerGeometry_.layerStart, + TrackerTraits::layerStart, + sizeof(pixelCPEforGPU::LayerGeometryT::layerStart)); + memcpy(layerGeometry_.layer, pixelTopology::layer.data(), pixelTopology::layer.size()); + layerGeometry_.maxModuleStride = pixelTopology::maxModuleStride; } -PixelCPEFast::GPUData::~GPUData() { +template +PixelCPEFast::GPUData::~GPUData() { if (paramsOnGPU_d != nullptr) { cudaFree((void*)paramsOnGPU_h.m_commonParams); cudaFree((void*)paramsOnGPU_h.m_detParams); @@ -419,9 +393,10 @@ PixelCPEFast::GPUData::~GPUData() { } } -void PixelCPEFast::errorFromTemplates(DetParam const& theDetParam, - ClusterParamGeneric& theClusterParam, - float qclus) const { +template +void PixelCPEFast::errorFromTemplates(DetParam const& theDetParam, + ClusterParamGeneric& theClusterParam, + float qclus) const { float 
locBz = theDetParam.bz; float locBx = theDetParam.bx; LogDebug("PixelCPEFast") << "PixelCPEFast::localPosition(...) : locBz = " << locBz; @@ -470,12 +445,21 @@ void PixelCPEFast::errorFromTemplates(DetParam const& theDetParam, theClusterParam.sy2 = theClusterParam.sy2 * micronsToCm; } +template <> +void PixelCPEFast::errorFromTemplates(DetParam const& theDetParam, + ClusterParamGeneric& theClusterParam, + float qclus) const { + theClusterParam.qBin_ = 0.0f; +} + //----------------------------------------------------------------------------- //! Hit position in the local frame (in cm). Unlike other CPE's, this //! one converts everything from the measurement frame (in channel numbers) //! into the local frame (in centimeters). //----------------------------------------------------------------------------- -LocalPoint PixelCPEFast::localPosition(DetParam const& theDetParam, ClusterParam& theClusterParamBase) const { +template +LocalPoint PixelCPEFast::localPosition(DetParam const& theDetParam, + ClusterParam& theClusterParamBase) const { ClusterParamGeneric& theClusterParam = static_cast(theClusterParamBase); assert(!theClusterParam.with_track_angle); @@ -508,12 +492,12 @@ LocalPoint PixelCPEFast::localPosition(DetParam const& theDetParam, ClusterParam cp.charge[0] = theClusterParam.theCluster->charge(); auto ind = theDetParam.theDet->index(); - pixelCPEforGPU::position(commonParamsGPU_, detParamsGPU_[ind], cp, 0); + pixelCPEforGPU::position(commonParamsGPU_, detParamsGPU_[ind], cp, 0); auto xPos = cp.xpos[0]; auto yPos = cp.ypos[0]; // set the error (mind ape....) - pixelCPEforGPU::errorFromDB(commonParamsGPU_, detParamsGPU_[ind], cp, 0); + pixelCPEforGPU::errorFromDB(commonParamsGPU_, detParamsGPU_[ind], cp, 0); theClusterParam.sigmax = cp.xerr[0]; theClusterParam.sigmay = cp.yerr[0]; @@ -530,7 +514,9 @@ LocalPoint PixelCPEFast::localPosition(DetParam const& theDetParam, ClusterParam //------------------------------------------------------------------------- // Hit error in the local frame //------------------------------------------------------------------------- -LocalError PixelCPEFast::localError(DetParam const& theDetParam, ClusterParam& theClusterParamBase) const { +template +LocalError PixelCPEFast::localError(DetParam const& theDetParam, + ClusterParam& theClusterParamBase) const { ClusterParamGeneric& theClusterParam = static_cast(theClusterParamBase); auto xerr = theClusterParam.sigmax; @@ -544,8 +530,11 @@ LocalError PixelCPEFast::localError(DetParam const& theDetParam, ClusterParam& t return LocalError(xerr_sq, 0, yerr_sq); } -void PixelCPEFast::fillPSetDescription(edm::ParameterSetDescription& desc) { +template +void PixelCPEFast::fillPSetDescription(edm::ParameterSetDescription& desc) { // call PixelCPEGenericBase fillPSetDescription to add common rechit errors PixelCPEGenericBase::fillPSetDescription(desc); - desc.add("isPhase2", false); } + +template class PixelCPEFast; +template class PixelCPEFast; diff --git a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEGeneric.cc b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEGeneric.cc index efffc63015b45..707b2c15d79c6 100644 --- a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEGeneric.cc +++ b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEGeneric.cc @@ -54,8 +54,7 @@ PixelCPEGeneric::PixelCPEGeneric(edm::ParameterSet const& conf, IrradiationBiasCorrection_ = conf.getParameter("IrradiationBiasCorrection"); DoCosmics_ = conf.getParameter("DoCosmics"); - // Upgrade means phase 2 - isPhase2_ = conf.getParameter("Upgrade"); + isPhase2_ = 
conf.getParameter("isPhase2"); // For cosmics force the use of simple errors if ((DoCosmics_)) @@ -450,6 +449,6 @@ void PixelCPEGeneric::fillPSetDescription(edm::ParameterSetDescription& desc) { desc.add("TruncatePixelCharge", true); desc.add("IrradiationBiasCorrection", false); desc.add("DoCosmics", false); - desc.add("Upgrade", false); + desc.add("isPhase2", false); desc.add("SmallPitch", false); } diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py index 6954b536aba1f..e941ffb207fce 100644 --- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py +++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py @@ -14,7 +14,9 @@ from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker # build the pixel vertices in SoA format on the CPU -from RecoPixelVertexing.PixelVertexFinding.pixelVerticesCUDA_cfi import pixelVerticesCUDA as _pixelVerticesCUDA +from RecoPixelVertexing.PixelVertexFinding.pixelVertexProducerCUDAPhase1_cfi import pixelVertexProducerCUDAPhase1 as _pixelVerticesCUDA +from RecoPixelVertexing.PixelVertexFinding.pixelVertexProducerCUDAPhase2_cfi import pixelVertexProducerCUDAPhase2 as _pixelVerticesCUDAPhase2 + pixelVerticesSoA = SwitchProducerCUDA( cpu = _pixelVerticesCUDA.clone( pixelTrackSrc = "pixelTracksSoA", @@ -22,13 +24,20 @@ ) ) +phase2_tracker.toModify(pixelVerticesSoA,cpu = _pixelVerticesCUDAPhase2.clone( + pixelTrackSrc = "pixelTracksSoA", + onGPU = False, + PtMin = 2.0 +)) + # convert the pixel vertices from SoA to legacy format from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA -(pixelNtupletFit & ~phase2_tracker).toReplaceWith(pixelVertices, _pixelVertexFromSoA.clone( + +(pixelNtupletFit).toReplaceWith(pixelVertices, _pixelVertexFromSoA.clone( src = "pixelVerticesSoA" )) -(pixelNtupletFit & ~phase2_tracker).toReplaceWith(pixelVerticesTask, cms.Task( +(pixelNtupletFit).toReplaceWith(pixelVerticesTask, cms.Task( # build the pixel vertices in SoA format on the CPU pixelVerticesSoA, # convert the pixel vertices from SoA to legacy format @@ -45,6 +54,12 @@ onGPU = True ) +phase2_tracker.toReplaceWith(pixelVerticesCUDA,_pixelVerticesCUDAPhase2.clone( + pixelTrackSrc = "pixelTracksCUDA", + onGPU = True, + PtMin = 2.0 +)) + # transfer the pixel vertices in SoA format to the CPU from RecoPixelVertexing.PixelVertexFinding.pixelVerticesSoA_cfi import pixelVerticesSoA as _pixelVerticesSoA gpu.toModify(pixelVerticesSoA, diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForTriplets.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForTriplets.py index 51abcd3ea7982..141a999e4979f 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForTriplets.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForTriplets.py @@ -3,8 +3,10 @@ def customizePixelTracksForTriplets(process): from HLTrigger.Configuration.common import producers_by_type - for producer in producers_by_type(process, 'CAHitNtupletCUDA'): - producer.includeJumpingForwardDoublets = True - producer.minHitsPerNtuplet = 3 + producers = ['CAHitNtupletCUDA','CAHitNtupletCUDAPhase1','CAHitNtupletCUDAPhase2'] + for name in producers: + for producer in producers_by_type(process, name): + producer.includeJumpingForwardDoublets = True + producer.minHitsPerNtuplet = 3 return process diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc 
b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index f3d6022e21654..ef73c625ebfa8 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -20,10 +20,12 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" -class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { +template +class PixelTrackDumpCUDAT : public edm::global::EDAnalyzer<> { public: - explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig); - ~PixelTrackDumpCUDA() override = default; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + explicit PixelTrackDumpCUDAT(const edm::ParameterSet& iConfig); + ~PixelTrackDumpCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -36,7 +38,8 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { edm::EDGetTokenT tokenSoAVertex_; }; -PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) +template +PixelTrackDumpCUDAT::PixelTrackDumpCUDAT(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { tokenGPUTrack_ = @@ -44,23 +47,25 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) tokenGPUVertex_ = consumes>(iConfig.getParameter("pixelVertexSrc")); } else { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } -void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void PixelTrackDumpCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("onGPU", true); desc.add("pixelTrackSrc", edm::InputTag("pixelTracksCUDA")); desc.add("pixelVertexSrc", edm::InputTag("pixelVerticesCUDA")); - descriptions.add("pixelTrackDumpCUDA", desc); + descriptions.addWithDefaultLabel(desc); } -void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, - edm::Event const& iEvent, - const edm::EventSetup& iSetup) const { +template +void PixelTrackDumpCUDAT::analyze(edm::StreamID streamID, + edm::Event const& iEvent, + const edm::EventSetup& iSetup) const { if (m_onGPU) { auto const& hTracks = iEvent.get(tokenGPUTrack_); cms::cuda::ScopedContextProduce ctx{hTracks}; @@ -82,4 +87,11 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, } } +using PixelTrackDumpCUDA = PixelTrackDumpCUDAT; DEFINE_FWK_MODULE(PixelTrackDumpCUDA); + +using PixelTrackDumpCUDAPhase1 = PixelTrackDumpCUDAT; +DEFINE_FWK_MODULE(PixelTrackDumpCUDAPhase1); + +using PixelTrackDumpCUDAPhase2 = PixelTrackDumpCUDAT; +DEFINE_FWK_MODULE(PixelTrackDumpCUDAPhase2); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 59ba877e9e626..6a0f918b0d979 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -29,20 +29,24 @@ #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "storeTracks.h" 
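The pattern just applied to PixelTrackDumpCUDA, and repeated for every plugin touched by this PR, is worth spelling out: the concrete module becomes a class template over a TrackerTraits type, and each instantiation is registered with the framework under its own module label. A minimal sketch under stated assumptions — the trait names and constants below are illustrative stand-ins for the pixelTopology::Phase1/Phase2 structs in SimplePixelTopology.h, and DEFINE_FWK_MODULE is only quoted in comments:

```cpp
#include <cstdint>

// Hypothetical stand-ins for pixelTopology::Phase1 / pixelTopology::Phase2;
// the real traits also carry layer tables, hit-index types, and cut defaults.
struct Phase1TraitsSketch {
  static constexpr int32_t maxNumberOfTuples = 32 * 1024;  // illustrative value
};
struct Phase2TraitsSketch {
  static constexpr int32_t maxNumberOfTuples = 48 * 1024;  // illustrative value
};

// One template replaces the old concrete plugin: every phase-dependent size or
// layer table is read from the traits instead of an isPhase2_ flag or #ifdef.
template <typename TrackerTraits>
class PixelPluginSketchT {
public:
  static constexpr int32_t maxTuples = TrackerTraits::maxNumberOfTuples;
};

// Each alias then becomes an independent framework module, as in the hunks above:
//   using PixelPluginSketchPhase1 = PixelPluginSketchT<Phase1TraitsSketch>;
//   DEFINE_FWK_MODULE(PixelPluginSketchPhase1);
//   using PixelPluginSketchPhase2 = PixelPluginSketchT<Phase2TraitsSketch>;
//   DEFINE_FWK_MODULE(PixelPluginSketchPhase2);
```

Because fillDescriptions now calls descriptions.addWithDefaultLabel(desc), each registration also generates its own cfi file (pixelTrackDumpCUDAPhase1_cfi and so on), which is what the Python fragments later in this diff import.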
#include "CUDADataFormats/Common/interface/HostProduct.h" /** * This class creates "leagcy" reco::Track - * objects from the output of SoA CA. + * objects from the output of SoA CA. */ -class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { +template +class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + public: - using IndToEdm = std::vector; + using IndToEdm = std::vector; - explicit PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig); - ~PixelTrackProducerFromSoA() override = default; + explicit PixelTrackProducerFromSoAT(const edm::ParameterSet &iConfig); + ~PixelTrackProducerFromSoAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); @@ -65,9 +69,10 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { pixelTrack::Quality const minQuality_; }; -PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) +template +PixelTrackProducerFromSoAT::PixelTrackProducerFromSoAT(const edm::ParameterSet &iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), idealMagneticFieldToken_(esConsumes()), @@ -91,7 +96,8 @@ PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iC produces(); } -void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { +template +void PixelTrackProducerFromSoAT::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); desc.add("trackSrc", edm::InputTag("pixelTracksSoA")); @@ -101,9 +107,10 @@ void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions descriptions.addWithDefaultLabel(desc); } -void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, - edm::Event &iEvent, - const edm::EventSetup &iSetup) const { +template +void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, + edm::Event &iEvent, + const edm::EventSetup &iSetup) const { // enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity }; reco::TrackBase::TrackQuality recoQuality[] = {reco::TrackBase::undefQuality, reco::TrackBase::undefQuality, @@ -175,9 +182,10 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, auto nHits = tsoa.nHits(it); assert(nHits >= 3); auto q = quality[it]; + if (q < minQuality_) continue; - if (nHits < minNumberOfHits_) + if (tsoa.nLayers(it) < minNumberOfHits_) continue; indToEdm[it] = nt; ++nt; @@ -244,4 +252,11 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, iEvent.put(std::move(indToEdmP)); } +using PixelTrackProducerFromSoA = PixelTrackProducerFromSoAT; DEFINE_FWK_MODULE(PixelTrackProducerFromSoA); + +using PixelTrackProducerFromSoAPhase1 = PixelTrackProducerFromSoAT; +DEFINE_FWK_MODULE(PixelTrackProducerFromSoAPhase1); + +using PixelTrackProducerFromSoAPhase2 = PixelTrackProducerFromSoAT; +DEFINE_FWK_MODULE(PixelTrackProducerFromSoAPhase2); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 5cf4aac491901..0675effd091e8 100644 --- 
a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -19,10 +19,14 @@ // Switch on to enable checks and printout for found tracks // #define PIXEL_DEBUG_PRODUCE -class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { +template +class PixelTrackSoAFromCUDAT : public edm::stream::EDProducer { + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoA = pixelTrack::TrackSoAT; + public: - explicit PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig); - ~PixelTrackSoAFromCUDA() override = default; + explicit PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig); + ~PixelTrackSoAFromCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -35,23 +39,26 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + cms::cuda::host::unique_ptr soa_; }; -PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) +template +PixelTrackSoAFromCUDAT::PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig) : tokenCUDA_(consumes>(iConfig.getParameter("src"))), tokenSOA_(produces()) {} -void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void PixelTrackSoAFromCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("src", edm::InputTag("pixelTracksCUDA")); - descriptions.add("pixelTracksSoA", desc); + descriptions.addWithDefaultLabel(desc); } -void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { +template +void PixelTrackSoAFromCUDAT::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); @@ -59,10 +66,11 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, soa_ = inputData.toHostAsync(ctx.stream()); } -void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { - // check that the fixed-size SoA does not overflow +template +void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { auto const& tsoa = *soa_; auto maxTracks = tsoa.stride(); + auto nTracks = tsoa.nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { @@ -91,4 +99,11 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i assert(!soa_); } +using PixelTrackSoAFromCUDA = PixelTrackSoAFromCUDAT; DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); + +using PixelTrackSoAFromCUDAPhase1 = PixelTrackSoAFromCUDAT; +DEFINE_FWK_MODULE(PixelTrackSoAFromCUDAPhase1); + +using PixelTrackSoAFromCUDAPhase2 = PixelTrackSoAFromCUDAT; +DEFINE_FWK_MODULE(PixelTrackSoAFromCUDAPhase2); diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index 4f0f6f93cab62..7aeb0e80c60b0 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -91,7 +91,10 @@ # "Patatrack" pixel ntuplets, fishbone cleaning, Broken Line fit, and 
density-based vertex reconstruction from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit -from RecoPixelVertexing.PixelTriplets.pixelTracksCUDA_cfi import pixelTracksCUDA as _pixelTracksCUDA +from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDAPhase1_cfi import caHitNtupletCUDAPhase1 as _pixelTracksCUDA +from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDAPhase2_cfi import caHitNtupletCUDAPhase2 as _pixelTracksCUDAPhase2 + +from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker # SwitchProducer providing the pixel tracks in SoA format on the CPU pixelTracksSoA = SwitchProducerCUDA( @@ -102,25 +105,31 @@ onGPU = False ) ) + # use quality cuts tuned for Run 2 ideal conditions for all Run 3 workflows run3_common.toModify(pixelTracksSoA.cpu, idealConditions = True ) # convert the pixel tracks from SoA to legacy format -from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA as _pixelTrackProducerFromSoA -(pixelNtupletFit & ~phase2_tracker).toReplaceWith(pixelTracks, _pixelTrackProducerFromSoA.clone( +from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoAPhase1_cfi import pixelTrackProducerFromSoAPhase1 as _pixelTrackProducerFromSoA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoAPhase2_cfi import pixelTrackProducerFromSoAPhase2 as _pixelTrackProducerFromSoAPhase2 + +pixelNtupletFit.toReplaceWith(pixelTracks, _pixelTrackProducerFromSoA.clone( pixelRecHitLegacySrc = "siPixelRecHitsPreSplitting", )) -(pixelNtupletFit & ~phase2_tracker).toReplaceWith(pixelTracksTask, cms.Task( +(pixelNtupletFit & phase2_tracker).toReplaceWith(pixelTracks, _pixelTrackProducerFromSoAPhase2.clone( + pixelRecHitLegacySrc = "siPixelRecHitsPreSplitting", +)) + +pixelNtupletFit.toReplaceWith(pixelTracksTask, cms.Task( # build the pixel ntuplets and the pixel tracks in SoA format on the GPU pixelTracksSoA, # convert the pixel tracks from SoA to legacy format pixelTracks )) - # "Patatrack" sequence running on GPU (or CPU if not available) from Configuration.ProcessModifiers.gpu_cff import gpu @@ -128,23 +137,37 @@ pixelTracksCUDA = _pixelTracksCUDA.clone( pixelRecHitSrc = "siPixelRecHitsPreSplittingCUDA", idealConditions = False, - onGPU = True + onGPU = True, ) + # use quality cuts tuned for Run 2 ideal conditions for all Run 3 workflows run3_common.toModify(pixelTracksCUDA, idealConditions = True ) # SwitchProducer providing the pixel tracks in SoA format on the CPU -from RecoPixelVertexing.PixelTrackFitting.pixelTracksSoA_cfi import pixelTracksSoA as _pixelTracksSoA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackSoAFromCUDAPhase1_cfi import pixelTrackSoAFromCUDAPhase1 as _pixelTracksSoA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackSoAFromCUDAPhase2_cfi import pixelTrackSoAFromCUDAPhase2 as _pixelTracksSoAPhase2 + gpu.toModify(pixelTracksSoA, # transfer the pixel tracks in SoA format to the host cuda = _pixelTracksSoA.clone() ) -from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker +(gpu & phase2_tracker).toModify(pixelTracksSoA,cuda = _pixelTracksSoAPhase2.clone( +)) + +phase2_tracker.toModify(pixelTracksSoA,cpu = _pixelTracksCUDAPhase2.clone( + pixelRecHitSrc = "siPixelRecHitsPreSplittingSoA", + onGPU = False +)) + +phase2_tracker.toReplaceWith(pixelTracksCUDA,_pixelTracksCUDAPhase2.clone( + pixelRecHitSrc = "siPixelRecHitsPreSplittingCUDA", + onGPU = True, +)) -(pixelNtupletFit & gpu & ~phase2_tracker).toReplaceWith(pixelTracksTask, cms.Task( 
+(pixelNtupletFit & gpu).toReplaceWith(pixelTracksTask, cms.Task( # build the pixel ntuplets and pixel tracks in SoA format on the GPU pixelTracksCUDA, # transfer the pixel tracks in SoA format to the CPU, and convert them to legacy format diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index 1523640e2ef8f..d6a9db4953be1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,6 +1,9 @@ #include "BrokenLineFitOnGPU.h" -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { +template +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples) { assert(tuples_); #ifdef BROKENLINE_DEBUG @@ -8,7 +11,7 @@ void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hi #endif // Fit internals - auto tkidGPU = std::make_unique(maxNumberOfConcurrentFits_); + auto tkidGPU = std::make_unique(maxNumberOfConcurrentFits_); auto hitsGPU = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double)); auto hits_geGPU = @@ -18,104 +21,97 @@ void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hi for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernel_BLFastFit<3>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 3, - 3, - offset); - - kernel_BLFit<3>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); + kernel_BLFastFit<3, TrackerTraits>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 3, + 3, + offset); + kernel_BLFit<3, TrackerTraits>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); if (fitNas4_) { - // fit all as 4 - kernel_BLFastFit<4>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 4, - 8, - offset); + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrack, 1>( + [this, &hv, &tkidGPU, &hitsGPU, &hits_geGPU, &fast_fit_resultsGPU, &offset](auto i) { + kernel_BLFastFit<4, TrackerTraits>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + i, + offset); + + kernel_BLFit<4, TrackerTraits>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + }); - kernel_BLFit<4>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); } else { - // fit quads - kernel_BLFastFit<4>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 4, - 4, - offset); + //Fit these using all the hits they have + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrackForFullFit, 1>( + [this, &hv, &tkidGPU, &hitsGPU, &hits_geGPU, &fast_fit_resultsGPU, &offset](auto i) { + kernel_BLFastFit(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + i, + i, + offset); + + 
kernel_BLFit(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + }); - kernel_BLFit<4>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); - // fit penta (all 5) - kernel_BLFastFit<5>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 5, - 5, - offset); + static_assert(TrackerTraits::maxHitsOnTrackForFullFit < TrackerTraits::maxHitsOnTrack); - kernel_BLFit<5>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); - // fit sexta and above (as 6) - kernel_BLFastFit<6>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 6, - 8, - offset); + //Fit all the rest using the maximum from previous call - kernel_BLFit<6>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); + kernel_BLFastFit(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + TrackerTraits::maxHitsOnTrackForFullFit, + TrackerTraits::maxHitsOnTrack - 1, + offset); + + kernel_BLFit(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } } // loop on concurrent fits } + +template class HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index d99a96b705451..b1ee028b8863e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,17 +1,19 @@ #include "BrokenLineFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, - uint32_t hitsInFit, - uint32_t maxNumberOfTuples, - cudaStream_t stream) { +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { assert(tuples_); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto tkidGPU = cms::cuda::make_device_unique(maxNumberOfConcurrentFits_, stream); + auto tkidGPU = + cms::cuda::make_device_unique(maxNumberOfConcurrentFits_, stream); auto hitsGPU = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), stream); auto hits_geGPU = cms::cuda::make_device_unique( @@ -21,112 +23,122 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernel_BLFastFit<3><<>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 3, - 3, - offset); + + kernel_BLFastFit<3, TrackerTraits><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 3, + 3, + offset); cudaCheck(cudaGetLastError()); - kernel_BLFit<3><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); + 
kernel_BLFit<3, TrackerTraits><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); if (fitNas4_) { // fit all as 4 - kernel_BLFastFit<4><<>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 4, - 8, - offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<4><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrack, 1>([this, + &hv, + &tkidGPU, + &hitsGPU, + &hits_geGPU, + &fast_fit_resultsGPU, + &offset, + &numberOfBlocks, + &blockSize, + &stream](auto i) { + kernel_BLFastFit<4, TrackerTraits><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 4, + offset); + + cudaCheck(cudaGetLastError()); + + kernel_BLFit<4, TrackerTraits><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + + cudaCheck(cudaGetLastError()); + }); + } else { - // fit quads - kernel_BLFastFit<4><<>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 4, - 4, - offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<4><<>>(tupleMultiplicity_, + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrackForFullFit, 1>([this, + &hv, + &tkidGPU, + &hitsGPU, + &hits_geGPU, + &fast_fit_resultsGPU, + &offset, + &numberOfBlocks, + &blockSize, + &stream](auto i) { + kernel_BLFastFit<<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + i, + i, + offset); + + kernel_BLFit<<<8, blockSize, 0, stream>>>(tupleMultiplicity_, bField_, outputSoa_, tkidGPU.get(), hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get()); - // fit penta (all 5) - kernel_BLFastFit<5><<>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 5, - 5, - offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<5><<<8, blockSize, 0, stream>>>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); - cudaCheck(cudaGetLastError()); - // fit sexta and above (as 6) - kernel_BLFastFit<6><<<4, blockSize, 0, stream>>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 6, - 8, - offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<6><<<4, blockSize, 0, stream>>>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); - cudaCheck(cudaGetLastError()); + }); + + static_assert(TrackerTraits::maxHitsOnTrackForFullFit < TrackerTraits::maxHitsOnTrack); + + //Fit all the rest using the maximum from previous call + kernel_BLFastFit + <<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + TrackerTraits::maxHitsOnTrackForFullFit, + TrackerTraits::maxHitsOnTrack - 1, + offset); + + kernel_BLFit + <<<8, blockSize, 0, stream>>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } } // loop on concurrent fits } + +template class 
HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 6ec6afb83cba1..4d1d57c4e27a8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -2,8 +2,8 @@ // Author: Felice Pantaleo, CERN // -// #define BROKENLINE_DEBUG - +//#define BROKENLINE_DEBUG +//#define BL_DUMP_HITS #include #include @@ -16,19 +16,25 @@ #include "HelixFitOnGPU.h" -using HitsOnGPU = TrackingRecHit2DSOAView; -using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; -using tindex_type = caConstants::tindex_type; -constexpr auto invalidTkId = std::numeric_limits::max(); +template +using HitsOnGPU = TrackingRecHit2DSOAViewT; +template +using Tuples = pixelTrack::HitContainerT; +template +using OutputSoA = pixelTrack::TrackSoAT; +template +using TupleMultiplicity = caStructures::TupleMultiplicityT; + +// using tindex_type = typename TrackerTraits::tindex_type; +// constexpr auto invalidTkId = std::numeric_limits::max(); // #define BL_DUMP_HITS -template -__global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, - caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitsOnGPU const *__restrict__ hhp, - tindex_type *__restrict__ ptkids, +template +__global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, + TupleMultiplicity const *__restrict__ tupleMultiplicity, + HitsOnGPU const *__restrict__ hhp, + typename TrackerTraits::tindex_type *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -36,6 +42,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, uint32_t nHitsH, int32_t offset) { constexpr uint32_t hitsInFit = N; + constexpr auto invalidTkId = std::numeric_limits::max(); assert(hitsInFit <= nHitsL); assert(nHitsL <= nHitsH); @@ -67,7 +74,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, } // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHitsL) + tuple_idx); - assert(tkid < foundNtuplets->nOnes()); + assert(int(tkid) < foundNtuplets->nOnes()); ptkids[local_idx] = tkid; @@ -166,29 +173,28 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, } } -template -__global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, +template +__global__ void kernel_BLFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, - OutputSoA *results, - tindex_type const *__restrict__ ptkids, + OutputSoA *results, + typename TrackerTraits::tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit) { assert(results); assert(pfast_fit); + constexpr auto invalidTkId = std::numeric_limits::max(); // same as above... 
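The riemannFit::rolling_fits<Start, End, Step> calls introduced in the hunks above replace the hand-unrolled <4>/<5>/<6> launches with a compile-time loop over hit multiplicities. As a plausible reading only — this sketch is an assumption, not the helper's actual implementation in the fit utilities — it can be written as a constexpr recursion that hands the functor each value as a compile-time constant, so the generic lambda can use it as the template argument of kernel_BLFastFit:

```cpp
#include <type_traits>
#include <utility>

// Sketch of rolling_fits-like semantics: invoke f once for each value
// Start, Start + Step, ... while the value stays below End.
template <int Start, int End, int Step, typename F>
constexpr void rollingFitsSketch(F&& f) {
  if constexpr (Start < End) {
    // std::integral_constant keeps the value usable as a template argument
    // inside a generic lambda taking `auto i`.
    f(std::integral_constant<int, Start>{});
    rollingFitsSketch<Start + Step, End, Step>(std::forward<F>(f));
  }
}

// Usage mirroring the hunks above (launchFitFor is a hypothetical helper):
//   rollingFitsSketch<4, 7, 1>([&](auto i) { launchFitFor<i()>(offset); });
```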
- // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { if (invalidTkId == ptkids[local_idx]) break; - auto tkid = ptkids[local_idx]; - assert(tkid < caConstants::maxTuples); + assert(tkid < TrackerTraits::maxNumberOfTuples); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit + local_idx); @@ -213,7 +219,7 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2); printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N, - nHits, + N, tkid, circle.par(0), circle.par(1), diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h deleted file mode 100644 index 127831e0e2eb7..0000000000000 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h -#define RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h - -#include - -#include - -#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" -#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" - -//#define ONLY_PHICUT - -// Cellular automaton constants -namespace caConstants { - - // constants -#ifdef ONLY_PHICUT - constexpr uint32_t maxCellNeighbors = 64; - constexpr uint32_t maxCellTracks = 64; - constexpr uint32_t maxNumberOfTuples = 48 * 1024; - constexpr uint32_t maxNumberOfDoublets = 2 * 1024 * 1024; - constexpr uint32_t maxCellsPerHit = 8 * 128; -#else // ONLY_PHICUT - constexpr uint32_t maxCellNeighbors = 36; - constexpr uint32_t maxCellTracks = 48; -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumberOfTuples = 3 * 1024; - constexpr uint32_t maxNumberOfDoublets = 128 * 1024; - constexpr uint32_t maxCellsPerHit = 128 / 2; -#else // GPU_SMALL_EVENTS - // tested on MC events with 55-75 pileup events - // and extended for Heavy Ions operations (24k -> 32k tuples, 128 -> 256 cells) - constexpr uint32_t maxNumberOfTuples = 32 * 1024; - constexpr uint32_t maxNumberOfDoublets = 512 * 1024; - constexpr uint32_t maxCellsPerHit = 256; -#endif // GPU_SMALL_EVENTS -#endif // ONLY_PHICUT - constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; - constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; - - constexpr uint32_t maxNumberOfLayerPairs = 20; - constexpr uint32_t maxNumberOfLayers = 10; - constexpr uint32_t maxTuples = maxNumberOfTuples; - constexpr int32_t maxHitsOnTrack = 10; - - // Modules constants - constexpr uint32_t max_ladder_bpx0 = 12; - constexpr uint32_t first_ladder_bpx0 = 0; - constexpr float module_length_bpx0 = 6.7f; - constexpr float module_tolerance_bpx0 = 0.4f; // projection to cylinder is inaccurate on BPIX1 - constexpr uint32_t max_ladder_bpx4 = 64; - constexpr uint32_t first_ladder_bpx4 = 84; - constexpr float radius_even_ladder = 15.815f; - constexpr float radius_odd_ladder = 16.146f; - constexpr float module_length_bpx4 = 6.7f; - constexpr float module_tolerance_bpx4 = 0.2f; - constexpr float barrel_z_length = 26.f; - constexpr float forward_z_begin = 32.f; - - // Last indexes - constexpr uint32_t last_bpix1_detIndex = 
96; - constexpr uint32_t last_barrel_detIndex = 1184; - - // types - using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct - using tindex_type = uint16_t; // for tuples - - using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; - - using CellNeighborsVector = cms::cuda::SimpleVector; - using CellTracksVector = cms::cuda::SimpleVector; - - using OuterHitOfCellContainer = cms::cuda::VecArray; - using TuplesContainer = cms::cuda::OneToManyAssoc; - using HitToTuple = cms::cuda::OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = cms::cuda::OneToManyAssoc; - - struct OuterHitOfCell { - OuterHitOfCellContainer* container; - int32_t offset; - constexpr auto& operator[](int i) { return container[i - offset]; } - constexpr auto const& operator[](int i) const { return container[i - offset]; } - }; - -} // namespace caConstants - -#endif // RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 72c482c6189db..fade739410e2f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -23,10 +23,18 @@ #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -class CAHitNtupletCUDA : public edm::global::EDProducer<> { +template +class CAHitNtupletCUDAT : public edm::global::EDProducer<> { + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + + using HitsView = TrackingRecHit2DSOAViewT; + using HitsOnGPU = TrackingRecHit2DGPUT; + using HitsOnCPU = TrackingRecHit2DCPUT; + using GPUAlgo = CAHitNtupletGeneratorOnGPU; + public: - explicit CAHitNtupletCUDA(const edm::ParameterSet& iConfig); - ~CAHitNtupletCUDA() override = default; + explicit CAHitNtupletCUDAT(const edm::ParameterSet& iConfig); + ~CAHitNtupletCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -39,49 +47,57 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { bool onGPU_; edm::ESGetToken tokenField_; - edm::EDGetTokenT> tokenHitGPU_; + edm::EDGetTokenT> tokenHitGPU_; edm::EDPutTokenT> tokenTrackGPU_; - edm::EDGetTokenT tokenHitCPU_; + edm::EDGetTokenT tokenHitCPU_; edm::EDPutTokenT tokenTrackCPU_; - CAHitNtupletGeneratorOnGPU gpuAlgo_; + GPUAlgo gpuAlgo_; }; -CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) +template +CAHitNtupletCUDAT::CAHitNtupletCUDAT(const edm::ParameterSet& iConfig) : onGPU_(iConfig.getParameter("onGPU")), tokenField_(esConsumes()), gpuAlgo_(iConfig, consumesCollector()) { if (onGPU_) { - tokenHitGPU_ = - consumes>(iConfig.getParameter("pixelRecHitSrc")); + tokenHitGPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); tokenTrackGPU_ = produces>(); } else { - tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); + tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); tokenTrackCPU_ = produces(); } } -void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void CAHitNtupletCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("onGPU", true); desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); - CAHitNtupletGeneratorOnGPU::fillDescriptions(desc); - descriptions.add("pixelTracksCUDA", desc); + 
GPUAlgo::fillDescriptions(desc); + descriptions.addWithDefaultLabel(desc); } -void CAHitNtupletCUDA::beginJob() { gpuAlgo_.beginJob(); } +template +void CAHitNtupletCUDAT::beginJob() { + gpuAlgo_.beginJob(); +} -void CAHitNtupletCUDA::endJob() { gpuAlgo_.endJob(); } +template +void CAHitNtupletCUDAT::endJob() { + gpuAlgo_.endJob(); +} -void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { +template +void CAHitNtupletCUDAT::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& es) const { auto bf = 1. / es.getData(tokenField_).inverseBzAtOriginInGeV(); if (onGPU_) { auto hHits = iEvent.getHandle(tokenHitGPU_); - cms::cuda::ScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } else { auto const& hits = iEvent.get(tokenHitCPU_); @@ -89,4 +105,11 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const } } +using CAHitNtupletCUDA = CAHitNtupletCUDAT; DEFINE_FWK_MODULE(CAHitNtupletCUDA); + +using CAHitNtupletCUDAPhase1 = CAHitNtupletCUDAT; +DEFINE_FWK_MODULE(CAHitNtupletCUDAPhase1); + +using CAHitNtupletCUDAPhase2 = CAHitNtupletCUDAT; +DEFINE_FWK_MODULE(CAHitNtupletCUDAPhase2); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 66208debdc98d..75fbbffb49190 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -8,13 +8,21 @@ namespace { std::mutex lock_stat; } // namespace -template <> -void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { - kernel_printCounters(counters); +template +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { + caHitNtupletGeneratorKernels::kernel_printCounters(counters); } -template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { +template +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + using namespace gpuPixelDoublets; + + using GPUCACell = GPUCACellT; + using OuterHitOfCell = typename GPUCACell::OuterHitOfCell; + using CellNeighbors = typename GPUCACell::CellNeighbors; + using CellTracks = typename GPUCACell::CellTracks; + using OuterHitOfCellContainer = typename GPUCACell::OuterHitOfCellContainer; + auto nhits = hh.nHits(); #ifdef NTUPLE_DEBUG @@ -24,61 +32,54 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr // use "nhits" to heuristically dimension the workspace // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); - device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); - assert(device_isOuterHitOfCell_.get()); - isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + //this->device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); + this->device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); + assert(this->device_isOuterHitOfCell_.get()); + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; - auto cellStorageSize = caConstants::maxNumOfActiveDoublets * 
sizeof(GPUCACell::CellNeighbors) + - caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks); + auto cellStorageSize = TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks); // no need to use the Traits allocations, since we know this is being compiled for the CPU //cellStorage_ = Traits::template make_unique(cellStorageSize, stream); - cellStorage_ = std::make_unique(cellStorageSize); - device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); - device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * - sizeof(GPUCACell::CellNeighbors)); - - gpuPixelDoublets::initDoublets(isOuterHitOfCell_, - nhits, - device_theCellNeighbors_.get(), - device_theCellNeighborsContainer_, - device_theCellTracks_.get(), - device_theCellTracksContainer_); + this->cellStorage_ = std::make_unique(cellStorageSize); + this->device_theCellNeighborsContainer_ = (CellNeighbors *)this->cellStorage_.get(); + this->device_theCellTracksContainer_ = + (CellTracks *)(this->cellStorage_.get() + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors)); + + initDoublets(this->isOuterHitOfCell_, + nhits, + this->device_theCellNeighbors_.get(), + this->device_theCellNeighborsContainer_, + this->device_theCellTracks_.get(), + this->device_theCellTracksContainer_); // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_theCells_ = Traits::template make_unique(params_.maxNumberOfDoublets_, stream); - device_theCells_ = std::make_unique(params_.maxNumberOfDoublets_); + //this->device_theCells_ = Traits::template make_unique(this->params_.cellCuts_.maxNumberOfDoublets_, stream); + this->device_theCells_ = std::make_unique(this->params_.cellCuts_.maxNumberOfDoublets_); if (0 == nhits) return; // protect against empty events // take all layer pairs into account - auto nActualPairs = gpuPixelDoublets::nPairs; - if (not params_.includeJumpingForwardDoublets_) { - // exclude forward "jumping" layer pairs - nActualPairs = gpuPixelDoublets::nPairsForTriplets; - } - if (params_.minHitsPerNtuplet_ > 3) { - // for quadruplets, exclude all "jumping" layer pairs - nActualPairs = gpuPixelDoublets::nPairsForQuadruplets; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), - hh.view(), - isOuterHitOfCell_, - nActualPairs, - params_.idealConditions_, - params_.doClusterCut_, - params_.doZ0Cut_, - params_.doPtCut_, - params_.maxNumberOfDoublets_); + auto nActualPairs = this->params_.nPairs(); + + assert(nActualPairs <= TrackerTraits::nPairs); + + getDoubletsFromHisto(this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->device_theCellTracks_.get(), + hh.view(), + this->isOuterHitOfCell_, + nActualPairs, + this->params_.cellCuts_); } -template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +template +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, + TkSoA *tracks_d, + cudaStream_t cudaStream) { + using namespace caHitNtupletGeneratorKernels; + auto *tuples_d = &tracks_d->hitIndices; auto *detId_d = &tracks_d->detIndices; auto *quality_d = tracks_d->qualityData(); @@ -90,125 +91,139 @@ void 
CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto nhits = hh.nHits(); - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; +#ifdef NTUPLE_DEBUG + std::cout << "start tuple building. N hits " << nhits << std::endl; + if (nhits < 2) + std::cout << "too few hits " << nhits << std::endl; +#endif // // applying combinatoric cleaning such as fishbone at this stage is too expensive // - kernel_connect(device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - isOuterHitOfCell_, - params_.hardCurvCut_, - params_.ptmin_, - params_.CAThetaCutBarrel_, - params_.CAThetaCutForward_, - params_.dcaCutInnerTriplet_, - params_.dcaCutOuterTriplet_); - - if (nhits > 1 && params_.earlyFishbone_) { - gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false); + kernel_connect(this->device_hitTuple_apc_, + this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->isOuterHitOfCell_, + this->params_.caParams_); + + if (nhits > 1 && this->params_.earlyFishbone_) { + gpuPixelDoublets::fishbone( + hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); } - kernel_find_ntuplets(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_.get(), - tuples_d, - device_hitTuple_apc_, - quality_d, - params_.minHitsPerNtuplet_); - if (params_.doStats_) - kernel_mark_used(device_theCells_.get(), device_nCells_); + kernel_find_ntuplets(hh.view(), + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellTracks_.get(), + tuples_d, + this->device_hitTuple_apc_, + quality_d, + this->params_.caParams_); + if (this->params_.doStats_) + kernel_mark_used(this->device_theCells_.get(), this->device_nCells_); - cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(this->device_hitTuple_apc_, tuples_d); - kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); - kernel_fillNLayers(tracks_d, device_hitTuple_apc_); + kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); + kernel_fillNLayers(tracks_d, this->device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); + kernel_earlyDuplicateRemover( + this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_countMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); + kernel_fillMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); - if (nhits > 1 && params_.lateFishbone_) { - gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); + if (nhits > 1 && this->params_.lateFishbone_) { + gpuPixelDoublets::fishbone( + hh.view(), this->device_theCells_.get(), this->device_nCells_, 
this->isOuterHitOfCell_, nhits, true); } } -template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +template +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, + TkSoA *tracks_d, + cudaStream_t cudaStream) { + using namespace caHitNtupletGeneratorKernels; + int32_t nhits = hh.nHits(); auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = tracks_d->qualityData(); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d); - - if (params_.lateFishbone_) { + kernel_classifyTracks(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); + if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); + kernel_fishboneCleaner(this->device_theCells_.get(), this->device_nCells_, quality_d); } // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + kernel_fastDuplicateRemover( + this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); // fill hit->track "map" - if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + if (this->params_.doSharedHitCut_ || this->params_.doStats_) { + kernel_countHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); + cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); + kernel_fillHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) - if (params_.doSharedHitCut_) { - kernel_rejectDuplicate( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - - kernel_sharedHitCleaner(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); - if (params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + if (this->params_.doSharedHitCut_) { + kernel_rejectDuplicate(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); + + kernel_sharedHitCleaner(hh.view(), + tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); + if (this->params_.useSimpleTripletCleaner_) { + kernel_simpleTripletCleaner(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); } else { - kernel_tripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + kernel_tripletCleaner(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); } } - if (params_.doStats_) { + if (this->params_.doStats_) { std::lock_guard guard(lock_stat); - kernel_checkOverflows(tuples_d, - device_tupleMultiplicity_.get(), - device_hitToTuple_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), 
- isOuterHitOfCell_, - nhits, - params_.maxNumberOfDoublets_, - counters_); + kernel_checkOverflows(tuples_d, + this->device_tupleMultiplicity_.get(), + this->device_hitToTuple_.get(), + this->device_hitTuple_apc_, + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->device_theCellTracks_.get(), + this->isOuterHitOfCell_, + nhits, + this->params_.cellCuts_.maxNumberOfDoublets_, + this->counters_); } - if (params_.doStats_) { + if (this->params_.doStats_) { // counters (add flag???) std::lock_guard guard(lock_stat); - kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); + kernel_doStatsForHitInTracks(this->device_hitToTuple_.get(), this->counters_); + kernel_doStatsForTracks(tuples_d, quality_d, this->counters_); } #ifdef DUMP_GPU_TK_TUPLES @@ -217,7 +232,11 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA { std::lock_guard guard(lock); ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 0, 1000000, iev); + kernel_print_found_ntuplets( + hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 0, 1000000, iev); } #endif } + +template class CAHitNtupletGeneratorKernelsCPU; +template class CAHitNtupletGeneratorKernelsCPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 913b6d5a32d28..59ae2041b44aa 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,8 +1,15 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" #include -template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +// #define NTUPLE_DEBUG +// #define GPU_DEBUG + +template +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, + TkSoA *tracks_d, + cudaStream_t cudaStream) { + using namespace gpuPixelDoublets; + using namespace caHitNtupletGeneratorKernels; // these are pointers on GPU! 
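One mechanical change dominates the kernel-wrapper hunks above: every member access gained a this->. That is required, not stylistic — once CAHitNtupletGeneratorKernelsCPU (and its GPU sibling) inherits those members from a base class that depends on a template parameter, unqualified lookup no longer finds them at template parse time. A minimal sketch of the rule, with hypothetical names:

```cpp
#include <cstdint>

template <typename TrackerTraits>
struct GeneratorKernelsBaseSketch {
  uint32_t* device_nCells_ = nullptr;  // member shared by the CPU and GPU variants
};

template <typename TrackerTraits>
struct GeneratorKernelsCPUSketch : public GeneratorKernelsBaseSketch<TrackerTraits> {
  void launch() {
    // device_nCells_ = nullptr;     // error: the name lives in a dependent base,
    //                               // so unqualified lookup does not find it
    this->device_nCells_ = nullptr;  // OK: lookup is deferred to instantiation
  }
};
```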
auto *tuples_d = &tracks_d->hitIndices; auto *detId_d = &tracks_d->detIndices; @@ -26,58 +33,57 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto nthTot = 64; auto stride = 4; auto blockSize = nthTot / stride; - auto numberOfBlocks = nDoubletBlocks(blockSize); + auto numberOfBlocks = this->nDoubletBlocks(blockSize); auto rescale = numberOfBlocks / 65536; blockSize *= (rescale + 1); - numberOfBlocks = nDoubletBlocks(blockSize); + numberOfBlocks = this->nDoubletBlocks(blockSize); assert(numberOfBlocks < 65536); assert(blockSize > 0 && 0 == blockSize % 16); dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - kernel_connect<<>>( - device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - isOuterHitOfCell_, - params_.hardCurvCut_, - params_.ptmin_, - params_.CAThetaCutBarrel_, - params_.CAThetaCutForward_, - params_.dcaCutInnerTriplet_, - params_.dcaCutOuterTriplet_); + kernel_connect + <<>>(this->device_hitTuple_apc_, + this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->isOuterHitOfCell_, + this->params_.caParams_); + cudaCheck(cudaGetLastError()); // do not run the fishbone if there are hits only in BPIX1 - if (nhits > isOuterHitOfCell_.offset && params_.earlyFishbone_) { + if (nhits > this->isOuterHitOfCell_.offset && this->params_.earlyFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits - isOuterHitOfCell_.offset + blockSize - 1) / blockSize; + auto numberOfBlocks = (nhits - this->isOuterHitOfCell_.offset + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - gpuPixelDoublets::fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false); + fishbone<<>>( + hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); cudaCheck(cudaGetLastError()); } blockSize = 64; - numberOfBlocks = (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_.get(), - tuples_d, - device_hitTuple_apc_, - quality_d, - params_.minHitsPerNtuplet_); + numberOfBlocks = (3 * this->params_.cellCuts_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + kernel_find_ntuplets<<>>(hh.view(), + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellTracks_.get(), + tuples_d, + this->device_hitTuple_apc_, + quality_d, + this->params_.caParams_); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); - - if (params_.doStats_) - kernel_mark_used<<>>(device_theCells_.get(), device_nCells_); +#endif + if (this->params_.doStats_) + kernel_mark_used + <<>>(this->device_theCells_.get(), this->device_nCells_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -87,38 +93,63 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; - cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); - kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + cms::cuda::finalizeBulk<<>>(this->device_hitTuple_apc_, tuples_d); + +#ifdef GPU_DEBUG + 
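+  // synchronizing after each launch makes asynchronous kernel failures surface at the
+  // cudaCheck(cudaGetLastError()) that follows, instead of at some unrelated later CUDA call;
+  // it also serializes the stream, which is why it is compiled in only under GPU_DEBUG.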
cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); - kernel_fillNLayers<<>>(tracks_d, device_hitTuple_apc_); +#endif + + kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + kernel_fillNLayers<<>>(tracks_d, this->device_hitTuple_apc_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + // remove duplicates (tracks that share a doublet) - numberOfBlocks = nDoubletBlocks(blockSize); - kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); + numberOfBlocks = this->nDoubletBlocks(blockSize); + + kernel_earlyDuplicateRemover<<>>( + this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); +#endif blockSize = 128; - numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + numberOfBlocks = (3 * TrackerTraits::maxNumberOfTuples / 4 + blockSize - 1) / blockSize; + kernel_countMultiplicity + <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); + kernel_fillMultiplicity + <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif // do not run the fishbone if there are hits only in BPIX1 - if (nhits > isOuterHitOfCell_.offset && params_.lateFishbone_) { + if (nhits > this->isOuterHitOfCell_.offset && this->params_.lateFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits - isOuterHitOfCell_.offset + blockSize - 1) / blockSize; + auto numberOfBlocks = (nhits - this->isOuterHitOfCell_.offset + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - gpuPixelDoublets::fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); + fishbone<<>>( + hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); cudaCheck(cudaGetLastError()); } @@ -128,14 +159,22 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * #endif // free space asap - // device_isOuterHitOfCell_.reset(); + // this->device_isOuterHitOfCell_.reset(); } -template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { +template +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { int32_t nhits = hh.nHits(); - isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + using namespace gpuPixelDoublets; + + using GPUCACell = GPUCACellT; + using OuterHitOfCell = typename GPUCACell::OuterHitOfCell; + using CellNeighbors = typename GPUCACell::CellNeighbors; + using CellTracks = typename GPUCACell::CellTracks; + using OuterHitOfCellContainer = typename GPUCACell::OuterHitOfCellContainer; + + 
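+  // isOuterHitOfCell_ is a view with an index offset: hits below hh.offsetBPIX2() (the BPIX1
+  // hits) can never be the outer hit of a doublet, so the backing array is sized
+  // nhits - offsetBPIX2 and entry i refers to hit i + offset (see the allocation below).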
this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; @@ -147,34 +186,35 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cms::cuda::make_device_unique( - std::max(1, nhits - hh.offsetBPIX2()), stream); - assert(device_isOuterHitOfCell_.get()); + this->device_isOuterHitOfCell_ = + cms::cuda::make_device_unique(std::max(1, nhits - hh.offsetBPIX2()), stream); + assert(this->device_isOuterHitOfCell_.get()); - isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; - cellStorage_ = cms::cuda::make_device_unique( - caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + - caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks), - stream); - device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); - device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * - sizeof(GPUCACell::CellNeighbors)); + this->cellStorage_ = + cms::cuda::make_device_unique(TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks), + stream); + this->device_theCellNeighborsContainer_ = (CellNeighbors *)this->cellStorage_.get(); + this->device_theCellTracksContainer_ = + (CellTracks *)(this->cellStorage_.get() + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors)); { int threadsPerBlock = 128; // at least one block! 
 int blocks = (std::max(1, nhits - hh.offsetBPIX2()) + threadsPerBlock - 1) / threadsPerBlock;
-    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(isOuterHitOfCell_,
-                                                                           nhits,
-                                                                           device_theCellNeighbors_.get(),
-                                                                           device_theCellNeighborsContainer_,
-                                                                           device_theCellTracks_.get(),
-                                                                           device_theCellTracksContainer_);
+    initDoublets<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(this->isOuterHitOfCell_,
+                                                                        nhits,
+                                                                        this->device_theCellNeighbors_.get(),
+                                                                        this->device_theCellNeighborsContainer_,
+                                                                        this->device_theCellTracks_.get(),
+                                                                        this->device_theCellTracksContainer_);
     cudaCheck(cudaGetLastError());
   }

-  device_theCells_ = cms::cuda::make_device_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
+  this->device_theCells_ =
+      cms::cuda::make_device_unique<GPUCACell[]>(this->params_.cellCuts_.maxNumberOfDoublets_, stream);

 #ifdef GPU_DEBUG
   cudaDeviceSynchronize();
@@ -185,34 +225,21 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
     return;  // protect against empty events

   // take all layer pairs into account
-  auto nActualPairs = gpuPixelDoublets::nPairs;
-  if (not params_.includeJumpingForwardDoublets_) {
-    // exclude forward "jumping" layer pairs
-    nActualPairs = gpuPixelDoublets::nPairsForTriplets;
-  }
-  if (params_.minHitsPerNtuplet_ > 3) {
-    // for quadruplets, exclude all "jumping" layer pairs
-    nActualPairs = gpuPixelDoublets::nPairsForQuadruplets;
-  }
+  auto nActualPairs = this->params_.nPairs();

-  assert(nActualPairs <= gpuPixelDoublets::nPairs);
   int stride = 4;
-  int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride;
+  int threadsPerBlock = TrackerTraits::getDoubletsFromHistoMaxBlockSize / stride;
   int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock;
   dim3 blks(1, blocks, 1);
   dim3 thrs(stride, threadsPerBlock, 1);
-  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(),
-                                                                    device_nCells_,
-                                                                    device_theCellNeighbors_.get(),
-                                                                    device_theCellTracks_.get(),
-                                                                    hh.view(),
-                                                                    isOuterHitOfCell_,
-                                                                    nActualPairs,
-                                                                    params_.idealConditions_,
-                                                                    params_.doClusterCut_,
-                                                                    params_.doZ0Cut_,
-                                                                    params_.doPtCut_,
-                                                                    params_.maxNumberOfDoublets_);
+  getDoubletsFromHisto<TrackerTraits><<<blks, thrs, 0, stream>>>(this->device_theCells_.get(),
+                                                                 this->device_nCells_,
+                                                                 this->device_theCellNeighbors_.get(),
+                                                                 this->device_theCellTracks_.get(),
+                                                                 hh.view(),
+                                                                 this->isOuterHitOfCell_,
+                                                                 nActualPairs,
+                                                                 this->params_.cellCuts_);
   cudaCheck(cudaGetLastError());

 #ifdef GPU_DEBUG
@@ -221,8 +248,12 @@
 #endif
 }

-template <>
-void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+template <typename TrackerTraits>
+void CAHitNtupletGeneratorKernelsGPU<TrackerTraits>::classifyTuples(HitsOnCPU const &hh,
+                                                                    TkSoA *tracks_d,
+                                                                    cudaStream_t cudaStream) {
+  using namespace caHitNtupletGeneratorKernels;
+
+  // these are pointers on GPU!
auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = tracks_d->qualityData(); @@ -232,65 +263,80 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA auto blockSize = 64; // classify tracks based on kinematics - auto numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_classifyTracks<<>>(tuples_d, tracks_d, params_.cuts_, quality_d); - cudaCheck(cudaGetLastError()); + auto numberOfBlocks = this->nQuadrupletBlocks(blockSize); + kernel_classifyTracks + <<>>(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); - if (params_.lateFishbone_) { + if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks - numberOfBlocks = nDoubletBlocks(blockSize); - kernel_fishboneCleaner<<>>( - device_theCells_.get(), device_nCells_, quality_d); + numberOfBlocks = this->nDoubletBlocks(blockSize); + kernel_fishboneCleaner + <<>>(this->device_theCells_.get(), this->device_nCells_, quality_d); cudaCheck(cudaGetLastError()); } // mark duplicates (tracks that share a doublet) - numberOfBlocks = nDoubletBlocks(blockSize); - kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + numberOfBlocks = this->nDoubletBlocks(blockSize); + kernel_fastDuplicateRemover<<>>( + this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif - if (params_.doSharedHitCut_ || params_.doStats_) { + if (this->params_.doSharedHitCut_ || this->params_.doStats_) { // fill hit->track "map" - assert(hitToTupleView_.offSize > nhits); - numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); + assert(this->hitToTupleView_.offSize > nhits); + numberOfBlocks = this->nQuadrupletBlocks(blockSize); + kernel_countHitInTracks + <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && - (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); - cms::cuda::launchFinalize(hitToTupleView_, cudaStream); + assert((this->hitToTupleView_.assoc == this->device_hitToTuple_.get()) && + (this->hitToTupleView_.offStorage == this->device_hitToTupleStorage_.get()) && + (this->hitToTupleView_.offSize > 0)); + cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks + <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif } - if (params_.doSharedHitCut_) { + if (this->params_.doSharedHitCut_) { // mark duplicates (tracks that share at least one hit) - numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; - - kernel_rejectDuplicate<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - - kernel_sharedHitCleaner<<>>(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); - - if (params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + numberOfBlocks = (this->hitToTupleView_.offSize + blockSize - 1) / blockSize; + + 
kernel_rejectDuplicate + <<>>(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); + + kernel_sharedHitCleaner + <<>>(hh.view(), + tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); + + if (this->params_.useSimpleTripletCleaner_) { + kernel_simpleTripletCleaner + <<>>(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); } else { - kernel_tripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + kernel_tripletCleaner + <<>>(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -298,30 +344,33 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA #endif } - if (params_.doStats_) { - numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, - device_tupleMultiplicity_.get(), - device_hitToTuple_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), - isOuterHitOfCell_, - nhits, - params_.maxNumberOfDoublets_, - counters_); + if (this->params_.doStats_) { + numberOfBlocks = (std::max(nhits, int(this->params_.cellCuts_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; + kernel_checkOverflows + <<>>(tuples_d, + this->device_tupleMultiplicity_.get(), + this->device_hitToTuple_.get(), + this->device_hitTuple_apc_, + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->device_theCellTracks_.get(), + this->isOuterHitOfCell_, + nhits, + this->params_.cellCuts_.maxNumberOfDoublets_, + this->counters_); cudaCheck(cudaGetLastError()); } - if (params_.doStats_) { + if (this->params_.doStats_) { // counters (add flag???) 
- numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; - kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); + numberOfBlocks = (this->hitToTupleView_.offSize + blockSize - 1) / blockSize; + kernel_doStatsForHitInTracks + <<>>(this->device_hitToTuple_.get(), this->counters_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); + numberOfBlocks = (3 * TrackerTraits::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + kernel_doStatsForTracks + <<>>(tuples_d, quality_d, this->counters_); cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG @@ -336,19 +385,22 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA std::lock_guard guard(lock); ++iev; for (int k = 0; k < 20000; k += 500) { - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), k, k + 500, iev); + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( + hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); } - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 20000, 1000000, iev); + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( + hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); // cudaStreamSynchronize(cudaStream); } #endif } -template <> -void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { - kernel_printCounters<<<1, 1>>>(counters); +template +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { + caHitNtupletGeneratorKernels::kernel_printCounters<<<1, 1>>>(counters); } + +template class CAHitNtupletGeneratorKernelsGPU; +template class CAHitNtupletGeneratorKernelsGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 8af1176fe92c6..b595106299d71 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -5,10 +5,167 @@ #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "GPUCACell.h" - +#include "gpuPixelDoublets.h" // #define DUMP_GPU_TK_TUPLES -namespace cAHitNtupletGenerator { +namespace caHitNtupletGenerator { + + //Configuration params common to all topologies, for the algorithms + struct AlgoParams { + const bool onGPU_; + const uint32_t minHitsForSharingCut_; + const bool useRiemannFit_; + const bool fitNas4_; + const bool includeJumpingForwardDoublets_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool doStats_; + const bool doSharedHitCut_; + const bool dupPassThrough_; + const bool useSimpleTripletCleaner_; + }; + + //CAParams + struct CACommon { + const uint32_t minHitsPerNtuplet_; + const float ptmin_; + const float CAThetaCutBarrel_; + const float CAThetaCutForward_; + const float hardCurvCut_; + const float dcaCutInnerTriplet_; + const float dcaCutOuterTriplet_; + }; + + template + struct CAParamsT : public CACommon { + __device__ __forceinline__ bool startingLayerPair(int16_t pid) const { return false; }; + __device__ __forceinline__ bool startAt0(int16_t pid) const { return false; }; + }; + + template + struct 
CAParamsT<TrackerTraits, pixelTopology::isPhase1Topology<TrackerTraits>> : public CACommon {
+    /// Is this a starting layer pair?
+    __device__ __forceinline__ bool startingLayerPair(int16_t pid) const {
+      return minHitsPerNtuplet_ > 3 ? pid < 3 : pid < 8 || pid > 12;
+    }
+
+    /// Is this a pair with inner == 0?
+    __device__ __forceinline__ bool startAt0(int16_t pid) const {
+      assert((pixelTopology::Phase1::layerPairs[pid * 2] == 0) ==
+             (pid < 3 || pid == 13 || pid == 15 || pid == 16));  // to be 100% sure it's working, may be removed
+      return pixelTopology::Phase1::layerPairs[pid * 2] == 0;
+    }
+  };
+
+  template <typename TrackerTraits>
+  struct CAParamsT<TrackerTraits, pixelTopology::isPhase2Topology<TrackerTraits>> : public CACommon {
+    const bool includeFarForwards_;
+    /// Is this a starting layer pair?
+    __device__ __forceinline__ bool startingLayerPair(int16_t pid) const {
+      return pid < 33;  // in principle one could remove 5,6,7 23, 28 and 29
+    }
+
+    /// Is this a pair with inner == 0?
+    __device__ __forceinline__ bool startAt0(int16_t pid) const {
+      assert((pixelTopology::Phase2::layerPairs[pid * 2] == 0) == ((pid < 3) | (pid >= 23 && pid < 28)));
+      return pixelTopology::Phase2::layerPairs[pid * 2] == 0;
+    }
+  };
+
+  //Full list of params = algo params + ca params + cell params + quality cuts
+  //Generic template
+  template <typename TrackerTraits, typename Enable = void>
+  struct ParamsT : public AlgoParams {
+    // each pixelTopology is expected to define its own specialization;
+    // nothing is defined here
+    inline uint32_t nPairs() const { return 0; }
+  };
+
+  template <typename TrackerTraits>
+  struct ParamsT<TrackerTraits, pixelTopology::isPhase1Topology<TrackerTraits>> : public AlgoParams {
+    using TT = TrackerTraits;
+    using QualityCuts = pixelTrack::QualityCutsT<TT>;  //track quality cuts
+    using CellCuts = gpuPixelDoublets::CellCutsT<TT>;  //cell building cuts
+    using CAParams = CAParamsT<TT>;                    //params to be used on device
+
+    ParamsT(AlgoParams const& commonCuts,
+            CellCuts const& cellCuts,
+            QualityCuts const& qualityCuts,
+            CAParams const& caParams)
+        : AlgoParams(commonCuts), cellCuts_(cellCuts), qualityCuts_(qualityCuts), caParams_(caParams) {}
+
+    const CellCuts cellCuts_;
+    const QualityCuts qualityCuts_{// polynomial coefficients for the pT-dependent chi2 cut
+                                   {0.68177776, 0.74609577, -0.08035491, 0.00315399},
+                                   // max pT used to determine the chi2 cut
+                                   10.,
+                                   // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit
+                                   30.,
+                                   // regional cuts for triplets
+                                   {
+                                       0.3,  // |Tip| < 0.3 cm
+                                       0.5,  // pT > 0.5 GeV
+                                       12.0  // |Zip| < 12.0 cm
+                                   },
+                                   // regional cuts for quadruplets
+                                   {
+                                       0.5,  // |Tip| < 0.5 cm
+                                       0.3,  // pT > 0.3 GeV
+                                       12.0  // |Zip| < 12.0 cm
+                                   }};
+    const CAParams caParams_;
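+    // e.g. with the quality-cut defaults above, a track with pT = 1 GeV must satisfy
+    //   chi2 < 30. * (0.68177776 + 0.74609577 - 0.08035491 + 0.00315399) ~= 40.5,
+    // with the polynomial evaluated at min(pT, chi2MaxPt) = min(pT, 10 GeV).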
+    /// Compute the number of pairs
+    inline uint32_t nPairs() const {
+      // take all layer pairs into account
+      uint32_t nActualPairs = TT::nPairs;
+      if (not includeJumpingForwardDoublets_) {
+        // exclude forward "jumping" layer pairs
+        nActualPairs = TT::nPairsForTriplets;
+      }
+      if (caParams_.minHitsPerNtuplet_ > 3) {
+        // for quadruplets, exclude all "jumping" layer pairs
+        nActualPairs = TT::nPairsForQuadruplets;
+      }
+
+      return nActualPairs;
+    }
+
+  };  // Params Phase1
+
+  template <typename TrackerTraits>
+  struct ParamsT<TrackerTraits, pixelTopology::isPhase2Topology<TrackerTraits>> : public AlgoParams {
+    using TT = TrackerTraits;
+    using QualityCuts = pixelTrack::QualityCutsT<TT>;
+    using CellCuts = gpuPixelDoublets::CellCutsT<TT>;
+    using CAParams = CAParamsT<TT>;
+
+    ParamsT(AlgoParams const& commonCuts,
+            CellCuts const& cellCuts,
+            QualityCuts const& qualityCuts,
+            CAParams const& caParams)
+        : AlgoParams(commonCuts), cellCuts_(cellCuts), qualityCuts_(qualityCuts), caParams_(caParams) {}
+
+    // quality cuts
+    const CellCuts cellCuts_;
+    const QualityCuts qualityCuts_{5.0f, /*chi2*/ 0.9f, /* pT in GeV*/ 0.4f, /*zip in cm*/ 12.0f /*tip in cm*/};
+    const CAParams caParams_;
+
+    inline uint32_t nPairs() const {
+      // take all layer pairs into account
+      uint32_t nActualPairs = TT::nPairsMinimal;
+      if (caParams_.includeFarForwards_) {
+        // consider far forwards (> 11 & > 23)
+        nActualPairs = TT::nPairsFarForwards;
+      }
+      if (includeJumpingForwardDoublets_) {
+        // include jumping forwards
+        nActualPairs = TT::nPairs;
+      }
+
+      return nActualPairs;
+    }
+
+  };  // Params Phase2

   // counters
   struct Counters {
@@ -27,157 +184,44 @@ namespace cAHitNtupletGenerator {
     unsigned long long nZeroTrackCells;
   };

-  using HitsView = TrackingRecHit2DSOAView;
-  using HitsOnGPU = TrackingRecHit2DSOAView;
-
-  using HitToTuple = caConstants::HitToTuple;
-  using TupleMultiplicity = caConstants::TupleMultiplicity;
-  using Quality = pixelTrack::Quality;
-  using TkSoA = pixelTrack::TrackSoA;
-  using HitContainer = pixelTrack::HitContainer;
-
-  struct QualityCuts {
-    // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3])))
-    float chi2Coeff[4];
-    float chi2MaxPt;  // GeV
-    float chi2Scale;
-
-    struct Region {
-      float maxTip;  // cm
-      float minPt;   // GeV
-      float maxZip;  // cm
-    };
-
-    Region triplet;
-    Region quadruplet;
-  };

-  // params (FIXME: thi si a POD: so no constructor no traling _ and no const as params_ is already const)
-  struct Params {
-    Params(bool onGPU,
-           uint32_t minHitsPerNtuplet,
-           uint32_t maxNumberOfDoublets,
-           uint16_t minHitsForSharingCuts,
-           bool useRiemannFit,
-           bool fitNas4,
-           bool includeJumpingForwardDoublets,
-           bool earlyFishbone,
-           bool lateFishbone,
-           bool idealConditions,
-           bool doStats,
-           bool doClusterCut,
-           bool doZ0Cut,
-           bool doPtCut,
-           bool doSharedHitCut,
-           bool dupPassThrough,
-           bool useSimpleTripletCleaner,
-           float ptmin,
-           float CAThetaCutBarrel,
-           float CAThetaCutForward,
-           float hardCurvCut,
-           float dcaCutInnerTriplet,
-           float dcaCutOuterTriplet,
-
-           QualityCuts const& cuts)
-        : onGPU_(onGPU),
-          minHitsPerNtuplet_(minHitsPerNtuplet),
-          maxNumberOfDoublets_(maxNumberOfDoublets),
-          minHitsForSharingCut_(minHitsForSharingCuts),
-          useRiemannFit_(useRiemannFit),
-          fitNas4_(fitNas4),
-          includeJumpingForwardDoublets_(includeJumpingForwardDoublets),
-          earlyFishbone_(earlyFishbone),
-          lateFishbone_(lateFishbone),
-          idealConditions_(idealConditions),
-          doStats_(doStats),
-          doClusterCut_(doClusterCut),
-          doZ0Cut_(doZ0Cut),
-          doPtCut_(doPtCut),
-          doSharedHitCut_(doSharedHitCut),
-          dupPassThrough_(dupPassThrough),
-          useSimpleTripletCleaner_(useSimpleTripletCleaner),
-          ptmin_(ptmin),
-          CAThetaCutBarrel_(CAThetaCutBarrel),
-          CAThetaCutForward_(CAThetaCutForward),
-          hardCurvCut_(hardCurvCut),
-          dcaCutInnerTriplet_(dcaCutInnerTriplet),
-          dcaCutOuterTriplet_(dcaCutOuterTriplet),
-          cuts_(cuts) {}
+}  // namespace caHitNtupletGenerator

-    const bool onGPU_;
-    const uint32_t minHitsPerNtuplet_;
-    const uint32_t maxNumberOfDoublets_;
-    const uint16_t minHitsForSharingCut_;
-    const bool useRiemannFit_;
-    const bool fitNas4_;
-    const bool includeJumpingForwardDoublets_;
-    const bool earlyFishbone_;
-    const bool lateFishbone_;
-    const bool idealConditions_;
-    const bool doStats_;
-    const bool doClusterCut_;
-    const bool doZ0Cut_;
-    const bool doPtCut_;
-    const bool doSharedHitCut_;
-    const bool dupPassThrough_;
-    const bool useSimpleTripletCleaner_;
-    const float ptmin_;
-    const float CAThetaCutBarrel_;
-    const float CAThetaCutForward_;
-    const float hardCurvCut_;
-    const float dcaCutInnerTriplet_;
-    const float dcaCutOuterTriplet_;
-
-    // quality cuts
-    QualityCuts cuts_{// polynomial coefficients for the pT-dependent chi2 cut
{0.68177776, 0.74609577, -0.08035491, 0.00315399}, - // max pT used to determine the chi2 cut - 10., - // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit - 30., - // regional cuts for triplets - { - 0.3, // |Tip| < 0.3 cm - 0.5, // pT > 0.5 GeV - 12.0 // |Zip| < 12.0 cm - }, - // regional cuts for quadruplets - { - 0.5, // |Tip| < 0.5 cm - 0.3, // pT > 0.3 GeV - 12.0 // |Zip| < 12.0 cm - }}; - - }; // Params - -} // namespace cAHitNtupletGenerator - -template +template class CAHitNtupletGeneratorKernels { public: using Traits = TTraits; - - using QualityCuts = cAHitNtupletGenerator::QualityCuts; - using Params = cAHitNtupletGenerator::Params; - using Counters = cAHitNtupletGenerator::Counters; + using TrackerTraits = TTTraits; + using QualityCuts = pixelTrack::QualityCutsT; + using Params = caHitNtupletGenerator::ParamsT; + using CAParams = caHitNtupletGenerator::CAParamsT; + using Counters = caHitNtupletGenerator::Counters; template using unique_ptr = typename Traits::template unique_ptr; - using HitsView = TrackingRecHit2DSOAView; - using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DHeterogeneous; + using HitsView = TrackingRecHit2DSOAViewT; + using HitsOnCPU = TrackingRecHit2DHeterogeneousT; - using HitToTuple = caConstants::HitToTuple; - using TupleMultiplicity = caConstants::TupleMultiplicity; + using HitToTuple = caStructures::HitToTupleT; + using TupleMultiplicity = caStructures::TupleMultiplicityT; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using CellNeighbors = caStructures::CellNeighborsT; + using CellTracksVector = caStructures::CellTracksVectorT; + using CellTracks = caStructures::CellTracksT; + using OuterHitOfCellContainer = caStructures::OuterHitOfCellContainerT; + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using CACell = GPUCACellT; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; - using HitContainer = pixelTrack::HitContainer; + using TkSoA = pixelTrack::TrackSoAT; + using HitContainer = pixelTrack::HitContainerT; CAHitNtupletGeneratorKernels(Params const& params) - : params_(params), paramsMaxDoubletes3Quarters_(3 * params.maxNumberOfDoublets_ / 4) {} + : params_(params), paramsMaxDoubletes3Quarters_(3 * params.cellCuts_.maxNumberOfDoublets_ / 4) {} + ~CAHitNtupletGeneratorKernels() = default; TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } @@ -193,24 +237,23 @@ class CAHitNtupletGeneratorKernels { static void printCounters(Counters const* counters); void setCounters(Counters* counters) { counters_ = counters; } -private: +protected: Counters* counters_ = nullptr; - // workspace unique_ptr cellStorage_; - unique_ptr device_theCellNeighbors_; - caConstants::CellNeighbors* device_theCellNeighborsContainer_; - unique_ptr device_theCellTracks_; - caConstants::CellTracks* device_theCellTracksContainer_; - - unique_ptr device_theCells_; - unique_ptr device_isOuterHitOfCell_; - GPUCACell::OuterHitOfCell isOuterHitOfCell_; + unique_ptr device_theCellNeighbors_; + CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + CellTracks* device_theCellTracksContainer_; + + unique_ptr device_theCells_; + unique_ptr device_isOuterHitOfCell_; + OuterHitOfCell isOuterHitOfCell_; uint32_t* device_nCells_ = nullptr; unique_ptr device_hitToTuple_; - unique_ptr device_hitToTupleStorage_; - HitToTuple::View hitToTupleView_; + unique_ptr device_hitToTupleStorage_; + typename HitToTuple::View hitToTupleView_; 
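+  // hitToTupleView_ packages the assoc together with its externally allocated offset storage
+  // (sized nHits + 1 in allocateOnGPU), so cms::cuda::launchFinalize can fill the offsets on
+  // either backend.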
cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; @@ -219,8 +262,9 @@ class CAHitNtupletGeneratorKernels { unique_ptr device_tupleMultiplicity_; unique_ptr device_storage_; + // params - Params const& params_; + Params params_; /// Intermediate result avoiding repeated computations. const uint32_t paramsMaxDoubletes3Quarters_; /// Compute the number of doublet blocks for block size @@ -231,12 +275,50 @@ class CAHitNtupletGeneratorKernels { /// Compute the number of quadruplet blocks for block size inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { - // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 - return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + // pixelTopology::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 + return (3 * TrackerTraits::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; } }; -using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; -using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; +template +class CAHitNtupletGeneratorKernelsGPU : public CAHitNtupletGeneratorKernels { + using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; + using HitsOnCPU = TrackingRecHit2DHeterogeneousT; + using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; + using HitContainer = pixelTrack::HitContainerT; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using HitToTuple = caStructures::HitToTupleT; + using CellTracksVector = caStructures::CellTracksVectorT; + using TupleMultiplicity = caStructures::TupleMultiplicityT; + using CAParams = caHitNtupletGenerator::CAParamsT; + +public: + void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void allocateOnGPU(int32_t nHits, cudaStream_t stream); + static void printCounters(Counters const* counters); +}; + +template +class CAHitNtupletGeneratorKernelsCPU : public CAHitNtupletGeneratorKernels { + using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; + using HitsOnCPU = TrackingRecHit2DHeterogeneousT; + using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using HitToTuple = caStructures::HitToTupleT; + using CellTracksVector = caStructures::CellTracksVectorT; + using TupleMultiplicity = caStructures::TupleMultiplicityT; + using CAParams = caHitNtupletGenerator::CAParamsT; + +public: + void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void allocateOnGPU(int32_t nHits, cudaStream_t stream); + static void printCounters(Counters const* counters); +}; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc index 5978ef8851c73..af085bb12eddd 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -2,18 +2,21 @@ #include 
"CAHitNtupletGeneratorKernels.h" -template <> +//#define GPU_DEBUG +template #ifdef __CUDACC__ -void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { + using Traits = cms::cudacompat::GPUTraits; #else -void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { + using Traits = cms::cudacompat::CPUTraits; #endif ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - device_theCellNeighbors_ = Traits::template make_unique(stream); - device_theCellTracks_ = Traits::template make_unique(stream); + this->device_theCellNeighbors_ = Traits::template make_unique(stream); + this->device_theCellTracks_ = Traits::template make_unique(stream); #ifdef GPU_DEBUG std::cout << "Allocation for tuple building. N hits " << nHits << std::endl; @@ -21,30 +24,36 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t nHits++; // storage requires one more counter; assert(nHits > 0); - device_hitToTuple_ = Traits::template make_unique(stream); - device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); - hitToTupleView_.assoc = device_hitToTuple_.get(); - hitToTupleView_.offStorage = device_hitToTupleStorage_.get(); - hitToTupleView_.offSize = nHits; + this->device_hitToTuple_ = Traits::template make_unique(stream); + this->device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); + this->hitToTupleView_.assoc = this->device_hitToTuple_.get(); + this->hitToTupleView_.offStorage = this->device_hitToTupleStorage_.get(); + this->hitToTupleView_.offSize = nHits; - device_tupleMultiplicity_ = Traits::template make_unique(stream); + this->device_tupleMultiplicity_ = Traits::template make_unique(stream); - device_storage_ = Traits::template make_unique(3, stream); + this->device_storage_ = Traits::template make_unique(3, stream); - device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); - device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; - device_nCells_ = (uint32_t*)(device_storage_.get() + 2); + this->device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)this->device_storage_.get(); + this->device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)this->device_storage_.get() + 1; + this->device_nCells_ = (uint32_t*)(this->device_storage_.get() + 2); // FIXME: consider collapsing these 3 in one adhoc kernel if constexpr (std::is_same::value) { - cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); + cudaCheck(cudaMemsetAsync(this->device_nCells_, 0, sizeof(uint32_t), stream)); } else { - *device_nCells_ = 0; + *(this->device_nCells_) = 0; } - cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm + cms::cuda::launchZero(this->device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(this->hitToTupleView_, stream); // we may wish to keep it in the edm #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif } + +template class CAHitNtupletGeneratorKernelsGPU; +template class CAHitNtupletGeneratorKernelsGPU; + +template class CAHitNtupletGeneratorKernelsCPU; +template class 
CAHitNtupletGeneratorKernelsCPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index bbe5df891a735..03112e0f3fc48 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -15,923 +15,964 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "CAConstants.h" +#include "CAStructures.h" #include "CAHitNtupletGeneratorKernels.h" #include "GPUCACell.h" #include "gpuFishbone.h" #include "gpuPixelDoublets.h" -using HitsOnGPU = TrackingRecHit2DSOAView; -using HitsOnCPU = TrackingRecHit2DGPU; +namespace caHitNtupletGeneratorKernels { -using HitToTuple = caConstants::HitToTuple; -using TupleMultiplicity = caConstants::TupleMultiplicity; + constexpr uint32_t tkNotFound = std::numeric_limits::max(); + constexpr float maxScore = std::numeric_limits::max(); + constexpr float nSigma2 = 25.f; -using Quality = pixelTrack::Quality; -using TkSoA = pixelTrack::TrackSoA; -using HitContainer = pixelTrack::HitContainer; + //all of these below are mostly to avoid brining around the relative namespace + template + using HitsView = TrackingRecHit2DSOAViewT; -namespace { + template + using HitToTuple = caStructures::HitToTupleT; - constexpr uint16_t tkNotFound = std::numeric_limits::max(); - constexpr float maxScore = std::numeric_limits::max(); - constexpr float nSigma2 = 25.f; + template + using TupleMultiplicity = caStructures::TupleMultiplicityT; -} // namespace - -__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, - caConstants::TupleMultiplicity const *tupleMultiplicity, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, - cms::cuda::AtomicPairCounter *apc, - GPUCACell const *__restrict__ cells, - uint32_t const *__restrict__ nCells, - gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, - gpuPixelDoublets::CellTracksVector const *cellTracks, - GPUCACell::OuterHitOfCell const isOuterHitOfCell, - int32_t nHits, - uint32_t maxNumberOfDoublets, - CAHitNtupletGeneratorKernelsGPU::Counters *counters) { - auto first = threadIdx.x + blockIdx.x * blockDim.x; - - auto &c = *counters; - // counters once per event - if (0 == first) { - atomicAdd(&c.nEvents, 1); - atomicAdd(&c.nHits, nHits); - atomicAdd(&c.nCells, *nCells); - atomicAdd(&c.nTuples, apc->get().m); - atomicAdd(&c.nFitTracks, tupleMultiplicity->size()); - } + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + + template + using CellTracksVector = caStructures::CellTracksVectorT; + + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using Quality = pixelTrack::Quality; + + template + using TkSoA = pixelTrack::TrackSoAT; + + template + using HitContainer = pixelTrack::HitContainerT; + + template + using Hits = typename GPUCACellT::Hits; + + template + using QualityCuts = pixelTrack::QualityCutsT; + + template + using CAParams = caHitNtupletGenerator::CAParamsT; + + using Counters = caHitNtupletGenerator::Counters; + + template + __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, + TupleMultiplicity const *tupleMultiplicity, + HitToTuple const *hitToTuple, + cms::cuda::AtomicPairCounter *apc, + GPUCACellT const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector const *cellNeighbors, + 
CellTracksVector const *cellTracks, + OuterHitOfCell const isOuterHitOfCell, + int32_t nHits, + uint32_t maxNumberOfDoublets, + Counters *counters) { + auto first = threadIdx.x + blockIdx.x * blockDim.x; + + auto &c = *counters; + // counters once per event + if (0 == first) { + atomicAdd(&c.nEvents, 1); + atomicAdd(&c.nHits, nHits); + atomicAdd(&c.nCells, *nCells); + atomicAdd(&c.nTuples, apc->get().m); + atomicAdd(&c.nFitTracks, tupleMultiplicity->size()); + } #ifdef NTUPLE_DEBUG - if (0 == first) { - printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n", - *nCells, - apc->get().m, - apc->get().n, - nHits, - hitToTuple->totOnes()); - if (apc->get().m < caConstants::maxNumberOfQuadruplets) { - assert(foundNtuplets->size(apc->get().m) == 0); - assert(foundNtuplets->size() == apc->get().n); + if (0 == first) { + printf("number of found cells %d \n found tuples %d with total hits %d out of %d %d\n", + *nCells, + apc->get().m, + apc->get().n, + nHits, + hitToTuple->totOnes()); + if (apc->get().m < TrackerTraits::maxNumberOfQuadruplets) { + assert(foundNtuplets->size(apc->get().m) == 0); + assert(foundNtuplets->size() == apc->get().n); + } } - } - for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > 7) // current real limit - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) - assert(int(*ih) < nHits); - } + for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { + if (foundNtuplets->size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + assert(foundNtuplets->size(idx) <= TrackerTraits::maxHitsOnTrack); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + assert(int(*ih) < nHits); + } #endif - if (0 == first) { - if (apc->get().m >= caConstants::maxNumberOfQuadruplets) - printf("Tuples overflow\n"); - if (*nCells >= maxNumberOfDoublets) - printf("Cells overflow\n"); - if (cellNeighbors && cellNeighbors->full()) - printf("cellNeighbors overflow\n"); - if (cellTracks && cellTracks->full()) - printf("cellTracks overflow\n"); - if (int(hitToTuple->nOnes()) < nHits) - printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits); - } + if (0 == first) { + if (apc->get().m >= TrackerTraits::maxNumberOfQuadruplets) + printf("Tuples overflow\n"); + if (*nCells >= maxNumberOfDoublets) + printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow %d %d \n", cellNeighbors->capacity(), cellNeighbors->size()); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); + if (int(hitToTuple->nOnes()) < nHits) + printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits); +#ifdef GPU_DEBUG + printf("size of cellNeighbors %d \n cellTracks %d \n hitToTuple %d \n", + cellNeighbors->size(), + cellTracks->size(), + hitToTuple->size()); +// printf("cellTracksSizes;"); +// for (int i = 0; i < cellTracks->size(); i++) { +// printf("%d;",cellTracks[i].size()); +// } +// +// printf("\n"); +// printf("cellNeighborsSizes;"); +// for (int i = 0; i < cellNeighbors->size(); i++) { +// printf("%d;",cellNeighbors[i].size()); +// } +// printf("\n"); +#endif + } - for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - auto 
const &thisCell = cells[idx]; - if (thisCell.hasFishbone() && !thisCell.isKilled()) - atomicAdd(&c.nFishCells, 1); - if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); - if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; - printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); - if (thisCell.isKilled()) - atomicAdd(&c.nKilledCells, 1); - if (!thisCell.unused()) - atomicAdd(&c.nEmptyCells, 1); - if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id()))) - atomicAdd(&c.nZeroTrackCells, 1); - } + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (thisCell.hasFishbone() && !thisCell.isKilled()) + atomicAdd(&c.nFishCells, 1); + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.isKilled()) + atomicAdd(&c.nKilledCells, 1); + if (!thisCell.unused()) + atomicAdd(&c.nEmptyCells, 1); + if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id()))) + atomicAdd(&c.nZeroTrackCells, 1); + } - for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) { - if (isOuterHitOfCell.container[idx].full()) // ++tooManyOuterHitOfCell; - printf("OuterHitOfCell overflow %d\n", idx); + for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) { + if (isOuterHitOfCell.container[idx].full()) // ++tooManyOuterHitOfCell; + printf("OuterHitOfCell overflow %d\n", idx); + } } -} -__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) { - constexpr auto reject = pixelTrack::Quality::dup; + template + __global__ void kernel_fishboneCleaner(GPUCACellT const *cells, + uint32_t const *__restrict__ nCells, + Quality *quality) { + constexpr auto reject = pixelTrack::Quality::dup; - auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - auto const &thisCell = cells[idx]; - if (!thisCell.isKilled()) - continue; + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (!thisCell.isKilled()) + continue; - for (auto it : thisCell.tracks()) - quality[it] = reject; - } -} - -// remove shorter tracks if sharing a cell -// It does not seem to affect efficiency in any way! 
-__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
-                                             uint32_t const *__restrict__ nCells,
-                                             TkSoA const *__restrict__ ptracks,
-                                             Quality *quality,
-                                             bool dupPassThrough) {
-  // quality to mark rejected
-  constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose
-
-  auto const &tracks = *ptracks;
-
-  assert(nCells);
-  auto first = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
-    auto const &thisCell = cells[idx];
-
-    if (thisCell.tracks().size() < 2)
-      continue;
-
-    int8_t maxNl = 0;
-
-    // find maxNl
-    for (auto it : thisCell.tracks()) {
-      auto nl = tracks.nLayers(it);
-      maxNl = std::max(nl, maxNl);
+      for (auto it : thisCell.tracks())
+        quality[it] = reject;
     }
+  }

-    // if (maxNl<4) continue;
-    // quad pass through (leave it her for tests)
-    // maxNl = std::min(4, maxNl);
+  // remove shorter tracks if sharing a cell
+  // It does not seem to affect efficiency in any way!
+  template <typename TrackerTraits>
+  __global__ void kernel_earlyDuplicateRemover(GPUCACellT<TrackerTraits> const *cells,
+                                               uint32_t const *__restrict__ nCells,
+                                               TkSoA<TrackerTraits> const *__restrict__ ptracks,
+                                               Quality *quality,
+                                               bool dupPassThrough) {
+    // quality to mark rejected
+    constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose
+
+    auto const &tracks = *ptracks;
+
+    assert(nCells);
+    auto first = threadIdx.x + blockIdx.x * blockDim.x;
+    for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
+      auto const &thisCell = cells[idx];
+
+      if (thisCell.tracks().size() < 2)
+        continue;

-    for (auto it : thisCell.tracks()) {
-      if (tracks.nLayers(it) < maxNl)
-        quality[it] = reject;  //no race: simple assignment of the same constant
-    }
-  }
-}
+      int8_t maxNl = 0;

-// assume the above (so, short tracks already removed)
-__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
-                                            uint32_t const *__restrict__ nCells,
-                                            TkSoA *__restrict__ tracks,
-                                            bool dupPassThrough) {
-  // quality to mark rejected
-  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
-  constexpr auto loose = pixelTrack::Quality::loose;
+      // find maxNl
+      for (auto it : thisCell.tracks()) {
+        auto nl = tracks.nLayers(it);
+        maxNl = std::max(nl, maxNl);
+      }

-  assert(nCells);
+      // if (maxNl<4) continue;
+      // quad pass through (leave it here for tests)
+      // maxNl = std::min(4, maxNl);

-  auto first = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
-    auto const &thisCell = cells[idx];
-    if (thisCell.tracks().size() < 2)
-      continue;
+      for (auto it : thisCell.tracks()) {
+        if (tracks.nLayers(it) < maxNl)
+          quality[it] = reject;  //no race: simple assignment of the same constant
+      }
+    }
+  }

-    float mc = maxScore;
-    uint16_t im = tkNotFound;
+  // assume the above (so, short tracks already removed)
+  template <typename TrackerTraits>
+  __global__ void kernel_fastDuplicateRemover(GPUCACellT<TrackerTraits> const *__restrict__ cells,
+                                              uint32_t const *__restrict__ nCells,
+                                              TkSoA<TrackerTraits> *__restrict__ tracks,
+                                              bool dupPassThrough) {
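+    // among the tracks sharing this cell, pairs are compared only when their 1/pt and
+    // cot(theta) (state components 2 and 3) agree within 5 sigma (nSigma2 = 25 = 5^2);
+    // the lower-quality track, or at equal quality the one with the larger |tip| score,
+    // is then marked as a duplicate.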
+    // quality to mark rejected
+    auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
+    constexpr auto loose = pixelTrack::Quality::loose;
+
+    assert(nCells);
+
+    auto first = threadIdx.x + blockIdx.x * blockDim.x;
+    for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
+      auto const &thisCell = cells[idx];
+      if (thisCell.tracks().size() < 2)
+        continue;

-    float mc = maxScore;
-    uint16_t im = tkNotFound;
+      float mc = maxScore;
+      uint16_t im = tkNotFound;
+
+      /* chi2 penalize higher-pt tracks (try rescale it?)
       auto score = [&](auto it) {
-        return tracks->nLayers(it) < 4 ?
+        return tracks->nLayers(it) < 4 ?
                std::abs(tracks->tip(it)) :  // tip for triplets
                tracks->chi2(it);            //chi2 for quads
       };
       */
-    auto score = [&](auto it) { return std::abs(tracks->tip(it)); };
+      auto score = [&](auto it) { return std::abs(tracks->tip(it)); };

-    // full crazy combinatorics
-    int ntr = thisCell.tracks().size();
-    for (int i = 0; i < ntr - 1; ++i) {
-      auto it = thisCell.tracks()[i];
-      auto qi = tracks->quality(it);
-      if (qi <= reject)
-        continue;
-      auto opi = tracks->stateAtBS.state(it)(2);
-      auto e2opi = tracks->stateAtBS.covariance(it)(9);
-      auto cti = tracks->stateAtBS.state(it)(3);
-      auto e2cti = tracks->stateAtBS.covariance(it)(12);
-      for (auto j = i + 1; j < ntr; ++j) {
-        auto jt = thisCell.tracks()[j];
-        auto qj = tracks->quality(jt);
-        if (qj <= reject)
+      // full crazy combinatorics
+      int ntr = thisCell.tracks().size();
+      for (int i = 0; i < ntr - 1; ++i) {
+        auto it = thisCell.tracks()[i];
+        auto qi = tracks->quality(it);
+        if (qi <= reject)
           continue;
-        auto opj = tracks->stateAtBS.state(jt)(2);
-        auto ctj = tracks->stateAtBS.state(jt)(3);
-        auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
-        if ((cti - ctj) * (cti - ctj) > dct)
+        auto opi = tracks->stateAtBS.state(it)(2);
+        auto e2opi = tracks->stateAtBS.covariance(it)(9);
+        auto cti = tracks->stateAtBS.state(it)(3);
+        auto e2cti = tracks->stateAtBS.covariance(it)(12);
+        for (auto j = i + 1; j < ntr; ++j) {
+          auto jt = thisCell.tracks()[j];
+          auto qj = tracks->quality(jt);
+          if (qj <= reject)
+            continue;
+          auto opj = tracks->stateAtBS.state(jt)(2);
+          auto ctj = tracks->stateAtBS.state(jt)(3);
+          auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
+          if ((cti - ctj) * (cti - ctj) > dct)
+            continue;
+          auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi);
+          if ((opi - opj) * (opi - opj) > dop)
+            continue;
+          if ((qj < qi) || (qj == qi && score(it) < score(jt)))
+            tracks->quality(jt) = reject;
+          else {
+            tracks->quality(it) = reject;
+            break;
+          }
        }
      }
-    }

      // find maxQual
      auto maxQual = reject;  // no duplicate!
+ for (auto it : thisCell.tracks()) { + if (tracks->quality(it) > maxQual) + maxQual = tracks->quality(it); + } - if (maxQual <= loose) - continue; + if (maxQual <= loose) + continue; - // find min score - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == maxQual && score(it) < mc) { - mc = score(it); - im = it; + // find min score + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == maxQual && score(it) < mc) { + mc = score(it); + im = it; + } } - } - if (tkNotFound == im) - continue; + if (tkNotFound == im) + continue; - // mark all other duplicates (not yet, keep it loose) - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > loose && it != im) - tracks->quality(it) = loose; //no race: simple assignment of the same constant + // mark all other duplicates (not yet, keep it loose) + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) > loose && it != im) + tracks->quality(it) = loose; //no race: simple assignment of the same constant + } } } -} - -__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, - cms::cuda::AtomicPairCounter *apc2, // just to zero them, - GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *cells, - uint32_t const *__restrict__ nCells, - gpuPixelDoublets::CellNeighborsVector *cellNeighbors, - GPUCACell::OuterHitOfCell const isOuterHitOfCell, - float hardCurvCut, - float ptmin, - float CAThetaCutBarrel, - float CAThetaCutForward, - float dcaCutInnerTriplet, - float dcaCutOuterTriplet) { - auto const &hh = *hhp; - - auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; - auto first = threadIdx.x; - auto stride = blockDim.x; - - if (0 == (firstCellIndex + first)) { - (*apc1) = 0; - (*apc2) = 0; - } // ready for next kernel - - for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) { - auto cellIndex = idx; - auto &thisCell = cells[idx]; - auto innerHitId = thisCell.inner_hit_id(); - if (int(innerHitId) < isOuterHitOfCell.offset) - continue; - int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); - auto vi = isOuterHitOfCell[innerHitId].data(); - - auto ri = thisCell.inner_r(hh); - auto zi = thisCell.inner_z(hh); - - auto ro = thisCell.outer_r(hh); - auto zo = thisCell.outer_z(hh); - auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex; - - for (int j = first; j < numberOfPossibleNeighbors; j += stride) { - auto otherCell = __ldg(vi + j); - auto &oc = cells[otherCell]; - auto r1 = oc.inner_r(hh); - auto z1 = oc.inner_z(hh); - bool aligned = GPUCACell::areAlignedRZ( - r1, - z1, - ri, - zi, - ro, - zo, - ptmin, - isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - if (aligned && thisCell.dcaCut(hh, - oc, - oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? 
dcaCutInnerTriplet - : dcaCutOuterTriplet, - hardCurvCut)) { // FIXME tune cuts - oc.addOuterNeighbor(cellIndex, *cellNeighbors); - thisCell.setStatusBits(GPUCACell::StatusBit::kUsed); - oc.setStatusBits(GPUCACell::StatusBit::kUsed); + + template + __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, + cms::cuda::AtomicPairCounter *apc2, // just to zero them, + Hits const *__restrict__ hhp, + GPUCACellT *cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector *cellNeighbors, + OuterHitOfCell const isOuterHitOfCell, + CAParams params) { + using Cell = GPUCACellT; + auto const &hh = *hhp; + + auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + + if (0 == (firstCellIndex + first)) { + (*apc1) = 0; + (*apc2) = 0; + } // ready for next kernel + + constexpr uint32_t last_bpix1_detIndex = TrackerTraits::last_bpix1_detIndex; + constexpr uint32_t last_barrel_detIndex = TrackerTraits::last_barrel_detIndex; + for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) { + auto cellIndex = idx; + auto &thisCell = cells[idx]; + auto innerHitId = thisCell.inner_hit_id(); + if (int(innerHitId) < isOuterHitOfCell.offset) + continue; + int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); + + auto ri = thisCell.inner_r(hh); + auto zi = thisCell.inner_z(hh); + + auto ro = thisCell.outer_r(hh); + auto zo = thisCell.outer_z(hh); + auto isBarrel = thisCell.inner_detIndex(hh) < last_barrel_detIndex; + + for (int j = first; j < numberOfPossibleNeighbors; j += stride) { + auto otherCell = __ldg(vi + j); + auto &oc = cells[otherCell]; + auto r1 = oc.inner_r(hh); + auto z1 = oc.inner_z(hh); + bool aligned = Cell::areAlignedRZ( + r1, + z1, + ri, + zi, + ro, + zo, + params.ptmin_, + isBarrel ? params.CAThetaCutBarrel_ : params.CAThetaCutForward_); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && thisCell.dcaCut(hh, + oc, + oc.inner_detIndex(hh) < last_bpix1_detIndex ? params.dcaCutInnerTriplet_ + : params.dcaCutOuterTriplet_, + params.hardCurvCut_)) { // FIXME tune cuts + oc.addOuterNeighbor(cellIndex, *cellNeighbors); + thisCell.setStatusBits(Cell::StatusBit::kUsed); + oc.setStatusBits(Cell::StatusBit::kUsed); + } + } // loop on inner cells + } // loop on outer cells + } + + template + __global__ void kernel_find_ntuplets(Hits const *__restrict__ hhp, + GPUCACellT *__restrict__ cells, + uint32_t const *nCells, + CellTracksVector *cellTracks, + HitContainer *foundNtuplets, + cms::cuda::AtomicPairCounter *apc, + Quality *__restrict__ quality, + CAParams params) { + // recursive: not obvious to widen + auto const &hh = *hhp; + + using Cell = GPUCACellT; + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + +#ifdef GPU_DEBUG + if (first == 0) + printf("starting producing ntuplets from %d cells \n", *nCells); +#endif + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + + if (thisCell.isKilled()) + continue; // cut by earlyFishbone + + // we require at least three hits... 
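[Editor's note] Two remarks on the lines that follow. First, a cell whose outer-neighbor list is empty can belong to at most a doublet, so it cannot seed an ntuplet and is skipped. Second, params.startingLayerPair(pid) generalizes the hard-coded layer-pair selection visible in the removed code further below (`auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12;`). A sketch of what the Phase1 predicate presumably encodes; this is an assumption, since CAParams itself is not shown in this excerpt:

    // Hypothetical reconstruction for Phase1, mirroring the removed hard-coded cut:
    // quadruplet seeding starts only from the three BPix1 pairs (pid < 3), while
    // triplet seeding also allows the remaining pairs except pids 8..12.
    constexpr bool startingLayerPairPhase1(unsigned int minHitsPerNtuplet, int pid) {
      return minHitsPerNtuplet > 3 ? pid < 3 : (pid < 8 || pid > 12);
    }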
+ if (thisCell.outerNeighbors().empty()) + continue; + + auto pid = thisCell.layerPairId(); + bool doit = params.startingLayerPair(pid); + + constexpr uint32_t maxDepth = TrackerTraits::maxDepth; + if (doit) { + typename Cell::TmpTuple stack; + stack.reset(); + + bool bpix1Start = params.startAt0(pid); + + thisCell.template find_ntuplets( + hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, params.minHitsPerNtuplet_, bpix1Start); + + assert(stack.empty()); } - } // loop on inner cells - } // loop on outer cells -} - -__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *__restrict__ cells, - uint32_t const *nCells, - gpuPixelDoublets::CellTracksVector *cellTracks, - HitContainer *foundNtuplets, - cms::cuda::AtomicPairCounter *apc, - Quality *__restrict__ quality, - unsigned int minHitsPerNtuplet) { - // recursive: not obvious to widen - auto const &hh = *hhp; - - auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - auto const &thisCell = cells[idx]; - if (thisCell.isKilled()) - continue; // cut by earlyFishbone - // we require at least three hits... - if (thisCell.outerNeighbors().empty()) - continue; - auto pid = thisCell.layerPairId(); - auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12; - if (doit) { - GPUCACell::TmpTuple stack; - stack.reset(); - thisCell.find_ntuplets<6>( - hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); - assert(stack.empty()); - // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); } } -} - -__global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *nCells) { - auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - auto &thisCell = cells[idx]; - if (!thisCell.tracks().empty()) - thisCell.setStatusBits(GPUCACell::StatusBit::kInTrack); + template + __global__ void kernel_mark_used(GPUCACellT *__restrict__ cells, uint32_t const *nCells) { + auto first = threadIdx.x + blockIdx.x * blockDim.x; + using Cell = GPUCACellT; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto &thisCell = cells[idx]; + if (!thisCell.tracks().empty()) + thisCell.setStatusBits(Cell::StatusBit::kInTrack); + } } -} -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { - auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); - if (nhits < 3) - continue; - if (quality[it] == pixelTrack::Quality::edup) - continue; - assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 7) // current limit - printf("wrong mult %d %d\n", it, nhits); - assert(nhits <= caConstants::maxHitsOnTrack); - tupleMultiplicity->count(nhits); + template + __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + TupleMultiplicity *tupleMultiplicity) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + continue; + if (quality[it] == pixelTrack::Quality::edup) + continue; + assert(quality[it] == 
pixelTrack::Quality::bad); + if (nhits > TrackerTraits::maxHitsOnTrack) // current limit + printf("wrong mult %d %d\n", it, nhits); + assert(nhits <= TrackerTraits::maxHitsOnTrack); + tupleMultiplicity->count(nhits); + } } -} -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { - auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); - if (nhits < 3) - continue; - if (quality[it] == pixelTrack::Quality::edup) - continue; - assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 7) - printf("wrong mult %d %d\n", it, nhits); - assert(nhits <= caConstants::maxHitsOnTrack); - tupleMultiplicity->fill(nhits, it); - } -} - -__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, - CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, - Quality *__restrict__ quality) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = tuples->size(it); - if (nhits == 0) - break; // guard - - // if duplicate: not even fit - if (quality[it] == pixelTrack::Quality::edup) - continue; - - assert(quality[it] == pixelTrack::Quality::bad); - - // mark doublets as bad - if (nhits < 3) - continue; - - // if the fit has any invalid parameters, mark it as bad - bool isNaN = false; - for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); - } - if (isNaN) { -#ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); -#endif - continue; + template + __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + TupleMultiplicity *tupleMultiplicity) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + continue; + if (quality[it] == pixelTrack::Quality::edup) + continue; + assert(quality[it] == pixelTrack::Quality::bad); + if (nhits > TrackerTraits::maxHitsOnTrack) + printf("wrong mult %d %d\n", it, nhits); + assert(nhits <= TrackerTraits::maxHitsOnTrack); + tupleMultiplicity->fill(nhits, it); } + } - quality[it] = pixelTrack::Quality::strict; - - // compute a pT-dependent chi2 cut - - auto roughLog = [](float x) { - // max diff [0.5,12] at 1.25 0.16143 - // average diff 0.0662998 - union IF { - uint32_t i; - float f; - }; - IF z; - z.f = x; - uint32_t lsb = 1 < 21; - z.i += lsb; - z.i >>= 21; - auto f = z.i & 3; - int ex = int(z.i >> 2) - 127; - - // log2(1+0.25*f) - // averaged over bins - const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; - return float(ex) + frac[f]; - }; + template + __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, + TkSoA const *__restrict__ tracks, + QualityCuts cuts, + Quality *__restrict__ quality) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tuples->size(it); + if (nhits == 0) + break; // guard + + // if duplicate: not even fit + if (quality[it] == pixelTrack::Quality::edup) + continue; + + assert(quality[it] == pixelTrack::Quality::bad); 
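[Editor's note] The cuts.strictCut(tracks, it) call a few lines below replaces the inline pT-dependent chi2 cut of the removed code above: chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]), with pt clamped to chi2MaxPt. Because makeQualityCuts (later in this patch) remaps coeff[1] to (coeff[1] - coeff[0]) / log2(ptMax), the cut runs from about chi2Scale * coeff[0] at 1 GeV (log2 = 0) up to chi2Scale times the configured coeff[1] at ptMax. roughLog itself is a fast log2 built on the float bit pattern; note that the removed `uint32_t lsb = 1 < 21;` evaluates to 1 (a comparison, not a shift), so the subsequent `z.i += lsb` is effectively a no-op, which is consistent with the frac[] table, whose entries are averages of log2 over truncated-mantissa bins. A standalone sketch, mirroring the original (including its union type-pun) minus that no-op line:

    // Illustrative reconstruction, not part of the patch; valid for positive x.
    #include <cstdint>

    inline float roughLog2(float x) {
      union IF {
        uint32_t i;
        float f;
      } z;
      z.f = x;
      z.i >>= 21;                    // keep the exponent plus the top 2 mantissa bits
      int ex = int(z.i >> 2) - 127;  // unbiased exponent, i.e. floor(log2(x))
      // average of log2(1 + m/4) over each of the four truncated-mantissa bins
      const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f};
      return float(ex) + frac[z.i & 3];
    }

Per the original comments, the approximation is accurate to about 0.16 at worst (near 1.25) and 0.066 on average over [0.5, 12].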
+ + // mark doublets as bad + if (nhits < 3) + continue; - // (see CAHitNtupletGeneratorGPU.cc) - float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); - float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]); - if (tracks->chi2(it) >= chi2Cut) { -#ifdef NTUPLE_FIT_DEBUG - printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n", - it, - tuples->size(it), - tracks->pt(it), - tracks->eta(it), - tracks->chi2(it)); + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + } + if (isNaN) { +#ifdef NTUPLE_DEBUG + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); #endif - continue; - } + continue; + } - quality[it] = pixelTrack::Quality::tight; + quality[it] = pixelTrack::Quality::strict; - // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) - // default cuts: - // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm - // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm - // (see CAHitNtupletGeneratorGPU.cc) - auto const ®ion = (nhits > 3) ? cuts.quadruplet : cuts.triplet; - bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); + if (cuts.strictCut(tracks, it)) + continue; - if (isOk) - quality[it] = pixelTrack::Quality::highPurity; - } -} + quality[it] = pixelTrack::Quality::tight; -__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::Counters *counters) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; //guard - if (quality[idx] < pixelTrack::Quality::loose) - continue; - atomicAdd(&(counters->nLooseTracks), 1); - if (quality[idx] < pixelTrack::Quality::strict) - continue; - atomicAdd(&(counters->nGoodTracks), 1); + if (cuts.isHP(tracks, nhits, it)) + quality[it] = pixelTrack::Quality::highPurity; + } } -} -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->count(*h); - } -} - -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->fill(*h, idx); - } -} - -__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer *__restrict__ hitDetIndices) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - // copy offsets - for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - hitDetIndices->off[idx] = tuples->off[idx]; + template + __global__ 
void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + Counters *counters) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; //guard + if (quality[idx] < pixelTrack::Quality::loose) + continue; + atomicAdd(&(counters->nLooseTracks), 1); + if (quality[idx] < pixelTrack::Quality::strict) + continue; + atomicAdd(&(counters->nGoodTracks), 1); + } } - // fill hit indices - auto const &hh = *hhp; - auto nhits = hh.nHits(); - for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->content[idx] < nhits); - hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + + template + __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; // guard + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->count(*h); + } } -} - -__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { - auto &tracks = *ptracks; - auto first = blockIdx.x * blockDim.x + threadIdx.x; - // clamp the number of tracks to the capacity of the SoA - auto ntracks = std::min(apc->get().m, tracks.stride() - 1); - if (0 == first) - tracks.setNTracks(ntracks); - for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); - assert(nHits >= 3); - tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + + template + __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; // guard + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fill(*h, idx); + } } -} - -__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple, - CAHitNtupletGeneratorKernelsGPU::Counters *counters) { - auto &c = *counters; - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple->size(idx) == 0) - continue; // SHALL NOT BE break - atomicAdd(&c.nUsedHits, 1); - if (hitToTuple->size(idx) > 1) - atomicAdd(&c.nDupHits, 1); + + template + __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, + HitsView const *__restrict__ hhp, + HitContainer *__restrict__ hitDetIndices) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + // copy offsets + for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + hitDetIndices->off[idx] = tuples->off[idx]; + } + // fill hit indices + auto const &hh = *hhp; + auto nhits = hh.nHits(); + + for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tuples->content[idx] < nhits); + hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + } } -} -__global__ void kernel_countSharedHit(int *__restrict__ nshared, - 
HitContainer const *__restrict__ ptuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - constexpr auto loose = pixelTrack::Quality::loose; + template + __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { + auto &tracks = *ptracks; + auto first = blockIdx.x * blockDim.x + threadIdx.x; + // clamp the number of tracks to the capacity of the SoA + auto ntracks = std::min(apc->get().m, tracks.stride() - 1); + if (0 == first) + tracks.setNTracks(ntracks); + for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = tracks.nHits(idx); + assert(nHits >= 3); + tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + } + } - auto &hitToTuple = *phitToTuple; - auto const &foundNtuplets = *ptuples; + template + __global__ void kernel_doStatsForHitInTracks(HitToTuple const *__restrict__ hitToTuple, + Counters *counters) { + auto &c = *counters; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple->size(idx) == 0) + continue; // SHALL NOT BE break + atomicAdd(&c.nUsedHits, 1); + if (hitToTuple->size(idx) > 1) + atomicAdd(&c.nDupHits, 1); + } + } - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; + template + __global__ void kernel_countSharedHit(int *__restrict__ nshared, + HitContainer const *__restrict__ ptuples, + Quality const *__restrict__ quality, + HitToTuple const *__restrict__ phitToTuple) { + constexpr auto loose = pixelTrack::Quality::loose; - int nt = 0; + auto &hitToTuple = *phitToTuple; + auto const &foundNtuplets = *ptuples; - // count "good" tracks - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < loose) + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) continue; - ++nt; - } - if (nt < 2) - continue; + int nt = 0; + + // count "good" tracks + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (quality[*it] < loose) + continue; + ++nt; + } - // now mark each track triplet as sharing a hit - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (foundNtuplets.size(*it) > 3) + if (nt < 2) continue; - atomicAdd(&nshared[*it], 1); - } - } // hit loop -} - -__global__ void kernel_markSharedHit(int const *__restrict__ nshared, - HitContainer const *__restrict__ tuples, - Quality *__restrict__ quality, - bool dupPassThrough) { - // constexpr auto bad = pixelTrack::Quality::bad; - constexpr auto dup = pixelTrack::Quality::dup; - constexpr auto loose = pixelTrack::Quality::loose; - // constexpr auto strict = pixelTrack::Quality::strict; - - // quality to mark rejected - auto const reject = dupPassThrough ? 
loose : dup; - - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; //guard - if (quality[idx] <= reject) - continue; - if (nshared[idx] > 2) - quality[idx] = reject; + // now mark each track triplet as sharing a hit + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (foundNtuplets.size(*it) > 3) + continue; + atomicAdd(&nshared[*it], 1); + } + + } // hit loop } -} -// mostly for very forward triplets..... -__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, + template + __global__ void kernel_markSharedHit(int const *__restrict__ nshared, + HitContainer const *__restrict__ tuples, Quality *__restrict__ quality, - uint16_t nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; - - auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; + bool dupPassThrough) { + // constexpr auto bad = pixelTrack::Quality::bad; + constexpr auto dup = pixelTrack::Quality::dup; + constexpr auto loose = pixelTrack::Quality::loose; + // constexpr auto strict = pixelTrack::Quality::strict; + + // quality to mark rejected + auto const reject = dupPassThrough ? loose : dup; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; //guard + if (quality[idx] <= reject) + continue; + if (nshared[idx] > 2) + quality[idx] = reject; + } + } - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; + // mostly for very forward triplets..... + template + __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) { + // quality to mark rejected + auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + + auto &hitToTuple = *phitToTuple; + auto const &tracks = *ptracks; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; - /* chi2 is bad for large pt + /* chi2 is bad for large pt auto score = [&](auto it, auto nl) { return nl < 4 ? 
std::abs(tracks.tip(it)) : // tip for triplets tracks.chi2(it); //chi2 }; */ - auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; + auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; - // full combinatorics - for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { - auto const it = *ip; - auto qi = quality[it]; - if (qi <= reject) - continue; - auto opi = tracks.stateAtBS.state(it)(2); - auto e2opi = tracks.stateAtBS.covariance(it)(9); - auto cti = tracks.stateAtBS.state(it)(3); - auto e2cti = tracks.stateAtBS.covariance(it)(12); - auto nli = tracks.nLayers(it); - for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { - auto const jt = *jp; - auto qj = quality[jt]; - if (qj <= reject) - continue; - auto opj = tracks.stateAtBS.state(jt)(2); - auto ctj = tracks.stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); - if ((cti - ctj) * (cti - ctj) > dct) + // full combinatorics + for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { + auto const it = *ip; + auto qi = quality[it]; + if (qi <= reject) continue; - auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); - if ((opi - opj) * (opi - opj) > dop) - continue; - auto nlj = tracks.nLayers(jt); - if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) - quality[jt] = reject; - else { - quality[it] = reject; - break; + auto opi = tracks.stateAtBS.state(it)(2); + auto e2opi = tracks.stateAtBS.covariance(it)(9); + auto cti = tracks.stateAtBS.state(it)(3); + auto e2cti = tracks.stateAtBS.covariance(it)(12); + auto nli = tracks.nLayers(it); + for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { + auto const jt = *jp; + auto qj = quality[jt]; + if (qj <= reject) + continue; + auto opj = tracks.stateAtBS.state(jt)(2); + auto ctj = tracks.stateAtBS.state(jt)(3); + auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); + if ((cti - ctj) * (cti - ctj) > dct) + continue; + auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); + if ((opi - opj) * (opi - opj) > dop) + continue; + auto nlj = tracks.nLayers(jt); + if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) + quality[jt] = reject; + else { + quality[it] = reject; + break; + } } } } } -} -__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, - int nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; - // quality of longest track - auto const longTqual = pixelTrack::Quality::highPurity; + template + __global__ void kernel_sharedHitCleaner(HitsView const *__restrict__ hhp, + TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + int nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) { + // quality to mark rejected + auto const reject = dupPassThrough ? 
pixelTrack::Quality::loose : pixelTrack::Quality::dup; + // quality of longest track + auto const longTqual = pixelTrack::Quality::highPurity; + + auto &hitToTuple = *phitToTuple; + auto const &tracks = *ptracks; + + auto const &hh = *hhp; + int l1end = hh.hitsLayerStart()[1]; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; - auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; + int8_t maxNl = 0; - auto const &hh = *hhp; - int l1end = hh.hitsLayerStart()[1]; + // find maxNl + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (quality[*it] < longTqual) + continue; + // if (tracks.nHits(*it)==3) continue; + auto nl = tracks.nLayers(*it); + maxNl = std::max(nl, maxNl); + } - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; + if (maxNl < 4) + continue; - int8_t maxNl = 0; + // quad pass through (leave for tests) + // maxNl = std::min(4, maxNl); - // find maxNl - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < longTqual) - continue; - // if (tracks.nHits(*it)==3) continue; - auto nl = tracks.nLayers(*it); - maxNl = std::max(nl, maxNl); + // kill all tracks shorter than maxNl (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + auto nl = tracks.nLayers(*it); + + // skip the cut when the shared hit is on bpix1 and the tuple is long enough + if (idx < l1end and nl > nmin) + continue; + + if (nl < maxNl && quality[*it] > reject) + quality[*it] = reject; + } } + } - if (maxNl < 4) - continue; + template + __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) { + // quality to mark rejected + auto const reject = pixelTrack::Quality::loose; + /// min quality of good + auto const good = pixelTrack::Quality::strict; + + auto &hitToTuple = *phitToTuple; + auto const &tracks = *ptracks; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; - // quad pass through (leave for tests) - // maxNl = std::min(4, maxNl); + float mc = maxScore; + uint16_t im = tkNotFound; + bool onlyTriplets = true; - // kill all tracks shorter than maxHl (only triplets??? - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks.nLayers(*it); + // check if only triplets + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (quality[*it] <= good) + continue; + onlyTriplets &= tracks.isTriplet(*it); + if (!onlyTriplets) + break; + } - //checking if shared hit is on bpix1 and if the tuple is short enough - if (idx < l1end and nl > nmin) + // only triplets + if (!onlyTriplets) continue; - if (nl < maxNl && quality[*it] > reject) - quality[*it] = reject; - } + // for triplets choose best tip! (should we first find best quality???)
+ for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } + } + + if (tkNotFound == im) + continue; + + // mark worse ambiguities + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] > reject && it != im) + quality[it] = reject; //no race: simple assignment of the same constant + } + + } // loop over hits } -} - -__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, - uint16_t nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; - /// min quality of good - auto const good = pixelTrack::Quality::strict; - - auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; - - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; - - float mc = maxScore; - uint16_t im = tkNotFound; - bool onlyTriplets = true; - - // check if only triplets - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] <= good) + + template + __global__ void kernel_simpleTripletCleaner(TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) { + // quality to mark rejected + auto const reject = pixelTrack::Quality::loose; + /// min quality of good + auto const good = pixelTrack::Quality::loose; + + auto &hitToTuple = *phitToTuple; + auto const &tracks = *ptracks; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) continue; - onlyTriplets &= tracks.isTriplet(*it); - if (!onlyTriplets) - break; - } - // only triplets - if (!onlyTriplets) - continue; + float mc = maxScore; + uint16_t im = tkNotFound; - // for triplets choose best tip! (should we first find best quality???) - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); - im = it; + // choose best tip! (should we first find best quality???) 
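[Editor's note] The selection loop below and the demotion loop after it follow the same two-pass pattern as kernel_tripletCleaner above: first pick the single candidate to keep (the smallest |tip| among tracks at or above the good threshold), then demote every other candidate still above reject. The two kernels differ only in the good threshold (strict vs loose) and in whether the demotion is restricted to triplets. A host-side sketch of the shared pattern, with illustrative names:

    // Illustrative sketch, not part of the patch.
    #include <cmath>
    #include <cstdint>
    #include <limits>
    #include <vector>

    constexpr uint16_t tkNotFound = std::numeric_limits<uint16_t>::max();  // assumed sentinel
    constexpr float maxScore = std::numeric_limits<float>::max();          // assumed sentinel

    template <typename TipF, typename QualV>
    void keepBestDemoteRest(std::vector<uint16_t> const& sharing,  // tracks sharing one hit
                            TipF tip, QualV& quality, uint8_t good, uint8_t reject) {
      float mc = maxScore;
      uint16_t im = tkNotFound;
      for (auto it : sharing)  // pass 1: best |tip| among good candidates
        if (quality[it] >= good && std::abs(tip(it)) < mc) {
          mc = std::abs(tip(it));
          im = it;
        }
      if (im == tkNotFound)
        return;
      for (auto it : sharing)  // pass 2: demote everything else
        if (quality[it] > reject && it != im)
          quality[it] = reject;  // no race: same constant everywhere
    }

In the kernels the same two loops run once per shared hit, with quality values drawn from the pixelTrack::Quality enum.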
+ for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } } - } - - if (tkNotFound == im) - continue; - // mark worse ambiguities - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] > reject && it != im) - quality[it] = reject; //no race: simple assignment of the same constant - } + if (tkNotFound == im) + continue; - } // loop over hits -} - -__global__ void kernel_simpleTripletCleaner( - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, - uint16_t nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; - /// min quality of good - auto const good = pixelTrack::Quality::loose; - - auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; - - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; - - float mc = maxScore; - uint16_t im = tkNotFound; - - // choose best tip! (should we first find best quality???) - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); - im = it; + // mark worse ambiguities + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] > reject && tracks.isTriplet(it) && it != im) + quality[it] = reject; //no race: simple assignment of the same constant } - } - if (tkNotFound == im) - continue; + } // loop over hits + } - // mark worse ambiguities - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] > reject && tracks.isTriplet(it) && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + template + __global__ void kernel_print_found_ntuplets(HitsView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const *__restrict__ ptracks, + Quality const *__restrict__ quality, + HitToTuple const *__restrict__ phitToTuple, + int32_t firstPrint, + int32_t lastPrint, + int iev) { + constexpr auto loose = pixelTrack::Quality::loose; + auto const &hh = *hhp; + auto const &foundNtuplets = *ptuples; + auto const &tracks = *ptracks; + int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { + auto nh = foundNtuplets.size(i); + if (nh < 3) + continue; + if (quality[i] < loose) + continue; + printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", + 10000 * iev + i, + int(quality[i]), + nh, + tracks.nLayers(i), + tracks.charge(i), + tracks.pt(i), + tracks.eta(i), + tracks.phi(i), + tracks.tip(i), + tracks.zip(i), + // asinhf(fit_results[i].par(3)), + tracks.chi2(i), + hh.zGlobal(*foundNtuplets.begin(i)), + hh.zGlobal(*(foundNtuplets.begin(i) + 1)), + hh.zGlobal(*(foundNtuplets.begin(i) + 2)), + nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, + nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, + nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, + nh > 6 ? 
hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); } + } - } // loop over hits -} - -__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, - int32_t firstPrint, - int32_t lastPrint, - int iev) { - constexpr auto loose = pixelTrack::Quality::loose; - auto const &hh = *hhp; - auto const &foundNtuplets = *ptuples; - auto const &tracks = *ptracks; - int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { - auto nh = foundNtuplets.size(i); - if (nh < 3) - continue; - if (quality[i] < loose) - continue; - printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", - 10000 * iev + i, - int(quality[i]), - nh, - tracks.nLayers(i), - tracks.charge(i), - tracks.pt(i), - tracks.eta(i), - tracks.phi(i), - tracks.tip(i), - tracks.zip(i), - // asinhf(fit_results[i].par(3)), - tracks.chi2(i), - hh.zGlobal(*foundNtuplets.begin(i)), - hh.zGlobal(*(foundNtuplets.begin(i) + 1)), - hh.zGlobal(*(foundNtuplets.begin(i) + 2)), - nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, - nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, - nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, - nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); + __global__ void kernel_printCounters(Counters const *counters) { + auto const &c = *counters; + printf( + "||Counters | nEvents | nHits | nCells | nTuples | nFitTracks | nLooseTracks | nGoodTracks | nUsedHits | " + "nDupHits | " + "nFishCells | " + "nKilledCells | " + "nEmptyCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", + c.nEvents, + c.nHits, + c.nCells, + c.nTuples, + c.nFitTracks, + c.nLooseTracks, + c.nGoodTracks, + c.nUsedHits, + c.nDupHits, + c.nFishCells, + c.nKilledCells, + c.nEmptyCells, + c.nZeroTrackCells); + printf( + "Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| %.3f||\n", + c.nEvents, + c.nHits / double(c.nEvents), + c.nCells / double(c.nEvents), + c.nTuples / double(c.nEvents), + c.nFitTracks / double(c.nEvents), + c.nLooseTracks / double(c.nEvents), + c.nGoodTracks / double(c.nEvents), + c.nUsedHits / double(c.nEvents), + c.nDupHits / double(c.nEvents), + c.nFishCells / double(c.nCells), + c.nKilledCells / double(c.nCells), + c.nEmptyCells / double(c.nCells), + c.nZeroTrackCells / double(c.nCells)); } -} - -__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) { - auto const &c = *counters; - printf( - "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | nUsedHits | " - "nDupHits | " - "nFishCells | " - "nKilledCells | " - "nUsedCells | nZeroTrackCells ||\n"); - printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", - c.nEvents, - c.nHits, - c.nCells, - c.nTuples, - c.nFitTracks, - c.nLooseTracks, - c.nGoodTracks, - c.nUsedHits, - c.nDupHits, - c.nFishCells, - c.nKilledCells, - c.nEmptyCells, - c.nZeroTrackCells); - printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| %.3f||\n", - c.nEvents, - c.nHits / double(c.nEvents), - c.nCells / double(c.nEvents), - c.nTuples /
double(c.nEvents), - c.nFitTracks / double(c.nEvents), - c.nLooseTracks / double(c.nEvents), - c.nGoodTracks / double(c.nEvents), - c.nUsedHits / double(c.nEvents), - c.nDupHits / double(c.nEvents), - c.nFishCells / double(c.nCells), - c.nKilledCells / double(c.nCells), - c.nEmptyCells / double(c.nCells), - c.nZeroTrackCells / double(c.nCells)); -} + +} // namespace caHitNtupletGeneratorKernels diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index f650ca8ab2a08..6d9ac785155d2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -3,6 +3,7 @@ // // #define GPU_DEBUG +// #define DUMP_GPU_TK_TUPLES #include #include @@ -24,64 +25,115 @@ namespace { + using namespace caHitNtupletGenerator; + using namespace gpuPixelDoublets; + using namespace pixelTopology; + using namespace pixelTrack; + template T sqr(T x) { return x * x; } - cAHitNtupletGenerator::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) { - auto coeff = pset.getParameter>("chi2Coeff"); - auto ptMax = pset.getParameter("chi2MaxPt"); - if (coeff.size() != 2) { - throw edm::Exception(edm::errors::Configuration, - "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 2 elements"); + //Common Params + AlgoParams makeCommonParams(edm::ParameterSet const& cfg) { + return AlgoParams({cfg.getParameter("onGPU"), + cfg.getParameter("minHitsForSharingCut"), + cfg.getParameter("useRiemannFit"), + cfg.getParameter("fitNas4"), + cfg.getParameter("includeJumpingForwardDoublets"), + cfg.getParameter("earlyFishbone"), + cfg.getParameter("lateFishbone"), + cfg.getParameter("fillStatistics"), + cfg.getParameter("doSharedHitCut"), + cfg.getParameter("dupPassThrough"), + cfg.getParameter("useSimpleTripletCleaner")}); + } + + //This is needed to have the partial specialization for isPhase1Topology/isPhase2Topology + template + struct topologyCuts {}; + + template + struct topologyCuts> { + static constexpr CAParamsT makeCACuts(edm::ParameterSet const& cfg) { + return CAParamsT{{cfg.getParameter("minHitsPerNtuplet"), + (float)cfg.getParameter("ptmin"), + (float)cfg.getParameter("CAThetaCutBarrel"), + (float)cfg.getParameter("CAThetaCutForward"), + (float)cfg.getParameter("hardCurvCut"), + (float)cfg.getParameter("dcaCutInnerTriplet"), + (float)cfg.getParameter("dcaCutOuterTriplet")}}; + }; + + static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + auto coeff = pset.getParameter>("chi2Coeff"); + auto ptMax = pset.getParameter("chi2MaxPt"); + + coeff[1] = (coeff[1] - coeff[0]) / log2(ptMax); + return QualityCutsT{// polynomial coefficients for the pT-dependent chi2 cut + {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, + // max pT used to determine the chi2 cut + (float)ptMax, + // chi2 scale factor: 8 for broken line fit, ?? 
for Riemann fit + (float)pset.getParameter("chi2Scale"), + // regional cuts for triplets + {(float)pset.getParameter("tripletMaxTip"), + (float)pset.getParameter("tripletMinPt"), + (float)pset.getParameter("tripletMaxZip")}, + // regional cuts for quadruplets + {(float)pset.getParameter("quadrupletMaxTip"), + (float)pset.getParameter("quadrupletMinPt"), + (float)pset.getParameter("quadrupletMaxZip")}}; } - coeff[1] = (coeff[1] - coeff[0]) / log2(ptMax); - return cAHitNtupletGenerator::QualityCuts{// polynomial coefficients for the pT-dependent chi2 cut - {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, - // max pT used to determine the chi2 cut - (float)ptMax, - // chi2 scale factor: 8 for broken line fit, ?? for Riemann fit - (float)pset.getParameter("chi2Scale"), - // regional cuts for triplets - {(float)pset.getParameter("tripletMaxTip"), - (float)pset.getParameter("tripletMinPt"), - (float)pset.getParameter("tripletMaxZip")}, - // regional cuts for quadruplets - {(float)pset.getParameter("quadrupletMaxTip"), - (float)pset.getParameter("quadrupletMinPt"), - (float)pset.getParameter("quadrupletMaxZip")}}; + }; + + template + struct topologyCuts> { + static constexpr CAParamsT makeCACuts(edm::ParameterSet const& cfg) { + return CAParamsT{{cfg.getParameter("minHitsPerNtuplet"), + (float)cfg.getParameter("ptmin"), + (float)cfg.getParameter("CAThetaCutBarrel"), + (float)cfg.getParameter("CAThetaCutForward"), + (float)cfg.getParameter("hardCurvCut"), + (float)cfg.getParameter("dcaCutInnerTriplet"), + (float)cfg.getParameter("dcaCutOuterTriplet")}, + {(bool)cfg.getParameter("includeFarForwards")}}; + } + + static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + return QualityCutsT{ + (float)pset.getParameter("maxChi2"), + (float)pset.getParameter("minPt"), + (float)pset.getParameter("maxTip"), + (float)pset.getParameter("maxZip"), + }; + } + }; + + // Cell cuts: as they are, they have the same logic for Phase1 and Phase2; + // keeping them separate would allow further differentiation in the future, + // e.g. moving them into topologyCuts and using the same syntax + template + CellCutsT makeCellCuts(edm::ParameterSet const& cfg) { + return CellCutsT{cfg.getParameter("maxNumberOfDoublets"), + cfg.getParameter("doClusterCut"), + cfg.getParameter("doZ0Cut"), + cfg.getParameter("doPtCut"), + cfg.getParameter("idealConditions")}; + } } // namespace using namespace std; -CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC) - : m_params(cfg.getParameter("onGPU"), - cfg.getParameter("minHitsPerNtuplet"), - cfg.getParameter("maxNumberOfDoublets"), - cfg.getParameter("minHitsForSharingCut"), - cfg.getParameter("useRiemannFit"), - cfg.getParameter("fitNas4"), - cfg.getParameter("includeJumpingForwardDoublets"), - cfg.getParameter("earlyFishbone"), - cfg.getParameter("lateFishbone"), - cfg.getParameter("idealConditions"), - cfg.getParameter("fillStatistics"), - cfg.getParameter("doClusterCut"), - cfg.getParameter("doZ0Cut"), - cfg.getParameter("doPtCut"), - cfg.getParameter("doSharedHitCut"), - cfg.getParameter("dupPassThrough"), - cfg.getParameter("useSimpleTripletCleaner"), - cfg.getParameter("ptmin"), - cfg.getParameter("CAThetaCutBarrel"), - cfg.getParameter("CAThetaCutForward"), - cfg.getParameter("hardCurvCut"), - cfg.getParameter("dcaCutInnerTriplet"), - cfg.getParameter("dcaCutOuterTriplet"), - makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) { +template +CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const
edm::ParameterSet& cfg, + edm::ConsumesCollector& iC) + : m_params(makeCommonParams(cfg), + makeCellCuts(cfg), + topologyCuts::makeQualityCuts(cfg.getParameterSet("trackQualityCuts")), + topologyCuts::makeCACuts(cfg)) { #ifdef DUMP_GPU_TK_TUPLES printf("TK: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", "tid", @@ -104,7 +156,61 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& #endif } -void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { +template +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { + fillDescriptionsCommon(desc); + edm::LogWarning("CAHitNtupletGeneratorOnGPU::fillDescriptions") + << "Note: this fillDescriptions is a placeholder; most probably some parameters are missing. \n" + "Please implement the descriptions for your TrackerTraits in CAHitNtupletGeneratorOnGPU. \n"; +} + +template <> +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { + fillDescriptionsCommon(desc); + + desc.add("idealConditions", true); + desc.add("includeJumpingForwardDoublets", false); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); + trackQualityCuts.add>("chi2Coeff", {0.9, 1.8})->setComment("chi2 at 1GeV and at ptMax above"); + trackQualityCuts.add("chi2Scale", 8.) + ->setComment( + "Factor to multiply the pT-dependent chi2 cut (currently: 8 for the broken line fit, ?? for the Riemann " + "fit)"); + trackQualityCuts.add("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV"); + trackQualityCuts.add("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm"); + trackQualityCuts.add("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm"); + trackQualityCuts.add("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV"); + trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); + trackQualityCuts.add("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm"); + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment( + "Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply \"region " + "cuts\" based on the fit results (pT, Tip, Zip)."); +} + +template <> +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { + fillDescriptionsCommon(desc); + + desc.add("idealConditions", false); + desc.add("includeFarForwards", true); + desc.add("includeJumpingForwardDoublets", true); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("maxChi2", 5.)->setComment("Max normalized chi2"); + trackQualityCuts.add("minPt", 0.5)->setComment("Min pT in GeV"); + trackQualityCuts.add("maxTip", 0.3)->setComment("Max |Tip| in cm"); + trackQualityCuts.add("maxZip", 12.)->setComment("Max |Zip|, in cm"); + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment( + "Quality cuts based on the results of the track fit:\n - apply cuts based on the fit results (pT, Tip, " + "Zip)."); +} + +template +void CAHitNtupletGeneratorOnGPU::fillDescriptionsCommon(edm::ParameterSetDescription& desc) { // 87 cm/GeV = 1/(3.8T * 0.3) // take less than radius given by the hardPtCut and reject everything below // auto hardCurvCut = 1.f/(0.35 * 87.f); @@ -116,13 +222,12 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc.add("dcaCutOuterTriplet", 0.25f)->setComment("Cut
on origin radius when the outer hit is on BPix1"); desc.add("earlyFishbone", true); desc.add("lateFishbone", false); - desc.add("idealConditions", true); desc.add("fillStatistics", false); desc.add("minHitsPerNtuplet", 4); - desc.add("maxNumberOfDoublets", caConstants::maxNumberOfDoublets); + desc.add("maxNumberOfDoublets", TrackerTraits::maxNumberOfDoublets); desc.add("minHitsForSharingCut", 10) ->setComment("Maximum number of hits in a tuple to clean also if the shared hit is on bpx1"); - desc.add("includeJumpingForwardDoublets", false); + desc.add("fitNas4", false)->setComment("fit only 4 hits out of N"); desc.add("doClusterCut", true); desc.add("doZ0Cut", true); @@ -131,27 +236,10 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc.add("doSharedHitCut", true)->setComment("Sharing hit nTuples cleaning"); desc.add("dupPassThrough", false)->setComment("Do not reject duplicate"); desc.add("useSimpleTripletCleaner", true)->setComment("use alternate implementation"); - - edm::ParameterSetDescription trackQualityCuts; - trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); - trackQualityCuts.add>("chi2Coeff", {0.9, 1.8})->setComment("chi2 at 1GeV and at ptMax above"); - trackQualityCuts.add("chi2Scale", 8.) - ->setComment( - "Factor to multiply the pT-dependent chi2 cut (currently: 8 for the broken line fit, ?? for the Riemann " - "fit)"); - trackQualityCuts.add("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV"); - trackQualityCuts.add("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm"); - trackQualityCuts.add("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm"); - trackQualityCuts.add("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV"); - trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); - trackQualityCuts.add("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm"); - desc.add("trackQualityCuts", trackQualityCuts) - ->setComment( - "Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply \"region " - "cuts\" based on the fit results (pT, Tip, Zip)."); } -void CAHitNtupletGeneratorOnGPU::beginJob() { +template +void CAHitNtupletGeneratorOnGPU::beginJob() { if (m_params.onGPU_) { // allocate pinned host memory only if CUDA is available edm::Service cs; @@ -165,49 +253,58 @@ void CAHitNtupletGeneratorOnGPU::beginJob() { } } -void CAHitNtupletGeneratorOnGPU::endJob() { +template +void CAHitNtupletGeneratorOnGPU::endJob() { if (m_params.onGPU_) { // print the gpu statistics and free pinned host memory only if CUDA is available edm::Service cs; if (cs and cs->enabled()) { if (m_params.doStats_) { // crash on multi-gpu processes - CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); + CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); } cudaFree(m_counters); } } else { if (m_params.doStats_) { - CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); + CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); } delete m_counters; } } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); +template +PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuplesAsync( + HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const { + using HelixFitOnGPU = 
HelixFitOnGPU; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using GPUKernels = CAHitNtupletGeneratorKernelsGPU; + + PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); auto* soa = tracks.get(); assert(soa); + cudaCheck(cudaGetLastError()); - CAHitNtupletGeneratorKernelsGPU kernels(m_params); + GPUKernels kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), stream); + cudaCheck(cudaGetLastError()); kernels.buildDoublets(hits_d, stream); + cudaCheck(cudaGetLastError()); + kernels.launchKernels(hits_d, soa, stream); + cudaCheck(cudaGetLastError()); HelixFitOnGPU fitter(bfield, m_params.fitNas4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } kernels.classifyTuples(hits_d, soa, stream); - #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -217,13 +314,19 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH return tracks; } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); +template +PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_d, + float bfield) const { + using HelixFitOnGPU = HelixFitOnGPU; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using CPUKernels = CAHitNtupletGeneratorKernelsCPU; + + PixelTrackHeterogeneous tracks(std::make_unique()); auto* soa = tracks.get(); assert(soa); - CAHitNtupletGeneratorKernelsCPU kernels(m_params); + CPUKernels kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), nullptr); @@ -238,9 +341,9 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); } kernels.classifyTuples(hits_d, soa, nullptr); @@ -261,3 +364,6 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC return tracks; } + +template class CAHitNtupletGeneratorOnGPU; +template class CAHitNtupletGeneratorOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index ae4576d883530..745579b960b76 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -21,20 +21,33 @@ namespace edm { class ParameterSetDescription; } // namespace edm +template class 
CAHitNtupletGeneratorOnGPU { public: - using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DGPU; - using hindex_type = TrackingRecHit2DSOAView::hindex_type; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - using Quality = pixelTrack::Quality; - using OutputSoA = pixelTrack::TrackSoA; - using HitContainer = pixelTrack::HitContainer; + using HitsView = TrackingRecHit2DSOAViewT; + using HitsOnGPU = TrackingRecHit2DGPUT; + using HitsOnCPU = TrackingRecHit2DCPUT; + using hindex_type = typename HitsView::hindex_type; + + using HitToTuple = caStructures::HitToTupleT; + using TupleMultiplicity = caStructures::TupleMultiplicityT; + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using GPUCACell = GPUCACellT; + using OutputSoA = pixelTrack::TrackSoAT; + using HitContainer = typename OutputSoA::HitContainer; using Tuple = HitContainer; - using QualityCuts = cAHitNtupletGenerator::QualityCuts; - using Params = cAHitNtupletGenerator::Params; - using Counters = cAHitNtupletGenerator::Counters; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using CellTracksVector = caStructures::CellTracksVectorT; + + using Quality = pixelTrack::Quality; + + using QualityCuts = pixelTrack::QualityCutsT; + using Params = caHitNtupletGenerator::ParamsT; + using Counters = caHitNtupletGenerator::Counters; public: CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC) @@ -42,21 +55,22 @@ class CAHitNtupletGeneratorOnGPU { CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC); static void fillDescriptions(edm::ParameterSetDescription& desc); - static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } + static void fillDescriptionsCommon(edm::ParameterSetDescription& desc); + //static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } void beginJob(); void endJob(); - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + PixelTrackHeterogeneous makeTuplesAsync(HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const; - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + PixelTrackHeterogeneous makeTuples(HitsOnCPU const& hits_d, float bfield) const; private: - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; + void buildDoublets(HitsOnGPU const& hh, cudaStream_t stream) const; - void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); + void hitNtuplets(HitsOnGPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); - void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; + void launchKernels(HitsOnGPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; Params m_params; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAStructures.h b/RecoPixelVertexing/PixelTriplets/plugins/CAStructures.h new file mode 100644 index 0000000000000..21f9d474c683c --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAStructures.h @@ -0,0 +1,55 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAStructures_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAStructures_h + +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +namespace caStructures { + + // types + //
using typename TrackerTraits::hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct + // using typename TrackerTraits::tindex_type = uint32_t; // for tuples + // using typename TrackerTraits::cindex_type = uint32_t; // for cells + + template + using CellNeighborsT = cms::cuda::VecArray; + + template + using CellTracksT = cms::cuda::VecArray; + + template + using CellNeighborsVectorT = cms::cuda::SimpleVector>; + + template + using CellTracksVectorT = cms::cuda::SimpleVector>; + + template + using OuterHitOfCellContainerT = cms::cuda::VecArray; + + template + using TupleMultiplicityT = cms::cuda::OneToManyAssoc; + + template + using HitToTupleT = cms::cuda::OneToManyAssoc; // 3.5 should be enough + + template + using TuplesContainerT = cms::cuda::OneToManyAssoc; + + template + struct OuterHitOfCellT { + OuterHitOfCellContainerT* container; + int32_t offset; + constexpr auto& operator[](int i) { return container[i - offset]; } + constexpr auto const& operator[](int i) const { return container[i - offset]; } + }; + +} // namespace caStructures + +#endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 2e3747a2b6841..965889abcb268 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h -#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h +#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACellT_h +#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACellT_h // // Author: Felice Pantaleo, CERN @@ -15,33 +15,36 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CAConstants.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "CAStructures.h" -class GPUCACell { +template +class GPUCACellT { public: using PtrAsInt = unsigned long long; - static constexpr auto maxCellsPerHit = caConstants::maxCellsPerHit; - static constexpr auto invalidHitId = std::numeric_limits::max(); - using OuterHitOfCellContainer = caConstants::OuterHitOfCellContainer; - using OuterHitOfCell = caConstants::OuterHitOfCell; - using CellNeighbors = caConstants::CellNeighbors; - using CellTracks = caConstants::CellTracks; - using CellNeighborsVector = caConstants::CellNeighborsVector; - using CellTracksVector = caConstants::CellTracksVector; + static constexpr auto maxCellsPerHit = TrackerTraits::maxCellsPerHit; + using OuterHitOfCellContainer = caStructures::OuterHitOfCellContainerT; + using OuterHitOfCell = caStructures::OuterHitOfCellT; + using CellNeighbors = caStructures::CellNeighborsT; + using CellTracks = caStructures::CellTracksT; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using CellTracksVector = caStructures::CellTracksVectorT; - using Hits = TrackingRecHit2DSOAView; - using hindex_type = Hits::hindex_type; + using Hits = TrackingRecHit2DSOAViewT; + using hindex_type = typename TrackerTraits::hindex_type; + using tindex_type = typename TrackerTraits::tindex_type; + static constexpr auto invalidHitId = std::numeric_limits::max(); - using TmpTuple = cms::cuda::VecArray; + using TmpTuple = cms::cuda::VecArray; - using HitContainer = pixelTrack::HitContainer; + using HitContainer = pixelTrack::HitContainerT; using Quality = pixelTrack::Quality; static 
constexpr auto bad = pixelTrack::Quality::bad; enum class StatusBit : uint16_t { kUsed = 1, kInTrack = 2, kKilled = 1 << 15 }; - GPUCACell() = default; + GPUCACellT() = default; __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, @@ -66,7 +69,8 @@ class GPUCACell { assert(tracks().empty()); } - __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + __device__ __forceinline__ int addOuterNeighbor(typename TrackerTraits::cindex_type t, + CellNeighborsVector& cellNeighbors) { // use smart cache if (outerNeighbors().empty()) { auto i = cellNeighbors.extend(); // maybe wasted.... @@ -88,7 +92,7 @@ class GPUCACell { return outerNeighbors().push_back(t); } - __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + __device__ __forceinline__ int addTrack(tindex_type t, CellTracksVector& cellTracks) { if (tracks().empty()) { auto i = cellTracks.extend(); // maybe wasted.... if (i > 0) { @@ -139,7 +143,7 @@ class GPUCACell { } __device__ bool check_alignment(Hits const& hh, - GPUCACell const& otherCell, + GPUCACellT const& otherCell, const float ptmin, const float hardCurvCut, const float caThetaCutBarrel, @@ -157,7 +161,7 @@ class GPUCACell { auto r1 = otherCell.inner_r(hh); auto z1 = otherCell.inner_z(hh); - auto isBarrel = otherCell.outer_detIndex(hh) < caConstants::last_barrel_detIndex; + auto isBarrel = otherCell.outer_detIndex(hh) < TrackerTraits::last_barrel_detIndex; bool aligned = areAlignedRZ(r1, z1, ri, @@ -168,8 +172,8 @@ class GPUCACell { isBarrel ? caThetaCutBarrel : caThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts return (aligned && dcaCut(hh, otherCell, - otherCell.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet - : dcaCutOuterTriplet, + otherCell.inner_detIndex(hh) < TrackerTraits::last_bpix1_detIndex ? dcaCutInnerTriplet + : dcaCutOuterTriplet, hardCurvCut)); // FIXME tune cuts } @@ -186,7 +190,7 @@ class GPUCACell { } __device__ inline bool dcaCut(Hits const& hh, - GPUCACell const& otherCell, + GPUCACellT const& otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { auto x1 = otherCell.inner_x(hh); @@ -222,11 +226,9 @@ class GPUCACell { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - __device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const { - using caConstants::first_ladder_bpx0; - using caConstants::max_ladder_bpx0; - using caConstants::module_length_bpx0; - using caConstants::module_tolerance_bpx0; + __device__ inline bool hole0(Hits const& hh, GPUCACellT const& innerCell) const { + using namespace phase1PixelTopology; + int p = innerCell.inner_iphi(hh); if (p < 0) p += std::numeric_limits::max(); @@ -245,11 +247,9 @@ class GPUCACell { return gap; } - __device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const { - using caConstants::first_ladder_bpx4; - using caConstants::max_ladder_bpx4; - using caConstants::module_length_bpx4; - using caConstants::module_tolerance_bpx4; + __device__ inline bool hole4(Hits const& hh, GPUCACellT const& innerCell) const { + using namespace phase1PixelTopology; + int p = outer_iphi(hh); if (p < 0) p += std::numeric_limits::max(); @@ -272,9 +272,10 @@ class GPUCACell { // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. 
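// [Editor's note -- illustration, not part of the patch] find_ntuplets below replaces
// the old explicit specialization find_ntuplets<0> (removed near the end of this
// file's diff) with a single recursive template guarded by `if constexpr`, so the
// recursion is bounded at compile time by the DEPTH non-type parameter and the
// depth-exhausted error path needs no separate definition. A minimal standalone
// sketch of the idiom, with hypothetical names (Node, children) used only for
// illustration:
//
//   template <int DEPTH>
//   void visit(Node const& n) {
//     if constexpr (DEPTH <= 0) {
//       std::abort();  // full depth reached: fail instead of instantiating further
//     } else {
//       for (Node const& c : n.children)
//         visit<DEPTH - 1>(c);  // each level instantiates a strictly shallower visit
//     }
//   }
//
// In the member-function version below the recursive call needs the `template`
// keyword, as in cells[otherCell].template find_ntuplets<DEPTH - 1>(...), because
// the callee depends on the TrackerTraits template parameter.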
+ template __device__ inline void find_ntuplets(Hits const& hh, - GPUCACell* __restrict__ cells, + GPUCACellT* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, cms::cuda::AtomicPairCounter& apc, @@ -288,50 +289,62 @@ class GPUCACell { // the ntuplets is then saved if the number of hits it contains is greater // than a threshold - auto doubletId = this - cells; - tmpNtuplet.push_back_unsafe(doubletId); - assert(tmpNtuplet.size() <= 5); - - bool last = true; - for (unsigned int otherCell : outerNeighbors()) { - if (cells[otherCell].isKilled()) - continue; // killed by earlyFishbone - last = false; - cells[otherCell].find_ntuplets( - hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0); - } - if (last) { // if long enough save... - if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { + if constexpr (DEPTH <= 0) { + printf("ERROR: GPUCACellT::find_ntuplets reached full depth!\n"); +#ifdef __CUDA_ARCH__ + __trap(); +#else + abort(); +#endif + } else { + auto doubletId = this - cells; + tmpNtuplet.push_back_unsafe(doubletId); + assert(tmpNtuplet.size() <= + int(TrackerTraits::maxHitsOnTrack - + 3)); //1 for the container, 1 because these are doublets, 1 because we may push another + + bool last = true; + for (unsigned int otherCell : outerNeighbors()) { + if (cells[otherCell].isKilled()) + continue; // killed by earlyFishbone + last = false; + cells[otherCell].template find_ntuplets( + hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0); + } + + if (last) { // if long enough save... + if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { #ifdef ONLY_TRIPLETS_IN_HOLE - // triplets accepted only pointing to the hole - if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) || - ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) + // triplets accepted only pointing to the hole + if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) || + ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) #endif - { - hindex_type hits[8]; - auto nh = 0U; - constexpr int maxFB = 2; // for the time being let's limit this - int nfb = 0; - for (auto c : tmpNtuplet) { - hits[nh++] = cells[c].theInnerHitId; - if (nfb < maxFB && cells[c].hasFishbone()) { - ++nfb; - hits[nh++] = cells[c].theFishboneId; // fishbone hit is always outer than inner hit + { + hindex_type hits[TrackerTraits::maxDepth + 2]; + auto nh = 0U; + constexpr int maxFB = 2; // for the time being let's limit this + int nfb = 0; + for (auto c : tmpNtuplet) { + hits[nh++] = cells[c].theInnerHitId; + if (nfb < maxFB && cells[c].hasFishbone()) { + ++nfb; + hits[nh++] = cells[c].theFishboneId; // fishbone hit is always outer than inner hit + } + } + assert(nh < TrackerTraits::maxHitsOnTrack); + hits[nh] = theOuterHitId; + auto it = foundNtuplets.bulkFill(apc, hits, nh + 1); + if (it >= 0) { // if negative is overflow.... + for (auto c : tmpNtuplet) + cells[c].addTrack(it, cellTracks); + quality[it] = bad; // initialize to bad } - } - assert(nh < caConstants::maxHitsOnTrack); - hits[nh] = theOuterHitId; - auto it = foundNtuplets.bulkFill(apc, hits, nh + 1); - if (it >= 0) { // if negative is overflow.... 
- for (auto c : tmpNtuplet) - cells[c].addTrack(it, cellTracks); - quality[it] = bad; // initialize to bad } } } + tmpNtuplet.pop_back(); + assert(tmpNtuplet.size() < int(TrackerTraits::maxHitsOnTrack - 1)); } - tmpNtuplet.pop_back(); - assert(tmpNtuplet.size() < 5); } // Cell status management @@ -370,22 +383,4 @@ class GPUCACell { hindex_type theFishboneId; }; -template <> -__device__ inline void GPUCACell::find_ntuplets<0>(Hits const& hh, - GPUCACell* __restrict__ cells, - CellTracksVector& cellTracks, - HitContainer& foundNtuplets, - cms::cuda::AtomicPairCounter& apc, - Quality* __restrict__ quality, - TmpTuple& tmpNtuplet, - const unsigned int minHitsPerNtuplet, - bool startAt0) const { - printf("ERROR: GPUCACell::find_ntuplets reached full depth!\n"); -#ifdef __CUDA_ARCH__ - __trap(); -#else - abort(); -#endif -} - -#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACellT_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index 880bdb47dfb5c..c300329a82208 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,9 +1,11 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, - TupleMultiplicity const *tupleMultiplicity, - OutputSoA *helix_fit_results) { +template +void HelixFitOnGPU::allocateOnGPU( + Tuples const *tuples, + caStructures::TupleMultiplicityT const *tupleMultiplicity, + pixelTrack::TrackSoAT *helix_fit_results) { tuples_ = tuples; tupleMultiplicity_ = tupleMultiplicity; outputSoa_ = helix_fit_results; @@ -13,4 +15,8 @@ void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, assert(outputSoa_); } -void HelixFitOnGPU::deallocateOnGPU() {} +template +void HelixFitOnGPU::deallocateOnGPU() {} + +template class HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 9a9c85970af33..78bec6f5e2a87 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -4,12 +4,13 @@ #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "CAConstants.h" +#include "CAStructures.h" namespace riemannFit { // in case of memory issue can be made smaller - constexpr uint32_t maxNumberOfConcurrentFits = caConstants::maxNumberOfTuples; + constexpr uint32_t maxNumberOfConcurrentFits = 32 * 1024; constexpr uint32_t stride = maxNumberOfConcurrentFits; using Matrix3x4d = Eigen::Matrix; using Map3x4d = Eigen::Map >; @@ -29,16 +30,25 @@ namespace riemannFit { // fast fit using Map4d = Eigen::Map >; + template //a compile-time bounded for loop + constexpr void rolling_fits(F &&f) { + if constexpr (Start < End) { + f(std::integral_constant()); + rolling_fits(f); + } + } + } // namespace riemannFit +template class HelixFitOnGPU { public: - using HitsView = TrackingRecHit2DSOAView; + using HitsView = TrackingRecHit2DSOAViewT; - using Tuples = pixelTrack::HitContainer; - using OutputSoA = pixelTrack::TrackSoA; + using Tuples = 
pixelTrack::HitContainerT; + using OutputSoA = pixelTrack::TrackSoAT; - using TupleMultiplicity = caConstants::TupleMultiplicity; + using TupleMultiplicity = caStructures::TupleMultiplicityT; explicit HelixFitOnGPU(float bf, bool fitNas4) : bField_(bf), fitNas4_(fitNas4) {} ~HelixFitOnGPU() { deallocateOnGPU(); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index 13d665b597b13..e4a7de6adaf4c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,6 +1,9 @@ #include "RiemannFitOnGPU.h" -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { +template +void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples) { assert(tuples_); // Fit internals @@ -16,98 +19,101 @@ void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernel_FastFit<3>( + kernel_FastFit<3, TrackerTraits>( tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernel_CircleFit<3>(tupleMultiplicity_, - 3, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_CircleFit<3, TrackerTraits>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernel_LineFit<3>(tupleMultiplicity_, - 3, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_LineFit<3, TrackerTraits>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); // quads - kernel_FastFit<4>( + kernel_FastFit<4, TrackerTraits>( tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernel_CircleFit<4>(tupleMultiplicity_, - 4, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_CircleFit<4, TrackerTraits>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernel_LineFit<4>(tupleMultiplicity_, - 4, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_LineFit<4, TrackerTraits>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); if (fitNas4_) { // penta - kernel_FastFit<4>( + kernel_FastFit<4, TrackerTraits>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernel_CircleFit<4>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_CircleFit<4, TrackerTraits>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernel_LineFit<4>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, 
- offset); + kernel_LineFit<4, TrackerTraits>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); } else { // penta all 5 - kernel_FastFit<5>( + kernel_FastFit<5, TrackerTraits>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernel_CircleFit<5>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_CircleFit<5, TrackerTraits>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernel_LineFit<5>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_LineFit<5, TrackerTraits>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); } } } + +template class HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index d8530bac964c1..3d6b2d570077e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -1,10 +1,11 @@ #include "RiemannFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, - uint32_t nhits, - uint32_t maxNumberOfTuples, - cudaStream_t stream) { +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { assert(tuples_); auto blockSize = 64; @@ -23,109 +24,112 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernel_FastFit<3><<>>( + kernel_FastFit<3, TrackerTraits><<>>( tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<3><<>>(tupleMultiplicity_, - 3, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<3, TrackerTraits><<>>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<3><<>>(tupleMultiplicity_, - 3, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<3, TrackerTraits><<>>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); // quads - kernel_FastFit<4><<>>( + kernel_FastFit<4, TrackerTraits><<>>( tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<4><<>>(tupleMultiplicity_, - 4, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4, TrackerTraits><<>>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + 
fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<4><<>>(tupleMultiplicity_, - 4, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4, TrackerTraits><<>>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); if (fitNas4_) { // penta - kernel_FastFit<4><<>>( + kernel_FastFit<4, TrackerTraits><<>>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<4><<>>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4, TrackerTraits><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<4><<>>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4, TrackerTraits><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernel_FastFit<5><<>>( + kernel_FastFit<5, TrackerTraits><<>>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<5><<>>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<5, TrackerTraits><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<5><<>>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<5, TrackerTraits><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } } } + +template class HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 926002d674b83..18dd205cd13c3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -14,15 +14,20 @@ #include "HelixFitOnGPU.h" -using HitsOnGPU = TrackingRecHit2DSOAView; -using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; - -template -__global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, - caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, +template +using HitsOnGPU = TrackingRecHit2DSOAViewT; +template +using Tuples = pixelTrack::HitContainerT; +template +using OutputSoA = pixelTrack::TrackSoAT; +template +using TupleMultiplicity = caStructures::TupleMultiplicityT; + +template +__global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, + TupleMultiplicity const 
*__restrict__ tupleMultiplicity, uint32_t nHits, - HitsOnGPU const *__restrict__ hhp, + HitsOnGPU const *__restrict__ hhp, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -51,7 +56,7 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - assert(tkid < foundNtuplets->nOnes()); + assert(int(tkid) < foundNtuplets->nOnes()); assert(foundNtuplets->size(tkid) == nHits); @@ -83,8 +88,8 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, } } -template -__global__ void kernel_CircleFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, +template +__global__ void kernel_CircleFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double bField, double *__restrict__ phits, @@ -124,11 +129,11 @@ __global__ void kernel_CircleFit(caConstants::TupleMultiplicity const *__restric } } -template -__global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, +template +__global__ void kernel_LineFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double bField, - OutputSoA *results, + OutputSoA *results, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 5f3866af0b3d3..d4b3282574ec3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -8,21 +8,35 @@ #include #include "DataFormats/Math/interface/approx_atan2.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "GPUCACell.h" +#include "CAStructures.h" namespace gpuPixelDoublets { - __global__ void fishbone(GPUCACell::Hits const* __restrict__ hhp, - GPUCACell* cells, + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using Hits = typename GPUCACellT::Hits; + + template + __global__ void fishbone(Hits const* __restrict__ hhp, + GPUCACellT* cells, uint32_t const* __restrict__ nCells, - GPUCACell::OuterHitOfCell const isOuterHitOfCellWrap, + OuterHitOfCell const isOuterHitOfCellWrap, int32_t nHits, bool checkTrack) { - constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; + constexpr auto maxCellsPerHit = GPUCACellT::maxCellsPerHit; auto const& hh = *hhp; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 57dbf822c88d3..deed54ca02b5b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -7,78 +7,37 @@ namespace gpuPixelDoublets { - constexpr int nPairsForQuadruplets = 13; // quadruplets require hits in all layers - constexpr int nPairsForTriplets = nPairsForQuadruplets + 2; // include barrel "jumping" layer pairs - constexpr int nPairs = nPairsForTriplets + 4; // include 
forward "jumping" layer pairs - static_assert(nPairs <= caConstants::maxNumberOfLayerPairs); - - // start constants - // clang-format off - - CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = { - 0, 1, 0, 4, 0, 7, // BPIX1 (3) - 1, 2, 1, 4, 1, 7, // BPIX2 (6) - 4, 5, 7, 8, // FPIX1 (8) - 2, 3, 2, 4, 2, 7, 5, 6, 8, 9, // BPIX3 & FPIX2 (13) - 0, 2, 1, 3, // Jumping Barrel (15) - 0, 5, 0, 8, // Jumping Forward (BPIX1,FPIX2) - 4, 6, 7, 9 // Jumping Forward (19) - }; - - constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); - constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); - constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); - - CONSTANT_VAR const int16_t phicuts[nPairs]{phi0p05, - phi0p07, - phi0p07, - phi0p05, - phi0p06, - phi0p06, - phi0p05, - phi0p05, - phi0p06, - phi0p06, - phi0p06, - phi0p05, - phi0p05, - phi0p05, - phi0p05, - phi0p05, - phi0p05, - phi0p05, - phi0p05}; - // phi0p07, phi0p07, phi0p06,phi0p06, phi0p06,phi0p06}; // relaxed cuts - - CONSTANT_VAR float const minz[nPairs] = { - -20., 0., -30., -22., 10., -30., -70., -70., -22., 15., -30, -70., -70., -20., -22., 0, -30., -70., -70.}; - CONSTANT_VAR float const maxz[nPairs] = { - 20., 30., 0., 22., 30., -10., 70., 70., 22., 30., -15., 70., 70., 20., 22., 30., 0., 70., 70.}; - CONSTANT_VAR float const maxr[nPairs] = { - 20., 9., 9., 20., 7., 7., 5., 5., 20., 6., 6., 5., 5., 20., 20., 9., 9., 9., 9.}; + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using Hits = typename GPUCACellT::Hits; // end constants // clang-format on - using CellNeighbors = caConstants::CellNeighbors; - using CellTracks = caConstants::CellTracks; - using CellNeighborsVector = caConstants::CellNeighborsVector; - using CellTracksVector = caConstants::CellTracksVector; - - __global__ void initDoublets(GPUCACell::OuterHitOfCell isOuterHitOfCell, + template + __global__ void initDoublets(OuterHitOfCell isOuterHitOfCell, int nHits, - CellNeighborsVector* cellNeighbors, - CellNeighbors* cellNeighborsContainer, - CellTracksVector* cellTracks, - CellTracks* cellTracksContainer) { + CellNeighborsVector* cellNeighbors, + CellNeighbors* cellNeighborsContainer, + CellTracksVector* cellTracks, + CellTracks* cellTracksContainer) { assert(isOuterHitOfCell.container); int first = blockIdx.x * blockDim.x + threadIdx.x; for (int i = first; i < nHits - isOuterHitOfCell.offset; i += gridDim.x * blockDim.x) isOuterHitOfCell.container[i].reset(); if (0 == first) { - cellNeighbors->construct(caConstants::maxNumOfActiveDoublets, cellNeighborsContainer); - cellTracks->construct(caConstants::maxNumOfActiveDoublets, cellTracksContainer); + cellNeighbors->construct(TrackerTraits::maxNumOfActiveDoublets, cellNeighborsContainer); + cellTracks->construct(TrackerTraits::maxNumOfActiveDoublets, cellTracksContainer); auto i = cellNeighbors->extend(); assert(0 == i); (*cellNeighbors)[0].reset(); @@ -91,40 +50,23 @@ namespace gpuPixelDoublets { constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; + template __global__ #ifdef __CUDACC__ __launch_bounds__(getDoubletsFromHistoMaxBlockSize, getDoubletsFromHistoMinBlocksPerMP) #endif - void 
getDoubletsFromHisto(GPUCACell* cells, + void getDoubletsFromHisto(GPUCACellT* cells, uint32_t* nCells, - CellNeighborsVector* cellNeighbors, - CellTracksVector* cellTracks, - TrackingRecHit2DSOAView const* __restrict__ hhp, - GPUCACell::OuterHitOfCell isOuterHitOfCell, + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + TrackingRecHit2DSOAViewT const* __restrict__ hhp, + OuterHitOfCell isOuterHitOfCell, int nActualPairs, - bool ideal_cond, - bool doClusterCut, - bool doZ0Cut, - bool doPtCut, - uint32_t maxNumOfDoublets) { + CellCutsT cuts) { auto const& __restrict__ hh = *hhp; - doubletsFromHisto(layerPairs, - nActualPairs, - cells, - nCells, - cellNeighbors, - cellTracks, - hh, - isOuterHitOfCell, - phicuts, - minz, - maxz, - maxr, - ideal_cond, - doClusterCut, - doZ0Cut, - doPtCut, - maxNumOfDoublets); + + doubletsFromHisto( + nActualPairs, cells, nCells, cellNeighbors, cellTracks, hh, isOuterHitOfCell, cuts); } } // namespace gpuPixelDoublets diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 80316d24c748b..0f3d786a8e476 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -12,45 +12,119 @@ #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "CAConstants.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "CAStructures.h" #include "GPUCACell.h" +//#define GPU_DEBUG +//#define NTUPLE_DEBUG + namespace gpuPixelDoublets { - using CellNeighbors = caConstants::CellNeighbors; - using CellTracks = caConstants::CellTracks; - using CellNeighborsVector = caConstants::CellNeighborsVector; - using CellTracksVector = caConstants::CellTracksVector; + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using Hits = typename GPUCACellT::Hits; + + template + struct CellCutsT { + using H = Hits; + using T = TrackerTraits; + + const uint32_t maxNumberOfDoublets_; + const bool doClusterCut_; + const bool doZ0Cut_; + const bool doPtCut_; + const bool idealConditions_; //this is actually not used by phase2 + + __device__ __forceinline__ bool zSizeCut(H const& hh, int i, int o) const { + auto mi = hh.detectorIndex(i); + + bool innerB1 = mi < T::last_bpix1_detIndex; + bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; + auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + + if (mes < 0) + return false; + + auto mo = hh.detectorIndex(o); + auto so = hh.clusterSizeY(o); + + auto dz = hh.zGlobal(i) - hh.zGlobal(o); + auto dr = hh.rGlobal(i) - hh.rGlobal(o); + + auto innerBarrel = mi < T::last_barrel_detIndex; + auto onlyBarrel = mo < T::last_barrel_detIndex; + + if (not innerBarrel and not onlyBarrel) + return false; + auto dy = innerB1 ? T::maxDYsize12 : T::maxDYsize; + + return onlyBarrel ? 
so > 0 && std::abs(so - mes) > dy + : innerBarrel && std::abs(mes - int(std::abs(dz / dr) * T::dzdrFact + 0.5f)) > T::maxDYPred; + } + + __device__ __forceinline__ bool clusterCut(H const& hh, int i, int o) const { + auto mo = hh.detectorIndex(o); + bool outerFwd = (mo >= T::last_barrel_detIndex); + + if (!outerFwd) + return false; + + auto mi = hh.detectorIndex(i); + bool innerB1orB2 = mi < T::last_bpix2_detIndex; - __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs, - uint32_t nPairs, - GPUCACell* cells, + if (!innerB1orB2) + return false; + + bool innerB1 = mi < T::last_bpix1_detIndex; + bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; + auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + + if (innerB1 && outerFwd) // B1 and F1 + if (mes > 0 && mes < T::minYsizeB1) + return true; // only long cluster (5*8) + bool innerB2 = (mi >= T::last_bpix1_detIndex) && (mi < T::last_bpix2_detIndex); //FIXME number + if (innerB2 && outerFwd) // B2 and F1 + if (mes > 0 && mes < T::minYsizeB2) + return true; + + return false; + } + }; + + // template + // struct CellCutsT : public CellCutsCommon {}; + // + // template <> + // struct CellCutsT : public CellCutsCommon {}; + + template + __device__ __forceinline__ void doubletsFromHisto(uint32_t nPairs, + GPUCACellT* cells, uint32_t* nCells, - CellNeighborsVector* cellNeighbors, - CellTracksVector* cellTracks, - TrackingRecHit2DSOAView const& __restrict__ hh, - GPUCACell::OuterHitOfCell isOuterHitOfCell, - int16_t const* __restrict__ phicuts, - float const* __restrict__ minz, - float const* __restrict__ maxz, - float const* __restrict__ maxr, - bool ideal_cond, - bool doClusterCut, - bool doZ0Cut, - bool doPtCut, - uint32_t maxNumOfDoublets) { + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + TrackingRecHit2DSOAViewT const& __restrict__ hh, + OuterHitOfCell isOuterHitOfCell, + CellCutsT const& cuts) { // ysize cuts (z in the barrel) times 8 // these are used if doClusterCut is true - constexpr int minYsizeB1 = 36; - constexpr int minYsizeB2 = 28; - constexpr int maxDYsize12 = 28; - constexpr int maxDYsize = 20; - constexpr int maxDYPred = 20; - constexpr float dzdrFact = 8 * 0.0285 / 0.015; // from dz/dr to "DY" - bool isOuterLadder = ideal_cond; + const bool doClusterCut = cuts.doClusterCut_; + const bool doZ0Cut = cuts.doZ0Cut_; + const bool doPtCut = cuts.doPtCut_; + const uint32_t maxNumOfDoublets = cuts.maxNumberOfDoublets_; - using PhiBinner = TrackingRecHit2DSOAView::PhiBinner; + using PhiBinner = typename TrackingRecHit2DSOAViewT::PhiBinner; auto const& __restrict__ phiBinner = hh.phiBinner(); uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); @@ -61,14 +135,13 @@ namespace gpuPixelDoublets { // nPairsMax to be optimized later (originally was 64). // If it should be much bigger, consider using a block-wide parallel prefix scan, // e.g. 
see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html - const int nPairsMax = caConstants::maxNumberOfLayerPairs; - assert(nPairs <= nPairsMax); - __shared__ uint32_t innerLayerCumulativeSize[nPairsMax]; + + __shared__ uint32_t innerLayerCumulativeSize[TrackerTraits::nPairs]; __shared__ uint32_t ntot; if (threadIdx.y == 0 && threadIdx.x == 0) { - innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); + innerLayerCumulativeSize[0] = layerSize(TrackerTraits::layerPairs[0]); for (uint32_t i = 1; i < nPairs; ++i) { - innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(layerPairs[2 * i]); + innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(TrackerTraits::layerPairs[2 * i]); } ntot = innerLayerCumulativeSize[nPairs - 1]; } @@ -80,6 +153,7 @@ namespace gpuPixelDoublets { auto stride = blockDim.x; uint32_t pairLayerId = 0; // cannot go backward + for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y) { while (j >= innerLayerCumulativeSize[pairLayerId++]) ; @@ -89,12 +163,12 @@ namespace gpuPixelDoublets { assert(j < innerLayerCumulativeSize[pairLayerId]); assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]); - uint8_t inner = layerPairs[2 * pairLayerId]; - uint8_t outer = layerPairs[2 * pairLayerId + 1]; + uint8_t inner = TrackerTraits::layerPairs[2 * pairLayerId]; + uint8_t outer = TrackerTraits::layerPairs[2 * pairLayerId + 1]; assert(outer > inner); auto hoff = PhiBinner::histOff(outer); - + auto fo = __ldg(phiBinner.begin(hoff)); //first hit on outer for the cluster cut auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1]; i += offsets[inner]; @@ -104,8 +178,8 @@ namespace gpuPixelDoublets { assert(i < offsets[inner + 1]); // found hit corresponding to our cuda thread, now do the job - auto mi = hh.detectorIndex(i); - if (mi > gpuClustering::maxNumModules) + + if (hh.detectorIndex(i) > gpuClustering::maxNumModules) continue; // invalid /* maybe clever, not effective when zoCut is on @@ -116,32 +190,18 @@ namespace gpuPixelDoublets { auto mez = hh.zGlobal(i); - if (mez < minz[pairLayerId] || mez > maxz[pairLayerId]) + if (mez < TrackerTraits::minz[pairLayerId] || mez > TrackerTraits::maxz[pairLayerId]) + continue; + + if (doClusterCut && cuts.clusterCut(hh, i, fo)) continue; - int16_t mes = -1; // make compiler happy - if (doClusterCut) { - // if ideal treat inner ladder as outer - if (inner == 0) - assert(mi < 96); - isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... - - // in any case we always test mes>0 ... - mes = inner > 0 || isOuterLadder ? 
hh.clusterSizeY(i) : -1; - - if (inner == 0 && outer > 3) // B1 and F1 - if (mes > 0 && mes < minYsizeB1) - continue; // only long cluster (5*8) - if (inner == 1 && outer > 3) // B2 and F1 - if (mes > 0 && mes < minYsizeB2) - continue; - } auto mep = hh.iphi(i); auto mer = hh.rGlobal(i); // all cuts: true if fails - constexpr float z0cut = 12.f; // cm - constexpr float hardPtCut = 0.5f; // GeV + constexpr float z0cut = TrackerTraits::z0Cut; // cm + constexpr float hardPtCut = TrackerTraits::doubletHardPt; // GeV // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) constexpr float minRadius = hardPtCut * 87.78f; constexpr float minRadius2T4 = 4.f * minRadius * minRadius; @@ -156,24 +216,10 @@ namespace gpuPixelDoublets { auto zo = hh.zGlobal(j); auto ro = hh.rGlobal(j); auto dr = ro - mer; - return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; - }; - - auto zsizeCut = [&](int j) { - auto onlyBarrel = outer < 4; - auto so = hh.clusterSizeY(j); - auto dy = inner == 0 ? maxDYsize12 : maxDYsize; - // in the barrel cut on difference in size - // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well) - // FIXME move pred cut to z0cutoff to optmize loading of and computaiton ... - auto zo = hh.zGlobal(j); - auto ro = hh.rGlobal(j); - return onlyBarrel ? mes > 0 && so > 0 && std::abs(so - mes) > dy - : (inner < 4) && mes > 0 && - std::abs(mes - int(std::abs((mez - zo) / (mer - ro)) * dzdrFact + 0.5f)) > maxDYPred; + return dr > TrackerTraits::maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; }; - auto iphicut = phicuts[pairLayerId]; + auto iphicut = TrackerTraits::phicuts[pairLayerId]; auto kl = PhiBinner::bin(int16_t(mep - iphicut)); auto kh = PhiBinner::bin(int16_t(mep + iphicut)); @@ -200,18 +246,18 @@ namespace gpuPixelDoublets { assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); auto mo = hh.detectorIndex(oi); + if (mo > gpuClustering::maxNumModules) continue; // invalid if (doZ0Cut && z0cutoff(oi)) continue; - auto mop = hh.iphi(oi); uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); if (idphi > iphicut) continue; - if (doClusterCut && zsizeCut(oi)) + if (doClusterCut && cuts.zSizeCut(hh, i, oi)) continue; if (doPtCut && ptcut(oi, idphi)) continue; @@ -231,9 +277,19 @@ namespace gpuPixelDoublets { #endif } } +// #endif #ifdef GPU_DEBUG if (tooMany > 0) - printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); + printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f\n", + i, + inner, + outer, + nmin, + tot, + tooMany, + iphicut, + TrackerTraits::minz[pairLayerId], + TrackerTraits::maxz[pairLayerId]); #endif } // loop in block... 
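// [Editor's note -- illustration, not part of the patch] With this refactoring the
// cut configuration that doubletsFromHisto used to receive as four booleans plus
// hard-coded constants and CONSTANT_VAR arrays now arrives as a single
// CellCutsT<TrackerTraits> value, while the per-layer-pair tables (layerPairs,
// phicuts, minz, maxz, maxr) are read from the topology traits. A hypothetical
// host-side construction, with the fields in the order declared above and
// illustrative values only:
//
//   CellCutsT<pixelTopology::Phase1> cuts{/*maxNumberOfDoublets_=*/512 * 1024,
//                                         /*doClusterCut_=*/true,
//                                         /*doZ0Cut_=*/true,
//                                         /*doPtCut_=*/true,
//                                         /*idealConditions_=*/false};
//
// which is then passed by value to the getDoubletsFromHisto kernel declared in
// gpuPixelDoublets.h.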
} diff --git a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp index 70544a2647ee7..3c6be161a346f 100644 --- a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp +++ b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp @@ -1,5 +1,5 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h" - +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include #include @@ -9,17 +9,30 @@ void print() { } int main() { - using namespace caConstants; + using namespace pixelTopology; + using namespace caStructures; + //for Phase-I + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + + print>(); + + //for Phase-II - print(); - print(); - print(); - print(); - print(); - print(); - print(); + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); - print(); + print>(); return 0; } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 34b0ed9e29fc1..024c95398b988 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -1,6 +1,7 @@ #include #include "CUDADataFormats/Common/interface/Product.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" @@ -21,10 +22,14 @@ #undef PIXVERTEX_DEBUG_PRODUCE -class PixelVertexProducerCUDA : public edm::global::EDProducer<> { +template +class PixelVertexProducerCUDAT : public edm::global::EDProducer<> { + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using GPUAlgo = gpuVertexFinder::Producer; + public: - explicit PixelVertexProducerCUDA(const edm::ParameterSet& iConfig); - ~PixelVertexProducerCUDA() override = default; + explicit PixelVertexProducerCUDAT(const edm::ParameterSet& iConfig); + ~PixelVertexProducerCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -40,14 +45,15 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { edm::EDGetTokenT tokenCPUTrack_; edm::EDPutTokenT tokenCPUVertex_; - const gpuVertexFinder::Producer gpuAlgo_; + const GPUAlgo gpuAlgo_; // Tracking cuts before sending tracks to vertex algo const float ptMin_; const float ptMax_; }; -PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) +template +PixelVertexProducerCUDAT::PixelVertexProducerCUDAT(const edm::ParameterSet& conf) : onGPU_(conf.getParameter("onGPU")), gpuAlgo_(conf.getParameter("oneKernel"), conf.getParameter("useDensity"), @@ -65,12 +71,13 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) consumes>(conf.getParameter("pixelTrackSrc")); tokenGPUVertex_ = produces(); } else { - tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); tokenCPUVertex_ = produces(); } } -void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void PixelVertexProducerCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; // Only one of these three algos can be used at once. 
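[Editor's note] In the constructor hunk above, the angle-bracketed template arguments of the token calls were lost in this rendering of the patch. A hedged reconstruction of the intended wiring, with the product types inferred from patterns used elsewhere in this diff (the exact GPU vertex product typedef is an assumption):

  // sketch only -- token types inferred, not copied verbatim from the patch
  if (onGPU_) {
    tokenGPUTrack_ =
        consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
    tokenGPUVertex_ = produces<cms::cuda::Product<ZVertexHeterogeneous>>();
  } else {
    tokenCPUTrack_ = consumes<PixelTrackHeterogeneous>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
    tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
  }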
@@ -90,13 +97,13 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d desc.add<double>("PtMax", 75.); desc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("pixelTracksCUDA")); - auto label = "pixelVerticesCUDA"; - descriptions.add(label, desc); + descriptions.addWithDefaultLabel(desc); } -void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, - edm::Event& iEvent, - const edm::EventSetup& iSetup) const { +template <typename TrackerTraits> +void PixelVertexProducerCUDAT<TrackerTraits>::produceOnGPU(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); @@ -108,9 +115,10 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_, ptMax_)); } -void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, - edm::Event& iEvent, - const edm::EventSetup& iSetup) const { +template <typename TrackerTraits> +void PixelVertexProducerCUDAT<TrackerTraits>::produceOnCPU(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { auto const* tracks = iEvent.get(tokenCPUTrack_).get(); assert(tracks); @@ -133,7 +141,10 @@ void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_, ptMax_)); } -void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +template <typename TrackerTraits> +void PixelVertexProducerCUDAT<TrackerTraits>::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { if (onGPU_) { produceOnGPU(streamID, iEvent, iSetup); } else { @@ -141,4 +152,11 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent } } +using PixelVertexProducerCUDA = PixelVertexProducerCUDAT<pixelTopology::Phase1>; DEFINE_FWK_MODULE(PixelVertexProducerCUDA); + +using PixelVertexProducerCUDAPhase1 = PixelVertexProducerCUDAT<pixelTopology::Phase1>; +DEFINE_FWK_MODULE(PixelVertexProducerCUDAPhase1); + +using PixelVertexProducerCUDAPhase2 = PixelVertexProducerCUDAT<pixelTopology::Phase2>; +DEFINE_FWK_MODULE(PixelVertexProducerCUDAPhase2); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc index 029c619b42e58..8cceeaa42cc10 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -25,7 +25,7 @@ class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { public: - using IndToEdm = std::vector<uint16_t>; + using IndToEdm = std::vector<uint32_t>; explicit PixelVertexProducerFromSoA(const edm::ParameterSet &iConfig); ~PixelVertexProducerFromSoA() override = default; @@ -90,7 +90,7 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv << " from " << indToEdm.size() << " tracks" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - std::set<uint16_t> uind; // for verifing index consistency + std::set<uint32_t> uind; // for verifying index consistency for (int j = nv - 1; j >= 0; --j) { auto i = soa.sortInd[j]; // on gpu sorted in ascending order....
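// [Editor's note] IndToEdm and the consistency-check set above were widened from
// uint16_t to uint32_t (and WorkSpace::itrk in gpuVertexFinder.h below follows suit),
// presumably because a track index may no longer fit in 16 bits for the larger
// Phase-2 topology. The loop that follows walks sortInd backwards: on the GPU the
// vertices are sorted in ascending order (apparently of summed pT^2), so iterating
// from the back visits the hardest vertex first.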
assert(i < nv); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 20b007d2d029f..74bcd26f8a79c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -18,7 +18,9 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this constexpr float maxChi2ForSplit = 9.f; - __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { + template + __global__ void loadTracks( + pixelTrack::TrackSoAT const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { assert(ptracks); assert(soa); auto const& tracks = *ptracks; @@ -26,6 +28,7 @@ namespace gpuVertexFinder { auto const* quality = tracks.qualityData(); auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { auto nHits = tracks.nHits(idx); assert(nHits >= 3); @@ -94,14 +97,22 @@ namespace gpuVertexFinder { } #endif + template #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const { + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, + pixelTrack::TrackSoAT const* tksoa, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); #else - ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const { + + ZVertexHeterogeneous Producer::make(pixelTrack::TrackSoAT const* tksoa, + float ptMin, + float ptMax) const { + #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE @@ -120,12 +131,12 @@ namespace gpuVertexFinder { #ifdef __CUDACC__ init<<<1, 1, 0, stream>>>(soa, ws_d.get()); auto blockSize = 128; - auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); + auto numberOfBlocks = (pixelTrack::TrackSoAT::stride() + blockSize - 1) / blockSize; + loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); + loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); #endif #ifdef __CUDACC__ @@ -186,4 +197,7 @@ namespace gpuVertexFinder { return vertices; } + template class Producer; + template class Producer; + } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 2b6a8107d927f..6128939f6eb87 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -10,15 +10,13 @@ namespace gpuVertexFinder { using ZVertices = ZVertexSoA; - using TkSoA = pixelTrack::TrackSoA; - // workspace used in the vertex reco algos struct WorkSpace { static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; uint32_t ntrks; // number of "selected tracks" - uint16_t itrk[MAXTRACKS]; // index of original track + uint32_t itrk[MAXTRACKS]; // index of original track float zt[MAXTRACKS]; // input track z at bs float ezt2[MAXTRACKS]; // input 
error^2 on the above float ptt2[MAXTRACKS]; // input pt^2 on the above @@ -38,11 +36,12 @@ namespace gpuVertexFinder { pws->init(); } + template class Producer { public: using ZVertices = ZVertexSoA; using WorkSpace = gpuVertexFinder::WorkSpace; - using TkSoA = pixelTrack::TrackSoA; + using TkSoA = pixelTrack::TrackSoAT; Producer(bool oneKernel, bool useDensity, diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc index 34e04b0f7aedb..c11b53538c5b0 100644 --- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc +++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc @@ -145,6 +145,9 @@ struct L2TauNNProducerCacheData { }; class L2TauNNProducer : public edm::stream::EDProducer> { + using TrackSoA = pixelTrack::TrackSoAT; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + public: struct caloRecHitCollections { const HBHERecHitCollection* hbhe; @@ -179,16 +182,16 @@ class L2TauNNProducer : public edm::stream::EDProducer& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, const ZVertexSoA& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi); void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood); std::pair impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi); @@ -293,7 +296,7 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd geometryToken_(esConsumes()), bFieldToken_(esConsumes()), pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), - pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), + pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), beamSpotToken_(consumes(cfg.getParameter("BeamSpot"))), maxVtx_(cfg.getParameter("maxVtx")), fractionSumPt2_(cfg.getParameter("fractionSumPt2")), @@ -570,7 +573,7 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix, } void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood) { const auto maxTracks = patatracks_tsoa.stride(); @@ -617,7 +620,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, } std::pair L2TauNNProducer::impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi) { @@ -650,7 +653,7 @@ std::pair L2TauNNProducer::impactParameter(int it, void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const std::vector& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, const ZVertexSoA& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi) { diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index 0e5823fc46c46..9023640f62d5a 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -33,10 +33,11 @@ /* produces seeds directly from cuda produced tuples */ -class SeedProducerFromSoA : public edm::global::EDProducer<> { +template +class 
SeedProducerFromSoAT : public edm::global::EDProducer<> { public: - explicit SeedProducerFromSoA(const edm::ParameterSet& iConfig); - ~SeedProducerFromSoA() override = default; + explicit SeedProducerFromSoAT(const edm::ParameterSet& iConfig); + ~SeedProducerFromSoAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -45,7 +46,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { // Event data tokens const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_; - const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_; + const edm::EDGetTokenT<PixelTrackHeterogeneousT<TrackerTraits>> tokenTrack_; // Event setup tokens const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> idealMagneticFieldToken_; const edm::ESGetToken<TrackerGeometry, TrackerDigiGeometryRecord> trackerDigiGeometryToken_; @@ -53,9 +54,10 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { int32_t minNumberOfHits_; }; -SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig) +template <typename TrackerTraits> +SeedProducerFromSoAT<TrackerTraits>::SeedProducerFromSoAT(const edm::ParameterSet& iConfig) : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))), - tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("src"))), + tokenTrack_(consumes<PixelTrackHeterogeneousT<TrackerTraits>>(iConfig.getParameter<edm::InputTag>("src"))), idealMagneticFieldToken_(esConsumes()), trackerDigiGeometryToken_(esConsumes()), trackerPropagatorToken_(esConsumes(edm::ESInputTag("PropagatorWithMaterial"))), @@ -65,7 +67,8 @@ SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig) produces<TrajectorySeedCollection>(); } -void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template <typename TrackerTraits> +void SeedProducerFromSoAT<TrackerTraits>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot")); desc.add<edm::InputTag>("src", edm::InputTag("pixelTrackSoA")); @@ -74,7 +77,10 @@ void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descr descriptions.addWithDefaultLabel(desc); } -void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +template <typename TrackerTraits> +void SeedProducerFromSoAT<TrackerTraits>::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { // std::cout << "Converting gpu helix to trajectory seed" << std::endl; auto result = std::make_unique<TrajectorySeedCollection>(); @@ -167,4 +173,11 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co iEvent.put(std::move(result)); } +using SeedProducerFromSoA = SeedProducerFromSoAT<pixelTopology::Phase1>; DEFINE_FWK_MODULE(SeedProducerFromSoA); + +using SeedProducerFromSoAPhase1 = SeedProducerFromSoAT<pixelTopology::Phase1>; +DEFINE_FWK_MODULE(SeedProducerFromSoAPhase1); + +using SeedProducerFromSoAPhase2 = SeedProducerFromSoAT<pixelTopology::Phase2>; +DEFINE_FWK_MODULE(SeedProducerFromSoAPhase2);
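[Editor's note] Taken together, the patch routes every hard-coded capacity and geometry constant of the former caConstants and phase1PixelTopology headers through a single compile-time customization point, the TrackerTraits class (pixelTopology::Phase1 or pixelTopology::Phase2 from Geometry/CommonTopologies/interface/SimplePixelTopology.h). A minimal sketch of the shape such a traits class must expose, assembled from the members this diff actually reads; the names follow the diff, but the values are illustrative placeholders rather than the real Phase-1 numbers:

  // Illustrative sketch only -- see SimplePixelTopology.h for the real definitions.
  #include <cstdint>

  namespace pixelTopology {
    struct ExampleTraits {
      // index types consumed by caStructures and GPUCACellT
      using hindex_type = uint32_t;  // hit index
      using cindex_type = uint32_t;  // cell index
      using tindex_type = uint32_t;  // tuple (track) index
      // capacities read by TrackSoAHeterogeneousT, GPUCACellT, HelixFitOnGPU, ...
      static constexpr int32_t maxNumberOfTuples = 32 * 1024;
      static constexpr int32_t maxHitsOnTrack = 10;
      static constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples;
      static constexpr uint32_t maxCellsPerHit = 256;
      static constexpr uint32_t maxNumOfActiveDoublets = 512 * 1024;
      static constexpr int maxDepth = 6;  // bounds find_ntuplets' compile-time recursion
      // detector-unit boundaries used by the cell and doublet cuts
      static constexpr uint32_t last_bpix1_detIndex = 96;
      static constexpr uint32_t last_bpix2_detIndex = 320;
      static constexpr uint32_t last_barrel_detIndex = 1184;
      // per-layer-pair tables, all of length nPairs (layerPairs has 2 * nPairs entries):
      static constexpr int nPairs = 19;
      // static constexpr uint8_t layerPairs[2 * nPairs] = {...};
      // static constexpr int16_t phicuts[nPairs] = {...};
      // static constexpr float minz[nPairs] = {...}, maxz[nPairs] = {...}, maxr[nPairs] = {...};
      // doublet selection cuts
      static constexpr float z0Cut = 12.f;          // cm
      static constexpr float doubletHardPt = 0.5f;  // GeV
      // cluster-size cut parameters: minYsizeB1, minYsizeB2, maxDYsize12, maxDYsize,
      // maxDYPred, dzdrFact (consumed by CellCutsT in gpuPixelDoubletsAlgos.h)
    };
  }  // namespace pixelTopology

With this shape in place, any class in the patch can be reused for a new detector layout by supplying one more traits struct and adding the matching explicit template instantiations, as done here for pixelTopology::Phase1 and pixelTopology::Phase2.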