diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 907b7647a3452..0c3e8bd37cb6a 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -5,6 +5,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
@@ -21,15 +22,15 @@ class HeterogeneousSoA {
   explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
   explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
+  explicit HeterogeneousSoA(cudautils::cpu::unique_ptr<T> &&p) : cm_ptr(std::move(p)) {}

-  auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
+  auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

   auto const &operator*() const { return *get(); }

   auto const *operator->() const { return get(); }

-  auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
+  auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

   auto &operator*() { return *get(); }
@@ -47,12 +48,15 @@ class HeterogeneousSoA {
   // a union won't do it, a variant will not be more efficient
   cudautils::device::unique_ptr<T> dm_ptr;  //!
   cudautils::host::unique_ptr<T> hm_ptr;    //!
-  std::unique_ptr<T> std_ptr;  //!
+  cudautils::cpu::unique_ptr<T> cm_ptr;  //!
};

namespace cudaCompat {

  struct GPUTraits {
+    static constexpr const char* name = "GPU";
+    static constexpr bool runOnDevice = true;
+
    template <typename T>
    using unique_ptr = cudautils::device::unique_ptr<T>;
@@ -83,6 +87,9 @@ namespace cudaCompat {
  };

  struct HostTraits {
+    static constexpr const char* name = "HOST";
+    static constexpr bool runOnDevice = false;
+
    template <typename T>
    using unique_ptr = cudautils::host::unique_ptr<T>;
@@ -108,32 +115,45 @@ namespace cudaCompat {
  };

  struct CPUTraits {
+    static constexpr const char* name = "CPU";
+    static constexpr bool runOnDevice = false;
+
+    template <typename T>
+    using unique_ptr = cudautils::cpu::unique_ptr<T>;
+
+    template <typename T>
+    static auto make_unique() {
+      return cudautils::make_cpu_unique<T>(cudaStreamDefault);
+    }
+
    template <typename T>
-    using unique_ptr = std::unique_ptr<T>;
+    static auto make_unique(size_t size) {
+      return cudautils::make_cpu_unique<T>(size, cudaStreamDefault);
+    }

    template <typename T>
-    static auto make_unique(cudaStream_t) {
-      return std::make_unique<T>();
+    static auto make_unique(cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(stream);
    }

    template <typename T>
-    static auto make_unique(size_t size, cudaStream_t) {
-      return std::make_unique<T>(size);
+    static auto make_unique(size_t size, cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(size, stream);
    }

    template <typename T>
-    static auto make_host_unique(cudaStream_t) {
-      return std::make_unique<T>();
+    static auto make_host_unique(cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(stream);
    }

    template <typename T>
-    static auto make_device_unique(cudaStream_t) {
-      return std::make_unique<T>();
+    static auto make_device_unique(cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(stream);
    }

    template <typename T>
-    static auto make_device_unique(size_t size, cudaStream_t) {
-      return std::make_unique<T>(size);
+    static auto make_device_unique(size_t size, cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(size, stream);
    }
  };
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
index 17ad98ba403a4..0a706b6c57927 100644
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ b/CUDADataFormats/Common/interface/HostProduct.h
@@ -2,6 +2,7 @@
 #define CUDADataFormatsCommonHostProduct_H

 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

 // a heterogeneous unique pointer...
 template <typename T>
@@ -13,9 +14,9 @@ class HostProduct {
   HostProduct& operator=(HostProduct&&) = default;

   explicit HostProduct(cudautils::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
-  explicit HostProduct(std::unique_ptr<T>&& p) : std_ptr(std::move(p)) {}
+  explicit HostProduct(cudautils::cpu::unique_ptr<T>&& p) : cm_ptr(std::move(p)) {}

-  auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
+  auto const* get() const { return hm_ptr ? hm_ptr.get() : cm_ptr.get(); }

   auto const& operator*() const { return *get(); }

@@ -23,7 +24,7 @@ class HostProduct {

 private:
   cudautils::host::unique_ptr<T> hm_ptr;  //!
-  std::unique_ptr<T> std_ptr;  //!
+  cudautils::cpu::unique_ptr<T> cm_ptr;  //!
}; #endif diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index d3650e164d44e..afb7c5aa37833 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -2,7 +2,6 @@ #define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc index e6f223bfec4e3..60d2693fa45b0 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc @@ -1,7 +1,6 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" template <> diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 6fbaced1858dd..5f29ad3b81ea8 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -72,7 +72,7 @@ namespace cudautils { inline void launchFinalize(Histo *__restrict__ h, uint8_t *__restrict__ ws #ifndef __CUDACC__ - = cudaStreamDefault + = nullptr #endif , cudaStream_t stream diff --git a/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h new file mode 100644 index 0000000000000..dab7d15f40f92 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h @@ -0,0 +1,86 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h + +#include +#include + +#include +#include + +namespace cudautils { + namespace cpu { + namespace impl { + // Additional layer of types to distinguish from device:: and host::unique_ptr + class CPUDeleter { + public: + CPUDeleter() = default; + + void operator()(void *ptr) { + ::free(ptr); + } + }; + } // namespace impl + + template + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_cpu_unique_selector { + using non_array = cudautils::cpu::unique_ptr; + }; + template + struct make_cpu_unique_selector { + using unbounded_array = cudautils::cpu::unique_ptr; + }; + template + struct make_cpu_unique_selector { + struct bounded_array {}; + }; + } // namespace impl + } // namespace cpu + + template + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique(cudaStream_t) { + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the cpu memory is not supported"); + void *mem = ::malloc(sizeof(T)); + return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique(size_t n, cudaStream_t) { + using element_type = typename std::remove_extent::type; + 
static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the cpu memory is not supported"); + void *mem = ::malloc(n * sizeof(element_type)); + return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::bounded_array make_cpu_unique(Args &&...) = delete; + + // No check for the trivial constructor, make it clear in the interface + template + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique_uninitialized(cudaStream_t) { + void *mem = ::malloc(sizeof(T)); + return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique_uninitialized(size_t n, cudaStream_t) { + using element_type = typename std::remove_extent::type; + void *mem = ::malloc(n * sizeof(element_type)); + return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::bounded_array make_cpu_unique_uninitialized(Args &&...) = + delete; +} // namespace cudautils + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 32c7d7b481a50..892cb7cf5271a 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -6,6 +6,10 @@ */ #ifndef __CUDACC__ + #define CUDA_KERNELS_ON_CPU +#endif + +#ifdef CUDA_KERNELS_ON_CPU #include #include @@ -86,18 +90,20 @@ namespace cudaCompat { #define __forceinline__ #endif -// make sure function are inlined to avoid multiple definition #ifndef __CUDA_ARCH__ +using namespace cudaCompat; +#endif + +#endif // CUDA_KERNELS_ON_CPU + + +// make sure function are inlined to avoid multiple definition +#ifndef __CUDACC__ #undef __global__ #define __global__ inline __attribute__((always_inline)) #undef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) #endif -#ifndef __CUDA_ARCH__ -using namespace cudaCompat; -#endif - -#endif #endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/launch.h b/HeterogeneousCore/CUDAUtilities/interface/launch.h index f3c8240224c0f..dfa48e33be4c5 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/launch.h +++ b/HeterogeneousCore/CUDAUtilities/interface/launch.h @@ -93,8 +93,12 @@ namespace cudautils { // wrappers for cudaLaunchKernel inline void launch(void (*kernel)(), LaunchParameters config) { +#ifdef CUDA_KERNELS_ON_CPU + kernel(); +#else cudaCheck(cudaLaunchKernel( (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); +#endif } template @@ -104,6 +108,9 @@ namespace cudautils { std::enable_if_t >::value> #endif launch(F* kernel, LaunchParameters config, Args&&... 
args) { +#ifdef CUDA_KERNELS_ON_CPU + kernel(args...); +#else using function_type = detail::kernel_traits; typename function_type::argument_type_tuple args_copy(args...); @@ -113,6 +120,7 @@ namespace cudautils { detail::pointer_setter()(pointers, args_copy); cudaCheck(cudaLaunchKernel( (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); +#endif } // wrappers for cudaLaunchCooperativeKernel diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index 203aac78a165c..819db79d62497 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -3,8 +3,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" #include +#include namespace cudautils { template @@ -15,6 +17,11 @@ namespace cudautils { cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); } + template + inline void memsetAsync(cudautils::cpu::unique_ptr& ptr, T value, cudaStream_t) { + ::memset(ptr.get(), value, sizeof(T)); + } + /** * The type of `value` is `int` because of `cudaMemsetAsync()` takes * it as an `int`. Note that `cudaMemsetAsync()` sets the value of @@ -25,6 +32,12 @@ namespace cudautils { inline void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cudaStream_t stream) { cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream)); } + template + inline void memsetAsync(cudautils::cpu::unique_ptr& ptr, int value, size_t nelements, cudaStream_t) { + ::memset(ptr.get(), value, nelements * sizeof(T)); + } + + } // namespace cudautils #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index a700c0865f0f2..88a12c789ed8f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -7,6 +7,18 @@ + + + + + + + + + + + + @@ -65,7 +77,7 @@ - + diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp new file mode 100644 index 0000000000000..77c824fc833c4 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp @@ -0,0 +1,24 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" + +#include "Launch_t.h" + +#ifdef LaunchInCU + void wrapperInCU(); +#endif + + +int main() { + + requireCUDADevices(); + + printf("in Main\n"); + printEnv(); + + wrapper(); + +#ifdef LaunchInCU + wrapperInCU(); +#endif + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu new file mode 100644 index 0000000000000..90b24fb6fec4f --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu @@ -0,0 +1,12 @@ +#ifdef CUDA_KERNELS_ON_CPU +#undef CUDA_KERNELS_ON_CPU +#endif + +#include "Launch_t.h" + +#ifdef LaunchInCU + void wrapperInCU() { + printf("in cu wrapper\n"); + wrapper(); + } +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.h b/HeterogeneousCore/CUDAUtilities/test/Launch_t.h new file mode 100644 index 0000000000000..3237f6383f47d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.h @@ -0,0 +1,46 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include +#include + +// these are in 
+// cudaCompat: here we do just a minimal test of "launch"
+#ifndef __CUDACC__
+#undef __global__
+#define __global__ inline __attribute__((always_inline))
+#endif
+
+inline constexpr void printEnv() {
+  printf("ENV\n");
+#ifdef CUDA_KERNELS_ON_CPU
+  printf("CUDA_KERNELS_ON_CPU defined\n");
+#endif
+#ifdef __CUDACC__
+  printf("__CUDACC__ defined\n");
+#endif
+#ifdef __CUDA_RUNTIME_H__
+  printf("__CUDA_RUNTIME_H__ defined\n");
+#endif
+#ifdef __CUDA_ARCH__
+  printf("__CUDA_ARCH__ defined\n");
+#endif
+  printf("---\n");
+}
+
+__global__ void hello(float k) {
+  printf("hello from kernel %f\n", k);
+  printEnv();
+}
+
+inline void wrapper() {
+  printf("in Wrapper\n");
+  printEnv();
+
+  cudautils::launch(hello, {1, 1}, 3.14);
+  cudaDeviceSynchronize();
+}
diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp
new file mode 100644
index 0000000000000..e945464c26b74
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp
@@ -0,0 +1,38 @@
+#include "catch.hpp"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"
+
+TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") {
+  SECTION("Single element") {
+    auto ptr = cudautils::make_cpu_unique<int>(cudaStreamDefault);
+    REQUIRE(ptr != nullptr);
+    *ptr = 1;
+    cudautils::memsetAsync(ptr, 0, cudaStreamDefault);
+    REQUIRE(0 == *ptr);
+  }
+
+  SECTION("Reset") {
+    auto ptr = cudautils::make_cpu_unique<int>(cudaStreamDefault);
+    REQUIRE(ptr != nullptr);
+
+    ptr.reset();
+    REQUIRE(ptr.get() == nullptr);
+  }
+
+  SECTION("Multiple elements") {
+    auto ptr = cudautils::make_cpu_unique<int[]>(10, cudaStreamDefault);
+    REQUIRE(ptr != nullptr);
+    for (int i = 0; i < 10; ++i)
+      ptr[i] = 1;
+    cudautils::memsetAsync(ptr, 0, 10, cudaStreamDefault);
+    int s = 0;
+    for (int i = 0; i < 10; ++i)
+      s += ptr[i];
+    REQUIRE(0 == s);
+  }
+
+  SECTION("Allocating too much") {
+    constexpr size_t maxSize = 1 << 30;  // 2**30
+    auto ptr = cudautils::make_cpu_unique<char[]>(maxSize + 1, cudaStreamDefault);
+    REQUIRE(ptr != nullptr);
+  }
+}
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc
index 945bbb28a3262..608c65eff83d1 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc
@@ -105,7 +105,7 @@ namespace cms {
     const edmNew::DetSetVector<SiPixelCluster>& input = *inputhandle;

     // yes a unique ptr of a unique ptr so edm is happy and the pointer stays still...
-    auto hmsp = std::make_unique<uint32_t[]>(gpuClustering::MaxNumModules + 1);
+    auto hmsp = cudautils::make_cpu_unique<uint32_t[]>(gpuClustering::MaxNumModules + 1, cudaStreamDefault);
     auto hitsModuleStart = hmsp.get();
     auto hms = std::make_unique<HMSstorage>(std::move(hmsp));  // hmsp is gone
     iEvent.put(std::move(hms));  // hms is gone! hitsModuleStart still alive and kicking...
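For review context: the producer hunk above and the two below apply the same std::make_unique to cudautils::make_cpu_unique substitution. A minimal usage sketch, assuming only what this patch adds (cpu_unique_ptr.h plus the new memsetAsync overloads); the helper name fillSketch and the buffer size are illustrative, not part of the patch:

#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"

// Illustrative sketch, not part of the patch: scalar and array allocations
// use the same stream-taking signatures as the device/host allocators; on
// the CPU path the stream is accepted and ignored, memory comes from
// ::malloc and is released by CPUDeleter (::free) at scope exit.
inline void fillSketch(cudaStream_t stream) {
  auto counter = cudautils::make_cpu_unique<uint32_t>(stream);
  auto starts = cudautils::make_cpu_unique<uint32_t[]>(2001, stream);  // e.g. MaxNumModules + 1 (hypothetical size)
  cudautils::memsetAsync(counter, 0, stream);       // falls through to plain ::memset
  cudautils::memsetAsync(starts, 0, 2001, stream);  // ditto, count given in elements
}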
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc
index a4f19ac276a7a..7e840e1b266e1 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc
@@ -81,7 +81,7 @@ void SiPixelRecHitFromSOA::acquire(edm::Event const& iEvent,
 void SiPixelRecHitFromSOA::produce(edm::Event& iEvent, edm::EventSetup const& es) {
   // yes a unique ptr of a unique ptr so edm is happy
   auto sizeOfHitModuleStart = gpuClustering::MaxNumModules + 1;
-  auto hmsp = std::make_unique<uint32_t[]>(sizeOfHitModuleStart);
+  auto hmsp = cudautils::make_cpu_unique<uint32_t[]>(sizeOfHitModuleStart, cudaStreamDefault);
   std::copy(m_hitsModuleStart.get(), m_hitsModuleStart.get() + sizeOfHitModuleStart, hmsp.get());
   auto hms = std::make_unique<HMSstorage>(std::move(hmsp));  // hmsp is gone
   iEvent.put(std::move(hms));  // hms is gone!
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
index 7900cf8b2289a..d286583cbc822 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
@@ -103,7 +103,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv
   auto const& input = *hclusters;

   // yes a unique ptr of a unique ptr so edm is happy and the pointer stays still...
-  auto hmsp = std::make_unique<uint32_t[]>(gpuClustering::MaxNumModules + 1);
+  auto hmsp = cudautils::make_cpu_unique<uint32_t[]>(gpuClustering::MaxNumModules + 1, cudaStreamDefault);
   auto hitsModuleStart = hmsp.get();
   auto hms = std::make_unique<HMSstorage>(std::move(hmsp));  // hmsp is gone
   iEvent.put(tokenModuleStart_, std::move(hms));  // hms is gone! hitsModuleStart still alive and kicking...
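The same traits these producers rely on are what the templated kernels later in the patch compile against. A short sketch of the trait-generic allocation pattern this enables, assuming GPUTraits/CPUTraits as extended above and the memsetAsync overloads from this patch; workspaceSketch is illustrative, not part of the patch:

#include <cstdio>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"

// Illustrative only: one body serves both backends. With
// Traits = cudaCompat::GPUTraits the workspace lands in device memory;
// with cudaCompat::CPUTraits it is a malloc'ed cpu::unique_ptr and the
// stream is ignored. The new name/runOnDevice members make the chosen
// backend visible for diagnostics and cheap branches.
template <typename Traits>
void workspaceSketch(cudaStream_t stream) {
  auto ws = Traits::template make_unique<double[]>(10, stream);
  cudautils::memsetAsync(ws, 0, 10, stream);
  printf("workspace on %s (runOnDevice=%d)\n", Traits::name, int(Traits::runOnDevice));
}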
diff --git a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml index d8f6b2a35880a..bae77ee5d8fcf 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml @@ -21,3 +21,12 @@ + + + + + + + + + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc new file mode 100644 index 0000000000000..3b2cb0a921e19 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc @@ -0,0 +1,78 @@ +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & ghits, cudaStream_t stream); + +/* +TrackingRecHit2DCUDA and TrackingRecHit2DCPU are NOT the same type (for tracks and vertices are the same type) +they are templated with the (GPU/CPU)Traits so in principle the whole Analyzer can be (partially) templated as well +they return the same view type though (so the real analyzer algo in the header file above does not need to be templated) +*/ + +class RecHitSoATest : public edm::global::EDAnalyzer<> { +public: + + explicit RecHitSoATest(const edm::ParameterSet& iConfig); + ~RecHitSoATest() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; + const bool m_onGPU; + edm::EDGetTokenT> tGpuHits; + edm::EDGetTokenT tCpuHits; +}; + +RecHitSoATest::RecHitSoATest(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { + if (m_onGPU) { + tGpuHits = + consumes>(iConfig.getParameter("heterogeneousPixelRecHitSrc")); + } else { + tCpuHits = + consumes(iConfig.getParameter("heterogeneousPixelRecHitSrc")); + } +} + + +void RecHitSoATest::analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const { + if (m_onGPU) { + auto const & gh = iEvent.get(tGpuHits); + CUDAScopedContextProduce ctx{gh}; + + analyzeImpl(ctx.get(gh),ctx.stream()); + + } else { + analyzeImpl(iEvent.get(tCpuHits),cudaStreamDefault); + } +} + +void RecHitSoATest::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("onGPU", true); + desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + descriptions.add("RecHitSoATest", desc); +} + 
+DEFINE_FWK_MODULE(RecHitSoATest); diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h new file mode 100644 index 0000000000000..1032501abe3fc --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h @@ -0,0 +1,71 @@ +#include + +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" + +__global__ +void analyzeHits(TrackingRecHit2DSOAView const* hhp, uint32_t nhits, double * ws) { + + assert(hhp); + assert(ws); + + auto const& hh = *hhp; + auto first = blockIdx.x * blockDim.x + threadIdx.x; + +#ifdef CUDA_KERNELS_ON_CPU + std::cout << "kernel ON CPU!" << std::endl; +#else + if (0 == first) printf("kernel ON GPU!\n"); +#endif + + + if (0 == first) + printf("in analyzeHits SoA\n"); + + __shared__ int nh; + __shared__ double sch; + + nh=0; sch=0; + __syncthreads(); + + for (int i = first, ni=nhits; i < ni; i += gridDim.x * blockDim.x) { + if (hh.charge(i) <=0) printf("a hit with zero charge?\n"); + atomicAdd(&nh,1); + atomicAdd(&sch,(double)(hh.charge(i))); + } + + __syncthreads(); + + if(0==threadIdx.x) { + atomicAdd(&ws[0],nh); + atomicAdd(&ws[1],sch); + atomicAdd(&ws[9],1); + if (gridDim.x==ws[9]) { + printf("in event : found %f hits, average charge is %f\n",ws[0], ws[1]/ws[0]); + } + } + +} + +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream) { +#ifdef CUDA_KERNELS_ON_CPU + std::cout << "launching ON CPU!" << std::endl; + assert(!Traits::runOnDevice); +#else + printf("launching ON GPU!\n"); + assert(Traits::runOnDevice); +#endif + + auto nhits = gHits.nHits(); + + if (0 == nhits) return; + + auto ws_d = Traits:: template make_unique(10,stream); + cudautils::memsetAsync(ws_d, 0, 10, stream); + int threadsPerBlock = 256; + int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; + cudautils::launch(analyzeHits,{threadsPerBlock,blocks,0,stream} , gHits.view(), nhits, ws_d.get()); +} diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc new file mode 100644 index 0000000000000..aa9ab7168b6d3 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc @@ -0,0 +1,4 @@ +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream); + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu new file mode 100644 index 0000000000000..0553db894f398 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu @@ -0,0 +1,4 @@ +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream); + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index cc5865d97fd95..6c90ab1dd8f6f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,68 +1,7 @@ #include "BrokenLineFitOnGPU.h" -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { - assert(tuples_d); - - // Fit internals - auto hitsGPU_ = 
std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // fit triplets - kernelBLFastFit<3>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - - kernelBLFit<3>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - - // fit quads - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); - - if (fit5as4_) { - // fit penta (only first 4) - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - } else { - // fit penta (all 5) - kernelBLFastFit<5>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - - kernelBLFit<5>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - } - - } // loop on concurrent fits -} +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 660cf75e1f460..2eba3e9af5965 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,85 +1,7 @@ #include "BrokenLineFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, - cudaStream_t stream) { - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // fit triplets - kernelBLFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<3><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - 
cudaCheck(cudaGetLastError()); - - // fit quads - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); - cudaCheck(cudaGetLastError()); - - if (fit5as4_) { - // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); - } else { - // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<5><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); - } - - } // loop on concurrent fits -} + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 82a5bee443f88..aa7c1e680929e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -185,3 +185,84 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t #endif } } + + +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + int blockSize = 64; + int numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + cudautils::launch(kernelBLFastFit<3>,{numberOfBlocks, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelBLFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + + // fit quads + cudautils::launch(kernelBLFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelBLFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + + if (fit5as4_) { + // fit penta (only first 4) + 
cudautils::launch(kernelBLFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + cudautils::launch(kernelBLFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } else { + // fit penta (all 5) + cudautils::launch(kernelBLFastFit<5>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + cudautils::launch(kernelBLFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } + + } // loop on concurrent fits +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 11b644d466768..a87b45f47cbe5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -74,10 +74,10 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const CUDAScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuples(hits, bf, ctx.stream())); } else { auto const& hits = iEvent.get(tokenHitCPU_); - iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf,cudaStreamDefault)); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 75066458dc170..03a8b927289b8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -1,174 +1,17 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template <> -void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { - kernel_printCounters(counters); -} -template <> -void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) { - kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); -} +template +void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream); -template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); +template +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); -#ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; -#endif +template +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream); - // in principle we can use "nhits" to heuristically dimension the workspace... 
- // overkill to use template here (std::make_unique would suffice) - // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); - device_isOuterHitOfCell_.reset( - (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); - assert(device_isOuterHitOfCell_.get()); - gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); +template +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); - device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); - if (0 == nhits) - return; // protect against empty events - - // FIXME avoid magic numbers - auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - assert(tuples_d && quality_d); - - // zero tuples - cudautils::launchZero(tuples_d, cudaStream); - - auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); - - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; - - // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive - // - - kernel_connect(device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - - if (nhits > 1 && m_params.earlyFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - } - - kernel_find_ntuplets(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_, - tuples_d, - device_hitTuple_apc_, - quality_d, - m_params.minHitsPerNtuplet_); - if (m_params.doStats_) - kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); - - cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); - - // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); - - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - - if (nhits > 1 && m_params.lateFishbone_) { - fishbone(hh.view(), 
device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - } - - if (m_params.doStats_) { - kernel_checkOverflows(tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - } -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, m_params.cuts_, quality_d); - - if (m_params.lateFishbone_) { - // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); - } - - // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - - // fill hit->track "map" - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - - // remove duplicates (tracks that share a hit) - kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - - if (m_params.doStats_) { - // counters (add flag???) - kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); - } - -#ifdef DUMP_GPU_TK_TUPLES - static std::atomic iev(0); - ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); -#endif -} +template +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index aaf882633f17d..8939acff2ebc2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,293 +1,17 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template <> -void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto blockSize = 128; - auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; - kernel_fillHitDetIndices<<>>( - &tracks_d->hitIndices, hv, &tracks_d->detIndices); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} +template +void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream); -template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - // these are pointer on GPU! 
- auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); +template +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - // zero tuples - cudautils::launchZero(tuples_d, cudaStream); +template +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream); - auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); +template +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; - - // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive - // - - auto nthTot = 64; - auto stride = 4; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - auto rescale = numberOfBlocks / 65536; - blockSize *= (rescale + 1); - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - assert(numberOfBlocks < 65536); - assert(blockSize > 0 && 0 == blockSize % 16); - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - - kernel_connect<<>>( - device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - cudaCheck(cudaGetLastError()); - - if (nhits > 1 && m_params.earlyFishbone_) { - auto nthTot = 128; - auto stride = 16; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - cudaCheck(cudaGetLastError()); - } - - blockSize = 64; - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_, - tuples_d, - device_hitTuple_apc_, - quality_d, - m_params.minHitsPerNtuplet_); - cudaCheck(cudaGetLastError()); - - if (m_params.doStats_) - kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); - cudaCheck(cudaGetLastError()); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - blockSize = 128; - numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; - cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); - - // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tuples_d, quality_d); - cudaCheck(cudaGetLastError()); - - blockSize = 128; - numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - kernel_fillMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudaCheck(cudaGetLastError()); - - if 
(nhits > 1 && m_params.lateFishbone_) { - auto nthTot = 128; - auto stride = 16; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - cudaCheck(cudaGetLastError()); - } - - if (m_params.doStats_) { - numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - cudaCheck(cudaGetLastError()); - } -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); - -#ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; -#endif - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cudautils::make_device_unique(std::max(1U, nhits), stream); - assert(device_isOuterHitOfCell_.get()); - { - int threadsPerBlock = 128; - // at least one block! - int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; - gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); - cudaCheck(cudaGetLastError()); - } - - device_theCells_ = cudautils::make_device_unique(m_params.maxNumberOfDoublets_, stream); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - if (0 == nhits) - return; // protect against empty events - - // FIXME avoid magic numbers - auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - int stride = 4; - int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; - int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; - dim3 blks(1, blocks, 1); - dim3 thrs(stride, threadsPerBlock, 1); - gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); - cudaCheck(cudaGetLastError()); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - // these are pointer on GPU! 
- auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - auto blockSize = 64; - - // classify tracks based on kinematics - auto numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_classifyTracks<<>>(tuples_d, tracks_d, m_params.cuts_, quality_d); - cudaCheck(cudaGetLastError()); - - if (m_params.lateFishbone_) { - // apply fishbone cleaning to good tracks - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_fishboneCleaner<<>>( - device_theCells_.get(), device_nCells_, quality_d); - cudaCheck(cudaGetLastError()); - } - - // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - cudaCheck(cudaGetLastError()); - - if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { - // fill hit->track "map" - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); - cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); - } - if (m_params.minHitsPerNtuplet_ < 4) { - // remove duplicates (tracks that share a hit) - numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - kernel_tripletCleaner<<>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); - } - - if (m_params.doStats_) { - // counters (add flag???) 
- numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); - cudaCheck(cudaGetLastError()); - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); - cudaCheck(cudaGetLastError()); - } -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - -#ifdef DUMP_GPU_TK_TUPLES - static std::atomic iev(0); - ++iev; - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { - kernel_printCounters<<<1, 1>>>(counters); -} +template +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 0140958aa9ef2..8213cc9c2af03 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -171,7 +171,7 @@ class CAHitNtupletGeneratorKernels { void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); - void allocateOnGPU(cudaStream_t stream); + void allocate(cudaStream_t stream); void cleanup(cudaStream_t cudaStream); static void printCounters(Counters const* counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index b91911c66924e..e688052f1faf0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -4,9 +4,9 @@ template <> #ifdef __CUDACC__ -void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::allocate(cudaStream_t stream) { #else -void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::allocate(cudaStream_t stream) { #endif ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 6888e6725bc39..19df8f5e249a2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -601,3 +601,282 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun c.nEmptyCells / double(c.nCells), c.nZeroTrackCells / double(c.nCells)); } + + + +// launchers + +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + + +template +void CAHitNtupletGeneratorKernels::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto blockSize = 128; + int numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + + cudautils::launch(kernel_fillHitDetIndices,{numberOfBlocks, blockSize, 0, cudaStream}, + &tracks_d->hitIndices, hv, &tracks_d->detIndices); +#ifdef GPU_DEBUG + 
cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! + auto *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + // zero tuples + cudautils::launchZero(tuples_d, cudaStream); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + auto nthTot = 64; + auto stride = 4; + auto blockSize = nthTot / stride; + int numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + auto rescale = numberOfBlocks / 65536; + blockSize *= (rescale + 1); + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + assert(numberOfBlocks < 65536); + assert(blockSize > 0 && 0 == blockSize % 16); + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + + cudautils::launch(kernel_connect,{blks, thrs, 0, cudaStream}, + device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + + if (nhits > 1 && m_params.earlyFishbone_) { + auto nthTot = 128; + auto stride = 16; + int blockSize = nthTot / stride; + int numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + } + + blockSize = 64; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_find_ntuplets,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_, + tuples_d, + device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); + + if (m_params.doStats_) + cudautils::launch(kernel_mark_used,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + blockSize = 128; + numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; + cudautils::launch(cudautils::finalizeBulk::type>,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_earlyDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, tuples_d, quality_d); + + blockSize = 128; + numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_countMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + 
cudautils::launch(kernel_fillMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_tupleMultiplicity_.get()); + + if (nhits > 1 && m_params.lateFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + } + + if (m_params.doStats_) { + numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + cudautils::launch(kernel_checkOverflows,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + device_isOuterHitOfCell_.get(), + nhits, + m_params.maxNumberOfDoublets_, + counters_); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + device_isOuterHitOfCell_ = TTraits:: template make_unique(std::max(1U, nhits), stream); + assert(device_isOuterHitOfCell_.get()); + { + int threadsPerBlock = 128; + // at least one block! + int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; + cudautils::launch(gpuPixelDoublets::initDoublets,{blocks, threadsPerBlock, 0, stream},device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_, + device_theCellNeighborsContainer_.get(), + device_theCellTracks_, + device_theCellTracksContainer_.get()); + } + + device_theCells_ = TTraits:: template make_unique(m_params.maxNumberOfDoublets_, stream); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (m_params.minHitsPerNtuplet_ > 3) { + nActualPairs = 13; + } + + assert(nActualPairs <= gpuPixelDoublets::nPairs); + int stride = 4; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; + int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; + dim3 blks(1, blocks, 1); + dim3 thrs(stride, threadsPerBlock, 1); + cudautils::launch(gpuPixelDoublets::getDoubletsFromHisto,{blks, thrs, 0, stream},device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZ0Cut_, + m_params.doPtCut_, + m_params.maxNumberOfDoublets_); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointers on GPU!
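+ // cleaning proceeds in stages: kinematic classification first, then optional + // fishbone cleaning, removal of tracks sharing a doublet and, when triplets + // are kept, a hit-sharing cleanup driven by the hit->track map filled below.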
+ auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + int blockSize = 64; + + // classify tracks based on kinematics + int numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_classifyTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, tracks_d, m_params.cuts_, quality_d); + + if (m_params.lateFishbone_) { + // apply fishbone cleaning to good tracks + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_fishboneCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, quality_d); + } + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_fastDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + + if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { + // fill hit->track "map" + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_countHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_hitToTuple_.get()); + cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_fillHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, device_hitToTuple_.get()); + } + if (m_params.minHitsPerNtuplet_ < 4) { + // remove duplicates (tracks that share a hit) + numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + cudautils::launch(kernel_tripletCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + } + + if (m_params.doStats_) { + // counters (add flag???) 
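+ // the two kernels below only accumulate statistics into counters_; + // the totals are reported later through printCounters().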
+ numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + cudautils::launch(kernel_doStatsForHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},device_hitToTuple_.get(), counters_); + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_doStatsForTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, counters_); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + cudautils::launch(kernel_print_found_ntuplets,{1, 32, 0, cudaStream}, + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); +#endif +} + +template +void CAHitNtupletGeneratorKernels::printCounters(Counters const *counters) { + cudautils::launch(kernel_printCounters,{1, 1,0, cudaStreamDefault},counters); +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 2e875caba7130..e6525bd16a49b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -159,61 +159,3 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& "cuts\" based on the fit results (pT, Tip, Zip)."); } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, - float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cudautils::make_device_unique(stream)); - - auto* soa = tracks.get(); - - CAHitNtupletGeneratorKernelsGPU kernels(m_params); - kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - - kernels.allocateOnGPU(stream); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); - - kernels.buildDoublets(hits_d, stream); - kernels.launchKernels(hits_d, soa, stream); - kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" - if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } - kernels.classifyTuples(hits_d, soa, stream); - - return tracks; -} - -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); - - auto* soa = tracks.get(); - assert(soa); - - CAHitNtupletGeneratorKernelsCPU kernels(m_params); - kernels.counters_ = m_counters; - kernels.allocateOnGPU(nullptr); - - kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); - kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" - - if (0 == hits_d.nHits()) - return tracks; - - // now fit - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); - - if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } - - kernels.classifyTuples(hits_d, soa, nullptr); - - return tracks; -} diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index de2e1913dd18b..6e92c1b217cfb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -47,9 +47,8 @@ class CAHitNtupletGeneratorOnGPU { static void fillDescriptions(edm::ParameterSetDescription& desc); static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; - - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + template + PixelTrackHeterogeneous makeTuples(Hits2D const& hits_d, float bfield, cudaStream_t stream) const; private: void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; @@ -63,4 +62,42 @@ class CAHitNtupletGeneratorOnGPU { Counters* m_counters = nullptr; }; +template +inline +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hits_d, + float bfield, + cudaStream_t stream) const { + PixelTrackHeterogeneous tracks(Traits:: template make_unique(stream)); + + using NTupletGeneratorKernels = CAHitNtupletGeneratorKernels; + + auto* soa = tracks.get(); + + NTupletGeneratorKernels kernels(m_params); + kernels.counters_ = m_counters; + kernels.allocate(stream); + + kernels.buildDoublets(hits_d, stream); + kernels.launchKernels(hits_d, soa, stream); + kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + if constexpr (!Traits::runOnDevice) + if (0 == hits_d.nHits()) + return tracks; + + // now fit + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocate(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } else { + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } + kernels.classifyTuples(hits_d, soa, stream); + + return tracks; +} + + + #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index becbd0a1a8540..050a5c83431ce 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,7 +1,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, +void HelixFitOnGPU::allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *helix_fit_results) { tuples_d = tuples; @@ -13,4 +13,4 @@ void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, assert(outputSoa_d); } -void HelixFitOnGPU::deallocateOnGPU() {} +void HelixFitOnGPU::deallocate() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 05b399e870f58..cf877dbd11363 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -41,17 +41,18 @@ class HelixFitOnGPU { using TupleMultiplicity = CAConstants::TupleMultiplicity; explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), 
fit5as4_(fit5as4) {} - ~HelixFitOnGPU() { deallocateOnGPU(); } + ~HelixFitOnGPU() { deallocate(); } void setBField(double bField) { bField_ = bField; } + + template void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + template + void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); - void deallocateOnGPU(); + void allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void deallocate(); private: static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index 3476362864a79..f77f9a4fb9a41 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,110 +1,7 @@ #include "RiemannFitOnGPU.h" -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { - assert(tuples_d); - - // Fit internals - auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - auto circle_fit_resultsGPU_holder = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // triplets - kernelFastFit<3>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<3>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<3>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - // quads - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<4>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<4>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - if (fit5as4_) { - // penta - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<4>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 
circle_fit_resultsGPU_, - offset); - - kernelLineFit<4>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - } else { - // penta all 5 - kernelFastFit<5>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<5>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<5>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - } - } -} +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index cb5d32b47aea3..011b7842f9681 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -1,131 +1,7 @@ #include "RiemannFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, - cudaStream_t stream) { - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); - auto circle_fit_resultsGPU_holder = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // triplets - kernelFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - // quads - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 
circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - if (fit5as4_) { - // penta - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - } else { - // penta all 5 - kernelFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - } - } -} + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index b46de519034d5..c6345ba6eab3b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -187,3 +187,134 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ #endif } } +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + int blockSize = 64; + int numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto circle_fit_resultsGPU_holder = + Traits:: template make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); + Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets + cudautils::launch(kernelFastFit<3>,{numberOfBlocks, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + 3, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 
circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + // quads + cudautils::launch(kernelFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 4, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // penta + cudautils::launch(kernelFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } else { + // penta all 5 + cudautils::launch(kernelFastFit<5>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index e959abcfef949..347476cf1b6a3 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -96,7 +96,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent assert(tracks); - ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin)); + ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make(ctx.stream(), tracks, m_ptMin,m_OnGPU)); } else { auto const* tracks = iEvent.get(tokenCPUTrack_).get(); @@ -117,7 +117,8 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; */ - iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin)); + cudaStream_t stream=cudaStreamDefault; + iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); } } diff --git 
a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc new file mode 100644 index 0000000000000..e8c6a812d86d2 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc @@ -0,0 +1,9 @@ +#define CUDA_KERNELS_ON_CPU // not needed yet +#include "gpuVertexFinderImpl.h" +namespace gpuVertexFinder { +template<> + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + assert(!onGPU); + return makeImpl(stream,tksoa,ptMin); + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp deleted file mode 100644 index 084763385bdb4..0000000000000 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "gpuVertexFinderImpl.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu index 084763385bdb4..96739a42c3b0a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu @@ -1 +1,8 @@ #include "gpuVertexFinderImpl.h" +namespace gpuVertexFinder { +template<> + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + assert(onGPU); + return makeImpl(stream,tksoa,ptMin); + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 6cd86c93a6737..4dc4572da6e7e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -63,8 +63,11 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const; + template + ZVertexHeterogeneous makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + + template + ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index d6e63227ccf85..6fd6e985f25b9 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -1,12 +1,14 @@ -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h" -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h" -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksIterative.h" +#include "gpuClusterTracksByDensity.h" +#include "gpuClusterTracksDBSCAN.h" +#include "gpuClusterTracksIterative.h" #include "gpuFitVertices.h" #include "gpuSortByPt2.h" #include "gpuSplitVertices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + namespace gpuVertexFinder { @@ -84,87 +86,45 @@ namespace gpuVertexFinder { } #endif -#ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { - // std::cout << "producing Vertices on GPU" << std::endl; - ZVertexHeterogeneous vertices(cudautils::make_device_unique(stream)); -#else - 
ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { - // std::cout << "producing Vertices on CPU" << std::endl; - ZVertexHeterogeneous vertices(std::make_unique()); -#endif + template + ZVertexHeterogeneous Producer::makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { + // std::cout << "producing Vertices on " << Traits::name << std::endl; + ZVertexHeterogeneous vertices(Traits::template make_unique(stream)); assert(tksoa); auto* soa = vertices.get(); assert(soa); -#ifdef __CUDACC__ - auto ws_d = cudautils::make_device_unique(stream); -#else - auto ws_d = std::make_unique(); -#endif + auto ws_d = Traits::template make_unique(stream); -#ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); auto blockSize = 128; auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); - cudaCheck(cudaGetLastError()); -#else - cudaCompat::resetGrid(); - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin); -#endif + cudautils::launch(init,{1,1,0,stream},soa, ws_d.get()); + cudautils::launch(loadTracks, {numberOfBlocks, blockSize, 0, stream}, tksoa, soa, ws_d.get(), ptMin); -#ifdef __CUDACC__ if (oneKernel_) { // implemented only for density clusters #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(vertexFinderOneKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - cudaCheck(cudaGetLastError()); + cudautils::launch(vertexFinderKernel1,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); // one block per vertex... - splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); + cudautils::launch(splitVerticesKernel,{1024, 128, 0, stream},soa, ws_d.get(), 9.f); + cudautils::launch(vertexFinderKernel2,{1, 1024 - 256, 0, stream},soa, ws_d.get()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksByDensityKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksDBSCAN,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksIterative,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.); - cudaCheck(cudaGetLastError()); + cudautils::launch(fitVerticesKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), 50.); // one block per vertex... 
- splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.); - cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); + cudautils::launch(splitVerticesKernel,{1024, 128, 0, stream},soa, ws_d.get(), 9.f); + cudautils::launch(fitVerticesKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), 5000.); + cudautils::launch(sortByPt2Kernel,{1, 1024 - 256, 0, stream},soa, ws_d.get()); } - cudaCheck(cudaGetLastError()); -#else // __CUDACC__ - if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); - } - // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; - fitVertices(soa, ws_d.get(), 50.); - // one block per vertex! - blockIdx.x = 0; - gridDim.x = 1; - splitVertices(soa, ws_d.get(), 9.f); - resetGrid(); - fitVertices(soa, ws_d.get(), 5000.); - sortByPt2(soa, ws_d.get()); -#endif return vertices; } diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5261069a6b283..1bf4e223ab52f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -22,7 +22,6 @@ #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h" #ifdef ONE_KERNEL -#ifdef __CUDACC__ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws, int minT, // min number of neighbours to be "seed" @@ -41,7 +40,6 @@ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, sortByPt2(pdata, pws); } #endif -#endif using namespace gpuVertexFinder; @@ -56,7 +54,7 @@ struct Event { struct ClusterGenerator { explicit ClusterGenerator(float nvert, float ntrack) - : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {} + : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(0.001,1.) {} void operator()(Event& ev) { int nclus = clusGen(reng); @@ -68,19 +66,28 @@ struct ClusterGenerator { ev.ztrack.clear(); ev.eztrack.clear(); - ev.ivert.clear(); + ev.pttrack.clear(); + ev.ivert.clear(); + float ptMax=0; float pt5=0; for (int iv = 0; iv < nclus; ++iv) { auto nt = trackGen(reng); - ev.itrack[nclus] = nt; + if (iv == 5) nt *= 3; + ev.itrack[iv] = nt; + float ptSum=0; for (int it = 0; it < nt; ++it) { auto err = errgen(reng); // reality is not flat.... ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng)); ev.eztrack.push_back(err * err); ev.ivert.push_back(iv); - ev.pttrack.push_back((iv == 5 ? 
1.f : 0.5f) + ptGen(reng)); + ev.pttrack.push_back(std::pow(ptGen(reng),iv==5 ?-1.0f:-0.5f)); ev.pttrack.back() *= ev.pttrack.back(); + ptSum += ev.pttrack.back(); } + ptMax = std::max(ptMax,ptSum); + if (iv == 5) pt5 = ptSum; } + std::cout << "PV, ptMax " << std::sqrt(pt5) << ' ' << std::sqrt(ptMax) << std::endl; + // add noise auto nt = 2 * trackGen(reng); for (int it = 0; it < nt; ++it) { @@ -88,7 +95,7 @@ struct ClusterGenerator { ev.ztrack.push_back(rgen(reng)); ev.eztrack.push_back(err * err); ev.ivert.push_back(9999); - ev.pttrack.push_back(0.5f + ptGen(reng)); + ev.pttrack.push_back(std::pow(ptGen(reng),-0.5f)); ev.pttrack.back() *= ev.pttrack.back(); } } @@ -99,7 +106,7 @@ struct ClusterGenerator { std::poisson_distribution clusGen; std::poisson_distribution trackGen; std::normal_distribution gauss; - std::exponential_distribution ptGen; + std::uniform_real_distribution ptGen; // becomes a power law }; // a macro SORRY @@ -112,8 +119,14 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) { printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); } +__global__ void linit(ZVertices * pdata, WorkSpace * pws, int nt) { + auto & __restrict__ data = *pdata; + auto & __restrict__ ws = *pws; + for (int i=0; i(1, nullptr); @@ -127,28 +140,36 @@ int main() { float eps = 0.1f; std::array par{{eps, 0.01f, 9.0f}}; - for (int nav = 30; nav < 80; nav += 20) { + for (int nav = 30; nav < 100; nav += 20) { ClusterGenerator gen(nav, 10); - for (int i = 8; i < 20; ++i) { - auto kk = i / 4; // M param + for (int iii = 8; iii < 20; ++iii) { + auto kk = iii / 4; // M param gen(ev); -#ifdef __CUDACC__ + std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; + int nt = ev.ztrack.size(); + int nvori = ev.zvert.size(); + int ntori = nt; + assert(ntori>>(onGPU_d.get(), ws_d.get()); + linit<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get(),ntori); #else onGPU_d->init(); ws_d->init(); + for (int16_t i=0; iitrk[i]=i; onGPU_d->idv[i] = -1;} // FIXME do the same on GPU.... 
#endif - std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; - auto nt = ev.ztrack.size(); -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + #else ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); @@ -156,19 +177,19 @@ int main() { ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif - std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (iii % 4) << std::endl; - if ((i % 4) == 0) + if ((iii % 4) == 0) par = {{eps, 0.02f, 12.0f}}; - if ((i % 4) == 1) + if ((iii % 4) == 1) par = {{eps, 0.02f, 9.0f}}; - if ((i % 4) == 2) + if ((iii % 4) == 2) par = {{eps, 0.01f, 9.0f}}; - if ((i % 4) == 3) + if ((iii % 4) == 3) par = {{0.7f * eps, 0.01f, 9.0f}}; uint32_t nv = 0; -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); @@ -200,6 +221,7 @@ int main() { continue; } + int16_t * idv = nullptr; float* zv = nullptr; float* wv = nullptr; float* ptv2 = nullptr; @@ -209,19 +231,22 @@ int main() { // keep chi2 separated... float chi2[2 * nv]; // make space for splitting... -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU + int16_t hidv[16000]; float hzv[2 * nv]; float hwv[2 * nv]; float hptv2[2 * nv]; int32_t hnn[2 * nv]; uint16_t hind[2 * nv]; + idv = hidv; zv = hzv; wv = hwv; ptv2 = hptv2; nn = hnn; ind = hind; #else + idv = onGPU_d->idv; zv = onGPU_d->zv; wv = onGPU_d->wv; ptv2 = onGPU_d->ptv2; @@ -229,13 +254,71 @@ int main() { ind = onGPU_d->sortInd; #endif -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif + auto verifyMatch = [&]() { + + // matching-merging metrics + constexpr int MAXMA = 32; + struct Match { Match() {for (auto&e:vid)e=-1; for (auto&e:nt)e=0;} std::array vid; std::array nt; }; + + auto nnn=0; + Match matches[nv]; for (auto kv = 0U; kv < nv; ++kv) { matches[kv] = Match();} + auto iPV = ind[nv - 1]; + for (int it=0; it9990) continue; + assert(iv9990) continue; + ++nnn; + for (int i=0; i=0 && mx0.75f) ++nok; + if (frac[kv]<0.5f) ++nmess; + auto ldz = std::abs(zv[kv] - ev.zvert[itv]); + dz = std::max(dz,ldz); + int nm5=0; int nm7=0; + int ntt=0; + for (int i=0; i0.5f) ++nm5; + if (f>0.75f) ++nm7; + } + if (nm5>1) ++merged50; + if (nm7>1) ++merged75; + if (kv == iPV ) std::cout << "PV " << itv << ' ' << std::sqrt(ptv2[kv]) << ' ' << float(ntt)/float(ev.itrack[itv]) << '/' << frac[kv] << '/' << nm5 << '/' << nm7 << ' ' << dz << std::endl; + } + // for (auto f: frac) std::cout << f << ' '; + // std::cout << std::endl; + std::cout << "ori/tot/matched/merged5//merged7/random/dz " + << nvori << '/' << nv << '/' << nok << '/' << merged50 << '/' << merged75 << '/' << nmess + << '/' << dz << std::endl; + }; // verifyMatch + 
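// verifyMatch() is invoked once per generated event, after the final fit and + // sort below, to compare the reconstructed vertices with the generated ones. +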
+ for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) chi2[j] /= float(nn[j]); @@ -244,7 +327,7 @@ int main() { std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); @@ -263,7 +346,7 @@ int main() { std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU // one vertex per block!!! cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); @@ -276,12 +359,9 @@ int main() { #endif std::cout << "after split " << nv << std::endl; -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); - cudaCheck(cudaGetLastError()); - cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); - cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); @@ -295,14 +375,16 @@ int main() { continue; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(idv, LOC_ONGPU(idv), nt * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif + for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) chi2[j] /= float(nn[j]); @@ -333,7 +415,7 @@ int main() { } dd[kv] = md; } - if (i == 6) { + if (iii == 6) { for (auto d : dd) std::cout << d << ' '; std::cout << std::endl; @@ -345,6 +427,10 @@ int main() { rms = std::sqrt(rms) / (nv - 1); std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl; + + verifyMatch(); + + } // loop on events } // loop on ave vert
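Note on cudautils::launch, on which essentially every hunk above relies: below is a minimal sketch of the dispatch it is expected to perform, assuming the interface declared in HeterogeneousCore/CUDAUtilities/interface/launch.h keeps the (kernel, {grid, block, sharedMem, stream}, args...) call shape used throughout this patch. The type name LaunchParameters and the exact signatures are illustrative only, not the real implementation. The point is that a single call site either becomes a genuine <<<...>>> launch under nvcc or a plain function call in a CPU-only translation unit, which is what lets GPUTraits, HostTraits and CPUTraits share the same kernel code.

#include <cstddef>  // std::size_t
#include <utility>  // std::forward

namespace cudautils {
  // grid / block / shared-memory / stream bundle; dim3 converts implicitly
  // from an int, so both {blocks, threads, 0, stream} and {blks, thrs, 0, stream}
  // spellings seen above are covered by aggregate initialization
  struct LaunchParameters {
    dim3 gridDim;
    dim3 blockDim;
    std::size_t sharedMemBytes = 0;
    cudaStream_t stream = nullptr;
  };

#ifdef __CUDACC__
  // GPU build: a genuine kernel launch, followed by an immediate error check
  template <typename F, typename... Args>
  void launch(F* kernel, LaunchParameters config, Args&&... args) {
    kernel<<<config.gridDim, config.blockDim, config.sharedMemBytes, config.stream>>>(
        std::forward<Args>(args)...);
    cudaCheck(cudaGetLastError());
  }
#else
  // CPU build: the launch configuration is ignored and the "kernel" runs
  // inline, with cudaCompat emulating threadIdx/blockIdx as a single thread
  // (dim3, cudaStream_t and cudaCheck are assumed to come from the
  // compatibility headers in a CPU-only translation unit)
  template <typename F, typename... Args>
  void launch(F* kernel, LaunchParameters, Args&&... args) {
    kernel(std::forward<Args>(args)...);
  }
#endif
}  // namespace cudautils

Under this shape a trivial grid such as the one used for kernel_printCounters runs as cudautils::launch(kernel_printCounters, {1, 1, 0, cudaStreamDefault}, counters) in both builds, which is exactly how the printCounters code above uses it.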