Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CUDADataFormats/TrackingRecHit/interface/SiPixelStatus.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#ifndef CUDADataFormats_TrackingRecHit_interface_SiPixelStatus_H
#define CUDADataFormats_TrackingRecHit_interface_SiPixelStatus_H

struct SiPixelStatus {
uint8_t isBigX : 1;
uint8_t isOneX : 1;
uint8_t isBigY : 1;
uint8_t isOneY : 1;
uint8_t qBin : 3;
};

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ class TrackingRecHit2DHeterogeneous {

TrackingRecHit2DHeterogeneous() = default;

explicit TrackingRecHit2DHeterogeneous(uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream);
explicit TrackingRecHit2DHeterogeneous(
uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);

~TrackingRecHit2DHeterogeneous() = default;

Expand All @@ -41,6 +43,9 @@ class TrackingRecHit2DHeterogeneous {
cms::cuda::host::unique_ptr<uint16_t[]> detIndexToHostAsync(cudaStream_t stream) const;
cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;

// needs specialization for Host
void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);

private:
static constexpr uint32_t n16 = 4;
static constexpr uint32_t n32 = 9;
Expand All @@ -64,26 +69,33 @@ class TrackingRecHit2DHeterogeneous {
int16_t* m_iphi;
};

using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream)
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
: m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);

view->m_nHits = nHits;
m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream); // leave it on host and pass it by value?
m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_cpeParams = cpeParams;
view->m_hitsModuleStart = hitsModuleStart;

// if empy do not bother
if (0 == nHits) {
if (0 == nHits)
if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
cms::cuda::copyAsync(m_view, view, stream);
} else {
Expand All @@ -97,38 +109,57 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
// if ordering is relevant they may have to be stored phi-ordered by layer or so
// this will break 1to1 correspondence with cluster and module locality
// so unless proven VERY inefficient we keep it ordered as generated
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11, stream);
m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>(stream);

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
// host copy is "reduced" (to be reviewed at some point)
if
#ifdef __cpp_if_constexpr
constexpr
#endif
(std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
// it has to compile for ALL cases
copyFromGPU(input, stream);
} else {
assert(input == nullptr);
m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_unique<float[]>(nHits * n32 + 11, stream);
m_HistStore = Traits::template make_unique<TrackingRecHit2DSOAView::Hist>(stream);
}

auto get32 = [&](int i) { return m_store32.get() + i * nHits; };

// copy all the pointers
m_hist = view->m_hist = m_HistStore.get();

view->m_xl = get32(0);
view->m_yl = get32(1);
view->m_xerr = get32(2);
view->m_yerr = get32(3);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));

view->m_xg = get32(4);
view->m_yg = get32(5);
view->m_zg = get32(6);
view->m_rg = get32(7);

m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(0));

view->m_charge = reinterpret_cast<int32_t*>(get32(8));
view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
view->m_detInd = get16(1);

m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
if
#ifdef __cpp_if_constexpr
constexpr
#endif
(!std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
assert(input == nullptr);
view->m_xg = get32(5);
view->m_yg = get32(6);
view->m_zg = get32(7);
view->m_rg = get32(8);

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(1));

view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
view->m_detInd = get16(0);

m_hist = view->m_hist = m_HistStore.get();
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
}

// transfer view
if
#ifndef __CUDACC__
#ifdef __cpp_if_constexpr
constexpr
#endif
(std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
Expand All @@ -138,9 +169,4 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
}
}

using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;

#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
52 changes: 52 additions & 0 deletions CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h

#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
#include "CUDADataFormats/Common/interface/HostProduct.h"

// a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction
class TrackingRecHit2DReduced {
public:
using HLPstorage = HostProduct<float[]>;
using HIDstorage = HostProduct<uint16_t[]>;

template <typename UP32, typename UP16>
TrackingRecHit2DReduced(UP32&& istore32, UP16&& istore16, int nhits)
: m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) {
auto get32 = [&](int i) { return const_cast<float*>(m_store32.get()) + i * nhits; };

// copy all the pointers (better be in sync with the producer store)

m_view.m_xl = get32(0);
m_view.m_yl = get32(1);
m_view.m_xerr = get32(2);
m_view.m_yerr = get32(3);
m_view.m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
m_view.m_detInd = const_cast<uint16_t*>(m_store16.get());
}

// view only!
TrackingRecHit2DReduced(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {}

TrackingRecHit2DReduced() = default;
~TrackingRecHit2DReduced() = default;

TrackingRecHit2DReduced(const TrackingRecHit2DReduced&) = delete;
TrackingRecHit2DReduced& operator=(const TrackingRecHit2DReduced&) = delete;
TrackingRecHit2DReduced(TrackingRecHit2DReduced&&) = default;
TrackingRecHit2DReduced& operator=(TrackingRecHit2DReduced&&) = default;

TrackingRecHit2DSOAView& view() { return m_view; }
TrackingRecHit2DSOAView const& view() const { return m_view; }

auto nHits() const { return m_nHits; }

TrackingRecHit2DSOAView m_view;

HLPstorage m_store32;
HIDstorage m_store16;

int m_nHits;
};

#endif
23 changes: 20 additions & 3 deletions CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
#include "CUDADataFormats/TrackingRecHit/interface/SiPixelStatus.h"

namespace pixelCPEforGPU {
struct ParamsOnGPU;
}

class TrackingRecHit2DSOAView {
public:
using Status = SiPixelStatus;
static_assert(sizeof(Status) == sizeof(uint8_t));

static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; }
using hindex_type = uint16_t; // if above is <=2^16

Expand All @@ -24,6 +28,7 @@ class TrackingRecHit2DSOAView {

template <typename>
friend class TrackingRecHit2DHeterogeneous;
friend class TrackingRecHit2DReduced;

__device__ __forceinline__ uint32_t nHits() const { return m_nHits; }

Expand All @@ -49,8 +54,19 @@ class TrackingRecHit2DSOAView {
__device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; }
__device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); }

__device__ __forceinline__ int32_t& charge(int i) { return m_charge[i]; }
__device__ __forceinline__ int32_t charge(int i) const { return __ldg(m_charge + i); }
__device__ __forceinline__ void setChargeAndStatus(int i, uint32_t ich, Status is) {
// static_assert(0xffffff == chargeMask());
ich = std::min(ich, chargeMask());
uint32_t w = *reinterpret_cast<uint8_t*>(&is);
ich |= (w << 24);
m_chargeAndStatus[i] = ich;
}

__device__ __forceinline__ Status status(int i) const {
uint8_t w = __ldg(m_chargeAndStatus + i) >> 24;
return *reinterpret_cast<Status*>(&w);
}
__device__ __forceinline__ uint32_t charge(int i) const { return __ldg(m_chargeAndStatus + i) & chargeMask(); }
__device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; }
__device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); }
__device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; }
Expand Down Expand Up @@ -81,7 +97,8 @@ class TrackingRecHit2DSOAView {
int16_t* m_iphi;

// cluster properties
int32_t* m_charge;
static constexpr uint32_t chargeMask() { return (1 << 24) - 1; }
uint32_t* m_chargeAndStatus;
int16_t* m_xsize;
int16_t* m_ysize;
uint16_t* m_detInd;
Expand Down
24 changes: 19 additions & 5 deletions CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

template <>
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream);
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream);
return ret;
}

template <>
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::detIndexToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint16_t[]>(nHits(), stream);
cms::cuda::copyAsync(ret, m_store16, nHits(), stream);
return ret;
}

template <>
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint32_t[]>(2001, stream);
cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream));
return ret;
}

// the only specialization needed
template <>
void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
assert(input);
m_store32 = input->localCoordToHostAsync(stream);
}
1 change: 1 addition & 0 deletions CUDADataFormats/TrackingRecHit/src/classes.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif // CUDADataFormats_SiPixelCluster_src_classes_h
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@ int main() {
cudaStream_t stream;
cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

uint32_t nHits = 200;
// inner scope to deallocate memory before destroying the stream
{
auto nHits = 200;
TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream);

testTrackingRecHit2D::runKernels(tkhit.view());
}

TrackingRecHit2DHost tkhitH(nHits, nullptr, nullptr, stream, &tkhit);
cudaStreamSynchronize(stream);
assert(tkhitH.view());
assert(tkhitH.view()->nHits() == nHits);
}
cudaCheck(cudaStreamDestroy(stream));

return 0;
Expand Down
5 changes: 0 additions & 5 deletions RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,6 @@

#include "DataFormats/CaloRecHit/interface/MultifitComputations.h"

// needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
// TODO reuse some of the HCAL constats from
//#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
// ?

#include "SimpleAlgoGPU.h"
#include "KernelHelpers.h"
Expand Down
Loading