Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#ifndef CUDADataFormats_TrackingRecHit_interface_SiPixelHitStatus_H
#define CUDADataFormats_TrackingRecHit_interface_SiPixelHitStatus_H

#include <cstdint>

// more information on bit fields : https://en.cppreference.com/w/cpp/language/bit_field
struct SiPixelHitStatus {
bool isBigX : 1; // ∈[0,1]
bool isOneX : 1; // ∈[0,1]
bool isBigY : 1; // ∈[0,1]
bool isOneY : 1; // ∈[0,1]
uint8_t qBin : 3; // ∈[0,1,...,7]
};

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ class TrackingRecHit2DHeterogeneous {

TrackingRecHit2DHeterogeneous() = default;

explicit TrackingRecHit2DHeterogeneous(uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream);
explicit TrackingRecHit2DHeterogeneous(
uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);

~TrackingRecHit2DHeterogeneous() = default;

Expand All @@ -41,6 +43,9 @@ class TrackingRecHit2DHeterogeneous {
cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;

// needs specialization for Host
void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);

private:
static constexpr uint32_t n16 = 4; // number of elements in m_store16
static constexpr uint32_t n32 = 10; // number of elements in m_store32
Expand All @@ -65,20 +70,27 @@ class TrackingRecHit2DHeterogeneous {
int16_t* m_iphi;
};

using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream)
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
: m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);

view->m_nHits = nHits;
m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream); // leave it on host and pass it by value?
m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_cpeParams = cpeParams;
view->m_hitsModuleStart = hitsModuleStart;
Expand All @@ -98,15 +110,21 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
// if ordering is relevant they may have to be stored phi-ordered by layer or so
// this will break 1to1 correspondence with cluster and module locality
// so unless proven VERY inefficient we keep it ordered as generated
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
m_store32 =
Traits::template make_device_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream);
m_PhiBinnerStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);

// host copy is "reduced" (to be reviewed at some point)
if constexpr (std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
// it has to compile for ALL cases
copyFromGPU(input, stream);
} else {
assert(input == nullptr);
m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream);
m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
}

static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
auto get32 = [&](int i) { return m_store32.get() + i * nHits; };

// copy all the pointers
Expand All @@ -118,20 +136,25 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
view->m_yl = get32(1);
view->m_xerr = get32(2);
view->m_yerr = get32(3);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));

view->m_xg = get32(4);
view->m_yg = get32(5);
view->m_zg = get32(6);
view->m_rg = get32(7);
if constexpr (!std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
assert(input == nullptr);
view->m_xg = get32(5);
view->m_yg = get32(6);
view->m_zg = get32(7);
view->m_rg = get32(8);

m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(0));
auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(1));

view->m_charge = reinterpret_cast<int32_t*>(get32(8));
view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
view->m_detInd = get16(1);
view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
view->m_detInd = get16(0);

m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
}

// transfer view
if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
Expand Down
53 changes: 53 additions & 0 deletions CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h

#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
#include "CUDADataFormats/Common/interface/HostProduct.h"

// a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction
class TrackingRecHit2DReduced {
public:
using HLPstorage = HostProduct<float[]>;
using HIDstorage = HostProduct<uint16_t[]>;

template <typename UP32, typename UP16>
TrackingRecHit2DReduced(UP32&& istore32, UP16&& istore16, int nhits)
: m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) {
auto get32 = [&](int i) { return const_cast<float*>(m_store32.get()) + i * nhits; };

// copy all the pointers (better be in sync with the producer store)

m_view.m_xl = get32(0);
m_view.m_yl = get32(1);
m_view.m_xerr = get32(2);
m_view.m_yerr = get32(3);
m_view.m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
m_view.m_detInd = const_cast<uint16_t*>(m_store16.get());
}

// view only!
TrackingRecHit2DReduced(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {}

TrackingRecHit2DReduced() = default;
~TrackingRecHit2DReduced() = default;

TrackingRecHit2DReduced(const TrackingRecHit2DReduced&) = delete;
TrackingRecHit2DReduced& operator=(const TrackingRecHit2DReduced&) = delete;
TrackingRecHit2DReduced(TrackingRecHit2DReduced&&) = default;
TrackingRecHit2DReduced& operator=(TrackingRecHit2DReduced&&) = default;

TrackingRecHit2DSOAView& view() { return m_view; }
TrackingRecHit2DSOAView const& view() const { return m_view; }

auto nHits() const { return m_nHits; }

private:
TrackingRecHit2DSOAView m_view;

HLPstorage m_store32;
HIDstorage m_store16;

int m_nHits;
};

#endif
22 changes: 19 additions & 3 deletions CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
#include "CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h"

namespace pixelCPEforGPU {
struct ParamsOnGPU;
}

class TrackingRecHit2DSOAView {
public:
using Status = SiPixelHitStatus;
static_assert(sizeof(Status) == sizeof(uint8_t));

using hindex_type = uint32_t; // if above is <=2^32

using PhiBinner = cms::cuda::HistoContainer<int16_t, 128, -1, 8 * sizeof(int16_t), hindex_type, 10>;
Expand All @@ -22,6 +26,7 @@ class TrackingRecHit2DSOAView {

template <typename>
friend class TrackingRecHit2DHeterogeneous;
friend class TrackingRecHit2DReduced;

__device__ __forceinline__ uint32_t nHits() const { return m_nHits; }

Expand All @@ -47,8 +52,18 @@ class TrackingRecHit2DSOAView {
__device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; }
__device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); }

__device__ __forceinline__ int32_t& charge(int i) { return m_charge[i]; }
__device__ __forceinline__ int32_t charge(int i) const { return __ldg(m_charge + i); }
__device__ __forceinline__ void setChargeAndStatus(int i, uint32_t ich, Status is) {
ich = std::min(ich, chargeMask());
uint32_t w = *reinterpret_cast<uint8_t*>(&is);
ich |= (w << 24);
m_chargeAndStatus[i] = ich;
}

__device__ __forceinline__ Status status(int i) const {
uint8_t w = __ldg(m_chargeAndStatus + i) >> 24;
return *reinterpret_cast<Status*>(&w);
}

__device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; }
__device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); }
__device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; }
Expand Down Expand Up @@ -79,7 +94,8 @@ class TrackingRecHit2DSOAView {
int16_t* m_iphi;

// cluster properties
int32_t* m_charge;
static constexpr uint32_t chargeMask() { return (1 << 24) - 1; }
uint32_t* m_chargeAndStatus;
int16_t* m_xsize;
int16_t* m_ysize;
uint16_t* m_detInd;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,22 @@

template <>
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream);
auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream);
return ret;
}

template <>
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint32_t[]>(gpuClustering::maxNumModules + 1, stream);
cudaCheck(cudaMemcpyAsync(
ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (gpuClustering::maxNumModules + 1), cudaMemcpyDefault, stream));
return ret;
}

// the only specialization needed
template <>
void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
assert(input);
m_store32 = input->localCoordToHostAsync(stream);
}
1 change: 1 addition & 0 deletions CUDADataFormats/TrackingRecHit/src/classes.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif // CUDADataFormats_SiPixelCluster_src_classes_h
6 changes: 4 additions & 2 deletions CUDADataFormats/TrackingRecHit/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
<lcgdict>
<class name="TrackingRecHit2DCPU" persistent="false"/>
<class name="TrackingRecHit2DHost" persistent="false"/>
<class name="cms::cuda::Product<TrackingRecHit2DGPU>" persistent="false"/>
<class name="edm::Wrapper<TrackingRecHit2DCPU>" persistent="false"/>
<class name="TrackingRecHit2DHost" persistent="false"/>
<class name="edm::Wrapper<TrackingRecHit2DHost>" persistent="false"/>
<class name="cms::cuda::Product<TrackingRecHit2DGPU>" persistent="false"/>
<class name="edm::Wrapper<cms::cuda::Product<TrackingRecHit2DGPU>>" persistent="false"/>
<class name="TrackingRecHit2DReduced" persistent="false"/>
<class name="edm::Wrapper<TrackingRecHit2DReduced>" persistent="false"/>
</lcgdict>
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,17 @@ int main() {
cudaStream_t stream;
cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

auto nHits = 200;
// inner scope to deallocate memory before destroying the stream
{
auto nHits = 200;
TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream);

testTrackingRecHit2D::runKernels(tkhit.view());

TrackingRecHit2DHost tkhitH(nHits, nullptr, nullptr, stream, &tkhit);
cudaStreamSynchronize(stream);
assert(tkhitH.view());
assert(tkhitH.view()->nHits() == unsigned(nHits));
}

cudaCheck(cudaStreamDestroy(stream));
Expand Down
Loading