From ce7301ff375896a44e8094fef146883a51258528 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 5 Mar 2020 12:36:29 +0100 Subject: [PATCH 01/15] Add common tools for processing on GPUs --- .../GeometrySurface/interface/SOARotation.h | 28 ++ .../GeometrySurface/test/BuildFile.xml | 13 + .../test/gpuFrameTransformKernel.cu | 40 +++ .../test/gpuFrameTransformTest.cpp | 114 ++++++ DataFormats/Math/BuildFile.xml | 7 +- .../Math/interface/choleskyInversion.h | 333 +++++++++++++++++ DataFormats/Math/test/BuildFile.xml | 86 +++-- DataFormats/Math/test/CholeskyInvert_t.cpp | 136 +++++++ DataFormats/Math/test/CholeskyInvert_t.cu | 207 +++++++++++ DataFormats/Math/test/cudaAtan2Test.cu | 34 +- DataFormats/Math/test/cudaMathTest.cu | 53 ++- DataFormats/Math/test/testAtan2.cpp | 4 + .../interface/AtomicPairCounter.h | 52 +++ .../CUDAUtilities/interface/HistoContainer.h | 340 ++++++++++++++++++ .../CUDAUtilities/interface/cudaCompat.h | 103 ++++++ .../interface/cudastdAlgorithm.h | 70 ++++ .../CUDAUtilities/interface/eigenSoA.h | 55 +++ .../CUDAUtilities/interface/prefixScan.h | 169 +++++++++ .../CUDAUtilities/interface/radixSort.h | 266 ++++++++++++++ .../CUDAUtilities/src/cudaCompat.cc | 15 + .../CUDAUtilities/test/AtomicPairCounter_t.cu | 65 ++++ .../CUDAUtilities/test/HistoContainer_t.cpp | 146 ++++++++ .../CUDAUtilities/test/HistoContainer_t.cu | 154 ++++++++ .../CUDAUtilities/test/OneHistoContainer_t.cu | 140 ++++++++ .../CUDAUtilities/test/OneToManyAssoc_t.cpp | 1 + .../CUDAUtilities/test/OneToManyAssoc_t.cu | 1 + .../CUDAUtilities/test/OneToManyAssoc_t.h | 308 ++++++++++++++++ .../CUDAUtilities/test/cudastdAlgorithm_t.cpp | 34 ++ .../CUDAUtilities/test/cudastdAlgorithm_t.cu | 30 ++ .../CUDAUtilities/test/eigenSoA_t.cpp | 1 + .../CUDAUtilities/test/eigenSoA_t.cu | 1 + .../CUDAUtilities/test/eigenSoA_t.h | 101 ++++++ .../CUDAUtilities/test/prefixScan_t.cu | 165 +++++++++ .../CUDAUtilities/test/radixSort_t.cu | 202 +++++++++++ 34 files changed, 3388 insertions(+), 86 deletions(-) create mode 100644 DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu create mode 100644 DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp create mode 100644 DataFormats/Math/interface/choleskyInversion.h create mode 100644 DataFormats/Math/test/CholeskyInvert_t.cpp create mode 100644 DataFormats/Math/test/CholeskyInvert_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/prefixScan.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/radixSort.h create mode 100644 HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc create mode 100644 HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h create mode 100644 
HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h create mode 100644 HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu diff --git a/DataFormats/GeometrySurface/interface/SOARotation.h b/DataFormats/GeometrySurface/interface/SOARotation.h index 331a56b7ecf57..d75efef4736cb 100644 --- a/DataFormats/GeometrySurface/interface/SOARotation.h +++ b/DataFormats/GeometrySurface/interface/SOARotation.h @@ -100,6 +100,34 @@ class SOAFrame { uz += pz; } + constexpr inline void toGlobal(T cxx, T cxy, T cyy, T *gl) const { + auto const &r = rot; + gl[0] = r.xx() * (r.xx() * cxx + r.yx() * cxy) + r.yx() * (r.xx() * cxy + r.yx() * cyy); + gl[1] = r.xx() * (r.xy() * cxx + r.yy() * cxy) + r.yx() * (r.xy() * cxy + r.yy() * cyy); + gl[2] = r.xy() * (r.xy() * cxx + r.yy() * cxy) + r.yy() * (r.xy() * cxy + r.yy() * cyy); + gl[3] = r.xx() * (r.xz() * cxx + r.yz() * cxy) + r.yx() * (r.xz() * cxy + r.yz() * cyy); + gl[4] = r.xy() * (r.xz() * cxx + r.yz() * cxy) + r.yy() * (r.xz() * cxy + r.yz() * cyy); + gl[5] = r.xz() * (r.xz() * cxx + r.yz() * cxy) + r.yz() * (r.xz() * cxy + r.yz() * cyy); + } + + constexpr inline void toLocal(T const *ge, T &lxx, T &lxy, T &lyy) const { + auto const &r = rot; + + T cxx = ge[0]; + T cyx = ge[1]; + T cyy = ge[2]; + T czx = ge[3]; + T czy = ge[4]; + T czz = ge[5]; + + lxx = r.xx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.xy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.xz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lxy = r.yx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.yy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.yz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lyy = r.yx() * (r.yx() * cxx + r.yy() * cyx + r.yz() * czx) + + r.yy() * (r.yx() * cyx + r.yy() * cyy + r.yz() * czy) + r.yz() * (r.yx() * czx + r.yy() * czy + r.yz() * czz); + } + constexpr inline T x() const { return px; } constexpr inline T y() const { return py; } constexpr inline T z() const { return pz; } diff --git a/DataFormats/GeometrySurface/test/BuildFile.xml b/DataFormats/GeometrySurface/test/BuildFile.xml index 050cdb4c8f19d..5f4db224a639b 100644 --- a/DataFormats/GeometrySurface/test/BuildFile.xml +++ b/DataFormats/GeometrySurface/test/BuildFile.xml @@ -13,3 +13,16 @@ + + + + + + + + + + + + + diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu new file mode 100644 index 0000000000000..c24510146fb59 --- /dev/null +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu @@ -0,0 +1,40 @@ +#include +#include +#include + +#include "DataFormats/GeometrySurface/interface/SOARotation.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +__global__ void toGlobal(SOAFrame const* frame, + float const* xl, + float const* yl, + float* x, + float* y, + float* z, + float const* le, + float* ge, + uint32_t n) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= n) + return; + + frame[0].toGlobal(xl[i], yl[i], x[i], y[i], z[i]); + frame[0].toGlobal(le[3 * i], le[3 * i + 1], le[3 * i + 2], ge + 6 * i); +} + +void toGlobalWrapper(SOAFrame const* frame, + float const* xl, + float const* yl, + 
float* x, + float* y, + float* z, + float const* le, + float* ge, + uint32_t n) { + int threadsPerBlock = 256; + int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA toGlobal kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" + << std::endl; + + cms::cuda::launch(toGlobal, {blocksPerGrid, threadsPerBlock}, frame, xl, yl, x, y, z, le, ge, n); +} diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp new file mode 100644 index 0000000000000..ad62b7a1d131c --- /dev/null +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "DataFormats/GeometrySurface/interface/GloballyPositioned.h" +#include "DataFormats/GeometrySurface/interface/SOARotation.h" +#include "DataFormats/GeometrySurface/interface/TkRotation.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +void toGlobalWrapper(SOAFrame const *frame, + float const *xl, + float const *yl, + float *x, + float *y, + float *z, + float const *le, + float *ge, + uint32_t n); + +int main(void) { + cms::cudatest::requireDevices(); + + typedef float T; + typedef TkRotation Rotation; + typedef SOARotation SRotation; + typedef GloballyPositioned Frame; + typedef SOAFrame SFrame; + typedef typename Frame::PositionType Position; + typedef typename Frame::GlobalVector GlobalVector; + typedef typename Frame::GlobalPoint GlobalPoint; + typedef typename Frame::LocalVector LocalVector; + typedef typename Frame::LocalPoint LocalPoint; + + constexpr uint32_t size = 10000; + constexpr uint32_t size32 = size * sizeof(float); + + float xl[size], yl[size]; + float x[size], y[size], z[size]; + + // errors + float le[3 * size]; + float ge[6 * size]; + + auto d_xl = cms::cuda::make_device_unique(size, nullptr); + auto d_yl = cms::cuda::make_device_unique(size, nullptr); + + auto d_x = cms::cuda::make_device_unique(size, nullptr); + auto d_y = cms::cuda::make_device_unique(size, nullptr); + auto d_z = cms::cuda::make_device_unique(size, nullptr); + + auto d_le = cms::cuda::make_device_unique(3 * size, nullptr); + auto d_ge = cms::cuda::make_device_unique(6 * size, nullptr); + + double a = 0.01; + double ca = std::cos(a); + double sa = std::sin(a); + + Rotation r1(ca, sa, 0, -sa, ca, 0, 0, 0, 1); + Frame f1(Position(2, 3, 4), r1); + std::cout << "f1.position() " << f1.position() << std::endl; + std::cout << "f1.rotation() " << '\n' << f1.rotation() << std::endl; + + SFrame sf1(f1.position().x(), f1.position().y(), f1.position().z(), f1.rotation()); + + auto d_sf = cms::cuda::make_device_unique(sizeof(SFrame), nullptr); + cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice)); + + for (auto i = 0U; i < size; ++i) { + xl[i] = yl[i] = 0.1f * float(i) - float(size / 2); + le[3 * i] = 0.01f; + le[3 * i + 2] = (i > size / 2) ? 
1.f : 0.04f;
+    le[3 * i + 1] = 0.;  // off-diagonal term of the local 2x2 covariance
+  }
+  std::random_shuffle(xl, xl + size);
+  std::random_shuffle(yl, yl + size);
+
+  cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice));
+
+  toGlobalWrapper((SFrame const *)(d_sf.get()),
+                  d_xl.get(),
+                  d_yl.get(),
+                  d_x.get(),
+                  d_y.get(),
+                  d_z.get(),
+                  d_le.get(),
+                  d_ge.get(),
+                  size);
+  cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost));
+
+  float eps = 0.;
+  for (auto i = 0U; i < size; ++i) {
+    auto gp = f1.toGlobal(LocalPoint(xl[i], yl[i]));
+    eps = std::max(eps, std::abs(x[i] - gp.x()));
+    eps = std::max(eps, std::abs(y[i] - gp.y()));
+    eps = std::max(eps, std::abs(z[i] - gp.z()));
+  }
+
+  std::cout << "max eps " << eps << std::endl;
+
+  return 0;
+}
diff --git a/DataFormats/Math/BuildFile.xml b/DataFormats/Math/BuildFile.xml
index 6aa1d86287860..83d06125a017c 100644
--- a/DataFormats/Math/BuildFile.xml
+++ b/DataFormats/Math/BuildFile.xml
@@ -1,6 +1,7 @@ - - + + + - +
diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h
new file mode 100644
index 0000000000000..eba2a17648008
--- /dev/null
+++ b/DataFormats/Math/interface/choleskyInversion.h
@@ -0,0 +1,333 @@
+#ifndef DataFormat_Math_choleskyInversion_h
+#define DataFormat_Math_choleskyInversion_h
+
+#include <cmath>
+
+#include <Eigen/Core>
+
+/**
+ * fully inlined specialized code to perform the inversion of a
+ * positive definite matrix of rank up to 6.
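For illustration, a minimal host-side sketch (not part of this patch; the helper name and tolerances are hypothetical) of exercising the new covariance methods added to SOAFrame: with an orthonormal rotation, projecting the packed global 3x3 error back into the local frame must reproduce the 2x2 input.

#include <cassert>
#include <cmath>

#include "DataFormats/GeometrySurface/interface/SOARotation.h"

// Round-trip a local 2x2 covariance through the packed global 3x3
// representation (6 independent elements, lower triangle); the
// projection back into the local frame recovers the input up to
// float rounding.
inline void checkCovRoundTrip(SOAFrame<float> const& f) {
  float const cxx = 0.01f, cxy = 0.f, cyy = 0.04f;
  float ge[6];
  f.toGlobal(cxx, cxy, cyy, ge);  // local 2x2 -> packed global 3x3
  float lxx = 0.f, lxy = 0.f, lyy = 0.f;
  f.toLocal(ge, lxx, lxy, lyy);  // packed global 3x3 -> local 2x2
  assert(std::abs(lxx - cxx) < 1.e-5f);
  assert(std::abs(lxy - cxy) < 1.e-5f);
  assert(std::abs(lyy - cyy) < 1.e-5f);
}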
+ * + * originally by + * @author Manuel Schiller + * @date Aug 29 2008 + * + * + */ +namespace choleskyInversion { + + template + inline constexpr void invert11(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + dst(0, 0) = F(1.0) / src(0, 0); + } + + template + inline constexpr void invert22(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0) * src(1, 0) * luc0; + auto luc2 = F(1.0) / (src(1, 1) - luc1); + + auto li21 = luc1 * luc0 * luc2; + + dst(0, 0) = li21 + luc0; + dst(1, 0) = -src(1, 0) * luc0 * luc2; + dst(1, 1) = luc2; + } + + template + inline constexpr void invert33(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); + luc5 = F(1.0) / luc5; + + auto li21 = -luc0 * luc1; + auto li32 = -(luc2 * luc4); + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + + dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; + dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; + dst(1, 1) = luc5 * li32 * li32 + luc2; + dst(2, 0) = luc5 * li31; + dst(2, 1) = luc5 * li32; + dst(2, 2) = luc5; + } + + template + inline constexpr void invert44(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + + dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc9 * li43 * li43 + luc5; + dst(3, 0) = luc9 * li41; + dst(3, 1) = luc9 * li42; + dst(3, 2) = luc9 * li43; + dst(3, 3) = luc9; + } + + template + inline constexpr void invert55(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 
* luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc14 * li54 * li54 + luc9; + dst(4, 0) = luc14 * li51; + dst(4, 1) = luc14 * li52; + dst(4, 2) = luc14 * li53; + dst(4, 3) = luc14 * li54; + dst(4, 4) = luc14; + } + + template + inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + auto luc15 = src(5, 0); + auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); + auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); + auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); + auto luc19 = + (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); + auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + + luc9 * luc18 * luc18 + luc14 * luc19 * luc19); + luc20 = F(1.0) / luc20; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - 
luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + auto li65 = -luc19 * luc14; + auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; + auto li63 = (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; + auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - + luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + + luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * + luc2; + auto li61 = + (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + + luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + + luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - + luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - + luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - luc19 * luc13 * luc6 * luc9 * luc14 + + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + luc19 * luc10 * luc14 - luc15) * + luc0; + + dst(0, 0) = + luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; + dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; + dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; + dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; + dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; + dst(4, 4) = luc20 * li65 * li65 + luc14; + dst(5, 0) = luc20 * li61; + dst(5, 1) = luc20 * li62; + dst(5, 2) = luc20 * li63; + dst(5, 3) = luc20 * li64; + dst(5, 4) = luc20 * li65; + dst(5, 5) = luc20; + } + + template + inline constexpr void symmetrize11(M& dst) {} + template + inline constexpr void symmetrize22(M& dst) { + dst(0, 1) = dst(1, 0); + } + template + inline constexpr void symmetrize33(M& dst) { + symmetrize22(dst); + dst(0, 2) = dst(2, 0); + dst(1, 2) = dst(2, 1); + } + template + inline constexpr void symmetrize44(M& dst) { + symmetrize33(dst); + dst(0, 3) = dst(3, 0); + dst(1, 3) = dst(3, 1); + dst(2, 3) = dst(3, 2); + } + template + inline constexpr void 
symmetrize55(M& dst) { + symmetrize44(dst); + dst(0, 4) = dst(4, 0); + dst(1, 4) = dst(4, 1); + dst(2, 4) = dst(4, 2); + dst(3, 4) = dst(4, 3); + } + template + inline constexpr void symmetrize66(M& dst) { + symmetrize55(dst); + dst(0, 5) = dst(5, 0); + dst(1, 5) = dst(5, 1); + dst(2, 5) = dst(5, 2); + dst(3, 5) = dst(5, 3); + dst(4, 5) = dst(5, 4); + } + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert22(src, dst); + symmetrize22(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert33(src, dst); + symmetrize33(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert44(src, dst); + symmetrize44(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert55(src, dst); + symmetrize55(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert66(src, dst); + symmetrize66(dst); + } + }; + + // Eigen interface + template + inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) { + using M1 = Eigen::DenseBase; + using M2 = Eigen::DenseBase; + Inverter::eval(src, dst); + } + +} // namespace choleskyInversion + +#endif // DataFormat_Math_choleskyInversion_h diff --git a/DataFormats/Math/test/BuildFile.xml b/DataFormats/Math/test/BuildFile.xml index 6b1112e30472c..5f2dd3854f6d1 100644 --- a/DataFormats/Math/test/BuildFile.xml +++ b/DataFormats/Math/test/BuildFile.xml @@ -1,27 +1,31 @@ - - - - + + + + - - - - + + + + - + + - + + - + + - + + @@ -29,75 +33,97 @@ - + - + + - + + - + + - + + - + + + + + - + - + - + + - + + - + + + + - + + + - - + + - - + + - - + + + + + + + + + diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp new file mode 100644 index 0000000000000..f1bf92353dfc3 --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t.cpp @@ -0,0 +1,136 @@ +// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride >; + +// generate matrices +template +void genMatrix(M& m) { + using T = typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} + +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? 
"SOA" : "AOS") + << " mode" << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + + constexpr unsigned int SIZE = 4 * 1024; + + alignas(128) MX mm[stride()]; // just storage in case of SOA + double* __restrict__ p = (double*)__builtin_assume_aligned(mm, 128); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto& m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + else + for (auto& m : mm) { + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + delta -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep +#ifdef __clang__ +#pragma clang loop vectorize(enable) interleave(enable) +#endif + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + } + else +#pragma GCC ivdep + for (auto& m : mm) { + choleskyInversion::invert(m, m); + } + + delta += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + double DNNK = NKK; + std::cout << "x86 computation took " << std::chrono::duration_cast(delta).count() / DNNK + << ' ' << " ms" << std::endl; +} + +int main() { + go<2>(false); + go<3>(false); + go<4>(false); + go<5>(false); + go<6>(false); + + go<2>(true); + go<3>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu new file mode 100644 index 0000000000000..55df4ed23f20d --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -0,0 +1,207 @@ +// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride>; + +template +__global__ void invertSOA(double *__restrict__ p, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + MapMX m(p + i); + choleskyInversion::invert(m, m); +} + +template +__global__ void invert(M *mm, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto &m = mm[i]; + choleskyInversion::invert(m, m); +} + +template +__global__ void invertSeq(M *mm, unsigned int n) { + if (threadIdx.x != 0) + return; + auto first = blockIdx.x * blockDim.x; + auto last = std::min(first + blockDim.x, n); + + for (auto i = first; i < last; ++i) { + auto &m = mm[i]; + choleskyInversion::invert(m, m); + } +} + +// generate matrices +template +void genMatrix(M &m) { + using T = typename 
std::remove_reference<decltype(m(0, 0))>::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution<T> rgen(0., 1.);
+
+  // generate first diagonal elements
+  for (int i = 0; i < n; ++i) {
+    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
+    m(i, i) = maxVal * rgen(eng);
+  }
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix positive definite
+      m(i, j) = v * rgen(eng);
+      m(j, i) = m(i, j);
+    }
+  }
+}
+
+template <int N>
+void go(bool soa) {
+  constexpr unsigned int DIM = N;
+  using MX = MXN<N>;
+  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+
+  auto start = std::chrono::high_resolution_clock::now();
+  auto delta = start - start;
+  auto delta1 = delta;
+  auto delta2 = delta;
+
+  constexpr unsigned int SIZE = 4 * 1024;
+
+  MX mm[stride()];  // just storage in case of SOA
+  double *__restrict__ p = (double *)(mm);
+
+  if (soa) {
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      genMatrix(m);
+    }
+  } else {
+    for (auto &m : mm)
+      genMatrix(m);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  if (soa)
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+  else
+    for (auto &m : mm) {
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  auto m_d = cms::cuda::make_device_unique<double[]>(DIM * DIM * stride(), nullptr);
+  cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice));
+
+  constexpr int NKK =
+#ifdef DOPROF
+      2;
+#else
+      1000;
+#endif
+  for (int kk = 0; kk < NKK; ++kk) {
+    int threadsPerBlock = 128;
+    int blocksPerGrid = SIZE / threadsPerBlock;
+
+    delta -= (std::chrono::high_resolution_clock::now() - start);
+
+    if (soa)
+      cms::cuda::launch(invertSOA<N>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
+    else
+      cms::cuda::launch(invert<MX>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+
+    cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
+
+    delta += (std::chrono::high_resolution_clock::now() - start);
+
+    if (0 == kk)
+      std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+    if (!soa) {
+      delta1 -= (std::chrono::high_resolution_clock::now() - start);
+
+#ifndef DOPROF
+      cms::cuda::launch(invertSeq<MX>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+      cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
+#endif
+      delta1 += (std::chrono::high_resolution_clock::now() - start);
+
+      if (0 == kk)
+        std::cout << mm[SIZE / 2](1, 1) << std::endl;
+    }
+
+    delta2 -= (std::chrono::high_resolution_clock::now() - start);
+    if (soa)
+#pragma GCC ivdep
+      for (unsigned int i = 0; i < SIZE; ++i) {
+        MapMX<N> m(p + i);
+        choleskyInversion::invert(m, m);
+      }
+    else
+#pragma GCC ivdep
+      for (auto &m : mm) {
+        choleskyInversion::invert(m, m);
+      }
+
+    delta2 += (std::chrono::high_resolution_clock::now() - start);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  double DNNK = NKK;
+  std::cout << "cuda/cudaSeq/x86 computation took "
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
+            << std::endl;
+}
+
+int main() {
+  cms::cudatest::requireDevices();
+
+  go<2>(false);
+  go<4>(false);
+  go<5>(false);
+ 
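For reference, the benchmark above times invert() in bulk; in isolation the entry point is just as usable on a single Eigen matrix, dispatching on the column count to the hand-unrolled invertNN plus symmetrizeNN. A minimal self-contained sketch (function name and tolerance are illustrative, not from the patch):

#include <cassert>

#include <Eigen/Core>

#include "DataFormats/Math/interface/choleskyInversion.h"

// Invert a small symmetric positive-definite matrix in place
// (invert33 computes all Cholesky factors before writing, so
// src == dst is safe) and check M * M^-1 against the identity.
inline void exampleInvert() {
  Eigen::Matrix3d m;
  m << 4.0, 1.0, 0.5,  //
      1.0, 3.0, 0.2,   //
      0.5, 0.2, 2.0;
  Eigen::Matrix3d inv = m;
  choleskyInversion::invert(inv, inv);  // rank 3 -> invert33 + symmetrize33
  assert((m * inv - Eigen::Matrix3d::Identity()).norm() < 1.e-10);
}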
go<6>(false); + + go<2>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu index ecc0be911c777..731447fe826e4 100644 --- a/DataFormats/Math/test/cudaAtan2Test.cu +++ b/DataFormats/Math/test/cudaAtan2Test.cu @@ -25,10 +25,13 @@ end #include #include #include - -#include "cuda/api_wrappers.h" +#include #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" constexpr float xmin = -100.001; // avoid 0 constexpr float incr = 0.04; @@ -62,15 +65,13 @@ void go() { auto start = std::chrono::high_resolution_clock::now(); auto delta = start - start; - auto current_device = cuda::device::current::get(); - // atan2 delta -= (std::chrono::high_resolution_clock::now() - start); - auto diff_d = cuda::memory::device::make_unique(current_device, 3); + auto diff_d = cms::cuda::make_device_unique(3, nullptr); int diffs[3]; - cuda::memory::device::zero(diff_d.get(), 3 * 4); + cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4)); // Launch the diff CUDA Kernel dim3 threadsPerBlock(32, 32, 1); @@ -79,9 +80,9 @@ void go() { std::cout << "CUDA kernel 'diff' launch with " << blocksPerGrid.x << " blocks of " << threadsPerBlock.y << " threads\n"; - cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); + cms::cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); - cuda::memory::copy(diffs, diff_d.get(), 3 * 4); + cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); float mdiff = diffs[0] * 1.e-7; @@ -95,26 +96,15 @@ void go() { } int main() { - int count = 0; - auto status = cudaGetDeviceCount(&count); - if (status != cudaSuccess) { - std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } - if (count == 0) { - std::cerr << "No CUDA devices on this system, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } + cms::cudatest::requireDevices(); try { go<3>(); go<5>(); go<7>(); go<9>(); - } catch (cuda::runtime_error &ex) { - std::cerr << "CUDA error: " << ex.what() << std::endl; + } catch (std::runtime_error &ex) { + std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl; exit(EXIT_FAILURE); } catch (...) 
{ std::cerr << "A non-CUDA error occurred" << std::endl; diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index 6aeaa0f2ededb..dd6576de46c1c 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -25,12 +25,7 @@ end #include #include #include - -#include "cuda/api_wrappers.h" - -#include -#include -#include +#include #ifdef __CUDACC__ #define inline __host__ __device__ inline @@ -40,6 +35,14 @@ end #include #endif +#include "DataFormats/Math/interface/approx_log.h" +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + std::mt19937 eng; std::mt19937 eng2; std::uniform_real_distribution rgen(0., 1.); @@ -85,8 +88,6 @@ void go() { auto start = std::chrono::high_resolution_clock::now(); auto delta = start - start; - auto current_device = cuda::device::current::get(); - int numElements = 200000; size_t size = numElements * sizeof(float); std::cout << "[Vector of " << numElements << " elements]\n"; @@ -100,12 +101,12 @@ void go() { std::generate(h_B.get(), h_B.get() + numElements, [&]() { return rgen(eng); }); delta -= (std::chrono::high_resolution_clock::now() - start); - auto d_A = cuda::memory::device::make_unique(current_device, numElements); - auto d_B = cuda::memory::device::make_unique(current_device, numElements); - auto d_C = cuda::memory::device::make_unique(current_device, numElements); + auto d_A = cms::cuda::make_device_unique(numElements, nullptr); + auto d_B = cms::cuda::make_device_unique(numElements, nullptr); + auto d_C = cms::cuda::make_device_unique(numElements, nullptr); - cuda::memory::copy(d_A.get(), h_A.get(), size); - cuda::memory::copy(d_B.get(), h_B.get(), size); + cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda alloc+copy took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -116,19 +117,21 @@ void go() { std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cms::cuda::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cms::cuda::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::memory::copy(h_C.get(), d_C.get(), size); + cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, 
cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda copy back took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -178,27 +181,15 @@ void go() { } int main() { - int count = 0; - auto status = cudaGetDeviceCount(&count); - if (status != cudaSuccess) { - std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } - if (count == 0) { - std::cerr << "No CUDA devices on this system, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } + cms::cudatest::requireDevices(); try { go(); go(); go(); - go(); - } catch (cuda::runtime_error &ex) { - std::cerr << "CUDA error: " << ex.what() << std::endl; + } catch (std::runtime_error &ex) { + std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl; exit(EXIT_FAILURE); } catch (...) { std::cerr << "A non-CUDA error occurred" << std::endl; diff --git a/DataFormats/Math/test/testAtan2.cpp b/DataFormats/Math/test/testAtan2.cpp index 207e69c848de9..11c951fc2e06b 100644 --- a/DataFormats/Math/test/testAtan2.cpp +++ b/DataFormats/Math/test/testAtan2.cpp @@ -195,6 +195,10 @@ void testIntPhi() { } int main() { + constexpr float p2i = ((int)(std::numeric_limits::max()) + 1) / M_PI; + constexpr float i2p = M_PI / ((int)(std::numeric_limits::max()) + 1); + std::cout << std::hexfloat << "p2i i2p " << p2i << " " << i2p << std::defaultfloat << std::endl; + std::cout << unsafe_atan2f<5>(0.f, 0.f) << " " << std::atan2(0., 0.) << std::endl; std::cout << unsafe_atan2f<5>(0.5f, 0.5f) << " " << std::atan2(0.5, 0.5) << std::endl; std::cout << unsafe_atan2f<5>(0.5f, -0.5f) << " " << std::atan2(0.5, -0.5) << std::endl; diff --git a/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h new file mode 100644 index 0000000000000..8854b02176260 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h @@ -0,0 +1,52 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h +#define HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +class AtomicPairCounter { +public: + using c_type = unsigned long long int; + + AtomicPairCounter() {} + AtomicPairCounter(c_type i) { counter.ac = i; } + + __device__ __host__ AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } + + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; + + union Atomic2 { + Counters counters; + c_type ac; + }; + + static constexpr c_type incr = 1UL << 32; + + __device__ __host__ Counters get() const { return counter.counters; } + + // increment n by 1 and m by i. 
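As an aside on the intended use of this counter: add() returns the pair of counters as they were before the update, so concurrent callers each obtain a disjoint slice [m, m + i) of a shared output buffer. A serial host-side sketch of that reservation pattern (names invented here):

#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"

// Three "producers" reserve room for their items: n counts producers,
// m is the running offset into a common output buffer.
inline void reserveExample() {
  AtomicPairCounter apc(0);
  uint32_t const sizes[3] = {5, 2, 7};
  for (auto s : sizes) {
    auto prev = apc.add(s);  // counters *before* this add
    // this producer owns out[prev.m] ... out[prev.m + s - 1]
    (void)prev;
  }
  // here apc.get().n == 3 and apc.get().m == 14
}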
return previous value + __host__ __device__ __forceinline__ Counters add(uint32_t i) { + c_type c = i; + c += incr; + Atomic2 ret; +#ifdef __CUDA_ARCH__ + ret.ac = atomicAdd(&counter.ac, c); +#else + ret.ac = counter.ac; + counter.ac += c; +#endif + return ret.counters; + } + +private: + Atomic2 counter; +}; + +#endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h new file mode 100644 index 0000000000000..67b2b46f45101 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -0,0 +1,340 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h +#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h + +#include +#ifndef __CUDA_ARCH__ +#include +#endif // __CUDA_ARCH__ +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" + +namespace cms { + namespace cuda { + + template + __global__ void countFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).count(v[i], ih); + } + } + + template + __global__ void fillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).fill(v[i], i, ih); + } + } + + template + inline void launchZero(Histo *__restrict__ h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); +#ifdef __CUDACC__ + cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream)); +#else + ::memset(off, 0, 4 * Histo::totbins()); +#endif + } + + template + inline void launchFinalize(Histo *__restrict__ h, + uint8_t *__restrict__ ws +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + , + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { +#ifdef __CUDACC__ + assert(ws); + uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + size_t wss = Histo::wsSize(); + assert(wss > 0); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); +#else + h->finalize(); +#endif + } + + template + inline void fillManyFromVector(Histo *__restrict__ h, + uint8_t *__restrict__ ws, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + int nthreads, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + launchZero(h, stream); +#ifdef __CUDACC__ + auto nblocks = (totSize + nthreads - 1) / nthreads; + 
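To make the calling convention of fillManyFromVector concrete before the kernel launches below, here is a sketch of filling a single histogram serially (no nvcc), assuming the HistoContainer template parameters <typename T, uint32_t NBINS, uint32_t SIZE, uint32_t S, typename I, uint32_t NHISTS> with the defaults as reconstructed further down; all names in the sketch are illustrative.

#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

// 16-bit values, 128 bins, at most 1024 entries, one histogram.
using Hist = HistoContainer<uint16_t, 128, 1024>;

inline void fillExample(uint16_t const* v, uint32_t n) {
  Hist h;
  uint32_t const offsets[2] = {0, n};  // one histogram covering v[0, n)
  // ws may be null in the serial build; the stream argument defaults.
  cms::cuda::fillManyFromVector(&h, nullptr, 1, v, offsets, n, 256);
  // now h.size(b) is the multiplicity of bin b and
  // [h.begin(b), h.end(b)) are the indices of its entries in v.
}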
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
+      cudaCheck(cudaGetLastError());
+      launchFinalize(h, ws, stream);
+      fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
+      cudaCheck(cudaGetLastError());
+#else
+      countFromVector(h, nh, v, offsets);
+      h->finalize();
+      fillFromVector(h, nh, v, offsets);
+#endif
+    }
+
+    template <typename Assoc>
+    __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
+      assoc->bulkFinalizeFill(*apc);
+    }
+
+  }  // namespace cuda
+}  // namespace cms
+
+// iterate over N bins left and right of the one containing "v"
+template <typename Hist, typename V, typename Func>
+__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
+  int bs = Hist::bin(value);
+  int be = std::min(int(Hist::nbins() - 1), bs + n);
+  bs = std::max(0, bs - n);
+  assert(be >= bs);
+  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
+    func(*pj);
+  }
+}
+
+// iterate over bins containing all values in window wmin, wmax
+template <typename Hist, typename V, typename Func>
+__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
+  auto bs = Hist::bin(wmin);
+  auto be = Hist::bin(wmax);
+  assert(be >= bs);
+  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
+    func(*pj);
+  }
+}
+
+template <typename T,                  // type of the discretized input values
+          uint32_t NBINS,              // number of bins
+          uint32_t SIZE,               // max number of elements
+          uint32_t S = sizeof(T) * 8,  // number of significant bits in T
+          typename I = uint32_t,       // type stored in the container
+          uint32_t NHISTS = 1          // number of histograms stored
+          >
+class HistoContainer {
+public:
+  using Counter = uint32_t;
+
+  using CountersOnly = HistoContainer<T, NBINS, 0, S, I, NHISTS>;
+
+  using index_type = I;
+  using UT = typename std::make_unsigned<T>::type;
+
+  static constexpr uint32_t ilog2(uint32_t v) {
+    constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+    constexpr uint32_t s[] = {1, 2, 4, 8, 16};
+
+    uint32_t r = 0;  // result of log2(v) will go here
+    for (auto i = 4; i >= 0; i--)
+      if (v & b[i]) {
+        v >>= s[i];
+        r |= s[i];
+      }
+    return r;
+  }
+
+  static constexpr uint32_t sizeT() { return S; }
+  static constexpr uint32_t nbins() { return NBINS; }
+  static constexpr uint32_t nhists() { return NHISTS; }
+  static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
+  static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
+  static constexpr uint32_t capacity() { return SIZE; }
+
+  static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
+
+  __host__ static size_t wsSize() {
+#ifdef __CUDACC__
+    uint32_t *v = nullptr;
+    void *d_temp_storage = nullptr;
+    size_t temp_storage_bytes = 0;
+    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
+    return temp_storage_bytes;
+#else
+    return 0;
+#endif
+  }
+
+  static constexpr UT bin(T t) {
+    constexpr uint32_t shift = sizeT() - nbits();
+    constexpr uint32_t mask = (1 << nbits()) - 1;
+    return (t >> shift) & mask;
+  }
+
+  __host__ __device__ void zero() {
+    for (auto &i : off)
+      i = 0;
+  }
+
+  __host__ __device__ void add(CountersOnly const &co) {
+    for (uint32_t i = 0; i < totbins(); ++i) {
+#ifdef __CUDA_ARCH__
+      atomicAdd(off + i, co.off[i]);
+#else
+      auto &a = (std::atomic<Counter> &)(off[i]);
+      a += co.off[i];
+#endif
+    }
+  }
+
+  static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
+#ifdef __CUDA_ARCH__
+    return atomicAdd(&x, 1);
+#else
+    auto &a = (std::atomic<Counter> &)(x);
+    return a++;
+#endif
+  }
+
+  static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
+#ifdef __CUDA_ARCH__
+    return atomicSub(&x, 1);
+#else
+    auto &a = (std::atomic<Counter> &)(x);
+    return a--;
+#endif
+  }
+
+  __host__ __device__ __forceinline__ void countDirect(T b) {
+    assert(b < nbins());
+    atomicIncrement(off[b]);
+  }
+
+  __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) {
+    assert(b < nbins());
+    auto w = 
atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __device__ __host__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + __device__ __host__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __device__ __host__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! + off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void count(T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __device__ __host__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + index_type bins[capacity()]; +}; + +template +using OneToManyAssoc = HistoContainer; + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h new file mode 100644 index 0000000000000..32c7d7b481a50 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -0,0 +1,103 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h +#define HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h + +/* + * Everything you need to run cuda code in plain sequential c++ code + */ + +#ifndef __CUDACC__ + +#include +#include +#include + +#include + +namespace cudaCompat { + +#ifndef __CUDA_RUNTIME_H__ + struct dim3 { + uint32_t x, y, z; + }; +#endif + const dim3 threadIdx = {0, 0, 0}; + const dim3 blockDim = {1, 1, 1}; + + extern thread_local dim3 blockIdx; + extern thread_local dim3 gridDim; + + template + T1 atomicInc(T1* a, T2 b) { + auto ret = *a; + if ((*a) < T1(b)) + (*a)++; + return ret; + } + + template + T1 atomicAdd(T1* a, T2 b) { + auto ret = *a; + (*a) += b; + return ret; + } + + template + T1 atomicSub(T1* a, T2 b) { + auto ret = *a; + (*a) -= b; + return ret; 
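These stubs are what let the same __global__ code compile twice, once with nvcc and once as plain C++. As a sketch of the pattern (kernel and driver are invented for illustration), in a host-only translation unit the kernel collapses to an inline function iterated over a 1x1 grid:

#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

// The canonical grid-stride loop; with cudaCompat the indices collapse
// to first == 0 and stride == 1, i.e. a plain serial loop.
__global__ void scale(float* x, float a, uint32_t n) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (auto i = first; i < n; i += gridDim.x * blockDim.x)
    x[i] *= a;
}

#ifndef __CUDACC__
inline void scaleOnHost(float* x, float a, uint32_t n) {
  cudaCompat::resetGrid();  // blockIdx = {0,0,0}, gridDim = {1,1,1}
  scale(x, a, n);           // __global__ is now just "inline"
}
#endif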
+  }
+
+  template <typename T1, typename T2>
+  T1 atomicMin(T1* a, T2 b) {
+    auto ret = *a;
+    *a = std::min(*a, T1(b));
+    return ret;
+  }
+  template <typename T1, typename T2>
+  T1 atomicMax(T1* a, T2 b) {
+    auto ret = *a;
+    *a = std::max(*a, T1(b));
+    return ret;
+  }
+
+  inline void __syncthreads() {}
+  inline void __threadfence() {}
+  inline bool __syncthreads_or(bool x) { return x; }
+  inline bool __syncthreads_and(bool x) { return x; }
+  template <typename T>
+  inline T __ldg(T const* x) {
+    return *x;
+  }
+
+  inline void resetGrid() {
+    blockIdx = {0, 0, 0};
+    gridDim = {1, 1, 1};
+  }
+
+}  // namespace cudaCompat
+
+// some not needed as done by cuda runtime...
+#ifndef __CUDA_RUNTIME_H__
+#define __host__
+#define __device__
+#define __global__
+#define __shared__
+#define __forceinline__
+#endif
+
+// make sure functions are inlined to avoid multiple definition
+#ifndef __CUDA_ARCH__
+#undef __global__
+#define __global__ inline __attribute__((always_inline))
+#undef __forceinline__
+#define __forceinline__ inline __attribute__((always_inline))
+#endif
+
+#ifndef __CUDA_ARCH__
+using namespace cudaCompat;
+#endif
+
+#endif
+
+#endif  // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
new file mode 100644
index 0000000000000..4ff01d75ad02f
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
@@ -0,0 +1,70 @@
+#ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
+#define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
+
+#include <utility>
+
+#include <cuda_runtime.h>
+
+// reimplementation of std algorithms able to compile with CUDA and run on GPUs,
+// mostly by declaring them constexpr
+
+namespace cuda_std {
+
+  template <typename T = void>
+  struct less {
+    __host__ __device__ constexpr bool operator()(const T &lhs, const T &rhs) const { return lhs < rhs; }
+  };
+
+  template <>
+  struct less<void> {
+    template <typename T, typename U>
+    __host__ __device__ constexpr bool operator()(const T &lhs, const U &rhs) const {
+      return lhs < rhs;
+    }
+  };
+
+  template <typename RandomIt, typename T, typename Compare = less<T>>
+  __host__ __device__ constexpr RandomIt lower_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) {
+    auto count = last - first;
+
+    while (count > 0) {
+      auto it = first;
+      auto step = count / 2;
+      it += step;
+      if (comp(*it, value)) {
+        first = ++it;
+        count -= step + 1;
+      } else {
+        count = step;
+      }
+    }
+    return first;
+  }
+
+  template <typename RandomIt, typename T, typename Compare = less<T>>
+  __host__ __device__ constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) {
+    auto count = last - first;
+
+    while (count > 0) {
+      auto it = first;
+      auto step = count / 2;
+      it += step;
+      if (!comp(value, *it)) {
+        first = ++it;
+        count -= step + 1;
+      } else {
+        count = step;
+      }
+    }
+    return first;
+  }
+
+  template <typename RandomIt, typename T, typename Compare = less<T>>
+  __host__ __device__ constexpr RandomIt binary_find(RandomIt first, RandomIt last, const T &value, Compare comp = {}) {
+    first = cuda_std::lower_bound(first, last, value, comp);
+    return first != last && !comp(value, *first) ? 
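These binary searches are exactly what countFromVector/fillFromVector above use to map a flat element index back to the histogram that owns it. A host-side sketch of that offset lookup (data and function name invented):

#include <cassert>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"

// offsets[k] .. offsets[k+1] delimit the elements of "histogram" k;
// upper_bound finds the first offset greater than i, so off - offsets - 1
// is the histogram owning element i.
inline void offsetLookupExample() {
  uint32_t const offsets[4] = {0, 3, 7, 10};  // 3 ranges over 10 elements
  for (uint32_t i = 0; i < 10; ++i) {
    auto off = cuda_std::upper_bound(offsets, offsets + 4, i);
    int32_t ih = off - offsets - 1;
    assert(ih >= 0 && ih < 3);
    assert(i >= offsets[ih] && i < offsets[ih + 1]);
  }
}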
first : last; + } + +} // namespace cuda_std + +#endif // HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h b/HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h new file mode 100644 index 0000000000000..021402c93c203 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h @@ -0,0 +1,55 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h +#define HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h + +#include +#include +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace eigenSoA { + + constexpr bool isPowerOf2(int32_t v) { return v && !(v & (v - 1)); } + + template + class alignas(128) ScalarSoA { + public: + using Scalar = T; + + __host__ __device__ constexpr Scalar& operator()(int32_t i) { return data_[i]; } + __device__ constexpr const Scalar operator()(int32_t i) const { return __ldg(data_ + i); } + __host__ __device__ constexpr Scalar& operator[](int32_t i) { return data_[i]; } + __device__ constexpr const Scalar operator[](int32_t i) const { return __ldg(data_ + i); } + + __host__ __device__ constexpr Scalar* data() { return data_; } + __host__ __device__ constexpr Scalar const* data() const { return data_; } + + private: + Scalar data_[S]; + static_assert(isPowerOf2(S), "SoA stride not a power of 2"); + static_assert(sizeof(data_) % 128 == 0, "SoA size not a multiple of 128"); + }; + + template + class alignas(128) MatrixSoA { + public: + using Scalar = typename M::Scalar; + using Map = Eigen::Map >; + using CMap = Eigen::Map >; + + __host__ __device__ constexpr Map operator()(int32_t i) { return Map(data_ + i); } + __host__ __device__ constexpr CMap operator()(int32_t i) const { return CMap(data_ + i); } + __host__ __device__ constexpr Map operator[](int32_t i) { return Map(data_ + i); } + __host__ __device__ constexpr CMap operator[](int32_t i) const { return CMap(data_ + i); } + + private: + Scalar data_[S * M::RowsAtCompileTime * M::ColsAtCompileTime]; + static_assert(isPowerOf2(S), "SoA stride not a power of 2"); + static_assert(sizeof(data_) % 128 == 0, "SoA size not a multiple of 128"); + }; + +} // namespace eigenSoA + +#endif // HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h new file mode 100644 index 0000000000000..8b784bdd61bfe --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -0,0 +1,169 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h +#define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#ifdef __CUDA_ARCH__ +template +__device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { + // ci and co may be the same + auto x = ci[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, offset); + if (laneId >= offset) + x += y; + } + co[i] = x; +} +#endif + +//same as above may remove +#ifdef __CUDA_ARCH__ +template +__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) { + auto x = c[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, 
offset); + if (laneId >= offset) + x += y; + } + c[i] = x; +} +#endif + +// limited to 32*32 elements.... +template +__device__ __host__ void __forceinline__ blockPrefixScan(T const* __restrict__ ci, + T* __restrict__ co, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif +) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(ci, co, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = co[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + co[i] += ws[warpId - 1]; + } + __syncthreads(); +#else + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; +#endif +} + +// same as above, may remove +// limited to 32*32 elements.... +template +__device__ __host__ void __forceinline__ blockPrefixScan(T* c, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif +) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(c, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + __syncthreads(); +#else + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; +#endif +} + +// limited to 1024*1024 elements.... +template +__global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { + __shared__ T ws[32]; + // first each block does a scan of size 1024; (better be enough blocks....) + assert(1024 * gridDim.x >= size); + int off = 1024 * blockIdx.x; + if (size - off > 0) + blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); + + // count blocks that finished + __shared__ bool isLastBlockDone; + if (0 == threadIdx.x) { + auto value = atomicAdd(pc, 1); // block counter + isLastBlockDone = (value == (int(gridDim.x) - 1)); + } + + __syncthreads(); + + if (!isLastBlockDone) + return; + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block + __shared__ T psum[1024]; + for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { + auto j = 1024 * i + 1023; + psum[i] = (j < size) ? co[j] : T(0); + } + __syncthreads(); + blockPrefixScan(psum, psum, gridDim.x, ws); + + // now it would have been handy to have the other blocks around... 
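+  // Fixup pass, spelled out: at this point psum[k] holds the inclusive scan
+  // of the per-block partial sums, so every element of chunk k (k > 0) is
+  // still missing the grand total of chunks 0..k-1, which is psum[k - 1].
+  // E.g. for an all-ones input of size 3000: psum = {1024, 2048, 3000},
+  // and co[1500] = 477 (local scan within chunk 1) + psum[0] = 1501.
+  // Only this last surviving block performs the fixup, walking the output
+  // in 1024-element strides, hence the 1024 * 1024 size limit noted above.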
+ int first = threadIdx.x; // + blockDim.x * blockIdx.x + for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { + auto k = i / 1024; // block + co[i] += psum[k - 1]; + } +} + +#endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h new file mode 100644 index 0000000000000..ce2acf2006090 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -0,0 +1,266 @@ +#ifndef HeterogeneousCoreCUDAUtilities_radixSort_H +#define HeterogeneousCoreCUDAUtilities_radixSort_H + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +template +__device__ inline void dummyReorder(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} + +template +__device__ inline void reorderSigned(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + int32_t first = threadIdx.x; + __shared__ uint32_t firstNeg; + firstNeg = a[ind[0]] < 0 ? 0 : size; + __syncthreads(); + + // find first negative + for (auto i = first; i < size - 1; i += blockDim.x) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + } + + __syncthreads(); + + auto ii = first; + for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + ii = size - firstNeg + threadIdx.x; + assert(ii >= 0); + for (auto i = first; i < firstNeg; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; +} + +template +__device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + int32_t first = threadIdx.x; + __shared__ uint32_t firstNeg; + firstNeg = a[ind[0]] < 0 ? 0 : size; + __syncthreads(); + + // find first negative + for (auto i = first; i < size - 1; i += blockDim.x) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + } + + __syncthreads(); + + int ii = size - firstNeg - threadIdx.x - 1; + for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) { + ind2[ii] = ind[i]; + ii -= blockDim.x; + } + __syncthreads(); + ii = size - firstNeg + threadIdx.x; + assert(ii >= 0); + for (auto i = first; i < firstNeg; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; +} + +template +__device__ void __forceinline__ +radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { + constexpr int d = 8, w = 8 * sizeof(T); + constexpr int sb = 1 << d; + constexpr int ps = int(sizeof(T)) - NS; + + __shared__ int32_t c[sb], ct[sb], cu[sb]; + + __shared__ int ibs; + __shared__ int p; + + assert(size > 0); + assert(blockDim.x >= sb); + + // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; + + p = ps; + + auto j = ind; + auto k = ind2; + + int32_t first = threadIdx.x; + for (auto i = first; i < size; i += blockDim.x) + j[i] = i; + __syncthreads(); + + while (__syncthreads_and(p < w / d)) { + if (threadIdx.x < sb) + c[threadIdx.x] = 0; + __syncthreads(); + + // fill bins + for (auto i = first; i < size; i += blockDim.x) { + auto bin = (a[j[i]] >> d * p) & (sb - 1); + atomicAdd(&c[bin], 1); + } + __syncthreads(); + + // prefix scan "optimized"???... 
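+    // Spelled out: this is an inclusive prefix scan of the sb = 256 bin
+    // counters c[], done in two steps. First each of the 8 warps performs a
+    // Hillis-Steele scan of its own 32 counters with __shfl_up_sync; then
+    // every thread adds the last (lane-31) partial sums ct[ss], ct[ss - 32],
+    // ... of all preceding warps, so that c[threadIdx.x] ends up holding
+    // c[0] + ... + c[threadIdx.x], i.e. the end offset of that bin for this
+    // pass of the radix sort.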
+ if (threadIdx.x < sb) { + auto x = c[threadIdx.x]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(0xffffffff, x, offset); + if (laneId >= offset) + x += y; + } + ct[threadIdx.x] = x; + } + __syncthreads(); + if (threadIdx.x < sb) { + auto ss = (threadIdx.x / 32) * 32 - 1; + c[threadIdx.x] = ct[threadIdx.x]; + for (int i = ss; i > 0; i -= 32) + c[threadIdx.x] += ct[i]; + } + /* + //prefix scan for the nulls (for documentation) + if (threadIdx.x==0) + for (int i = 1; i < sb; ++i) c[i] += c[i-1]; + */ + + // broadcast + ibs = size - 1; + __syncthreads(); + while (__syncthreads_and(ibs > 0)) { + int i = ibs - threadIdx.x; + if (threadIdx.x < sb) { + cu[threadIdx.x] = -1; + ct[threadIdx.x] = -1; + } + __syncthreads(); + int32_t bin = -1; + if (threadIdx.x < sb) { + if (i >= 0) { + bin = (a[j[i]] >> d * p) & (sb - 1); + ct[threadIdx.x] = bin; + atomicMax(&cu[bin], int(i)); + } + } + __syncthreads(); + if (threadIdx.x < sb) { + if (i >= 0 && i == cu[bin]) // ensure to keep them in order + for (int ii = threadIdx.x; ii < sb; ++ii) + if (ct[ii] == bin) { + auto oi = ii - threadIdx.x; + // assert(i>=oi);if(i>=oi) + k[--c[bin]] = j[i - oi]; + } + } + __syncthreads(); + if (bin >= 0) + assert(c[bin] >= 0); + if (threadIdx.x == 0) + ibs -= sb; + __syncthreads(); + } + + /* + // broadcast for the nulls (for documentation) + if (threadIdx.x==0) + for (int i=size-first-1; i>=0; i--) { // =blockDim.x) { + auto bin = (a[j[i]] >> d*p)&(sb-1); + auto ik = atomicSub(&c[bin],1); + k[ik-1] = j[i]; + } + */ + + __syncthreads(); + assert(c[0] == 0); + + // swap (local, ok) + auto t = j; + j = k; + k = t; + + if (threadIdx.x == 0) + ++p; + __syncthreads(); + } + + if ((w != 8) && (0 == (NS & 1))) + assert(j == ind); // w/d is even so ind is correct + + if (j != ind) // odd... + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; + + __syncthreads(); + + // now move negative first... (if signed) + reorder(a, ind, ind2, size); +} + +template ::value, T>::type* = nullptr> +__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(a, ind, ind2, size, dummyReorder); +} + +template ::value && std::is_signed::value, T>::type* = nullptr> +__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(a, ind, ind2, size, reorderSigned); +} + +template ::value, T>::type* = nullptr> +__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + using I = int; + radixSortImpl((I const*)(a), ind, ind2, size, reorderFloat); +} + +template +__device__ void __forceinline__ +radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + extern __shared__ uint16_t ws[]; + + auto a = v + offsets[blockIdx.x]; + auto ind = index + offsets[blockIdx.x]; + auto ind2 = nullptr == workspace ? 
ws : workspace + offsets[blockIdx.x]; + auto size = offsets[blockIdx.x + 1] - offsets[blockIdx.x]; + assert(offsets[blockIdx.x + 1] >= offsets[blockIdx.x]); + if (size > 0) + radixSort(a, ind, ind2, size); +} + +template +__global__ void __launch_bounds__(256, 4) + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); +} + +template +__global__ void +// __launch_bounds__(256, 4) +radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); +} + +#endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc new file mode 100644 index 0000000000000..1aad065963ff2 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc @@ -0,0 +1,15 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cudaCompat { + thread_local dim3 blockIdx; + thread_local dim3 gridDim; +} // namespace cudaCompat + +namespace { + struct InitGrid { + InitGrid() { cudaCompat::resetGrid(); } + }; + + const InitGrid initGrid; + +} // namespace diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu new file mode 100644 index 0000000000000..1c28758fb970e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu @@ -0,0 +1,65 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +__global__ void update(AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto m = i % 11; + m = m % 6 + 1; // max 6, no 0 + auto c = dc->add(m); + assert(c.m < n); + ind[c.m] = c.n; + for (int j = c.n; j < c.n + m; ++j) + cont[j] = i; +}; + +__global__ void finalize(AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { + assert(dc->get().m == n); + ind[n] = dc->get().n; +} + +__global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_t const *cont, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + assert(0 == ind[0]); + assert(dc->get().m == n); + assert(ind[n] == dc->get().n); + auto ib = ind[i]; + auto ie = ind[i + 1]; + auto k = cont[ib++]; + assert(k < n); + for (; ib < ie; ++ib) + assert(cont[ib] == k); +} + +#include +int main() { + AtomicPairCounter *dc_d; + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + + std::cout << "size " << sizeof(AtomicPairCounter) << std::endl; + + constexpr uint32_t N = 20000; + constexpr uint32_t M = N * 6; + uint32_t *n_d, *m_d; + cudaCheck(cudaMalloc(&n_d, N * sizeof(int))); + // cudaMemset(n_d, 0, N*sizeof(int)); + cudaCheck(cudaMalloc(&m_d, M * sizeof(int))); + + update<<<2000, 512>>>(dc_d, n_d, m_d, 10000); + finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000); + verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000); + + AtomicPairCounter dc; + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); + + std::cout << dc.get().n << ' ' << dc.get().m << std::endl; + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp 
b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp new file mode 100644 index 0000000000000..cc5541f58ad60 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -0,0 +1,146 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +template +void go() { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + + constexpr int N = 12000; + T v[N]; + + using Hist = HistoContainer; + using Hist4 = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' + << Hist4::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + for (auto nh = 0; nh < 4; ++nh) + std::cout << "bins " << int(Hist4::bin(0)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin)) + Hist4::histOff(nh) + << ' ' << int(Hist::bin(rmax)) + Hist4::histOff(nh) << std::endl; + + Hist h; + Hist4 h4; + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + h.zero(); + h4.zero(); + assert(h.size() == 0); + assert(h4.size() == 0); + for (long long j = 0; j < N; j++) { + h.count(v[j]); + if (j < 2000) + h4.count(v[j], 2); + else + h4.count(v[j], j % 4); + } + assert(h.size() == 0); + assert(h4.size() == 0); + h.finalize(); + h4.finalize(); + assert(h.size() == N); + assert(h4.size() == N); + for (long long j = 0; j < N; j++) { + h.fill(v[j], j); + if (j < 2000) + h4.fill(v[j], j, 2); + else + h4.fill(v[j], j, j % 4); + } + assert(h.off[0] == 0); + assert(h4.off[0] == 0); + assert(h.size() == N); + assert(h4.size() == N); + + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { + assert((int32_t)t1 < N); + assert((int32_t)t2 < N); + if (i != j && T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + if (0 == h.size(i)) + continue; + auto k = *h.begin(i); + assert(k < N); + auto kl = NBINS != 128 ? h.bin(std::max(rmin, v[k] - DELTA)) : h.bin(v[k] - T(DELTA)); + auto kh = NBINS != 128 ? 
h.bin(std::min(rmax, v[k] + DELTA)) : h.bin(v[k] + T(DELTA)); + if (NBINS == 128) { + assert(kl != i); + assert(kh != i); + } + if (NBINS != 128) { + assert(kl <= i); + assert(kh >= i); + } + // std::cout << kl << ' ' << kh << std::endl; + for (auto j = h.begin(kl); j < h.end(kl); ++j) + verify(i, kl, k, k, (*j)); + for (auto j = h.begin(kh); j < h.end(kh); ++j) + verify(i, kh, k, (*j), k); + } + } + + for (long long j = 0; j < N; j++) { + auto b0 = h.bin(v[j]); + int w = 0; + int tot = 0; + auto ftest = [&](int k) { + assert(k >= 0 && k < N); + tot++; + }; + forEachInBins(h, v[j], w, ftest); + int rtot = h.end(b0) - h.begin(b0); + assert(tot == rtot); + w = 1; + tot = 0; + forEachInBins(h, v[j], w, ftest); + int bp = b0 + 1; + int bm = b0 - 1; + if (bp < int(h.nbins())) + rtot += h.end(bp) - h.begin(bp); + if (bm >= 0) + rtot += h.end(bm) - h.begin(bm); + assert(tot == rtot); + w = 2; + tot = 0; + forEachInBins(h, v[j], w, ftest); + bp++; + bm--; + if (bp < int(h.nbins())) + rtot += h.end(bp) - h.begin(bp); + if (bm >= 0) + rtot += h.end(bm) - h.begin(bm); + assert(tot == rtot); + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu new file mode 100644 index 0000000000000..aa7eb245c350d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -0,0 +1,154 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +template +void go() { + std::mt19937 eng; + std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + + constexpr int N = 12000; + T v[N]; + auto v_d = cms::cuda::make_device_unique(N, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + constexpr uint32_t nParts = 10; + constexpr uint32_t partSize = N / nParts; + uint32_t offsets[nParts + 1]; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::capacity() << ' ' << Hist::wsSize() << ' ' + << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + + Hist h; + auto h_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(Hist::wsSize(), nullptr); + + auto off_d = cms::cuda::make_device_unique(nParts + 1, nullptr); + + for (int it = 0; it < 5; ++it) { + offsets[0] = 0; + for (uint32_t j = 1; j < nParts + 1; ++j) { + offsets[j] = offsets[j - 1] + partSize - 3 * j; + assert(offsets[j] <= N); + } + + if (it == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); + + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + + if (it == 2) { // big bin + for (long long j = 1000; j < 2000; j++) + v[j] = sizeof(T) == 1 ? 
22 : 3456; + } + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + cms::cuda::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); + assert(0 == h.off[0]); + assert(offsets[10] == h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + assert(t1 < N); + assert(t2 < N); + if (T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + + // make sure it spans 3 bins... + auto window = T(1300); + + for (uint32_t j = 0; j < nParts; ++j) { + auto off = Hist::histOff(j); + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + auto ii = i + off; + if (0 == h.size(ii)) + continue; + auto k = *h.begin(ii); + if (j % 2) + k = *(h.begin(ii) + (h.end(ii) - h.begin(ii)) / 2); + auto bk = h.bin(v[k]); + assert(bk == i); + assert(k < offsets[j + 1]); + auto kl = h.bin(v[k] - window); + auto kh = h.bin(v[k] + window); + assert(kl != i); + assert(kh != i); + // std::cout << kl << ' ' << kh << std::endl; + + auto me = v[k]; + auto tot = 0; + auto nm = 0; + bool l = true; + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nm += h.size(kk + off); + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) { + if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) { + } else { + ++tot; + } + } + if (kk == i) { + l = false; + continue; + } + if (l) + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, k, (*p)); + else + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, (*p), k); + } + if (!(tot >= nm)) { + std::cout << "too bad " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + } + if (l) + std::cout << "what? 
" << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + assert(!l); + } + } + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu new file mode 100644 index 0000000000000..020d69268a420 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +__global__ void mykernel(T const* __restrict__ v, uint32_t N) { + assert(v); + assert(N == 12000); + + if (threadIdx.x == 0) + printf("start kernel for %d data\n", N); + + using Hist = HistoContainer; + + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[32]; + + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + for (auto j = threadIdx.x; j < N; j += blockDim.x) + hist.count(v[j]); + __syncthreads(); + + assert(0 == hist.size()); + __syncthreads(); + + hist.finalize(ws); + __syncthreads(); + + assert(N == hist.size()); + for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) + assert(hist.off[j] <= hist.off[j + 1]); + __syncthreads(); + + if (threadIdx.x < 32) + ws[threadIdx.x] = 0; // used by prefix scan... + __syncthreads(); + + for (auto j = threadIdx.x; j < N; j += blockDim.x) + hist.fill(v[j], j); + __syncthreads(); + assert(0 == hist.off[0]); + assert(N == hist.size()); + + for (auto j = threadIdx.x; j < hist.size() - 1; j += blockDim.x) { + auto p = hist.begin() + j; + assert((*p) < N); + auto k1 = Hist::bin(v[*p]); + auto k2 = Hist::bin(v[*(p + 1)]); + assert(k2 >= k1); + } + + for (auto i = threadIdx.x; i < hist.size(); i += blockDim.x) { + auto p = hist.begin() + i; + auto j = *p; + auto b0 = Hist::bin(v[j]); + int tot = 0; + auto ftest = [&](int k) { + assert(k >= 0 && k < N); + ++tot; + }; + forEachInWindow(hist, v[j], v[j], ftest); + int rtot = hist.size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + forEachInWindow(hist, vm, vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); + assert(tot == rtot); + } +} + +template +void go() { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + + constexpr int N = 12000; + T v[N]; + + auto v_d = cms::cuda::make_device_unique(N, nullptr); + assert(v_d.get()); + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' + << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + + assert(v_d.get()); + assert(v); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + assert(v_d.get()); + cms::cuda::launch(mykernel, {1, 256}, v_d.get(), N); + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp new file mode 100644 index 0000000000000..3d452e851ba12 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp @@ -0,0 +1 @@ +#include "OneToManyAssoc_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu new file mode 100644 index 0000000000000..3d452e851ba12 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu @@ -0,0 +1 @@ +#include "OneToManyAssoc_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h new file mode 100644 index 0000000000000..92bb359115a02 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -0,0 +1,308 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" + +#endif + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +constexpr uint32_t MaxElem = 64000; +constexpr uint32_t MaxTk = 8000; +constexpr uint32_t MaxAssocs = 4 * MaxTk; +using Assoc = OneToManyAssoc; + +using SmallAssoc = OneToManyAssoc; + +using Multiplicity = OneToManyAssoc; + +using TK = std::array; + +__global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) { + __shared__ Multiplicity::CountersOnly local; + if (threadIdx.x == 0) + local.zero(); + __syncthreads(); + local.countDirect(2 + i % 4); + __syncthreads(); + if (threadIdx.x == 0) + assoc->add(local); + } +} + +__global__ void countMulti(TK const* __restrict__ tk, 
Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) + assoc->countDirect(2 + i % 4); +} + +__global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) { + auto first = blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < Multiplicity::totbins(); i += gridDim.x * blockDim.x) + assert(m1->off[i] == m2->off[i]); +} + +__global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->countDirect(tk[k][j]); + } +} + +__global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->fillDirect(tk[k][j], k); + } +} + +__global__ void verify(Assoc* __restrict__ assoc) { assert(assoc->size() < Assoc::capacity()); } + +template +__global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int k = first; k < n; k += gridDim.x * blockDim.x) { + auto m = tk[k][3] < MaxElem ? 4 : 3; + assoc->bulkFill(*apc, &tk[k][0], m); + } +} + +template +__global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) { + if (apc->get().m >= Assoc::nbins()) + printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); + assert(assoc->size() < Assoc::capacity()); +} + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); + auto current_device = cms::cuda::currentDevice(); +#else + // make sure cuda emulation is working + std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl; + std::cout << "cuda y's " << threadIdx.y << ' ' << blockIdx.y << ' ' << blockDim.y << ' ' << gridDim.y << std::endl; + std::cout << "cuda z's " << threadIdx.z << ' ' << blockIdx.z << ' ' << blockDim.z << ' ' << gridDim.z << std::endl; + assert(threadIdx.x == 0); + assert(threadIdx.y == 0); + assert(threadIdx.z == 0); + assert(blockIdx.x == 0); + assert(blockIdx.y == 0); + assert(blockIdx.z == 0); + assert(blockDim.x == 1); + assert(blockDim.y == 1); + assert(blockDim.z == 1); + assert(gridDim.x == 1); + assert(gridDim.y == 1); + assert(gridDim.z == 1); +#endif + + std::cout << "OneToManyAssoc " << Assoc::nbins() << ' ' << Assoc::capacity() << ' ' << Assoc::wsSize() << std::endl; + std::cout << "OneToManyAssoc (small) " << SmallAssoc::nbins() << ' ' << SmallAssoc::capacity() << ' ' + << SmallAssoc::wsSize() << std::endl; + + std::mt19937 eng; + + std::geometric_distribution rdm(0.8); + + constexpr uint32_t N = 4000; + + std::vector> tr(N); + + // fill with "index" to element + long long ave = 0; + int imax = 0; + auto n = 0U; + auto z = 0U; + auto nz = 0U; + for (auto i = 0U; i < 4U; ++i) { + auto j = 0U; + while (j < N && n < MaxElem) { + if (z == 11) { + ++n; + z = 0; + ++nz; + continue; + } // a bit of not assoc + auto x = rdm(eng); + auto k = std::min(j + x + 1, N); + if (i == 3 && z == 3) { // some triplets time to time + for (; j < k; ++j) + tr[j][i] = MaxElem + 1; + } else { + ave += 
x + 1; + imax = std::max(imax, x); + for (; j < k; ++j) + tr[j][i] = n; + ++n; + } + ++z; + } + assert(n <= MaxElem); + assert(j <= N); + } + std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + +#ifdef __CUDACC__ + auto v_d = cms::cuda::make_device_unique[]>(N, nullptr); + assert(v_d.get()); + auto a_d = cms::cuda::make_device_unique(1, nullptr); + auto sa_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(Assoc::wsSize(), nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); +#else + auto a_d = std::make_unique(); + auto sa_d = std::make_unique(); + auto v_d = tr.data(); +#endif + + cms::cuda::launchZero(a_d.get(), 0); + +#ifdef __CUDACC__ + auto nThreads = 256; + auto nBlocks = (4 * N + nThreads - 1) / nThreads; + + count<<>>(v_d.get(), a_d.get(), N); + + cms::cuda::launchFinalize(a_d.get(), ws_d.get(), 0); + verify<<<1, 1>>>(a_d.get()); + fill<<>>(v_d.get(), a_d.get(), N); +#else + count(v_d, a_d.get(), N); + cms::cuda::launchFinalize(a_d.get()); + verify(a_d.get()); + fill(v_d, a_d.get(), N); +#endif + + Assoc la; + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); +#else + memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier +#endif + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + z = 0; + for (auto i = 0U; i < n; ++i) { + auto x = la.size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la.size(n)); + std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; + + // now the inverse map (actually this is the direct....) 
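+  // What fillBulk exercises below, in a nutshell: for each track k it calls
+  //   assoc->bulkFill(*apc, &tk[k][0], m);
+  // which reserves one bin plus m content slots in one operation on the
+  // AtomicPairCounter (bin index in .m, first content slot in .n), so every
+  // track's element list lands in a contiguous range without a separate
+  // counting pass; finalizeBulk afterwards fixes up the trailing offsets.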
+ AtomicPairCounter* dc_d; + AtomicPairCounter dc(0); + +#ifdef __CUDACC__ + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + nBlocks = (N + nThreads - 1) / nThreads; + fillBulk<<>>(dc_d, v_d.get(), a_d.get(), N); + cms::cuda::finalizeBulk<<>>(dc_d, a_d.get()); + verifyBulk<<<1, 1>>>(a_d.get(), dc_d); + + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); + + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); + cms::cuda::finalizeBulk<<>>(dc_d, sa_d.get()); + verifyBulk<<<1, 1>>>(sa_d.get(), dc_d); + +#else + dc_d = &dc; + fillBulk(dc_d, v_d, a_d.get(), N); + cms::cuda::finalizeBulk(dc_d, a_d.get()); + verifyBulk(a_d.get(), dc_d); + memcpy(&la, a_d.get(), sizeof(Assoc)); + + AtomicPairCounter sdc(0); + fillBulk(&sdc, v_d, sa_d.get(), N); + cms::cuda::finalizeBulk(&sdc, sa_d.get()); + verifyBulk(sa_d.get(), &sdc); + +#endif + + std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl; + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + for (auto i = 0U; i < N; ++i) { + auto x = la.size(i); + if (!(x == 4 || x == 3)) + std::cout << i << ' ' << x << std::endl; + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la.size(N)); + std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; + + // here verify use of block local counters +#ifdef __CUDACC__ + auto m1_d = cms::cuda::make_device_unique(1, nullptr); + auto m2_d = cms::cuda::make_device_unique(1, nullptr); +#else + auto m1_d = std::make_unique(); + auto m2_d = std::make_unique(); +#endif + cms::cuda::launchZero(m1_d.get(), 0); + cms::cuda::launchZero(m2_d.get(), 0); + +#ifdef __CUDACC__ + nBlocks = (4 * N + nThreads - 1) / nThreads; + countMulti<<>>(v_d.get(), m1_d.get(), N); + countMultiLocal<<>>(v_d.get(), m2_d.get(), N); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + cms::cuda::launchFinalize(m1_d.get(), ws_d.get(), 0); + cms::cuda::launchFinalize(m2_d.get(), ws_d.get(), 0); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); +#else + countMulti(v_d, m1_d.get(), N); + countMultiLocal(v_d, m2_d.get(), N); + verifyMulti(m1_d.get(), m2_d.get()); + + cms::cuda::launchFinalize(m1_d.get()); + cms::cuda::launchFinalize(m2_d.get()); + verifyMulti(m1_d.get(), m2_d.get()); +#endif + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp new file mode 100644 index 0000000000000..f453041b85b38 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp @@ -0,0 +1,34 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include +#include +#include +#include + +void testBinaryFind() { + std::vector data = {1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6}; + + auto lower = cuda_std::lower_bound(data.begin(), data.end(), 4); + auto upper = cuda_std::upper_bound(data.begin(), data.end(), 4); + + std::copy(lower, upper, std::ostream_iterator(std::cout, " ")); + + std::cout << '\n'; + + // classic binary search, returning a value only if it is present + + data = {1, 2, 4, 6, 9, 10}; + + auto test = [&](auto v) { + auto it = cuda_std::binary_find(data.cbegin(), 
data.cend(), v); + + if (it != data.cend()) + std::cout << *it << " found at index " << std::distance(data.cbegin(), it) << std::endl; + else + std::cout << v << " non found" << std::endl; + }; + + test(4); + test(5); +} + +int main() { testBinaryFind(); } diff --git a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu new file mode 100644 index 0000000000000..7b8ed6219392e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu @@ -0,0 +1,30 @@ +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +__global__ void testBinaryFind() { + int data[] = {1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6}; + + auto lower = cuda_std::lower_bound(data, data + 13, 4); + auto upper = cuda_std::upper_bound(data, data + 12, 4); + + assert(3 == upper - lower); + + // classic binary search, returning a value only if it is present + + constexpr int data2[] = {1, 2, 4, 6, 9, 10}; + + assert(data2 + 2 == cuda_std::binary_find(data2, data2 + 6, 4)); + assert(data2 + 6 == cuda_std::binary_find(data2, data2 + 6, 5)); +} + +void wrapper() { cms::cuda::launch(testBinaryFind, {32, 64}); } + +int main() { + cms::cudatest::requireDevices(); + + wrapper(); +} diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp new file mode 100644 index 0000000000000..c5200e2a87bee --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp @@ -0,0 +1 @@ +#include "eigenSoA_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu new file mode 100644 index 0000000000000..c5200e2a87bee --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu @@ -0,0 +1 @@ +#include "eigenSoA_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h new file mode 100644 index 0000000000000..e9fef92e68083 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h @@ -0,0 +1,101 @@ +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" + +template +struct MySoA { + // we can find a way to avoid this copy/paste??? + static constexpr int32_t stride() { return S; } + + eigenSoA::ScalarSoA a; + eigenSoA::ScalarSoA b; +}; + +using V = MySoA<128>; + +__global__ void testBasicSoA(float* p) { + using namespace eigenSoA; + + assert(!isPowerOf2(0)); + assert(isPowerOf2(1)); + assert(isPowerOf2(1024)); + assert(!isPowerOf2(1026)); + + using M3 = Eigen::Matrix; + ; + __shared__ eigenSoA::MatrixSoA m; + + int first = threadIdx.x + blockIdx.x * blockDim.x; + if (0 == first) + printf("before %f\n", p[0]); + + // a silly game... 
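+  // (the game, spelled out: thread i writes p[i], p[i + 64], p[i + 128] on
+  // the diagonal of m[i], inverts the diagonal matrix in place, then reads
+  // the diagonal back from the mirrored entry m[63 - i]; inverting a
+  // diagonal matrix just reciprocates the diagonal, and every input lies in
+  // (0.01, 0.99), so each output is > 1 -- exactly what main() asserts
+  // after the round trip)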
+ int n = 64; + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + m[i].setZero(); + m[i](0, 0) = p[i]; + m[i](1, 1) = p[i + 64]; + m[i](2, 2) = p[i + 64 * 2]; + } + __syncthreads(); // not needed + + for (int i = first; i < n; i += blockDim.x * gridDim.x) + m[i] = m[i].inverse().eval(); + __syncthreads(); + + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + p[i] = m[63 - i](0, 0); + p[i + 64] = m[63 - i](1, 1); + p[i + 64 * 2] = m[63 - i](2, 2); + } + + if (0 == first) + printf("after %f\n", p[0]); +} + +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#endif + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); +#endif + + float p[1024]; + + std::uniform_real_distribution rgen(0.01, 0.99); + std::mt19937 eng; + + for (auto& r : p) + r = rgen(eng); + for (int i = 0, n = 64 * 3; i < n; ++i) + assert(p[i] > 0 && p[i] < 1.); + + std::cout << p[0] << std::endl; +#ifdef __CUDACC__ + float* p_d; + cudaCheck(cudaMalloc(&p_d, 1024 * 4)); + cudaCheck(cudaMemcpy(p_d, p, 1024 * 4, cudaMemcpyDefault)); + testBasicSoA<<<1, 1024>>>(p_d); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(p, p_d, 1024 * 4, cudaMemcpyDefault)); + cudaCheck(cudaDeviceSynchronize()); +#else + testBasicSoA(p); +#endif + + std::cout << p[0] << std::endl; + + for (int i = 0, n = 64 * 3; i < n; ++i) + assert(p[i] > 1.); + + std::cout << "END" << std::endl; + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu new file mode 100644 index 0000000000000..8bbaf4d066abc --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -0,0 +1,165 @@ +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +template +struct format_traits { +public: + static const constexpr char* failed_msg = "failed %d %d %d: %d %d\n"; +}; + +template<> +struct format_traits { +public: + static const constexpr char* failed_msg = "failed %d %d %d: %f %f\n"; +}; + +template +__global__ void testPrefixScan(uint32_t size) { + __shared__ T ws[32]; + __shared__ T c[1024]; + __shared__ T co[1024]; + + auto first = threadIdx.x; + for (auto i = first; i < size; i += blockDim.x) + c[i] = 1; + __syncthreads(); + + blockPrefixScan(c, co, size, ws); + blockPrefixScan(c, size, ws); + + assert(1 == c[0]); + assert(1 == co[0]); + for (auto i = first + 1; i < size; i += blockDim.x) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDim.x, c[i], c[i - 1]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); + } +} + +template +__global__ void testWarpPrefixScan(uint32_t size) { + assert(size <= 32); + __shared__ T c[1024]; + __shared__ T co[1024]; + auto i = threadIdx.x; + c[i] = 1; + __syncthreads(); + + warpPrefixScan(c, co, i, 0xffffffff); + warpPrefixScan(c, i, 0xffffffff); + __syncthreads(); + + assert(1 == c[0]); + assert(1 == co[0]); + if (i != 0) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDim.x, c[i], c[i - 1]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); + } +} + +__global__ void init(uint32_t *v, uint32_t val, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < 
n) + v[i] = val; + if (i == 0) + printf("init\n"); +} + +__global__ void verify(uint32_t const *v, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + assert(v[i] == i + 1); + if (i == 0) + printf("verify\n"); +} + +int main() { + cms::cudatest::requireDevices(); + + std::cout << "warp level" << std::endl; + // std::cout << "warp 32" << std::endl; + testWarpPrefixScan<<<1, 32>>>(32); + cudaDeviceSynchronize(); + // std::cout << "warp 16" << std::endl; + testWarpPrefixScan<<<1, 32>>>(16); + cudaDeviceSynchronize(); + // std::cout << "warp 5" << std::endl; + testWarpPrefixScan<<<1, 32>>>(5); + cudaDeviceSynchronize(); + + std::cout << "block level" << std::endl; + for (int bs = 32; bs <= 1024; bs += 32) { + // std::cout << "bs " << bs << std::endl; + for (int j = 1; j <= 1024; ++j) { + // std::cout << j << std::endl; + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + } + } + cudaDeviceSynchronize(); + + int num_items = 200; + for (int ksize = 1; ksize < 4; ++ksize) { + // test multiblock + std::cout << "multiblok" << std::endl; + // Declare, allocate, and initialize device-accessible pointers for input and output + num_items *= 10; + uint32_t *d_in; + uint32_t *d_out1; + uint32_t *d_out2; + + cudaCheck(cudaMalloc(&d_in, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out1, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out2, num_items * sizeof(uint32_t))); + + auto nthreads = 256; + auto nblocks = (num_items + nthreads - 1) / nthreads; + + init<<>>(d_in, 1, num_items); + + // the block counter + int32_t *d_pc; + cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t))); + cudaCheck(cudaMemset(d_pc, 0, 4)); + + nthreads = 1024; + nblocks = (num_items + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>(d_in, d_out1, num_items, d_pc); + verify<<>>(d_out1, num_items); + cudaDeviceSynchronize(); + + // test cub + std::cout << "cub" << std::endl; + // Determine temporary device storage requirements for inclusive prefix sum + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items); + + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + // Allocate temporary storage for inclusive prefix sum + // fake larger ws already available + temp_storage_bytes *= 8; + cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + // Run inclusive prefix sum + CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items)); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + verify<<>>(d_out2, num_items); + cudaDeviceSynchronize(); + } // ksize + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu new file mode 100644 index 0000000000000..febdb9c92b0a7 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -0,0 +1,202 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" + +template +struct RS 
{ + using type = std::uniform_int_distribution; + static auto ud() { return type(std::numeric_limits::min(), std::numeric_limits::max()); } + static constexpr T imax = std::numeric_limits::max(); +}; + +template <> +struct RS { + using T = float; + using type = std::uniform_real_distribution; + static auto ud() { return type(-std::numeric_limits::max() / 2, std::numeric_limits::max() / 2); } + // static auto ud() { return type(0,std::numeric_limits::max()/2);} + static constexpr int imax = std::numeric_limits::max(); +}; + +template +void go(bool useShared) { + std::mt19937 eng; + //std::mt19937 eng2; + auto rgen = RS::ud(); + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + + constexpr int blocks = 10; + constexpr int blockSize = 256 * 32; + constexpr int N = blockSize * blocks; + T v[N]; + uint16_t ind[N]; + + constexpr bool sgn = T(-1) < T(0); + std::cout << "Will sort " << N << (sgn ? " signed" : " unsigned") + << (std::numeric_limits::is_integer ? " 'ints'" : " 'float'") << " of size " << sizeof(T) << " using " + << NS << " significant bytes" << std::endl; + + for (int i = 0; i < 50; ++i) { + if (i == 49) { + for (long long j = 0; j < N; j++) + v[j] = 0; + } else if (i > 30) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + } else { + uint64_t imax = (i < 15) ? uint64_t(RS::imax) + 1LL : 255; + for (uint64_t j = 0; j < N; j++) { + v[j] = (j % imax); + if (j % 2 && i % 2) + v[j] = -v[j]; + } + } + + uint32_t offsets[blocks + 1]; + offsets[0] = 0; + for (int j = 1; j < blocks + 1; ++j) { + offsets[j] = offsets[j - 1] + blockSize - 3 * j; + assert(offsets[j] <= N); + } + + if (i == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + std::random_shuffle(v, v + N); + + auto v_d = cms::cuda::make_device_unique(N, nullptr); + auto ind_d = cms::cuda::make_device_unique(N, nullptr); + auto ws_d = cms::cuda::make_device_unique(N, nullptr); + auto off_d = cms::cuda::make_device_unique(blocks + 1, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); + + if (i < 2) + std::cout << "lauch for " << offsets[blocks] << std::endl; + + auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 
256 : 256; + + delta -= (std::chrono::high_resolution_clock::now() - start); + constexpr int MaxSize = 256 * 32; + if (useShared) + cms::cuda::launch( + radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + else + cms::cuda::launch( + radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); + + if (i == 0) + std::cout << "done for " << offsets[blocks] << std::endl; + + cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); + + delta += (std::chrono::high_resolution_clock::now() - start); + + if (i == 0) + std::cout << "done for " << offsets[blocks] << std::endl; + + if (32 == i) { + std::cout << LL(v[ind[0]]) << ' ' << LL(v[ind[1]]) << ' ' << LL(v[ind[2]]) << std::endl; + std::cout << LL(v[ind[3]]) << ' ' << LL(v[ind[10]]) << ' ' << LL(v[ind[blockSize - 1000]]) << std::endl; + std::cout << LL(v[ind[blockSize / 2 - 1]]) << ' ' << LL(v[ind[blockSize / 2]]) << ' ' + << LL(v[ind[blockSize / 2 + 1]]) << std::endl; + } + for (int ib = 0; ib < blocks; ++ib) { + std::set inds; + if (offsets[ib + 1] > offsets[ib]) + inds.insert(ind[offsets[ib]]); + for (auto j = offsets[ib] + 1; j < offsets[ib + 1]; j++) { + inds.insert(ind[j]); + auto a = v + offsets[ib]; + auto k1 = a[ind[j]]; + auto k2 = a[ind[j - 1]]; + auto sh = sizeof(uint64_t) - NS; + sh *= 8; + auto shorten = [sh](T& t) { + auto k = (uint64_t*)(&t); + *k = (*k >> sh) << sh; + }; + shorten(k1); + shorten(k2); + if (k1 < k2) + std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl; + } + if (!inds.empty()) { + assert(0 == *inds.begin()); + assert(inds.size() - 1 == *inds.rbegin()); + } + if (inds.size() != (offsets[ib + 1] - offsets[ib])) + std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets[ib + 1] - offsets[ib]) + << std::endl; + assert(inds.size() == (offsets[ib + 1] - offsets[ib])); + } + } // 50 times + std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() / 50. 
+ << " ms" << std::endl; +} + +int main() { + cms::cudatest::requireDevices(); + + bool useShared = false; + + std::cout << "using Global memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + useShared = true; + + std::cout << "using Shared memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + return 0; +} From 9b10f923405831759fcca6110bebbbbc3763f144 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 5 Mar 2020 12:38:25 +0100 Subject: [PATCH 02/15] CUDA does not support C++17 yet, so we define here some of the missing library functions --- .../CUDAUtilities/interface/cuda_cxx17.h | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h b/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h new file mode 100644 index 0000000000000..89f131edd941e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h @@ -0,0 +1,63 @@ +#ifndef HeterogeneousCore_CUDAUtilities_cuda_cxx17_h +#define HeterogeneousCore_CUDAUtilities_cuda_cxx17_h + +#include + +// CUDA does not support C++17 yet, so we define here some of the missing library functions +#if __cplusplus <= 201402L + +namespace std { + + // from https://en.cppreference.com/w/cpp/iterator/size + template + constexpr auto size(const C& c) -> decltype(c.size()) { + return c.size(); + } + + template + constexpr std::size_t size(const T (&array)[N]) noexcept { + return N; + } + + // from https://en.cppreference.com/w/cpp/iterator/empty + template + constexpr auto empty(const C& c) -> decltype(c.empty()) { + return c.empty(); + } + + template + constexpr bool empty(const T (&array)[N]) noexcept { + return false; + } + + template + constexpr bool empty(std::initializer_list il) noexcept { + return il.size() == 0; + } + + // from https://en.cppreference.com/w/cpp/iterator/data + template + constexpr auto data(C& c) -> decltype(c.data()) { + return c.data(); + } + + template + constexpr auto data(const C& c) -> decltype(c.data()) { + return c.data(); + } + + template + constexpr T* data(T (&array)[N]) noexcept { + return array; + } + + template + constexpr const E* data(std::initializer_list il) noexcept { + return il.begin(); + } + +} // namespace std + +#endif + +#endif // HeterogeneousCore_CUDAUtilities_cuda_cxx17_h From e4ead5ae0908f5fa3a736677e8223639b07d6e8d Mon Sep 17 00:00:00 2001 From: Felice Date: Thu, 5 Mar 2020 12:43:11 +0100 Subject: [PATCH 03/15] Implement vector-like classes for use on GPUs --- .../CUDAUtilities/interface/GPUSimpleVector.h | 138 ++++++++++++++++++ .../CUDAUtilities/interface/GPUVecArray.h | 104 +++++++++++++ .../test/test_GPUSimpleVector.cu | 83 +++++++++++ 3 files changed, 325 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h create mode 100644 HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h new file mode 100644 index 0000000000000..0a8df34b5125e --- /dev/null +++ 
b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h @@ -0,0 +1,138 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h +#define HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace GPU { + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + __device__ inline T &back() { return m_data[m_size - 1]; } + + __device__ inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + // thread safe version of resize + __device__ int extend(int size = 1) { + auto previousSize = atomicAdd(&m_size, size); + if (previousSize < m_capacity) { + return previousSize; + } else { + atomicSub(&m_size, size); + return -1; + } + } + + __device__ int shrink(int size = 1) { + auto previousSize = atomicSub(&m_size, size); + if (previousSize >= size) { + return previousSize - size; + } else { + atomicAdd(&m_size, size); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + +} // namespace GPU + +#endif // 
HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h new file mode 100644 index 0000000000000..d9ae73d6d4002 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h @@ -0,0 +1,104 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h +#define HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace GPU { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + +} // namespace GPU + +#endif // HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h diff --git a/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu b/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu new file mode 100644 index 0000000000000..2811e3b34598e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu @@ -0,0 +1,83 @@ +// author: Felice Pantaleo, CERN, 2018 +#include +#include +#include + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include 
"HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +__global__ void vector_pushback(GPU::SimpleVector *foo) { + auto index = threadIdx.x + blockIdx.x * blockDim.x; + foo->push_back(index); +} + +__global__ void vector_reset(GPU::SimpleVector *foo) { foo->reset(); } + +__global__ void vector_emplace_back(GPU::SimpleVector *foo) { + auto index = threadIdx.x + blockIdx.x * blockDim.x; + foo->emplace_back(index); +} + +int main() { + cms::cudatest::requireDevices(); + + auto maxN = 10000; + GPU::SimpleVector *obj_ptr = nullptr; + GPU::SimpleVector *d_obj_ptr = nullptr; + GPU::SimpleVector *tmp_obj_ptr = nullptr; + int *data_ptr = nullptr; + int *d_data_ptr = nullptr; + + cudaCheck(cudaMallocHost(&obj_ptr, sizeof(GPU::SimpleVector))); + cudaCheck(cudaMallocHost(&data_ptr, maxN * sizeof(int))); + cudaCheck(cudaMalloc(&d_data_ptr, maxN * sizeof(int))); + + auto v = GPU::make_SimpleVector(obj_ptr, maxN, data_ptr); + + cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(GPU::SimpleVector))); + GPU::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr); + assert(tmp_obj_ptr->size() == 0); + assert(tmp_obj_ptr->capacity() == static_cast(maxN)); + + cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(GPU::SimpleVector))); + // ... and copy the object to the device. + cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + + int numBlocks = 5; + int numThreadsPerBlock = 256; + vector_pushback<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN)); + vector_reset<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == 0); + + vector_emplace_back<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? 
numBlocks * numThreadsPerBlock : maxN)); + + cudaCheck(cudaMemcpy(data_ptr, d_data_ptr, obj_ptr->size() * sizeof(int), cudaMemcpyDefault)); + cudaCheck(cudaFreeHost(obj_ptr)); + cudaCheck(cudaFreeHost(data_ptr)); + cudaCheck(cudaFreeHost(tmp_obj_ptr)); + cudaCheck(cudaFree(d_data_ptr)); + cudaCheck(cudaFree(d_obj_ptr)); + std::cout << "TEST PASSED" << std::endl; + return 0; +} From 020fdd1db6d6dd7c104be60ff28bc8ec6778fa24 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 5 Mar 2020 12:43:29 +0100 Subject: [PATCH 04/15] Update BuildFile --- .../CUDAUtilities/test/BuildFile.xml | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index ae6835126f9cf..60961d42999d4 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -8,7 +8,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 35ffe1f9b33e9948d889e5f10ca000f341891793 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:07:31 +0100 Subject: [PATCH 05/15] Fix AtomicPairCounter unit test --- HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu index 1c28758fb970e..92806c91337d8 100644 --- a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu @@ -1,7 +1,7 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" - +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" __global__ void update(AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { auto i = blockIdx.x * blockDim.x + threadIdx.x; @@ -39,6 +39,8 @@ __global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_ #include int main() { + cms::cudatest::requireDevices(); + AtomicPairCounter *dc_d; cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); From bfc0df7596871073706cafcb5279154c788983e8 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:21:29 +0100 Subject: [PATCH 06/15] Rename the cudaCompat namespace to cms::cudacompat --- .../CUDAUtilities/interface/cudaCompat.h | 124 +++++++++--------- .../CUDAUtilities/src/cudaCompat.cc | 12 +- 2 files changed, 70 insertions(+), 66 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 32c7d7b481a50..593821fe805ed 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -13,69 +13,71 @@ #include -namespace cudaCompat { +namespace cms { + namespace cudacompat { #ifndef __CUDA_RUNTIME_H__ - struct dim3 { - uint32_t x, y, z; - }; + struct dim3 { + uint32_t x, y, z; + }; #endif - const dim3 threadIdx = {0, 0, 0}; - const dim3 blockDim = {1, 1, 1}; - - extern thread_local dim3 blockIdx; - extern thread_local dim3 gridDim; - - template - T1 atomicInc(T1* a, T2 b) { - auto 
ret = *a; - if ((*a) < T1(b)) - (*a)++; - return ret; - } - - template - T1 atomicAdd(T1* a, T2 b) { - auto ret = *a; - (*a) += b; - return ret; - } - - template - T1 atomicSub(T1* a, T2 b) { - auto ret = *a; - (*a) -= b; - return ret; - } - - template - T1 atomicMin(T1* a, T2 b) { - auto ret = *a; - *a = std::min(*a, T1(b)); - return ret; - } - template - T1 atomicMax(T1* a, T2 b) { - auto ret = *a; - *a = std::max(*a, T1(b)); - return ret; - } - - inline void __syncthreads() {} - inline void __threadfence() {} - inline bool __syncthreads_or(bool x) { return x; } - inline bool __syncthreads_and(bool x) { return x; } - template - inline T __ldg(T const* x) { - return *x; - } - - inline void resetGrid() { - blockIdx = {0, 0, 0}; - gridDim = {1, 1, 1}; - } - -} // namespace cudaCompat + const dim3 threadIdx = {0, 0, 0}; + const dim3 blockDim = {1, 1, 1}; + + extern thread_local dim3 blockIdx; + extern thread_local dim3 gridDim; + + template + T1 atomicInc(T1* a, T2 b) { + auto ret = *a; + if ((*a) < T1(b)) + (*a)++; + return ret; + } + + template + T1 atomicAdd(T1* a, T2 b) { + auto ret = *a; + (*a) += b; + return ret; + } + + template + T1 atomicSub(T1* a, T2 b) { + auto ret = *a; + (*a) -= b; + return ret; + } + + template + T1 atomicMin(T1* a, T2 b) { + auto ret = *a; + *a = std::min(*a, T1(b)); + return ret; + } + template + T1 atomicMax(T1* a, T2 b) { + auto ret = *a; + *a = std::max(*a, T1(b)); + return ret; + } + + inline void __syncthreads() {} + inline void __threadfence() {} + inline bool __syncthreads_or(bool x) { return x; } + inline bool __syncthreads_and(bool x) { return x; } + template + inline T __ldg(T const* x) { + return *x; + } + + inline void resetGrid() { + blockIdx = {0, 0, 0}; + gridDim = {1, 1, 1}; + } + + } // namespace cudacompat +} // namespace cms // some not needed as done by cuda runtime... 
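+
+// The net effect: "kernel" code written against threadIdx, blockIdx and the
+// atomic* shims above can also be compiled for the host and run serially, one
+// thread per block, stepping blockIdx by hand (for host compilation the
+// namespace is pulled in at the bottom of this header, so no qualification is
+// needed). A minimal, illustrative host-only sketch, not part of the header
+// itself:
+//
+//   void kernel(int* out) {           // would be __global__ when built by nvcc
+//     atomicAdd(&out[blockIdx.x], 1);
+//   }
+//
+//   int out[4] = {0, 0, 0, 0};
+//   gridDim = {4, 1, 1};              // emulate a 4-block, 1-thread launch
+//   for (uint32_t b = 0; b < gridDim.x; ++b) {
+//     blockIdx = {b, 0, 0};
+//     kernel(out);
+//   }
+//   resetGrid();                      // restore the default 1x1x1 grid
+//   // out is now {1, 1, 1, 1}
+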
#ifndef __CUDA_RUNTIME_H__ @@ -95,7 +97,7 @@ namespace cudaCompat { #endif #ifndef __CUDA_ARCH__ -using namespace cudaCompat; +using namespace cms::cudacompat; #endif #endif diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc index 1aad065963ff2..7b8efda8e3811 100644 --- a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc +++ b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc @@ -1,13 +1,15 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -namespace cudaCompat { - thread_local dim3 blockIdx; - thread_local dim3 gridDim; -} // namespace cudaCompat +namespace cms { + namespace cudacompat { + thread_local dim3 blockIdx; + thread_local dim3 gridDim; + } // namespace cudacompat +} // namespace cms namespace { struct InitGrid { - InitGrid() { cudaCompat::resetGrid(); } + InitGrid() { cms::cudacompat::resetGrid(); } }; const InitGrid initGrid; From 85a0b47749426138065b147c044e9c0268561b67 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:41:53 +0100 Subject: [PATCH 07/15] Remove extra semicolon --- HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h index e9fef92e68083..b19cfc55f919d 100644 --- a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h @@ -22,7 +22,7 @@ __global__ void testBasicSoA(float* p) { assert(!isPowerOf2(1026)); using M3 = Eigen::Matrix; - ; + __shared__ eigenSoA::MatrixSoA m; int first = threadIdx.x + blockIdx.x * blockDim.x; From 591a0c305d490968b2ccb2f23c9bfefc02f5e0da Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:35:58 +0100 Subject: [PATCH 08/15] Move SimpleVector and VecArray to the cms::cuda namespace --- .../CUDAUtilities/interface/GPUSimpleVector.h | 138 ----------------- .../CUDAUtilities/interface/GPUVecArray.h | 104 ------------- .../CUDAUtilities/interface/SimpleVector.h | 141 ++++++++++++++++++ .../CUDAUtilities/interface/VecArray.h | 106 +++++++++++++ .../CUDAUtilities/test/BuildFile.xml | 2 +- ...PUSimpleVector.cu => test_SimpleVector.cu} | 32 ++-- 6 files changed, 264 insertions(+), 259 deletions(-) delete mode 100644 HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h delete mode 100644 HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/VecArray.h rename HeterogeneousCore/CUDAUtilities/test/{test_GPUSimpleVector.cu => test_SimpleVector.cu} (61%) diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h deleted file mode 100644 index 0a8df34b5125e..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h -#define HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h - -// author: Felice Pantaleo, CERN, 2018 - -#include -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -namespace GPU { - template - struct SimpleVector { - constexpr SimpleVector() = default; - - // ownership of m_data stays within the caller - constexpr void construct(int capacity, T *data) { - m_size = 0; - m_capacity = capacity; - m_data = data; - } - - inline constexpr int 
push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } - } - - template - constexpr int emplace_back_unsafe(Ts &&... args) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } - } - - __device__ inline T &back() { return m_data[m_size - 1]; } - - __device__ inline const T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - template - __device__ int emplace_back(Ts &&... args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - // thread safe version of resize - __device__ int extend(int size = 1) { - auto previousSize = atomicAdd(&m_size, size); - if (previousSize < m_capacity) { - return previousSize; - } else { - atomicSub(&m_size, size); - return -1; - } - } - - __device__ int shrink(int size = 1) { - auto previousSize = atomicSub(&m_size, size); - if (previousSize >= size) { - return previousSize - size; - } else { - atomicAdd(&m_size, size); - return -1; - } - } - - inline constexpr bool empty() const { return m_size <= 0; } - inline constexpr bool full() const { return m_size >= m_capacity; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline constexpr int size() const { return m_size; } - inline constexpr int capacity() const { return m_capacity; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr void set_data(T *data) { m_data = data; } - - private: - int m_size; - int m_capacity; - - T *m_data; - }; - - // ownership of m_data stays within the caller - template - SimpleVector make_SimpleVector(int capacity, T *data) { - SimpleVector ret; - ret.construct(capacity, data); - return ret; - } - - // ownership of m_data stays within the caller - template - SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { - auto ret = new (mem) SimpleVector(); - ret->construct(capacity, data); - return ret; - } - -} // namespace GPU - -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h deleted file mode 100644 index d9ae73d6d4002..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h -#define HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h - -// -// Author: Felice Pantaleo, CERN -// - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -namespace GPU { - - template - class VecArray { - public: - using self = VecArray; - using 
value_t = T; - - inline constexpr int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } - } - - template - constexpr int emplace_back_unsafe(Ts &&... args) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } - } - - inline constexpr T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - template - __device__ int emplace_back(Ts &&... args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - inline constexpr T pop_back() { - if (m_size > 0) { - auto previousSize = m_size--; - return m_data[previousSize - 1]; - } else - return T(); - } - - inline constexpr T const *begin() const { return m_data; } - inline constexpr T const *end() const { return m_data + m_size; } - inline constexpr T *begin() { return m_data; } - inline constexpr T *end() { return m_data + m_size; } - inline constexpr int size() const { return m_size; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline static constexpr int capacity() { return maxSize; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr bool empty() const { return 0 == m_size; } - inline constexpr bool full() const { return maxSize == m_size; } - - private: - T m_data[maxSize]; - - int m_size; - }; - -} // namespace GPU - -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h new file mode 100644 index 0000000000000..0e3caf72d1023 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h @@ -0,0 +1,141 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h +#define HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... 
args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + __device__ inline T &back() { return m_data[m_size - 1]; } + + __device__ inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + // thread safe version of resize + __device__ int extend(int size = 1) { + auto previousSize = atomicAdd(&m_size, size); + if (previousSize < m_capacity) { + return previousSize; + } else { + atomicSub(&m_size, size); + return -1; + } + } + + __device__ int shrink(int size = 1) { + auto previousSize = atomicSub(&m_size, size); + if (previousSize >= size) { + return previousSize - size; + } else { + atomicAdd(&m_size, size); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/VecArray.h b/HeterogeneousCore/CUDAUtilities/interface/VecArray.h new file mode 100644 index 0000000000000..08388dc8d4b3e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/VecArray.h @@ -0,0 +1,106 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_VecArray_h +#define HeterogeneousCore_CUDAUtilities_interface_VecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + 
template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_VecArray_h diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 60961d42999d4..5455860900400 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -8,7 +8,7 @@ - + diff --git a/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu b/HeterogeneousCore/CUDAUtilities/test/test_SimpleVector.cu similarity index 61% rename from HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu rename to HeterogeneousCore/CUDAUtilities/test/test_SimpleVector.cu index 2811e3b34598e..6ecc89f30f158 100644 --- a/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu +++ b/HeterogeneousCore/CUDAUtilities/test/test_SimpleVector.cu @@ -6,18 +6,18 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -__global__ void vector_pushback(GPU::SimpleVector *foo) { +__global__ void vector_pushback(cms::cuda::SimpleVector *foo) { auto index = threadIdx.x + blockIdx.x * blockDim.x; foo->push_back(index); } -__global__ void vector_reset(GPU::SimpleVector *foo) { foo->reset(); } +__global__ void vector_reset(cms::cuda::SimpleVector *foo) { foo->reset(); } -__global__ void 
vector_emplace_back(GPU::SimpleVector *foo) { +__global__ void vector_emplace_back(cms::cuda::SimpleVector *foo) { auto index = threadIdx.x + blockIdx.x * blockDim.x; foo->emplace_back(index); } @@ -26,26 +26,26 @@ int main() { cms::cudatest::requireDevices(); auto maxN = 10000; - GPU::SimpleVector *obj_ptr = nullptr; - GPU::SimpleVector *d_obj_ptr = nullptr; - GPU::SimpleVector *tmp_obj_ptr = nullptr; + cms::cuda::SimpleVector *obj_ptr = nullptr; + cms::cuda::SimpleVector *d_obj_ptr = nullptr; + cms::cuda::SimpleVector *tmp_obj_ptr = nullptr; int *data_ptr = nullptr; int *d_data_ptr = nullptr; - cudaCheck(cudaMallocHost(&obj_ptr, sizeof(GPU::SimpleVector))); + cudaCheck(cudaMallocHost(&obj_ptr, sizeof(cms::cuda::SimpleVector))); cudaCheck(cudaMallocHost(&data_ptr, maxN * sizeof(int))); cudaCheck(cudaMalloc(&d_data_ptr, maxN * sizeof(int))); - auto v = GPU::make_SimpleVector(obj_ptr, maxN, data_ptr); + auto v = cms::cuda::make_SimpleVector(obj_ptr, maxN, data_ptr); - cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(GPU::SimpleVector))); - GPU::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr); + cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(cms::cuda::SimpleVector))); + cms::cuda::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr); assert(tmp_obj_ptr->size() == 0); assert(tmp_obj_ptr->capacity() == static_cast(maxN)); - cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(GPU::SimpleVector))); + cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(cms::cuda::SimpleVector))); // ... and copy the object to the device. - cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); int numBlocks = 5; int numThreadsPerBlock = 256; @@ -53,14 +53,14 @@ int main() { cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN)); vector_reset<<>>(d_obj_ptr); cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); assert(obj_ptr->size() == 0); @@ -68,7 +68,7 @@ int main() { cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? 
numBlocks * numThreadsPerBlock : maxN)); From f4dc65051b14516993fa023f623f04d31af58cf6 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:51:06 +0100 Subject: [PATCH 09/15] Add missing dependency --- HeterogeneousCore/CUDAUtilities/BuildFile.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index 4a8e644a231f6..1e2de51dc0053 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1,6 +1,7 @@ + From 68de0b3d1028d85f7a591dff33dd2a7c5c693934 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 13:14:55 +0100 Subject: [PATCH 10/15] Move HistoContainer, AtomicPairCounter, prefixScan and radixSort to the cms::cuda namespace --- .../interface/AtomicPairCounter.h | 70 ++-- .../CUDAUtilities/interface/HistoContainer.h | 378 +++++++++--------- .../CUDAUtilities/interface/prefixScan.h | 245 ++++++------ .../CUDAUtilities/interface/radixSort.h | 45 ++- .../CUDAUtilities/test/AtomicPairCounter_t.cu | 2 + .../CUDAUtilities/test/HistoContainer_t.cpp | 2 + .../CUDAUtilities/test/HistoContainer_t.cu | 12 +- .../CUDAUtilities/test/OneHistoContainer_t.cu | 6 +- .../CUDAUtilities/test/OneToManyAssoc_t.h | 42 +- .../CUDAUtilities/test/prefixScan_t.cu | 8 +- .../CUDAUtilities/test/radixSort_t.cu | 2 + 11 files changed, 423 insertions(+), 389 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h index 8854b02176260..2e191ce551c42 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h +++ b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h @@ -5,48 +5,54 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -class AtomicPairCounter { -public: - using c_type = unsigned long long int; +namespace cms { + namespace cuda { - AtomicPairCounter() {} - AtomicPairCounter(c_type i) { counter.ac = i; } + class AtomicPairCounter { + public: + using c_type = unsigned long long int; - __device__ __host__ AtomicPairCounter& operator=(c_type i) { - counter.ac = i; - return *this; - } + AtomicPairCounter() {} + AtomicPairCounter(c_type i) { counter.ac = i; } - struct Counters { - uint32_t n; // in a "One to Many" association is the number of "One" - uint32_t m; // in a "One to Many" association is the total number of associations - }; + __device__ __host__ AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } - union Atomic2 { - Counters counters; - c_type ac; - }; + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; - static constexpr c_type incr = 1UL << 32; + union Atomic2 { + Counters counters; + c_type ac; + }; - __device__ __host__ Counters get() const { return counter.counters; } + static constexpr c_type incr = 1UL << 32; - // increment n by 1 and m by i. return previous value - __host__ __device__ __forceinline__ Counters add(uint32_t i) { - c_type c = i; - c += incr; - Atomic2 ret; + __device__ __host__ Counters get() const { return counter.counters; } + + // increment n by 1 and m by i. 
return previous value + __host__ __device__ __forceinline__ Counters add(uint32_t i) { + c_type c = i; + c += incr; + Atomic2 ret; #ifdef __CUDA_ARCH__ - ret.ac = atomicAdd(&counter.ac, c); + ret.ac = atomicAdd(&counter.ac, c); #else - ret.ac = counter.ac; - counter.ac += c; + ret.ac = counter.ac; + counter.ac += c; #endif - return ret.counters; - } + return ret.counters; + } + + private: + Atomic2 counter; + }; -private: - Atomic2 counter; -}; + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 67b2b46f45101..88118fcdac277 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -125,216 +125,216 @@ namespace cms { assoc->bulkFinalizeFill(*apc); } - } // namespace cuda -} // namespace cms + // iteratate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } -// iteratate over N bins left and right of the one containing "v" -template -__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins() - 1), bs + n); - bs = std::max(0, bs - n); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } -} - -// iteratate over bins containing all values in window wmin, wmax -template -__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } -} - -template -class HistoContainer { -public: - using Counter = uint32_t; - - using CountersOnly = HistoContainer; - - using index_type = I; - using UT = typename std::make_unsigned::type; - - static constexpr uint32_t ilog2(uint32_t v) { - constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; - constexpr uint32_t s[] = {1, 2, 4, 8, 16}; - - uint32_t r = 0; // result of log2(v) will go here - for (auto i = 4; i >= 0; i--) - if (v & b[i]) { - v >>= s[i]; - r |= s[i]; + // iteratate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); } - return r; - } + } - static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return NBINS; } - static constexpr uint32_t nhists() { return NHISTS; } - static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } - static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } - static constexpr uint32_t capacity() { return SIZE; } + template + class HistoContainer { + public: + using Counter = uint32_t; + + using CountersOnly = HistoContainer; + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + 
constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } - static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr uint32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr uint32_t capacity() { return SIZE; } - __host__ static size_t wsSize() { + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + + __host__ static size_t wsSize() { #ifdef __CUDACC__ - uint32_t *v = nullptr; - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()); - return temp_storage_bytes; + uint32_t *v = nullptr; + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()); + return temp_storage_bytes; #else - return 0; + return 0; #endif - } + } - static constexpr UT bin(T t) { - constexpr uint32_t shift = sizeT() - nbits(); - constexpr uint32_t mask = (1 << nbits()) - 1; - return (t >> shift) & mask; - } + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } - __host__ __device__ void zero() { - for (auto &i : off) - i = 0; - } + __host__ __device__ void zero() { + for (auto &i : off) + i = 0; + } - __host__ __device__ void add(CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { + __host__ __device__ void add(CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { #ifdef __CUDA_ARCH__ - atomicAdd(off + i, co.off[i]); + atomicAdd(off + i, co.off[i]); #else - auto &a = (std::atomic &)(off[i]); - a += co.off[i]; + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; #endif - } - } + } + } - static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { #ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); + return atomicAdd(&x, 1); #else - auto &a = (std::atomic &)(x); - return a++; + auto &a = (std::atomic &)(x); + return a++; #endif - } + } - static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { #ifdef __CUDA_ARCH__ - return atomicSub(&x, 1); + return atomicSub(&x, 1); #else - auto &a = (std::atomic &)(x); - return a--; + auto &a = (std::atomic &)(x); + return a--; #endif - } - - __host__ __device__ __forceinline__ void countDirect(T b) { - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __device__ __host__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } - - __device__ __host__ __forceinline__ void 
bulkFinalize(AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } - - __device__ __host__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; - if (m >= nbins()) { // overflow! - off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } - auto first = m + blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { - off[i] = n; - } - } - - __host__ __device__ __forceinline__ void count(T t) { - uint32_t b = bin(t); - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fill(T t, index_type j) { - uint32_t b = bin(t); - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __device__ __host__ __forceinline__ void finalize(Counter *ws = nullptr) { - assert(off[totbins() - 1] == 0); - blockPrefixScan(off, totbins(), ws); - assert(off[totbins() - 1] == off[totbins() - 2]); - } - - constexpr auto size() const { return uint32_t(off[totbins() - 1]); } - constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } - - constexpr index_type const *begin() const { return bins; } - constexpr index_type const *end() const { return begin() + size(); } - - constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } - constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } - - Counter off[totbins()]; - index_type bins[capacity()]; -}; - -template -using OneToManyAssoc = HistoContainer; + } + + __host__ __device__ __forceinline__ void countDirect(T b) { + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! 
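+          // a bulkFill() that overflowed returned a negative index and stored
+          // nothing: freeze the total size at the last valid offset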
+ off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void count(T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + index_type bins[capacity()]; + }; + + template + using OneToManyAssoc = HistoContainer; + + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 8b784bdd61bfe..1cf554163f409 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -7,6 +7,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #ifdef __CUDA_ARCH__ + template __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { // ci and co may be the same @@ -20,10 +21,7 @@ __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __re } co[i] = x; } -#endif -//same as above may remove -#ifdef __CUDA_ARCH__ template __device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) { auto x = c[i]; @@ -36,134 +34,141 @@ __device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) } c[i] = x; } + #endif -// limited to 32*32 elements.... -template -__device__ __host__ void __forceinline__ blockPrefixScan(T const* __restrict__ ci, - T* __restrict__ co, - uint32_t size, - T* ws +namespace cms { + namespace cuda { + + // limited to 32*32 elements.... 
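+    // (i.e. up to 1024 entries: each warp scans its 32 values, lane 31 of
+    //  every warp parks the warp total in ws[], the first warp then scans
+    //  ws[], and each value beyond warp 0 adds the running total of the
+    //  warps before it)
+    //
+    // illustrative host-side use, where the scan degenerates to a serial
+    // inclusive sum and ws can be left at its nullptr default:
+    //
+    //   uint32_t in[64];   // e.g. per-bin counts, filled elsewhere
+    //   uint32_t out[64];
+    //   cms::cuda::blockPrefixScan(in, out, 64);
+    //   // out[i] == in[0] + ... + in[i]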
+ template + __host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci, + T* __restrict__ co, + uint32_t size, + T* ws #ifndef __CUDA_ARCH__ - = nullptr + = nullptr #endif -) { + ) { #ifdef __CUDA_ARCH__ - assert(ws); - assert(size <= 1024); - assert(0 == blockDim.x % 32); - auto first = threadIdx.x; - auto mask = __ballot_sync(0xffffffff, first < size); - - for (auto i = first; i < size; i += blockDim.x) { - warpPrefixScan(ci, co, i, mask); - auto laneId = threadIdx.x & 0x1f; - auto warpId = i / 32; - assert(warpId < 32); - if (31 == laneId) - ws[warpId] = co[i]; - mask = __ballot_sync(mask, i + blockDim.x < size); - } - __syncthreads(); - if (size <= 32) - return; - if (threadIdx.x < 32) - warpPrefixScan(ws, threadIdx.x, 0xffffffff); - __syncthreads(); - for (auto i = first + 32; i < size; i += blockDim.x) { - auto warpId = i / 32; - co[i] += ws[warpId - 1]; - } - __syncthreads(); + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(ci, co, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = co[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + co[i] += ws[warpId - 1]; + } + __syncthreads(); #else - co[0] = ci[0]; - for (uint32_t i = 1; i < size; ++i) - co[i] = ci[i] + co[i - 1]; + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; #endif -} - -// same as above, may remove -// limited to 32*32 elements.... -template -__device__ __host__ void __forceinline__ blockPrefixScan(T* c, - uint32_t size, - T* ws + } + + // same as above, may remove + // limited to 32*32 elements.... 
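+    // (in-place variant: scans c[] into itself; on the device it needs the
+    //  same 32-entry ws and size <= 1024, on the host it reduces to the
+    //  serial loop c[i] += c[i - 1])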
+ template + __host__ __device__ __forceinline__ void blockPrefixScan(T* c, + uint32_t size, + T* ws #ifndef __CUDA_ARCH__ - = nullptr + = nullptr #endif -) { + ) { #ifdef __CUDA_ARCH__ - assert(ws); - assert(size <= 1024); - assert(0 == blockDim.x % 32); - auto first = threadIdx.x; - auto mask = __ballot_sync(0xffffffff, first < size); - - for (auto i = first; i < size; i += blockDim.x) { - warpPrefixScan(c, i, mask); - auto laneId = threadIdx.x & 0x1f; - auto warpId = i / 32; - assert(warpId < 32); - if (31 == laneId) - ws[warpId] = c[i]; - mask = __ballot_sync(mask, i + blockDim.x < size); - } - __syncthreads(); - if (size <= 32) - return; - if (threadIdx.x < 32) - warpPrefixScan(ws, threadIdx.x, 0xffffffff); - __syncthreads(); - for (auto i = first + 32; i < size; i += blockDim.x) { - auto warpId = i / 32; - c[i] += ws[warpId - 1]; - } - __syncthreads(); + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(c, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + __syncthreads(); #else - for (uint32_t i = 1; i < size; ++i) - c[i] += c[i - 1]; + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; #endif -} - -// limited to 1024*1024 elements.... -template -__global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { - __shared__ T ws[32]; - // first each block does a scan of size 1024; (better be enough blocks....) - assert(1024 * gridDim.x >= size); - int off = 1024 * blockIdx.x; - if (size - off > 0) - blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); - - // count blocks that finished - __shared__ bool isLastBlockDone; - if (0 == threadIdx.x) { - auto value = atomicAdd(pc, 1); // block counter - isLastBlockDone = (value == (int(gridDim.x) - 1)); - } - - __syncthreads(); - - if (!isLastBlockDone) - return; - - // good each block has done its work and now we are left in last block - - // let's get the partial sums from each block - __shared__ T psum[1024]; - for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { - auto j = 1024 * i + 1023; - psum[i] = (j < size) ? co[j] : T(0); - } - __syncthreads(); - blockPrefixScan(psum, psum, gridDim.x, ws); - - // now it would have been handy to have the other blocks around... - int first = threadIdx.x; // + blockDim.x * blockIdx.x - for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { - auto k = i / 1024; // block - co[i] += psum[k - 1]; - } -} + } + + // limited to 1024*1024 elements.... + template + __global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { + __shared__ T ws[32]; + // first each block does a scan of size 1024; (better be enough blocks....) 
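+      // then an atomic counter in *pc (which must be zeroed before the
+      // launch) elects the last block to finish; that block scans the
+      // per-block totals into psum[] and adds psum[k - 1] to every element
+      // of block k's chunk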
+ assert(1024 * gridDim.x >= size); + int off = 1024 * blockIdx.x; + if (size - off > 0) + blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); + + // count blocks that finished + __shared__ bool isLastBlockDone; + if (0 == threadIdx.x) { + auto value = atomicAdd(pc, 1); // block counter + isLastBlockDone = (value == (int(gridDim.x) - 1)); + } + + __syncthreads(); + + if (!isLastBlockDone) + return; + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block + __shared__ T psum[1024]; + for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { + auto j = 1024 * i + 1023; + psum[i] = (j < size) ? co[j] : T(0); + } + __syncthreads(); + blockPrefixScan(psum, psum, gridDim.x, ws); + + // now it would have been handy to have the other blocks around... + int first = threadIdx.x; // + blockDim.x * blockIdx.x + for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { + auto k = i / 1024; // block + co[i] += psum[k - 1]; + } + } + + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index ce2acf2006090..09fcdc8cd5f69 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -2,6 +2,7 @@ #define HeterogeneousCoreCUDAUtilities_radixSort_H #include +#include #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" @@ -79,8 +80,8 @@ __device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, u template -__device__ void __forceinline__ -radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { +__device__ __forceinline__ void radixSortImpl( + T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { constexpr int d = 8, w = 8 * sizeof(T); constexpr int sb = 1 << d; constexpr int ps = int(sizeof(T)) - NS; @@ -217,28 +218,30 @@ radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t s template ::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { radixSortImpl(a, ind, ind2, size, dummyReorder); } template ::value && std::is_signed::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { radixSortImpl(a, ind, ind2, size, reorderSigned); } template ::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { using I = int; radixSortImpl((I const*)(a), ind, ind2, size, reorderFloat); } template -__device__ void __forceinline__ -radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { +__device__ __forceinline__ void radixSortMulti(T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace) { extern __shared__ uint16_t ws[]; auto a = v + offsets[blockIdx.x]; @@ -250,17 +253,23 @@ radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* w 
radixSort(a, ind, ind2, size); } -template -__global__ void __launch_bounds__(256, 4) - radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { - radixSortMulti(v, index, offsets, workspace); -} +namespace cms { + namespace cuda { -template -__global__ void -// __launch_bounds__(256, 4) -radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { - radixSortMulti(v, index, offsets, workspace); -} + template + __global__ void __launch_bounds__(256, 4) + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + template + __global__ void + // __launch_bounds__(256, 4) + radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + } // namespace cuda +} // namespace cms #endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu index 92806c91337d8..e671c5c1c2ae8 100644 --- a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu @@ -3,6 +3,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +using namespace cms::cuda; + __global__ void update(AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= n) diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index cc5541f58ad60..577e9bbb69b57 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -7,6 +7,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +using namespace cms::cuda; + template void go() { std::mt19937 eng; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index aa7eb245c350d..cf512521e1f4d 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -9,6 +9,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +using namespace cms::cuda; + template void go() { std::mt19937 eng; @@ -16,7 +18,7 @@ void go() { constexpr int N = 12000; T v[N]; - auto v_d = cms::cuda::make_device_unique(N, nullptr); + auto v_d = make_device_unique(N, nullptr); cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); @@ -30,10 +32,10 @@ void go() { << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; Hist h; - auto h_d = cms::cuda::make_device_unique(1, nullptr); - auto ws_d = cms::cuda::make_device_unique(Hist::wsSize(), nullptr); + auto h_d = make_device_unique(1, nullptr); + auto ws_d = make_device_unique(Hist::wsSize(), nullptr); - auto off_d = cms::cuda::make_device_unique(nParts + 1, nullptr); + auto off_d = make_device_unique(nParts + 1, nullptr); for (int it = 0; it < 5; ++it) { offsets[0] = 0; @@ -68,7 +70,7 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), 
-    cms::cuda::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
+    fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
     cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost));
     assert(0 == h.off[0]);
     assert(offsets[10] == h.size());
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
index 020d69268a420..f08c768b43fa5 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
@@ -10,6 +10,8 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 
+using namespace cms::cuda;
+
 template
 __global__ void mykernel(T const* __restrict__ v, uint32_t N) {
   assert(v);
@@ -106,7 +108,7 @@ void go() {
   constexpr int N = 12000;
   T v[N];
 
-  auto v_d = cms::cuda::make_device_unique<T[]>(N, nullptr);
+  auto v_d = make_device_unique<T[]>(N, nullptr);
   assert(v_d.get());
 
   using Hist = HistoContainer;
@@ -125,7 +127,7 @@ void go() {
     assert(v);
     cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
     assert(v_d.get());
-    cms::cuda::launch(mykernel, {1, 256}, v_d.get(), N);
+    launch(mykernel, {1, 256}, v_d.get(), N);
   }
 }
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
index 92bb359115a02..765ec89e295f2 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
+++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
@@ -16,6 +16,8 @@
 
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 
+using namespace cms::cuda;
+
 constexpr uint32_t MaxElem = 64000;
 constexpr uint32_t MaxTk = 8000;
 constexpr uint32_t MaxAssocs = 4 * MaxTk;
@@ -100,7 +102,7 @@ __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter co
 int main() {
 #ifdef __CUDACC__
   cms::cudatest::requireDevices();
-  auto current_device = cms::cuda::currentDevice();
+  auto current_device = currentDevice();
 #else
   // make sure cuda emulation is working
   std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl;
 #endif
@@ -167,11 +169,11 @@
   std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
 
 #ifdef __CUDACC__
-  auto v_d = cms::cuda::make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
+  auto v_d = make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
   assert(v_d.get());
-  auto a_d = cms::cuda::make_device_unique<Assoc[]>(1, nullptr);
-  auto sa_d = cms::cuda::make_device_unique<SmallAssoc[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique(Assoc::wsSize(), nullptr);
+  auto a_d = make_device_unique<Assoc[]>(1, nullptr);
+  auto sa_d = make_device_unique<SmallAssoc[]>(1, nullptr);
+  auto ws_d = make_device_unique(Assoc::wsSize(), nullptr);
 
   cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
@@ -180,7 +182,7 @@
   auto v_d = tr.data();
 #endif
 
-  cms::cuda::launchZero(a_d.get(), 0);
+  launchZero(a_d.get(), 0);
 
 #ifdef __CUDACC__
   auto nThreads = 256;
   auto nBlocks = (N + nThreads - 1) / nThreads;
 
   count<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
 
-  cms::cuda::launchFinalize(a_d.get(), ws_d.get(), 0);
+  launchFinalize(a_d.get(), ws_d.get(), 0);
 
   verify<<<1, 1>>>(a_d.get());
 
   fill<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
 #else
   count(v_d, a_d.get(), N);
-  cms::cuda::launchFinalize(a_d.get());
+  launchFinalize(a_d.get());
   verify(a_d.get());
   fill(v_d, a_d.get(), N);
 #endif
@@ -231,7 +233,7 @@ int main() {
   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   nBlocks = (N + nThreads - 1) / nThreads;
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), a_d.get(), N);
-  cms::cuda::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
+  finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
   verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
 
   cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost));
@@ -239,19 +241,19 @@
   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), sa_d.get(), N);
-  cms::cuda::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
+  finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
   verifyBulk<<<1, 1>>>(sa_d.get(), dc_d);
 #else
   dc_d = &dc;
   fillBulk(dc_d, v_d, a_d.get(), N);
-  cms::cuda::finalizeBulk(dc_d, a_d.get());
+  finalizeBulk(dc_d, a_d.get());
   verifyBulk(a_d.get(), dc_d);
 
   memcpy(&la, a_d.get(), sizeof(Assoc));
 
   AtomicPairCounter sdc(0);
   fillBulk(&sdc, v_d, sa_d.get(), N);
-  cms::cuda::finalizeBulk(&sdc, sa_d.get());
+  finalizeBulk(&sdc, sa_d.get());
   verifyBulk(sa_d.get(), &sdc);
 #endif
@@ -274,14 +276,14 @@
 
   // here verify use of block local counters
 #ifdef __CUDACC__
-  auto m1_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
-  auto m2_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
+  auto m1_d = make_device_unique<Multiplicity[]>(1, nullptr);
+  auto m2_d = make_device_unique<Multiplicity[]>(1, nullptr);
 #else
   auto m1_d = std::make_unique<Multiplicity>();
   auto m2_d = std::make_unique<Multiplicity>();
 #endif
 
-  cms::cuda::launchZero(m1_d.get(), 0);
-  cms::cuda::launchZero(m2_d.get(), 0);
+  launchZero(m1_d.get(), 0);
+  launchZero(m2_d.get(), 0);
 
 #ifdef __CUDACC__
   nBlocks = (4 * N + nThreads - 1) / nThreads;
   countMulti<<<nBlocks, nThreads>>>(v_d.get(), m1_d.get(), N);
   countMultiLocal<<<nBlocks, nThreads>>>(v_d.get(), m2_d.get(), N);
   verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
 
-  cms::cuda::launchFinalize(m1_d.get(), ws_d.get(), 0);
-  cms::cuda::launchFinalize(m2_d.get(), ws_d.get(), 0);
+  launchFinalize(m1_d.get(), ws_d.get(), 0);
+  launchFinalize(m2_d.get(), ws_d.get(), 0);
   verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
 
   cudaCheck(cudaGetLastError());
@@ -300,8 +302,8 @@
   countMultiLocal(v_d, m2_d.get(), N);
   verifyMulti(m1_d.get(), m2_d.get());
 
-  cms::cuda::launchFinalize(m1_d.get());
-  cms::cuda::launchFinalize(m2_d.get());
+  launchFinalize(m1_d.get());
+  launchFinalize(m2_d.get());
   verifyMulti(m1_d.get(), m2_d.get());
 #endif
   return 0;
diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
index 8bbaf4d066abc..01376c668ef6a 100644
--- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
@@ -6,16 +6,18 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 
+using namespace cms::cuda;
+
 template <typename T>
 struct format_traits {
 public:
-  static const constexpr char* failed_msg = "failed %d %d %d: %d %d\n";
+  static const constexpr char *failed_msg = "failed %d %d %d: %d %d\n";
 };
 
-template<>
+template <>
 struct format_traits<float> {
 public:
-  static const constexpr char* failed_msg = "failed %d %d %d: %f %f\n";
+  static const constexpr char *failed_msg = "failed %d %d %d: %f %f\n";
 };
 
 template <typename T>
diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
index febdb9c92b0a7..e96a9d8eae83b 100644
--- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
@@ -14,6 +14,8 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
 
+using namespace cms::cuda;
+
 template <typename T>
 struct RS {
   using type = std::uniform_int_distribution<T>;

From d76ad8e63a90171ace695aaa67713323811e3194 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Sun, 8 Mar 2020 21:12:29 +0100
Subject: [PATCH 11/15] Fix code rule violations

Replace using namespace cms::cuda in test/OneToManyAssoc_t.h.

---
 .../CUDAUtilities/test/OneToManyAssoc_t.h | 26 ++++++++-----------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
index 765ec89e295f2..9f6631a70b214 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
+++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
@@ -11,22 +11,18 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-
 #endif
 
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
-
-using namespace cms::cuda;
+using cms::cuda::AtomicPairCounter;
 
 constexpr uint32_t MaxElem = 64000;
 constexpr uint32_t MaxTk = 8000;
 constexpr uint32_t MaxAssocs = 4 * MaxTk;
 
-using Assoc = OneToManyAssoc;
-
-using SmallAssoc = OneToManyAssoc;
-
-using Multiplicity = OneToManyAssoc;
+using Assoc = cms::cuda::OneToManyAssoc;
+using SmallAssoc = cms::cuda::OneToManyAssoc;
+using Multiplicity = cms::cuda::OneToManyAssoc;
 
 using TK = std::array<uint16_t, 4>;
 
 __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) {
@@ -100,7 +98,7 @@ __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter co
 int main() {
 #ifdef __CUDACC__
   cms::cudatest::requireDevices();
-  auto current_device = currentDevice();
+  auto current_device = cms::cuda::currentDevice();
 #else
   // make sure cuda emulation is working
   std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl;
@@ -169,11 +165,11 @@
   std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
 
 #ifdef __CUDACC__
-  auto v_d = make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
+  auto v_d = cms::cuda::make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
   assert(v_d.get());
-  auto a_d = make_device_unique<Assoc[]>(1, nullptr);
-  auto sa_d = make_device_unique<SmallAssoc[]>(1, nullptr);
-  auto ws_d = make_device_unique(Assoc::wsSize(), nullptr);
+  auto a_d = cms::cuda::make_device_unique<Assoc[]>(1, nullptr);
+  auto sa_d = cms::cuda::make_device_unique<SmallAssoc[]>(1, nullptr);
+  auto ws_d = cms::cuda::make_device_unique(Assoc::wsSize(), nullptr);
 
   cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
@@ -276,8 +272,8 @@ int main() {
 
   // here verify use of block local counters
 #ifdef __CUDACC__
-  auto m1_d = make_device_unique<Multiplicity[]>(1, nullptr);
-  auto m2_d = make_device_unique<Multiplicity[]>(1, nullptr);
+  auto m1_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
+  auto m2_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
 #else
   auto m1_d = std::make_unique<Multiplicity>();
   auto m2_d = std::make_unique<Multiplicity>();

From 1f91c73d82f431ece465fbf164c4db444465d880 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Sun, 8 Mar 2020 21:16:37 +0100
Subject: [PATCH 12/15] Add an exception for cudaCompat.h

cudaCompat.h relies on defining equivalent symbols to the CUDA
intrinsics in the cms::cudacompat namespace, and pulling them into the
global namespace when compiling device code without CUDA.
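For illustration, a minimal sketch of the pattern this exception covers
(the atomicAdd stand-in below is a simplified assumption, not the actual
contents of cudaCompat.h):

    #ifndef __CUDACC__

    namespace cms {
      namespace cudacompat {
        // serial stand-in for the CUDA intrinsic: with a single "thread"
        // no real atomicity is needed, so a plain read-modify-write suffices
        template <typename T>
        inline T atomicAdd(T* address, T val) {
          T old = *address;
          *address += val;
          return old;
        }
      }  // namespace cudacompat
    }  // namespace cms

    // the header-level "using namespace" that the code rule must tolerate
    using namespace cms::cudacompat;

    #endif  // __CUDACC__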
---
 Utilities/ReleaseScripts/python/cmsCodeRules/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Utilities/ReleaseScripts/python/cmsCodeRules/config.py b/Utilities/ReleaseScripts/python/cmsCodeRules/config.py
index 264cf76e5d888..a7b9a11299441 100644
--- a/Utilities/ReleaseScripts/python/cmsCodeRules/config.py
+++ b/Utilities/ReleaseScripts/python/cmsCodeRules/config.py
@@ -33,7 +33,7 @@
 Configuration[ruleName]['description'] = 'Search for "using namespace" or "using std::" in header files'
 Configuration[ruleName]['filesToMatch'] = ['*.h']
-Configuration[ruleName]['exceptPaths'] = []
+Configuration[ruleName]['exceptPaths'] = ['HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h']
 Configuration[ruleName]['skip'] = [comment, function]
 Configuration[ruleName]['filter'] = '(\susing|\Ausing)\s+(namespace|std::)' #should be regular expression
 Configuration[ruleName]['exceptFilter'] = []

From c52b855961d4a7ee0fa985c683765a72153b0815 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Mon, 9 Mar 2020 17:09:47 +0100
Subject: [PATCH 13/15] Protect the header to compile only with a CUDA compiler

---
 HeterogeneousCore/CUDAUtilities/interface/radixSort.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
index 09fcdc8cd5f69..e4b2905782825 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
@@ -1,6 +1,8 @@
 #ifndef HeterogeneousCoreCUDAUtilities_radixSort_H
 #define HeterogeneousCoreCUDAUtilities_radixSort_H
 
+#ifdef __CUDACC__
+
 #include <cstdint>
 #include <type_traits>
 
@@ -263,13 +265,13 @@ namespace cms {
     }
 
     template <typename T, int NS = sizeof(T)>
-    __global__ void
-    // __launch_bounds__(256, 4)
-    radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
+    __global__ void radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
       radixSortMulti<T, NS>(v, index, offsets, workspace);
     }
 
   }  // namespace cuda
 }  // namespace cms
 
+#endif  // __CUDACC__
+
 #endif  // HeterogeneousCoreCUDAUtilities_radixSort_H

From 62480002ebec6cc6354e2efb14d45e964773f85d Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Thu, 26 Mar 2020 11:05:18 +0100
Subject: [PATCH 14/15] choleskyInversion.h: expand source notice

---
 DataFormats/Math/interface/choleskyInversion.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h
index eba2a17648008..a04ad456f96f5 100644
--- a/DataFormats/Math/interface/choleskyInversion.h
+++ b/DataFormats/Math/interface/choleskyInversion.h
@@ -9,6 +9,7 @@
  * fully inlined specialized code to perform the inversion of a
  * positive defined matrix of rank up to 6.
* + * adapted from ROOT::Math::CholeskyDecomp * originally by * @author Manuel Schiller * @date Aug 29 2008 From 8ce96318e31ea00aad63e7e840755cc0492766b4 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 26 Mar 2020 11:37:28 +0100 Subject: [PATCH 15/15] Rename namespace choleskyInversion to math::cholesky --- .../Math/interface/choleskyInversion.h | 623 +++++++++--------- DataFormats/Math/test/CholeskyInvert_t.cpp | 12 +- DataFormats/Math/test/CholeskyInvert_t.cu | 20 +- 3 files changed, 336 insertions(+), 319 deletions(-) diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h index a04ad456f96f5..2cb4105f86bae 100644 --- a/DataFormats/Math/interface/choleskyInversion.h +++ b/DataFormats/Math/interface/choleskyInversion.h @@ -16,319 +16,334 @@ * * */ -namespace choleskyInversion { - - template - inline constexpr void invert11(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - dst(0, 0) = F(1.0) / src(0, 0); - } - - template - inline constexpr void invert22(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0) * src(1, 0) * luc0; - auto luc2 = F(1.0) / (src(1, 1) - luc1); - - auto li21 = luc1 * luc0 * luc2; - - dst(0, 0) = li21 + luc0; - dst(1, 0) = -src(1, 0) * luc0 * luc2; - dst(1, 1) = luc2; - } - - template - inline constexpr void invert33(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); - luc5 = F(1.0) / luc5; - - auto li21 = -luc0 * luc1; - auto li32 = -(luc2 * luc4); - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - - dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; - dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; - dst(1, 1) = luc5 * li32 * li32 + luc2; - dst(2, 0) = luc5 * li31; - dst(2, 1) = luc5 * li32; - dst(2, 2) = luc5; - } - - template - inline constexpr void invert44(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - - dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc9 * li43 * li43 + luc5; - dst(3, 0) = luc9 * li41; - dst(3, 1) = luc9 * li42; - dst(3, 2) = luc9 * li43; - dst(3, 3) = luc9; - } - - template - inline constexpr void 
invert55(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - auto luc10 = src(4, 0); - auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); - auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); - auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); - auto luc14 = - src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); - luc14 = F(1.0) / luc14; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - auto li54 = -luc13 * luc9; - auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; - auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; - auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - - luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + - luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * - luc0; - - dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; - dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; - dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; - dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; - dst(3, 3) = luc14 * li54 * li54 + luc9; - dst(4, 0) = luc14 * li51; - dst(4, 1) = luc14 * li52; - dst(4, 2) = luc14 * li53; - dst(4, 3) = luc14 * li54; - dst(4, 4) = luc14; - } - - template - inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - auto luc10 = src(4, 0); - auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); - auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); - auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); - auto luc14 = - src(4, 4) - (luc0 * luc10 * 
luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); - luc14 = F(1.0) / luc14; - auto luc15 = src(5, 0); - auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); - auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); - auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); - auto luc19 = - (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); - auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + - luc9 * luc18 * luc18 + luc14 * luc19 * luc19); - luc20 = F(1.0) / luc20; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - auto li54 = -luc13 * luc9; - auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; - auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; - auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - - luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + - luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * - luc0; - - auto li65 = -luc19 * luc14; - auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; - auto li63 = (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; - auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - - luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + - luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * - luc2; - auto li61 = - (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + - luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + - luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - - luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - - luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - luc19 * luc13 * luc6 * luc9 * luc14 + - luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + luc19 * luc10 * luc14 - luc15) * - luc0; - - dst(0, 0) = - luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; - dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; - dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; - dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43; - dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; - dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; - dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; - dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; - 
dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; - dst(4, 4) = luc20 * li65 * li65 + luc14; - dst(5, 0) = luc20 * li61; - dst(5, 1) = luc20 * li62; - dst(5, 2) = luc20 * li63; - dst(5, 3) = luc20 * li64; - dst(5, 4) = luc20 * li65; - dst(5, 5) = luc20; - } - - template - inline constexpr void symmetrize11(M& dst) {} - template - inline constexpr void symmetrize22(M& dst) { - dst(0, 1) = dst(1, 0); - } - template - inline constexpr void symmetrize33(M& dst) { - symmetrize22(dst); - dst(0, 2) = dst(2, 0); - dst(1, 2) = dst(2, 1); - } - template - inline constexpr void symmetrize44(M& dst) { - symmetrize33(dst); - dst(0, 3) = dst(3, 0); - dst(1, 3) = dst(3, 1); - dst(2, 3) = dst(3, 2); - } - template - inline constexpr void symmetrize55(M& dst) { - symmetrize44(dst); - dst(0, 4) = dst(4, 0); - dst(1, 4) = dst(4, 1); - dst(2, 4) = dst(4, 2); - dst(3, 4) = dst(4, 3); - } - template - inline constexpr void symmetrize66(M& dst) { - symmetrize55(dst); - dst(0, 5) = dst(5, 0); - dst(1, 5) = dst(5, 1); - dst(2, 5) = dst(5, 2); - dst(3, 5) = dst(5, 3); - dst(4, 5) = dst(5, 4); - } - - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert22(src, dst); +namespace math { + namespace cholesky { + + template + inline constexpr void invert11(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + dst(0, 0) = F(1.0) / src(0, 0); + } + + template + inline constexpr void invert22(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0) * src(1, 0) * luc0; + auto luc2 = F(1.0) / (src(1, 1) - luc1); + + auto li21 = luc1 * luc0 * luc2; + + dst(0, 0) = li21 + luc0; + dst(1, 0) = -src(1, 0) * luc0 * luc2; + dst(1, 1) = luc2; + } + + template + inline constexpr void invert33(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); + luc5 = F(1.0) / luc5; + + auto li21 = -luc0 * luc1; + auto li32 = -(luc2 * luc4); + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + + dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; + dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; + dst(1, 1) = luc5 * li32 * li32 + luc2; + dst(2, 0) = luc5 * li31; + dst(2, 1) = luc5 * li32; + dst(2, 2) = luc5; + } + + template + inline constexpr void invert44(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; 
+ auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + + dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc9 * li43 * li43 + luc5; + dst(3, 0) = luc9 * li41; + dst(3, 1) = luc9 * li42; + dst(3, 2) = luc9 * li43; + dst(3, 3) = luc9; + } + + template + inline constexpr void invert55(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc14 * li54 * li54 + luc9; + dst(4, 0) = luc14 * li51; + dst(4, 1) = luc14 * li52; + dst(4, 2) = luc14 * li53; + dst(4, 3) = luc14 * li54; + dst(4, 4) = luc14; + } + + template + inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - 
(luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + auto luc15 = src(5, 0); + auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); + auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); + auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); + auto luc19 = + (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); + auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + + luc9 * luc18 * luc18 + luc14 * luc19 * luc19); + luc20 = F(1.0) / luc20; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + auto li65 = -luc19 * luc14; + auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; + auto li63 = + (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; + auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - + luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + + luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * + luc2; + auto li61 = + (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + + luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + + luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - + luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - + luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - + luc19 * luc13 * luc6 * luc9 * luc14 + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + + luc19 * luc10 * luc14 - luc15) * + luc0; + + dst(0, 0) = luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 
* li31; + dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; + dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; + dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; + dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; + dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; + dst(4, 4) = luc20 * li65 * li65 + luc14; + dst(5, 0) = luc20 * li61; + dst(5, 1) = luc20 * li62; + dst(5, 2) = luc20 * li63; + dst(5, 3) = luc20 * li64; + dst(5, 4) = luc20 * li65; + dst(5, 5) = luc20; + } + + template + inline constexpr void symmetrize11(M& dst) {} + + template + inline constexpr void symmetrize22(M& dst) { + dst(0, 1) = dst(1, 0); + } + + template + inline constexpr void symmetrize33(M& dst) { symmetrize22(dst); + dst(0, 2) = dst(2, 0); + dst(1, 2) = dst(2, 1); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert33(src, dst); + + template + inline constexpr void symmetrize44(M& dst) { symmetrize33(dst); + dst(0, 3) = dst(3, 0); + dst(1, 3) = dst(3, 1); + dst(2, 3) = dst(3, 2); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert44(src, dst); + + template + inline constexpr void symmetrize55(M& dst) { symmetrize44(dst); + dst(0, 4) = dst(4, 0); + dst(1, 4) = dst(4, 1); + dst(2, 4) = dst(4, 2); + dst(3, 4) = dst(4, 3); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert55(src, dst); + + template + inline constexpr void symmetrize66(M& dst) { symmetrize55(dst); + dst(0, 5) = dst(5, 0); + dst(1, 5) = dst(5, 1); + dst(2, 5) = dst(5, 2); + dst(3, 5) = dst(5, 3); + dst(4, 5) = dst(5, 4); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert66(src, dst); - symmetrize66(dst); - } - }; - // Eigen interface - template - inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) { - using M1 = Eigen::DenseBase; - using M2 = Eigen::DenseBase; - Inverter::eval(src, dst); - } + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert22(src, dst); + symmetrize22(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert33(src, dst); + symmetrize33(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert44(src, dst); + symmetrize44(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert55(src, dst); + symmetrize55(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert66(src, dst); + symmetrize66(dst); + } + }; + + // Eigen interface + template + inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) { + using M1 = Eigen::DenseBase; + using M2 = Eigen::DenseBase; + Inverter::eval(src, dst); + } -} // namespace 
choleskyInversion + } // namespace cholesky +} // namespace math #endif // DataFormat_Math_choleskyInversion_h diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp index f1bf92353dfc3..c5dea25231988 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cpp +++ b/DataFormats/Math/test/CholeskyInvert_t.cpp @@ -74,13 +74,13 @@ void go(bool soa) { if (soa) for (unsigned int i = 0; i < SIZE; ++i) { MapMX m(p + i); - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); } else for (auto& m : mm) { - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); } std::cout << mm[SIZE / 2](1, 1) << std::endl; @@ -100,12 +100,12 @@ void go(bool soa) { #endif for (unsigned int i = 0; i < SIZE; ++i) { MapMX m(p + i); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } else #pragma GCC ivdep for (auto& m : mm) { - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } delta += (std::chrono::high_resolution_clock::now() - start); diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index 55df4ed23f20d..558f9296150c7 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -32,7 +32,7 @@ __global__ void invertSOA(double *__restrict__ p, unsigned int n) { return; MapMX m(p + i); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } template @@ -42,7 +42,7 @@ __global__ void invert(M *mm, unsigned int n) { return; auto &m = mm[i]; - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } template @@ -54,7 +54,7 @@ __global__ void invertSeq(M *mm, unsigned int n) { for (auto i = first; i < last; ++i) { auto &m = mm[i]; - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } } @@ -112,13 +112,13 @@ void go(bool soa) { if (soa) for (unsigned int i = 0; i < SIZE; ++i) { MapMX m(p + i); - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); } else for (auto &m : mm) { - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); } std::cout << mm[SIZE / 2](1, 1) << std::endl; @@ -168,12 +168,12 @@ void go(bool soa) { #pragma GCC ivdep for (unsigned int i = 0; i < SIZE; ++i) { MapMX m(p + i); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } else #pragma GCC ivdep for (auto &m : mm) { - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } delta2 += (std::chrono::high_resolution_clock::now() - start); @@ -193,11 +193,13 @@ int main() { cms::cudatest::requireDevices(); go<2>(false); + go<3>(false); go<4>(false); go<5>(false); go<6>(false); go<2>(true); + go<3>(true); go<4>(true); go<5>(true); go<6>(true);
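
Note: after PATCH 15 the inversion entry point lives in math::cholesky.
A minimal usage sketch (the 3x3 matrix below is illustrative, not taken
from the tests above):

    #include <Eigen/Core>

    #include "DataFormats/Math/interface/choleskyInversion.h"

    int main() {
      // a symmetric positive-definite input, as the Cholesky decomposition requires
      Eigen::Matrix<double, 3, 3> m;
      m << 4.0, 1.0, 0.5,
           1.0, 3.0, 0.2,
           0.5, 0.2, 2.0;
      Eigen::Matrix<double, 3, 3> inv;
      // for rank 3 this resolves at compile time to the fully inlined
      // invert33 + symmetrize33 specialization
      math::cholesky::invert(m, inv);
      return 0;
    }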