diff --git a/DataFormats/GeometrySurface/interface/SOARotation.h b/DataFormats/GeometrySurface/interface/SOARotation.h index 331a56b7ecf57..d75efef4736cb 100644 --- a/DataFormats/GeometrySurface/interface/SOARotation.h +++ b/DataFormats/GeometrySurface/interface/SOARotation.h @@ -100,6 +100,34 @@ class SOAFrame { uz += pz; } + constexpr inline void toGlobal(T cxx, T cxy, T cyy, T *gl) const { + auto const &r = rot; + gl[0] = r.xx() * (r.xx() * cxx + r.yx() * cxy) + r.yx() * (r.xx() * cxy + r.yx() * cyy); + gl[1] = r.xx() * (r.xy() * cxx + r.yy() * cxy) + r.yx() * (r.xy() * cxy + r.yy() * cyy); + gl[2] = r.xy() * (r.xy() * cxx + r.yy() * cxy) + r.yy() * (r.xy() * cxy + r.yy() * cyy); + gl[3] = r.xx() * (r.xz() * cxx + r.yz() * cxy) + r.yx() * (r.xz() * cxy + r.yz() * cyy); + gl[4] = r.xy() * (r.xz() * cxx + r.yz() * cxy) + r.yy() * (r.xz() * cxy + r.yz() * cyy); + gl[5] = r.xz() * (r.xz() * cxx + r.yz() * cxy) + r.yz() * (r.xz() * cxy + r.yz() * cyy); + } + + constexpr inline void toLocal(T const *ge, T &lxx, T &lxy, T &lyy) const { + auto const &r = rot; + + T cxx = ge[0]; + T cyx = ge[1]; + T cyy = ge[2]; + T czx = ge[3]; + T czy = ge[4]; + T czz = ge[5]; + + lxx = r.xx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.xy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.xz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lxy = r.yx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.yy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.yz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lyy = r.yx() * (r.yx() * cxx + r.yy() * cyx + r.yz() * czx) + + r.yy() * (r.yx() * cyx + r.yy() * cyy + r.yz() * czy) + r.yz() * (r.yx() * czx + r.yy() * czy + r.yz() * czz); + } + constexpr inline T x() const { return px; } constexpr inline T y() const { return py; } constexpr inline T z() const { return pz; } diff --git a/DataFormats/GeometrySurface/test/BuildFile.xml b/DataFormats/GeometrySurface/test/BuildFile.xml index 050cdb4c8f19d..5f4db224a639b 100644 --- a/DataFormats/GeometrySurface/test/BuildFile.xml +++ b/DataFormats/GeometrySurface/test/BuildFile.xml @@ -13,3 +13,16 @@ + + + + + + + + + + + + + diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu new file mode 100644 index 0000000000000..c24510146fb59 --- /dev/null +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu @@ -0,0 +1,40 @@ +#include +#include +#include + +#include "DataFormats/GeometrySurface/interface/SOARotation.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +__global__ void toGlobal(SOAFrame const* frame, + float const* xl, + float const* yl, + float* x, + float* y, + float* z, + float const* le, + float* ge, + uint32_t n) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= n) + return; + + frame[0].toGlobal(xl[i], yl[i], x[i], y[i], z[i]); + frame[0].toGlobal(le[3 * i], le[3 * i + 1], le[3 * i + 2], ge + 6 * i); +} + +void toGlobalWrapper(SOAFrame const* frame, + float const* xl, + float const* yl, + float* x, + float* y, + float* z, + float const* le, + float* ge, + uint32_t n) { + int threadsPerBlock = 256; + int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA toGlobal kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" + << std::endl; + + cms::cuda::launch(toGlobal, {blocksPerGrid, threadsPerBlock}, frame, xl, yl, x, y, z, le, ge, n); +} diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp 
b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp new file mode 100644 index 0000000000000..ad62b7a1d131c --- /dev/null +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "DataFormats/GeometrySurface/interface/GloballyPositioned.h" +#include "DataFormats/GeometrySurface/interface/SOARotation.h" +#include "DataFormats/GeometrySurface/interface/TkRotation.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +void toGlobalWrapper(SOAFrame const *frame, + float const *xl, + float const *yl, + float *x, + float *y, + float *z, + float const *le, + float *ge, + uint32_t n); + +int main(void) { + cms::cudatest::requireDevices(); + + typedef float T; + typedef TkRotation Rotation; + typedef SOARotation SRotation; + typedef GloballyPositioned Frame; + typedef SOAFrame SFrame; + typedef typename Frame::PositionType Position; + typedef typename Frame::GlobalVector GlobalVector; + typedef typename Frame::GlobalPoint GlobalPoint; + typedef typename Frame::LocalVector LocalVector; + typedef typename Frame::LocalPoint LocalPoint; + + constexpr uint32_t size = 10000; + constexpr uint32_t size32 = size * sizeof(float); + + float xl[size], yl[size]; + float x[size], y[size], z[size]; + + // errors + float le[3 * size]; + float ge[6 * size]; + + auto d_xl = cms::cuda::make_device_unique(size, nullptr); + auto d_yl = cms::cuda::make_device_unique(size, nullptr); + + auto d_x = cms::cuda::make_device_unique(size, nullptr); + auto d_y = cms::cuda::make_device_unique(size, nullptr); + auto d_z = cms::cuda::make_device_unique(size, nullptr); + + auto d_le = cms::cuda::make_device_unique(3 * size, nullptr); + auto d_ge = cms::cuda::make_device_unique(6 * size, nullptr); + + double a = 0.01; + double ca = std::cos(a); + double sa = std::sin(a); + + Rotation r1(ca, sa, 0, -sa, ca, 0, 0, 0, 1); + Frame f1(Position(2, 3, 4), r1); + std::cout << "f1.position() " << f1.position() << std::endl; + std::cout << "f1.rotation() " << '\n' << f1.rotation() << std::endl; + + SFrame sf1(f1.position().x(), f1.position().y(), f1.position().z(), f1.rotation()); + + auto d_sf = cms::cuda::make_device_unique(sizeof(SFrame), nullptr); + cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice)); + + for (auto i = 0U; i < size; ++i) { + xl[i] = yl[i] = 0.1f * float(i) - float(size / 2); + le[3 * i] = 0.01f; + le[3 * i + 2] = (i > size / 2) ? 
1.f : 0.04f; + le[2 * i + 1] = 0.; + } + std::random_shuffle(xl, xl + size); + std::random_shuffle(yl, yl + size); + + cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice)); + + toGlobalWrapper((SFrame const *)(d_sf.get()), + d_xl.get(), + d_yl.get(), + d_x.get(), + d_y.get(), + d_z.get(), + d_le.get(), + d_ge.get(), + size); + cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost)); + + float eps = 0.; + for (auto i = 0U; i < size; ++i) { + auto gp = f1.toGlobal(LocalPoint(xl[i], yl[i])); + eps = std::max(eps, std::abs(x[i] - gp.x())); + eps = std::max(eps, std::abs(y[i] - gp.y())); + eps = std::max(eps, std::abs(z[i] - gp.z())); + } + + std::cout << "max eps " << eps << std::endl; + + return 0; +} diff --git a/DataFormats/Math/BuildFile.xml b/DataFormats/Math/BuildFile.xml index 6aa1d86287860..83d06125a017c 100644 --- a/DataFormats/Math/BuildFile.xml +++ b/DataFormats/Math/BuildFile.xml @@ -1,6 +1,7 @@ - - + + + - + diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h new file mode 100644 index 0000000000000..2cb4105f86bae --- /dev/null +++ b/DataFormats/Math/interface/choleskyInversion.h @@ -0,0 +1,349 @@ +#ifndef DataFormat_Math_choleskyInversion_h +#define DataFormat_Math_choleskyInversion_h + +#include + +#include + +/** + * fully inlined specialized code to perform the inversion of a + * positive defined matrix of rank up to 6. 
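A minimal host-side sketch of how the new math::cholesky::invert entry point is meant to be used (assuming Eigen is available; the 3x3 values and the standalone main() are illustrative only and not part of this patch):

#include <iostream>
#include <Eigen/Core>
#include "DataFormats/Math/interface/choleskyInversion.h"

int main() {
  // a small symmetric positive-definite matrix
  Eigen::Matrix3d m;
  m << 4.0, 1.0, 0.5,
       1.0, 3.0, 0.2,
       0.5, 0.2, 2.0;

  Eigen::Matrix3d inv;
  // dispatches on ColsAtCompileTime == 3 to invert33 followed by symmetrize33
  math::cholesky::invert(m, inv);

  // residual should be at the level of double-precision rounding
  std::cout << (m * inv - Eigen::Matrix3d::Identity()).norm() << std::endl;
  return 0;
}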
+ * + * adapted from ROOT::Math::CholeskyDecomp + * originally by + * @author Manuel Schiller + * @date Aug 29 2008 + * + * + */ +namespace math { + namespace cholesky { + + template + inline constexpr void invert11(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + dst(0, 0) = F(1.0) / src(0, 0); + } + + template + inline constexpr void invert22(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0) * src(1, 0) * luc0; + auto luc2 = F(1.0) / (src(1, 1) - luc1); + + auto li21 = luc1 * luc0 * luc2; + + dst(0, 0) = li21 + luc0; + dst(1, 0) = -src(1, 0) * luc0 * luc2; + dst(1, 1) = luc2; + } + + template + inline constexpr void invert33(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); + luc5 = F(1.0) / luc5; + + auto li21 = -luc0 * luc1; + auto li32 = -(luc2 * luc4); + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + + dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; + dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; + dst(1, 1) = luc5 * li32 * li32 + luc2; + dst(2, 0) = luc5 * li31; + dst(2, 1) = luc5 * li32; + dst(2, 2) = luc5; + } + + template + inline constexpr void invert44(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + + dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc9 * li43 * li43 + luc5; + dst(3, 0) = luc9 * li41; + dst(3, 1) = luc9 * li42; + dst(3, 2) = luc9 * li43; + dst(3, 3) = luc9; + } + + template + inline constexpr void invert55(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * 
luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc14 * li54 * li54 + luc9; + dst(4, 0) = luc14 * li51; + dst(4, 1) = luc14 * li52; + dst(4, 2) = luc14 * li53; + dst(4, 3) = luc14 * li54; + dst(4, 4) = luc14; + } + + template + inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + auto luc15 = src(5, 0); + auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); + auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); + auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); + auto luc19 = + (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); + auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + + luc9 * luc18 * luc18 + luc14 * luc19 * luc19); + luc20 = F(1.0) / luc20; + + auto li21 = -luc1 * luc0; + auto li32 
= -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + auto li65 = -luc19 * luc14; + auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; + auto li63 = + (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; + auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - + luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + + luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * + luc2; + auto li61 = + (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + + luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + + luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - + luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - + luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - + luc19 * luc13 * luc6 * luc9 * luc14 + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + + luc19 * luc10 * luc14 - luc15) * + luc0; + + dst(0, 0) = luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; + dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; + dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; + dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; + dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; + dst(4, 4) = luc20 * li65 * li65 + luc14; + dst(5, 0) = luc20 * li61; + dst(5, 1) = luc20 * li62; + dst(5, 2) = luc20 * li63; + dst(5, 3) = luc20 * li64; + dst(5, 4) = luc20 * li65; + dst(5, 5) = luc20; + } + + template + inline constexpr void symmetrize11(M& dst) {} + + template + inline constexpr void symmetrize22(M& dst) { + dst(0, 1) = dst(1, 0); + } + + template + inline constexpr void symmetrize33(M& dst) { + symmetrize22(dst); + dst(0, 2) = dst(2, 0); + dst(1, 2) = dst(2, 1); + } + + template + inline constexpr void symmetrize44(M& dst) { + symmetrize33(dst); + dst(0, 3) = dst(3, 0); + dst(1, 3) = dst(3, 1); + 
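// Note: the invertNN routines above fill only the lower triangle of dst; these
// symmetrizeNN helpers mirror it into the upper triangle, so the Inverter::eval
// wrappers further below always hand back a fully populated symmetric matrix.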
dst(2, 3) = dst(3, 2); + } + + template + inline constexpr void symmetrize55(M& dst) { + symmetrize44(dst); + dst(0, 4) = dst(4, 0); + dst(1, 4) = dst(4, 1); + dst(2, 4) = dst(4, 2); + dst(3, 4) = dst(4, 3); + } + + template + inline constexpr void symmetrize66(M& dst) { + symmetrize55(dst); + dst(0, 5) = dst(5, 0); + dst(1, 5) = dst(5, 1); + dst(2, 5) = dst(5, 2); + dst(3, 5) = dst(5, 3); + dst(4, 5) = dst(5, 4); + } + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert22(src, dst); + symmetrize22(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert33(src, dst); + symmetrize33(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert44(src, dst); + symmetrize44(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert55(src, dst); + symmetrize55(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert66(src, dst); + symmetrize66(dst); + } + }; + + // Eigen interface + template + inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) { + using M1 = Eigen::DenseBase; + using M2 = Eigen::DenseBase; + Inverter::eval(src, dst); + } + + } // namespace cholesky +} // namespace math + +#endif // DataFormat_Math_choleskyInversion_h diff --git a/DataFormats/Math/test/BuildFile.xml b/DataFormats/Math/test/BuildFile.xml index 6b1112e30472c..5f2dd3854f6d1 100644 --- a/DataFormats/Math/test/BuildFile.xml +++ b/DataFormats/Math/test/BuildFile.xml @@ -1,27 +1,31 @@ - - - - + + + + - - - - + + + + - + + - + + - + + - + + @@ -29,75 +33,97 @@ - + - + + - + + - + + - + + - + + + + + - + - + - + + - + + - + + + + - + + + - - + + - - + + - - + + + + + + + + + diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp new file mode 100644 index 0000000000000..c5dea25231988 --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t.cpp @@ -0,0 +1,136 @@ +// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride >; + +// generate matrices +template +void genMatrix(M& m) { + using T = typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} + +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of 
dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? "SOA" : "AOS") + << " mode" << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + + constexpr unsigned int SIZE = 4 * 1024; + + alignas(128) MX mm[stride()]; // just storage in case of SOA + double* __restrict__ p = (double*)__builtin_assume_aligned(mm, 128); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto& m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); + } + else + for (auto& m : mm) { + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + delta -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep +#ifdef __clang__ +#pragma clang loop vectorize(enable) interleave(enable) +#endif + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + math::cholesky::invert(m, m); + } + else +#pragma GCC ivdep + for (auto& m : mm) { + math::cholesky::invert(m, m); + } + + delta += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + double DNNK = NKK; + std::cout << "x86 computation took " << std::chrono::duration_cast(delta).count() / DNNK + << ' ' << " ms" << std::endl; +} + +int main() { + go<2>(false); + go<3>(false); + go<4>(false); + go<5>(false); + go<6>(false); + + go<2>(true); + go<3>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu new file mode 100644 index 0000000000000..558f9296150c7 --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -0,0 +1,209 @@ +// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride>; + +template +__global__ void invertSOA(double *__restrict__ p, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + MapMX m(p + i); + math::cholesky::invert(m, m); +} + +template +__global__ void invert(M *mm, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto &m = mm[i]; + math::cholesky::invert(m, m); +} + +template +__global__ void invertSeq(M *mm, unsigned int n) { + if (threadIdx.x != 0) + return; + auto first = blockIdx.x * blockDim.x; + auto last = std::min(first + blockDim.x, n); + + for (auto i = first; i < last; ++i) { + auto &m = mm[i]; + math::cholesky::invert(m, m); + } +} + +// generate matrices +template +void genMatrix(M &m) { + using T 
= typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} + +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + auto delta1 = delta; + auto delta2 = delta; + + constexpr unsigned int SIZE = 4 * 1024; + + MX mm[stride()]; // just storage in case of SOA + double *__restrict__ p = (double *)(mm); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto &m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); + } + else + for (auto &m : mm) { + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + auto m_d = cms::cuda::make_device_unique(DIM * DIM * stride(), nullptr); + cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice)); + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + int threadsPerBlock = 128; + int blocksPerGrid = SIZE / threadsPerBlock; + + delta -= (std::chrono::high_resolution_clock::now() - start); + + if (soa) + cms::cuda::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); + else + cms::cuda::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); + + delta += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (!soa) { + delta1 -= (std::chrono::high_resolution_clock::now() - start); + +#ifndef DOPROF + cms::cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); +#endif + delta1 += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + } + + delta2 -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + math::cholesky::invert(m, m); + } + else +#pragma GCC ivdep + for (auto &m : mm) { + math::cholesky::invert(m, m); + } + + delta2 += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + double DNNK = NKK; + std::cout << "cuda/cudaSeq/x86 computation took " + << std::chrono::duration_cast(delta).count() / DNNK << ' ' + << std::chrono::duration_cast(delta1).count() / DNNK << ' ' + << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" + << std::endl; +} + +int main() { + cms::cudatest::requireDevices(); + + go<2>(false); + go<3>(false); + go<4>(false); + go<5>(false); 
+ go<6>(false); + + go<2>(true); + go<3>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu index ecc0be911c777..731447fe826e4 100644 --- a/DataFormats/Math/test/cudaAtan2Test.cu +++ b/DataFormats/Math/test/cudaAtan2Test.cu @@ -25,10 +25,13 @@ end #include #include #include - -#include "cuda/api_wrappers.h" +#include #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" constexpr float xmin = -100.001; // avoid 0 constexpr float incr = 0.04; @@ -62,15 +65,13 @@ void go() { auto start = std::chrono::high_resolution_clock::now(); auto delta = start - start; - auto current_device = cuda::device::current::get(); - // atan2 delta -= (std::chrono::high_resolution_clock::now() - start); - auto diff_d = cuda::memory::device::make_unique(current_device, 3); + auto diff_d = cms::cuda::make_device_unique(3, nullptr); int diffs[3]; - cuda::memory::device::zero(diff_d.get(), 3 * 4); + cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4)); // Launch the diff CUDA Kernel dim3 threadsPerBlock(32, 32, 1); @@ -79,9 +80,9 @@ void go() { std::cout << "CUDA kernel 'diff' launch with " << blocksPerGrid.x << " blocks of " << threadsPerBlock.y << " threads\n"; - cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); + cms::cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); - cuda::memory::copy(diffs, diff_d.get(), 3 * 4); + cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); float mdiff = diffs[0] * 1.e-7; @@ -95,26 +96,15 @@ void go() { } int main() { - int count = 0; - auto status = cudaGetDeviceCount(&count); - if (status != cudaSuccess) { - std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } - if (count == 0) { - std::cerr << "No CUDA devices on this system, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } + cms::cudatest::requireDevices(); try { go<3>(); go<5>(); go<7>(); go<9>(); - } catch (cuda::runtime_error &ex) { - std::cerr << "CUDA error: " << ex.what() << std::endl; + } catch (std::runtime_error &ex) { + std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl; exit(EXIT_FAILURE); } catch (...) 
{ std::cerr << "A non-CUDA error occurred" << std::endl; diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index 6aeaa0f2ededb..dd6576de46c1c 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -25,12 +25,7 @@ end #include #include #include - -#include "cuda/api_wrappers.h" - -#include -#include -#include +#include #ifdef __CUDACC__ #define inline __host__ __device__ inline @@ -40,6 +35,14 @@ end #include #endif +#include "DataFormats/Math/interface/approx_log.h" +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + std::mt19937 eng; std::mt19937 eng2; std::uniform_real_distribution rgen(0., 1.); @@ -85,8 +88,6 @@ void go() { auto start = std::chrono::high_resolution_clock::now(); auto delta = start - start; - auto current_device = cuda::device::current::get(); - int numElements = 200000; size_t size = numElements * sizeof(float); std::cout << "[Vector of " << numElements << " elements]\n"; @@ -100,12 +101,12 @@ void go() { std::generate(h_B.get(), h_B.get() + numElements, [&]() { return rgen(eng); }); delta -= (std::chrono::high_resolution_clock::now() - start); - auto d_A = cuda::memory::device::make_unique(current_device, numElements); - auto d_B = cuda::memory::device::make_unique(current_device, numElements); - auto d_C = cuda::memory::device::make_unique(current_device, numElements); + auto d_A = cms::cuda::make_device_unique(numElements, nullptr); + auto d_B = cms::cuda::make_device_unique(numElements, nullptr); + auto d_C = cms::cuda::make_device_unique(numElements, nullptr); - cuda::memory::copy(d_A.get(), h_A.get(), size); - cuda::memory::copy(d_B.get(), h_B.get(), size); + cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda alloc+copy took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -116,19 +117,21 @@ void go() { std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cms::cuda::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cms::cuda::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::memory::copy(h_C.get(), d_C.get(), size); + cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, 
cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda copy back took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -178,27 +181,15 @@ void go() { } int main() { - int count = 0; - auto status = cudaGetDeviceCount(&count); - if (status != cudaSuccess) { - std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } - if (count == 0) { - std::cerr << "No CUDA devices on this system, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } + cms::cudatest::requireDevices(); try { go(); go(); go(); - go(); - } catch (cuda::runtime_error &ex) { - std::cerr << "CUDA error: " << ex.what() << std::endl; + } catch (std::runtime_error &ex) { + std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl; exit(EXIT_FAILURE); } catch (...) { std::cerr << "A non-CUDA error occurred" << std::endl; diff --git a/DataFormats/Math/test/testAtan2.cpp b/DataFormats/Math/test/testAtan2.cpp index 207e69c848de9..11c951fc2e06b 100644 --- a/DataFormats/Math/test/testAtan2.cpp +++ b/DataFormats/Math/test/testAtan2.cpp @@ -195,6 +195,10 @@ void testIntPhi() { } int main() { + constexpr float p2i = ((int)(std::numeric_limits::max()) + 1) / M_PI; + constexpr float i2p = M_PI / ((int)(std::numeric_limits::max()) + 1); + std::cout << std::hexfloat << "p2i i2p " << p2i << " " << i2p << std::defaultfloat << std::endl; + std::cout << unsafe_atan2f<5>(0.f, 0.f) << " " << std::atan2(0., 0.) << std::endl; std::cout << unsafe_atan2f<5>(0.5f, 0.5f) << " " << std::atan2(0.5, 0.5) << std::endl; std::cout << unsafe_atan2f<5>(0.5f, -0.5f) << " " << std::atan2(0.5, -0.5) << std::endl; diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index 4a8e644a231f6..1e2de51dc0053 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1,6 +1,7 @@ + diff --git a/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h new file mode 100644 index 0000000000000..2e191ce551c42 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h @@ -0,0 +1,58 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h +#define HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cms { + namespace cuda { + + class AtomicPairCounter { + public: + using c_type = unsigned long long int; + + AtomicPairCounter() {} + AtomicPairCounter(c_type i) { counter.ac = i; } + + __device__ __host__ AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } + + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; + + union Atomic2 { + Counters counters; + c_type ac; + }; + + static constexpr c_type incr = 1UL << 32; + + __device__ __host__ Counters get() const { return counter.counters; } + + // increment n by 1 and m by i. 
return previous value + __host__ __device__ __forceinline__ Counters add(uint32_t i) { + c_type c = i; + c += incr; + Atomic2 ret; +#ifdef __CUDA_ARCH__ + ret.ac = atomicAdd(&counter.ac, c); +#else + ret.ac = counter.ac; + counter.ac += c; +#endif + return ret.counters; + } + + private: + Atomic2 counter; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h new file mode 100644 index 0000000000000..88118fcdac277 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -0,0 +1,340 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h +#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h + +#include +#ifndef __CUDA_ARCH__ +#include +#endif // __CUDA_ARCH__ +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" + +namespace cms { + namespace cuda { + + template + __global__ void countFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).count(v[i], ih); + } + } + + template + __global__ void fillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).fill(v[i], i, ih); + } + } + + template + inline void launchZero(Histo *__restrict__ h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); +#ifdef __CUDACC__ + cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream)); +#else + ::memset(off, 0, 4 * Histo::totbins()); +#endif + } + + template + inline void launchFinalize(Histo *__restrict__ h, + uint8_t *__restrict__ ws +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + , + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { +#ifdef __CUDACC__ + assert(ws); + uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + size_t wss = Histo::wsSize(); + assert(wss > 0); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); +#else + h->finalize(); +#endif + } + + template + inline void fillManyFromVector(Histo *__restrict__ h, + uint8_t *__restrict__ ws, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + int nthreads, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + launchZero(h, stream); +#ifdef __CUDACC__ + auto 
nblocks = (totSize + nthreads - 1) / nthreads; + countFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + launchFinalize(h, ws, stream); + fillFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); +#else + countFromVector(h, nh, v, offsets); + h->finalize(); + fillFromVector(h, nh, v, offsets); +#endif + } + + template + __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { + assoc->bulkFinalizeFill(*apc); + } + + // iteratate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + template + class HistoContainer { + public: + using Counter = uint32_t; + + using CountersOnly = HistoContainer; + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr uint32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr uint32_t capacity() { return SIZE; } + + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + + __host__ static size_t wsSize() { +#ifdef __CUDACC__ + uint32_t *v = nullptr; + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()); + return temp_storage_bytes; +#else + return 0; +#endif + } + + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } + + __host__ __device__ void zero() { + for (auto &i : off) + i = 0; + } + + __host__ __device__ void add(CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { +#ifdef __CUDA_ARCH__ + atomicAdd(off + i, co.off[i]); +#else + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; +#endif + } + } + + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicAdd(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a++; +#endif + } + + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicSub(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a--; +#endif + } + + __host__ __device__ __forceinline__ void countDirect(T b) { + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { + assert(b < nbins()); + auto w 
= atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! + off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void count(T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + index_type bins[capacity()]; + }; + + template + using OneToManyAssoc = HistoContainer; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h new file mode 100644 index 0000000000000..0e3caf72d1023 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h @@ -0,0 +1,141 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h +#define HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + 
--m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + __device__ inline T &back() { return m_data[m_size - 1]; } + + __device__ inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + // thread safe version of resize + __device__ int extend(int size = 1) { + auto previousSize = atomicAdd(&m_size, size); + if (previousSize < m_capacity) { + return previousSize; + } else { + atomicSub(&m_size, size); + return -1; + } + } + + __device__ int shrink(int size = 1) { + auto previousSize = atomicSub(&m_size, size); + if (previousSize >= size) { + return previousSize - size; + } else { + atomicAdd(&m_size, size); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/VecArray.h b/HeterogeneousCore/CUDAUtilities/interface/VecArray.h new file mode 100644 index 0000000000000..08388dc8d4b3e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/VecArray.h @@ -0,0 +1,106 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_VecArray_h +#define HeterogeneousCore_CUDAUtilities_interface_VecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] 
= element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_VecArray_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h new file mode 100644 index 0000000000000..593821fe805ed --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -0,0 +1,105 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h +#define HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h + +/* + * Everything you need to run cuda code in plain sequential c++ code + */ + +#ifndef __CUDACC__ + +#include +#include +#include + +#include + +namespace cms { + namespace cudacompat { + +#ifndef __CUDA_RUNTIME_H__ + struct dim3 { + uint32_t x, y, z; + }; +#endif + const dim3 threadIdx = {0, 0, 0}; + const dim3 blockDim = {1, 1, 1}; + + extern thread_local dim3 blockIdx; + extern thread_local dim3 gridDim; + + template + T1 atomicInc(T1* a, T2 b) { + auto ret = *a; + if ((*a) < T1(b)) + (*a)++; + return ret; + } + + template + T1 atomicAdd(T1* a, T2 b) { + auto ret = *a; + (*a) += b; + return ret; + } + + template + T1 atomicSub(T1* a, T2 b) { + auto ret = *a; + (*a) -= b; + return ret; + } + + template + T1 atomicMin(T1* a, T2 b) { + auto ret = *a; + *a = std::min(*a, T1(b)); + return ret; + } + template + T1 atomicMax(T1* a, T2 b) { + auto ret = *a; + *a = std::max(*a, T1(b)); + return ret; + } + + inline void __syncthreads() {} + inline void __threadfence() {} + 
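The no-op synchronisation shims and single-thread grid constants above are what let the __global__ kernels in this package compile as plain sequential host code. A minimal sketch of that mode (plain C++ compilation, no __CUDACC__; the countAbove kernel and the data are illustrative, and it assumes linking against the library that provides the thread_local blockIdx/gridDim definitions declared extern above):

#include <cassert>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

// with cudaCompat this becomes an inline host function (blockDim is fixed to {1,1,1})
__global__ void countAbove(int const* v, int n, int threshold, int* counter) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n && v[i] > threshold)
    atomicAdd(counter, 1);  // serial fallback from cms::cudacompat
}

int main() {
  int v[8] = {1, 5, 2, 7, 0, 9, 3, 4};
  int count = 0;
  // emulate a grid of 8 one-thread blocks by driving blockIdx/gridDim by hand
  gridDim.x = 8;
  for (blockIdx.x = 0; blockIdx.x < gridDim.x; ++blockIdx.x)
    countAbove(v, 8, 4, &count);
  resetGrid();            // restore blockIdx = {0,0,0}, gridDim = {1,1,1}
  assert(count == 3);     // 5, 7 and 9 are above the threshold
  return 0;
}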
inline bool __syncthreads_or(bool x) { return x; } + inline bool __syncthreads_and(bool x) { return x; } + template + inline T __ldg(T const* x) { + return *x; + } + + inline void resetGrid() { + blockIdx = {0, 0, 0}; + gridDim = {1, 1, 1}; + } + + } // namespace cudacompat +} // namespace cms + +// some not needed as done by cuda runtime... +#ifndef __CUDA_RUNTIME_H__ +#define __host__ +#define __device__ +#define __global__ +#define __shared__ +#define __forceinline__ +#endif + +// make sure function are inlined to avoid multiple definition +#ifndef __CUDA_ARCH__ +#undef __global__ +#define __global__ inline __attribute__((always_inline)) +#undef __forceinline__ +#define __forceinline__ inline __attribute__((always_inline)) +#endif + +#ifndef __CUDA_ARCH__ +using namespace cms::cudacompat; +#endif + +#endif + +#endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h b/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h new file mode 100644 index 0000000000000..89f131edd941e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h @@ -0,0 +1,63 @@ +#ifndef HeterogeneousCore_CUDAUtilities_cuda_cxx17_h +#define HeterogeneousCore_CUDAUtilities_cuda_cxx17_h + +#include + +// CUDA does not support C++17 yet, so we define here some of the missing library functions +#if __cplusplus <= 201402L + +namespace std { + + // from https://en.cppreference.com/w/cpp/iterator/size + template + constexpr auto size(const C& c) -> decltype(c.size()) { + return c.size(); + } + + template + constexpr std::size_t size(const T (&array)[N]) noexcept { + return N; + } + + // from https://en.cppreference.com/w/cpp/iterator/empty + template + constexpr auto empty(const C& c) -> decltype(c.empty()) { + return c.empty(); + } + + template + constexpr bool empty(const T (&array)[N]) noexcept { + return false; + } + + template + constexpr bool empty(std::initializer_list il) noexcept { + return il.size() == 0; + } + + // from https://en.cppreference.com/w/cpp/iterator/data + template + constexpr auto data(C& c) -> decltype(c.data()) { + return c.data(); + } + + template + constexpr auto data(const C& c) -> decltype(c.data()) { + return c.data(); + } + + template + constexpr T* data(T (&array)[N]) noexcept { + return array; + } + + template + constexpr const E* data(std::initializer_list il) noexcept { + return il.begin(); + } + +} // namespace std + +#endif + +#endif // HeterogeneousCore_CUDAUtilities_cuda_cxx17_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h new file mode 100644 index 0000000000000..4ff01d75ad02f --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h @@ -0,0 +1,70 @@ +#ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h +#define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h + +#include + +#include + +// reimplementation of std algorithms able to compile with CUDA and run on GPUs, +// mostly by declaringthem constexpr + +namespace cuda_std { + + template + struct less { + __host__ __device__ constexpr bool operator()(const T &lhs, const T &rhs) const { return lhs < rhs; } + }; + + template <> + struct less { + template + __host__ __device__ constexpr bool operator()(const T &lhs, const U &rhs) const { + return lhs < rhs; + } + }; + + template > + __host__ __device__ constexpr RandomIt lower_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) { + auto count = last - 
first; + + while (count > 0) { + auto it = first; + auto step = count / 2; + it += step; + if (comp(*it, value)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + + template > + __host__ __device__ constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) { + auto count = last - first; + + while (count > 0) { + auto it = first; + auto step = count / 2; + it += step; + if (!comp(value, *it)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + + template > + __host__ __device__ constexpr RandomIt binary_find(RandomIt first, RandomIt last, const T &value, Compare comp = {}) { + first = cuda_std::lower_bound(first, last, value, comp); + return first != last && !comp(value, *first) ? first : last; + } + +} // namespace cuda_std + +#endif // HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h b/HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h new file mode 100644 index 0000000000000..021402c93c203 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h @@ -0,0 +1,55 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h +#define HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h + +#include +#include +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace eigenSoA { + + constexpr bool isPowerOf2(int32_t v) { return v && !(v & (v - 1)); } + + template + class alignas(128) ScalarSoA { + public: + using Scalar = T; + + __host__ __device__ constexpr Scalar& operator()(int32_t i) { return data_[i]; } + __device__ constexpr const Scalar operator()(int32_t i) const { return __ldg(data_ + i); } + __host__ __device__ constexpr Scalar& operator[](int32_t i) { return data_[i]; } + __device__ constexpr const Scalar operator[](int32_t i) const { return __ldg(data_ + i); } + + __host__ __device__ constexpr Scalar* data() { return data_; } + __host__ __device__ constexpr Scalar const* data() const { return data_; } + + private: + Scalar data_[S]; + static_assert(isPowerOf2(S), "SoA stride not a power of 2"); + static_assert(sizeof(data_) % 128 == 0, "SoA size not a multiple of 128"); + }; + + template + class alignas(128) MatrixSoA { + public: + using Scalar = typename M::Scalar; + using Map = Eigen::Map >; + using CMap = Eigen::Map >; + + __host__ __device__ constexpr Map operator()(int32_t i) { return Map(data_ + i); } + __host__ __device__ constexpr CMap operator()(int32_t i) const { return CMap(data_ + i); } + __host__ __device__ constexpr Map operator[](int32_t i) { return Map(data_ + i); } + __host__ __device__ constexpr CMap operator[](int32_t i) const { return CMap(data_ + i); } + + private: + Scalar data_[S * M::RowsAtCompileTime * M::ColsAtCompileTime]; + static_assert(isPowerOf2(S), "SoA stride not a power of 2"); + static_assert(sizeof(data_) % 128 == 0, "SoA size not a multiple of 128"); + }; + +} // namespace eigenSoA + +#endif // HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h new file mode 100644 index 0000000000000..1cf554163f409 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -0,0 +1,174 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h +#define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h + +#include + +#include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#ifdef __CUDA_ARCH__ + +template +__device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { + // ci and co may be the same + auto x = ci[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, offset); + if (laneId >= offset) + x += y; + } + co[i] = x; +} + +template +__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) { + auto x = c[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, offset); + if (laneId >= offset) + x += y; + } + c[i] = x; +} + +#endif + +namespace cms { + namespace cuda { + + // limited to 32*32 elements.... + template + __host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci, + T* __restrict__ co, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif + ) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(ci, co, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = co[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + co[i] += ws[warpId - 1]; + } + __syncthreads(); +#else + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; +#endif + } + + // same as above, may remove + // limited to 32*32 elements.... + template + __host__ __device__ __forceinline__ void blockPrefixScan(T* c, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif + ) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(c, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + __syncthreads(); +#else + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; +#endif + } + + // limited to 1024*1024 elements.... + template + __global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { + __shared__ T ws[32]; + // first each block does a scan of size 1024; (better be enough blocks....) 
+ assert(1024 * gridDim.x >= size); + int off = 1024 * blockIdx.x; + if (size - off > 0) + blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); + + // count blocks that finished + __shared__ bool isLastBlockDone; + if (0 == threadIdx.x) { + auto value = atomicAdd(pc, 1); // block counter + isLastBlockDone = (value == (int(gridDim.x) - 1)); + } + + __syncthreads(); + + if (!isLastBlockDone) + return; + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block + __shared__ T psum[1024]; + for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { + auto j = 1024 * i + 1023; + psum[i] = (j < size) ? co[j] : T(0); + } + __syncthreads(); + blockPrefixScan(psum, psum, gridDim.x, ws); + + // now it would have been handy to have the other blocks around... + int first = threadIdx.x; // + blockDim.x * blockIdx.x + for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { + auto k = i / 1024; // block + co[i] += psum[k - 1]; + } + } + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h new file mode 100644 index 0000000000000..e4b2905782825 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -0,0 +1,277 @@ +#ifndef HeterogeneousCoreCUDAUtilities_radixSort_H +#define HeterogeneousCoreCUDAUtilities_radixSort_H + +#ifdef __CUDACC__ + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +template +__device__ inline void dummyReorder(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} + +template +__device__ inline void reorderSigned(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + int32_t first = threadIdx.x; + __shared__ uint32_t firstNeg; + firstNeg = a[ind[0]] < 0 ? 0 : size; + __syncthreads(); + + // find first negative + for (auto i = first; i < size - 1; i += blockDim.x) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + } + + __syncthreads(); + + auto ii = first; + for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + ii = size - firstNeg + threadIdx.x; + assert(ii >= 0); + for (auto i = first; i < firstNeg; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; +} + +template +__device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + int32_t first = threadIdx.x; + __shared__ uint32_t firstNeg; + firstNeg = a[ind[0]] < 0 ? 
0 : size; + __syncthreads(); + + // find first negative + for (auto i = first; i < size - 1; i += blockDim.x) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + } + + __syncthreads(); + + int ii = size - firstNeg - threadIdx.x - 1; + for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) { + ind2[ii] = ind[i]; + ii -= blockDim.x; + } + __syncthreads(); + ii = size - firstNeg + threadIdx.x; + assert(ii >= 0); + for (auto i = first; i < firstNeg; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; +} + +template +__device__ __forceinline__ void radixSortImpl( + T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { + constexpr int d = 8, w = 8 * sizeof(T); + constexpr int sb = 1 << d; + constexpr int ps = int(sizeof(T)) - NS; + + __shared__ int32_t c[sb], ct[sb], cu[sb]; + + __shared__ int ibs; + __shared__ int p; + + assert(size > 0); + assert(blockDim.x >= sb); + + // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; + + p = ps; + + auto j = ind; + auto k = ind2; + + int32_t first = threadIdx.x; + for (auto i = first; i < size; i += blockDim.x) + j[i] = i; + __syncthreads(); + + while (__syncthreads_and(p < w / d)) { + if (threadIdx.x < sb) + c[threadIdx.x] = 0; + __syncthreads(); + + // fill bins + for (auto i = first; i < size; i += blockDim.x) { + auto bin = (a[j[i]] >> d * p) & (sb - 1); + atomicAdd(&c[bin], 1); + } + __syncthreads(); + + // prefix scan "optimized"???... + if (threadIdx.x < sb) { + auto x = c[threadIdx.x]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(0xffffffff, x, offset); + if (laneId >= offset) + x += y; + } + ct[threadIdx.x] = x; + } + __syncthreads(); + if (threadIdx.x < sb) { + auto ss = (threadIdx.x / 32) * 32 - 1; + c[threadIdx.x] = ct[threadIdx.x]; + for (int i = ss; i > 0; i -= 32) + c[threadIdx.x] += ct[i]; + } + /* + //prefix scan for the nulls (for documentation) + if (threadIdx.x==0) + for (int i = 1; i < sb; ++i) c[i] += c[i-1]; + */ + + // broadcast + ibs = size - 1; + __syncthreads(); + while (__syncthreads_and(ibs > 0)) { + int i = ibs - threadIdx.x; + if (threadIdx.x < sb) { + cu[threadIdx.x] = -1; + ct[threadIdx.x] = -1; + } + __syncthreads(); + int32_t bin = -1; + if (threadIdx.x < sb) { + if (i >= 0) { + bin = (a[j[i]] >> d * p) & (sb - 1); + ct[threadIdx.x] = bin; + atomicMax(&cu[bin], int(i)); + } + } + __syncthreads(); + if (threadIdx.x < sb) { + if (i >= 0 && i == cu[bin]) // ensure to keep them in order + for (int ii = threadIdx.x; ii < sb; ++ii) + if (ct[ii] == bin) { + auto oi = ii - threadIdx.x; + // assert(i>=oi);if(i>=oi) + k[--c[bin]] = j[i - oi]; + } + } + __syncthreads(); + if (bin >= 0) + assert(c[bin] >= 0); + if (threadIdx.x == 0) + ibs -= sb; + __syncthreads(); + } + + /* + // broadcast for the nulls (for documentation) + if (threadIdx.x==0) + for (int i=size-first-1; i>=0; i--) { // =blockDim.x) { + auto bin = (a[j[i]] >> d*p)&(sb-1); + auto ik = atomicSub(&c[bin],1); + k[ik-1] = j[i]; + } + */ + + __syncthreads(); + assert(c[0] == 0); + + // swap (local, ok) + auto t = j; + j = k; + k = t; + + if (threadIdx.x == 0) + ++p; + __syncthreads(); + } + + if ((w != 8) && (0 == (NS & 1))) + assert(j == ind); // w/d is even so ind is correct + + if (j != ind) // odd... + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; + + __syncthreads(); + + // now move negative first... 
(if signed) + reorder(a, ind, ind2, size); +} + +template ::value, T>::type* = nullptr> +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(a, ind, ind2, size, dummyReorder); +} + +template ::value && std::is_signed::value, T>::type* = nullptr> +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(a, ind, ind2, size, reorderSigned); +} + +template ::value, T>::type* = nullptr> +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + using I = int; + radixSortImpl((I const*)(a), ind, ind2, size, reorderFloat); +} + +template +__device__ __forceinline__ void radixSortMulti(T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace) { + extern __shared__ uint16_t ws[]; + + auto a = v + offsets[blockIdx.x]; + auto ind = index + offsets[blockIdx.x]; + auto ind2 = nullptr == workspace ? ws : workspace + offsets[blockIdx.x]; + auto size = offsets[blockIdx.x + 1] - offsets[blockIdx.x]; + assert(offsets[blockIdx.x + 1] >= offsets[blockIdx.x]); + if (size > 0) + radixSort(a, ind, ind2, size); +} + +namespace cms { + namespace cuda { + + template + __global__ void __launch_bounds__(256, 4) + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + template + __global__ void radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + } // namespace cuda +} // namespace cms + +#endif // __CUDACC__ + +#endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc new file mode 100644 index 0000000000000..7b8efda8e3811 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc @@ -0,0 +1,17 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cms { + namespace cudacompat { + thread_local dim3 blockIdx; + thread_local dim3 gridDim; + } // namespace cudacompat +} // namespace cms + +namespace { + struct InitGrid { + InitGrid() { cms::cudacompat::resetGrid(); } + }; + + const InitGrid initGrid; + +} // namespace diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu new file mode 100644 index 0000000000000..e671c5c1c2ae8 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu @@ -0,0 +1,69 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +using namespace cms::cuda; + +__global__ void update(AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto m = i % 11; + m = m % 6 + 1; // max 6, no 0 + auto c = dc->add(m); + assert(c.m < n); + ind[c.m] = c.n; + for (int j = c.n; j < c.n + m; ++j) + cont[j] = i; +}; + +__global__ void finalize(AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { + assert(dc->get().m == n); + ind[n] = dc->get().n; +} + +__global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_t const 
*cont, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + assert(0 == ind[0]); + assert(dc->get().m == n); + assert(ind[n] == dc->get().n); + auto ib = ind[i]; + auto ie = ind[i + 1]; + auto k = cont[ib++]; + assert(k < n); + for (; ib < ie; ++ib) + assert(cont[ib] == k); +} + +#include +int main() { + cms::cudatest::requireDevices(); + + AtomicPairCounter *dc_d; + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + + std::cout << "size " << sizeof(AtomicPairCounter) << std::endl; + + constexpr uint32_t N = 20000; + constexpr uint32_t M = N * 6; + uint32_t *n_d, *m_d; + cudaCheck(cudaMalloc(&n_d, N * sizeof(int))); + // cudaMemset(n_d, 0, N*sizeof(int)); + cudaCheck(cudaMalloc(&m_d, M * sizeof(int))); + + update<<<2000, 512>>>(dc_d, n_d, m_d, 10000); + finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000); + verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000); + + AtomicPairCounter dc; + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); + + std::cout << dc.get().n << ' ' << dc.get().m << std::endl; + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index ae6835126f9cf..5455860900400 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -8,7 +8,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp new file mode 100644 index 0000000000000..577e9bbb69b57 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +using namespace cms::cuda; + +template +void go() { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + + constexpr int N = 12000; + T v[N]; + + using Hist = HistoContainer; + using Hist4 = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' + << Hist4::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + for (auto nh = 0; nh < 4; ++nh) + std::cout << "bins " << int(Hist4::bin(0)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin)) + Hist4::histOff(nh) + << ' ' << int(Hist::bin(rmax)) + Hist4::histOff(nh) << std::endl; + + Hist h; + Hist4 h4; + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + h.zero(); + h4.zero(); + assert(h.size() == 0); + assert(h4.size() == 0); + for (long long j = 0; j < N; j++) { + h.count(v[j]); + if (j < 2000) + h4.count(v[j], 2); + else + h4.count(v[j], j % 4); + } + 
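+    // count() only tallies entries per bin: size() stays 0 until finalize() turns the
+    // per-bin counts into cumulative offsets, and only then can fill() store the indices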
assert(h.size() == 0); + assert(h4.size() == 0); + h.finalize(); + h4.finalize(); + assert(h.size() == N); + assert(h4.size() == N); + for (long long j = 0; j < N; j++) { + h.fill(v[j], j); + if (j < 2000) + h4.fill(v[j], j, 2); + else + h4.fill(v[j], j, j % 4); + } + assert(h.off[0] == 0); + assert(h4.off[0] == 0); + assert(h.size() == N); + assert(h4.size() == N); + + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { + assert((int32_t)t1 < N); + assert((int32_t)t2 < N); + if (i != j && T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + if (0 == h.size(i)) + continue; + auto k = *h.begin(i); + assert(k < N); + auto kl = NBINS != 128 ? h.bin(std::max(rmin, v[k] - DELTA)) : h.bin(v[k] - T(DELTA)); + auto kh = NBINS != 128 ? h.bin(std::min(rmax, v[k] + DELTA)) : h.bin(v[k] + T(DELTA)); + if (NBINS == 128) { + assert(kl != i); + assert(kh != i); + } + if (NBINS != 128) { + assert(kl <= i); + assert(kh >= i); + } + // std::cout << kl << ' ' << kh << std::endl; + for (auto j = h.begin(kl); j < h.end(kl); ++j) + verify(i, kl, k, k, (*j)); + for (auto j = h.begin(kh); j < h.end(kh); ++j) + verify(i, kh, k, (*j), k); + } + } + + for (long long j = 0; j < N; j++) { + auto b0 = h.bin(v[j]); + int w = 0; + int tot = 0; + auto ftest = [&](int k) { + assert(k >= 0 && k < N); + tot++; + }; + forEachInBins(h, v[j], w, ftest); + int rtot = h.end(b0) - h.begin(b0); + assert(tot == rtot); + w = 1; + tot = 0; + forEachInBins(h, v[j], w, ftest); + int bp = b0 + 1; + int bm = b0 - 1; + if (bp < int(h.nbins())) + rtot += h.end(bp) - h.begin(bp); + if (bm >= 0) + rtot += h.end(bm) - h.begin(bm); + assert(tot == rtot); + w = 2; + tot = 0; + forEachInBins(h, v[j], w, ftest); + bp++; + bm--; + if (bp < int(h.nbins())) + rtot += h.end(bp) - h.begin(bp); + if (bm >= 0) + rtot += h.end(bm) - h.begin(bm); + assert(tot == rtot); + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu new file mode 100644 index 0000000000000..cf512521e1f4d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -0,0 +1,156 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +using namespace cms::cuda; + +template +void go() { + std::mt19937 eng; + std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + + constexpr int N = 12000; + T v[N]; + auto v_d = make_device_unique(N, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + constexpr uint32_t nParts = 10; + constexpr uint32_t partSize = N / nParts; + uint32_t offsets[nParts + 1]; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::capacity() << ' ' << Hist::wsSize() << ' ' + << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + + Hist h; + auto h_d = make_device_unique(1, nullptr); + auto ws_d = make_device_unique(Hist::wsSize(), 
nullptr); + + auto off_d = make_device_unique(nParts + 1, nullptr); + + for (int it = 0; it < 5; ++it) { + offsets[0] = 0; + for (uint32_t j = 1; j < nParts + 1; ++j) { + offsets[j] = offsets[j - 1] + partSize - 3 * j; + assert(offsets[j] <= N); + } + + if (it == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); + + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + + if (it == 2) { // big bin + for (long long j = 1000; j < 2000; j++) + v[j] = sizeof(T) == 1 ? 22 : 3456; + } + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); + assert(0 == h.off[0]); + assert(offsets[10] == h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + assert(t1 < N); + assert(t2 < N); + if (T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + + // make sure it spans 3 bins... + auto window = T(1300); + + for (uint32_t j = 0; j < nParts; ++j) { + auto off = Hist::histOff(j); + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + auto ii = i + off; + if (0 == h.size(ii)) + continue; + auto k = *h.begin(ii); + if (j % 2) + k = *(h.begin(ii) + (h.end(ii) - h.begin(ii)) / 2); + auto bk = h.bin(v[k]); + assert(bk == i); + assert(k < offsets[j + 1]); + auto kl = h.bin(v[k] - window); + auto kh = h.bin(v[k] + window); + assert(kl != i); + assert(kh != i); + // std::cout << kl << ' ' << kh << std::endl; + + auto me = v[k]; + auto tot = 0; + auto nm = 0; + bool l = true; + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nm += h.size(kk + off); + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) { + if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) { + } else { + ++tot; + } + } + if (kk == i) { + l = false; + continue; + } + if (l) + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, k, (*p)); + else + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, (*p), k); + } + if (!(tot >= nm)) { + std::cout << "too bad " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + } + if (l) + std::cout << "what? 
" << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + assert(!l); + } + } + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu new file mode 100644 index 0000000000000..f08c768b43fa5 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +using namespace cms::cuda; + +template +__global__ void mykernel(T const* __restrict__ v, uint32_t N) { + assert(v); + assert(N == 12000); + + if (threadIdx.x == 0) + printf("start kernel for %d data\n", N); + + using Hist = HistoContainer; + + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[32]; + + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + for (auto j = threadIdx.x; j < N; j += blockDim.x) + hist.count(v[j]); + __syncthreads(); + + assert(0 == hist.size()); + __syncthreads(); + + hist.finalize(ws); + __syncthreads(); + + assert(N == hist.size()); + for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) + assert(hist.off[j] <= hist.off[j + 1]); + __syncthreads(); + + if (threadIdx.x < 32) + ws[threadIdx.x] = 0; // used by prefix scan... + __syncthreads(); + + for (auto j = threadIdx.x; j < N; j += blockDim.x) + hist.fill(v[j], j); + __syncthreads(); + assert(0 == hist.off[0]); + assert(N == hist.size()); + + for (auto j = threadIdx.x; j < hist.size() - 1; j += blockDim.x) { + auto p = hist.begin() + j; + assert((*p) < N); + auto k1 = Hist::bin(v[*p]); + auto k2 = Hist::bin(v[*(p + 1)]); + assert(k2 >= k1); + } + + for (auto i = threadIdx.x; i < hist.size(); i += blockDim.x) { + auto p = hist.begin() + i; + auto j = *p; + auto b0 = Hist::bin(v[j]); + int tot = 0; + auto ftest = [&](int k) { + assert(k >= 0 && k < N); + ++tot; + }; + forEachInWindow(hist, v[j], v[j], ftest); + int rtot = hist.size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + forEachInWindow(hist, vm, vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); + assert(tot == rtot); + } +} + +template +void go() { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + + constexpr int N = 12000; + T v[N]; + + auto v_d = make_device_unique(N, nullptr); + assert(v_d.get()); + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' + << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + + assert(v_d.get()); + assert(v); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + assert(v_d.get()); + launch(mykernel, {1, 256}, v_d.get(), N); + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp new file mode 100644 index 0000000000000..3d452e851ba12 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp @@ -0,0 +1 @@ +#include "OneToManyAssoc_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu new file mode 100644 index 0000000000000..3d452e851ba12 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu @@ -0,0 +1 @@ +#include "OneToManyAssoc_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h new file mode 100644 index 0000000000000..9f6631a70b214 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -0,0 +1,306 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#endif + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +using cms::cuda::AtomicPairCounter; + +constexpr uint32_t MaxElem = 64000; +constexpr uint32_t MaxTk = 8000; +constexpr uint32_t MaxAssocs = 4 * MaxTk; + +using Assoc = cms::cuda::OneToManyAssoc; +using SmallAssoc = cms::cuda::OneToManyAssoc; +using Multiplicity = cms::cuda::OneToManyAssoc; +using TK = std::array; + +__global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) { + __shared__ Multiplicity::CountersOnly local; + if (threadIdx.x == 0) + local.zero(); + __syncthreads(); + local.countDirect(2 + i % 4); + __syncthreads(); + if (threadIdx.x == 0) + assoc->add(local); + } +} + +__global__ void 
countMulti(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) + assoc->countDirect(2 + i % 4); +} + +__global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) { + auto first = blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < Multiplicity::totbins(); i += gridDim.x * blockDim.x) + assert(m1->off[i] == m2->off[i]); +} + +__global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->countDirect(tk[k][j]); + } +} + +__global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->fillDirect(tk[k][j], k); + } +} + +__global__ void verify(Assoc* __restrict__ assoc) { assert(assoc->size() < Assoc::capacity()); } + +template +__global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int k = first; k < n; k += gridDim.x * blockDim.x) { + auto m = tk[k][3] < MaxElem ? 4 : 3; + assoc->bulkFill(*apc, &tk[k][0], m); + } +} + +template +__global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) { + if (apc->get().m >= Assoc::nbins()) + printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); + assert(assoc->size() < Assoc::capacity()); +} + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); + auto current_device = cms::cuda::currentDevice(); +#else + // make sure cuda emulation is working + std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl; + std::cout << "cuda y's " << threadIdx.y << ' ' << blockIdx.y << ' ' << blockDim.y << ' ' << gridDim.y << std::endl; + std::cout << "cuda z's " << threadIdx.z << ' ' << blockIdx.z << ' ' << blockDim.z << ' ' << gridDim.z << std::endl; + assert(threadIdx.x == 0); + assert(threadIdx.y == 0); + assert(threadIdx.z == 0); + assert(blockIdx.x == 0); + assert(blockIdx.y == 0); + assert(blockIdx.z == 0); + assert(blockDim.x == 1); + assert(blockDim.y == 1); + assert(blockDim.z == 1); + assert(gridDim.x == 1); + assert(gridDim.y == 1); + assert(gridDim.z == 1); +#endif + + std::cout << "OneToManyAssoc " << Assoc::nbins() << ' ' << Assoc::capacity() << ' ' << Assoc::wsSize() << std::endl; + std::cout << "OneToManyAssoc (small) " << SmallAssoc::nbins() << ' ' << SmallAssoc::capacity() << ' ' + << SmallAssoc::wsSize() << std::endl; + + std::mt19937 eng; + + std::geometric_distribution rdm(0.8); + + constexpr uint32_t N = 4000; + + std::vector> tr(N); + + // fill with "index" to element + long long ave = 0; + int imax = 0; + auto n = 0U; + auto z = 0U; + auto nz = 0U; + for (auto i = 0U; i < 4U; ++i) { + auto j = 0U; + while (j < N && n < MaxElem) { + if (z == 11) { + ++n; + z = 0; + ++nz; + continue; + } // a bit of not assoc + auto x = rdm(eng); + auto k = std::min(j + x + 1, N); + if (i == 3 && z == 3) { // some triplets time to time + for (; j < k; ++j) + 
tr[j][i] = MaxElem + 1; + } else { + ave += x + 1; + imax = std::max(imax, x); + for (; j < k; ++j) + tr[j][i] = n; + ++n; + } + ++z; + } + assert(n <= MaxElem); + assert(j <= N); + } + std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + +#ifdef __CUDACC__ + auto v_d = cms::cuda::make_device_unique[]>(N, nullptr); + assert(v_d.get()); + auto a_d = cms::cuda::make_device_unique(1, nullptr); + auto sa_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(Assoc::wsSize(), nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); +#else + auto a_d = std::make_unique(); + auto sa_d = std::make_unique(); + auto v_d = tr.data(); +#endif + + launchZero(a_d.get(), 0); + +#ifdef __CUDACC__ + auto nThreads = 256; + auto nBlocks = (4 * N + nThreads - 1) / nThreads; + + count<<>>(v_d.get(), a_d.get(), N); + + launchFinalize(a_d.get(), ws_d.get(), 0); + verify<<<1, 1>>>(a_d.get()); + fill<<>>(v_d.get(), a_d.get(), N); +#else + count(v_d, a_d.get(), N); + launchFinalize(a_d.get()); + verify(a_d.get()); + fill(v_d, a_d.get(), N); +#endif + + Assoc la; + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); +#else + memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier +#endif + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + z = 0; + for (auto i = 0U; i < n; ++i) { + auto x = la.size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la.size(n)); + std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; + + // now the inverse map (actually this is the direct....) 
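+  // fillBulk() stores each track's 3 or 4 hit indices contiguously, using the
+  // AtomicPairCounter to hand out a bin slot together with the storage offset;
+  // verifyBulk() then checks that neither the bins nor the capacity overflowed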
+ AtomicPairCounter* dc_d; + AtomicPairCounter dc(0); + +#ifdef __CUDACC__ + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + nBlocks = (N + nThreads - 1) / nThreads; + fillBulk<<>>(dc_d, v_d.get(), a_d.get(), N); + finalizeBulk<<>>(dc_d, a_d.get()); + verifyBulk<<<1, 1>>>(a_d.get(), dc_d); + + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); + + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); + finalizeBulk<<>>(dc_d, sa_d.get()); + verifyBulk<<<1, 1>>>(sa_d.get(), dc_d); + +#else + dc_d = &dc; + fillBulk(dc_d, v_d, a_d.get(), N); + finalizeBulk(dc_d, a_d.get()); + verifyBulk(a_d.get(), dc_d); + memcpy(&la, a_d.get(), sizeof(Assoc)); + + AtomicPairCounter sdc(0); + fillBulk(&sdc, v_d, sa_d.get(), N); + finalizeBulk(&sdc, sa_d.get()); + verifyBulk(sa_d.get(), &sdc); + +#endif + + std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl; + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + for (auto i = 0U; i < N; ++i) { + auto x = la.size(i); + if (!(x == 4 || x == 3)) + std::cout << i << ' ' << x << std::endl; + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la.size(N)); + std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; + + // here verify use of block local counters +#ifdef __CUDACC__ + auto m1_d = cms::cuda::make_device_unique(1, nullptr); + auto m2_d = cms::cuda::make_device_unique(1, nullptr); +#else + auto m1_d = std::make_unique(); + auto m2_d = std::make_unique(); +#endif + launchZero(m1_d.get(), 0); + launchZero(m2_d.get(), 0); + +#ifdef __CUDACC__ + nBlocks = (4 * N + nThreads - 1) / nThreads; + countMulti<<>>(v_d.get(), m1_d.get(), N); + countMultiLocal<<>>(v_d.get(), m2_d.get(), N); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + launchFinalize(m1_d.get(), ws_d.get(), 0); + launchFinalize(m2_d.get(), ws_d.get(), 0); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); +#else + countMulti(v_d, m1_d.get(), N); + countMultiLocal(v_d, m2_d.get(), N); + verifyMulti(m1_d.get(), m2_d.get()); + + launchFinalize(m1_d.get()); + launchFinalize(m2_d.get()); + verifyMulti(m1_d.get(), m2_d.get()); +#endif + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp new file mode 100644 index 0000000000000..f453041b85b38 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp @@ -0,0 +1,34 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include +#include +#include +#include + +void testBinaryFind() { + std::vector data = {1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6}; + + auto lower = cuda_std::lower_bound(data.begin(), data.end(), 4); + auto upper = cuda_std::upper_bound(data.begin(), data.end(), 4); + + std::copy(lower, upper, std::ostream_iterator(std::cout, " ")); + + std::cout << '\n'; + + // classic binary search, returning a value only if it is present + + data = {1, 2, 4, 6, 9, 10}; + + auto test = [&](auto v) { + auto it = cuda_std::binary_find(data.cbegin(), data.cend(), v); + + if (it != data.cend()) + std::cout << *it << " found at index " << std::distance(data.cbegin(), 
it) << std::endl; + else + std::cout << v << " non found" << std::endl; + }; + + test(4); + test(5); +} + +int main() { testBinaryFind(); } diff --git a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu new file mode 100644 index 0000000000000..7b8ed6219392e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu @@ -0,0 +1,30 @@ +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +__global__ void testBinaryFind() { + int data[] = {1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6}; + + auto lower = cuda_std::lower_bound(data, data + 13, 4); + auto upper = cuda_std::upper_bound(data, data + 12, 4); + + assert(3 == upper - lower); + + // classic binary search, returning a value only if it is present + + constexpr int data2[] = {1, 2, 4, 6, 9, 10}; + + assert(data2 + 2 == cuda_std::binary_find(data2, data2 + 6, 4)); + assert(data2 + 6 == cuda_std::binary_find(data2, data2 + 6, 5)); +} + +void wrapper() { cms::cuda::launch(testBinaryFind, {32, 64}); } + +int main() { + cms::cudatest::requireDevices(); + + wrapper(); +} diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp new file mode 100644 index 0000000000000..c5200e2a87bee --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp @@ -0,0 +1 @@ +#include "eigenSoA_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu new file mode 100644 index 0000000000000..c5200e2a87bee --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu @@ -0,0 +1 @@ +#include "eigenSoA_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h new file mode 100644 index 0000000000000..b19cfc55f919d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h @@ -0,0 +1,101 @@ +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" + +template +struct MySoA { + // we can find a way to avoid this copy/paste??? + static constexpr int32_t stride() { return S; } + + eigenSoA::ScalarSoA a; + eigenSoA::ScalarSoA b; +}; + +using V = MySoA<128>; + +__global__ void testBasicSoA(float* p) { + using namespace eigenSoA; + + assert(!isPowerOf2(0)); + assert(isPowerOf2(1)); + assert(isPowerOf2(1024)); + assert(!isPowerOf2(1026)); + + using M3 = Eigen::Matrix; + + __shared__ eigenSoA::MatrixSoA m; + + int first = threadIdx.x + blockIdx.x * blockDim.x; + if (0 == first) + printf("before %f\n", p[0]); + + // a silly game... 
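+  // the kernel puts three of the random inputs on the diagonal of each 3x3 matrix held
+  // in the shared MatrixSoA, inverts the matrices in place, and writes the diagonals
+  // back in reverse order, so every output is the reciprocal of one of the inputs and
+  // must end up greater than 1 (checked on the host after the copy back)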
+ int n = 64; + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + m[i].setZero(); + m[i](0, 0) = p[i]; + m[i](1, 1) = p[i + 64]; + m[i](2, 2) = p[i + 64 * 2]; + } + __syncthreads(); // not needed + + for (int i = first; i < n; i += blockDim.x * gridDim.x) + m[i] = m[i].inverse().eval(); + __syncthreads(); + + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + p[i] = m[63 - i](0, 0); + p[i + 64] = m[63 - i](1, 1); + p[i + 64 * 2] = m[63 - i](2, 2); + } + + if (0 == first) + printf("after %f\n", p[0]); +} + +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#endif + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); +#endif + + float p[1024]; + + std::uniform_real_distribution rgen(0.01, 0.99); + std::mt19937 eng; + + for (auto& r : p) + r = rgen(eng); + for (int i = 0, n = 64 * 3; i < n; ++i) + assert(p[i] > 0 && p[i] < 1.); + + std::cout << p[0] << std::endl; +#ifdef __CUDACC__ + float* p_d; + cudaCheck(cudaMalloc(&p_d, 1024 * 4)); + cudaCheck(cudaMemcpy(p_d, p, 1024 * 4, cudaMemcpyDefault)); + testBasicSoA<<<1, 1024>>>(p_d); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(p, p_d, 1024 * 4, cudaMemcpyDefault)); + cudaCheck(cudaDeviceSynchronize()); +#else + testBasicSoA(p); +#endif + + std::cout << p[0] << std::endl; + + for (int i = 0, n = 64 * 3; i < n; ++i) + assert(p[i] > 1.); + + std::cout << "END" << std::endl; + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu new file mode 100644 index 0000000000000..01376c668ef6a --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -0,0 +1,167 @@ +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +using namespace cms::cuda; + +template +struct format_traits { +public: + static const constexpr char *failed_msg = "failed %d %d %d: %d %d\n"; +}; + +template <> +struct format_traits { +public: + static const constexpr char *failed_msg = "failed %d %d %d: %f %f\n"; +}; + +template +__global__ void testPrefixScan(uint32_t size) { + __shared__ T ws[32]; + __shared__ T c[1024]; + __shared__ T co[1024]; + + auto first = threadIdx.x; + for (auto i = first; i < size; i += blockDim.x) + c[i] = 1; + __syncthreads(); + + blockPrefixScan(c, co, size, ws); + blockPrefixScan(c, size, ws); + + assert(1 == c[0]); + assert(1 == co[0]); + for (auto i = first + 1; i < size; i += blockDim.x) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDim.x, c[i], c[i - 1]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); + } +} + +template +__global__ void testWarpPrefixScan(uint32_t size) { + assert(size <= 32); + __shared__ T c[1024]; + __shared__ T co[1024]; + auto i = threadIdx.x; + c[i] = 1; + __syncthreads(); + + warpPrefixScan(c, co, i, 0xffffffff); + warpPrefixScan(c, i, 0xffffffff); + __syncthreads(); + + assert(1 == c[0]); + assert(1 == co[0]); + if (i != 0) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDim.x, c[i], c[i - 1]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); + } +} + +__global__ void init(uint32_t *v, uint32_t val, uint32_t n) { + auto i = blockIdx.x * 
blockDim.x + threadIdx.x; + if (i < n) + v[i] = val; + if (i == 0) + printf("init\n"); +} + +__global__ void verify(uint32_t const *v, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + assert(v[i] == i + 1); + if (i == 0) + printf("verify\n"); +} + +int main() { + cms::cudatest::requireDevices(); + + std::cout << "warp level" << std::endl; + // std::cout << "warp 32" << std::endl; + testWarpPrefixScan<<<1, 32>>>(32); + cudaDeviceSynchronize(); + // std::cout << "warp 16" << std::endl; + testWarpPrefixScan<<<1, 32>>>(16); + cudaDeviceSynchronize(); + // std::cout << "warp 5" << std::endl; + testWarpPrefixScan<<<1, 32>>>(5); + cudaDeviceSynchronize(); + + std::cout << "block level" << std::endl; + for (int bs = 32; bs <= 1024; bs += 32) { + // std::cout << "bs " << bs << std::endl; + for (int j = 1; j <= 1024; ++j) { + // std::cout << j << std::endl; + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + } + } + cudaDeviceSynchronize(); + + int num_items = 200; + for (int ksize = 1; ksize < 4; ++ksize) { + // test multiblock + std::cout << "multiblok" << std::endl; + // Declare, allocate, and initialize device-accessible pointers for input and output + num_items *= 10; + uint32_t *d_in; + uint32_t *d_out1; + uint32_t *d_out2; + + cudaCheck(cudaMalloc(&d_in, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out1, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out2, num_items * sizeof(uint32_t))); + + auto nthreads = 256; + auto nblocks = (num_items + nthreads - 1) / nthreads; + + init<<>>(d_in, 1, num_items); + + // the block counter + int32_t *d_pc; + cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t))); + cudaCheck(cudaMemset(d_pc, 0, 4)); + + nthreads = 1024; + nblocks = (num_items + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>(d_in, d_out1, num_items, d_pc); + verify<<>>(d_out1, num_items); + cudaDeviceSynchronize(); + + // test cub + std::cout << "cub" << std::endl; + // Determine temporary device storage requirements for inclusive prefix sum + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items); + + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + // Allocate temporary storage for inclusive prefix sum + // fake larger ws already available + temp_storage_bytes *= 8; + cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + // Run inclusive prefix sum + CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items)); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + verify<<>>(d_out2, num_items); + cudaDeviceSynchronize(); + } // ksize + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu new file mode 100644 index 0000000000000..e96a9d8eae83b --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include 
"HeterogeneousCore/CUDAUtilities/interface/radixSort.h" + +using namespace cms::cuda; + +template +struct RS { + using type = std::uniform_int_distribution; + static auto ud() { return type(std::numeric_limits::min(), std::numeric_limits::max()); } + static constexpr T imax = std::numeric_limits::max(); +}; + +template <> +struct RS { + using T = float; + using type = std::uniform_real_distribution; + static auto ud() { return type(-std::numeric_limits::max() / 2, std::numeric_limits::max() / 2); } + // static auto ud() { return type(0,std::numeric_limits::max()/2);} + static constexpr int imax = std::numeric_limits::max(); +}; + +template +void go(bool useShared) { + std::mt19937 eng; + //std::mt19937 eng2; + auto rgen = RS::ud(); + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + + constexpr int blocks = 10; + constexpr int blockSize = 256 * 32; + constexpr int N = blockSize * blocks; + T v[N]; + uint16_t ind[N]; + + constexpr bool sgn = T(-1) < T(0); + std::cout << "Will sort " << N << (sgn ? " signed" : " unsigned") + << (std::numeric_limits::is_integer ? " 'ints'" : " 'float'") << " of size " << sizeof(T) << " using " + << NS << " significant bytes" << std::endl; + + for (int i = 0; i < 50; ++i) { + if (i == 49) { + for (long long j = 0; j < N; j++) + v[j] = 0; + } else if (i > 30) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + } else { + uint64_t imax = (i < 15) ? uint64_t(RS::imax) + 1LL : 255; + for (uint64_t j = 0; j < N; j++) { + v[j] = (j % imax); + if (j % 2 && i % 2) + v[j] = -v[j]; + } + } + + uint32_t offsets[blocks + 1]; + offsets[0] = 0; + for (int j = 1; j < blocks + 1; ++j) { + offsets[j] = offsets[j - 1] + blockSize - 3 * j; + assert(offsets[j] <= N); + } + + if (i == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + std::random_shuffle(v, v + N); + + auto v_d = cms::cuda::make_device_unique(N, nullptr); + auto ind_d = cms::cuda::make_device_unique(N, nullptr); + auto ws_d = cms::cuda::make_device_unique(N, nullptr); + auto off_d = cms::cuda::make_device_unique(blocks + 1, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); + + if (i < 2) + std::cout << "lauch for " << offsets[blocks] << std::endl; + + auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 
256 : 256; + + delta -= (std::chrono::high_resolution_clock::now() - start); + constexpr int MaxSize = 256 * 32; + if (useShared) + cms::cuda::launch( + radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + else + cms::cuda::launch( + radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); + + if (i == 0) + std::cout << "done for " << offsets[blocks] << std::endl; + + cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); + + delta += (std::chrono::high_resolution_clock::now() - start); + + if (i == 0) + std::cout << "done for " << offsets[blocks] << std::endl; + + if (32 == i) { + std::cout << LL(v[ind[0]]) << ' ' << LL(v[ind[1]]) << ' ' << LL(v[ind[2]]) << std::endl; + std::cout << LL(v[ind[3]]) << ' ' << LL(v[ind[10]]) << ' ' << LL(v[ind[blockSize - 1000]]) << std::endl; + std::cout << LL(v[ind[blockSize / 2 - 1]]) << ' ' << LL(v[ind[blockSize / 2]]) << ' ' + << LL(v[ind[blockSize / 2 + 1]]) << std::endl; + } + for (int ib = 0; ib < blocks; ++ib) { + std::set inds; + if (offsets[ib + 1] > offsets[ib]) + inds.insert(ind[offsets[ib]]); + for (auto j = offsets[ib] + 1; j < offsets[ib + 1]; j++) { + inds.insert(ind[j]); + auto a = v + offsets[ib]; + auto k1 = a[ind[j]]; + auto k2 = a[ind[j - 1]]; + auto sh = sizeof(uint64_t) - NS; + sh *= 8; + auto shorten = [sh](T& t) { + auto k = (uint64_t*)(&t); + *k = (*k >> sh) << sh; + }; + shorten(k1); + shorten(k2); + if (k1 < k2) + std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl; + } + if (!inds.empty()) { + assert(0 == *inds.begin()); + assert(inds.size() - 1 == *inds.rbegin()); + } + if (inds.size() != (offsets[ib + 1] - offsets[ib])) + std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets[ib + 1] - offsets[ib]) + << std::endl; + assert(inds.size() == (offsets[ib + 1] - offsets[ib])); + } + } // 50 times + std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() / 50. 
+ << " ms" << std::endl; +} + +int main() { + cms::cudatest::requireDevices(); + + bool useShared = false; + + std::cout << "using Global memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + useShared = true; + + std::cout << "using Shared memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/test_SimpleVector.cu b/HeterogeneousCore/CUDAUtilities/test/test_SimpleVector.cu new file mode 100644 index 0000000000000..6ecc89f30f158 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/test_SimpleVector.cu @@ -0,0 +1,83 @@ +// author: Felice Pantaleo, CERN, 2018 +#include +#include +#include + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +__global__ void vector_pushback(cms::cuda::SimpleVector *foo) { + auto index = threadIdx.x + blockIdx.x * blockDim.x; + foo->push_back(index); +} + +__global__ void vector_reset(cms::cuda::SimpleVector *foo) { foo->reset(); } + +__global__ void vector_emplace_back(cms::cuda::SimpleVector *foo) { + auto index = threadIdx.x + blockIdx.x * blockDim.x; + foo->emplace_back(index); +} + +int main() { + cms::cudatest::requireDevices(); + + auto maxN = 10000; + cms::cuda::SimpleVector *obj_ptr = nullptr; + cms::cuda::SimpleVector *d_obj_ptr = nullptr; + cms::cuda::SimpleVector *tmp_obj_ptr = nullptr; + int *data_ptr = nullptr; + int *d_data_ptr = nullptr; + + cudaCheck(cudaMallocHost(&obj_ptr, sizeof(cms::cuda::SimpleVector))); + cudaCheck(cudaMallocHost(&data_ptr, maxN * sizeof(int))); + cudaCheck(cudaMalloc(&d_data_ptr, maxN * sizeof(int))); + + auto v = cms::cuda::make_SimpleVector(obj_ptr, maxN, data_ptr); + + cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(cms::cuda::SimpleVector))); + cms::cuda::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr); + assert(tmp_obj_ptr->size() == 0); + assert(tmp_obj_ptr->capacity() == static_cast(maxN)); + + cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(cms::cuda::SimpleVector))); + // ... and copy the object to the device. + cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); + + int numBlocks = 5; + int numThreadsPerBlock = 256; + vector_pushback<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN)); + vector_reset<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == 0); + + vector_emplace_back<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? 
numBlocks * numThreadsPerBlock : maxN)); + + cudaCheck(cudaMemcpy(data_ptr, d_data_ptr, obj_ptr->size() * sizeof(int), cudaMemcpyDefault)); + cudaCheck(cudaFreeHost(obj_ptr)); + cudaCheck(cudaFreeHost(data_ptr)); + cudaCheck(cudaFreeHost(tmp_obj_ptr)); + cudaCheck(cudaFree(d_data_ptr)); + cudaCheck(cudaFree(d_obj_ptr)); + std::cout << "TEST PASSED" << std::endl; + return 0; +} diff --git a/Utilities/ReleaseScripts/python/cmsCodeRules/config.py b/Utilities/ReleaseScripts/python/cmsCodeRules/config.py index 264cf76e5d888..a7b9a11299441 100644 --- a/Utilities/ReleaseScripts/python/cmsCodeRules/config.py +++ b/Utilities/ReleaseScripts/python/cmsCodeRules/config.py @@ -33,7 +33,7 @@ Configuration[ruleName]['description'] = 'Search for "using namespace" or "using std::" in header files' Configuration[ruleName]['filesToMatch'] = ['*.h'] -Configuration[ruleName]['exceptPaths'] = [] +Configuration[ruleName]['exceptPaths'] = ['HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h'] Configuration[ruleName]['skip'] = [comment, function] Configuration[ruleName]['filter'] = '(\susing|\Ausing)\s+(namespace|std::)' #should be regular expression Configuration[ruleName]['exceptFilter'] = []