diff --git a/DataFormats/GeometrySurface/interface/SOARotation.h b/DataFormats/GeometrySurface/interface/SOARotation.h
index 331a56b7ecf57..d75efef4736cb 100644
--- a/DataFormats/GeometrySurface/interface/SOARotation.h
+++ b/DataFormats/GeometrySurface/interface/SOARotation.h
@@ -100,6 +100,34 @@ class SOAFrame {
uz += pz;
}
+ // Rotates the symmetric 2x2 local matrix {cxx, cxy, cyy} with this frame's rotation and
+ // stores the six independent entries of the resulting symmetric 3x3 matrix in gl[0..5],
+ // in the order (xx, yx, yy, zx, zy, zz).
+ constexpr inline void toGlobal(T cxx, T cxy, T cyy, T *gl) const {
+   auto const &r = rot;
+
+   // Inner sums of the original expressions, one pair per rotation column (x, y, z).
+   auto const xa = r.xx() * cxx + r.yx() * cxy;
+   auto const xb = r.xx() * cxy + r.yx() * cyy;
+   auto const ya = r.xy() * cxx + r.yy() * cxy;
+   auto const yb = r.xy() * cxy + r.yy() * cyy;
+   auto const za = r.xz() * cxx + r.yz() * cxy;
+   auto const zb = r.xz() * cxy + r.yz() * cyy;
+
+   gl[0] = r.xx() * xa + r.yx() * xb;
+   gl[1] = r.xx() * ya + r.yx() * yb;
+   gl[2] = r.xy() * ya + r.yy() * yb;
+   gl[3] = r.xx() * za + r.yx() * zb;
+   gl[4] = r.xy() * za + r.yy() * zb;
+   gl[5] = r.xz() * za + r.yz() * zb;
+ }
+
+ // Reads a symmetric 3x3 matrix from ge[0..5], taken in the order (xx, yx, yy, zx, zy, zz),
+ // rotates it with this frame's rotation and returns the xx, xy and yy entries of the result
+ // through lxx, lxy and lyy.
+ constexpr inline void toLocal(T const *ge, T &lxx, T &lxy, T &lyy) const {
+   auto const &r = rot;
+
+   T const gxx = ge[0];
+   T const gyx = ge[1];
+   T const gyy = ge[2];
+   T const gzx = ge[3];
+   T const gzy = ge[4];
+   T const gzz = ge[5];
+
+   // Input matrix applied to the (xx, xy, xz) rotation entries; shared by lxx and lxy.
+   auto const ax = r.xx() * gxx + r.xy() * gyx + r.xz() * gzx;
+   auto const ay = r.xx() * gyx + r.xy() * gyy + r.xz() * gzy;
+   auto const az = r.xx() * gzx + r.xy() * gzy + r.xz() * gzz;
+   lxx = r.xx() * ax + r.xy() * ay + r.xz() * az;
+   lxy = r.yx() * ax + r.yy() * ay + r.yz() * az;
+
+   // Input matrix applied to the (yx, yy, yz) rotation entries; used by lyy only.
+   auto const bx = r.yx() * gxx + r.yy() * gyx + r.yz() * gzx;
+   auto const by = r.yx() * gyx + r.yy() * gyy + r.yz() * gzy;
+   auto const bz = r.yx() * gzx + r.yy() * gzy + r.yz() * gzz;
+   lyy = r.yx() * bx + r.yy() * by + r.yz() * bz;
+ }
+
constexpr inline T x() const { return px; }  // accessor: returns the px member by value
constexpr inline T y() const { return py; }  // accessor: returns the py member by value
constexpr inline T z() const { return pz; }  // accessor: returns the pz member by value
diff --git a/DataFormats/GeometrySurface/test/BuildFile.xml b/DataFormats/GeometrySurface/test/BuildFile.xml
index 050cdb4c8f19d..5f4db224a639b 100644
--- a/DataFormats/GeometrySurface/test/BuildFile.xml
+++ b/DataFormats/GeometrySurface/test/BuildFile.xml
@@ -13,3 +13,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu
new file mode 100644
index 0000000000000..c24510146fb59
--- /dev/null
+++ b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu
@@ -0,0 +1,40 @@
+#include
+#include
+#include
+
+#include "DataFormats/GeometrySurface/interface/SOARotation.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
+
+// Test kernel, one element per thread: transforms the local point (xl[i], yl[i]) into
+// (x[i], y[i], z[i]) and the three local error terms le[3*i .. 3*i+2] into the six
+// terms ge[6*i .. 6*i+5], always using frame[0]. Threads with index >= n do nothing.
+__global__ void toGlobal(SOAFrame const* frame,
+                         float const* xl,
+                         float const* yl,
+                         float* x,
+                         float* y,
+                         float* z,
+                         float const* le,
+                         float* ge,
+                         uint32_t n) {
+  // Unsigned index: matches the type of n and of the CUDA built-in index variables,
+  // so the bounds check below does not mix signed and unsigned operands.
+  uint32_t const i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  frame[0].toGlobal(xl[i], yl[i], x[i], y[i], z[i]);
+  frame[0].toGlobal(le[3 * i], le[3 * i + 1], le[3 * i + 2], ge + 6 * i);
+}
+
+// Host-side launcher for the toGlobal kernel: picks a 1D grid of 256-thread blocks that
+// covers n elements, prints the launch configuration and forwards all arguments.
+void toGlobalWrapper(SOAFrame const* frame,
+                     float const* xl,
+                     float const* yl,
+                     float* x,
+                     float* y,
+                     float* z,
+                     float const* le,
+                     float* ge,
+                     uint32_t n) {
+  int blockSize = 256;
+  // Round up so that a partially filled last block is still launched.
+  int gridSize = (n + blockSize - 1) / blockSize;
+
+  std::cout << "CUDA toGlobal kernel launch with " << gridSize << " blocks of " << blockSize << " threads"
+            << std::endl;
+
+  cms::cuda::launch(toGlobal, {gridSize, blockSize}, frame, xl, yl, x, y, z, le, ge, n);
+}
diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
new file mode 100644
index 0000000000000..ad62b7a1d131c
--- /dev/null
+++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
@@ -0,0 +1,114 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "DataFormats/GeometrySurface/interface/GloballyPositioned.h"
+#include "DataFormats/GeometrySurface/interface/SOARotation.h"
+#include "DataFormats/GeometrySurface/interface/TkRotation.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+
+void toGlobalWrapper(SOAFrame const *frame,  // forward declaration only; main() below calls it with device pointers
+ float const *xl,  // local x inputs, n entries
+ float const *yl,  // local y inputs, n entries
+ float *x,  // outputs, n entries each
+ float *y,
+ float *z,
+ float const *le,  // local error inputs; main() passes a buffer of 3 * n floats
+ float *ge,  // error outputs; main() passes a buffer of 6 * n floats
+ uint32_t n);  // number of elements to process
+
+// Test driver: builds a frame at position (2, 3, 4) whose rotation mixes x and y with
+// angle parameter a = 0.01, runs the toGlobal kernel on `size` local points and local
+// error triplets, and prints the largest absolute difference between the positions
+// computed on the device and those computed on the host by Frame::toGlobal.
+// The transformed errors (ge) are copied back but not compared against a reference,
+// and the return value is 0 regardless of the printed deviation.
+int main(void) {
+  cms::cudatest::requireDevices();
+
+  typedef float T;
+  typedef TkRotation Rotation;
+  typedef SOARotation SRotation;
+  typedef GloballyPositioned Frame;
+  typedef SOAFrame SFrame;
+  typedef typename Frame::PositionType Position;
+  typedef typename Frame::GlobalVector GlobalVector;
+  typedef typename Frame::GlobalPoint GlobalPoint;
+  typedef typename Frame::LocalVector LocalVector;
+  typedef typename Frame::LocalPoint LocalPoint;
+
+  constexpr uint32_t size = 10000;
+  constexpr uint32_t size32 = size * sizeof(float);  // byte size of one float[size] buffer
+
+  float xl[size], yl[size];
+  float x[size], y[size], z[size];
+
+  // errors
+  float le[3 * size];
+  float ge[6 * size];
+
+  auto d_xl = cms::cuda::make_device_unique(size, nullptr);
+  auto d_yl = cms::cuda::make_device_unique(size, nullptr);
+
+  auto d_x = cms::cuda::make_device_unique(size, nullptr);
+  auto d_y = cms::cuda::make_device_unique(size, nullptr);
+  auto d_z = cms::cuda::make_device_unique(size, nullptr);
+
+  auto d_le = cms::cuda::make_device_unique(3 * size, nullptr);
+  auto d_ge = cms::cuda::make_device_unique(6 * size, nullptr);
+
+  double a = 0.01;
+  double ca = std::cos(a);
+  double sa = std::sin(a);
+
+  Rotation r1(ca, sa, 0, -sa, ca, 0, 0, 0, 1);
+  Frame f1(Position(2, 3, 4), r1);
+  std::cout << "f1.position() " << f1.position() << std::endl;
+  std::cout << "f1.rotation() " << '\n' << f1.rotation() << std::endl;
+
+  SFrame sf1(f1.position().x(), f1.position().y(), f1.position().z(), f1.rotation());
+
+  // The kernel reads the frame through a device pointer, so copy it over first.
+  auto d_sf = cms::cuda::make_device_unique(sizeof(SFrame), nullptr);
+  cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice));
+
+  for (auto i = 0U; i < size; ++i) {
+    xl[i] = yl[i] = 0.1f * float(i) - float(size / 2);
+    // Three error terms per point, stored at le[3*i .. 3*i+2], which is the layout the
+    // kernel reads. The middle term was previously written to le[2*i+1], leaving most
+    // le[3*i+1] entries uninitialised and overwriting terms set in earlier iterations.
+    le[3 * i] = 0.01f;
+    le[3 * i + 1] = 0.;
+    le[3 * i + 2] = (i > size / 2) ? 1.f : 0.04f;
+  }
+  // NOTE(review): std::random_shuffle was removed in C++17; consider std::shuffle with an
+  // explicit engine once the header list of this file can be updated accordingly.
+  std::random_shuffle(xl, xl + size);
+  std::random_shuffle(yl, yl + size);
+
+  cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice));
+
+  toGlobalWrapper((SFrame const *)(d_sf.get()),
+                  d_xl.get(),
+                  d_yl.get(),
+                  d_x.get(),
+                  d_y.get(),
+                  d_z.get(),
+                  d_le.get(),
+                  d_ge.get(),
+                  size);
+  cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost));
+
+  // Largest absolute per-coordinate difference between device and host positions.
+  float eps = 0.;
+  for (auto i = 0U; i < size; ++i) {
+    auto gp = f1.toGlobal(LocalPoint(xl[i], yl[i]));
+    eps = std::max(eps, std::abs(x[i] - gp.x()));
+    eps = std::max(eps, std::abs(y[i] - gp.y()));
+    eps = std::max(eps, std::abs(z[i] - gp.z()));
+  }
+
+  std::cout << "max eps " << eps << std::endl;
+
+  return 0;
+}
diff --git a/DataFormats/Math/BuildFile.xml b/DataFormats/Math/BuildFile.xml
index 6aa1d86287860..83d06125a017c 100644
--- a/DataFormats/Math/BuildFile.xml
+++ b/DataFormats/Math/BuildFile.xml
@@ -1,6 +1,7 @@
-
-
+
+
+
-
+
diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h
new file mode 100644
index 0000000000000..2cb4105f86bae
--- /dev/null
+++ b/DataFormats/Math/interface/choleskyInversion.h
@@ -0,0 +1,349 @@
+#ifndef DataFormat_Math_choleskyInversion_h
+#define DataFormat_Math_choleskyInversion_h
+
+#include
+
+#include
+
+/**
+ * Fully inlined, specialized code to perform the inversion of a
+ * positive-definite matrix of dimension up to 6.
+ *
+ * adapted from ROOT::Math::CholeskyDecomp
+ * originally by
+ * @author Manuel Schiller
+ * @date Aug 29 2008
+ *
+ *
+ */
+namespace math {
+ namespace cholesky {
+
+ // 1x1 case: dst(0, 0) receives the reciprocal of src(0, 0). No check for a zero entry.
+ template
+ inline constexpr void invert11(M1 const& src, M2& dst) {
+   using F = decltype(src(0, 0));
+   auto const one = F(1.0);
+   dst(0, 0) = one / src(0, 0);
+ }
+
+ // 2x2 case: reads src(0, 0), src(1, 0) and src(1, 1) and writes the lower triangle
+ // dst(0, 0), dst(1, 0), dst(1, 1) of the inverse. dst(0, 1) is not written here.
+ // Neither divisor is checked against zero.
+ template
+ inline constexpr void invert22(M1 const& src, M2& dst) {
+   using F = decltype(src(0, 0));
+   // Reciprocal of the first diagonal entry.
+   auto invFirst = F(1.0) / src(0, 0);
+   // src(1, 0)^2 / src(0, 0): the amount removed from src(1, 1) below.
+   auto reduction = src(1, 0) * src(1, 0) * invFirst;
+   // Reciprocal of the reduced second diagonal entry.
+   auto invSecond = F(1.0) / (src(1, 1) - reduction);
+
+   auto correction = reduction * invFirst * invSecond;
+
+   dst(0, 0) = correction + invFirst;
+   dst(1, 0) = -src(1, 0) * invFirst * invSecond;
+   dst(1, 1) = invSecond;
+ }
+
+ template
+ inline constexpr void invert33(M1 const& src, M2& dst) {  // 3x3 case: reads src(i, j) for i >= j only and writes dst(i, j) for i >= j only
+ using F = decltype(src(0, 0));
+ auto luc0 = F(1.0) / src(0, 0);  // reciprocal of the first diagonal entry (no zero check)
+ auto luc1 = src(1, 0);
+ auto luc2 = src(1, 1) - luc0 * luc1 * luc1;  // second diagonal entry after eliminating row 0 ...
+ luc2 = F(1.0) / luc2;  // ... then replaced by its reciprocal
+ auto luc3 = src(2, 0);
+ auto luc4 = (src(2, 1) - luc0 * luc1 * luc3);
+ auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4);  // third diagonal entry after eliminating rows 0 and 1 ...
+ luc5 = F(1.0) / luc5;  // ... then replaced by its reciprocal
+ // every src(...) read is above this line; dst is only assigned below
+ auto li21 = -luc0 * luc1;  // li* are intermediate factors combined into the dst entries below
+ auto li32 = -(luc2 * luc4);
+ auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+
+ dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0;
+ dst(1, 0) = luc5 * li31 * li32 + li21 * luc2;
+ dst(1, 1) = luc5 * li32 * li32 + luc2;
+ dst(2, 0) = luc5 * li31;
+ dst(2, 1) = luc5 * li32;
+ dst(2, 2) = luc5;  // upper triangle of dst is left untouched
+ }
+
+ template
+ inline constexpr void invert44(M1 const& src, M2& dst) {  // 4x4 case: reads src(i, j) for i >= j only and writes dst(i, j) for i >= j only
+ using F = decltype(src(0, 0));
+ auto luc0 = F(1.0) / src(0, 0);  // reciprocal of the first diagonal entry (no zero check)
+ auto luc1 = src(1, 0);
+ auto luc2 = src(1, 1) - luc0 * luc1 * luc1;
+ luc2 = F(1.0) / luc2;  // luc2 now holds a reciprocal
+ auto luc3 = src(2, 0);
+ auto luc4 = (src(2, 1) - luc0 * luc1 * luc3);
+ auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4);
+ luc5 = F(1.0) / luc5;  // luc5 now holds a reciprocal
+ auto luc6 = src(3, 0);
+ auto luc7 = (src(3, 1) - luc0 * luc1 * luc6);
+ auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7);
+ auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5));
+ luc9 = F(1.0) / luc9;  // luc9 now holds a reciprocal
+ // every src(...) read is above this line; dst is only assigned below
+ auto li21 = -luc1 * luc0;  // li* are intermediate factors combined into the dst entries below
+ auto li32 = -luc2 * luc4;
+ auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+ auto li43 = -(luc8 * luc5);
+ auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2;
+ auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0;
+
+ dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0;
+ dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21;
+ dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2;
+ dst(2, 0) = luc9 * li41 * li43 + luc5 * li31;
+ dst(2, 1) = luc9 * li42 * li43 + luc5 * li32;
+ dst(2, 2) = luc9 * li43 * li43 + luc5;
+ dst(3, 0) = luc9 * li41;
+ dst(3, 1) = luc9 * li42;
+ dst(3, 2) = luc9 * li43;
+ dst(3, 3) = luc9;  // upper triangle of dst is left untouched
+ }
+
+ template
+ inline constexpr void invert55(M1 const& src, M2& dst) {  // 5x5 case: reads src(i, j) for i >= j only and writes dst(i, j) for i >= j only
+ using F = decltype(src(0, 0));
+ auto luc0 = F(1.0) / src(0, 0);  // reciprocal of the first diagonal entry (no zero check)
+ auto luc1 = src(1, 0);
+ auto luc2 = src(1, 1) - luc0 * luc1 * luc1;
+ luc2 = F(1.0) / luc2;  // luc2 now holds a reciprocal
+ auto luc3 = src(2, 0);
+ auto luc4 = (src(2, 1) - luc0 * luc1 * luc3);
+ auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4);
+ luc5 = F(1.0) / luc5;  // luc5 now holds a reciprocal
+ auto luc6 = src(3, 0);
+ auto luc7 = (src(3, 1) - luc0 * luc1 * luc6);
+ auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7);
+ auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5));
+ luc9 = F(1.0) / luc9;  // luc9 now holds a reciprocal
+ auto luc10 = src(4, 0);
+ auto luc11 = (src(4, 1) - luc0 * luc1 * luc10);
+ auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11);
+ auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12);
+ auto luc14 =
+ src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13);
+ luc14 = F(1.0) / luc14;  // luc14 now holds a reciprocal
+ // every src(...) read is above this line; dst is only assigned below
+ auto li21 = -luc1 * luc0;  // li* are intermediate factors combined into the dst entries below
+ auto li32 = -luc2 * luc4;
+ auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+ auto li43 = -(luc8 * luc5);
+ auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2;
+ auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0;
+ auto li54 = -luc13 * luc9;
+ auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5;
+ auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2;
+ auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 -
+ luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 +
+ luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) *
+ luc0;
+
+ dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0;
+ dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21;
+ dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2;
+ dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31;
+ dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32;
+ dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5;
+ dst(3, 0) = luc14 * li51 * li54 + luc9 * li41;
+ dst(3, 1) = luc14 * li52 * li54 + luc9 * li42;
+ dst(3, 2) = luc14 * li53 * li54 + luc9 * li43;
+ dst(3, 3) = luc14 * li54 * li54 + luc9;
+ dst(4, 0) = luc14 * li51;
+ dst(4, 1) = luc14 * li52;
+ dst(4, 2) = luc14 * li53;
+ dst(4, 3) = luc14 * li54;
+ dst(4, 4) = luc14;  // upper triangle of dst is left untouched
+ }
+
+ template
+ inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) {  // 6x6 case: reads src(i, j) for i >= j only and writes dst(i, j) for i >= j only
+ using F = decltype(src(0, 0));
+ auto luc0 = F(1.0) / src(0, 0);  // reciprocal of the first diagonal entry (no zero check)
+ auto luc1 = src(1, 0);
+ auto luc2 = src(1, 1) - luc0 * luc1 * luc1;
+ luc2 = F(1.0) / luc2;  // luc2 now holds a reciprocal
+ auto luc3 = src(2, 0);
+ auto luc4 = (src(2, 1) - luc0 * luc1 * luc3);
+ auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4);
+ luc5 = F(1.0) / luc5;  // luc5 now holds a reciprocal
+ auto luc6 = src(3, 0);
+ auto luc7 = (src(3, 1) - luc0 * luc1 * luc6);
+ auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7);
+ auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5));
+ luc9 = F(1.0) / luc9;  // luc9 now holds a reciprocal
+ auto luc10 = src(4, 0);
+ auto luc11 = (src(4, 1) - luc0 * luc1 * luc10);
+ auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11);
+ auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12);
+ auto luc14 =
+ src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13);
+ luc14 = F(1.0) / luc14;  // luc14 now holds a reciprocal
+ auto luc15 = src(5, 0);
+ auto luc16 = (src(5, 1) - luc0 * luc1 * luc15);
+ auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16);
+ auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17);
+ auto luc19 =
+ (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18);
+ auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 +
+ luc9 * luc18 * luc18 + luc14 * luc19 * luc19);
+ luc20 = F(1.0) / luc20;  // luc20 now holds a reciprocal
+ // every src(...) read is above this line; dst is only assigned below
+ auto li21 = -luc1 * luc0;  // li* are intermediate factors combined into the dst entries below
+ auto li32 = -luc2 * luc4;
+ auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0;
+ auto li43 = -(luc8 * luc5);
+ auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2;
+ auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0;
+ auto li54 = -luc13 * luc9;
+ auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5;
+ auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2;
+ auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 -
+ luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 +
+ luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) *
+ luc0;
+
+ auto li65 = -luc19 * luc14;
+ auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9;
+ auto li63 =
+ (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5;
+ auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 -
+ luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 +
+ luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) *
+ luc2;
+ auto li61 =
+ (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 +
+ luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 +
+ luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 -
+ luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 -
+ luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 -
+ luc19 * luc13 * luc6 * luc9 * luc14 + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 +
+ luc19 * luc10 * luc14 - luc15) *
+ luc0;
+
+ dst(0, 0) = luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 +
+ luc2 * li21 * li21 + luc0;
+ dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21;
+ dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2;
+ dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31;
+ dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32;
+ dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5;
+ dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41;
+ dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42;
+ dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43;
+ dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9;
+ dst(4, 0) = luc20 * li61 * li65 + luc14 * li51;
+ dst(4, 1) = luc20 * li62 * li65 + luc14 * li52;
+ dst(4, 2) = luc20 * li63 * li65 + luc14 * li53;
+ dst(4, 3) = luc20 * li64 * li65 + luc14 * li54;
+ dst(4, 4) = luc20 * li65 * li65 + luc14;
+ dst(5, 0) = luc20 * li61;
+ dst(5, 1) = luc20 * li62;
+ dst(5, 2) = luc20 * li63;
+ dst(5, 3) = luc20 * li64;
+ dst(5, 4) = luc20 * li65;
+ dst(5, 5) = luc20;  // upper triangle of dst is left untouched
+ }
+
+ template
+ inline constexpr void symmetrize11(M& dst) {}  // intentionally empty: a 1x1 matrix has no off-diagonal entry to copy
+
+ template
+ inline constexpr void symmetrize22(M& dst) {  // mirrors the lower off-diagonal entry into the upper triangle
+ dst(0, 1) = dst(1, 0);
+ }
+
+ // Mirrors the lower triangle of the leading 3x3 block into its upper triangle:
+ // the 2x2 part is handled by symmetrize22, then row 2 is copied into column 2.
+ template
+ inline constexpr void symmetrize33(M& dst) {
+   symmetrize22(dst);
+   for (int k = 0; k < 2; ++k) {
+     dst(k, 2) = dst(2, k);
+   }
+ }
+
+ // Mirrors the lower triangle of the leading 4x4 block into its upper triangle:
+ // the 3x3 part is handled by symmetrize33, then row 3 is copied into column 3.
+ template
+ inline constexpr void symmetrize44(M& dst) {
+   symmetrize33(dst);
+   for (int k = 0; k < 3; ++k) {
+     dst(k, 3) = dst(3, k);
+   }
+ }
+
+ // Mirrors the lower triangle of the leading 5x5 block into its upper triangle:
+ // the 4x4 part is handled by symmetrize44, then row 4 is copied into column 4.
+ template
+ inline constexpr void symmetrize55(M& dst) {
+   symmetrize44(dst);
+   for (int k = 0; k < 4; ++k) {
+     dst(k, 4) = dst(4, k);
+   }
+ }
+
+ // Mirrors the lower triangle of the leading 6x6 block into its upper triangle:
+ // the 5x5 part is handled by symmetrize55, then row 5 is copied into column 5.
+ template
+ inline constexpr void symmetrize66(M& dst) {
+   symmetrize55(dst);
+   for (int k = 0; k < 5; ++k) {
+     dst(k, 5) = dst(5, k);
+   }
+ }
+
+ template
+ struct Inverter {  // stateless holder for a static eval(src, dst) entry point
+ static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); }  // this variant assigns src.inverse() to dst
+ };
+
+ template
+ struct Inverter {  // variant that handles the 1x1 case
+ static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); }  // invert11 only; there is nothing to symmetrize
+ };
+
+ template
+ struct Inverter {  // variant that handles the 2x2 case
+ static constexpr void eval(M1 const& src, M2& dst) {
+ invert22(src, dst);  // writes dst(0, 0), dst(1, 0) and dst(1, 1)
+ symmetrize22(dst);  // copies dst(1, 0) into dst(0, 1)
+ }
+ };
+
+ template
+ struct Inverter {  // variant that handles the 3x3 case
+ static constexpr void eval(M1 const& src, M2& dst) {
+ invert33(src, dst);  // writes the lower triangle of dst
+ symmetrize33(dst);  // mirrors it into the upper triangle
+ }
+ };
+
+ template
+ struct Inverter {  // variant that handles the 4x4 case
+ static constexpr void eval(M1 const& src, M2& dst) {
+ invert44(src, dst);  // writes the lower triangle of dst
+ symmetrize44(dst);  // mirrors it into the upper triangle
+ }
+ };
+
+ template
+ struct Inverter {  // variant that handles the 5x5 case
+ static constexpr void eval(M1 const& src, M2& dst) {
+ invert55(src, dst);  // writes the lower triangle of dst
+ symmetrize55(dst);  // mirrors it into the upper triangle
+ }
+ };
+
+ template
+ struct Inverter {  // variant that handles the 6x6 case
+ static constexpr void eval(M1 const& src, M2& dst) {
+ invert66(src, dst);  // writes the lower triangle of dst
+ symmetrize66(dst);  // mirrors it into the upper triangle
+ }
+ };
+
+ // Eigen interface
+ template
+ inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) {  // public entry point; callers in the tests pass the same object as src and dst
+ using M1 = Eigen::DenseBase;
+ using M2 = Eigen::DenseBase;
+ Inverter::eval(src, dst);  // all work is delegated to the static eval of an Inverter variant
+ }
+
+ } // namespace cholesky
+} // namespace math
+
+#endif // DataFormat_Math_choleskyInversion_h
diff --git a/DataFormats/Math/test/BuildFile.xml b/DataFormats/Math/test/BuildFile.xml
index 6b1112e30472c..5f2dd3854f6d1 100644
--- a/DataFormats/Math/test/BuildFile.xml
+++ b/DataFormats/Math/test/BuildFile.xml
@@ -1,27 +1,31 @@
-
-
-
-
+
+
+
+
-
-
-
-
+
+
+
+
-
+
+
-
+
+
-
+
+
-
+
+
@@ -29,75 +33,97 @@
-
+
-
+
+
-
+
+
-
+
+
-
+
+
-
+
+
+
+
+
-
+
-
+
-
+
+
-
+
+
-
+
+
+
+
-
+
+
+
-
-
+
+
-
-
+
+
-
-
+
+
+
+
+
+
+
+
+
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp
new file mode 100644
index 0000000000000..c5dea25231988
--- /dev/null
+++ b/DataFormats/Math/test/CholeskyInvert_t.cpp
@@ -0,0 +1,136 @@
+// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
+// add -DDOPROF to run nvprof --metrics all
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "DataFormats/Math/interface/choleskyInversion.h"
+
+constexpr int stride() { return 5 * 1024; }  // compile-time constant 5120; go() uses it as the length of its matrix array
+template
+using MXN = Eigen::Matrix;  // alias for the Eigen::Matrix type used as the matrix element type in go()
+template
+using MapMX = Eigen::Map, 0, Eigen::Stride >;  // strided Eigen::Map view; go() constructs it at p + i for matrix i (presumably strided by stride() -- confirm)
+
+// generate matrices
+// Fills m with a symmetric test matrix of size ColsAtCompileTime.
+// The random engine is default-constructed on every call, so each call draws the same
+// sequence of numbers.
+template
+void genMatrix(M& m) {
+  using T = typename std::remove_reference::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution rgen(0., 1.);
+
+  // Diagonal entries first; their upper bound grows from 1 to 10001 along the diagonal
+  // (integer arithmetic), which the original author notes caps the condition at 10^4.
+  for (int d = 0; d < n; ++d) {
+    double maxVal = d * 10000 / (n - 1) + 1;
+    m(d, d) = maxVal * rgen(eng);
+  }
+  // Off-diagonal entries, mirrored so that the matrix is symmetric. Each one is bounded by
+  // 0.3 * sqrt(m(row, row) * m(col, col)), intended to keep the matrix positive definite.
+  for (int row = 0; row < n; ++row) {
+    for (int col = 0; col < row; ++col) {
+      double bound = 0.3 * std::sqrt(m(row, row) * m(col, col));
+      m(row, col) = bound * rgen(eng);
+      m(col, row) = m(row, col);
+    }
+  }
+}
+
+template
+void go(bool soa) {  // benchmark driver: generates matrices, inverts them in place repeatedly and prints the average time per pass
+ constexpr unsigned int DIM = N;
+ using MX = MXN;
+ std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? "SOA" : "AOS")
+ << " mode" << std::endl;
+
+ auto start = std::chrono::high_resolution_clock::now();
+ auto delta = start - start;  // zero duration; accumulates the timed sections below
+
+ constexpr unsigned int SIZE = 4 * 1024;  // number of matrices processed in SOA mode
+
+ alignas(128) MX mm[stride()]; // just storage in case of SOA
+ double* __restrict__ p = (double*)__builtin_assume_aligned(mm, 128);  // raw view of the same storage, used by the SOA maps
+
+ if (soa) {
+ for (unsigned int i = 0; i < SIZE; ++i) {
+ MapMX m(p + i);  // matrix i is viewed starting at p + i
+ genMatrix(m);
+ }
+ } else {
+ for (auto& m : mm)  // AOS mode fills all stride() array elements, not just SIZE
+ genMatrix(m);
+ }
+
+ std::cout << mm[SIZE / 2](1, 1) << std::endl;  // sample element printed before any inversion
+
+ if (soa)
+ for (unsigned int i = 0; i < SIZE; ++i) {
+ MapMX m(p + i);
+ math::cholesky::invert(m, m);  // in place: source and destination are the same object
+ math::cholesky::invert(m, m);  // second in-place inversion of the same matrix
+ }
+ else
+ for (auto& m : mm) {
+ math::cholesky::invert(m, m);
+ math::cholesky::invert(m, m);
+ }
+
+ std::cout << mm[SIZE / 2](1, 1) << std::endl;  // same element after the two inversions
+
+ constexpr int NKK =
+#ifdef DOPROF
+ 2;
+#else
+ 1000;
+#endif
+ for (int kk = 0; kk < NKK; ++kk) {  // NKK timed passes, one in-place inversion per matrix per pass
+ delta -= (std::chrono::high_resolution_clock::now() - start);  // subtract the elapsed time at section start ...
+ if (soa)
+#pragma GCC ivdep
+#ifdef __clang__
+#pragma clang loop vectorize(enable) interleave(enable)
+#endif
+ for (unsigned int i = 0; i < SIZE; ++i) {
+ MapMX m(p + i);
+ math::cholesky::invert(m, m);
+ }
+ else
+#pragma GCC ivdep
+ for (auto& m : mm) {
+ math::cholesky::invert(m, m);
+ }
+
+ delta += (std::chrono::high_resolution_clock::now() - start);  // ... and add it at section end, leaving only the section's duration
+ }
+
+ std::cout << mm[SIZE / 2](1, 1) << std::endl;  // same element after the timed passes
+
+ double DNNK = NKK;
+ std::cout << "x86 computation took " << std::chrono::duration_cast(delta).count() / DNNK  // accumulated time divided by the number of passes
+ << ' ' << " ms" << std::endl;
+}
+
+int main() {  // runs go for N = 2..6, first with soa == false, then with soa == true
+ go<2>(false);
+ go<3>(false);
+ go<4>(false);
+ go<5>(false);
+ go<6>(false);
+
+ go<2>(true);
+ go<3>(true);
+ go<4>(true);
+ go<5>(true);
+ go<6>(true);
+
+ // go<10>();
+ return 0;
+}
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu
new file mode 100644
index 0000000000000..558f9296150c7
--- /dev/null
+++ b/DataFormats/Math/test/CholeskyInvert_t.cu
@@ -0,0 +1,209 @@
+// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native"
+// add -DDOPROF to run nvprof --metrics all
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "DataFormats/Math/interface/choleskyInversion.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
+
+constexpr int stride() { return 5 * 1024; }  // compile-time constant 5120; go() uses it for the host array length and the device buffer size
+template
+using MXN = Eigen::Matrix;  // alias for the Eigen::Matrix type used as the matrix element type in go()
+template
+using MapMX = Eigen::Map, 0, Eigen::Stride>;  // strided Eigen::Map view; constructed at p + i for matrix i (presumably strided by stride() -- confirm)
+
+// One thread per matrix: thread idx views the matrix that starts at p + idx through a
+// MapMX and inverts it in place. Threads with idx >= n do nothing.
+template
+__global__ void invertSOA(double *__restrict__ p, unsigned int n) {
+  auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    MapMX m(p + idx);
+    math::cholesky::invert(m, m);
+  }
+}
+
+// One thread per matrix: thread idx inverts mm[idx] in place. Threads with idx >= n do nothing.
+template
+__global__ void invert(M *mm, unsigned int n) {
+  auto const idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    auto &target = mm[idx];
+    math::cholesky::invert(target, target);
+  }
+}
+
+// Sequential variant: only thread 0 of each block works, looping over the blockDim.x
+// matrices assigned to its block (clamped to n) and inverting each one in place.
+template
+__global__ void invertSeq(M *mm, unsigned int n) {
+  if (threadIdx.x == 0) {
+    auto const begin = blockIdx.x * blockDim.x;
+    auto const end = std::min(begin + blockDim.x, n);
+
+    for (auto idx = begin; idx < end; ++idx) {
+      auto &target = mm[idx];
+      math::cholesky::invert(target, target);
+    }
+  }
+}
+
+// generate matrices
+// Fills m with a symmetric test matrix of size ColsAtCompileTime.
+// The random engine is default-constructed on every call, so each call draws the same
+// sequence of numbers.
+template
+void genMatrix(M &m) {
+  using T = typename std::remove_reference::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution rgen(0., 1.);
+
+  // Diagonal entries first; their upper bound grows from 1 to 10001 along the diagonal
+  // (integer arithmetic), which the original author notes caps the condition at 10^4.
+  for (int d = 0; d < n; ++d) {
+    double maxVal = d * 10000 / (n - 1) + 1;
+    m(d, d) = maxVal * rgen(eng);
+  }
+  // Off-diagonal entries, mirrored so that the matrix is symmetric. Each one is bounded by
+  // 0.3 * sqrt(m(row, row) * m(col, col)), intended to keep the matrix positive definite.
+  for (int row = 0; row < n; ++row) {
+    for (int col = 0; col < row; ++col) {
+      double bound = 0.3 * std::sqrt(m(row, row) * m(col, col));
+      m(row, col) = bound * rgen(eng);
+      m(col, row) = m(row, col);
+    }
+  }
+}
+
+template
+void go(bool soa) {  // benchmark driver: times in-place inversion on the GPU (parallel and one-thread-per-block kernels) and on the host
+ constexpr unsigned int DIM = N;
+ using MX = MXN;
+ std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+
+ auto start = std::chrono::high_resolution_clock::now();
+ auto delta = start - start;  // zero duration; accumulates the parallel-kernel timing
+ auto delta1 = delta;  // accumulates the invertSeq timing (AOS mode only)
+ auto delta2 = delta;  // accumulates the host timing
+
+ constexpr unsigned int SIZE = 4 * 1024;  // number of matrices handed to the kernels
+
+ MX mm[stride()]; // just storage in case of SOA
+ double *__restrict__ p = (double *)(mm);  // raw view of the same storage, used by the SOA maps
+
+ if (soa) {
+ for (unsigned int i = 0; i < SIZE; ++i) {
+ MapMX m(p + i);  // matrix i is viewed starting at p + i
+ genMatrix(m);
+ }
+ } else {
+ for (auto &m : mm)  // AOS mode fills all stride() array elements, not just SIZE
+ genMatrix(m);
+ }
+
+ std::cout << mm[SIZE / 2](1, 1) << std::endl;  // sample element printed before any inversion
+
+ if (soa)
+ for (unsigned int i = 0; i < SIZE; ++i) {
+ MapMX m(p + i);
+ math::cholesky::invert(m, m);  // in place: source and destination are the same object
+ math::cholesky::invert(m, m);  // second in-place inversion of the same matrix
+ }
+ else
+ for (auto &m : mm) {
+ math::cholesky::invert(m, m);
+ math::cholesky::invert(m, m);
+ }
+
+ std::cout << mm[SIZE / 2](1, 1) << std::endl;  // same element after the two host inversions
+
+ auto m_d = cms::cuda::make_device_unique(DIM * DIM * stride(), nullptr);  // device buffer of DIM * DIM * stride() elements
+ cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice));  // upload the whole host array
+
+ constexpr int NKK =
+#ifdef DOPROF
+ 2;
+#else
+ 1000;
+#endif
+ for (int kk = 0; kk < NKK; ++kk) {
+ int threadsPerBlock = 128;
+ int blocksPerGrid = SIZE / threadsPerBlock;  // exact: SIZE (4096) is a multiple of 128
+
+ delta -= (std::chrono::high_resolution_clock::now() - start);  // subtract elapsed time at section start; added back at section end
+
+ if (soa)
+ cms::cuda::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
+ else
+ cms::cuda::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+
+ cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));  // the device-to-host copy is inside the timed section
+
+ delta += (std::chrono::high_resolution_clock::now() - start);
+
+ if (0 == kk)
+ std::cout << mm[SIZE / 2](1, 1) << std::endl;  // sample element after the first kernel pass
+
+ if (!soa) {
+ delta1 -= (std::chrono::high_resolution_clock::now() - start);
+
+#ifndef DOPROF
+ cms::cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);  // skipped when DOPROF is defined
+ cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
+#endif
+ delta1 += (std::chrono::high_resolution_clock::now() - start);
+
+ if (0 == kk)
+ std::cout << mm[SIZE / 2](1, 1) << std::endl;
+ }
+
+ delta2 -= (std::chrono::high_resolution_clock::now() - start);
+ if (soa)
+#pragma GCC ivdep
+ for (unsigned int i = 0; i < SIZE; ++i) {
+ MapMX m(p + i);
+ math::cholesky::invert(m, m);  // host-side inversion of the data just copied back from the device
+ }
+ else
+#pragma GCC ivdep
+ for (auto &m : mm) {
+ math::cholesky::invert(m, m);
+ }
+
+ delta2 += (std::chrono::high_resolution_clock::now() - start);
+ }
+
+ std::cout << mm[SIZE / 2](1, 1) << std::endl;  // same element after all passes
+
+ double DNNK = NKK;
+ std::cout << "cuda/cudaSeq/x86 computation took "  // each accumulated time is divided by the number of passes
+ << std::chrono::duration_cast(delta).count() / DNNK << ' '
+ << std::chrono::duration_cast(delta1).count() / DNNK << ' '
+ << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms"
+ << std::endl;
+}
+
+int main() {  // checks for a usable CUDA device, then runs go for N = 2..6 with soa == false and again with soa == true
+ cms::cudatest::requireDevices();
+
+ go<2>(false);
+ go<3>(false);
+ go<4>(false);
+ go<5>(false);
+ go<6>(false);
+
+ go<2>(true);
+ go<3>(true);
+ go<4>(true);
+ go<5>(true);
+ go<6>(true);
+
+ // go<10>();
+ return 0;
+}
diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu
index ecc0be911c777..731447fe826e4 100644
--- a/DataFormats/Math/test/cudaAtan2Test.cu
+++ b/DataFormats/Math/test/cudaAtan2Test.cu
@@ -25,10 +25,13 @@ end
#include
#include
#include
-
-#include "cuda/api_wrappers.h"
+#include
#include "DataFormats/Math/interface/approx_atan2.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
constexpr float xmin = -100.001; // avoid 0
constexpr float incr = 0.04;
@@ -62,15 +65,13 @@ void go() {
auto start = std::chrono::high_resolution_clock::now();
auto delta = start - start;
- auto current_device = cuda::device::current::get();
-
// atan2
delta -= (std::chrono::high_resolution_clock::now() - start);
- auto diff_d = cuda::memory::device::make_unique(current_device, 3);
+ auto diff_d = cms::cuda::make_device_unique(3, nullptr);
int diffs[3];
- cuda::memory::device::zero(diff_d.get(), 3 * 4);
+ cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4));
// Launch the diff CUDA Kernel
dim3 threadsPerBlock(32, 32, 1);
@@ -79,9 +80,9 @@ void go() {
std::cout << "CUDA kernel 'diff' launch with " << blocksPerGrid.x << " blocks of " << threadsPerBlock.y
<< " threads\n";
- cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get());
+ cms::cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get());
- cuda::memory::copy(diffs, diff_d.get(), 3 * 4);
+ cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost));
delta += (std::chrono::high_resolution_clock::now() - start);
float mdiff = diffs[0] * 1.e-7;
@@ -95,26 +96,15 @@ void go() {
}
int main() {
- int count = 0;
- auto status = cudaGetDeviceCount(&count);
- if (status != cudaSuccess) {
- std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped."
- << "\n";
- exit(EXIT_SUCCESS);
- }
- if (count == 0) {
- std::cerr << "No CUDA devices on this system, the test will be skipped."
- << "\n";
- exit(EXIT_SUCCESS);
- }
+ cms::cudatest::requireDevices();
try {
go<3>();
go<5>();
go<7>();
go<9>();
- } catch (cuda::runtime_error &ex) {
- std::cerr << "CUDA error: " << ex.what() << std::endl;
+ } catch (std::runtime_error &ex) {
+ std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl;
exit(EXIT_FAILURE);
} catch (...) {
std::cerr << "A non-CUDA error occurred" << std::endl;
diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu
index 6aeaa0f2ededb..dd6576de46c1c 100644
--- a/DataFormats/Math/test/cudaMathTest.cu
+++ b/DataFormats/Math/test/cudaMathTest.cu
@@ -25,12 +25,7 @@ end
#include
#include
#include
-
-#include "cuda/api_wrappers.h"
-
-#include
-#include
-#include
+#include
#ifdef __CUDACC__
#define inline __host__ __device__ inline
@@ -40,6 +35,14 @@ end
#include
#endif
+#include "DataFormats/Math/interface/approx_log.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_atan2.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
+
std::mt19937 eng;
std::mt19937 eng2;
std::uniform_real_distribution rgen(0., 1.);
@@ -85,8 +88,6 @@ void go() {
auto start = std::chrono::high_resolution_clock::now();
auto delta = start - start;
- auto current_device = cuda::device::current::get();
-
int numElements = 200000;
size_t size = numElements * sizeof(float);
std::cout << "[Vector of " << numElements << " elements]\n";
@@ -100,12 +101,12 @@ void go() {
std::generate(h_B.get(), h_B.get() + numElements, [&]() { return rgen(eng); });
delta -= (std::chrono::high_resolution_clock::now() - start);
- auto d_A = cuda::memory::device::make_unique(current_device, numElements);
- auto d_B = cuda::memory::device::make_unique(current_device, numElements);
- auto d_C = cuda::memory::device::make_unique(current_device, numElements);
+ auto d_A = cms::cuda::make_device_unique(numElements, nullptr);
+ auto d_B = cms::cuda::make_device_unique(numElements, nullptr);
+ auto d_C = cms::cuda::make_device_unique(numElements, nullptr);
- cuda::memory::copy(d_A.get(), h_A.get(), size);
- cuda::memory::copy(d_B.get(), h_B.get(), size);
+ cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice));
+ cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice));
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda alloc+copy took " << std::chrono::duration_cast(delta).count() << " ms"
<< std::endl;
@@ -116,19 +117,21 @@ void go() {
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n";
delta -= (std::chrono::high_resolution_clock::now() - start);
- cuda::launch(vectorOp