diff --git a/src/cuda/test/radixSort_t.cu b/src/cuda/test/radixSort_t.cu index e1b9bca4c..d2c469510 100644 --- a/src/cuda/test/radixSort_t.cu +++ b/src/cuda/test/radixSort_t.cu @@ -1,18 +1,20 @@ #include #include #include +#include #include #include #include #include #include #include +#include -#include "CUDACore/device_unique_ptr.h" #include "CUDACore/cudaCheck.h" -#include "CUDACore/requireDevices.h" +#include "CUDACore/device_unique_ptr.h" #include "CUDACore/launch.h" #include "CUDACore/radixSort.h" +#include "CUDACore/requireDevices.h" using namespace cms::cuda; @@ -32,6 +34,50 @@ struct RS { static constexpr int imax = std::numeric_limits::max(); }; +// A templated unsigned integer type with N bytes +template +struct uintN; + +template <> +struct uintN<8> { + using type = uint8_t; +}; + +template <> +struct uintN<16> { + using type = uint16_t; +}; + +template <> +struct uintN<32> { + using type = uint32_t; +}; + +template <> +struct uintN<64> { + using type = uint64_t; +}; + +template +using uintN_t = typename uintN::type; + +// A templated unsigned integer type with the same size as T +template +using uintT_t = uintN_t; + +// Keep only the `N` most significant bytes of `t`, and set the others to zero +template > +void truncate(T& t) { + const int shift = 8 * (sizeof(T) - N); + union { + T t; + uintT_t u; + } c; + c.t = t; + c.u = c.u >> shift << shift; + t = c.t; +} + template void go(bool useShared) { std::mt19937 eng; @@ -100,7 +146,7 @@ void go(bool useShared) { cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); if (i < 2) - std::cout << "lauch for " << offsets[blocks] << std::endl; + std::cout << "launch for " << offsets[blocks] << std::endl; auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; @@ -138,14 +184,8 @@ void go(bool useShared) { auto a = v + offsets[ib]; auto k1 = a[ind[j]]; auto k2 = a[ind[j - 1]]; - auto sh = sizeof(uint64_t) - NS; - sh *= 8; - auto shorten = [sh](T& t) { - auto k = (uint64_t*)(&t); - *k = (*k >> sh) << sh; - }; - shorten(k1); - shorten(k2); + truncate(k1); + truncate(k2); if (k1 < k2) std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl; } diff --git a/src/cudacompat/test/radixSort_t.cu b/src/cudacompat/test/radixSort_t.cu index e1b9bca4c..d2c469510 100644 --- a/src/cudacompat/test/radixSort_t.cu +++ b/src/cudacompat/test/radixSort_t.cu @@ -1,18 +1,20 @@ #include #include #include +#include #include #include #include #include #include #include +#include -#include "CUDACore/device_unique_ptr.h" #include "CUDACore/cudaCheck.h" -#include "CUDACore/requireDevices.h" +#include "CUDACore/device_unique_ptr.h" #include "CUDACore/launch.h" #include "CUDACore/radixSort.h" +#include "CUDACore/requireDevices.h" using namespace cms::cuda; @@ -32,6 +34,50 @@ struct RS { static constexpr int imax = std::numeric_limits::max(); }; +// A templated unsigned integer type with N bytes +template +struct uintN; + +template <> +struct uintN<8> { + using type = uint8_t; +}; + +template <> +struct uintN<16> { + using type = uint16_t; +}; + +template <> +struct uintN<32> { + using type = uint32_t; +}; + +template <> +struct uintN<64> { + using type = uint64_t; +}; + +template +using uintN_t = typename uintN::type; + +// A templated unsigned integer type with the same size as T +template +using uintT_t = uintN_t; + +// Keep only the `N` most significant bytes of `t`, and set the others to zero +template > +void truncate(T& t) { + const int shift = 8 * (sizeof(T) - N); + union { + T t; + uintT_t u; + } c; + c.t = t; + c.u = c.u >> shift << shift; + t = c.t; +} + template void go(bool useShared) { std::mt19937 eng; @@ -100,7 +146,7 @@ void go(bool useShared) { cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); if (i < 2) - std::cout << "lauch for " << offsets[blocks] << std::endl; + std::cout << "launch for " << offsets[blocks] << std::endl; auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; @@ -138,14 +184,8 @@ void go(bool useShared) { auto a = v + offsets[ib]; auto k1 = a[ind[j]]; auto k2 = a[ind[j - 1]]; - auto sh = sizeof(uint64_t) - NS; - sh *= 8; - auto shorten = [sh](T& t) { - auto k = (uint64_t*)(&t); - *k = (*k >> sh) << sh; - }; - shorten(k1); - shorten(k2); + truncate(k1); + truncate(k2); if (k1 < k2) std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl; } diff --git a/src/cudadev/test/radixSort_t.cu b/src/cudadev/test/radixSort_t.cu index e1b9bca4c..d2c469510 100644 --- a/src/cudadev/test/radixSort_t.cu +++ b/src/cudadev/test/radixSort_t.cu @@ -1,18 +1,20 @@ #include #include #include +#include #include #include #include #include #include #include +#include -#include "CUDACore/device_unique_ptr.h" #include "CUDACore/cudaCheck.h" -#include "CUDACore/requireDevices.h" +#include "CUDACore/device_unique_ptr.h" #include "CUDACore/launch.h" #include "CUDACore/radixSort.h" +#include "CUDACore/requireDevices.h" using namespace cms::cuda; @@ -32,6 +34,50 @@ struct RS { static constexpr int imax = std::numeric_limits::max(); }; +// A templated unsigned integer type with N bytes +template +struct uintN; + +template <> +struct uintN<8> { + using type = uint8_t; +}; + +template <> +struct uintN<16> { + using type = uint16_t; +}; + +template <> +struct uintN<32> { + using type = uint32_t; +}; + +template <> +struct uintN<64> { + using type = uint64_t; +}; + +template +using uintN_t = typename uintN::type; + +// A templated unsigned integer type with the same size as T +template +using uintT_t = uintN_t; + +// Keep only the `N` most significant bytes of `t`, and set the others to zero +template > +void truncate(T& t) { + const int shift = 8 * (sizeof(T) - N); + union { + T t; + uintT_t u; + } c; + c.t = t; + c.u = c.u >> shift << shift; + t = c.t; +} + template void go(bool useShared) { std::mt19937 eng; @@ -100,7 +146,7 @@ void go(bool useShared) { cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); if (i < 2) - std::cout << "lauch for " << offsets[blocks] << std::endl; + std::cout << "launch for " << offsets[blocks] << std::endl; auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; @@ -138,14 +184,8 @@ void go(bool useShared) { auto a = v + offsets[ib]; auto k1 = a[ind[j]]; auto k2 = a[ind[j - 1]]; - auto sh = sizeof(uint64_t) - NS; - sh *= 8; - auto shorten = [sh](T& t) { - auto k = (uint64_t*)(&t); - *k = (*k >> sh) << sh; - }; - shorten(k1); - shorten(k2); + truncate(k1); + truncate(k2); if (k1 < k2) std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl; } diff --git a/src/cudauvm/test/radixSort_t.cu b/src/cudauvm/test/radixSort_t.cu index e1b9bca4c..d2c469510 100644 --- a/src/cudauvm/test/radixSort_t.cu +++ b/src/cudauvm/test/radixSort_t.cu @@ -1,18 +1,20 @@ #include #include #include +#include #include #include #include #include #include #include +#include -#include "CUDACore/device_unique_ptr.h" #include "CUDACore/cudaCheck.h" -#include "CUDACore/requireDevices.h" +#include "CUDACore/device_unique_ptr.h" #include "CUDACore/launch.h" #include "CUDACore/radixSort.h" +#include "CUDACore/requireDevices.h" using namespace cms::cuda; @@ -32,6 +34,50 @@ struct RS { static constexpr int imax = std::numeric_limits::max(); }; +// A templated unsigned integer type with N bytes +template +struct uintN; + +template <> +struct uintN<8> { + using type = uint8_t; +}; + +template <> +struct uintN<16> { + using type = uint16_t; +}; + +template <> +struct uintN<32> { + using type = uint32_t; +}; + +template <> +struct uintN<64> { + using type = uint64_t; +}; + +template +using uintN_t = typename uintN::type; + +// A templated unsigned integer type with the same size as T +template +using uintT_t = uintN_t; + +// Keep only the `N` most significant bytes of `t`, and set the others to zero +template > +void truncate(T& t) { + const int shift = 8 * (sizeof(T) - N); + union { + T t; + uintT_t u; + } c; + c.t = t; + c.u = c.u >> shift << shift; + t = c.t; +} + template void go(bool useShared) { std::mt19937 eng; @@ -100,7 +146,7 @@ void go(bool useShared) { cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); if (i < 2) - std::cout << "lauch for " << offsets[blocks] << std::endl; + std::cout << "launch for " << offsets[blocks] << std::endl; auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; @@ -138,14 +184,8 @@ void go(bool useShared) { auto a = v + offsets[ib]; auto k1 = a[ind[j]]; auto k2 = a[ind[j - 1]]; - auto sh = sizeof(uint64_t) - NS; - sh *= 8; - auto shorten = [sh](T& t) { - auto k = (uint64_t*)(&t); - *k = (*k >> sh) << sh; - }; - shorten(k1); - shorten(k2); + truncate(k1); + truncate(k2); if (k1 < k2) std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl; } diff --git a/src/hip/test/radixSort_t.cu b/src/hip/test/radixSort_t.cu index 299eb5637..947f98d9a 100644 --- a/src/hip/test/radixSort_t.cu +++ b/src/hip/test/radixSort_t.cu @@ -1,18 +1,20 @@ #include #include #include +#include #include #include #include #include #include #include +#include -#include "CUDACore/device_unique_ptr.h" #include "CUDACore/cudaCheck.h" -#include "CUDACore/requireDevices.h" +#include "CUDACore/device_unique_ptr.h" #include "CUDACore/launch.h" #include "CUDACore/radixSort.h" +#include "CUDACore/requireDevices.h" using namespace cms::hip; @@ -32,6 +34,50 @@ struct RS { static constexpr int imax = std::numeric_limits::max(); }; +// A templated unsigned integer type with N bytes +template +struct uintN; + +template <> +struct uintN<8> { + using type = uint8_t; +}; + +template <> +struct uintN<16> { + using type = uint16_t; +}; + +template <> +struct uintN<32> { + using type = uint32_t; +}; + +template <> +struct uintN<64> { + using type = uint64_t; +}; + +template +using uintN_t = typename uintN::type; + +// A templated unsigned integer type with the same size as T +template +using uintT_t = uintN_t; + +// Keep only the `N` most significant bytes of `t`, and set the others to zero +template > +void truncate(T& t) { + const int shift = 8 * (sizeof(T) - N); + union { + T t; + uintT_t u; + } c; + c.t = t; + c.u = c.u >> shift << shift; + t = c.t; +} + template void go(bool useShared) { std::mt19937 eng; @@ -100,7 +146,7 @@ void go(bool useShared) { cudaCheck(hipMemcpy(off_d.get(), offsets, 4 * (blocks + 1), hipMemcpyHostToDevice)); if (i < 2) - std::cout << "lauch for " << offsets[blocks] << std::endl; + std::cout << "launch for " << offsets[blocks] << std::endl; auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; @@ -137,14 +183,8 @@ void go(bool useShared) { auto a = v + offsets[ib]; auto k1 = a[ind[j]]; auto k2 = a[ind[j - 1]]; - auto sh = sizeof(uint64_t) - NS; - sh *= 8; - auto shorten = [sh](T& t) { - auto k = (uint64_t*)(&t); - *k = (*k >> sh) << sh; - }; - shorten(k1); - shorten(k2); + truncate(k1); + truncate(k2); if (k1 < k2) std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl; }