Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 49 additions & 9 deletions HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <set>
#include <type_traits>

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
Expand All @@ -32,6 +34,50 @@ struct RS<float> {
static constexpr int imax = std::numeric_limits<int>::max();
};

// uintN<N>::type is the fixed-width unsigned integer type that is exactly N bits wide.
// Only the widths 8, 16, 32 and 64 are specialised; the primary template is declared
// but never defined, so uintN<N>::type does not exist for any other value of N.
template <int N>
struct uintN;

template <> struct uintN<8> { using type = std::uint8_t; };
template <> struct uintN<16> { using type = std::uint16_t; };
template <> struct uintN<32> { using type = std::uint32_t; };
template <> struct uintN<64> { using type = std::uint64_t; };

// Shorthand for uintN<N>::type
template <int N>
using uintN_t = typename uintN<N>::type;

// A templated unsigned integer type with the same size as T
template <typename T>
using uintT_t = uintN_t<sizeof(T) * 8>;

// Keep only the `N` most significant bytes of `t`, and set the others to zero
template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
void truncate(T& t) {
const int shift = 8 * (sizeof(T) - N);
union {
T t;
uintT_t<T> u;
} c;
c.t = t;
c.u = c.u >> shift << shift;
t = c.t;
}

template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
void go(bool useShared) {
std::mt19937 eng;
Expand Down Expand Up @@ -100,7 +146,7 @@ void go(bool useShared) {
cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));

if (i < 2)
std::cout << "lauch for " << offsets[blocks] << std::endl;
std::cout << "launch for " << offsets[blocks] << std::endl;

auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256;

Expand Down Expand Up @@ -138,14 +184,8 @@ void go(bool useShared) {
auto a = v + offsets[ib];
auto k1 = a[ind[j]];
auto k2 = a[ind[j - 1]];
auto sh = sizeof(uint64_t) - NS;
sh *= 8;
auto shorten = [sh](T& t) {
auto k = (uint64_t*)(&t);
*k = (*k >> sh) << sh;
};
shorten(k1);
shorten(k2);
truncate<NS>(k1);
truncate<NS>(k2);
if (k1 < k2)
std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl;
}
Expand Down