-
Notifications
You must be signed in to change notification settings - Fork 47
Fix shorten function and add HIP workarounds #195
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -92,6 +92,8 @@ __device__ __forceinline__ void radixSortImpl( | |
| __shared__ int32_t c[sb], ct[sb], cu[sb]; | ||
|
|
||
| __shared__ int ibs; | ||
| __shared__ int ibs2; | ||
| __shared__ int ibs3; | ||
| __shared__ int p; | ||
|
|
||
| assert(size > 0); | ||
|
|
@@ -148,6 +150,10 @@ __device__ __forceinline__ void radixSortImpl( | |
|
|
||
| // broadcast | ||
| ibs = size - 1; | ||
|
|
||
| // Workaround for hang in gpuVertexFinder_t. | ||
| ibs3 = ibs; | ||
|
|
||
| __syncthreads(); | ||
| while (__syncthreads_and(ibs > 0)) { | ||
| int i = ibs - threadIdx.x; | ||
|
|
@@ -177,8 +183,11 @@ __device__ __forceinline__ void radixSortImpl( | |
| __syncthreads(); | ||
| if (bin >= 0) | ||
| assert(c[bin] >= 0); | ||
| if (threadIdx.x == 0) | ||
| if (threadIdx.x == 0) { | ||
| ibs -= sb; | ||
| // Workaround for problems in radixSort_t. | ||
| ibs2 = ibs; | ||
|
||
| } | ||
| __syncthreads(); | ||
| } | ||
|
|
||
|
|
@@ -260,7 +269,9 @@ namespace cms { | |
| namespace hip { | ||
|
|
||
| template <typename T, int NS = sizeof(T)> | ||
| __global__ void __launch_bounds__(256, 4) | ||
| // The launch bounds seems to cause the kernel to silently fail to run (rocm 4.2) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. known issue? |
||
| //__global__ void __launch_bounds__(256, 4) | ||
| __global__ void | ||
| radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { | ||
| radixSortMulti<T, NS>(v, index, offsets, workspace); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -140,8 +140,9 @@ void go(bool useShared) { | |
| auto sh = sizeof(uint64_t) - NS; | ||
| sh *= 8; | ||
| auto shorten = [sh](T& t) { | ||
| auto k = (uint64_t*)(&t); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this approach used only in the test ? or also in some of the application code ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (apologies for long delay) I'm trying to understand what this lambda is supposed to do auto sh = sizeof(uint64_t) - NS;
sh *= 8;
auto shorten = [sh](T& t) {
auto k = (uint64_t*)(&t);
*k = (*k >> sh) << sh;
};It gets run for @VinInn, could you help here with the intention of this lambda? (the code is exactly the same in CMSSW)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to test sorting? (one way to speed up radix sort is to consider only the MSBs. for instance in ptsort only the 16MSBs are used.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Right, but setting the compared-to values to zero for all types
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok. most probably was written for 64bit values and never fix it...
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here is a version that keep GCC 11 happy and seems to work as intended for arbitrary types: auto shorten = [](T& t) {
// byte representation of t
char* bytes = reinterpret_cast<char*>(&t);
// bytes to zero out
const int zeroes = static_cast<int>(sizeof(T)) - NS;
// zero out the least significant bytes (assuming a little endian architecture)
static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "This test assumes a little-endian architecture");
for (int i = 0; i < zeroes; ++i) {
bytes[i] = 0x00;
}
};
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And here is a version that is more similar to the original, and avoids the loop, but adds a lot of boilerplate to keep the compiler happy about "aliasing rules": // A templated unsigned integer type with N bytes
template <int N>
struct uintN;
template <>
struct uintN<8> {
using type = uint8_t;
};
template <>
struct uintN<16> {
using type = uint16_t;
};
template <>
struct uintN<32> {
using type = uint32_t;
};
template <>
struct uintN<64> {
using type = uint64_t;
};
template <int N>
using uintN_t = typename uintN<N>::type;
// A templated unsigned integer type with the same size as T
template <typename T>
using uintT_t = uintN_t<sizeof(T) * 8>;
// Keep only the `N` most significant bytes of `t`, and set the others to zero
template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
void shorten(T& t) {
const int shift = 8 * (sizeof(T) - N);
union {
T t;
uintT_t<T> u;
} c;
c.t = t;
c.u = c.u >> shift << shift;
t = c.t;
}
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PR'ed in #209. |
||
| *k = (*k >> sh) << sh; | ||
| uint64_t k = *(uint64_t *)(&t); | ||
| k = (k >> sh) << sh; | ||
| t = *(T*)(&k); | ||
| }; | ||
| shorten(k1); | ||
| shorten(k2); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
could you add comments on the code about why the ibs2/ibs3 workaround is needed ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
where ibs3 is used?