diff --git a/src/hip/CUDACore/radixSort.h b/src/hip/CUDACore/radixSort.h index b675efd7c..d9d9c7d2f 100644 --- a/src/hip/CUDACore/radixSort.h +++ b/src/hip/CUDACore/radixSort.h @@ -148,6 +148,8 @@ __device__ __forceinline__ void radixSortImpl( // broadcast ibs = size - 1; + + __syncthreads(); while (__syncthreads_and(ibs > 0)) { int i = ibs - threadIdx.x; @@ -177,8 +179,13 @@ __device__ __forceinline__ void radixSortImpl( __syncthreads(); if (bin >= 0) assert(c[bin] >= 0); - if (threadIdx.x == 0) + if (threadIdx.x == 0) { ibs -= sb; + + // Fix for problems in radixSort_t. + // Without this, this c[bin] >=0 assert above will be triggered. + __threadfence(); + } __syncthreads(); } @@ -260,7 +267,9 @@ namespace cms { namespace hip { template - __global__ void __launch_bounds__(256, 4) + // The launch bounds seems to cause the kernel to silently fail to run (rocm 4.2) + //__global__ void __launch_bounds__(256, 4) + __global__ void radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { radixSortMulti(v, index, offsets, workspace); } diff --git a/src/hip/test/radixSort_t.cu b/src/hip/test/radixSort_t.cu index 299eb5637..a2697e536 100644 --- a/src/hip/test/radixSort_t.cu +++ b/src/hip/test/radixSort_t.cu @@ -140,8 +140,9 @@ void go(bool useShared) { auto sh = sizeof(uint64_t) - NS; sh *= 8; auto shorten = [sh](T& t) { - auto k = (uint64_t*)(&t); - *k = (*k >> sh) << sh; + uint64_t k = *(uint64_t *)(&t); + k = (k >> sh) << sh; + t = *(T*)(&k); }; shorten(k1); shorten(k2);