cms-patatrack · mdewing · Jul 2, 2021 · Jul 2, 2021 · Jul 4, 2021 · Aug 5, 2021
diff --git a/src/hip/CUDACore/radixSort.h b/src/hip/CUDACore/radixSort.h
@@ -148,6 +148,8 @@ __device__ __forceinline__ void radixSortImpl(
 
     // broadcast
     ibs = size - 1;
+
+
     __syncthreads();
     while (__syncthreads_and(ibs > 0)) {
       int i = ibs - threadIdx.x;
@@ -177,8 +179,13 @@ __device__ __forceinline__ void radixSortImpl(
       __syncthreads();
       if (bin >= 0)
         assert(c[bin] >= 0);
-      if (threadIdx.x == 0)
+      if (threadIdx.x == 0) {
         ibs -= sb;
+
+        // Fix for problems in radixSort_t.
+        // Without this, this c[bin] >=0 assert above will be triggered.
+        __threadfence();
+      }
       __syncthreads();
     }
 
@@ -260,7 +267,9 @@ namespace cms {
   namespace hip {
 
     template <typename T, int NS = sizeof(T)>
-    __global__ void __launch_bounds__(256, 4)
+    // The launch bounds seems to cause the kernel to silently fail to run (rocm 4.2)
+    //__global__ void __launch_bounds__(256, 4)
+    __global__ void
         radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
       radixSortMulti<T, NS>(v, index, offsets, workspace);
     }

diff --git a/src/hip/test/radixSort_t.cu b/src/hip/test/radixSort_t.cu
@@ -140,8 +140,9 @@ void go(bool useShared) {
         auto sh = sizeof(uint64_t) - NS;
         sh *= 8;
         auto shorten = [sh](T& t) {
-          auto k = (uint64_t*)(&t);
-          *k = (*k >> sh) << sh;
+          uint64_t k = *(uint64_t *)(&t);
+          k = (k >> sh) << sh;
+          t = *(T*)(&k);
         };
         shorten(k1);
         shorten(k2);