cms-patatrack · mdewing · Jul 2, 2021 · Jul 2, 2021 · Jul 4, 2021 · Aug 5, 2021
diff --git a/src/hip/CUDACore/radixSort.h b/src/hip/CUDACore/radixSort.h
@@ -92,6 +92,8 @@ __device__ __forceinline__ void radixSortImpl(
   __shared__ int32_t c[sb], ct[sb], cu[sb];
 
   __shared__ int ibs;
+  __shared__ int ibs2;
+  __shared__ int ibs3;
   __shared__ int p;
 
   assert(size > 0);
@@ -148,6 +150,10 @@ __device__ __forceinline__ void radixSortImpl(
 
     // broadcast
     ibs = size - 1;
+
+    // Workaround for a hang in gpuVertexFinder_t: keep a second shared copy of ibs in ibs3.
+    ibs3 = ibs;
+
     __syncthreads();
     while (__syncthreads_and(ibs > 0)) {
       int i = ibs - threadIdx.x;
@@ -177,8 +183,11 @@ __device__ __forceinline__ void radixSortImpl(
       __syncthreads();
       if (bin >= 0)
         assert(c[bin] >= 0);
-      if (threadIdx.x == 0)
+      if (threadIdx.x == 0) {
         ibs -= sb;
+        // Workaround for problems in radixSort_t: mirror the updated ibs into shared ibs2.
+        ibs2 = ibs;
+      }
       __syncthreads();
     }
 
@@ -260,7 +269,9 @@ namespace cms {
   namespace hip {
 
     template <typename T, int NS = sizeof(T)>
-    __global__ void __launch_bounds__(256, 4)
+    // The launch bounds seem to cause the kernel to silently fail to run (ROCm 4.2), so they are disabled.
+    //__global__ void __launch_bounds__(256, 4)
+    __global__ void
         radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
       radixSortMulti<T, NS>(v, index, offsets, workspace);
     }

diff --git a/src/hip/test/radixSort_t.cu b/src/hip/test/radixSort_t.cu
@@ -140,8 +140,9 @@ void go(bool useShared) {
         auto sh = sizeof(uint64_t) - NS;
         sh *= 8;
         auto shorten = [sh](T& t) {
-          auto k = (uint64_t*)(&t);
-          *k = (*k >> sh) << sh;
+          uint64_t k = *(uint64_t *)(&t);
+          k = (k >> sh) << sh;
+          t = *(T*)(&k);
         };
         shorten(k1);
         shorten(k2);