diff --git a/src/cuda/test/radixSort_t.cu b/src/cuda/test/radixSort_t.cu
index e1b9bca4c..d2c469510 100644
--- a/src/cuda/test/radixSort_t.cu
+++ b/src/cuda/test/radixSort_t.cu
@@ -1,18 +1,20 @@
 #include <algorithm>
 #include <cassert>
 #include <chrono>
+#include <cstdint>
 #include <iomanip>
 #include <iostream>
 #include <limits>
 #include <memory>
 #include <random>
 #include <set>
+#include <type_traits>
 
-#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
-#include "CUDACore/requireDevices.h"
+#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
 #include "CUDACore/radixSort.h"
+#include "CUDACore/requireDevices.h"
 
 using namespace cms::cuda;
 
@@ -32,6 +34,50 @@ struct RS<float> {
   static constexpr int imax = std::numeric_limits<int>::max();
 };
 
+// A templated unsigned integer type with N bytes
+template <int N>
+struct uintN;
+
+template <>
+struct uintN<8> {
+  using type = uint8_t;
+};
+
+template <>
+struct uintN<16> {
+  using type = uint16_t;
+};
+
+template <>
+struct uintN<32> {
+  using type = uint32_t;
+};
+
+template <>
+struct uintN<64> {
+  using type = uint64_t;
+};
+
+template <int N>
+using uintN_t = typename uintN<N>::type;
+
+// A templated unsigned integer type with the same size as T
+template <typename T>
+using uintT_t = uintN_t<sizeof(T) * 8>;
+
+// Keep only the `N` most significant bytes of `t`, and set the others to zero
+template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
+void truncate(T& t) {
+  const int shift = 8 * (sizeof(T) - N);
+  union {
+    T t;
+    uintT_t<T> u;
+  } c;
+  c.t = t;
+  c.u = c.u >> shift << shift;
+  t = c.t;
+}
+
 template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
 void go(bool useShared) {
   std::mt19937 eng;
@@ -100,7 +146,7 @@ void go(bool useShared) {
     cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));
 
     if (i < 2)
-      std::cout << "lauch for " << offsets[blocks] << std::endl;
+      std::cout << "launch for " << offsets[blocks] << std::endl;
 
     auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256;
 
@@ -138,14 +184,8 @@ void go(bool useShared) {
         auto a = v + offsets[ib];
         auto k1 = a[ind[j]];
         auto k2 = a[ind[j - 1]];
-        auto sh = sizeof(uint64_t) - NS;
-        sh *= 8;
-        auto shorten = [sh](T& t) {
-          auto k = (uint64_t*)(&t);
-          *k = (*k >> sh) << sh;
-        };
-        shorten(k1);
-        shorten(k2);
+        truncate<NS>(k1);
+        truncate<NS>(k2);
         if (k1 < k2)
           std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl;
       }
diff --git a/src/cudacompat/test/radixSort_t.cu b/src/cudacompat/test/radixSort_t.cu
index e1b9bca4c..d2c469510 100644
--- a/src/cudacompat/test/radixSort_t.cu
+++ b/src/cudacompat/test/radixSort_t.cu
@@ -1,18 +1,20 @@
 #include <algorithm>
 #include <cassert>
 #include <chrono>
+#include <cstdint>
 #include <iomanip>
 #include <iostream>
 #include <limits>
 #include <memory>
 #include <random>
 #include <set>
+#include <type_traits>
 
-#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
-#include "CUDACore/requireDevices.h"
+#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
 #include "CUDACore/radixSort.h"
+#include "CUDACore/requireDevices.h"
 
 using namespace cms::cuda;
 
@@ -32,6 +34,50 @@ struct RS<float> {
   static constexpr int imax = std::numeric_limits<int>::max();
 };
 
+// A templated unsigned integer type with N bytes
+template <int N>
+struct uintN;
+
+template <>
+struct uintN<8> {
+  using type = uint8_t;
+};
+
+template <>
+struct uintN<16> {
+  using type = uint16_t;
+};
+
+template <>
+struct uintN<32> {
+  using type = uint32_t;
+};
+
+template <>
+struct uintN<64> {
+  using type = uint64_t;
+};
+
+template <int N>
+using uintN_t = typename uintN<N>::type;
+
+// A templated unsigned integer type with the same size as T
+template <typename T>
+using uintT_t = uintN_t<sizeof(T) * 8>;
+
+// Keep only the `N` most significant bytes of `t`, and set the others to zero
+template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
+void truncate(T& t) {
+  const int shift = 8 * (sizeof(T) - N);
+  union {
+    T t;
+    uintT_t<T> u;
+  } c;
+  c.t = t;
+  c.u = c.u >> shift << shift;
+  t = c.t;
+}
+
 template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
 void go(bool useShared) {
   std::mt19937 eng;
@@ -100,7 +146,7 @@ void go(bool useShared) {
     cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));
 
     if (i < 2)
-      std::cout << "lauch for " << offsets[blocks] << std::endl;
+      std::cout << "launch for " << offsets[blocks] << std::endl;
 
     auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256;
 
@@ -138,14 +184,8 @@ void go(bool useShared) {
         auto a = v + offsets[ib];
         auto k1 = a[ind[j]];
         auto k2 = a[ind[j - 1]];
-        auto sh = sizeof(uint64_t) - NS;
-        sh *= 8;
-        auto shorten = [sh](T& t) {
-          auto k = (uint64_t*)(&t);
-          *k = (*k >> sh) << sh;
-        };
-        shorten(k1);
-        shorten(k2);
+        truncate<NS>(k1);
+        truncate<NS>(k2);
         if (k1 < k2)
           std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl;
       }
diff --git a/src/cudadev/test/radixSort_t.cu b/src/cudadev/test/radixSort_t.cu
index e1b9bca4c..d2c469510 100644
--- a/src/cudadev/test/radixSort_t.cu
+++ b/src/cudadev/test/radixSort_t.cu
@@ -1,18 +1,20 @@
 #include <algorithm>
 #include <cassert>
 #include <chrono>
+#include <cstdint>
 #include <iomanip>
 #include <iostream>
 #include <limits>
 #include <memory>
 #include <random>
 #include <set>
+#include <type_traits>
 
-#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
-#include "CUDACore/requireDevices.h"
+#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
 #include "CUDACore/radixSort.h"
+#include "CUDACore/requireDevices.h"
 
 using namespace cms::cuda;
 
@@ -32,6 +34,50 @@ struct RS<float> {
   static constexpr int imax = std::numeric_limits<int>::max();
 };
 
+// A templated unsigned integer type with N bytes
+template <int N>
+struct uintN;
+
+template <>
+struct uintN<8> {
+  using type = uint8_t;
+};
+
+template <>
+struct uintN<16> {
+  using type = uint16_t;
+};
+
+template <>
+struct uintN<32> {
+  using type = uint32_t;
+};
+
+template <>
+struct uintN<64> {
+  using type = uint64_t;
+};
+
+template <int N>
+using uintN_t = typename uintN<N>::type;
+
+// A templated unsigned integer type with the same size as T
+template <typename T>
+using uintT_t = uintN_t<sizeof(T) * 8>;
+
+// Keep only the `N` most significant bytes of `t`, and set the others to zero
+template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
+void truncate(T& t) {
+  const int shift = 8 * (sizeof(T) - N);
+  union {
+    T t;
+    uintT_t<T> u;
+  } c;
+  c.t = t;
+  c.u = c.u >> shift << shift;
+  t = c.t;
+}
+
 template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
 void go(bool useShared) {
   std::mt19937 eng;
@@ -100,7 +146,7 @@ void go(bool useShared) {
     cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));
 
     if (i < 2)
-      std::cout << "lauch for " << offsets[blocks] << std::endl;
+      std::cout << "launch for " << offsets[blocks] << std::endl;
 
     auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256;
 
@@ -138,14 +184,8 @@ void go(bool useShared) {
         auto a = v + offsets[ib];
         auto k1 = a[ind[j]];
         auto k2 = a[ind[j - 1]];
-        auto sh = sizeof(uint64_t) - NS;
-        sh *= 8;
-        auto shorten = [sh](T& t) {
-          auto k = (uint64_t*)(&t);
-          *k = (*k >> sh) << sh;
-        };
-        shorten(k1);
-        shorten(k2);
+        truncate<NS>(k1);
+        truncate<NS>(k2);
         if (k1 < k2)
           std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl;
       }
diff --git a/src/cudauvm/test/radixSort_t.cu b/src/cudauvm/test/radixSort_t.cu
index e1b9bca4c..d2c469510 100644
--- a/src/cudauvm/test/radixSort_t.cu
+++ b/src/cudauvm/test/radixSort_t.cu
@@ -1,18 +1,20 @@
 #include <algorithm>
 #include <cassert>
 #include <chrono>
+#include <cstdint>
 #include <iomanip>
 #include <iostream>
 #include <limits>
 #include <memory>
 #include <random>
 #include <set>
+#include <type_traits>
 
-#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
-#include "CUDACore/requireDevices.h"
+#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
 #include "CUDACore/radixSort.h"
+#include "CUDACore/requireDevices.h"
 
 using namespace cms::cuda;
 
@@ -32,6 +34,50 @@ struct RS<float> {
   static constexpr int imax = std::numeric_limits<int>::max();
 };
 
+// A templated unsigned integer type with N bytes
+template <int N>
+struct uintN;
+
+template <>
+struct uintN<8> {
+  using type = uint8_t;
+};
+
+template <>
+struct uintN<16> {
+  using type = uint16_t;
+};
+
+template <>
+struct uintN<32> {
+  using type = uint32_t;
+};
+
+template <>
+struct uintN<64> {
+  using type = uint64_t;
+};
+
+template <int N>
+using uintN_t = typename uintN<N>::type;
+
+// A templated unsigned integer type with the same size as T
+template <typename T>
+using uintT_t = uintN_t<sizeof(T) * 8>;
+
+// Keep only the `N` most significant bytes of `t`, and set the others to zero
+template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
+void truncate(T& t) {
+  const int shift = 8 * (sizeof(T) - N);
+  union {
+    T t;
+    uintT_t<T> u;
+  } c;
+  c.t = t;
+  c.u = c.u >> shift << shift;
+  t = c.t;
+}
+
 template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
 void go(bool useShared) {
   std::mt19937 eng;
@@ -100,7 +146,7 @@ void go(bool useShared) {
     cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));
 
     if (i < 2)
-      std::cout << "lauch for " << offsets[blocks] << std::endl;
+      std::cout << "launch for " << offsets[blocks] << std::endl;
 
     auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256;
 
@@ -138,14 +184,8 @@ void go(bool useShared) {
         auto a = v + offsets[ib];
         auto k1 = a[ind[j]];
         auto k2 = a[ind[j - 1]];
-        auto sh = sizeof(uint64_t) - NS;
-        sh *= 8;
-        auto shorten = [sh](T& t) {
-          auto k = (uint64_t*)(&t);
-          *k = (*k >> sh) << sh;
-        };
-        shorten(k1);
-        shorten(k2);
+        truncate<NS>(k1);
+        truncate<NS>(k2);
         if (k1 < k2)
           std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl;
       }
diff --git a/src/hip/test/radixSort_t.cu b/src/hip/test/radixSort_t.cu
index 299eb5637..947f98d9a 100644
--- a/src/hip/test/radixSort_t.cu
+++ b/src/hip/test/radixSort_t.cu
@@ -1,18 +1,20 @@
 #include <algorithm>
 #include <cassert>
 #include <chrono>
+#include <cstdint>
 #include <iomanip>
 #include <iostream>
 #include <limits>
 #include <memory>
 #include <random>
 #include <set>
+#include <type_traits>
 
-#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
-#include "CUDACore/requireDevices.h"
+#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
 #include "CUDACore/radixSort.h"
+#include "CUDACore/requireDevices.h"
 
 using namespace cms::hip;
 
@@ -32,6 +34,50 @@ struct RS<float> {
   static constexpr int imax = std::numeric_limits<int>::max();
 };
 
+// A templated unsigned integer type with N bytes
+template <int N>
+struct uintN;
+
+template <>
+struct uintN<8> {
+  using type = uint8_t;
+};
+
+template <>
+struct uintN<16> {
+  using type = uint16_t;
+};
+
+template <>
+struct uintN<32> {
+  using type = uint32_t;
+};
+
+template <>
+struct uintN<64> {
+  using type = uint64_t;
+};
+
+template <int N>
+using uintN_t = typename uintN<N>::type;
+
+// A templated unsigned integer type with the same size as T
+template <typename T>
+using uintT_t = uintN_t<sizeof(T) * 8>;
+
+// Keep only the `N` most significant bytes of `t`, and set the others to zero
+template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
+void truncate(T& t) {
+  const int shift = 8 * (sizeof(T) - N);
+  union {
+    T t;
+    uintT_t<T> u;
+  } c;
+  c.t = t;
+  c.u = c.u >> shift << shift;
+  t = c.t;
+}
+
 template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
 void go(bool useShared) {
   std::mt19937 eng;
@@ -100,7 +146,7 @@ void go(bool useShared) {
     cudaCheck(hipMemcpy(off_d.get(), offsets, 4 * (blocks + 1), hipMemcpyHostToDevice));
 
     if (i < 2)
-      std::cout << "lauch for " << offsets[blocks] << std::endl;
+      std::cout << "launch for " << offsets[blocks] << std::endl;
 
     auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256;
 
@@ -137,14 +183,8 @@ void go(bool useShared) {
         auto a = v + offsets[ib];
         auto k1 = a[ind[j]];
         auto k2 = a[ind[j - 1]];
-        auto sh = sizeof(uint64_t) - NS;
-        sh *= 8;
-        auto shorten = [sh](T& t) {
-          auto k = (uint64_t*)(&t);
-          *k = (*k >> sh) << sh;
-        };
-        shorten(k1);
-        shorten(k2);
+        truncate<NS>(k1);
+        truncate<NS>(k2);
         if (k1 < k2)
           std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl;
       }