diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
index 6d8527935e334..6271b1cc0941b 100644
--- a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
+++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
@@ -13,6 +13,7 @@
 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"
 namespace edm {
   class StreamContext;
 }
@@ -27,6 +28,7 @@ class CUDAMonitoringService {
   void postModuleConstruction(edm::ModuleDescription const& desc);
   void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
+  void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
   void postEvent(edm::StreamContext const& sc);
 
 private:
@@ -46,6 +48,9 @@ CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, ed
   if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
     registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
   }
+  if (config.getUntrackedParameter<bool>("memoryPerModule")) {
+    registry.watchPostModuleEvent(this, &CUDAMonitoringService::postModuleEvent);
+  }
   if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
     registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
   }
@@ -58,6 +63,8 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& des
       ->setComment("Print memory information for each device after the construction of each module");
   desc.addUntracked<bool>("memoryBeginStream", true)
       ->setComment("Print memory information for each device after the beginStream() of each module");
+  desc.addUntracked<bool>("memoryPerModule", true)
+      ->setComment("Print memory information for each device after each module processes an event");
   desc.addUntracked<bool>("memoryPerEvent", true)
       ->setComment("Print memory information for each device after each event");
@@ -71,15 +78,23 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& des
 namespace {
   template <typename T>
   void dumpUsedMemory(T& log, int num) {
+    auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();
     int old = 0;
     cudaCheck(cudaGetDevice(&old));
+    constexpr auto mbytes = 1 << 20;
     for (int i = 0; i < num; ++i) {
       size_t freeMemory, totalMemory;
       cudaCheck(cudaSetDevice(i));
       cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
       log << "\n"
-          << i << ": " << (totalMemory - freeMemory) / (1 << 20) << " MB used / " << totalMemory / (1 << 20)
-          << " MB total";
+          << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
+      auto found = cachingDeviceAllocatorStatus.find(i);
+      if (found != cachingDeviceAllocatorStatus.end()) {
+        auto const& cached = found->second;
+        log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
+            << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
+            << (cached.live + cached.free) / mbytes << " MB total cached";
+      }
     }
     cudaCheck(cudaSetDevice(old));
   }
@@ -98,6 +113,13 @@ void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm
   dumpUsedMemory(log, numberOfDevices_);
 }
 
+void CUDAMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
+  auto log = edm::LogPrint("CUDAMonitoringService");
+  log << "CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
+      << mcc.moduleDescription()->moduleName() << ")";
+  dumpUsedMemory(log, numberOfDevices_);
+}
+
 void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
   auto log = edm::LogPrint("CUDAMonitoringService");
   log << "CUDA device memory after event";
"CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " (" + << mcc.moduleDescription()->moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) { auto log = edm::LogPrint("CUDAMonitoringService"); log << "CUDA device memory after event"; diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh index bde3e26382976..b78fe644422f7 100755 --- a/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh @@ -1,10 +1,78 @@ #! /bin/bash -e +function help() { + cat <<@EOF +Usage: + cmsCudaRebuild.sh [-g|-G] [-v] [-h] + +Check out and build all packages that contain CUDA code in .cu files. + +Options: + -g Compile with debugging symbols, passing + "-g -rdynamic" to the host compiler, and + "-g -lineinfo" to CUDA compiler + + -G Compile with debugging symbols and enable asserts on the GPU, passing + "-g -rdynamic -DGPU_DEBUG" to the host compiler, and + "-g -lineinfo -DGPU_DEBUG" to the CUDA compiler. + + -h Show this help, and exit. + + -v Make scram be verbose. + +@EOF +} + + +DEBUG=0 +VERBOSE=0 + +while [ "$*" ]; do + case "$1" in + -h) + help + exit 0 + ;; + -g) + DEBUG=1 + shift + ;; + -G) + DEBUG=2 + shift + ;; + -v) + VERBOSE=$((VERBOSE + 1)) + shift + ;; + *) + help + exit 1 + ;; + esac +done + # move to the .../src directory cd $CMSSW_BASE/src/ # check out all packages containing .cu files git ls-files --full-name | grep '.*\.cu$' | cut -d/ -f-2 | sort -u | xargs git cms-addpkg +# set additional compilation flags +if (( DEBUG == 1 )); then + export USER_CXXFLAGS="-g -rdynamic $USER_CXXFLAGS" + export USER_CUDA_FLAGS="-g -lineinfo $USER_CUDA_FLAGS" +elif (( DEBUG == 2 )); then + export USER_CXXFLAGS="-g -rdynamic -DGPU_DEBUG $USER_CXXFLAGS" + export USER_CUDA_FLAGS="-g -lineinfo -DGPU_DEBUG $USER_CUDA_FLAGS" +fi + +if (( VERBOSE > 0 )); then + SCRAM_VERBOSE="-v" +fi + +# clean all built packages +scram b clean + # rebuild all checked out packages -scram b -j +scram b $SCRAM_VERBOSE -j diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh index f3335f4cd409f..c130bf77688d5 100755 --- a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh @@ -6,7 +6,8 @@ DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u) CAPS=$(echo $DOTS | sed -e's#\.*##g') # remove existing capabilities -sed -i $TOOL -e'\##d' +sed -i $TOOL -e"s#-gencode arch=compute_..,code=sm_.. 
diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
index f3335f4cd409f..c130bf77688d5 100755
--- a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
+++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
@@ -6,7 +6,8 @@
 DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u)
 CAPS=$(echo $DOTS | sed -e's#\.*##g')
 
 # remove existing capabilities
-sed -i $TOOL -e'\##d'
+sed -i $TOOL -e"s#-gencode arch=compute_..,code=sm_.. *##g"
+sed -i $TOOL -e"\##d"
 
 # add support for the capabilities found on this machine
 for CAP in $CAPS; do
diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
index 102a1d7bf2d86..c7a84573d4538 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
@@ -3,13 +3,16 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 #include <type_traits>
 
 namespace cms {
   namespace cuda {
+
     // Single element
+
     template <typename T>
     inline void copyAsync(device::unique_ptr<T>& dst, const host::unique_ptr<T>& src, cudaStream_t stream) {
       // Shouldn't compile for array types because of sizeof(T), but
@@ -19,6 +22,15 @@ namespace cms {
       cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
     }
 
+    template <typename T>
+    inline void copyAsync(device::unique_ptr<T>& dst, const host::noncached::unique_ptr<T>& src, cudaStream_t stream) {
+      // Shouldn't compile for array types because of sizeof(T), but
+      // let's add an assert with a more helpful message
+      static_assert(std::is_array<T>::value == false,
+                    "For array types, use the other overload with the size parameter");
+      cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
+    }
+
     template <typename T>
     inline void copyAsync(host::unique_ptr<T>& dst, const device::unique_ptr<T>& src, cudaStream_t stream) {
       static_assert(std::is_array<T>::value == false,
@@ -27,6 +39,7 @@ namespace cms {
     }
 
     // Multiple elements
+
     template <typename T>
     inline void copyAsync(device::unique_ptr<T[]>& dst,
                           const host::unique_ptr<T[]>& src,
@@ -35,6 +48,14 @@ namespace cms {
       cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
     }
 
+    template <typename T>
+    inline void copyAsync(device::unique_ptr<T[]>& dst,
+                          const host::noncached::unique_ptr<T[]>& src,
+                          size_t nelements,
+                          cudaStream_t stream) {
+      cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
+    }
+
     template <typename T>
     inline void copyAsync(host::unique_ptr<T[]>& dst,
                           const device::unique_ptr<T[]>& src,
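Usage sketch for the overloads added above (illustrative only, not part of this patch). It assumes the existing cms::cuda::make_device_unique and cms::cuda::make_host_noncached_unique helpers and the cudaHostAllocWriteCombined flag; check the actual helper signatures in the corresponding headers before relying on them.

// Illustrative sketch only -- not part of this patch.
#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"

void uploadExample(cudaStream_t stream) {
  constexpr size_t n = 1024;
  // non-cached (write-combined) host buffer; the flag and helper signatures are assumptions
  auto hostBuffer = cms::cuda::make_host_noncached_unique<float[]>(n, cudaHostAllocWriteCombined);
  auto deviceBuffer = cms::cuda::make_device_unique<float[]>(n, stream);
  for (size_t i = 0; i < n; ++i) {
    hostBuffer[i] = static_cast<float>(i);
  }
  // uses the array overload added above: host::noncached::unique_ptr<T[]> -> device::unique_ptr<T[]>
  cms::cuda::copyAsync(deviceBuffer, hostBuffer, n, stream);
}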
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
index 593821fe805ed..f9b4b2f8a4c16 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
@@ -27,6 +27,13 @@ namespace cms {
     extern thread_local dim3 blockIdx;
     extern thread_local dim3 gridDim;
 
+    template <typename T1, typename T2>
+    T1 atomicCAS(T1* address, T1 compare, T2 val) {
+      T1 old = *address;
+      *address = old == compare ? val : old;
+      return old;
+    }
+
     template <typename T1, typename T2>
     T1 atomicInc(T1* a, T2 b) {
       auto ret = *a;
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h b/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h
deleted file mode 100644
index 89f131edd941e..0000000000000
--- a/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef HeterogeneousCore_CUDAUtilities_cuda_cxx17_h
-#define HeterogeneousCore_CUDAUtilities_cuda_cxx17_h
-
-#include
-
-// CUDA does not support C++17 yet, so we define here some of the missing library functions
-#if __cplusplus <= 201402L
-
-namespace std {
-
-  // from https://en.cppreference.com/w/cpp/iterator/size
-  template <class C>
-  constexpr auto size(const C& c) -> decltype(c.size()) {
-    return c.size();
-  }
-
-  template <class T, std::size_t N>
-  constexpr std::size_t size(const T (&array)[N]) noexcept {
-    return N;
-  }
-
-  // from https://en.cppreference.com/w/cpp/iterator/empty
-  template <class C>
-  constexpr auto empty(const C& c) -> decltype(c.empty()) {
-    return c.empty();
-  }
-
-  template <class T, std::size_t N>
-  constexpr bool empty(const T (&array)[N]) noexcept {
-    return false;
-  }
-
-  template <class E>
-  constexpr bool empty(std::initializer_list<E> il) noexcept {
-    return il.size() == 0;
-  }
-
-  // from https://en.cppreference.com/w/cpp/iterator/data
-  template <class C>
-  constexpr auto data(C& c) -> decltype(c.data()) {
-    return c.data();
-  }
-
-  template <class C>
-  constexpr auto data(const C& c) -> decltype(c.data()) {
-    return c.data();
-  }
-
-  template <class T, std::size_t N>
-  constexpr T* data(T (&array)[N]) noexcept {
-    return array;
-  }
-
-  template <class E>
-  constexpr const E* data(std::initializer_list<E> il) noexcept {
-    return il.begin();
-  }
-
-}  // namespace std
-
-#endif
-
-#endif  // HeterogeneousCore_CUDAUtilities_cuda_cxx17_h
diff --git a/HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h b/HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h
new file mode 100644
index 0000000000000..92f9f87e890ac
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h
@@ -0,0 +1,23 @@
+#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
+#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
+
+#include <map>
+
+namespace cms {
+  namespace cuda {
+    namespace allocator {
+      struct TotalBytes {
+        size_t free;
+        size_t live;
+        size_t liveRequested;  // CMS: monitor also requested amount
+        TotalBytes() { free = live = liveRequested = 0; }
+      };
+      /// Map type of device ordinals to the number of bytes cached by each device
+      using GpuCachedBytes = std::map<int, TotalBytes>;
+    }  // namespace allocator
+
+    allocator::GpuCachedBytes deviceAllocatorStatus();
+  }  // namespace cuda
+}  // namespace cms
+
+#endif
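A minimal sketch of querying the new cms::cuda::deviceAllocatorStatus() API from host code (illustrative only, not part of this patch); it only touches the GpuCachedBytes map and the TotalBytes counters declared above.

// Illustrative sketch only -- not part of this patch.
#include <iostream>

#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"

void printAllocatorStatus() {
  // GpuCachedBytes maps a device ordinal to its TotalBytes counters
  for (auto const& [device, bytes] : cms::cuda::deviceAllocatorStatus()) {
    std::cout << "device " << device << ": live " << bytes.live << " B (requested " << bytes.liveRequested
              << " B), free " << bytes.free << " B cached" << std::endl;
  }
}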
diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
index 6d4d5f4e7cc5e..33dc6a18ffa2a 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -41,9 +41,9 @@ namespace cms {
   namespace cuda {
 
     // limited to 32*32 elements....
-    template <typename T>
-    __host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci,
-                                                             T* __restrict__ co,
+    template <typename VT, typename T>
+    __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci,
+                                                             VT* co,
                                                              uint32_t size,
                                                              T* ws
 #ifndef __CUDA_ARCH__
@@ -138,7 +138,9 @@ namespace cms {
     // in principle not limited....
     template <typename T>
-    __global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) {
+    __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
+      volatile T const* ci = ici;
+      volatile T* co = ico;
       __shared__ T ws[32];
 #ifdef __CUDA_ARCH__
       assert(sizeof(T) * gridDim.x <= dynamic_smem_size());  // size of psum below
 #endif
@@ -152,6 +154,7 @@ namespace cms {
       // count blocks that finished
       __shared__ bool isLastBlockDone;
       if (0 == threadIdx.x) {
+        __threadfence();
        auto value = atomicAdd(pc, 1);  // block counter
        isLastBlockDone = (value == (int(gridDim.x) - 1));
      }
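A rough launch sketch for multiBlockPrefixScan (illustrative only, not part of this patch): the block counter pc must start at zero and the dynamic shared memory must hold one T per block, matching the assert shown above. The assumption that each block scans blockDim.x elements reflects the current kernel implementation and should be double-checked against it.

// Illustrative sketch only -- not part of this patch; the launch geometry is an assumption.
#include <cstdint>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"

void runMultiBlockScan(int32_t const* d_in, int32_t* d_out, int32_t size, cudaStream_t stream) {
  const int nthreads = 1024;                             // within the 32*32 blockPrefixScan limit
  const int nblocks = (size + nthreads - 1) / nthreads;  // assumes one element per thread
  int32_t* d_counter = nullptr;                          // block counter used with atomicAdd in the kernel
  cudaCheck(cudaMalloc(&d_counter, sizeof(int32_t)));
  cudaCheck(cudaMemsetAsync(d_counter, 0, sizeof(int32_t), stream));
  // dynamic shared memory: one partial sum per block, as required by the assert in the kernel
  cms::cuda::multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
      d_in, d_out, size, d_counter);
  cudaCheck(cudaGetLastError());
  cudaCheck(cudaStreamSynchronize(stream));
  cudaCheck(cudaFree(d_counter));
}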
      BlockDescriptor search_key(device);
+      search_key.bytesRequested = bytes;  // CMS
       search_key.associated_stream = active_stream;
       NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
@@ -381,6 +387,7 @@
         // Remove from free blocks
         cached_bytes[device].free -= search_key.bytes;
         cached_bytes[device].live += search_key.bytes;
+        cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS
 
         if (debug)
           // CMS: improved debug message
@@ -490,6 +497,7 @@
       mutex_locker.lock();
       live_blocks.insert(search_key);
       cached_bytes[device].live += search_key.bytes;
+      cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS
       mutex_locker.unlock();
 
       if (debug)
@@ -569,6 +577,7 @@
         search_key = *block_itr;
         live_blocks.erase(block_itr);
         cached_bytes[device].live -= search_key.bytes;
+        cached_bytes[device].liveRequested -= search_key.bytesRequested;  // CMS
 
         // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
         if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) {
@@ -715,6 +724,12 @@
       return error;
     }
 
+    // CMS: give access to cache allocation status
+    GpuCachedBytes CacheStatus() const {
+      std::unique_lock<std::mutex> mutex_locker(mutex);
+      return cached_bytes;
+    }
+
     /**
      * \brief Destructor
      */
diff --git a/HeterogeneousCore/CUDAUtilities/src/deviceAllocatorStatus.cc b/HeterogeneousCore/CUDAUtilities/src/deviceAllocatorStatus.cc
new file mode 100644
index 0000000000000..75111bd8fa548
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/deviceAllocatorStatus.cc
@@ -0,0 +1,7 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"
+
+#include "getCachingDeviceAllocator.h"
+
+namespace cms::cuda {
+  allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); }
+}  // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
index 8158f414b07d4..3770dbac574d9 100644
--- a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
+++ b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
@@ -13,11 +13,11 @@ namespace cms::cuda::allocator {
   // Use caching or not
   constexpr bool useCaching = true;
   // Growth factor (bin_growth in cub::CachingDeviceAllocator)
-  constexpr unsigned int binGrowth = 8;
+  constexpr unsigned int binGrowth = 2;
   // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator)
-  constexpr unsigned int minBin = 1;
+  constexpr unsigned int minBin = 8;
   // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
-  constexpr unsigned int maxBin = 10;
+  constexpr unsigned int maxBin = 30;
   // Total storage for the allocator. 0 means no limit.
   constexpr size_t maxCachedBytes = 0;
   // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smaller of the two limits is used.