diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index c116dd5d164ed..808a6fe7d690f 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -5,8 +5,6 @@
 #include <cuda.h>
 #include <cuda/api_wrappers.h>
 
-#include <cub/util_allocator.cuh>
-
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
@@ -14,6 +12,7 @@
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
+#include "CachingDeviceAllocator.h"
 #include "CachingHostAllocator.h"
 
 void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
@@ -311,7 +310,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
       << "  max bin    " << maxBin << "\n"
       << "  resulting bins:\n";
   for (auto bin = minBin; bin <= maxBin; ++bin) {
-    auto binSize = cub::CachingDeviceAllocator::IntPow(binGrowth, bin);
+    auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
     if (binSize >= (1<<30) and binSize % (1<<30) == 0) {
       log << "    " << std::setw(8) << (binSize >> 30) << " GB\n";
     } else if (binSize >= (1<<20) and binSize % (1<<20) == 0) {
@@ -324,7 +323,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
   }
   log << "  maximum amount of cached memory: " << (minCachedBytes >> 20) << " MB\n";
 
-  allocator_ = std::make_unique<Allocator>(cub::CachingDeviceAllocator::IntPow(binGrowth, maxBin),
+  allocator_ = std::make_unique<Allocator>(notcub::CachingDeviceAllocator::IntPow(binGrowth, maxBin),
                                            binGrowth, minBin, maxBin, minCachedBytes,
                                            false, // do not skip cleanup
                                            debug
@@ -427,8 +426,8 @@ struct CUDAService::Allocator {
   void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
 
   size_t maxAllocation;
-  cub::CachingDeviceAllocator deviceAllocator;
-  cub::CachingHostAllocator hostAllocator;
+  notcub::CachingDeviceAllocator deviceAllocator;
+  notcub::CachingHostAllocator hostAllocator;
 };
 
 void *CUDAService::allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream) {
diff --git a/HeterogeneousCore/CUDAServices/src/CachingDeviceAllocator.h b/HeterogeneousCore/CUDAServices/src/CachingDeviceAllocator.h
new file mode 100644
index 0000000000000..2dc2645a8d013
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/src/CachingDeviceAllocator.h
@@ -0,0 +1,711 @@
+#ifndef HeterogenousCore_CUDAServices_src_CachingDeviceAllocator_h
+#define HeterogenousCore_CUDAServices_src_CachingDeviceAllocator_h
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * Forked to CMSSW by Matti Kortelainen
+ */ 
+
+/******************************************************************************
+ * Simple caching allocator for device memory allocations. The allocator is
+ * thread-safe and capable of managing device allocations on multiple devices.
+ ******************************************************************************/
+
+#include <cub/util_debug.cuh>
+
+#include <set>
+#include <map>
+
+#include <cub/host/mutex.cuh>
+#include <math.h>
+
+/// notcub namespace (CMSSW fork of the CUB caching allocator)
+namespace notcub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+
+/******************************************************************************
+ * CachingDeviceAllocator (host use)
+ ******************************************************************************/
+
+/**
+ * \brief A simple caching allocator for device memory allocations.
+ *
+ * \par Overview
+ * The allocator is thread-safe and stream-safe and is capable of managing cached
+ * device allocations on multiple devices.  It behaves as follows:
+ *
+ * \par
+ * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
+ *   the allocation becomes available immediately for reuse within the \p active_stream
+ *   with which it was associated with during allocation, and it becomes available for
+ *   reuse within other streams when all prior work submitted to \p active_stream has completed.
+ * - Allocations are categorized and cached by bin size.  A new allocation request of
+ *   a given size will only consider cached allocations within the corresponding bin.
+ * - Bin limits progress geometrically in accordance with the growth factor
+ *   \p bin_growth provided during construction.  Unused device allocations within
+ *   a larger bin cache are not reused for allocation requests that categorize to
+ *   smaller bin sizes.
+ * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
+ *   (\p bin_growth ^ \p min_bin).
+ * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
+ *   bin and are simply freed when they are deallocated instead of being returned
+ *   to a bin-cache.
+ * - %If the total storage of cached allocations on a given device will exceed
+ *   \p max_cached_bytes, allocations for that device are simply freed when they are
+ *   deallocated instead of being returned to their bin-cache.
+ *
+ * \par
+ * For example, the default-constructed CachingDeviceAllocator is configured with:
+ * - \p bin_growth          = 8
+ * - \p min_bin             = 3
+ * - \p max_bin             = 7
+ * - \p max_cached_bytes    = 6MB - 1B
+ *
+ * \par
+ * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
+ * and sets a maximum of 6,291,455 cached bytes per device
+ *
+ */
+struct CachingDeviceAllocator
+{
+
+    //---------------------------------------------------------------------
+    // Constants
+    //---------------------------------------------------------------------
+
+    /// Out-of-bounds bin
+    static const unsigned int INVALID_BIN = (unsigned int) -1;
+
+    /// Invalid size
+    static const size_t INVALID_SIZE = (size_t) -1;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Invalid device ordinal
+    static const int INVALID_DEVICE_ORDINAL = -1;
+
+    //---------------------------------------------------------------------
+    // Type definitions and helper types
+    //---------------------------------------------------------------------
+
+    /**
+     * Bookkeeping record describing one device memory allocation
+     */
+    struct BlockDescriptor
+    {
+        void*           d_ptr;              // Address of the allocation
+        size_t          bytes;              // Allocation size in bytes
+        unsigned int    bin;                // Bin index (INVALID_BIN until assigned)
+        int             device;             // Device ordinal
+        cudaStream_t    associated_stream;  // Stream this block is associated with
+        cudaEvent_t     ready_event;        // Signals when the associated stream has run to the point at which this block was freed
+
+        // Lookup key for one specific block: only the pointer and the device are meaningful
+        BlockDescriptor(void *d_ptr, int device) :
+            d_ptr(d_ptr),
+            bytes(0),
+            bin(INVALID_BIN),
+            device(device),
+            associated_stream(0),
+            ready_event(0)
+        {}
+
+        // Lookup key for a range of blocks on a device: no pointer, zero size
+        BlockDescriptor(int device) :
+            d_ptr(NULL),
+            bytes(0),
+            bin(INVALID_BIN),
+            device(device),
+            associated_stream(0),
+            ready_event(0)
+        {}
+
+        // Ordering by (device, pointer)
+        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            // Blocks on different devices are ordered by device ordinal alone
+            const bool same_device = (a.device == b.device);
+            return same_device ? (a.d_ptr < b.d_ptr)
+                               : (a.device < b.device);
+        }
+
+        // Ordering by (device, allocation size)
+        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            // Blocks on different devices are ordered by device ordinal alone
+            const bool same_device = (a.device == b.device);
+            return same_device ? (a.bytes < b.bytes)
+                               : (a.device < b.device);
+        }
+    };
+
+    /// BlockDescriptor comparator function interface
+    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+
+    class TotalBytes {  // Per-device byte counters, both starting at zero
+    public:
+        size_t free;    // bytes held by cached (reusable) blocks
+        size_t live;    // bytes held by blocks currently handed out
+        TotalBytes() : free(0), live(0) {}
+    };
+
+    /// Set type for cached blocks (ordered by size)
+    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+
+    /// Set type for live blocks (ordered by ptr)
+    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+    /// Map type of device ordinals to the number of cached bytes cached by each device
+    typedef std::map<int, TotalBytes> GpuCachedBytes;
+
+
+    //---------------------------------------------------------------------
+    // Utility functions
+    //---------------------------------------------------------------------
+
+    /**
+     * Computes base^exp by repeated squaring (unsigned int arithmetic, so the result wraps on overflow)
+     */
+    static unsigned int IntPow(
+        unsigned int base,
+        unsigned int exp)
+    {
+        unsigned int result = 1;
+        for (; exp != 0; exp >>= 1)
+        {
+            // Fold the current power of base into the result when the low exponent bit is set
+            if ((exp & 1u) != 0)
+                result *= base;
+            // Square the base for the next (more significant) exponent bit
+            base *= base;
+        }
+        return result;
+    }
+
+
+    /**
+     * Finds the smallest power of \p base that is >= \p value; outputs the exponent (\p power) and that power (\p rounded_bytes)
+     */
+    void NearestPowerOf(
+        unsigned int    &power,
+        size_t          &rounded_bytes,
+        unsigned int    base,
+        size_t          value)
+    {
+        // A wrapped value * base is treated as overflow: report a saturated result
+        if (value * base < value)
+        {
+            power = sizeof(size_t) * 8;
+            rounded_bytes = size_t(0) - 1;
+            return;
+        }
+
+        // Start from base^0 and multiply up until the power reaches value
+        power = 0;
+        rounded_bytes = 1;
+        while (rounded_bytes < value)
+        {
+            rounded_bytes *= base;
+            ++power;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Fields
+    //---------------------------------------------------------------------
+
+    cub::Mutex      mutex;              ///< Mutex for thread-safety
+
+    unsigned int    bin_growth;         ///< Geometric growth factor for bin-sizes
+    unsigned int    min_bin;            ///< Minimum bin enumeration
+    unsigned int    max_bin;            ///< Maximum bin enumeration
+
+    size_t          min_bin_bytes;      ///< Minimum bin size (bin_growth ^ min_bin)
+    size_t          max_bin_bytes;      ///< Maximum bin size (bin_growth ^ max_bin)
+    size_t          max_cached_bytes;   ///< Maximum aggregate cached bytes per device
+
+    const bool      skip_cleanup;       ///< Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
+    bool            debug;              ///< Whether or not to print (de)allocation events to stdout
+
+    GpuCachedBytes  cached_bytes;       ///< Map of device ordinal to aggregate cached (free) and live bytes on that device
+    CachedBlocks    cached_blocks;      ///< Set of cached device allocations available for reuse
+    BusyBlocks      live_blocks;        ///< Set of live device allocations currently in use
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    //---------------------------------------------------------------------
+    // Methods
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Constructor taking an explicit bin configuration; the bin byte limits are derived as bin_growth ^ min_bin and bin_growth ^ max_bin.
+     */
+    CachingDeviceAllocator(
+        unsigned int    bin_growth,                             ///< Geometric growth factor for bin-sizes
+        unsigned int    min_bin             = 1,                ///< Minimum bin (default is bin_growth ^ 1)
+        unsigned int    max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
+        size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
+        bool            skip_cleanup        = false,            ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
+        bool            debug               = false)            ///< Whether or not to print (de)allocation events to stdout (default is no output)
+    :
+        bin_growth(bin_growth),
+        min_bin(min_bin),
+        max_bin(max_bin),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),     // NOTE(review): IntPow works in unsigned int, so this wraps for large max_bin (including the INVALID_BIN default) -- confirm nothing relies on max_bin_bytes in that case
+        max_cached_bytes(max_cached_bytes),
+        skip_cleanup(skip_cleanup),
+        debug(debug),
+        cached_blocks(BlockDescriptor::SizeCompare),    // cached blocks are ordered by (device, size)
+        live_blocks(BlockDescriptor::PtrCompare)        // live blocks are ordered by (device, pointer)
+    {}
+
+
+    /**
+     * \brief Default constructor.
+     *
+     * Configured with:
+     * \par
+     * - \p bin_growth          = 8
+     * - \p min_bin             = 3
+     * - \p max_bin             = 7
+     * - \p max_cached_bytes    = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
+     *
+     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
+     * sets a maximum of 6,291,455 cached bytes per device
+     */
+    CachingDeviceAllocator(
+        bool skip_cleanup = false,
+        bool debug = false)
+    :
+        bin_growth(8),
+        min_bin(3),
+        max_bin(7),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),     // reads the members initialized just above (no parameters shadow them here)
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes((max_bin_bytes * 3) - 1),      // relies on max_bin_bytes being declared, and therefore initialized, before max_cached_bytes
+        skip_cleanup(skip_cleanup),
+        debug(debug),
+        cached_blocks(BlockDescriptor::SizeCompare),    // cached blocks are ordered by (device, size)
+        live_blocks(BlockDescriptor::PtrCompare)        // live blocks are ordered by (device, pointer)
+    {}
+
+
+    /**
+     * \brief Updates the per-device ceiling on the number of bytes this allocator may keep cached.
+     *
+     * Only the limit itself is changed (under the allocator mutex): no allocation, in-use or
+     * cached-in-reserve, is freed by this call.  See \p FreeAllCached().  Always returns cudaSuccess.
+     */
+    cudaError_t SetMaxCachedBytes(
+        size_t max_cached_bytes)
+    {
+        mutex.Lock();
+        if (debug)
+        {
+            const long long previous_limit  = (long long) this->max_cached_bytes;
+            const long long requested_limit = (long long) max_cached_bytes;
+            _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", previous_limit, requested_limit);
+        }
+        this->max_cached_bytes = max_cached_bytes;
+        mutex.Unlock();
+
+        return cudaSuccess;
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
+     *
+     * A cached block of the matching bin is reused when it belongs to \p active_stream or when its ready event is no longer
+     * pending; otherwise memory comes from cudaMalloc, retried once after freeing this device's cached blocks on cudaErrorMemoryAllocation.
+     * \return cudaSuccess with the pointer stored in \p *d_ptr, or the CUDA error that stopped the allocation (\p *d_ptr is then NULL).
+     */
+    cudaError_t DeviceAllocate(
+        int             device,             ///< [in] Device on which to place the allocation
+        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
+        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
+        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
+    {
+        *d_ptr                          = NULL;
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        if (device == INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+            device = entrypoint_device;
+        }
+
+        // Create a block descriptor for the requested allocation
+        bool found = false;
+        BlockDescriptor search_key(device);
+        search_key.associated_stream = active_stream;
+        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
+
+        if (search_key.bin > max_bin)
+        {
+            // Bin is greater than our maximum bin: allocate the request
+            // exactly and give out-of-bounds bin.  It will not be cached
+            // for reuse when returned.
+            search_key.bin      = INVALID_BIN;
+            search_key.bytes    = bytes;
+        }
+        else
+        {
+            // Search for a suitable cached allocation: lock
+            mutex.Lock();
+
+            if (search_key.bin < min_bin)
+            {
+                // Bin is less than minimum bin: round up
+                search_key.bin      = min_bin;
+                search_key.bytes    = min_bin_bytes;
+            }
+
+            // Iterate through the range of cached blocks on the same device in the same bin
+            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
+            while ((block_itr != cached_blocks.end())
+                    && (block_itr->device == device)
+                    && (block_itr->bin == search_key.bin))
+            {
+                // To prevent races with reusing blocks returned by the host but still
+                // in use by the device, only consider cached blocks that are
+                // either (from the active stream) or (from an idle stream)
+                if ((active_stream == block_itr->associated_stream) ||
+                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
+                {
+                    // Reuse existing cache block.  Insert into live blocks.
+                    found = true;
+                    search_key = *block_itr;
+                    search_key.associated_stream = active_stream;
+                    live_blocks.insert(search_key);
+
+                    // Remove from free blocks
+                    cached_bytes[device].free -= search_key.bytes;
+                    cached_bytes[device].live += search_key.bytes;
+
+                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
+                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long)  block_itr->associated_stream);
+
+                    cached_blocks.erase(block_itr);
+
+                    break;
+                }
+                block_itr++;
+            }
+
+            // Done searching: unlock
+            mutex.Unlock();
+        }
+
+        // Allocate the block if necessary
+        if (!found)
+        {
+            // Set runtime's current device to specified device (entrypoint may not be set)
+            if (device != entrypoint_device)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+                if (CubDebug(error = cudaSetDevice(device))) return error;
+            }
+
+            // Attempt to allocate
+            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
+            {
+                // The allocation attempt failed: free all cached blocks on device and retry
+                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
+                      device, (long long) search_key.bytes, (long long) search_key.associated_stream);
+
+                error = cudaSuccess;    // Reset the error we will return
+                cudaGetLastError();     // Reset CUDART's error
+
+                // Lock
+                mutex.Lock();
+
+                // Iterate the range of free blocks on the same device
+                BlockDescriptor free_key(device);
+                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
+
+                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
+                {
+                    // No need to worry about synchronization with the device: cudaFree is
+                    // blocking and will synchronize across all kernels executing
+                    // on the current device
+
+                    // Free device memory and destroy stream event.
+                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
+                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;
+
+                    // Reduce balance and erase entry
+                    cached_bytes[device].free -= block_itr->bytes;
+
+                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                        device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+
+                    // erase() invalidates block_itr, so continue from the iterator it
+                    // returns instead of incrementing the erased one
+                    block_itr = cached_blocks.erase(block_itr);
+                }
+
+                // Unlock
+                mutex.Unlock();
+
+                // Return under error
+                if (error) return error;
+
+                // Try to allocate again
+                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
+            }
+            else if (error) return error;       // any other cudaMalloc failure: bail out instead of registering a NULL block
+            // Create ready event
+            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
+            {   (void) cudaFree(search_key.d_ptr); return error;   }    // do not leak the fresh allocation
+
+            // Insert into live blocks
+            mutex.Lock();
+            live_blocks.insert(search_key);
+            cached_bytes[device].live += search_key.bytes;
+            mutex.Unlock();
+
+            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
+                      device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
+
+            // Attempt to revert back to previous device if necessary
+            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
+            {
+                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+            }
+        }
+
+        // Copy device pointer to output parameter
+        *d_ptr = search_key.d_ptr;
+
+        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
+            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+
+        return error;
+    }
+
+
+    /**
+     * \brief Allocates device memory on the current device (forwards with INVALID_DEVICE_ORDINAL as the device).
+     *
+     * Reuse semantics are those of the device-taking overload: a freed block is immediately reusable within
+     * its \p active_stream, and by other streams once all prior work submitted to that stream has completed.
+     */
+    cudaError_t DeviceAllocate(
+        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
+        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
+        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
+    {
+        const int current_device_request = INVALID_DEVICE_ORDINAL;
+        return DeviceAllocate(current_device_request, d_ptr, bytes, active_stream);
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
+     *
+     * The block is kept in the cache (and its ready event recorded on the associated stream) when its bin is valid and the
+     * per-device cache limit is not exceeded; otherwise it is released with cudaFree and its event is destroyed.
+     * \return cudaSuccess, or the first CUDA error encountered; the allocator mutex is released on every return path.
+     */
+    cudaError_t DeviceFree(
+        int             device,
+        void*           d_ptr)
+    {
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        if (device == INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
+                return error;
+            device = entrypoint_device;
+        }
+
+        // Lock
+        mutex.Lock();
+
+        // Find corresponding block descriptor
+        bool recached = false;
+        BlockDescriptor search_key(d_ptr, device);
+        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
+        if (block_itr != live_blocks.end())
+        {
+            // Remove from live blocks
+            search_key = *block_itr;
+            live_blocks.erase(block_itr);
+            cached_bytes[device].live -= search_key.bytes;
+
+            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
+            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
+            {
+                // Insert returned allocation into free blocks
+                recached = true;
+                cached_blocks.insert(search_key);
+                cached_bytes[device].free += search_key.bytes;
+
+                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
+                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+            }
+        }
+
+        // First set to specified device (entrypoint may not be set); the mutex is still held, so unlock before bailing out
+        if (device != entrypoint_device)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) { mutex.Unlock(); return error; }
+            if (CubDebug(error = cudaSetDevice(device))) { mutex.Unlock(); return error; }
+        }
+
+        if (recached)
+        {
+            // Insert the ready event in the associated stream (must have current device set properly)
+            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) { mutex.Unlock(); return error; }
+        }
+
+        // Unlock
+        mutex.Unlock();
+
+        if (!recached)
+        {
+            // Not recached: release the memory and its event.  NOTE(review): for a pointer that was never tracked, ready_event is still 0 here -- confirm cudaEventDestroy(0) is acceptable
+            if (CubDebug(error = cudaFree(d_ptr))) return error;
+            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
+
+            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+        }
+
+        // Reset device
+        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+    }
+
+
+    /**
+     * \brief Frees a live allocation on the current device (forwards with INVALID_DEVICE_ORDINAL as the device).
+     *
+     * Reuse semantics are those of the device-taking overload: the block is immediately reusable within its
+     * associated stream, and by other streams once all prior work submitted to that stream has completed.
+     */
+    cudaError_t DeviceFree(
+        void*           d_ptr)
+    {
+        const int current_device_request = INVALID_DEVICE_ORDINAL;
+        return DeviceFree(current_device_request, d_ptr);
+    }
+
+
+    /**
+     * \brief Frees all cached device allocations on all devices; returns the first CUDA error encountered (cudaSuccess if none)
+     */
+    cudaError_t FreeAllCached()
+    {
+        cudaError_t error         = cudaSuccess;
+        int entrypoint_device     = INVALID_DEVICE_ORDINAL;
+        int current_device        = INVALID_DEVICE_ORDINAL;
+
+        mutex.Lock();
+
+        while (!cached_blocks.empty())
+        {
+            // Get first block
+            CachedBlocks::iterator begin = cached_blocks.begin();
+
+            // Get entry-point device ordinal if necessary
+            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+            }
+
+            // Set current device ordinal if necessary
+            if (begin->device != current_device)
+            {
+                if (CubDebug(error = cudaSetDevice(begin->device))) break;
+                current_device = begin->device;
+            }
+
+            // Free device memory
+            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
+            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;
+
+            // Reduce balance and erase entry
+            cached_bytes[current_device].free -= begin->bytes;
+
+            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);
+
+            cached_blocks.erase(begin);
+        }
+
+        mutex.Unlock();
+
+        // Attempt to revert back to entry-point device if necessary, without masking an error from the loop above
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            const cudaError_t restore_error = CubDebug(cudaSetDevice(entrypoint_device));
+            if (error == cudaSuccess) error = restore_error;
+        }
+        return error;
+    }
+
+
+    /**
+     * \brief Destructor: releases every cached block unless \p skip_cleanup was requested
+     */
+    virtual ~CachingDeviceAllocator()
+    {
+        if (skip_cleanup) return;
+        FreeAllCached();
+    }
+
+};
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // notcub namespace
+
+#endif
diff --git a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h
index d22d7d960c3d9..abd12b1864c86 100644
--- a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h
+++ b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h
@@ -1,3 +1,6 @@
+#ifndef HeterogenousCore_CUDAServices_src_CachingHostAllocator_h
+#define HeterogenousCore_CUDAServices_src_CachingHostAllocator_h
+
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
@@ -35,21 +38,15 @@
  * thread-safe.
  ******************************************************************************/
 
-#pragma once
-
 #include <set>
 #include <map>
 #include <cmath>
 
-#include <cub/util_namespace.cuh>
 #include <cub/util_debug.cuh>
 #include <cub/host/mutex.cuh>
 
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
 /// CUB namespace
-namespace cub {
+namespace notcub {
 
 
 /**
@@ -642,4 +639,5 @@ struct CachingHostAllocator
 /** @} */       // end group UtilMgmt
 
 }               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+#endif