diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index 2f9e8381615ec..c116dd5d164ed 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -442,7 +442,7 @@ void *CUDAService::allocate_device(int dev, size_t nbytes, cuda::stream_t<>& str
 }
 
 void CUDAService::free_device(int device, void *ptr) {
-  allocator_->deviceAllocator.DeviceFree(device, ptr);
+  cuda::throw_if_error(allocator_->deviceAllocator.DeviceFree(device, ptr));  // Forward DeviceFree's status to cuda::throw_if_error instead of discarding it. NOTE(review): confirm no caller reaches free_device from a destructor or noexcept deleter, where a thrown error would terminate.
 }
 
 void *CUDAService::allocate_host(size_t nbytes, cuda::stream_t<>& stream) {
@@ -456,5 +456,5 @@ void *CUDAService::allocate_host(size_t nbytes, cuda::stream_t<>& stream) {
 }
 
 void CUDAService::free_host(void *ptr) {
-  allocator_->hostAllocator.HostFree(ptr);
+  cuda::throw_if_error(allocator_->hostAllocator.HostFree(ptr));  // Forward HostFree's status to cuda::throw_if_error instead of discarding it. NOTE(review): confirm no caller reaches free_host from a destructor or noexcept deleter, where a thrown error would terminate.
 }
diff --git a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h
index 60cdf8bbbb617..05e8e551404fb 100644
--- a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h
+++ b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h
@@ -395,14 +395,23 @@ struct CachingHostAllocator
                     found = true;
                     search_key = *block_itr;
                     search_key.associated_stream = active_stream;
+                    if(search_key.device != device) {
+                      // If "associated" device changes, need to re-create the event on the right device
+                      if (CubDebug(error = cudaSetDevice(search_key.device))) return error;
+                      if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
+                      if (CubDebug(error = cudaSetDevice(device))) return error;
+                      if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) return error;
+                      search_key.device = device;
+                    }
+
                     live_blocks.insert(search_key);
 
                     // Remove from free blocks
                     cached_bytes.free -= search_key.bytes;
                     cached_bytes.live += search_key.bytes;
 
-                    if (debug) _CubLog("\tHost reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
-                        search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long)  block_itr->associated_stream);
+                    if (debug) _CubLog("\tHost reused cached block at %p (%lld bytes) for stream %lld on device %lld (previously associated with stream %lld).\n",
+                        search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.device, (long long)  block_itr->associated_stream);
 
                     cached_blocks.erase(block_itr);
 
@@ -423,8 +432,8 @@ struct CachingHostAllocator
             if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == cudaErrorMemoryAllocation)
             {
                 // The allocation attempt failed: free all cached blocks on device and retry
-                if (debug) _CubLog("\tHost failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
-                      (long long) search_key.bytes, (long long) search_key.associated_stream);
+                if (debug) _CubLog("\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached allocations",
+                      (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.device);
 
                 error = cudaSuccess;    // Reset the error we will return
                 cudaGetLastError();     // Reset CUDART's error
@@ -476,8 +485,8 @@ struct CachingHostAllocator
             cached_bytes.live += search_key.bytes;
             mutex.Unlock();
 
-            if (debug) _CubLog("\tHost allocated new host block at %p (%lld bytes associated with stream %lld).\n",
-                      search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
+            if (debug) _CubLog("\tHost allocated new host block at %p (%lld bytes associated with stream %lld on device %lld).\n",
+                      search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.device);
         }
 
         // Copy host pointer to output parameter
@@ -523,8 +532,8 @@ struct CachingHostAllocator
                 cached_blocks.insert(search_key);
                 cached_bytes.free += search_key.bytes;
 
-                if (debug) _CubLog("\tHost returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-                    (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
+                if (debug) _CubLog("\tHost returned %lld bytes from associated stream %lld on device %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                    (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.device, (long long) cached_blocks.size(),
                     (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live);
             }
         }
@@ -547,8 +556,8 @@ struct CachingHostAllocator
             if (CubDebug(error = cudaFreeHost(d_ptr))) return error;
             if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
 
-            if (debug) _CubLog("\tHost freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live);
+            if (debug) _CubLog("\tHost freed %lld bytes from associated stream %lld on device %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.device, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live);
         }
 
         // Reset device