diff --git a/CUDADataFormats/Common/interface/device_unique_ptr.h b/CUDADataFormats/Common/interface/device_unique_ptr.h new file mode 100644 index 0000000000000..1282c52125fa6 --- /dev/null +++ b/CUDADataFormats/Common/interface/device_unique_ptr.h @@ -0,0 +1,16 @@
+#ifndef CUDADataFormats_Common_interface_device_unique_ptr_h
+#define CUDADataFormats_Common_interface_device_unique_ptr_h
+
+#include <functional>
+#include <memory>
+
+namespace edm {
+  namespace cuda {
+    namespace device {
+      template <typename T>
+      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
+    }
+  }
+}
+
+#endif
diff --git a/CUDADataFormats/Common/interface/host_unique_ptr.h b/CUDADataFormats/Common/interface/host_unique_ptr.h new file mode 100644 index 0000000000000..c945d9b0aa027 --- /dev/null +++ b/CUDADataFormats/Common/interface/host_unique_ptr.h @@ -0,0 +1,16 @@
+#ifndef CUDADataFormats_Common_interface_host_unique_ptr_h
+#define CUDADataFormats_Common_interface_host_unique_ptr_h
+
+#include <functional>
+#include <memory>
+
+namespace edm {
+  namespace cuda {
+    namespace host {
+      template <typename T>
+      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
+    }
+  }
+}
+
+#endif
diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml new file mode 100644 index 0000000000000..21c527e7b2f0d --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + +
diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h new file mode 100644 index 0000000000000..22d9ff9d103ba --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -0,0 +1,73 @@
+#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
+#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
+
+#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
+
+#include <cuda/api_wrappers.h>
+
+class SiPixelClustersCUDA {
+public:
+  SiPixelClustersCUDA() = default;
+  explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream);
+  ~SiPixelClustersCUDA() = default;
+
+  SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete;
+  SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete;
+  SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default;
+  SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default;
+
+  uint32_t *moduleStart() { return moduleStart_d.get(); }
+  int32_t *clus() { return clus_d.get(); }
+  uint32_t *clusInModule() { return clusInModule_d.get(); }
+  uint32_t *moduleId() { return moduleId_d.get(); }
+  uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }
+
+  uint32_t const *moduleStart() const { return moduleStart_d.get(); }
+  int32_t const *clus() const { return clus_d.get(); }
+  uint32_t const *clusInModule() const { return clusInModule_d.get(); }
+  uint32_t const *moduleId() const { return moduleId_d.get(); }
+  uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }
+
+  uint32_t const *c_moduleStart() const { return moduleStart_d.get(); }
+  int32_t const *c_clus() const { return clus_d.get(); }
+  uint32_t const *c_clusInModule() const { return clusInModule_d.get(); }
+  uint32_t const *c_moduleId() const { return moduleId_d.get(); }
+  uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); }
+
+  class DeviceConstView {
+  public:
+    DeviceConstView() = default;
+
+#ifdef __CUDACC__
+    __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); }
+    __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); }
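+    // note: __ldg() reads through the read-only data cache; these accessors assume the SoA buffers are not modified by the same kernel that reads them through this view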
+    __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); }
+    __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); }
+    __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); }
+#endif
+
+    friend SiPixelClustersCUDA;
+
+  private:
+    uint32_t const *moduleStart_ = nullptr;
+    int32_t const *clus_ = nullptr;
+    uint32_t const *clusInModule_ = nullptr;
+    uint32_t const *moduleId_ = nullptr;
+    uint32_t const *clusModuleStart_ = nullptr;
+  };
+
+  DeviceConstView *view() const { return view_d.get(); }
+
+private:
+  edm::cuda::device::unique_ptr<uint32_t[]> moduleStart_d;   // index of the first pixel of each module
+  edm::cuda::device::unique_ptr<int32_t[]> clus_d;           // cluster id of each pixel
+  edm::cuda::device::unique_ptr<uint32_t[]> clusInModule_d;  // number of clusters found in each module
+  edm::cuda::device::unique_ptr<uint32_t[]> moduleId_d;      // module id of each module
+
+  // originally from rechits
+  edm::cuda::device::unique_ptr<uint32_t[]> clusModuleStart_d;
+
+  edm::cuda::device::unique_ptr<DeviceConstView> view_d;     // "me" pointer
+};
+
+#endif
diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc new file mode 100644 index 0000000000000..7363c2fd364af --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -0,0 +1,24 @@
+#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) {
+  edm::Service<CUDAService> cs;
+
+  moduleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
+  clus_d = cs->make_device_unique< int32_t[]>(feds, stream);
+  clusInModule_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
+  moduleId_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
+  clusModuleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
+
+  auto view = cs->make_host_unique<DeviceConstView>(stream);
+  view->moduleStart_ = moduleStart_d.get();
+  view->clus_ = clus_d.get();
+  view->clusInModule_ = clusInModule_d.get();
+  view->moduleId_ = moduleId_d.get();
+  view->clusModuleStart_ = clusModuleStart_d.get();
+
+  view_d = cs->make_device_unique<DeviceConstView>(stream);
+  cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
+}
diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml new file mode 100644 index 0000000000000..259aa9f08d054 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + +
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h new file mode 100644 index 0000000000000..25e8b54a743c2 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -0,0 +1,65 @@
+#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
+#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
+
+#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
+#include "FWCore/Utilities/interface/propagate_const.h"
+
+#include <cuda/api_wrappers.h>
+
+class SiPixelDigisCUDA {
+public:
+  SiPixelDigisCUDA() = default;
+  explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream);
+  ~SiPixelDigisCUDA() = default;
+
+  SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete;
+  SiPixelDigisCUDA& operator=(const SiPixelDigisCUDA&) = delete;
+  SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default;
+  SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default;
+
+  uint16_t * xx() { return xx_d.get(); }
+  uint16_t * yy() { return yy_d.get(); }
+  uint16_t * adc() { return adc_d.get(); }
+  uint16_t * moduleInd() { return moduleInd_d.get(); }
+
+  uint16_t const *xx() const { return xx_d.get(); }
+  uint16_t const *yy() const { return yy_d.get(); }
+  uint16_t const *adc() const { return adc_d.get(); }
+  uint16_t const *moduleInd() const { return moduleInd_d.get(); }
+
+  uint16_t const *c_xx() const { return xx_d.get(); }
+  uint16_t const *c_yy() const { return yy_d.get(); }
+  uint16_t const *c_adc() const { return adc_d.get(); }
+  uint16_t const *c_moduleInd() const { return moduleInd_d.get(); }
+
+  class DeviceConstView {
+  public:
+    DeviceConstView() = default;
+
+#ifdef __CUDACC__
+    __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_+i); }
+    __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); }
+    __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); }
+    __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); }
+#endif
+
+    friend class SiPixelDigisCUDA;
+
+  private:
+    uint16_t const *xx_ = nullptr;
+    uint16_t const *yy_ = nullptr;
+    uint16_t const *adc_ = nullptr;
+    uint16_t const *moduleInd_ = nullptr;
+  };
+
+  const DeviceConstView *view() const { return view_d.get(); }
+
+private:
+  edm::cuda::device::unique_ptr<uint16_t[]> xx_d;        // local coordinates of each pixel
+  edm::cuda::device::unique_ptr<uint16_t[]> yy_d;        //
+  edm::cuda::device::unique_ptr<uint16_t[]> adc_d;       // ADC of each pixel
+  edm::cuda::device::unique_ptr<uint16_t[]> moduleInd_d; // module id of each pixel
+  edm::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
+};
+
+#endif
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc new file mode 100644 index 0000000000000..5ba2e920e9b04 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -0,0 +1,24 @@
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include
+
+SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) {
+  edm::Service<CUDAService> cs;
+
+  xx_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
+  yy_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
+  adc_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
+  moduleInd_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
+
+  auto view = cs->make_host_unique<DeviceConstView>(stream);
+  view->xx_ = xx_d.get();
+  view->yy_ = yy_d.get();
+  view->adc_ = adc_d.get();
+  view->moduleInd_ = moduleInd_d.get();
+
+  view_d = cs->make_device_unique<DeviceConstView>(stream);
+  cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
+}
diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc index 958abf619b2ce..154c3edf80411 100644 --- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc +++ b/HeterogeneousCore/CUDACore/src/GPUCuda.cc @@ -74,15 +74,19 @@ namespace heterogeneous {
                     waitingTaskHolder, // copy needed for the catch block
                     locationSetter = iEvent.locationSetter()
                    ](cuda::stream::id_t streamId, cuda::status_t status) mutable {
-      if(status == cudaSuccess) {
+      if (status == cudaSuccess) {
         locationSetter(HeterogeneousDeviceId(HeterogeneousDevice::kGPUCuda, deviceId));
         LogTrace("GPUCuda") << " GPU kernel finished (in callback) device " << deviceId << " CUDA stream " << streamId;
         waitingTaskHolder.doneWaiting(nullptr);
-      }
-      else {
-        auto error = cudaGetErrorName(status);
-        auto message = cudaGetErrorString(status);
-        waitingTaskHolder.doneWaiting(std::make_exception_ptr(cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId << " error " << error << ": " << message));
+      } else {
+        // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+        try {
+          auto error = cudaGetErrorName(status);
+          auto message = cudaGetErrorString(status);
+          throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << deviceId << " error " << error << ": " << message;
+        } catch(...) {
+          waitingTaskHolder.doneWaiting(std::current_exception());
+        }
       }
     });
   } catch(...) {
diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml index c7232e16d910b..61572b96fb26e 100644 --- a/HeterogeneousCore/CUDAServices/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/BuildFile.xml @@ -2,8 +2,10 @@ + +
diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index e359a5018813a..feeb557042548 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -4,14 +4,35 @@
 #include
 #include
+#include <memory>
+
 #include "FWCore/Utilities/interface/StreamID.h"
+#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
+#include "CUDADataFormats/Common/interface/host_unique_ptr.h"
+
 namespace edm {
   class ParameterSet;
   class ActivityRegistry;
   class ConfigurationDescriptions;
 }
+
+namespace cudaserviceimpl {
+  template <typename T>
+  struct make_device_unique_selector { using non_array = edm::cuda::device::unique_ptr<T>; };
+  template <typename T>
+  struct make_device_unique_selector<T[]> { using unbounded_array = edm::cuda::device::unique_ptr<T[]>; };
+  template <typename T, size_t N>
+  struct make_device_unique_selector<T[N]> { struct bounded_array {}; };
+
+  template <typename T>
+  struct make_host_unique_selector { using non_array = edm::cuda::host::unique_ptr<T>; };
+  template <typename T>
+  struct make_host_unique_selector<T[]> { using unbounded_array = edm::cuda::host::unique_ptr<T[]>; };
+  template <typename T, size_t N>
+  struct make_host_unique_selector<T[N]> { struct bounded_array {}; };
+}
+
 /**
  * TODO:
  * - CUDA stream management?
@@ -47,7 +68,73 @@ class CUDAService {
   // Get the current device
   int getCurrentDevice() const;
 
+  // Allocate device memory
+  template <typename T>
+  typename cudaserviceimpl::make_device_unique_selector<T>::non_array
+  make_device_unique(cuda::stream_t<>& stream) {
+    int dev = getCurrentDevice();
+    void *mem = allocate_device(dev, sizeof(T), stream);
+    return typename cudaserviceimpl::make_device_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
+                                                                               [this, dev](void *ptr) {
+                                                                                 this->free_device(dev, ptr);
+                                                                               });
+  }
+
+  template <typename T>
+  typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array
+  make_device_unique(size_t n, cuda::stream_t<>& stream) {
+    int dev = getCurrentDevice();
+    using element_type = typename std::remove_extent<T>::type;
+    void *mem = allocate_device(dev, n*sizeof(element_type), stream);
+    return typename cudaserviceimpl::make_device_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
+                                                                                     [this, dev](void *ptr) {
+                                                                                       this->free_device(dev, ptr);
+                                                                                     });
+  }
+
+  template <typename T, typename ...Args>
+  typename cudaserviceimpl::make_device_unique_selector<T>::bounded_array
+  make_device_unique(Args&&...) = delete;
+
+  // Allocate pinned host memory
+  template <typename T>
+  typename cudaserviceimpl::make_host_unique_selector<T>::non_array
+  make_host_unique(cuda::stream_t<>& stream) {
+    void *mem = allocate_host(sizeof(T), stream);
+    return typename cudaserviceimpl::make_host_unique_selector<T>::non_array(reinterpret_cast<T *>(mem),
+                                                                             [this](void *ptr) {
+                                                                               this->free_host(ptr);
+                                                                             });
+  }
+
+  template <typename T>
+  typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array
+  make_host_unique(size_t n, cuda::stream_t<>& stream) {
+    using element_type = typename std::remove_extent<T>::type;
+    void *mem = allocate_host(n*sizeof(element_type), stream);
+    return typename cudaserviceimpl::make_host_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem),
+                                                                                   [this](void *ptr) {
+                                                                                     this->free_host(ptr);
+                                                                                   });
+  }
+
+  template <typename T, typename ...Args>
+  typename cudaserviceimpl::make_host_unique_selector<T>::bounded_array
+  make_host_unique(Args&&...) = delete;
+
+  // Free device memory (to be called from unique_ptr)
+  void free_device(int device, void *ptr);
+
+  // Free pinned host memory (to be called from unique_ptr)
+  void free_host(void *ptr);
+
 private:
+  // PIMPL to hide details of allocator
+  struct Allocator;
+  std::unique_ptr<Allocator> allocator_;
+  void *allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream);
+  void *allocate_host(size_t nbytes, cuda::stream_t<>& stream);
+
   int numberOfDevices_ = 0;
   unsigned int numberOfStreamsTotal_ = 0;
   std::vector<std::pair<int, int>> computeCapabilities_;
diff --git a/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py new file mode 100755 index 0000000000000..331ddd30f73bd --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py @@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import re
+import sys
+import argparse
+
+def main(opts):
+    device = []
+    host = []
+
+    device_re = re.compile("Device.*allocated new device block.*\((?P<bytes>\d+) bytes")
+    host_re = re.compile("Host.*allocated new host block.*\((?P<bytes>\d+) bytes")
+
+    f = open(opts.file)
+    for line in f:
+        m = device_re.search(line)
+        if m:
+            device.append(m.group("bytes"))
+            continue
+        m = host_re.search(line)
+        if m:
+            host.append(m.group("bytes"))
+    f.close()
+
+    print("process.CUDAService.allocator.devicePreallocate = cms.untracked.vuint32(%s)" % ",".join(device))
+    print("process.CUDAService.allocator.hostPreallocate = cms.untracked.vuint32(%s)" % ",".join(host))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="""Extract CUDAService preallocation parameters from a log file.
+
+To use, run the job once with "process.CUDAService.allocator.debug =
+True" and direct the output to a file.
Then run this script by passing +the file as an argument, and copy the output of this script back to +the configuration file.""") + parser.add_argument("file", type=str, help="Log file to parse") + opts = parser.parse_args() + main(opts) diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 84aebd58648f3..0defe163c8fb4 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -1,9 +1,12 @@ #include #include +#include #include #include +#include + #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" @@ -11,6 +14,8 @@ #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CachingHostAllocator.h" + void setCudaLimit(cudaLimit limit, const char* name, size_t request) { // read the current device int device; @@ -75,7 +80,40 @@ unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) { } } +namespace { + template typename UniquePtr, + typename Allocate> + void preallocate(Allocate allocate, const std::vector& bufferSizes) { + auto current_device = cuda::device::current::get(); + auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + std::vector > buffers; + buffers.reserve(bufferSizes.size()); + for(auto size: bufferSizes) { + buffers.push_back(allocate(size, stream)); + } + } + + void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector& bufferSizes) { + int device; + cudaCheck(cudaGetDevice(&device)); + for(int i=0; i([&](size_t size, cuda::stream_t<>& stream) { + return cs.make_device_unique(size, stream); + }, bufferSizes); + } + cudaCheck(cudaSetDevice(device)); + } + + void hostPreallocate(CUDAService& cs, const std::vector& bufferSizes) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { + return cs.make_host_unique(size, stream); + }, bufferSizes); + } +} +/// Constructor CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& iRegistry) { bool configEnabled = config.getUntrackedParameter("enabled"); if (not configEnabled) { @@ -245,12 +283,67 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << '\n'; } + // create allocator + auto const& allocator = config.getUntrackedParameter("allocator"); + auto binGrowth = allocator.getUntrackedParameter("binGrowth"); + auto minBin = allocator.getUntrackedParameter("minBin"); + auto maxBin = allocator.getUntrackedParameter("maxBin"); + size_t maxCachedBytes = allocator.getUntrackedParameter("maxCachedBytes"); + auto maxCachedFraction = allocator.getUntrackedParameter("maxCachedFraction"); + auto debug = allocator.getUntrackedParameter("debug"); + + size_t minCachedBytes = std::numeric_limits::max(); + int currentDevice; + cudaCheck(cudaGetDevice(¤tDevice)); + for (int i = 0; i < numberOfDevices_; ++i) { + size_t freeMemory, totalMemory; + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + minCachedBytes = std::min(minCachedBytes, static_cast(maxCachedFraction * freeMemory)); + } + cudaCheck(cudaSetDevice(currentDevice)); + if (maxCachedBytes > 0) { + minCachedBytes = std::min(minCachedBytes, maxCachedBytes); + } + log << "cub::CachingDeviceAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin 
" << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = cub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1<<30) and binSize % (1<<30) == 0) { + log << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1<<20) and binSize % (1<<20) == 0) { + log << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1<<10) and binSize % (1<<10) == 0) { + log << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + log << " " << std::setw(9) << binSize << " B\n"; + } + } + log << " maximum amount of cached memory: " << (minCachedBytes >> 20) << " MB\n"; + + allocator_ = std::make_unique(cub::CachingDeviceAllocator::IntPow(binGrowth, maxBin), + binGrowth, minBin, maxBin, minCachedBytes, + false, // do not skip cleanup + debug + ); + log << "\n"; + log << "CUDAService fully initialized"; enabled_ = true; + + // Preallocate buffers if asked to + devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter >("devicePreallocate")); + hostPreallocate(*this, allocator.getUntrackedParameter >("hostPreallocate")); } CUDAService::~CUDAService() { if (enabled_) { + // Explicitly destruct the allocator before the device resets below + allocator_.reset(); + for (int i = 0; i < numberOfDevices_; ++i) { cudaCheck(cudaSetDevice(i)); cudaCheck(cudaDeviceSynchronize()); @@ -275,6 +368,17 @@ void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions limits.addUntracked("cudaLimitDevRuntimePendingLaunchCount", -1)->setComment("Maximum number of outstanding device runtime launches that can be made from the current device."); desc.addUntracked("limits", limits)->setComment("See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps the default value."); + edm::ParameterSetDescription allocator; + allocator.addUntracked("binGrowth", 8)->setComment("Growth factor (bin_growth in cub::CachingDeviceAllocator"); + allocator.addUntracked("minBin", 1)->setComment("Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator"); + allocator.addUntracked("maxBin", 9)->setComment("Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail."); + allocator.addUntracked("maxCachedBytes", 0)->setComment("Total storage for the allocator. 0 means no limit."); + allocator.addUntracked("maxCachedFraction", 0.8)->setComment("Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. 
If maxCachedBytes is non-zero, the smallest of them is taken."); + allocator.addUntracked("debug", false)->setComment("Enable debug prints"); + allocator.addUntracked >("devicePreallocate", std::vector{})->setComment("Preallocates buffers of given bytes on all devices"); + allocator.addUntracked >("hostPreallocate", std::vector{})->setComment("Preallocates buffers of given bytes on the host"); + desc.addUntracked("allocator", allocator)->setComment("See the documentation of cub::CachingDeviceAllocator for more details."); + descriptions.add("CUDAService", desc); } @@ -312,3 +416,45 @@ void CUDAService::setCurrentDevice(int device) const { int CUDAService::getCurrentDevice() const { return cuda::device::current::get().id(); } + + +// allocator +struct CUDAService::Allocator { + template + Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward(args)...) {} + + void devicePreallocate(int numberOfDevices, const std::vector& bytes); + void hostPreallocate(int numberOfDevices, const std::vector& bytes); + + size_t maxAllocation; + cub::CachingDeviceAllocator deviceAllocator; + cub::CachingHostAllocator hostAllocator; +}; + +void *CUDAService::allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream) { + if(nbytes > allocator_->maxAllocation) { + throw std::runtime_error("Tried to allocate "+std::to_string(nbytes)+" bytes, but the allocator maximum is "+std::to_string(allocator_->maxAllocation)); + } + + void *ptr = nullptr; + cuda::throw_if_error(allocator_->deviceAllocator.DeviceAllocate(dev, &ptr, nbytes, stream.id())); + return ptr; +} + +void CUDAService::free_device(int device, void *ptr) { + allocator_->deviceAllocator.DeviceFree(device, ptr); +} + +void *CUDAService::allocate_host(size_t nbytes, cuda::stream_t<>& stream) { + if(nbytes > allocator_->maxAllocation) { + throw std::runtime_error("Tried to allocate "+std::to_string(nbytes)+" bytes, but the allocator maximum is "+std::to_string(allocator_->maxAllocation)); + } + + void *ptr = nullptr; + cuda::throw_if_error(allocator_->hostAllocator.HostAllocate(&ptr, nbytes, stream.id())); + return ptr; +} + +void CUDAService::free_host(void *ptr) { + allocator_->hostAllocator.HostFree(ptr); +} diff --git a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h new file mode 100644 index 0000000000000..97be93c79131a --- /dev/null +++ b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h @@ -0,0 +1,635 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * Modified to cache pinned host allocations by Matti Kortelainen + */ + +/****************************************************************************** + * Simple caching allocator for pinned host memory allocations. The allocator is + * thread-safe. + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/****************************************************************************** + * CachingHostAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator pinned host memory allocations. + * + * \par Overview + * The allocator is thread-safe. It behaves as follows: + * + * I presume the CUDA stream-safeness is not useful as to read/write + * from/to the pinned host memory one needs to synchronize anyway. The + * difference wrt. device memory is that in the CPU all operations to + * the device memory are scheduled via the CUDA stream, while for the + * host memory one can perform operations directly. + * + * \par + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused host allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations will exceed + * \p max_cached_bytes, allocations are simply freed when they are + * deallocated instead of being returned to their bin-cache. 
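+ * - A single pool of cached blocks is shared by all devices: reuse is keyed only on the size bin, and the device ordinal stored in a block is used only to select the correct device when recording or destroying its ready event.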
+ * + * \par + * For example, the default-constructed CachingHostAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes + * + */ +struct CachingHostAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for pinned host memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Host pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer) + BlockDescriptor(void *d_ptr) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN), + device(INVALID_DEVICE_ORDINAL), + associated_stream(0), + ready_event(0) + {} + + // Constructor (suitable for searching maps for a range of suitable blocks) + BlockDescriptor() : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN), + device(INVALID_DEVICE_ORDINAL), + associated_stream(0), + ready_event(0) + {} + + // Comparison functor for comparing host pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.d_ptr < b.d_ptr); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + return (a.bytes < b.bytes); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; 
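+ // note: size_t(0) - 1 wraps around to SIZE_MAX, i.e. an over-large request saturates to the largest representable size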
+ return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + cub::Mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + TotalBytes cached_bytes; /// Aggregate cached bytes + CachedBlocks cached_blocks; /// Set of cached pinned host allocations available for reuse + BusyBlocks live_blocks; /// Set of live pinned host allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingHostAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes + */ + CachingHostAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). 
+ */ + void SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.Lock(); + + if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.Unlock(); + } + + + /** + * \brief Provides a suitable allocation of pinned host memory for the given size. + * + * Once freed, the allocation becomes available immediately for reuse. + */ + cudaError_t HostAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (CubDebug(error = cudaGetDevice(&device))) return error; + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key; + search_key.device = device; + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.Lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use for transfers, only consider cached blocks that are from an idle stream + if(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) { + // Reuse existing cache block. Insert into live blocks. 
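+ // (cudaEventQuery() returns cudaSuccess only once the ready_event recorded in HostFree() has completed, i.e. the stream that last used this pinned block has finished with it)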
+ found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes.free -= search_key.bytes; + cached_bytes.live += search_key.bytes; + + if (debug) _CubLog("\tHost reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.Unlock(); + } + + // Allocate the block if necessary + if (!found) + { + // Attempt to allocate + // TODO: eventually support allocation flags + if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == cudaErrorMemoryAllocation) + { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) _CubLog("\tHost failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + (long long) search_key.bytes, (long long) search_key.associated_stream); + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.Lock(); + + // Iterate the range of free blocks + CachedBlocks::iterator block_itr = cached_blocks.begin(); + + while ((block_itr != cached_blocks.end())) + { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free pinned host memory. + if (CubDebug(error = cudaFreeHost(block_itr->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes.free -= block_itr->bytes; + + if (debug) _CubLog("\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.Unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault))) return error; + } + + // Create ready event + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.Lock(); + live_blocks.insert(search_key); + cached_bytes.live += search_key.bytes; + mutex.Unlock(); + + if (debug) _CubLog("\tHost allocated new host block at %p (%lld bytes associated with stream %lld).\n", + search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + } + + // Copy host pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + + return error; + } + + + /** + * \brief Frees a live allocation of pinned host memory, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse. 
+ */ + cudaError_t HostFree( + void* d_ptr) + { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + // Lock + mutex.Lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes.live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes.free += search_key.bytes; + + if (debug) _CubLog("\tHost returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + } + } + + // Unlock + mutex.Unlock(); + + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (entrypoint_device != search_key.device) { + if (CubDebug(error = cudaSetDevice(search_key.device))) return error; + } + + if (recached) { + // Insert the ready event in the associated stream (must have current device set properly) + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + } + else + { + // Free the allocation from the runtime and cleanup the event. + if (CubDebug(error = cudaFreeHost(d_ptr))) return error; + if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + + if (debug) _CubLog("\tHost freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Frees all cached pinned host allocations + */ + cudaError_t FreeAllCached() + { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.Lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) + { + if (CubDebug(error = cudaSetDevice(begin->device))) break; + current_device = begin->device; + } + + // Free host memory + if (CubDebug(error = cudaFreeHost(begin->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes.free -= begin->bytes; + + if (debug) _CubLog("\tHost freed %lld bytes.\n\t\t %lld available blocks cached 
(%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live); + + cached_blocks.erase(begin); + } + + mutex.Unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Destructor + */ + ~CachingHostAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index dc1b01c2db9fe..d0a1afcc8203f 100644 --- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -5,6 +5,7 @@ #include #include +#include #include "catch.hpp" @@ -169,6 +170,73 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { } + SECTION("Device allocator") { + edm::ParameterSet ps; + ps.addUntrackedParameter("enabled", true); + edm::ParameterSet alloc; + alloc.addUntrackedParameter("minBin", 1U); + alloc.addUntrackedParameter("maxBin", 3U); + ps.addUntrackedParameter("allocator", alloc); + auto cs = makeCUDAService(ps, ar); + cs.setCurrentDevice(0); + auto current_device = cuda::device::current::get(); + auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + SECTION("Destructor") { + auto ptr = cs.make_device_unique(cudaStream); + REQUIRE(ptr.get() != nullptr); + cudaStream.synchronize(); + } + + SECTION("Reset") { + auto ptr = cs.make_device_unique(5, cudaStream); + REQUIRE(ptr.get() != nullptr); + cudaStream.synchronize(); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); + } + + SECTION("Allocating too much") { + auto ptr = cs.make_device_unique(512, cudaStream); + ptr.reset(); + REQUIRE_THROWS(ptr = cs.make_device_unique(513, cudaStream)); + } + } + + + SECTION("Host allocator") { + edm::ParameterSet ps; + ps.addUntrackedParameter("enabled", true); + edm::ParameterSet alloc; + alloc.addUntrackedParameter("minBin", 1U); + alloc.addUntrackedParameter("maxBin", 3U); + ps.addUntrackedParameter("allocator", alloc); + auto cs = makeCUDAService(ps, ar); + cs.setCurrentDevice(0); + auto current_device = cuda::device::current::get(); + auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + SECTION("Destructor") { + auto ptr = cs.make_host_unique(cudaStream); + REQUIRE(ptr.get() != nullptr); + } + + SECTION("Reset") { + auto ptr = cs.make_host_unique(5, cudaStream); + REQUIRE(ptr.get() != nullptr); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); + } + + SECTION("Allocating too much") { + auto ptr = cs.make_host_unique(512, cudaStream); + ptr.reset(); + REQUIRE_THROWS(ptr = cs.make_host_unique(513, cudaStream)); + } + } + //Fake the end-of-job signal. 
ar.postEndJobSignal_(); } diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h index e4a07e55bd3c9..50e1fb51949e3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h +++ b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h @@ -1,6 +1,8 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" @@ -33,7 +35,7 @@ class SiPixelFedCablingMapGPUWrapper { // operations on the device memory have completed. class ModulesToUnpack { public: - ModulesToUnpack(); + ModulesToUnpack(cuda::stream_t<>& cudaStream); ~ModulesToUnpack() = default; void fillAsync(SiPixelFedCablingMap const& cablingMap, std::set const& modules, cuda::stream_t<>& cudaStream); @@ -41,8 +43,8 @@ class SiPixelFedCablingMapGPUWrapper { const unsigned char *get() const { return modToUnpDevice.get(); } private: - cuda::memory::device::unique_ptr modToUnpDevice; - std::vector> modToUnpHost; + edm::cuda::device::unique_ptr modToUnpDevice; + edm::cuda::host::unique_ptr modToUnpHost; }; private: diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml index 9db4a46f367b3..40a489f763397 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml @@ -7,6 +7,8 @@ + + diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 7bd6eac473cc7..dc768ce8f643d 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -31,6 +31,8 @@ #include // CMSSW includes +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" @@ -48,114 +50,17 @@ namespace pixelgpudetails { // number of words for all the FEDs constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - constexpr uint32_t MAX_WORD08_SIZE = MAX_FED_WORDS * sizeof(uint8_t); - constexpr uint32_t MAX_WORD32_SIZE = MAX_FED_WORDS * sizeof(uint32_t); - constexpr uint32_t MAX_WORD16_SIZE = MAX_FED_WORDS * sizeof(uint16_t); constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize; - SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { - - cudaCheck(cudaMallocHost(&word, MAX_FED_WORDS * sizeof(unsigned int))); - cudaCheck(cudaMallocHost(&fedId_h, MAX_FED_WORDS * sizeof(unsigned char))); - - // to store the output of RawToDigi - cudaCheck(cudaMallocHost(&pdigi_h, MAX_FED_WORDS * sizeof(uint32_t))); - cudaCheck(cudaMallocHost(&rawIdArr_h, MAX_FED_WORDS * sizeof(uint32_t))); - - 
cudaCheck(cudaMallocHost(&adc_h, MAX_FED_WORDS * sizeof(uint16_t))); - cudaCheck(cudaMallocHost(&clus_h, MAX_FED_WORDS * sizeof(int32_t))); - - cudaCheck(cudaMallocHost(&error_h, vsize)); - cudaCheck(cudaMallocHost(&error_h_tmp, vsize)); - cudaCheck(cudaMallocHost(&data_h, MAX_ERROR_SIZE)); - - cudaCheck(cudaMalloc((void**) & word_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & fedId_d, MAX_WORD08_SIZE)); - cudaCheck(cudaMalloc((void**) & pdigi_d, MAX_WORD32_SIZE)); // to store thepacked digi - cudaCheck(cudaMalloc((void**) & xx_d, MAX_WORD16_SIZE)); // to store the x and y coordinate - cudaCheck(cudaMalloc((void**) & yy_d, MAX_WORD16_SIZE)); - cudaCheck(cudaMalloc((void**) & adc_d, MAX_WORD16_SIZE)); - - cudaCheck(cudaMalloc((void**) & moduleInd_d, MAX_WORD16_SIZE)); - cudaCheck(cudaMalloc((void**) & rawIdArr_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & error_d, vsize)); - cudaCheck(cudaMalloc((void**) & data_d, MAX_ERROR_SIZE)); - cudaCheck(cudaMemset(data_d, 0x00, MAX_ERROR_SIZE)); - - // for the clusterizer - cudaCheck(cudaMalloc((void**) & clus_d, MAX_WORD32_SIZE)); // cluser index in module - - using namespace gpuClustering; - cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); - cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) )); - cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) )); - - new (error_h) GPU::SimpleVector(MAX_FED_WORDS, data_h); - new (error_h_tmp) GPU::SimpleVector(MAX_FED_WORDS, data_d); - assert(error_h->size() == 0); - assert(error_h->capacity() == static_cast(MAX_FED_WORDS)); - assert(error_h_tmp->size() == 0); - assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); - - // Need these in pinned memory to be truly asynchronous - cudaCheck(cudaMallocHost(&nModulesActive, sizeof(uint32_t))); - cudaCheck(cudaMallocHost(&nClusters, sizeof(uint32_t))); - - cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct))); - gpuProduct = getProduct(); - assert(xx_d==gpuProduct.xx_d); - - cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id())); - - // originally from rechits - cudaCheck(cudaMalloc((void**) & clusModuleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); - uint32_t *tmp = nullptr; - cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); - cudaCheck(cudaMalloc(&tempScanStorage_d, tempScanStorageSize)); + SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) { + edm::Service cs; + word_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); + fedId_ = cs->make_host_unique(MAX_FED_WORDS, cudaStream); } - SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { - // free the host memory - cudaCheck(cudaFreeHost(word)); - cudaCheck(cudaFreeHost(fedId_h)); - cudaCheck(cudaFreeHost(pdigi_h)); - cudaCheck(cudaFreeHost(rawIdArr_h)); - cudaCheck(cudaFreeHost(adc_h)); - cudaCheck(cudaFreeHost(clus_h)); - cudaCheck(cudaFreeHost(error_h)); - cudaCheck(cudaFreeHost(error_h_tmp)); - cudaCheck(cudaFreeHost(data_h)); - cudaCheck(cudaFreeHost(nModulesActive)); - cudaCheck(cudaFreeHost(nClusters)); - - // free device memory used for RawToDigi on GPU - // free the GPU memory - cudaCheck(cudaFree(word_d)); - cudaCheck(cudaFree(fedId_d)); - cudaCheck(cudaFree(pdigi_d)); - cudaCheck(cudaFree(xx_d)); - cudaCheck(cudaFree(yy_d)); - cudaCheck(cudaFree(adc_d)); - cudaCheck(cudaFree(moduleInd_d)); - cudaCheck(cudaFree(rawIdArr_d)); - 
cudaCheck(cudaFree(error_d)); - cudaCheck(cudaFree(data_d)); - - // these are for the clusterizer - cudaCheck(cudaFree(moduleStart_d)); - cudaCheck(cudaFree(clus_d)); - cudaCheck(cudaFree(clusInModule_d)); - cudaCheck(cudaFree(moduleId_d)); - cudaCheck(cudaFree(gpuProduct_d)); - - // originally from rechits - cudaCheck(cudaFree(tempScanStorage_d)); - cudaCheck(cudaFree(clusModuleStart_d)); - } - - void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { - std::memcpy(word+wordCounterGPU, src, sizeof(cms_uint32_t)*length); - std::memset(fedId_h+wordCounterGPU/2, fedId - 1200, length/2); + void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { + std::memcpy(word_.get()+wordCounterGPU, src, sizeof(cms_uint32_t)*length); + std::memset(fedId_.get()+wordCounterGPU/2, fedId - 1200, length/2); } //////////////////// @@ -613,6 +518,7 @@ namespace pixelgpudetails { const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, + const WordFedAppender& wordFed, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug, @@ -620,52 +526,82 @@ namespace pixelgpudetails { { nDigis = wordCounter; - const int threadsPerBlock = 512; - const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all - - assert(0 == wordCounter%2); - // wordCounter is the total no of words in each event to be trasfered on device - cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(&fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(error_d, error_h_tmp, vsize, cudaMemcpyDefault, stream.id())); - - // Launch rawToDigi kernel - RawToDigi_kernel<<>>( - cablingMap, - modToUnp, - wordCounter, - word_d, - fedId_d, - xx_d, yy_d, adc_d, - pdigi_d, - rawIdArr_d, - moduleInd_d, - error_d, - useQualityInfo, - includeErrors, - debug); - cudaCheck(cudaGetLastError()); - - // copy data to host variable - if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(pdigi_h, pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(rawIdArr_h, rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - if (includeErrors) { - cudaCheck(cudaMemcpyAsync(error_h, error_d, vsize, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(data_h, data_d, MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); - // If we want to transfer only the minimal amount of data, we - // need a synchronization point. A single ExternalWork (of - // SiPixelRawToClusterHeterogeneous) does not help because it is - // already used to synchronize the data movement. So we'd need - // two ExternalWorks (or explicit use of TBB tasks). The - // prototype of #100 would allow this easily (as there would be - // two ExternalWorks). 
- // - //error_h->set_data(data_h); - //cudaCheck(cudaStreamSynchronize(stream.id())); - //int size = error_h->size(); - //cudaCheck(cudaMemcpyAsync(data_h, data_d, size*esize, cudaMemcpyDefault, stream.id())); + constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; + digis_d = SiPixelDigisCUDA(MAX_FED_WORDS, stream); + clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream); + + edm::Service cs; + digis_clusters_h.nModules_Clusters = cs->make_host_unique(2, stream); + + { + const int threadsPerBlock = 512; + const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all + + assert(0 == wordCounter%2); + // wordCounter is the total number of words in each event to be transferred to the device + auto word_d = cs->make_device_unique(wordCounter, stream); + auto fedId_d = cs->make_device_unique(wordCounter, stream); + + auto error_d = cs->make_device_unique>(stream); + auto data_d = cs->make_device_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id())); + auto error_h_tmp = cs->make_host_unique>(stream); + new (error_h_tmp.get()) GPU::SimpleVector(MAX_FED_WORDS, data_d.get()); // should make_host_unique() call the constructor as well? Note that even if std::make_unique does that, we can't do that in make_device_unique + assert(error_h_tmp->size() == 0); + assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); + + cudaCheck(cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(fedId_d.get(), wordFed.fedId(), wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp.get(), vsize, cudaMemcpyDefault, stream.id())); + + auto pdigi_d = cs->make_device_unique(wordCounter, stream); + auto rawIdArr_d = cs->make_device_unique(wordCounter, stream); + + // Launch rawToDigi kernel + RawToDigi_kernel<<>>( + cablingMap, + modToUnp, + wordCounter, + word_d.get(), + fedId_d.get(), + digis_d.xx(), digis_d.yy(), digis_d.adc(), + pdigi_d.get(), + rawIdArr_d.get(), + digis_d.moduleInd(), + error_d.get(), + useQualityInfo, + includeErrors, + debug); + cudaCheck(cudaGetLastError()); + + // copy data to host variable + if(transferToCPU) { + digis_clusters_h.pdigi = cs->make_host_unique(MAX_FED_WORDS, stream); + digis_clusters_h.rawIdArr = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.pdigi.get(), pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.rawIdArr.get(), rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + + if (includeErrors) { + digis_clusters_h.data = cs->make_host_unique(MAX_FED_WORDS, stream); + digis_clusters_h.error = cs->make_host_unique>(stream); + new (digis_clusters_h.error.get()) GPU::SimpleVector(MAX_FED_WORDS, digis_clusters_h.data.get()); + assert(digis_clusters_h.error->size() == 0); + assert(digis_clusters_h.error->capacity() == static_cast(MAX_FED_WORDS)); + + cudaCheck(cudaMemcpyAsync(digis_clusters_h.error.get(), error_d.get(), vsize, cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); + // If we want to transfer only the minimal amount of data, we + // need a synchronization point. 
A single ExternalWork (of + // SiPixelRawToClusterHeterogeneous) does not help because it is + // already used to synchronize the data movement. So we'd need + // two ExternalWorks (or explicit use of TBB tasks). The + // prototype of #100 would allow this easily (as there would be + // two ExternalWorks). + // + //cudaCheck(cudaStreamSynchronize(stream.id())); + //int size = digis_clusters_h.error->size(); + //cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), size*esize, cudaMemcpyDefault, stream.id())); + } } } // End of Raw2Digi and passing data for cluserisation @@ -677,15 +613,16 @@ namespace pixelgpudetails { int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; gpuCalibPixel::calibDigis<<>>( - moduleInd_d, - xx_d, yy_d, adc_d, + digis_d.moduleInd(), + digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(), gains, wordCounter); cudaCheck(cudaGetLastError()); // calibrated adc if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); + digis_clusters_h.adc = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.adc.get(), digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); } #ifdef GPU_DEBUG @@ -694,13 +631,13 @@ namespace pixelgpudetails { << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id())); - countModules<<>>(moduleInd_d, moduleStart_d, clus_d, wordCounter); + countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync(nModulesActive, moduleStart_d, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); threadsPerBlock = 256; blocks = MaxNumModules; @@ -708,23 +645,23 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.clusInModule(), 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( - moduleInd_d, - xx_d, yy_d, - moduleStart_d, - clusInModule_d, moduleId_d, - clus_d, + digis_d.c_moduleInd(), + digis_d.c_xx(), digis_d.c_yy(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), clusters_d.moduleId(), + clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // apply charge cut clusterChargeCut<<>>( - moduleInd_d, - adc_d, - moduleStart_d, - clusInModule_d, moduleId_d, - clus_d, + digis_d.moduleInd(), + digis_d.c_adc(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), clusters_d.c_moduleId(), + clusters_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -735,19 +672,27 @@ namespace pixelgpudetails { // available in the rechit producer without additional points of // synchronization/ExternalWork // + // Temporary storage + size_t tempScanStorageSize = 0; + { + uint32_t *tmp = nullptr; + cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules)); + } + auto tempScanStorage_d = cs->make_device_unique(tempScanStorageSize, stream); // Set 
first the first element to 0 - cudaCheck(cudaMemsetAsync(clusModuleStart_d, 0, sizeof(uint32_t), stream.id())); + cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id())); // Then use inclusive_scan to get the partial sum to the rest - cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d, tempScanStorageSize, - clusInModule_d, &clusModuleStart_d[1], gpuClustering::MaxNumModules, + cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d.get(), tempScanStorageSize, + clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, stream.id())); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(nClusters, clusModuleStart_d+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); // clusters if(transferToCPU) { - cudaCheck(cudaMemcpyAsync(clus_h, clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + digis_clusters_h.clus = cs->make_host_unique(MAX_FED_WORDS, stream); + cudaCheck(cudaMemcpyAsync(digis_clusters_h.clus.get(), clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); } } // end clusterizer scope } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index ca8bd73106c2c..a2d9cdda92573 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -5,6 +5,7 @@ #include #include "cuda/api_wrappers.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "siPixelRawToClusterHeterogeneousProduct.h" @@ -159,8 +160,43 @@ namespace pixelgpudetails { using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; - SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream); - ~SiPixelRawToClusterGPUKernel(); + struct CPUData { + CPUData() = default; + ~CPUData() = default; + + CPUData(const CPUData&) = delete; + CPUData& operator=(const CPUData&) = delete; + CPUData(CPUData&&) = default; + CPUData& operator=(CPUData&&) = default; + + edm::cuda::host::unique_ptr nModules_Clusters; // These should really be part of the GPU product + + edm::cuda::host::unique_ptr data; + edm::cuda::host::unique_ptr> error; + + edm::cuda::host::unique_ptr pdigi; + edm::cuda::host::unique_ptr rawIdArr; + edm::cuda::host::unique_ptr adc; + edm::cuda::host::unique_ptr clus; + }; + + class WordFedAppender { + public: + WordFedAppender(cuda::stream_t<>& cudaStream); + ~WordFedAppender() = default; + + void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); + + const unsigned int *word() const { return word_.get(); } + const unsigned char *fedId() const { return fedId_.get(); } + + private: + edm::cuda::host::unique_ptr word_; + edm::cuda::host::unique_ptr fedId_; + }; + + SiPixelRawToClusterGPUKernel() = default; + ~SiPixelRawToClusterGPUKernel() = default; SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete; @@ -168,69 +204,35 @@ namespace pixelgpudetails { SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = 
delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; - void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); - void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, + const WordFedAppender& wordFed, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug, cuda::stream_t<>& stream); - auto getProduct() { - error_h->set_data(data_h); - return siPixelRawToClusterHeterogeneousProduct::GPUProduct{ - pdigi_h, rawIdArr_h, clus_h, adc_h, error_h, - gpuProduct_d, - xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d, - clusModuleStart_d, - nDigis, *nModulesActive, *nClusters - }; + siPixelRawToClusterHeterogeneousProduct::GPUProduct getProduct() { + return siPixelRawToClusterHeterogeneousProduct::GPUProduct( + std::move(digis_d), std::move(clusters_d), + nDigis, + digis_clusters_h.nModules_Clusters[0], + digis_clusters_h.nModules_Clusters[1] + ); } - private: - // input - unsigned int *word = nullptr; // to hold input for rawtodigi - unsigned char *fedId_h = nullptr; // to hold fed index for each word - - // output - GPUProduct gpuProduct; - GPUProduct * gpuProduct_d; - - // FIXME cleanup all these are in the gpuProduct above... - - uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output - uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output - pixelgpudetails::error_obj *data_h = nullptr; - GPU::SimpleVector *error_h = nullptr; - GPU::SimpleVector *error_h_tmp = nullptr; + CPUData&& getCPUData() { + return std::move(digis_clusters_h); + } + private: uint32_t nDigis = 0; - uint32_t *nModulesActive = nullptr; - uint32_t *nClusters = nullptr; - - // scratch memory buffers - uint32_t * word_d; - uint8_t * fedId_d; - uint32_t * pdigi_d; - uint16_t * xx_d; - uint16_t * yy_d; - uint16_t * adc_d; - uint16_t * moduleInd_d; - uint32_t * rawIdArr_d; - GPU::SimpleVector * error_d; - error_obj * data_d; - - // these are for the clusterizer (to be moved) - uint32_t * moduleStart_d; - int32_t * clus_d; - uint32_t * clusInModule_d; - uint32_t * moduleId_d; + // CPU data + CPUData digis_clusters_h; - // originally in rechit, moved here - uint32_t *clusModuleStart_d = nullptr; - void *tempScanStorage_d = nullptr; - size_t tempScanStorageSize = 0; + // Data to be put in the event + SiPixelDigisCUDA digis_d; + SiPixelClustersCUDA clusters_d; }; // configuration and memory buffers alocated on the GPU diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 384a4732b32e1..0c8139b8cba49 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -110,10 +110,9 @@ class SiPixelRawToClusterHeterogeneous: public HeterogeneousEDProducer& cudaStream) override; void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void convertGPUtoCPU(edm::Event& ev, const GPUProduct& gpu) const; + void convertGPUtoCPU(edm::Event& ev, unsigned int nDigis, 
pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData) const; // Commonalities const FEDRawDataCollection *initialize(const edm::Event& ev, const edm::EventSetup& es); @@ -149,8 +148,7 @@ std::unique_ptr regions_; SiPixelGainCalibrationForHLTService theSiPixelGainCalibration_; // GPU algo - std::unique_ptr gpuAlgo_; - std::unique_ptr gpuModulesToUnpack_; + pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; PixelDataFormatter::Errors errors_; bool enableTransfer_; @@ -455,22 +453,17 @@ void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, c } // ----------------------------------------------------------------------------- -void SiPixelRawToClusterHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) { - // Allocate GPU resources here - gpuAlgo_ = std::make_unique(cudaStream); - gpuModulesToUnpack_ = std::make_unique(); -} - void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { const auto buffers = initialize(ev.event(), es); + auto gpuModulesToUnpack = SiPixelFedCablingMapGPUWrapper::ModulesToUnpack(cudaStream); if (regions_) { std::set modules = *(regions_->modulesToUnpack()); - gpuModulesToUnpack_->fillAsync(*cablingMap_, modules, cudaStream); + gpuModulesToUnpack.fillAsync(*cablingMap_, modules, cudaStream); } else if(recordWatcherUpdatedSinceLastTransfer_) { // If regions_ are disabled, it is enough to fill and transfer only if cablingMap has changed - gpuModulesToUnpack_->fillAsync(*cablingMap_, std::set(), cudaStream); + gpuModulesToUnpack.fillAsync(*cablingMap_, std::set(), cudaStream); recordWatcherUpdatedSinceLastTransfer_ = false; } @@ -494,6 +487,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() ErrorChecker errorcheck; + auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(cudaStream); for (auto aFed = fedIds.begin(); aFed != fedIds.end(); ++aFed) { int fedId = *aFed; @@ -543,29 +537,30 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); assert(0 == (ew-bw)%2); - gpuAlgo_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); + wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); wordCounterGPU+=(ew-bw); } // end of for loop - gpuAlgo_->makeClustersAsync(gpuMap, gpuModulesToUnpack_->get(), hgains->getGPUProductAsync(cudaStream), - wordCounterGPU, fedCounter, convertADCtoElectrons, - useQuality, includeErrors, enableTransfer_, debug, cudaStream); + gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack.get(), hgains->getGPUProductAsync(cudaStream), + wordFedAppender, + wordCounterGPU, fedCounter, convertADCtoElectrons, + useQuality, includeErrors, enableTransfer_, debug, cudaStream); } void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { - auto output = std::make_unique(gpuAlgo_->getProduct()); - assert(output->me_d); + auto output = std::make_unique(gpuAlgo_.getProduct()); if(enableConversion_) { - convertGPUtoCPU(ev.event(), *output); + convertGPUtoCPU(ev.event(), output->nDigis, gpuAlgo_.getCPUData()); } ev.put(std::move(output), heterogeneous::DisableTransfer{}); } void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, - const 
SiPixelRawToClusterHeterogeneous::GPUProduct& gpu) const { + unsigned int nDigis, + pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData digis_clusters_h) const { // TODO: add the transfers here as well? auto collection = std::make_unique>(); @@ -576,9 +571,9 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto outputClusters = std::make_unique(); edm::DetSet * detDigis=nullptr; - for (uint32_t i = 0; i < gpu.nDigis; i++) { - if (gpu.pdigi_h[i]==0) continue; - detDigis = &collection->find_or_insert(gpu.rawIdArr_h[i]); + for (uint32_t i = 0; i < nDigis; i++) { + if (digis_clusters_h.pdigi[i]==0) continue; + detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations break; } @@ -609,29 +604,29 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, if ( spc.empty() ) spc.abort(); }; - for (uint32_t i = 0; i < gpu.nDigis; i++) { - if (gpu.pdigi_h[i]==0) continue; - if (gpu.clus_h[i]>9000) continue; // not in cluster - assert(gpu.rawIdArr_h[i] > 109999); - if ( (*detDigis).detId() != gpu.rawIdArr_h[i]) + for (uint32_t i = 0; i < nDigis; i++) { + if (digis_clusters_h.pdigi[i]==0) continue; + if (digis_clusters_h.clus[i]>9000) continue; // not in cluster + assert(digis_clusters_h.rawIdArr[i] > 109999); + if ( (*detDigis).detId() != digis_clusters_h.rawIdArr[i]) { fillClusters((*detDigis).detId()); assert(nclus==-1); - detDigis = &collection->find_or_insert(gpu.rawIdArr_h[i]); + detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations else { std::cout << "Problem det present twice in input! " << (*detDigis).detId() << std::endl; } } - (*detDigis).data.emplace_back(gpu.pdigi_h[i]); + (*detDigis).data.emplace_back(digis_clusters_h.pdigi[i]); auto const & dig = (*detDigis).data.back(); // fill clusters - assert(gpu.clus_h[i]>=0); - assert(gpu.clus_h[i]<1024); - nclus = std::max(gpu.clus_h[i],nclus); + assert(digis_clusters_h.clus[i]>=0); + assert(digis_clusters_h.clus[i]<1024); + nclus = std::max(digis_clusters_h.clus[i],nclus); auto row = dig.row(); auto col = dig.column(); SiPixelCluster::PixelPos pix(row,col); - aclusters[gpu.clus_h[i]].add(pix,gpu.adc_h[i]); + aclusters[digis_clusters_h.clus[i]].add(pix, digis_clusters_h.adc[i]); } // fill final clusters @@ -642,9 +637,9 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto errors = errors_; // make a copy PixelDataFormatter::DetErrors nodeterrors; - auto size = gpu.error_h->size(); + auto size = digis_clusters_h.error->size(); for (auto i = 0; i < size; i++) { - pixelgpudetails::error_obj err = (*gpu.error_h)[i]; + pixelgpudetails::error_obj err = (*digis_clusters_h.error)[i]; if (err.errorType != 0) { SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); errors[err.rawId].push_back(error); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h index e75afd3481b25..0cab299582410 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h @@ -1,6 +1,8 @@ #ifndef EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h #define 
EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" @@ -25,27 +27,21 @@ namespace siPixelRawToClusterHeterogeneousProduct { // FIXME split in two struct GPUProduct { - // Needed for digi and cluster CPU output - uint32_t const * pdigi_h = nullptr; - uint32_t const * rawIdArr_h = nullptr; - int32_t const * clus_h = nullptr; - uint16_t const * adc_h = nullptr; - GPU::SimpleVector const * error_h = nullptr; - - GPUProduct const * me_d = nullptr; - - // Needed for GPU rechits - uint16_t const * xx_d; - uint16_t const * yy_d; - uint16_t const * adc_d; - uint16_t const * moduleInd_d; - uint32_t const * moduleStart_d; - int32_t const * clus_d; - uint32_t const * clusInModule_d; - uint32_t const * moduleId_d; - - // originally from rechits - uint32_t const * clusModuleStart_d; + GPUProduct() = default; + GPUProduct(const GPUProduct&) = delete; + GPUProduct& operator=(const GPUProduct&) = delete; + GPUProduct(GPUProduct&&) = default; + GPUProduct& operator=(GPUProduct&&) = default; + + GPUProduct(SiPixelDigisCUDA&& digis, + SiPixelClustersCUDA&& clusters, + uint32_t ndig, uint32_t nmod, uint32_t nclus): + digis_d(std::move(digis)), clusters_d(std::move(clusters)), + nDigis(ndig), nModules(nmod), nClusters(nclus) + {} + + SiPixelDigisCUDA digis_d; + SiPixelClustersCUDA clusters_d; uint32_t nDigis; uint32_t nModules; diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index df2f488fe488c..1f872f3cb7464 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -14,6 +14,7 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "Geometry/CommonDetUnit/interface/GeomDetType.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" @@ -123,10 +124,11 @@ const SiPixelFedCablingMapGPU *SiPixelFedCablingMapGPUWrapper::getGPUProductAsyn return data.cablingMapDevice; } -SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::ModulesToUnpack(): - modToUnpDevice(cuda::memory::device::make_unique(cuda::device::current::get().id(), pixelgpudetails::MAX_SIZE)), - modToUnpHost(pixelgpudetails::MAX_SIZE) +SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::ModulesToUnpack(cuda::stream_t<>& cudaStream) { + edm::Service cs; + modToUnpDevice = cs->make_device_unique(pixelgpudetails::MAX_SIZE, cudaStream); + modToUnpHost = cs->make_host_unique(pixelgpudetails::MAX_SIZE, cudaStream); } void SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::fillAsync(SiPixelFedCablingMap const& cablingMap, std::set const& modules, cuda::stream_t<>& cudaStream) { @@ -154,7 +156,7 @@ void SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::fillAsync(SiPixelFedCablin } } - cuda::memory::async::copy(modToUnpDevice.get(), modToUnpHost.data(), modToUnpHost.size() * sizeof(unsigned char), cudaStream.id()); + 
cuda::memory::async::copy(modToUnpDevice.get(), modToUnpHost.get(), pixelgpudetails::MAX_SIZE * sizeof(unsigned char), cudaStream.id()); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index c63466f157a1b..333898eaa601a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -127,10 +127,10 @@ namespace pixelgpudetails { pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, cuda::stream_t<>& stream) { - cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); - gpu_.hitsModuleStart_d = input.clusModuleStart_d; - gpu_.cpeParams = cpeParams; // copy it for use in clients - cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); + gpu_.hitsModuleStart_d = input.clusters_d.clusModuleStart(); + gpu_.cpeParams = cpeParams; // copy it for use in clients + cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); int threadsPerBlock = 256; int blocks = input.nModules; // active modules (with digis) @@ -141,11 +141,11 @@ namespace pixelgpudetails { gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, - input.moduleInd_d, - input.xx_d, input.yy_d, input.adc_d, - input.moduleStart_d, - input.clusInModule_d, input.moduleId_d, - input.clus_d, + input.digis_d.moduleInd(), + input.digis_d.xx(), input.digis_d.yy(), input.digis_d.adc(), + input.clusters_d.moduleStart(), + input.clusters_d.clusInModule(), input.clusters_d.moduleId(), + input.clusters_d.clus(), input.nDigis, gpu_.hitsModuleStart_d, gpu_.charge_d, diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu index 83a7e63e628a0..b402daef07a05 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -3,6 +3,8 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -12,14 +14,13 @@ using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; __global__ -void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) +void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const SiPixelClustersCUDA::DeviceConstView *cc, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) { assert(slp == slp->me_d); constexpr int32_t invTK = 0; // std::numeric_limits::max(); constexpr uint16_t InvId = 9999; // must be > MaxNumModules - auto const & dd = *ddp; auto const & hh = *hhp; auto const & sl = *slp; auto i = blockIdx.x * blockDim.x + threadIdx.x; @@ -27,14 +28,14 @@ void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLO if (i >= ndigis) return; - auto id = dd.moduleInd_d[i]; + auto id = dd->moduleInd(i); if (InvId == id) return; assert(id < 2000); - auto ch = 
pixelgpudetails::pixelToChannel(dd.xx_d[i], dd.yy_d[i]); + auto ch = pixelgpudetails::pixelToChannel(dd->xx(i), dd->yy(i)); auto first = hh.hitsModuleStart_d[id]; - auto cl = first + dd.clus_d[i]; + auto cl = first + cc->clus(i); assert(cl < 2000 * blockDim.x); const std::array me{{id, ch, 0, 0}}; @@ -176,7 +177,7 @@ namespace clusterSLOnGPU { blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock; assert(sl.me_d); - simLink<<>>(dd.me_d, ndigis, hh.gpu_d, sl.me_d, n); + simLink<<>>(dd.digis_d.view(), ndigis, dd.clusters_d.view(), hh.gpu_d, sl.me_d, n); cudaCheck(cudaGetLastError()); if (doDump) {
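The cluster bookkeeping in makeClustersAsync uses the standard two-phase CUB pattern: cub::DeviceScan::InclusiveSum is first called with a null temporary-storage pointer purely to learn how many scratch bytes it needs, that much device memory is allocated, and then the scan is run for real. Because clusModuleStart[0] is zeroed and the inclusive sum of clusInModule is written into elements [1..MaxNumModules], the per-module cluster counts become per-module offsets and the last element ends up holding the total number of clusters; for example counts {3, 0, 5, 2} yield offsets {0, 3, 3, 8, 10}. A standalone sketch of the same pattern (array contents, names and sizes are illustrative):

#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int kModules = 4;
  const uint32_t countsHost[kModules] = {3, 0, 5, 2};      // clusters found per module
  uint32_t *counts = nullptr, *offsets = nullptr;          // offsets has kModules+1 entries
  cudaMalloc((void**)&counts, kModules * sizeof(uint32_t));
  cudaMalloc((void**)&offsets, (kModules + 1) * sizeof(uint32_t));
  cudaMemcpy(counts, countsHost, sizeof(countsHost), cudaMemcpyHostToDevice);
  cudaMemset(offsets, 0, sizeof(uint32_t));                // offsets[0] = 0, like clusModuleStart

  // Phase 1: query how much temporary storage the scan needs.
  void *tmp = nullptr;
  size_t tmpBytes = 0;
  cub::DeviceScan::InclusiveSum(nullptr, tmpBytes, counts, offsets + 1, kModules);
  cudaMalloc(&tmp, tmpBytes);
  // Phase 2: run the scan into offsets[1..kModules].
  cub::DeviceScan::InclusiveSum(tmp, tmpBytes, counts, offsets + 1, kModules);

  uint32_t result[kModules + 1];
  cudaMemcpy(result, offsets, sizeof(result), cudaMemcpyDeviceToHost);
  printf("offsets: %u %u %u %u %u\n", result[0], result[1], result[2], result[3], result[4]);
  // Expected: 0 3 3 8 10 -- the last entry is the total number of clusters.

  cudaFree(tmp); cudaFree(offsets); cudaFree(counts);
  return 0;
}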
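The question raised in makeClustersAsync about whether make_host_unique() should also run constructors points at a recurring pattern in this patch: the allocator returns raw, uninitialized pinned memory, so objects such as the GPU::SimpleVector error container are built in place with placement new and then copied byte-wise to the device with cudaMemcpyAsync. A reduced sketch of that pattern with a hypothetical, trivially copyable VecHeader standing in for the real class (neither the CMSSW allocator nor GPU::SimpleVector is reproduced here):

#include <cuda_runtime.h>
#include <cstdint>
#include <new>

struct VecHeader {
  int size;
  int capacity;
  uint32_t *data;                   // device pointer owned elsewhere
  VecHeader(int cap, uint32_t *d) : size(0), capacity(cap), data(d) {}
};

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  uint32_t *payload_d = nullptr;
  cudaMalloc((void**)&payload_d, 1024 * sizeof(uint32_t));

  // Raw pinned bytes, as a caching allocator would return them: no constructor has run yet.
  void *raw_h = nullptr;
  cudaMallocHost(&raw_h, sizeof(VecHeader));
  VecHeader *header_h = new (raw_h) VecHeader(1024, payload_d);  // placement new does the construction

  // Ship the header to the device; kernels can then fill and read the vector through it.
  VecHeader *header_d = nullptr;
  cudaMalloc((void**)&header_d, sizeof(VecHeader));
  cudaMemcpyAsync(header_d, header_h, sizeof(VecHeader), cudaMemcpyDefault, stream);
  cudaStreamSynchronize(stream);

  header_h->~VecHeader();           // placement-new'd objects are destroyed explicitly
  cudaFreeHost(raw_h);
  cudaFree(header_d);
  cudaFree(payload_d);
  cudaStreamDestroy(stream);
  return 0;
}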