diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml
new file mode 100644
index 0000000000000..e7a5ba74d80be
--- /dev/null
+++ b/CUDADataFormats/Common/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/Common/interface/Product.h b/CUDADataFormats/Common/interface/Product.h
new file mode 100644
index 0000000000000..41bb8356e67cf
--- /dev/null
+++ b/CUDADataFormats/Common/interface/Product.h
@@ -0,0 +1,60 @@
+#ifndef CUDADataFormats_Common_Product_h
+#define CUDADataFormats_Common_Product_h
+
+#include <memory>
+
+#include "CUDADataFormats/Common/interface/ProductBase.h"
+
+namespace edm {
+ template <typename T>
+ class Wrapper;
+}
+
+namespace cms {
+ namespace cuda {
+ namespace impl {
+ class ScopedContextGetterBase;
+ }
+
+ /**
+ * The purpose of this class is to wrap CUDA data to edm::Event in a
+ * way which forces correct use of various utilities.
+ *
+ * The non-default construction has to be done with cms::cuda::ScopedContext
+ * (in order to properly register the CUDA event).
+ *
+ * The default constructor is needed only for the ROOT dictionary generation.
+ *
+ * The CUDA event is in practice needed only for stream-stream
+ * synchronization, but someone with long-enough lifetime has to own
+ * it. Here is a somewhat natural place. If overhead is too much, we
+ * can use them only where synchronization between streams is needed.
+ */
+ template <typename T>
+ class Product : public ProductBase {
+ public:
+ Product() = default; // Needed only for ROOT dictionary generation
+
+ Product(const Product&) = delete;
+ Product& operator=(const Product&) = delete;
+ Product(Product&&) = default;
+ Product& operator=(Product&&) = default;
+
+ private:
+ friend class impl::ScopedContextGetterBase;
+ friend class ScopedContextProduce;
+ friend class edm::Wrapper<Product<T>>;
+
+ explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data)
+ : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}
+
+ template <typename... Args>
+ explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args)
+ : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
+
+ T data_; //!
+ };
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/CUDADataFormats/Common/interface/ProductBase.h b/CUDADataFormats/Common/interface/ProductBase.h
new file mode 100644
index 0000000000000..efe2242903bd0
--- /dev/null
+++ b/CUDADataFormats/Common/interface/ProductBase.h
@@ -0,0 +1,93 @@
+#ifndef CUDADataFormats_Common_ProductBase_h
+#define CUDADataFormats_Common_ProductBase_h
+
+#include <atomic>
+#include <memory>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
+
+namespace cms {
+ namespace cuda {
+ namespace impl {
+ class ScopedContextBase;
+ }
+
+ /**
+ * Base class for all instantiations of Product<T> to hold the
+ * non-T-dependent members.
+ */
+ class ProductBase {
+ public:
+ ProductBase() = default; // Needed only for ROOT dictionary generation
+ ~ProductBase();
+
+ ProductBase(const ProductBase&) = delete;
+ ProductBase& operator=(const ProductBase&) = delete;
+ ProductBase(ProductBase&& other)
+ : stream_{std::move(other.stream_)},
+ event_{std::move(other.event_)},
+ mayReuseStream_{other.mayReuseStream_.load()},
+ device_{other.device_} {}
+ ProductBase& operator=(ProductBase&& other) {
+ stream_ = std::move(other.stream_);
+ event_ = std::move(other.event_);
+ mayReuseStream_ = other.mayReuseStream_.load();
+ device_ = other.device_;
+ return *this;
+ }
+
+ bool isValid() const { return stream_.get() != nullptr; }
+ bool isAvailable() const;
+
+ int device() const { return device_; }
+
+ // cudaStream_t is a pointer to a thread-safe object, for which a
+ // mutable access is needed even if the cms::cuda::ScopedContext itself
+ // would be const. Therefore it is ok to return a non-const
+ // pointer from a const method here.
+ cudaStream_t stream() const { return stream_.get(); }
+
+ // cudaEvent_t is a pointer to a thread-safe object, for which a
+ // mutable access is needed even if the cms::cuda::ScopedContext itself
+ // would be const. Therefore it is ok to return a non-const
+ // pointer from a const method here.
+ cudaEvent_t event() const { return event_.get(); }
+
+ protected:
+ explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event)
+ : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}
+
+ private:
+ friend class impl::ScopedContextBase;
+ friend class ScopedContextProduce;
+
+ // The following function is intended to be used only from ScopedContext
+ const SharedStreamPtr& streamPtr() const { return stream_; }
+
+ bool mayReuseStream() const {
+ bool expected = true;
+ bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
+ // If the current thread is the one flipping the flag, it may
+ // reuse the stream.
+ return changed;
+ }
+
+ // The cudaStream_t is really shared among edm::Event products, so
+ // using shared_ptr also here
+ SharedStreamPtr stream_; //!
+ // shared_ptr because of caching in cms::cuda::EventCache
+ SharedEventPtr event_; //!
+
+ // This flag tells whether the CUDA stream may be reused by a
+ // consumer or not. The goal is to have a "chain" of modules to
+ // queue their work to the same stream.
+ mutable std::atomic<bool> mayReuseStream_ = true; //!
+
+ // The CUDA device associated with this product
+ int device_ = -1; //!
+ };
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/CUDADataFormats/Common/src/ProductBase.cc b/CUDADataFormats/Common/src/ProductBase.cc
new file mode 100644
index 0000000000000..8e1cf64b17122
--- /dev/null
+++ b/CUDADataFormats/Common/src/ProductBase.cc
@@ -0,0 +1,29 @@
+#include "CUDADataFormats/Common/interface/ProductBase.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
+
+namespace cms::cuda {
+ bool ProductBase::isAvailable() const {
+ // if default-constructed, the product is not available
+ if (not event_) {
+ return false;
+ }
+ return eventWorkHasCompleted(event_.get());
+ }
+
+ ProductBase::~ProductBase() {
+ // Make sure that the production of the product in the GPU is
+ // complete before destructing the product. This is to make sure
+ // that the EDM stream does not move to the next event before all
+ // asynchronous processing of the current is complete.
+
+ // TODO: a callback notifying a WaitingTaskHolder (or similar)
+ // would avoid blocking the CPU, but would also require more work.
+ //
+ // Intentionally not checking the return value to avoid throwing
+ // exceptions. If this call would fail, we should get failures
+ // elsewhere as well.
+ if (event_) {
+ cudaEventSynchronize(event_.get());
+ }
+ }
+} // namespace cms::cuda
diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml
new file mode 100644
index 0000000000000..30b84c9ce2f22
--- /dev/null
+++ b/CUDADataFormats/Common/test/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/Common/test/test_Product.cc b/CUDADataFormats/Common/test/test_Product.cc
new file mode 100644
index 0000000000000..d70ff3e63e903
--- /dev/null
+++ b/CUDADataFormats/Common/test/test_Product.cc
@@ -0,0 +1,68 @@
+#include "catch.hpp"
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+
+#include <cuda_runtime_api.h>
+
+namespace cms::cudatest {
+ class TestScopedContext {
+ public:
+ static cuda::ScopedContextProduce make(int dev, bool createEvent) {
+ cms::cuda::SharedEventPtr event;
+ if (createEvent) {
+ event = cms::cuda::getEventCache().get();
+ }
+ return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
+ }
+ };
+} // namespace cms::cudatest
+
+TEST_CASE("Use of cms::cuda::Product template", "[CUDACore]") {
+ SECTION("Default constructed") {
+ auto foo = cms::cuda::Product<int>();
+ REQUIRE(!foo.isValid());
+
+ auto bar = std::move(foo);
+ }
+
+ if (not cms::cudatest::testDevices()) {
+ return;
+ }
+
+ constexpr int defaultDevice = 0;
+ cudaCheck(cudaSetDevice(defaultDevice));
+ {
+ auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
+ std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
+ auto& data = *dataPtr;
+
+ SECTION("Construct from cms::cuda::ScopedContext") {
+ REQUIRE(data.isValid());
+ REQUIRE(data.device() == defaultDevice);
+ REQUIRE(data.stream() == ctx.stream());
+ REQUIRE(data.event() != nullptr);
+ }
+
+ SECTION("Move constructor") {
+ auto data2 = cms::cuda::Product<int>(std::move(data));
+ REQUIRE(data2.isValid());
+ REQUIRE(!data.isValid());
+ }
+
+ SECTION("Move assignment") {
+ cms::cuda::Product<int> data2;
+ data2 = std::move(data);
+ REQUIRE(data2.isValid());
+ REQUIRE(!data.isValid());
+ }
+ }
+
+ cudaCheck(cudaSetDevice(defaultDevice));
+ cudaCheck(cudaDeviceSynchronize());
+ // Note: CUDA resources are cleaned up by the destructors of the global cache objects
+}
diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc
new file mode 100644
index 0000000000000..0c7c351f437f5
--- /dev/null
+++ b/CUDADataFormats/Common/test/test_main.cc
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
diff --git a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h
index efc7d9e6cde0b..ccd99247e501e 100644
--- a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h
+++ b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h
@@ -25,7 +25,6 @@
#include "tbb/task_arena.h"
namespace edm {
-
class WaitingTask;
class WaitingTaskHolder;
@@ -72,5 +71,29 @@ namespace edm {
WaitingTask* m_task;
std::shared_ptr<tbb::task_arena> m_arena;
};
+
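+ // Wraps the user functor so that any exception thrown while it runs is
+ // reported to the WaitingTaskWithArenaHolder instead of escaping.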
+ template <typename F>
+ auto make_lambda_with_holder(WaitingTaskWithArenaHolder h, F&& f) {
+ return [holder = std::move(h), func = std::forward<F>(f)]() mutable {
+ try {
+ func(holder);
+ } catch (...) {
+ holder.doneWaiting(std::current_exception());
+ }
+ };
+ }
+
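+ // Creates a WaitingTask that forwards an incoming exception to the holder,
+ // and otherwise runs the user functor (with the holder) via make_lambda_with_holder().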
+ template <typename ALLOC, typename F>
+ auto make_waiting_task_with_holder(ALLOC&& iAlloc, WaitingTaskWithArenaHolder h, F&& f) {
+ return make_waiting_task(
+ std::forward<ALLOC>(iAlloc),
+ [holder = h, func = make_lambda_with_holder(h, std::forward(f))](std::exception_ptr const* excptr) mutable {
+ if (excptr) {
+ holder.doneWaiting(*excptr);
+ return;
+ }
+ func();
+ });
+ }
} // namespace edm
#endif
diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml
new file mode 100644
index 0000000000000..a10567db16edb
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/BuildFile.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md
new file mode 100644
index 0000000000000..75d9cf4c227a7
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/README.md
@@ -0,0 +1,1003 @@
+# CUDA algorithms in CMSSW
+
+## Outline
+
+* [Introduction](#introduction)
+ * [Design goals](#design-goals)
+ * [Overall guidelines](#overall-guidelines)
+* [Sub-packages](#sub-packages)
+* [Examples](#examples)
+ * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output)
+ * [Producer with CUDA output](#producer-with-cuda-output)
+ * [Producer with CUDA input](#producer-with-cuda-input)
+ * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork)
+ * [Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)](#producer-with-cuda-input-and-output-and-internal-chain-of-cpu-and-gpu-tasks-with-externalwork)
+ * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork)
+ * [Analyzer with CUDA input](#analyzer-with-cuda-input)
+ * [Configuration](#configuration)
+ * [GPU-only configuration](#gpu-only-configuration)
+ * [Automatic switching between CPU and GPU modules](#automatic-switching-between-cpu-and-gpu-modules)
+* [More details](#more-details)
+ * [Device choice](#device-choice)
+ * [Data model](#data-model)
+ * [CUDA EDProducer](#cuda-edproducer)
+ * [Class declaration](#class-declaration)
+ * [Memory allocation](#memory-allocation)
+ * [Caching allocator](#caching-allocator)
+ * [Non-cached pinned host `unique_ptr`](#non-cached-pinned-host-unique_ptr)
+ * [CUDA API](#cuda-api)
+ * [Setting the current device](#setting-the-current-device)
+ * [Getting input](#getting-input)
+ * [Calling the CUDA kernels](#calling-the-cuda-kernels)
+ * [Putting output](#putting-output)
+ * [`ExternalWork` extension](#externalwork-extension)
+ * [Module-internal chain of CPU and GPU tasks](#module-internal-chain-of-cpu-and-gpu-tasks)
+ * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu)
+ * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams)
+ * [CUDA ESProduct](#cuda-esproduct)
+
+## Introduction
+
+This page documents the CUDA integration within CMSSW.
+
+### Design goals
+
+1. Provide a mechanism for a chain of modules to share a resource
+ * Resource can be e.g. CUDA device memory or a CUDA stream
+2. Minimize data movements between the CPU and the device
+3. Support multiple devices
+4. Allow the same job configuration to be used on all hardware combinations
+
+### Overall guidelines
+
+1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e.
+ * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc.
+ * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event
+ * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator))
+ * Avoid `assert()` in device functions, or use `#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"`
+ * With the latter the `assert()` calls in CUDA code are disabled by
+ default, but can be enabled by defining a `GPU_DEBUG` macro
+ (before the aforementioned include); see the sketch after this list
+2. Synchronization needs should be fulfilled with
+ [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork)
+ extension to EDProducers
+ * `ExternalWork` can be used to replace one synchronization point
+ (e.g. between device kernels and copying a known amount of data
+ back to CPU).
+ * For further synchronization points (e.g. copying data whose
+ amount is known only at the device side), split the work to
+ multiple `ExternalWork` producers. This approach has the added
+ benefit that e.g. data transfers to CPU become on-demand automatically
+ * A general breakdown of the possible steps:
+ * Convert input legacy CPU data format to CPU SoA
+ * Transfer input CPU SoA to GPU
+ * Launch kernels
+ * Transfer the number of output elements to CPU
+ * Transfer the output data from GPU to CPU SoA
+ * Convert the output SoA to legacy CPU data formats
+3. Within `acquire()`/`produce()`, the current CUDA device is set
+ implicitly and the CUDA stream is provided by the system (with
+ `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`)
+ * It is strongly recommended to use the provided CUDA stream for all operations
+ * If that is not feasible for some reason, the provided CUDA
+ stream must synchronize with the work queued on other CUDA
+ streams (with CUDA events and `cudaStreamWaitEvent()`)
+4. Outside of `acquire()`/`produce()`, CUDA API functions may be
+ called only if `CUDAService::enabled()` returns `true`.
+ * With point 3 it follows that in these cases multiple devices have
+ to be dealt with explicitly, as well as CUDA streams
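+
+As an illustration of the `assert()` guideline above, a device function could
+use `cuda_assert.h` roughly as follows (a minimal sketch; the kernel and its
+arguments are made up for this example):
+
+```cpp
+// Define GPU_DEBUG before the include to enable device-side assert();
+// without it the assert() calls compile to no-ops.
+#define GPU_DEBUG
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+__global__ void checkAllPositive(float const* input, int size) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    assert(input[i] >= 0.f);  // active only when GPU_DEBUG is defined
+  }
+}
+```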
+
+## Sub-packages
+* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components
+* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA
+* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code
+* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations
+* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data
+
+## Examples
+
+### Isolated producer (no CUDA input nor output)
+
+```cpp
+class IsolatedProducerCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+ ...
+private:
+ ...
+ IsolatedProducerGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<InputData> inputToken_;
+ edm::EDPutTokenT<OutputData> outputToken_;
+};
+...
+void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ // Sets the current device and creates a CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+ auto const& inputData = iEvent.get(inputToken_);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+ // Real life is likely more complex than this simple example. Here
+ // getResult() returns some data in CPU memory that is passed
+ // directly to the OutputData constructor.
+ iEvent.emplace(outputToken_, gpuAlgo_.getResult());
+}
+```
+
+### Producer with CUDA output
+
+```cpp
+class ProducerOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+ ...
+private:
+ ...
+ ProducerOutputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<InputData> inputToken_;
+ edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
+ cms::cuda::ContextState ctxState_;
+};
+...
+void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ // Sets the current device and creates a CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
+
+ auto const& inputData = iEvent.get(inputToken_);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished,
+ // and saves the device and CUDA stream to ctxState_
+}
+
+// Called after the asynchronous work has finished
+void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+ // Sets again the current device, uses the CUDA stream created in the acquire()
+ cms::cuda::ScopedContextProduce ctx{ctxState_};
+
+ // Now getResult() returns data in GPU memory that is passed to the
+ // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
+ // OutputData to cms::cuda::Product. cms::cuda::Product stores also
+ // the current device and the CUDA stream since those will be needed
+ // in the consumer side.
+ ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
+
+### Producer with CUDA input
+
+```cpp
+class ProducerInputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+ ...
+private:
+ ...
+ ProducerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDGetTokenT<cms::cuda::Product<OtherInputData>> otherInputToken_;
+ edm::EDPutTokenT<OutputData> outputToken_;
+};
+...
+void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and possibly use the same CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Input data from another producer
+ auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
+ // or
+ auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
+
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, otherInputData, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+ // Real life is likely more complex than this simple example. Here
+ // getResult() returns some data in CPU memory that is passed
+ // directly to the OutputData constructor.
+ iEvent.emplace(outputToken_, gpuAlgo_.getResult());
+}
+```
+
+See [further below](#setting-the-current-device) for the conditions
+when the `cms::cuda::ScopedContextAcquire` constructor reuses the CUDA stream. Note
+that the `cms::cuda::ScopedContextAcquire` constructor taking `edm::StreamID` is
+allowed, it will just always create a new CUDA stream.
+
+
+### Producer with CUDA input and output (with ExternalWork)
+
+```cpp
+class ProducerInputOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override;
+ ...
+private:
+ ...
+ ProducerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
+ cms::cuda::ContextState ctxState_;
+};
+...
+void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and also use the same CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished,
+ // and saves the device and CUDA stream to ctxState_
+}
+
+// Called after the asynchronous work has finished
+void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
+ // Sets again the current device, uses the CUDA stream created in the acquire()
+ cms::cuda::ScopedContextProduce ctx{ctxState_};
+
+ // Now getResult() returns data in GPU memory that is passed to the
+ // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
+ // OutputData to cms::cuda::Product. cms::cuda::Product stores also
+ // the current device and the CUDA stream since those will be needed
+ // in the consumer side.
+ ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
+
+[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc)
+
+
+### Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)
+
+```cpp
+class ProducerInputOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override;
+ ...
+private:
+ void addMoreWork(cms::cuda::ScopedContextTask& ctx);
+
+ ...
+ ProducerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
+ cms::cuda::ContextState ctxState_;
+};
+...
+void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and also use the same CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+ // Push a functor on top of "a stack of tasks" to be run as a next
+ // task after the work queued above before produce(). In this case ctx
+ // is a context constructed by the calling TBB task, and therefore the
+ // current device and CUDA stream have been already set up. The ctx
+ // internally holds the WaitingTaskWithArenaHolder for the next task.
+
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ addMoreWork(ctx);
+ });
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished,
+ // and saves the device and CUDA stream to ctxState_
+}
+
+// Called after the asynchronous work queued in acquire() has finished
+void ProducerInputOutputCUDA::addMoreWork(cms::cuda::ScopedContextTask& ctx) {
+ // Current device and CUDA stream have already been set
+
+ // Queues more asynchronous data transfer and kernels to the CUDA
+ // stream returned by cms::cuda::ScopedContextTask::stream()
+ gpuAlgo_.makeMoreAsync(ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work queued in addMoreWork() has finished
+void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
+ // Sets again the current device, uses the CUDA stream created in the acquire()
+ cms::cuda::ScopedContextProduce ctx{ctxState_};
+
+ // Now getResult() returns data in GPU memory that is passed to the
+ // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
+ // OutputData to cms::cuda::Product. cms::cuda::Product stores also
+ // the current device and the CUDA stream since those will be needed
+ // in the consumer side.
+ ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
+
+[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEWTask.cc)
+
+
+### Producer with CUDA input and output (without ExternalWork)
+
+If the producer does not need to transfer anything back to CPU (like
+the number of output elements), the `ExternalWork` extension is not
+needed as there is no need to synchronize.
+
+```cpp
+class ProducerInputOutputCUDA: public edm::global::EDProducer<> {
+public:
+ ...
+ void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const override;
+ ...
+private:
+ ...
+ ProducerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
+};
+...
+void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and possibly use the same CUDA stream
+ cms::cuda::ScopedContextProduce ctx{inputDataWrapped};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextProduce holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextProduce::stream(). Here makeAsync() also
+ // returns data in GPU memory that is passed to the constructor of
+ // OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the OutputData to
+ // cms::cuda::Product. cms::cuda::Product stores also the current
+ // device and the CUDA stream since those will be needed in the
+ // consumer side.
+ ctx.emplace(iEvent, outputToken_, gpuAlgo_.makeAsync(inputData, ctx.stream()));
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+}
+```
+
+[Complete example](../CUDATest/plugins/TestCUDAProducerGPU.cc)
+
+
+### Analyzer with CUDA input
+
+Analyzer with CUDA input is similar to [producer with CUDA
+input](#producer-with-cuda-input). Note that currently we do not have
+a mechanism for portable configurations with analyzers (like
+[`SwitchProducer`](#automatic-switching-between-cpu-and-gpu-modules)
+for producers). This means that a configuration with a CUDA analyzer
+can only run on a machine with CUDA device(s).
+
+```cpp
+class AnalyzerInputCUDA: public edm::global::EDAnalyzer<> {
+public:
+ ...
+ void analyze(edm::Event const& iEvent, edm::EventSetup const& iSetup) override;
+ ...
+private:
+ ...
+ AnalyzerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDGetTokenT<cms::cuda::Product<OtherInputData>> otherInputToken_;
+};
+...
+void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup const& iSetup) {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and possibly use the same CUDA stream
+ cms::cuda::ScopedContextAnalyze ctx{inputDataWrapped};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextAnalyze holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Input data from another producer
+ auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
+ // or
+ auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
+
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAnalyze::stream()
+ gpuAlgo_.analyzeAsync(inputData, otherInputData, ctx.stream());
+}
+```
+
+[Complete example](../CUDATest/plugins/TestCUDAAnalyzerGPU.cc)
+
+
+### Configuration
+
+#### GPU-only configuration
+
+For a GPU-only configuration there is nothing special to be done, just
+construct the Paths/Sequences/Tasks from the GPU modules.
+
+#### Automatic switching between CPU and GPU modules
+
+The `SwitchProducer` mechanism can be used to switch automatically
+between CPU and GPU modules based on the availability of GPUs on the
+machine where the configuration is done. The framework decides at the
+beginning of the job which of the modules to run for a given module
+label.
+
+The framework requires that the modules in the switch produce the
+same types of output products (the closer the actual results are the
+better, but the framework cannot enforce that). This means that for a
+chain of GPU modules, it is the module that transforms the SoA data
+format back to the legacy data formats (possibly, but not necessarily,
+transferring the SoA data from GPU to CPU) that should be switched
+with the legacy CPU module. The rest of the GPU modules should be
+placed in a `Task`, in which case the framework runs them only if their
+output is needed by another module.
+
+```python
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+process.foo = SwitchProducerCUDA(
+ cpu = cms.EDProducer("FooProducer"), # legacy CPU
+ cuda = cms.EDProducer("FooProducerFromCUDA",
+ src = cms.InputTag("fooCUDA")
+ )
+)
+process.fooCUDA = cms.EDProducer("FooProducerCUDA")
+
+process.fooTaskCUDA = cms.Task(process.fooCUDA)
+process.fooTask = cms.Task(
+ process.foo,
+ process.fooTaskCUDA
+)
+```
+
+For a more complete example, see [here](../CUDATest/test/testCUDASwitch_cfg.py).
+
+
+
+
+
+## More details
+
+### Device choice
+
+As discussed above, with `SwitchProducer` the choice between CPU and
+GPU modules is done at the beginning of the job.
+
+For multi-GPU setup the device is chosen in the first CUDA module in a
+chain of modules by one of the constructors of
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`
+```cpp
+// In ExternalWork acquire()
+cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...};
+
+// In normal produce() (or filter())
+cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
+```
+As the device choice is currently a static assignment of EDM streams to
+devices, the EDM stream ID is needed. The logic will likely evolve in the
+future to be more dynamic, and the device choice will likely have to be
+made for the full event.
+
+### Data model
+
+The "GPU data product" should be a class/struct containing smart
+pointer(s) to device data (see [Memory allocation](#memory-allocation)).
+When putting the data to event, the data is wrapped to
+`cms::cuda::Product` template, which holds
+* the GPU data product
+ * must be moveable, but no other restrictions
+* the current device where the data was produced, and the CUDA stream the data was produced with
+* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams)
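+
+A minimal sketch of a possible GPU data product (the class name and members
+are made up for this example) could look like this:
+
+```cpp
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+class FooGPU {
+public:
+  FooGPU() = default;
+  FooGPU(cms::cuda::device::unique_ptr<float[]> values, int size)
+      : values_(std::move(values)), size_(size) {}
+
+  float const* values() const { return values_.get(); }
+  int size() const { return size_; }
+
+private:
+  cms::cuda::device::unique_ptr<float[]> values_;  // device memory
+  int size_ = 0;
+};
+```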
+
+Note that the `cms::cuda::Product` wrapper can be constructed only with
+`cms::cuda::ScopedContextProduce::wrap()`, and the data `T` can be obtained
+from it only with
+`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`,
+as described further below. When putting the data product directly to
+`edm::Event`, also `cms::cuda::ScopedContextProduce::emplace()` can be used.
+
+The GPU data products that depend on the CUDA runtime should be placed
+under `CUDADataFormats` package, using the same name for sub-package
+that would be used in `DataFormats`. Everything else, e.g. SoA for
+CPU, should go under `DataFormats` as usual.
+
+
+### CUDA EDProducer
+
+#### Class declaration
+
+The CUDA producers are normal EDProducers. The `ExternalWork`
+extension should be used if a synchronization between the GPU and CPU
+is needed, e.g. when transferring data from GPU to CPU.
+
+#### Memory allocation
+
+##### Caching allocator
+
+The memory allocations should be done dynamically with the following functions
+```cpp
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+cms::cuda::device::unique_ptr<float[]> device_buffer = cms::cuda::make_device_unique<float[]>(50, cudaStream);
+cms::cuda::host::unique_ptr<float[]> host_buffer = cms::cuda::make_host_unique<float[]>(50, cudaStream);
+```
+
+in the `acquire()` and `produce()` functions. The same
+`cudaStream_t` object that is used for transfers and kernels
+should be passed to the allocator.
+
+The allocator is based on `cub::CachingDeviceAllocator`. The memory is
+guaranteed to be reserved
+* for the host: up to the destructor of the `unique_ptr`
+* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished
+
+##### Non-cached pinned host `unique_ptr`
+
+In producers transferring data to the GPU one may want to use pinned host
+memory allocated with `cudaHostAllocWriteCombined`. As of now we don't
+want to add the flag dimension to the caching allocator. The CUDA
+API wrapper library does not support allocation flags, so we add our
+own `unique_ptr` for that.
+
+```cpp
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
+
+cms::cuda::host::noncached::unique_ptr<float[]> host_buffer = cms::cuda::make_host_noncached_unique<float[]>(50, flags);
+```
+The `flags` argument is passed directly to `cudaHostAlloc()`.
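+
+For example, a write-combined buffer for host-to-device transfers could be
+allocated along these lines (a sketch illustrating the flag usage):
+
+```cpp
+auto host_buffer = cms::cuda::make_host_noncached_unique<float[]>(50, cudaHostAllocWriteCombined);
+```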
+
+##### CUDA API
+
+The `cudaMalloc()` etc may be used outside of the event loop, but that
+should be limited to only relatively small allocations in order to
+allow as much re-use of device memory as possible.
+
+If really needed, the `cudaMalloc()` etc may be used also within the
+event loop, but then the cost of allocation and implicit
+synchronization should be explicitly amortized e.g. by caching.
+
+#### Setting the current device
+
+A CUDA producer should construct `cms::cuda::ScopedContextAcquire` in
+`acquire()` (`cms::cuda::ScopedContextProduce` in `produce()` if not using
+`ExternalWork`) either with `edm::StreamID`, or with a
+`cms::cuda::Product` read as an input.
+
+```cpp
+// From edm::StreamID
+cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...};
+// or
+cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
+
+
+// From cms::cuda::Product
+cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(srcToken_);
+cms::cuda::ScopedContextAcquire ctx{cclus, ...};
+// or
+cms::cuda::ScopedContextProduce ctx{cclus};
+```
+
+A CUDA analyzer should construct `cms::cuda::ScopedContextAnalyze` with a
+`cms::cuda::Product` read as an input.
+
+```cpp
+cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(srcToken_);
+cms::cuda::ScopedContextAnalyze ctx{cclus};
+```
+
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze` work in the RAII way and do the following
+* Sets the current device for the current scope
+ - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream
+ - If constructed from the `cms::cuda::Product`, uses the same device and possibly the same CUDA stream as was used to produce the `cms::cuda::Product`
+ * The CUDA stream is reused if this producer is the first consumer
+ of the `cms::cuda::Product`, otherwise a new CUDA stream is created.
+ This approach is a simple compromise that automatically expresses the
+ work of parallel producers in different CUDA streams, while at the
+ same time allowing a chain of producers to queue their work to the
+ same CUDA stream.
+* Gives access to the CUDA stream the algorithm should use to queue asynchronous work
+* `cms::cuda::ScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor)
+* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams)
+* Needed to get `cms::cuda::Product` from the event
+ * `cms::cuda::ScopedContextProduce` is needed to put `cms::cuda::Product` to the event
+
+In case of multiple input products, from possibly different CUDA
+streams and/or CUDA devices, this approach gives the developer full
+control over which of them the kernels of the algorithm are run in.
+
+#### Getting input
+
+The real product (`T`) can be obtained from `cms::cuda::Product` only with
+the help of
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`.
+
+```cpp
+// From cms::cuda::Product
+cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(srcToken_);
+GPUClusters const& clus = ctx.get(cclus);
+
+// Directly from Event
+GPUClusters const& clus = ctx.get(iEvent, srcToken_);
+```
+
+This step is needed to
+* check that the data are on the same CUDA device
+ * if not, throw an exception (with unified memory could prefetch instead)
+* if the CUDA streams are different, synchronize between them
+
+#### Calling the CUDA kernels
+
+It is usually best to wrap the CUDA kernel calls in a separate class,
+and then call methods of that class from the EDProducer. The only
+requirement is that the operations are queued to the CUDA stream
+obtained from the
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`.
+
+```cpp
+gpuAlgo.makeClustersAsync(..., ctx.stream());
+```
+
+If necessary, different CUDA streams may be used internally, but they
+should be made to synchronize with the provided CUDA stream with
+CUDA events and `cudaStreamWaitEvent()`.
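+
+A minimal sketch of that synchronization (assuming `internalStream` and
+`event` are managed by the algorithm class itself):
+
+```cpp
+// Queue kernels to the algorithm's internal stream...
+someKernel<<<blocks, threads, 0, internalStream>>>(...);
+// ...record an event on it, and make the provided stream wait for that event
+cudaCheck(cudaEventRecord(event, internalStream));
+cudaCheck(cudaStreamWaitEvent(ctx.stream(), event, 0));
+```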
+
+
+#### Putting output
+
+The GPU data needs to be wrapped into the `cms::cuda::Product<T>` template with
+`cms::cuda::ScopedContextProduce::wrap()` or `cms::cuda::ScopedContextProduce::emplace()`
+
+```cpp
+GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream());
+std::unique_ptr<cms::cuda::Product<GPUClusters>> ret = ctx.wrap(clusters);
+iEvent.put(std::move(ret));
+
+// or with one line
+iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream())));
+
+// or avoid one unique_ptr with emplace
+edm::EDPutTokenT<cms::cuda::Product<GPUClusters>> putToken_ = produces<cms::cuda::Product<GPUClusters>>(); // in constructor
+...
+ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream()));
+```
+
+This step is needed to
+* store the current device and CUDA stream into `cms::cuda::Product`
+* record the CUDA event needed for CUDA stream synchronization
+
+#### `ExternalWork` extension
+
+Everything above works both with and without `ExternalWork`.
+
+Without `ExternalWork` the `EDProducer`s act similar to TBB
+flowgraph's "streaming node". In other words, they just queue more
+asynchronous work to the CUDA stream in their `produce()`.
+
+The `ExternalWork` is needed when one would otherwise call
+`cudaStreamSynchronize()`. For example transferring something to CPU
+needed for downstream DQM, or queueing more asynchronous work. With
+`ExternalWork` an `acquire()` method needs to be implemented that gets
+an `edm::WaitingTaskWithArenaHolder` parameter. The
+`edm::WaitingTaskWithArenaHolder` should then be passed to the
+constructor of `cms::cuda::ScopedContextAcquire` along the following lines:
+
+```cpp
+void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(token_);
+ cms::cuda::ScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well
+ ...
+```
+
+When constructed this way, `cms::cuda::ScopedContextAcquire` registers a
+callback function to the CUDA stream in its destructor to call
+`waitingTaskHolder.doneWaiting()`.
+
+A GPU->GPU producer needs a `cms::cuda::ScopedContext` also in its
+`produce()`. The device and CUDA stream are transferred via
+`cms::cuda::ContextState` member variable:
+
+```cpp
+class FooProducerCUDA ... {
+ ...
+ cms::cuda::ContextState ctxState_;
+};
+
+void FooProducerCUDA::acquire(...) {
+ ...
+ cms::cuda::ScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_};
+ ...
+}
+
+void FooProducerCUDA::produce(...) {
+ ...
+ cms::cuda::ScopedContextProduce ctx{ctxState_};
+}
+```
+
+The `cms::cuda::ScopedContextAcquire` saves its state to the `ctxState_` in
+the destructor, and `cms::cuda::ScopedContextProduce` then restores the
+context.
+
+#### Module-internal chain of CPU and GPU tasks
+
+Technically `ExternalWork` works such that the framework calls
+`acquire()` with an `edm::WaitingTaskWithArenaHolder` that holds an
+`edm::WaitingTask` (which inherits from `tbb::task`) for calling
+`produce()`, with `std::shared_ptr`-like semantics: the task is spawned
+when the reference count hits `0`. It is also possible to create a longer chain
+of such tasks, alternating between CPU and GPU work. This mechanism
+can also be used to re-run (part of) the GPU work.
+
+The "next tasks" to run are essentially structured as a stack, such
+that
+- `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask::pushNextTask()`
+ pushes a new functor on top of the stack
+- Completion of both the asynchronous work and the queueing function
+ pops the top task of the stack and enqueues it (so that TBB
+ eventually runs the task)
+ * Technically the task is made eligible to run when all copies of
+ `edm::WaitingTaskWithArenaHolder` of the acquire() (or "previous"
+ function) have either been destructed or their `doneWaiting()` has
+ been called
+ * The code calling `acquire()` or the functor holds one copy of
+ `edm::WaitingTaskWithArenaHolder` so it is guaranteed that the
+ next function will not run before the earlier one has finished
+
+
+Below is an example how to push a functor on top of the stack of tasks
+to run next (following the example of the previous section)
+```cpp
+void FooProducerCUDA::acquire(...) {
+ ...
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ ...
+ });
+ ...
+}
+```
+
+In this case the `ctx` argument to the function is a
+`cms::cuda::ScopedContextTask` object constructed by the TBB task calling the
+user-given function. It follows that the current device and CUDA
+stream have been set up already. The `pushNextTask()` can be called
+many times. On each invocation the `pushNextTask()` pushes a new task
+on top of the stack (i.e. in front of the chain). It follows that in
+```cpp
+void FooProducerCUDA::acquire(...) {
+ ...
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ ... // function 1
+ });
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ ... // function 2
+ });
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ ... // function 3
+ });
+ ...
+}
+```
+the functions will be run in the order 3, 2, 1.
+
+**Note** that the `CUDAService` is **not** available (nor is any other
+service) in these intermediate tasks. In the near future memory
+allocations etc. will be made possible by taking them out from the
+`CUDAService`.
+
+The `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask` also have a
+more generic member function, `replaceWaitingTaskHolder()`, that can
+be used to replace the currently-held
+`edm::WaitingTaskWithArenaHolder` (the one that will get notified by the
+callback function) with anything. In this case the caller is
+responsible for creating the task(s) and setting up the chain of them.
+
+
+#### Transferring GPU data to CPU
+
+The GPU->CPU data transfer needs synchronization to ensure that the CPU
+memory has all the data before it is put into the event. This means that
+`ExternalWork` needs to be used as follows (a sketch is shown after the list):
+* In `acquire()`
+ * (allocate CPU memory buffers)
+ * Queue all GPU->CPU transfers asynchronously
+* In `produce()`
+ * If needed, read additional CPU products (e.g. from `edm::Ref`s)
+ * Reformat data back to legacy data formats
+ * Note: `cms::cuda::ScopedContextProduce` is **not** needed in `produce()`
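+
+A minimal sketch of such a transfer producer (the class, algorithm, and data
+format names are made up for this example, building on the `FooGPU` sketch
+from the [Data model](#data-model) section; `hostBuffer_` and `size_` are
+assumed to be member variables):
+
+```cpp
+void FooTransferToHost::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  cms::cuda::Product<FooGPU> const& inputWrapped = iEvent.get(inputToken_);
+  cms::cuda::ScopedContextAcquire ctx{inputWrapped, std::move(waitingTaskHolder)};
+  auto const& input = ctx.get(inputWrapped);
+
+  // Allocate a pinned host buffer and queue the GPU->CPU copy asynchronously
+  size_ = input.size();
+  hostBuffer_ = cms::cuda::make_host_unique<float[]>(size_, ctx.stream());
+  cudaCheck(cudaMemcpyAsync(hostBuffer_.get(), input.values(), size_ * sizeof(float),
+                            cudaMemcpyDeviceToHost, ctx.stream()));
+}
+
+void FooTransferToHost::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // The copy queued in acquire() has finished by now; convert the buffer
+  // back to the legacy data format. No ScopedContextProduce is needed here.
+  iEvent.emplace(outputToken_, makeLegacyFoo(hostBuffer_.get(), size_));
+  hostBuffer_.reset();
+}
+```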
+
+#### Synchronizing between CUDA streams
+
+In case the producer needs input data that were produced in two (or
+more) CUDA streams, these streams have to be synchronized. Here this
+synchronization is achieved with CUDA events.
+
+Each `cms::cuda::Product` also contains a CUDA event object. The call to
+`cms::cuda::ScopedContextProduce::wrap()` will *record* the event in the CUDA
+stream. This means that when all work queued to the CUDA stream up to
+that point has been finished, the CUDA event becomes *occurred*. Then,
+in
+`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`,
+if the `cms::cuda::Product` to get from has a different CUDA stream than
+the
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`,
+`cudaStreamWaitEvent(stream, event)` is called. This means that all
+subsequent work queued to the CUDA stream will wait for the CUDA event
+to become occurred. Therefore this subsequent work can assume that the
+CUDA product being obtained exists.
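+
+Conceptually this boils down to the following CUDA calls (a simplified sketch
+of what the framework classes do internally, not code to be written by the
+module developer):
+
+```cpp
+// In the producer: ScopedContextProduce::wrap()/emplace() records the CUDA
+// event in the CUDA stream that was used to produce the data
+cudaCheck(cudaEventRecord(event, producerStream));
+
+// In the consumer: ScopedContext*::get() makes the consumer's CUDA stream
+// wait for that event if the two CUDA streams differ
+if (consumerStream != producerStream) {
+  cudaCheck(cudaStreamWaitEvent(consumerStream, event, 0));
+}
+```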
+
+
+### CUDA ESProduct
+
+Conditions data can be transferred to the device with the following
+pattern.
+
+1. Define a `class`/`struct` for the data to be transferred in the format accessed in the device (hereafter referred to as "payload")
+2. Define a wrapper ESProduct that holds the aforementioned data in the pinned host memory
+3. The wrapper should have a function returning the payload on the
+ device memory. The function should transfer the data to the device
+ asynchronously with the help of `cms::cuda::ESProduct`.
+
+#### Example
+
+```cpp
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+
+// Declare the struct for the payload to be transferred. Here the
+// example is an array with (potentially) dynamic size. Note that all of
+// below becomes simpler if the array has compile-time size.
+struct ESProductExampleCUDA {
+ float *someData;
+ unsigned int size;
+};
+
+// Declare the wrapper ESProduct. The corresponding ESProducer should
+// produce objects of this type.
+class ESProductExampleCUDAWrapper {
+public:
+ // Constructor takes the standard CPU ESProduct, and transforms the
+ // necessary data to array(s) in pinned host memory
+ ESProductExampleCUDAWrapper(ESProductExample const&);
+
+ // Deallocates all pinned host memory
+ ~ESProductExampleCUDAWrapper();
+
+ // Function to return the actual payload on the memory of the current device
+ ESProductExampleCUDA const *getGPUProductAsync(cudaStream_t stream) const;
+
+private:
+ // Holds the data in pinned CPU memory
+ float *someData_;
+ unsigned int size_;
+
+ // Helper struct to hold all information that has to be allocated and
+ // deallocated per device
+ struct GPUData {
+ // Destructor should free all member pointers
+ ~GPUData();
+ // internal pointers are on device, struct itself is on CPU
+ ESProductExampleCUDA *esproductHost = nullptr;
+ // internal pointers and struct are on device
+ ESProductExampleCUDA *esproductDevice = nullptr;
+ };
+
+ // Helper that takes care of complexity of transferring the data to
+ // multiple devices
+ cms::cuda::ESProduct<GPUData> gpuData_;
+};
+
+ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& cpuProduct) {
+ cudaCheck(cudaMallocHost(&someData_, sizeof(float)*NUM_ELEMENTS));
+ // fill someData_ and size_ from cpuProduct
+}
+
+ESProductExampleCUDA const *ESProductExampleCUDAWrapper::getGPUProductAsync(cudaStream_t stream) const {
+ // cms::cuda::ESProduct essentially holds an array of GPUData objects,
+ // one per device. If the data have already been transferred to the
+ // current device (or the transfer has been queued), the helper just
+ // returns a reference to that GPUData object. Otherwise, i.e. data are
+ // not yet on the current device, the helper calls the lambda to do the
+ // necessary memory allocations and to queue the transfers.
+ auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](GPUData& data, cudaStream_t stream) {
+ // Allocate memory. Currently this can be with the CUDA API,
+ // sometime we'll migrate to the caching allocator. Assumption is
+ // that IOV changes are rare enough that adding global synchronization
+ // points is not that bad (for now).
+
+ // Allocate the payload object on pinned host memory.
+ cudaCheck(cudaMallocHost(&data.esproductHost, sizeof(ESProductExampleCUDA)));
+ // Allocate the payload array(s) on device memory.
+ cudaCheck(cudaMalloc(&data.esproductHost->someData, sizeof(float)*NUM_ELEMENTS));
+
+ // Allocate the payload object on the device memory.
+ cudaCheck(cudaMalloc(&data.esproductDevice, sizeof(ESProductExampleCUDA)));
+
+ // Complete the host-side information on the payload
+ data.esproductHost->size = this->size_;
+
+
+ // Transfer the payload, first the array(s) ...
+ cudaCheck(cudaMemcpyAsync(data.esproductHost->someData, this->someData_, sizeof(float)*NUM_ELEMENTS, cudaMemcpyDefault, stream));
+ // ... and then the payload object
+ cudaCheck(cudaMemcpyAsync(data.esproductDevice, data.esproductHost, sizeof(ESProductExampleCUDA), cudaMemcpyDefault, stream));
+ });
+
+ // Returns the payload object on the memory of the current device
+ return data.esproductDevice;
+}
+
+// Destructor frees all member pointers
+ESProductExampleCUDAWrapper::GPUData::~GPUData() {
+ if(esproductHost != nullptr) {
+ cudaCheck(cudaFree(esproductHost->someData));
+ cudaCheck(cudaFreeHost(esproductHost));
+ }
+ cudaCheck(cudaFree(esproductDevice));
+}
+
+```
diff --git a/HeterogeneousCore/CUDACore/interface/ContextState.h b/HeterogeneousCore/CUDACore/interface/ContextState.h
new file mode 100644
index 0000000000000..4de13c643a79f
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/ContextState.h
@@ -0,0 +1,61 @@
+#ifndef HeterogeneousCore_CUDACore_ContextState_h
+#define HeterogeneousCore_CUDACore_ContextState_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
+
+#include <memory>
+
+namespace cms {
+ namespace cuda {
+ /**
+ * The purpose of this class is to deliver the device and CUDA stream
+ * information from ExternalWork's acquire() to produce() via a
+ * member/StreamCache variable.
+ */
+ class ContextState {
+ public:
+ ContextState() = default;
+ ~ContextState() = default;
+
+ ContextState(const ContextState&) = delete;
+ ContextState& operator=(const ContextState&) = delete;
+ ContextState(ContextState&&) = delete;
+ ContextState& operator=(ContextState&& other) = delete;
+
+ private:
+ friend class ScopedContextAcquire;
+ friend class ScopedContextProduce;
+ friend class ScopedContextTask;
+
+ void set(int device, SharedStreamPtr stream) {
+ throwIfStream();
+ device_ = device;
+ stream_ = std::move(stream);
+ }
+
+ int device() const { return device_; }
+
+ const SharedStreamPtr& streamPtr() const {
+ throwIfNoStream();
+ return stream_;
+ }
+
+ SharedStreamPtr releaseStreamPtr() {
+ throwIfNoStream();
+ // This function needs to effectively reset stream_ (i.e. stream_
+ // must be empty after this function). This behavior ensures that
+ // the SharedStreamPtr is not held for an inadvertently long time
+ // (i.e. into the next event), and is checked at run time.
+ return std::move(stream_);
+ }
+
+ void throwIfStream() const;
+ void throwIfNoStream() const;
+
+ SharedStreamPtr stream_;
+ int device_;
+ };
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/interface/ESProduct.h b/HeterogeneousCore/CUDACore/interface/ESProduct.h
new file mode 100644
index 0000000000000..485d694e5b522
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/ESProduct.h
@@ -0,0 +1,103 @@
+#ifndef HeterogeneousCore_CUDACore_ESProduct_h
+#define HeterogeneousCore_CUDACore_ESProduct_h
+
+#include <atomic>
+#include <cassert>
+#include <mutex>
+#include <vector>
+
+#include "FWCore/Utilities/interface/thread_safety_macros.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
+
+namespace cms {
+ namespace cuda {
+ template <typename T>
+ class ESProduct {
+ public:
+ ESProduct() : gpuDataPerDevice_(deviceCount()) {
+ for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
+ gpuDataPerDevice_[i].m_event = getEventCache().get();
+ }
+ }
+ ~ESProduct() = default;
+
+ // transferAsync should be a function of (T&, cudaStream_t)
+ // which enqueues asynchronous transfers (possibly kernels as well)
+ // to the CUDA stream
+ template <typename F>
+ const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const {
+ auto device = currentDevice();
+
+ auto& data = gpuDataPerDevice_[device];
+
+ // If GPU data has already been filled, we can return it
+ // immediately
+ if (not data.m_filled.load()) {
+ // It wasn't, so need to fill it
+ std::scoped_lock lk{data.m_mutex};
+
+ if (data.m_filled.load()) {
+ // Other thread marked it filled while we were locking the mutex, so we're free to return it
+ return data.m_data;
+ }
+
+ if (data.m_fillingStream != nullptr) {
+ // Someone else is filling
+
+ // Check first if the recorded event has occurred
+ if (eventWorkHasCompleted(data.m_event.get())) {
+ // It was, so the data is accessible from all CUDA streams on
+ // the device. Set the 'filled' flag for all subsequent calls and
+ // return the value
+ auto should_be_false = data.m_filled.exchange(true);
+ assert(not should_be_false);
+ data.m_fillingStream = nullptr;
+ } else if (data.m_fillingStream != cudaStream) {
+ // Filling is still going on. For another CUDA stream, add a
+ // wait on the CUDA stream and return the value. Subsequent
+ // work queued on the stream will wait for the event to
+ // occur (i.e. for the transfer to finish).
+ cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0),
+ "Failed to make a stream to wait for an event");
+ }
+ // else: filling is still going on, but on the same CUDA
+ // stream (which would be a bit strange but is fine), so we can
+ // just return, as all subsequent work should be enqueued to the
+ // same CUDA stream (or the stream must be explicitly synchronized
+ // by the caller)
+ } else {
+ // Now we can be sure that the data is not yet on the GPU, and
+ // this thread is the first to try that.
+ transferAsync(data.m_data, cudaStream);
+ assert(data.m_fillingStream == nullptr);
+ data.m_fillingStream = cudaStream;
+ // Now the filling has been enqueued to the cudaStream, so we
+ // can return the GPU data immediately, since all subsequent
+ // work must be either enqueued to the cudaStream, or the cudaStream
+ // must be synchronized by the caller
+ }
+ }
+
+ return data.m_data;
+ }
+
+ private:
+ struct Item {
+ mutable std::mutex m_mutex;
+ CMS_THREAD_GUARD(m_mutex) mutable SharedEventPtr m_event;
+ // non-null if some thread is already filling (cudaStream_t is just a pointer)
+ CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr;
+ mutable std::atomic<bool> m_filled = false; // easy check if data has been filled already or not
+ CMS_THREAD_GUARD(m_mutex) mutable T m_data;
+ };
+
+ std::vector<Item> gpuDataPerDevice_;
+ };
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/interface/ScopedContext.h b/HeterogeneousCore/CUDACore/interface/ScopedContext.h
new file mode 100644
index 0000000000000..cdc3e2dd2c620
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/ScopedContext.h
@@ -0,0 +1,242 @@
+#ifndef HeterogeneousCore_CUDACore_ScopedContext_h
+#define HeterogeneousCore_CUDACore_ScopedContext_h
+
+#include
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/EDPutToken.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
+
+namespace cms {
+ namespace cudatest {
+ class TestScopedContext;
+ }
+
+ namespace cuda {
+
+ namespace impl {
+ // This class is intended to be derived by other ScopedContext*, not for general use
+ class ScopedContextBase {
+ public:
+ int device() const { return currentDevice_; }
+
+ // cudaStream_t is a pointer to a thread-safe object, for which a
+ // mutable access is needed even if the ScopedContext itself
+ // would be const. Therefore it is ok to return a non-const
+ // pointer from a const method here.
+ cudaStream_t stream() const { return stream_.get(); }
+ const SharedStreamPtr& streamPtr() const { return stream_; }
+
+ protected:
+ // The constructors set the current device, but the device
+ // is not set back to the previous value in the destructor. This
+ // should be sufficient (and a tiny bit faster) as all CUDA API
+ // functions relying on the current device should be called from
+ // the scope where this context is. The current device doesn't
+ // really matter between modules (or across TBB tasks).
+ explicit ScopedContextBase(edm::StreamID streamID);
+
+ explicit ScopedContextBase(const ProductBase& data);
+
+ explicit ScopedContextBase(int device, SharedStreamPtr stream);
+
+ private:
+ int currentDevice_;
+ SharedStreamPtr stream_;
+ };
+
+ class ScopedContextGetterBase : public ScopedContextBase {
+ public:
+ template <typename T>
+ const T& get(const Product<T>& data) {
+ synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
+ return data.data_;
+ }
+
+ template <typename T>
+ const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
+ return get(iEvent.get(token));
+ }
+
+ protected:
+ template <typename... Args>
+ ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward<Args>(args)...) {}
+
+ void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
+ };
+
+ class ScopedContextHolderHelper {
+ public:
+ ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+ : waitingTaskHolder_{std::move(waitingTaskHolder)} {}
+
+ template <typename F>
+ void pushNextTask(F&& f, ContextState const* state);
+
+ void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ waitingTaskHolder_ = std::move(waitingTaskHolder);
+ }
+
+ void enqueueCallback(int device, cudaStream_t stream);
+
+ private:
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
+ };
+ } // namespace impl
+
+ /**
+ * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
+ * - setting the current device
+ * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
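+ //
+ // Sketch of a typical acquire() body (srcToken_, InputData and the kernel
+ // are illustrative assumptions, not part of this change):
+ //
+ //   auto const& wrapped = iEvent.get(srcToken_);   // cms::cuda::Product<InputData>
+ //   cms::cuda::ScopedContextAcquire ctx{wrapped, std::move(waitingTaskHolder)};
+ //   auto const& input = ctx.get(wrapped);          // adds stream synchronization if needed
+ //   myKernel<<<blocks, threads, 0, ctx.stream()>>>(input.d_ptr);
+ //   // ~ScopedContextAcquire() enqueues a callback that calls doneWaiting()
+ //   // once the asynchronous work on the stream has finished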
+ class ScopedContextAcquire : public impl::ScopedContextGetterBase {
+ public:
+ /// Constructor to create a new CUDA stream (no need for context beyond acquire())
+ explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+ : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {}
+
+ /// Constructor to create a new CUDA stream, and the context is needed after acquire()
+ explicit ScopedContextAcquire(edm::StreamID streamID,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder,
+ ContextState& state)
+ : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
+
+ /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire())
+ explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+ : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {}
+
+ /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire()
+ explicit ScopedContextAcquire(const ProductBase& data,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder,
+ ContextState& state)
+ : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
+
+ ~ScopedContextAcquire();
+
+ template <typename F>
+ void pushNextTask(F&& f) {
+ if (contextState_ == nullptr)
+ throwNoState();
+ holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
+ }
+
+ void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+ }
+
+ private:
+ void throwNoState();
+
+ impl::ScopedContextHolderHelper holderHelper_;
+ ContextState* contextState_ = nullptr;
+ };
+
+ /**
+ * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
+ * - setting the current device
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
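+ //
+ // Sketch of a produce() body in a non-ExternalWork module (outputToken_,
+ // OutputData and launchKernels() are illustrative assumptions):
+ //
+ //   cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
+ //   OutputData output = launchKernels(ctx.stream());       // queue asynchronous work
+ //   ctx.emplace(iEvent, outputToken_, std::move(output));  // stored as cms::cuda::Product<OutputData>
+ //   // ~ScopedContextProduce() records the CUDA event used for stream synchronization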
+ class ScopedContextProduce : public impl::ScopedContextGetterBase {
+ public:
+ /// Constructor to create a new CUDA stream (non-ExternalWork module)
+ explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {}
+
+ /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module)
+ explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {}
+
+ /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+ explicit ScopedContextProduce(ContextState& state)
+ : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {}
+
+ /// Record the CUDA event, all asynchronous work must have been queued before the destructor
+ ~ScopedContextProduce();
+
+ template <typename T>
+ std::unique_ptr<Product<T>> wrap(T data) {
+ // make_unique doesn't work because of private constructor
+ return std::unique_ptr<Product<T>>(new Product<T>(device(), streamPtr(), event_, std::move(data)));
+ }
+
+ template <typename T, typename... Args>
+ auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
+ return iEvent.emplace(token, device(), streamPtr(), event_, std::forward<Args>(args)...);
+ }
+
+ private:
+ friend class cudatest::TestScopedContext;
+
+ // This constructor is only meant for testing
+ explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event)
+ : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
+
+ // create the CUDA Event upfront to catch possible errors from its creation
+ SharedEventPtr event_ = getEventCache().get();
+ };
+
+ /**
+ * The aim of this class is to do necessary per-task "initialization" in tasks created in ExternalWork acquire():
+ * - setting the current device
+ * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
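+ //
+ // ScopedContextTask objects are created by ScopedContextAcquire::pushNextTask();
+ // a sketch (the lambda body and launchMoreKernels() are illustrative assumptions):
+ //
+ //   ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx2) {
+ //     // runs after the work queued so far on the stream has finished,
+ //     // re-using the same device and CUDA stream
+ //     launchMoreKernels(ctx2.stream());
+ //   });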
+ class ScopedContextTask : public impl::ScopedContextBase {
+ public:
+ /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+ explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+ : ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards
+ holderHelper_{std::move(waitingTaskHolder)},
+ contextState_{state} {}
+
+ ~ScopedContextTask();
+
+ template <typename F>
+ void pushNextTask(F&& f) {
+ holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
+ }
+
+ void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+ }
+
+ private:
+ impl::ScopedContextHolderHelper holderHelper_;
+ ContextState const* contextState_;
+ };
+
+ /**
+ * The aim of this class is to do necessary per-event "initialization" in analyze()
+ * - setting the current device
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
+ class ScopedContextAnalyze : public impl::ScopedContextGetterBase {
+ public:
+ /// Constructor to (possibly) re-use a CUDA stream
+ explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {}
+ };
+
+ namespace impl {
+ template <typename F>
+ void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) {
+ replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{
+ edm::make_waiting_task_with_holder(tbb::task::allocate_root(),
+ std::move(waitingTaskHolder_),
+ [state, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
+ func(ScopedContextTask{state, std::move(h)});
+ })});
+ }
+ } // namespace impl
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py
new file mode 100644
index 0000000000000..ded114e2fddfe
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py
@@ -0,0 +1,34 @@
+import FWCore.ParameterSet.Config as cms
+
+_cuda_enabled_cached = None
+
+def _switch_cuda():
+ global _cuda_enabled_cached
+ if _cuda_enabled_cached is None:
+ import os
+ _cuda_enabled_cached = (os.system("cudaIsEnabled") == 0)
+ return (_cuda_enabled_cached, 2)
+
+class SwitchProducerCUDA(cms.SwitchProducer):
+ def __init__(self, **kargs):
+ super(SwitchProducerCUDA,self).__init__(
+ dict(cpu = cms.SwitchProducer.getCpu(),
+ cuda = _switch_cuda),
+ **kargs
+ )
+cms.specialImportRegistry.registerSpecialImportForType(SwitchProducerCUDA, "from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA")
+
+if __name__ == "__main__":
+ import unittest
+
+ class TestSwitchProducerCUDA(unittest.TestCase):
+ def testPickle(self):
+ import pickle
+ sp = SwitchProducerCUDA(cpu = cms.EDProducer("Foo"), cuda = cms.EDProducer("Bar"))
+ pkl = pickle.dumps(sp)
+ unpkl = pickle.loads(pkl)
+ self.assertEqual(unpkl.cpu.type_(), "Foo")
+ self.assertEqual(unpkl.cuda.type_(), "Bar")
+
+ unittest.main()
+
diff --git a/HeterogeneousCore/CUDACore/src/ContextState.cc b/HeterogeneousCore/CUDACore/src/ContextState.cc
new file mode 100644
index 0000000000000..0670f01d472f3
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/ContextState.cc
@@ -0,0 +1,16 @@
+#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
+#include "FWCore/Utilities/interface/Exception.h"
+
+namespace cms::cuda {
+ void ContextState::throwIfStream() const {
+ if (stream_) {
+ throw cms::Exception("LogicError") << "Trying to set ContextState, but it already had a valid state";
+ }
+ }
+
+ void ContextState::throwIfNoStream() const {
+ if (not stream_) {
+ throw cms::Exception("LogicError") << "Trying to get ContextState, but it did not have a valid state";
+ }
+ }
+} // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
new file mode 100644
index 0000000000000..7461ebbee9f0d
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
@@ -0,0 +1,118 @@
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/Exception.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "chooseDevice.h"
+
+namespace {
+ struct CallbackData {
+ edm::WaitingTaskWithArenaHolder holder;
+ int device;
+ };
+
+ void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) {
+ std::unique_ptr<CallbackData> guard{reinterpret_cast<CallbackData*>(data)};
+ edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder;
+ int device = guard->device;
+ if (status == cudaSuccess) {
+ LogTrace("ScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream "
+ << streamId;
+ waitingTaskHolder.doneWaiting(nullptr);
+ } else {
+ // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+ try {
+ auto error = cudaGetErrorName(status);
+ auto message = cudaGetErrorString(status);
+ throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device
+ << " error " << error << ": " << message;
+ } catch (cms::Exception&) {
+ waitingTaskHolder.doneWaiting(std::current_exception());
+ }
+ }
+ }
+} // namespace
+
+namespace cms::cuda {
+ namespace impl {
+ ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
+ cudaCheck(cudaSetDevice(currentDevice_));
+ stream_ = getStreamCache().get();
+ }
+
+ ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) {
+ cudaCheck(cudaSetDevice(currentDevice_));
+ if (data.mayReuseStream()) {
+ stream_ = data.streamPtr();
+ } else {
+ stream_ = getStreamCache().get();
+ }
+ }
+
+ ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream)
+ : currentDevice_(device), stream_(std::move(stream)) {
+ cudaCheck(cudaSetDevice(currentDevice_));
+ }
+
+ ////////////////////
+
+ void ScopedContextGetterBase::synchronizeStreams(int dataDevice,
+ cudaStream_t dataStream,
+ bool available,
+ cudaEvent_t dataEvent) {
+ if (dataDevice != device()) {
+ // Eventually replace with prefetch to current device (assuming unified memory works)
+ // If we won't go to unified memory, need to figure out something else...
+ throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported";
+ }
+
+ if (dataStream != stream()) {
+ // Different streams, need to synchronize
+ if (not available) {
+ // Event not yet occurred, so need to add synchronization
+ // here. Synchronization is done by making the CUDA stream
+ // wait for an event, so all subsequent work in the stream
+ // will run only after the event has "occurred" (i.e. the data
+ // product became available).
+ cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
+ }
+ }
+ }
+
+ void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) {
+ cudaCheck(
+ cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0));
+ }
+ } // namespace impl
+
+ ////////////////////
+
+ ScopedContextAcquire::~ScopedContextAcquire() {
+ holderHelper_.enqueueCallback(device(), stream());
+ if (contextState_) {
+ contextState_->set(device(), std::move(streamPtr()));
+ }
+ }
+
+ void ScopedContextAcquire::throwNoState() {
+ throw cms::Exception("LogicError")
+ << "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with "
+ "ContextState, but that was not the case";
+ }
+
+ ////////////////////
+
+ ScopedContextProduce::~ScopedContextProduce() {
+ // Intentionally not checking the return value to avoid throwing
+ // exceptions. If this call would fail, we should get failures
+ // elsewhere as well.
+ cudaEventRecord(event_.get(), stream());
+ }
+
+ ////////////////////
+
+ ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); }
+} // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/chooseDevice.cc b/HeterogeneousCore/CUDACore/src/chooseDevice.cc
new file mode 100644
index 0000000000000..7312760be7d84
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/chooseDevice.cc
@@ -0,0 +1,18 @@
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "chooseDevice.h"
+
+namespace cms::cuda {
+ int chooseDevice(edm::StreamID id) {
+ edm::Service<CUDAService> cudaService;
+
+ // For starters, we "statically" assign the device based on
+ // edm::Stream number. This is suboptimal if the number of
+ // edm::Streams is not a multiple of the number of CUDA devices
+ // (and even then there is no load balancing).
+ //
+ // TODO: improve the "assignment" logic
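+ //
+ // As an illustration of this static assignment: with 4 EDM streams and
+ // 2 CUDA devices, streams 0 and 2 map to device 0 and streams 1 and 3
+ // map to device 1.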
+ return id % cudaService->numberOfDevices();
+ }
+} // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/chooseDevice.h b/HeterogeneousCore/CUDACore/src/chooseDevice.h
new file mode 100644
index 0000000000000..ab642325aaecf
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/chooseDevice.h
@@ -0,0 +1,10 @@
+#ifndef HeterogeneousCore_CUDACore_chooseDevice_h
+#define HeterogeneousCore_CUDACore_chooseDevice_h
+
+#include "FWCore/Utilities/interface/StreamID.h"
+
+namespace cms::cuda {
+ int chooseDevice(edm::StreamID id);
+}
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml
new file mode 100644
index 0000000000000..474f3cee1b3e8
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml
@@ -0,0 +1,18 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
new file mode 100644
index 0000000000000..b0a66e2b777e4
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
@@ -0,0 +1,134 @@
+/**
+ * The purpose of this test program is to ensure that the logic for
+ * CUDA event use in cms::cuda::Product and cms::cuda::ScopedContext
+ * works as intended.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+
+namespace {
+ constexpr int ARRAY_SIZE = 20000000;
+ constexpr int NLOOPS = 10;
+} // namespace
+
+__global__ void kernel_looping(float *point, unsigned int num) {
+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+ for (int iloop = 0; iloop < NLOOPS; ++iloop) {
+ for (size_t offset = idx; offset < num; offset += gridDim.x * blockDim.x) {
+ point[offset] += 1;
+ }
+ }
+}
+
+int main() {
+ cms::cudatest::requireDevices();
+
+ constexpr bool debug = false;
+
+ float *dev_points1;
+ float *host_points1;
+ cudaStream_t stream1, stream2;
+ cudaEvent_t event1, event2;
+
+ cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float)));
+ cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float)));
+ cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
+ cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
+ cudaEventCreate(&event1);
+ cudaEventCreate(&event2);
+
+ for (size_t j = 0; j < ARRAY_SIZE; ++j) {
+ host_points1[j] = static_cast(j);
+ }
+
+ cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1));
+ kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE);
+ if (debug)
+ std::cout << "Kernel launched on stream1" << std::endl;
+
+ auto status = cudaStreamQuery(stream1);
+ if (debug)
+ std::cout << "Stream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl;
+ cudaEventRecord(event1, stream1);
+ status = cudaEventQuery(event1);
+ if (debug)
+ std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ assert(status == cudaErrorNotReady);
+
+ status = cudaStreamQuery(stream2);
+ if (debug)
+ std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl;
+ assert(status == cudaSuccess);
+ if (debug) {
+ cudaEventRecord(event2, stream2);
+ status = cudaEventQuery(event2);
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ status = cudaEventQuery(event2);
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ }
+
+ cudaStreamWaitEvent(stream2, event1, 0);
+ if (debug)
+ std::cout << "\nStream2 waiting for event1" << std::endl;
+ status = cudaStreamQuery(stream2);
+ if (debug)
+ std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl;
+ assert(status == cudaErrorNotReady);
+ cudaEventRecord(event2, stream2);
+ status = cudaEventQuery(event2);
+ if (debug)
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ assert(status == cudaErrorNotReady);
+ if (debug) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ status = cudaEventQuery(event2);
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ }
+
+ status = cudaStreamQuery(stream1);
+ if (debug) {
+ std::cout << "\nStream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess)
+ << std::endl;
+ std::cout << "Synchronizing stream1" << std::endl;
+ }
+ assert(status == cudaErrorNotReady);
+ cudaStreamSynchronize(stream1);
+ if (debug)
+ std::cout << "Synchronized stream1" << std::endl;
+
+ status = cudaEventQuery(event1);
+ if (debug)
+ std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ assert(status == cudaSuccess);
+ status = cudaEventQuery(event2);
+ if (debug)
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ assert(status == cudaSuccess);
+
+ cudaFree(dev_points1);
+ cudaFreeHost(host_points1);
+ cudaStreamDestroy(stream1);
+ cudaStreamDestroy(stream2);
+ cudaEventDestroy(event1);
+ cudaEventDestroy(event2);
+
+ return 0;
+}
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc
new file mode 100644
index 0000000000000..7a6543a667c3d
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc
@@ -0,0 +1,134 @@
+#include "catch.hpp"
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "FWCore/Concurrency/interface/WaitingTask.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
+
+#include "test_ScopedContextKernels.h"
+
+namespace cms::cudatest {
+ class TestScopedContext {
+ public:
+ static cuda::ScopedContextProduce make(int dev, bool createEvent) {
+ cms::cuda::SharedEventPtr event;
+ if (createEvent) {
+ event = cms::cuda::getEventCache().get();
+ }
+ return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
+ }
+ };
+} // namespace cms::cudatest
+
+namespace {
+ std::unique_ptr<cms::cuda::Product<int*>> produce(int device, int* d, int* h) {
+ auto ctx = cms::cudatest::TestScopedContext::make(device, true);
+ cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()));
+ cms::cudatest::testScopedContextKernels_single(d, ctx.stream());
+ return ctx.wrap(d);
+ }
+} // namespace
+
+TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") {
+ if (not cms::cudatest::testDevices()) {
+ return;
+ }
+
+ constexpr int defaultDevice = 0;
+ {
+ auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
+
+ SECTION("Construct from device ID") { REQUIRE(cms::cuda::currentDevice() == defaultDevice); }
+
+ SECTION("Wrap T to cms::cuda::Product") {
+ std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
+ REQUIRE(dataPtr.get() != nullptr);
+ REQUIRE(dataPtr->device() == ctx.device());
+ REQUIRE(dataPtr->stream() == ctx.stream());
+ }
+
+ SECTION("Construct from from cms::cuda::Product") {
+ std::unique_ptr> dataPtr = ctx.wrap(10);
+ const auto& data = *dataPtr;
+
+ cms::cuda::ScopedContextProduce ctx2{data};
+ REQUIRE(cms::cuda::currentDevice() == data.device());
+ REQUIRE(ctx2.stream() == data.stream());
+
+ // Second use of a product should lead to a new stream
+ cms::cuda::ScopedContextProduce ctx3{data};
+ REQUIRE(cms::cuda::currentDevice() == data.device());
+ REQUIRE(ctx3.stream() != data.stream());
+ }
+
+ SECTION("Storing state in cms::cuda::ContextState") {
+ cms::cuda::ContextState ctxstate;
+ { // acquire
+ std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
+ const auto& data = *dataPtr;
+ edm::WaitingTaskWithArenaHolder dummy{
+ edm::make_waiting_task(tbb::task::allocate_root(), [](std::exception_ptr const* iPtr) {})};
+ cms::cuda::ScopedContextAcquire ctx2{data, std::move(dummy), ctxstate};
+ }
+
+ { // produce
+ cms::cuda::ScopedContextProduce ctx2{ctxstate};
+ REQUIRE(cms::cuda::currentDevice() == ctx.device());
+ REQUIRE(ctx2.stream() == ctx.stream());
+ }
+ }
+
+ SECTION("Joining multiple CUDA streams") {
+ cms::cuda::ScopedSetDevice setDeviceForThisScope(defaultDevice);
+
+ // Mimic a producer on the first CUDA stream
+ int h_a1 = 1;
+ auto d_a1 = cms::cuda::make_device_unique<int>(nullptr);
+ auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1);
+
+ // Mimic a producer on the second CUDA stream
+ int h_a2 = 2;
+ auto d_a2 = cms::cuda::make_device_unique<int>(nullptr);
+ auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2);
+
+ REQUIRE(wprod1->stream() != wprod2->stream());
+
+ // Mimic a third producer "joining" the two streams
+ cms::cuda::ScopedContextProduce ctx2{*wprod1};
+
+ auto prod1 = ctx2.get(*wprod1);
+ auto prod2 = ctx2.get(*wprod2);
+
+ auto d_a3 = cms::cuda::make_device_unique<int>(nullptr);
+ cms::cudatest::testScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream());
+ cudaCheck(cudaStreamSynchronize(ctx2.stream()));
+ REQUIRE(wprod2->isAvailable());
+ REQUIRE(cms::cuda::eventWorkHasCompleted(wprod2->event()));
+
+ h_a1 = 0;
+ h_a2 = 0;
+ int h_a3 = 0;
+
+ cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+ cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+ cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+
+ REQUIRE(h_a1 == 2);
+ REQUIRE(h_a2 == 4);
+ REQUIRE(h_a3 == 6);
+ }
+ }
+
+ cudaCheck(cudaSetDevice(defaultDevice));
+ cudaCheck(cudaDeviceSynchronize());
+ // Note: CUDA resources are cleaned up by the destructors of the global cache objects
+}
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu
new file mode 100644
index 0000000000000..b87f1e20a5f24
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu
@@ -0,0 +1,17 @@
+#include "test_ScopedContextKernels.h"
+
+namespace {
+ __global__ void single_mul(int *d) { d[0] = d[0] * 2; }
+
+ __global__ void join_add(const int *d1, const int *d2, int *d3) { d3[0] = d1[0] + d2[0]; }
+} // namespace
+
+namespace cms {
+ namespace cudatest {
+ void testScopedContextKernels_single(int *d, cudaStream_t stream) { single_mul<<<1, 1, 0, stream>>>(d); }
+
+ void testScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream) {
+ join_add<<<1, 1, 0, stream>>>(d1, d2, d3);
+ }
+ } // namespace cudatest
+} // namespace cms
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h
new file mode 100644
index 0000000000000..dfc55682afc76
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h
@@ -0,0 +1,13 @@
+#ifndef HeterogeneousCore_CUDACore_test_ScopedContextKernels_h
+#define HeterogeneousCore_CUDACore_test_ScopedContextKernels_h
+
+#include
+
+namespace cms {
+ namespace cudatest {
+ void testScopedContextKernels_single(int *d, cudaStream_t stream);
+ void testScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream);
+ } // namespace cudatest
+} // namespace cms
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc
new file mode 100644
index 0000000000000..2e1027598a4de
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_main.cc
@@ -0,0 +1,31 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
+
+#include "FWCore/ParameterSetReader/interface/ParameterSetReader.h"
+#include "FWCore/PluginManager/interface/PluginManager.h"
+#include "FWCore/PluginManager/interface/standard.h"
+#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h"
+
+class ServiceRegistryListener : public Catch::TestEventListenerBase {
+public:
+ using Catch::TestEventListenerBase::TestEventListenerBase; // inherit constructor
+
+ void testRunStarting(Catch::TestRunInfo const& testRunInfo) override {
+ edmplugin::PluginManager::configure(edmplugin::standard::config());
+
+ const std::string config{
+ R"_(import FWCore.ParameterSet.Config as cms
+process = cms.Process('Test')
+process.CUDAService = cms.Service('CUDAService')
+)_"};
+
+ std::unique_ptr<edm::ParameterSet> params;
+ edm::makeParameterSets(config, params);
+ edm::ServiceToken tempToken(edm::ServiceRegistry::createServicesFromConfig(std::move(params)));
+ operate_.reset(new edm::ServiceRegistry::Operate(tempToken));
+ }
+
+private:
+ std::unique_ptr<edm::ServiceRegistry::Operate> operate_;
+};
+CATCH_REGISTER_LISTENER(ServiceRegistryListener);
diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml
new file mode 100644
index 0000000000000..c2d566baf5c1b
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/BuildFile.xml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
new file mode 100644
index 0000000000000..68e32b64b4032
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp
new file mode 100644
index 0000000000000..5a65575873116
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp
@@ -0,0 +1,23 @@
+// C++ standard headers
+#include
+#include
+
+// CUDA headers
+#include
+
+// CMSSW headers
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+int main() {
+ int devices = 0;
+ cudaCheck(cudaGetDeviceCount(&devices));
+
+ for (int i = 0; i < devices; ++i) {
+ cudaDeviceProp properties;
+ cudaGetDeviceProperties(&properties, i);
+ std::cout << std::setw(4) << i << " " << std::setw(2) << properties.major << "." << properties.minor << " "
+ << properties.name << std::endl;
+ }
+
+ return 0;
+}
diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
new file mode 100644
index 0000000000000..eb21f22cd0c5c
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
@@ -0,0 +1,31 @@
+#include
+#include
+#include
+#include
+
+#include
+
+int main() {
+ int devices = 0;
+ auto status = cudaGetDeviceCount(&devices);
+ if (status != cudaSuccess) {
+ return EXIT_FAILURE;
+ }
+
+ int minimumMajor = 6; // min minor is implicitly 0
+
+ // This approach (requiring that all devices are supported) is rather
+ // conservative. In principle we could consider just dropping the
+ // unsupported devices. Currently that would be easiest to achieve
+ // in CUDAService, though.
+ for (int i = 0; i < devices; ++i) {
+ cudaDeviceProp properties;
+ cudaGetDeviceProperties(&properties, i);
+
+ if ((not(properties.major == 3 and properties.minor == 5)) and properties.major < minimumMajor) {
+ return EXIT_FAILURE;
+ }
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
new file mode 100644
index 0000000000000..5295af75513b0
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
@@ -0,0 +1,38 @@
+#ifndef HeterogeneousCore_CUDAServices_CUDAService_h
+#define HeterogeneousCore_CUDAServices_CUDAService_h
+
+#include
+#include
+
+#include "FWCore/Utilities/interface/StreamID.h"
+
+namespace edm {
+ class ParameterSet;
+ class ActivityRegistry;
+ class ConfigurationDescriptions;
+} // namespace edm
+
+class CUDAService {
+public:
+ CUDAService(edm::ParameterSet const& iConfig);
+ ~CUDAService();
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+ bool enabled() const { return enabled_; }
+
+ int numberOfDevices() const { return numberOfDevices_; }
+
+ // major, minor
+ std::pair<int, int> computeCapability(int device) const { return computeCapabilities_.at(device); }
+
+ // Returns the id of the device with the most free memory. If none is found, returns -1.
+ int deviceWithMostFreeMemory() const;
+
+private:
+ int numberOfDevices_ = 0;
+ std::vector<std::pair<int, int>> computeCapabilities_;
+ bool enabled_ = false;
+};
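+
+// A minimal usage sketch (illustrative only; modules would normally go
+// through cms::cuda::ScopedContext* instead of using the service directly):
+//
+//   edm::Service<CUDAService> cs;
+//   if (cs->enabled()) {
+//     for (int d = 0; d < cs->numberOfDevices(); ++d) {
+//       auto [major, minor] = cs->computeCapability(d);
+//       // ... decide whether the device is usable ...
+//     }
+//   }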
+
+#endif
diff --git a/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h
new file mode 100644
index 0000000000000..b563b98b516cf
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h
@@ -0,0 +1,9 @@
+#ifndef HeterogeneousCore_CUDAServices_numberOfCUDADevices_h
+#define HeterogeneousCore_CUDAServices_numberOfCUDADevices_h
+
+// Returns the number of CUDA devices
+// The difference wrt. the standard CUDA function is that if
+// CUDAService is disabled, this function returns 0.
+int numberOfCUDADevices();
+
+#endif
diff --git a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml
index afcf86afdef75..95857d74e7dfa 100644
--- a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml
+++ b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml
@@ -1,18 +1,17 @@
-#Skip building plugins by dropping all files for none-AMD64 build
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
new file mode 100644
index 0000000000000..6d8527935e334
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
@@ -0,0 +1,107 @@
+#include
+
+#include
+
+#include "DataFormats/Provenance/interface/ModuleDescription.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
+#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+namespace edm {
+ class StreamContext;
+}
+
+class CUDAMonitoringService {
+public:
+ CUDAMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
+ ~CUDAMonitoringService() = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+ void postModuleConstruction(edm::ModuleDescription const& desc);
+ void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
+ void postEvent(edm::StreamContext const& sc);
+
+private:
+ int numberOfDevices_ = 0;
+};
+
+CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
+ // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
+ edm::Service<CUDAService> cudaService;
+ if (!cudaService->enabled())
+ return;
+ numberOfDevices_ = cudaService->numberOfDevices();
+
+ if (config.getUntrackedParameter<bool>("memoryConstruction")) {
+ registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
+ }
+ if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
+ registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
+ }
+ if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
+ registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
+ }
+}
+
+void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.addUntracked("memoryConstruction", false)
+ ->setComment("Print memory information for each device after the construction of each module");
+ desc.addUntracked("memoryBeginStream", true)
+ ->setComment("Print memory information for each device after the beginStream() of each module");
+ desc.addUntracked("memoryPerEvent", true)
+ ->setComment("Print memory information for each device after each event");
+
+ descriptions.add("CUDAMonitoringService", desc);
+ descriptions.setComment(
+ "The memory information is the global state of the device. This gets confusing if there are multiple processes "
+ "running on the same device. Probably the information retrieval should be re-thought?");
+}
+
+// activity handlers
+namespace {
+ template <typename T>
+ void dumpUsedMemory(T& log, int num) {
+ int old = 0;
+ cudaCheck(cudaGetDevice(&old));
+ for (int i = 0; i < num; ++i) {
+ size_t freeMemory, totalMemory;
+ cudaCheck(cudaSetDevice(i));
+ cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
+ log << "\n"
+ << i << ": " << (totalMemory - freeMemory) / (1 << 20) << " MB used / " << totalMemory / (1 << 20)
+ << " MB total";
+ }
+ cudaCheck(cudaSetDevice(old));
+ }
+} // namespace
+
+void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
+ auto log = edm::LogPrint("CUDAMonitoringService");
+ log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
+ dumpUsedMemory(log, numberOfDevices_);
+}
+
+void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
+ auto log = edm::LogPrint("CUDAMonitoringService");
+ log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
+ << mcc.moduleDescription()->moduleName() << ")";
+ dumpUsedMemory(log, numberOfDevices_);
+}
+
+void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
+ auto log = edm::LogPrint("CUDAMonitoringService");
+ log << "CUDA device memory after event";
+ dumpUsedMemory(log, numberOfDevices_);
+}
+
+DEFINE_FWK_SERVICE(CUDAMonitoringService);
diff --git a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc
index ec8c4deac4d4d..29fa1ab959025 100644
--- a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc
+++ b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc
@@ -41,6 +41,7 @@
#include "FWCore/Utilities/interface/Exception.h"
#include "FWCore/Utilities/interface/ProductKindOfType.h"
#include "FWCore/Utilities/interface/TimeOfDay.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
using namespace std::string_literals;
@@ -285,9 +286,8 @@ class NVProfilerService {
std::vector highlightModules_;
const bool showModulePrefetching_;
- bool skipFirstEvent_;
+ const bool skipFirstEvent_;
- unsigned int concurrentStreams_;
 std::atomic<bool> globalFirstEventDone_ = false;
 std::vector<std::atomic<bool>> streamFirstEventDone_;
 std::vector<nvtxRangeId_t> event_; // per-stream event ranges
@@ -295,49 +295,22 @@ class NVProfilerService {
// use a tbb::concurrent_vector rather than an std::vector because its final size is not known
 tbb::concurrent_vector<nvtxRangeId_t> global_modules_; // global per-module events
-private:
- struct Domains {
- nvtxDomainHandle_t global;
- std::vector<nvtxDomainHandle_t> stream;
-
- Domains(NVProfilerService* service) {
- global = nvtxDomainCreate("EDM Global");
- allocate_streams(service->concurrentStreams_);
- }
-
- ~Domains() {
- nvtxDomainDestroy(global);
- for (unsigned int sid = 0; sid < stream.size(); ++sid) {
- nvtxDomainDestroy(stream[sid]);
- }
- }
-
- void allocate_streams(unsigned int streams) {
- stream.resize(streams);
- for (unsigned int sid = 0; sid < streams; ++sid) {
- stream[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str());
- }
- }
- };
-
- // allow access to concurrentStreams_
- friend struct Domains;
-
- tbb::enumerable_thread_specific<Domains> domains_;
-
- nvtxDomainHandle_t global_domain() { return domains_.local().global; }
-
- nvtxDomainHandle_t stream_domain(unsigned int sid) { return domains_.local().stream.at(sid); }
+ nvtxDomainHandle_t global_domain_; // NVTX domain for global EDM transitions
+ std::vector<nvtxDomainHandle_t> stream_domain_; // NVTX domains for per-EDM-stream transitions
};
NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::ActivityRegistry& registry)
 : highlightModules_(config.getUntrackedParameter<std::vector<std::string>>("highlightModules")),
 showModulePrefetching_(config.getUntrackedParameter<bool>("showModulePrefetching")),
- skipFirstEvent_(config.getUntrackedParameter<bool>("skipFirstEvent")),
- concurrentStreams_(0),
- domains_(this) {
+ skipFirstEvent_(config.getUntrackedParameter<bool>("skipFirstEvent")) {
+ // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
+ edm::Service<CUDAService> cudaService;
+
std::sort(highlightModules_.begin(), highlightModules_.end());
+ // create the NVTX domain for global EDM transitions
+ global_domain_ = nvtxDomainCreate("EDM Global");
+
// enables profile collection; if profiling is already enabled it has no effect
if (not skipFirstEvent_) {
cudaProfilerStart();
@@ -491,7 +464,13 @@ NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::Activ
registry.watchPostEventReadFromSource(this, &NVProfilerService::postEventReadFromSource);
}
-NVProfilerService::~NVProfilerService() { cudaProfilerStop(); }
+NVProfilerService::~NVProfilerService() {
+ for (unsigned int sid = 0; sid < stream_domain_.size(); ++sid) {
+ nvtxDomainDestroy(stream_domain_[sid]);
+ }
+ nvtxDomainDestroy(global_domain_);
+ cudaProfilerStop();
+}
void NVProfilerService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
edm::ParameterSetDescription desc;
@@ -517,17 +496,20 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) {
out << "preallocate: " << bounds.maxNumberOfConcurrentRuns() << " concurrent runs, "
<< bounds.maxNumberOfConcurrentLuminosityBlocks() << " luminosity sections, " << bounds.maxNumberOfStreams()
<< " streams\nrunning on" << bounds.maxNumberOfThreads() << " threads";
- nvtxDomainMark(global_domain(), out.str().c_str());
+ nvtxDomainMark(global_domain_, out.str().c_str());
- concurrentStreams_ = bounds.maxNumberOfStreams();
- for (auto& domain : domains_) {
- domain.allocate_streams(concurrentStreams_);
+ auto concurrentStreams = bounds.maxNumberOfStreams();
+ // create the NVTX domains for per-EDM-stream transitions
+ stream_domain_.resize(concurrentStreams);
+ for (unsigned int sid = 0; sid < concurrentStreams; ++sid) {
+ stream_domain_[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str());
}
- event_.resize(concurrentStreams_);
- stream_modules_.resize(concurrentStreams_);
+
+ event_.resize(concurrentStreams);
+ stream_modules_.resize(concurrentStreams);
if (skipFirstEvent_) {
globalFirstEventDone_ = false;
- std::vector<std::atomic<bool>> tmp(concurrentStreams_);
+ std::vector<std::atomic<bool>> tmp(concurrentStreams);
for (auto& element : tmp)
std::atomic_init(&element, false);
streamFirstEventDone_ = std::move(tmp);
@@ -536,86 +518,86 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) {
void NVProfilerService::preBeginJob(edm::PathsAndConsumesOfModulesBase const& pathsAndConsumes,
edm::ProcessContext const& pc) {
- nvtxDomainMark(global_domain(), "preBeginJob");
+ nvtxDomainMark(global_domain_, "preBeginJob");
// FIXME this probably works only in the absence of subprocesses
// size() + 1 because pathsAndConsumes.allModules() does not include the source
unsigned int modules = pathsAndConsumes.allModules().size() + 1;
global_modules_.resize(modules, nvtxInvalidRangeId);
- for (unsigned int sid = 0; sid < concurrentStreams_; ++sid) {
+ for (unsigned int sid = 0; sid < stream_modules_.size(); ++sid) {
stream_modules_[sid].resize(modules, nvtxInvalidRangeId);
}
}
void NVProfilerService::postBeginJob() {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainMark(global_domain(), "postBeginJob");
+ nvtxDomainMark(global_domain_, "postBeginJob");
}
}
void NVProfilerService::postEndJob() {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainMark(global_domain(), "postEndJob");
+ nvtxDomainMark(global_domain_, "postEndJob");
}
}
void NVProfilerService::preSourceEvent(edm::StreamID sid) {
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePush(stream_domain(sid), "source");
+ nvtxDomainRangePush(stream_domain_[sid], "source");
}
}
void NVProfilerService::postSourceEvent(edm::StreamID sid) {
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preSourceLumi(edm::LuminosityBlockIndex index) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "source lumi");
+ nvtxDomainRangePush(global_domain_, "source lumi");
}
}
void NVProfilerService::postSourceLumi(edm::LuminosityBlockIndex index) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preSourceRun(edm::RunIndex index) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "source run");
+ nvtxDomainRangePush(global_domain_, "source run");
}
}
void NVProfilerService::postSourceRun(edm::RunIndex index) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preOpenFile(std::string const& lfn, bool) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), ("open file "s + lfn).c_str());
+ nvtxDomainRangePush(global_domain_, ("open file "s + lfn).c_str());
}
}
void NVProfilerService::postOpenFile(std::string const& lfn, bool) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preCloseFile(std::string const& lfn, bool) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), ("close file "s + lfn).c_str());
+ nvtxDomainRangePush(global_domain_, ("close file "s + lfn).c_str());
}
}
void NVProfilerService::postCloseFile(std::string const& lfn, bool) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
@@ -626,7 +608,7 @@ void NVProfilerService::preModuleBeginStream(edm::StreamContext const& sc, edm::
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " begin stream";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -634,7 +616,7 @@ void NVProfilerService::postModuleBeginStream(edm::StreamContext const& sc, edm:
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -646,7 +628,7 @@ void NVProfilerService::preModuleEndStream(edm::StreamContext const& sc, edm::Mo
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " end stream";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -654,124 +636,124 @@ void NVProfilerService::postModuleEndStream(edm::StreamContext const& sc, edm::M
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
void NVProfilerService::preGlobalBeginRun(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "global begin run");
+ nvtxDomainRangePush(global_domain_, "global begin run");
}
}
void NVProfilerService::postGlobalBeginRun(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preGlobalEndRun(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "global end run");
+ nvtxDomainRangePush(global_domain_, "global end run");
}
}
void NVProfilerService::postGlobalEndRun(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preStreamBeginRun(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePush(stream_domain(sid), "stream begin run");
+ nvtxDomainRangePush(stream_domain_[sid], "stream begin run");
}
}
void NVProfilerService::postStreamBeginRun(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preStreamEndRun(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePush(stream_domain(sid), "stream end run");
+ nvtxDomainRangePush(stream_domain_[sid], "stream end run");
}
}
void NVProfilerService::postStreamEndRun(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preGlobalBeginLumi(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "global begin lumi");
+ nvtxDomainRangePush(global_domain_, "global begin lumi");
}
}
void NVProfilerService::postGlobalBeginLumi(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preGlobalEndLumi(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "global end lumi");
+ nvtxDomainRangePush(global_domain_, "global end lumi");
}
}
void NVProfilerService::postGlobalEndLumi(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preStreamBeginLumi(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePush(stream_domain(sid), "stream begin lumi");
+ nvtxDomainRangePush(stream_domain_[sid], "stream begin lumi");
}
}
void NVProfilerService::postStreamBeginLumi(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preStreamEndLumi(edm::StreamContext const& sc) {
auto sid = sc.streamID();
- nvtxDomainRangePush(stream_domain(sid), "stream end lumi");
+ nvtxDomainRangePush(stream_domain_[sid], "stream end lumi");
}
void NVProfilerService::postStreamEndLumi(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preEvent(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- event_[sid] = nvtxDomainRangeStartColor(stream_domain(sid), "event", nvtxDarkGreen);
+ event_[sid] = nvtxDomainRangeStartColor(stream_domain_[sid], "event", nvtxDarkGreen);
}
}
void NVProfilerService::postEvent(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangeEnd(stream_domain(sid), event_[sid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], event_[sid]);
event_[sid] = nvtxInvalidRangeId;
} else {
streamFirstEventDone_[sid] = true;
@@ -787,7 +769,7 @@ void NVProfilerService::postEvent(edm::StreamContext const& sc) {
void NVProfilerService::prePathEvent(edm::StreamContext const& sc, edm::PathContext const& pc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainMark(global_domain(), ("before path "s + pc.pathName()).c_str());
+ nvtxDomainMark(global_domain_, ("before path "s + pc.pathName()).c_str());
}
}
@@ -796,7 +778,7 @@ void NVProfilerService::postPathEvent(edm::StreamContext const& sc,
edm::HLTPathStatus const& hlts) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainMark(global_domain(), ("after path "s + pc.pathName()).c_str());
+ nvtxDomainMark(global_domain_, ("after path "s + pc.pathName()).c_str());
}
}
@@ -807,7 +789,7 @@ void NVProfilerService::preModuleEventPrefetching(edm::StreamContext const& sc,
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " prefetching";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label));
}
}
@@ -815,7 +797,7 @@ void NVProfilerService::postModuleEventPrefetching(edm::StreamContext const& sc,
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -826,14 +808,14 @@ void NVProfilerService::preModuleConstruction(edm::ModuleDescription const& desc
global_modules_.grow_to_at_least(mid + 1);
auto const& label = desc.moduleLabel();
auto const& msg = label + " construction";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleConstruction(edm::ModuleDescription const& desc) {
if (not skipFirstEvent_) {
auto mid = desc.id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -843,14 +825,14 @@ void NVProfilerService::preModuleBeginJob(edm::ModuleDescription const& desc) {
auto mid = desc.id();
auto const& label = desc.moduleLabel();
auto const& msg = label + " begin job";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleBeginJob(edm::ModuleDescription const& desc) {
if (not skipFirstEvent_) {
auto mid = desc.id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -860,14 +842,14 @@ void NVProfilerService::preModuleEndJob(edm::ModuleDescription const& desc) {
auto mid = desc.id();
auto const& label = desc.moduleLabel();
auto const& msg = label + " end job";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleEndJob(edm::ModuleDescription const& desc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = desc.id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -879,7 +861,7 @@ void NVProfilerService::preModuleEventAcquire(edm::StreamContext const& sc, edm:
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " acquire";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -887,7 +869,7 @@ void NVProfilerService::postModuleEventAcquire(edm::StreamContext const& sc, edm
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -898,7 +880,7 @@ void NVProfilerService::preModuleEvent(edm::StreamContext const& sc, edm::Module
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColor(label));
}
}
@@ -906,7 +888,7 @@ void NVProfilerService::postModuleEvent(edm::StreamContext const& sc, edm::Modul
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -919,7 +901,7 @@ void NVProfilerService::preModuleEventDelayedGet(edm::StreamContext const& sc, e
auto const & label = mcc.moduleDescription()->moduleLabel();
auto const & msg = label + " delayed get";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColorLight(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColorLight(label));
}
*/
}
@@ -929,7 +911,7 @@ void NVProfilerService::postModuleEventDelayedGet(edm::StreamContext const& sc,
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
*/
@@ -943,7 +925,7 @@ void NVProfilerService::preEventReadFromSource(edm::StreamContext const& sc, edm
auto const & label = mcc.moduleDescription()->moduleLabel();
auto const & msg = label + " read from source";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label));
}
*/
}
@@ -953,7 +935,7 @@ void NVProfilerService::postEventReadFromSource(edm::StreamContext const& sc, ed
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
*/
@@ -966,7 +948,7 @@ void NVProfilerService::preModuleStreamBeginRun(edm::StreamContext const& sc, ed
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " stream begin run";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -974,7 +956,7 @@ void NVProfilerService::postModuleStreamBeginRun(edm::StreamContext const& sc, e
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -986,7 +968,7 @@ void NVProfilerService::preModuleStreamEndRun(edm::StreamContext const& sc, edm:
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " stream end run";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -994,7 +976,7 @@ void NVProfilerService::postModuleStreamEndRun(edm::StreamContext const& sc, edm
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -1006,7 +988,7 @@ void NVProfilerService::preModuleStreamBeginLumi(edm::StreamContext const& sc, e
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " stream begin lumi";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -1014,7 +996,7 @@ void NVProfilerService::postModuleStreamBeginLumi(edm::StreamContext const& sc,
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -1026,7 +1008,7 @@ void NVProfilerService::preModuleStreamEndLumi(edm::StreamContext const& sc, edm
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " stream end lumi";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -1034,7 +1016,7 @@ void NVProfilerService::postModuleStreamEndLumi(edm::StreamContext const& sc, ed
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -1044,14 +1026,14 @@ void NVProfilerService::preModuleGlobalBeginRun(edm::GlobalContext const& gc, ed
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " global begin run";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleGlobalBeginRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -1061,14 +1043,14 @@ void NVProfilerService::preModuleGlobalEndRun(edm::GlobalContext const& gc, edm:
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " global end run";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleGlobalEndRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -1078,14 +1060,14 @@ void NVProfilerService::preModuleGlobalBeginLumi(edm::GlobalContext const& gc, e
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " global begin lumi";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleGlobalBeginLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -1095,14 +1077,14 @@ void NVProfilerService::preModuleGlobalEndLumi(edm::GlobalContext const& gc, edm
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " global end lumi";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleGlobalEndLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -1113,14 +1095,14 @@ void NVProfilerService::preSourceConstruction(edm::ModuleDescription const& desc
global_modules_.grow_to_at_least(mid + 1);
auto const& label = desc.moduleLabel();
auto const& msg = label + " construction";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postSourceConstruction(edm::ModuleDescription const& desc) {
if (not skipFirstEvent_) {
auto mid = desc.id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
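For reference, the calls renamed in the hunks above follow the standard NVTX domain/range pattern. Below is a minimal, self-contained sketch using the plain NVTX C API; the service itself goes through its own thin wrappers and member domains, so the domain name, label and colour here are purely illustrative:

    #include <nvToolsExt.h>

    void profiledRegion() {
      // create a named domain once; ranges pushed on it are grouped together in the profiler
      nvtxDomainHandle_t domain = nvtxDomainCreateA("ExampleDomain");

      // describe the range: an ASCII label plus an ARGB colour
      nvtxEventAttributes_t attr = {};
      attr.version = NVTX_VERSION;
      attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
      attr.message.ascii = "example range";
      attr.colorType = NVTX_COLOR_ARGB;
      attr.color = 0xff00ff00;  // opaque green

      nvtxDomainRangePushEx(domain, &attr);  // open the range
      // ... region of interest ...
      nvtxDomainRangePop(domain);            // close the range

      nvtxDomainDestroy(domain);
    }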
diff --git a/HeterogeneousCore/CUDAServices/plugins/plugins.cc b/HeterogeneousCore/CUDAServices/plugins/plugins.cc
new file mode 100644
index 0000000000000..120ca10cb13ce
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/plugins/plugins.cc
@@ -0,0 +1,4 @@
+#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+DEFINE_FWK_SERVICE_MAKER(CUDAService, edm::serviceregistry::ParameterSetMaker<CUDAService>);
diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh
new file mode 100644
index 0000000000000..bde3e26382976
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh
@@ -0,0 +1,10 @@
+#! /bin/bash -e
+
+# move to the .../src directory
+cd $CMSSW_BASE/src/
+
+# check out all packages containing .cu files
+git ls-files --full-name | grep '.*\.cu$' | cut -d/ -f-2 | sort -u | xargs git cms-addpkg
+
+# rebuild all checked out packages
+scram b -j
diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
new file mode 100644
index 0000000000000..f3335f4cd409f
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
@@ -0,0 +1,19 @@
+#! /bin/bash
+TOOL=$CMSSW_BASE/config/toolbox/$SCRAM_ARCH/tools/selected/cuda.xml
+
+# enumerate the supported streaming multiprocessor (sm) compute capabilities
+DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u)
+CAPS=$(echo $DOTS | sed -e's#\.*##g')
+
+# remove existing capabilities
+sed -i $TOOL -e'\##d'
+
+# add support for the capabilities found on this machine
+for CAP in $CAPS; do
+ sed -i $TOOL -e"\##a\ "
+done
+
+# reconfigure the cuda.xml tool
+scram setup cuda
+
+echo "SCRAM configured to support CUDA streaming multiprocessor architectures $DOTS"
diff --git a/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py
new file mode 100644
index 0000000000000..331ddd30f73bd
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import re
+import sys
+import argparse
+
+def main(opts):
+ device = []
+ host = []
+
+    device_re = re.compile("Device.*allocated new device block.*\((?P<bytes>\d+) bytes")
+    host_re = re.compile("Host.*allocated new host block.*\((?P<bytes>\d+) bytes")
+
+ f = open(opts.file)
+ for line in f:
+ m = device_re.search(line)
+ if m:
+ device.append(m.group("bytes"))
+ continue
+ m = host_re.search(line)
+ if m:
+ host.append(m.group("bytes"))
+ f.close()
+
+ print("process.CUDAService.allocator.devicePreallocate = cms.untracked.vuint32(%s)" % ",".join(device))
+ print("process.CUDAService.allocator.hostPreallocate = cms.untracked.vuint32(%s)" % ",".join(host))
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="""Extract CUDAService preallocation parameters from a log file.
+
+To use, run the job once with "process.CUDAService.allocator.debug =
+True" and direct the output to a file. Then run this script by passing
+the file as an argument, and copy the output of this script back to
+the configuration file.""")
+ parser.add_argument("file", type=str, help="Log file to parse")
+ opts = parser.parse_args()
+ main(opts)
diff --git a/HeterogeneousCore/CUDAServices/scripts/nvprof-remote b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote
new file mode 100644
index 0000000000000..3b010c005291f
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote
@@ -0,0 +1,23 @@
+#! /bin/bash
+
+# find the CMSSW release
+if [ -z "$CMSSW_BASE" ]; then
+ export CMSSW_BASE=$(readlink -f $(dirname $0)/../..)
+fi
+
+# load the CMS environment
+source $(< "$CMSSW_BASE"/config/scram_basedir)/cmsset_default.sh
+
+# load the CMSSW release environment
+eval `cd "$CMSSW_BASE"; scram runtime -sh 2> /dev/null`
+
+# log the commands being run
+{
+ date
+ echo "cwd: $PWD"
+ echo "cmd: $0 $@"
+ echo
+} >> $CMSSW_BASE/tmp/nvprof.log
+
+# run the CUDA profiler
+nvprof "$@"
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
new file mode 100644
index 0000000000000..5d1bd30264186
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -0,0 +1,389 @@
+#include
+#include
+#include
+
+#include
+
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h"
+
+void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
+ // read the current device
+ int device;
+ cudaCheck(cudaGetDevice(&device));
+ // try to set the requested limit
+ auto result = cudaDeviceSetLimit(limit, request);
+ if (cudaErrorUnsupportedLimit == result) {
+ edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
+ return;
+ }
+ // read back the limit value
+ size_t value;
+ cudaCheck(cudaDeviceGetLimit(&value, limit));
+ if (cudaSuccess != result) {
+ edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
+ << request << ", current value is " << value;
+ } else if (value != request) {
+ edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
+ << " instead of requested " << request;
+ }
+}
+
+constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
+ switch (major * 10 + minor) {
+ // Fermi architecture
+ case 20: // SM 2.0: GF100 class
+ return 32;
+ case 21: // SM 2.1: GF10x class
+ return 48;
+
+ // Kepler architecture
+ case 30: // SM 3.0: GK10x class
+ case 32: // SM 3.2: GK10x class
+ case 35: // SM 3.5: GK11x class
+ case 37: // SM 3.7: GK21x class
+ return 192;
+
+ // Maxwell architecture
+ case 50: // SM 5.0: GM10x class
+ case 52: // SM 5.2: GM20x class
+ case 53: // SM 5.3: GM20x class
+ return 128;
+
+ // Pascal architecture
+ case 60: // SM 6.0: GP100 class
+ return 64;
+ case 61: // SM 6.1: GP10x class
+ case 62: // SM 6.2: GP10x class
+ return 128;
+
+ // Volta architecture
+ case 70: // SM 7.0: GV100 class
+ case 72: // SM 7.2: GV11b class
+ return 64;
+
+ // Turing architecture
+ case 75: // SM 7.5: TU10x class
+ return 64;
+
+ // unknown architecture, return a default value
+ default:
+ return 64;
+ }
+}
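+// Illustration only (not part of the original file): the CUDA core count printed further down
+// is multiProcessorCount times this per-SM figure. For a hypothetical sm_75 device with
+// 40 multiprocessors:
+//   constexpr unsigned int cores = 40 * getCudaCoresPerSM(7, 5);  // 40 * 64 = 2560
+//   static_assert(cores == 2560);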
+
+namespace {
+  template <template <typename> typename UniquePtr, typename Allocate>
+  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
+ if (bufferSizes.empty())
+ return;
+
+ auto streamPtr = cms::cuda::getStreamCache().get();
+
+    std::vector<UniquePtr<char[]> > buffers;
+ buffers.reserve(bufferSizes.size());
+ for (auto size : bufferSizes) {
+ buffers.push_back(allocate(size, streamPtr.get()));
+ }
+ }
+
+  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
+ int device;
+ cudaCheck(cudaGetDevice(&device));
+ for (int i = 0; i < numberOfDevices; ++i) {
+ cudaCheck(cudaSetDevice(i));
+      preallocate<cms::cuda::device::unique_ptr>(
+          [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
+ bufferSizes);
+ }
+ cudaCheck(cudaSetDevice(device));
+ }
+
+  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
+    preallocate<cms::cuda::host::unique_ptr>(
+        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
+ bufferSizes);
+ }
+} // namespace
+
+/// Constructor
+CUDAService::CUDAService(edm::ParameterSet const& config) {
+  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
+ if (not configEnabled) {
+ edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
+ return;
+ }
+
+ auto status = cudaGetDeviceCount(&numberOfDevices_);
+ if (cudaSuccess != status) {
+ edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
+ << "Disabling the CUDAService.";
+ return;
+ }
+ edm::LogInfo log("CUDAService");
+ computeCapabilities_.reserve(numberOfDevices_);
+ log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
+
+  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
+  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
+  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
+  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
+  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
+  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
+
+ for (int i = 0; i < numberOfDevices_; ++i) {
+ // read information about the compute device.
+ // see the documentation of cudaGetDeviceProperties() for more information.
+ cudaDeviceProp properties;
+ cudaCheck(cudaGetDeviceProperties(&properties, i));
+ log << "CUDA device " << i << ": " << properties.name << '\n';
+
+ // compute capabilities
+ log << " compute capability: " << properties.major << "." << properties.minor << " (sm_"
+ << properties.major << properties.minor << ")\n";
+ computeCapabilities_.emplace_back(properties.major, properties.minor);
+ log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
+ log << " CUDA cores: " << std::setw(28)
+ << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
+ log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";
+
+ // compute mode
+ static constexpr const char* computeModeDescription[] = {
+ "default (shared)", // cudaComputeModeDefault
+ "exclusive (single thread)", // cudaComputeModeExclusive
+ "prohibited", // cudaComputeModeProhibited
+ "exclusive (single process)", // cudaComputeModeExclusiveProcess
+ "unknown"};
+ log << " compute mode:" << std::right << std::setw(27)
+ << computeModeDescription[std::min(properties.computeMode,
+                                           static_cast<int>(std::size(computeModeDescription)) - 1)]
+ << '\n';
+
+ // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
+ cudaCheck(cudaSetDevice(i));
+ cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
+
+ // read the free and total amount of memory available for allocation by the device, in bytes.
+ // see the documentation of cudaMemGetInfo() for more information.
+ size_t freeMemory, totalMemory;
+ cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
+ log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
+ << totalMemory / (1 << 20) << " MB total\n";
+ log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
+ log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
+
+ // L1 cache behaviour
+ static constexpr const char* l1CacheModeDescription[] = {
+ "unknown", "local memory", "global memory", "local and global memory"};
+ int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
+ log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
+ log << '\n';
+
+ log << "Other capabilities\n";
+ log << " " << (properties.canMapHostMemory ? "can" : "cannot")
+ << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
+ log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
+ << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
+ log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
+ << " access pageable memory via the host's page tables\n";
+ log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
+ << " access host registered memory at the same virtual address as the host\n";
+ log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
+ << " a unified address space with the host\n";
+ log << " " << (properties.managedMemory ? "supports" : "does not support")
+ << " allocating managed memory on this system\n";
+ log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
+ << " coherently access managed memory concurrently with the host\n";
+ log << " "
+ << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
+ << " directly access managed memory on the device without migration\n";
+ log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
+ << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
+ log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
+ << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
+ log << '\n';
+
+ // set and read the CUDA device flags.
+ // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
+ log << "CUDA flags\n";
+ unsigned int flags;
+ cudaCheck(cudaGetDeviceFlags(&flags));
+ switch (flags & cudaDeviceScheduleMask) {
+ case cudaDeviceScheduleAuto:
+ log << " thread policy: default\n";
+ break;
+ case cudaDeviceScheduleSpin:
+ log << " thread policy: spin\n";
+ break;
+ case cudaDeviceScheduleYield:
+ log << " thread policy: yield\n";
+ break;
+ case cudaDeviceScheduleBlockingSync:
+ log << " thread policy: blocking sync\n";
+ break;
+ default:
+ log << " thread policy: undefined\n";
+ }
+ if (flags & cudaDeviceMapHost) {
+ log << " pinned host memory allocations: enabled\n";
+ } else {
+ log << " pinned host memory allocations: disabled\n";
+ }
+ if (flags & cudaDeviceLmemResizeToMax) {
+ log << " kernel host memory reuse: enabled\n";
+ } else {
+ log << " kernel host memory reuse: disabled\n";
+ }
+ log << '\n';
+
+ // set and read the CUDA resource limits.
+ // see the documentation of cudaDeviceSetLimit() for more information.
+
+ // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
+ // printf() device system call.
+ if (printfFifoSize >= 0) {
+ setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
+ }
+ // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
+ if (stackSize >= 0) {
+ setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
+ }
+ // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
+ // and free() device system calls.
+ if (mallocHeapSize >= 0) {
+ setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
+ }
+ if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
+ // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
+ // a thread can safely call cudaDeviceSynchronize().
+ if (devRuntimeSyncDepth >= 0) {
+ setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
+ }
+ // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
+ // device runtime launches that can be made from the current device.
+ if (devRuntimePendingLaunchCount >= 0) {
+ setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
+ "cudaLimitDevRuntimePendingLaunchCount",
+ devRuntimePendingLaunchCount);
+ }
+ }
+
+ size_t value;
+ log << "CUDA limits\n";
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
+ log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
+ log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
+ log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
+ if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
+ log << " runtime sync depth: " << std::setw(10) << value << '\n';
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
+ log << " runtime pending launch count: " << std::setw(10) << value << '\n';
+ }
+ log << '\n';
+ }
+ log << "\n";
+
+ // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
+ if constexpr (cms::cuda::allocator::useCaching) {
+ cms::cuda::allocator::getCachingDeviceAllocator();
+ cms::cuda::allocator::getCachingHostAllocator();
+ }
+ cms::cuda::getEventCache().clear();
+ cms::cuda::getStreamCache().clear();
+
+ log << "CUDAService fully initialized";
+ enabled_ = true;
+
+ // Preallocate buffers if asked to
+  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
+  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
+  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
+}
+
+CUDAService::~CUDAService() {
+ if (enabled_) {
+ // Explicitly destruct the allocator before the device resets below
+ if constexpr (cms::cuda::allocator::useCaching) {
+ cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
+ cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
+ }
+ cms::cuda::getEventCache().clear();
+ cms::cuda::getStreamCache().clear();
+
+ for (int i = 0; i < numberOfDevices_; ++i) {
+ cudaCheck(cudaSetDevice(i));
+ cudaCheck(cudaDeviceSynchronize());
+ // Explicitly destroys and cleans up all resources associated with the current device in the
+ // current process. Any subsequent API call to this device will reinitialize the device.
+ // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
+ cudaDeviceReset();
+ }
+ }
+}
+
+void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+  desc.addUntracked<bool>("enabled", true);
+
+ edm::ParameterSetDescription limits;
+  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
+      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
+  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
+  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
+      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
+  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
+      ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
+  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
+      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
+  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
+ ->setComment(
+ "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
+ "the default value.");
+
+ edm::ParameterSetDescription allocator;
+  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
+      ->setComment("Preallocates buffers of given bytes on all devices");
+  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
+      ->setComment("Preallocates buffers of given bytes on the host");
+  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
+
+ descriptions.add("CUDAService", desc);
+}
+
+int CUDAService::deviceWithMostFreeMemory() const {
+ // save the current device
+ int currentDevice;
+  cudaCheck(cudaGetDevice(&currentDevice));
+
+ size_t maxFreeMemory = 0;
+ int device = -1;
+ for (int i = 0; i < numberOfDevices_; ++i) {
+ size_t freeMemory, totalMemory;
+ cudaSetDevice(i);
+ cudaMemGetInfo(&freeMemory, &totalMemory);
+ edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
+ << totalMemory / (1 << 20) << " MB total memory";
+ if (freeMemory > maxFreeMemory) {
+ maxFreeMemory = freeMemory;
+ device = i;
+ }
+ }
+ // restore the current device
+ cudaCheck(cudaSetDevice(currentDevice));
+ return device;
+}
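Downstream code is expected to reach the service through the framework's service registry, as the small helper in the next file does. A sketch of the typical pattern (the chooseGpuDevice() name is hypothetical; it only relies on the CUDAService interface exercised elsewhere in this PR):

    #include "FWCore/ServiceRegistry/interface/Service.h"
    #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

    bool chooseGpuDevice(int& device) {
      edm::Service<CUDAService> cs;
      // fall back to the CPU code path if the service is disabled or no device is available
      if (not cs->enabled() or cs->numberOfDevices() == 0) {
        return false;
      }
      // prefer the device with the largest amount of free memory
      device = cs->deviceWithMostFreeMemory();
      return device >= 0;
    }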
diff --git a/HeterogeneousCore/CUDAServices/src/numberOfCUDADevices.cc b/HeterogeneousCore/CUDAServices/src/numberOfCUDADevices.cc
new file mode 100644
index 0000000000000..a0bbc503c3451
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/src/numberOfCUDADevices.cc
@@ -0,0 +1,8 @@
+#include "HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+
+int numberOfCUDADevices() {
+  edm::Service<CUDAService> cs;
+ return cs->enabled() ? cs->numberOfDevices() : 0;
+}
diff --git a/HeterogeneousCore/CUDAServices/test/BuildFile.xml b/HeterogeneousCore/CUDAServices/test/BuildFile.xml
new file mode 100644
index 0000000000000..020b3736617c5
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/test/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
new file mode 100644
index 0000000000000..265703ccf5903
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
@@ -0,0 +1,119 @@
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "catch.hpp"
+
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/Utilities/interface/Exception.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+namespace {
+ CUDAService makeCUDAService(edm::ParameterSet ps) {
+ auto desc = edm::ConfigurationDescriptions("Service", "CUDAService");
+ CUDAService::fillDescriptions(desc);
+ desc.validate(ps, "CUDAService");
+ return CUDAService(ps);
+ }
+} // namespace
+
+TEST_CASE("Tests of CUDAService", "[CUDAService]") {
+ // Test setup: check if a simple CUDA runtime API call fails:
+ // if so, skip the test with the CUDAService enabled
+ int deviceCount = 0;
+ auto ret = cudaGetDeviceCount(&deviceCount);
+
+ if (ret != cudaSuccess) {
+ WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
+ << ret << ") " << cudaGetErrorString(ret) << ". Running only tests not requiring devices.");
+ }
+
+ SECTION("CUDAService enabled") {
+ edm::ParameterSet ps;
+    ps.addUntrackedParameter<bool>("enabled", true);
+ SECTION("Enabled only if there are CUDA capable GPUs") {
+ auto cs = makeCUDAService(ps);
+ if (deviceCount <= 0) {
+ REQUIRE(cs.enabled() == false);
+ WARN("CUDAService is disabled as there are no CUDA GPU devices");
+ } else {
+ REQUIRE(cs.enabled() == true);
+ INFO("CUDAService is enabled");
+ }
+ }
+
+ if (deviceCount <= 0) {
+ return;
+ }
+
+ auto cs = makeCUDAService(ps);
+
+ SECTION("CUDA Queries") {
+ int driverVersion = 0, runtimeVersion = 0;
+ ret = cudaDriverGetVersion(&driverVersion);
+ if (ret != cudaSuccess) {
+ FAIL("Unable to query the CUDA driver version from the CUDA runtime API: (" << ret << ") "
+ << cudaGetErrorString(ret));
+ }
+ ret = cudaRuntimeGetVersion(&runtimeVersion);
+ if (ret != cudaSuccess) {
+ FAIL("Unable to query the CUDA runtime API version: (" << ret << ") " << cudaGetErrorString(ret));
+ }
+
+ WARN("CUDA Driver Version / Runtime Version: " << driverVersion / 1000 << "." << (driverVersion % 100) / 10
+ << " / " << runtimeVersion / 1000 << "."
+ << (runtimeVersion % 100) / 10);
+
+ // Test that the number of devices found by the service
+ // is the same as detected by the CUDA runtime API
+ REQUIRE(cs.numberOfDevices() == deviceCount);
+ WARN("Detected " << cs.numberOfDevices() << " CUDA Capable device(s)");
+
+ // Test that the compute capabilities of each device
+ // are the same as detected by the CUDA runtime API
+ for (int i = 0; i < deviceCount; ++i) {
+ cudaDeviceProp deviceProp;
+ ret = cudaGetDeviceProperties(&deviceProp, i);
+ if (ret != cudaSuccess) {
+ FAIL("Unable to query the CUDA properties for device " << i << " from the CUDA runtime API: (" << ret << ") "
+ << cudaGetErrorString(ret));
+ }
+
+ REQUIRE(deviceProp.major == cs.computeCapability(i).first);
+ REQUIRE(deviceProp.minor == cs.computeCapability(i).second);
+ INFO("Device " << i << ": " << deviceProp.name << "\n CUDA Capability Major/Minor version number: "
+ << deviceProp.major << "." << deviceProp.minor);
+ }
+ }
+
+ SECTION("CUDAService device free memory") {
+ size_t mem = 0;
+ int dev = -1;
+ for (int i = 0; i < deviceCount; ++i) {
+ size_t free, tot;
+ cudaSetDevice(i);
+ cudaMemGetInfo(&free, &tot);
+ WARN("Device " << i << " memory total " << tot << " free " << free);
+ if (free > mem) {
+ mem = free;
+ dev = i;
+ }
+ }
+ WARN("Device with most free memory " << dev << "\n"
+ << " as given by CUDAService " << cs.deviceWithMostFreeMemory());
+ }
+ }
+
+ SECTION("Force to be disabled") {
+ edm::ParameterSet ps;
+    ps.addUntrackedParameter<bool>("enabled", false);
+ auto cs = makeCUDAService(ps);
+ REQUIRE(cs.enabled() == false);
+ REQUIRE(cs.numberOfDevices() == 0);
+ }
+}
diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.py b/HeterogeneousCore/CUDAServices/test/testCUDAService.py
new file mode 100644
index 0000000000000..06edefcac2b66
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.py
@@ -0,0 +1,13 @@
+import FWCore.ParameterSet.Config as cms
+
+process = cms.Process( "TEST" )
+
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
+process.MessageLogger.categories.append("CUDAService")
+
+process.source = cms.Source("EmptySource")
+
+process.maxEvents = cms.untracked.PSet(
+ input = cms.untracked.int32( 0 )
+)
diff --git a/HeterogeneousCore/CUDAServices/test/test_main.cpp b/HeterogeneousCore/CUDAServices/test/test_main.cpp
new file mode 100644
index 0000000000000..0c7c351f437f5
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/test/test_main.cpp
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
diff --git a/HeterogeneousCore/CUDATest/BuildFile.xml b/HeterogeneousCore/CUDATest/BuildFile.xml
new file mode 100644
index 0000000000000..84905be5e2db8
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/BuildFile.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDATest/interface/Thing.h b/HeterogeneousCore/CUDATest/interface/Thing.h
new file mode 100644
index 0000000000000..27dc58e1443f4
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/interface/Thing.h
@@ -0,0 +1,21 @@
+#ifndef HeterogeneousCore_CUDATest_Thing_H
+#define HeterogeneousCore_CUDATest_Thing_H
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+namespace cms {
+ namespace cudatest {
+ class Thing {
+ public:
+ Thing() = default;
+      explicit Thing(cms::cuda::device::unique_ptr<float[]> ptr) : ptr_(std::move(ptr)) {}
+
+ const float *get() const { return ptr_.get(); }
+
+ private:
+      cms::cuda::device::unique_ptr<float[]> ptr_;
+ };
+ } // namespace cudatest
+} // namespace cms
+
+#endif
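As a rough illustration of how a producer could fill this wrapper (the real test producers appear later in this PR; the buffer size and the makeThing() helper are made up for the example):

    #include <cuda_runtime.h>

    #include "HeterogeneousCore/CUDATest/interface/Thing.h"
    #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

    cms::cudatest::Thing makeThing(cudaStream_t stream) {
      constexpr int numValues = 4000;  // made-up size
      // allocate a device buffer associated with the given stream
      auto buffer = cms::cuda::make_device_unique<float[]>(numValues, stream);
      // ... launch a kernel on `stream` to fill the buffer ...
      // hand ownership of the buffer to the data format object
      return cms::cudatest::Thing(std::move(buffer));
    }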
diff --git a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml
new file mode 100644
index 0000000000000..13f9ef6c06cd2
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc
new file mode 100644
index 0000000000000..2778ed02f3ac6
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc
@@ -0,0 +1,81 @@
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDAnalyzer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDATest/interface/Thing.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+
+#include "TestCUDAAnalyzerGPUKernel.h"
+
+class TestCUDAAnalyzerGPU : public edm::global::EDAnalyzer<> {
+public:
+ explicit TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig);
+ ~TestCUDAAnalyzerGPU() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+ void analyze(edm::StreamID, edm::Event const& iEvent, edm::EventSetup const& iSetup) const override;
+ void endJob() override;
+
+private:
+ std::string const label_;
+  edm::EDGetTokenT<cms::cuda::Product<cms::cudatest::Thing>> const srcToken_;
+ double const minValue_;
+ double const maxValue_;
+ // the public interface is thread safe
+  CMS_THREAD_SAFE mutable std::unique_ptr<TestCUDAAnalyzerGPUKernel> gpuAlgo_;
+};
+
+TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig)
+    : label_(iConfig.getParameter<std::string>("@module_label")),
+      srcToken_(consumes<cms::cuda::Product<cms::cudatest::Thing>>(iConfig.getParameter<edm::InputTag>("src"))),
+      minValue_(iConfig.getParameter<double>("minValue")),
+      maxValue_(iConfig.getParameter<double>("maxValue")) {
+  edm::Service<CUDAService> cs;
+ if (cs->enabled()) {
+ auto streamPtr = cms::cuda::getStreamCache().get();
+    gpuAlgo_ = std::make_unique<TestCUDAAnalyzerGPUKernel>(streamPtr.get());
+ }
+}
+
+void TestCUDAAnalyzerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("src", edm::InputTag())->setComment("Source of cms::cuda::Product<cms::cudatest::Thing>.");
+  desc.add<double>("minValue", -1e308);
+  desc.add<double>("maxValue", 1e308);
+ descriptions.addWithDefaultLabel(desc);
+ descriptions.setComment("This EDAnalyzer is part of the TestCUDAProducer* family. It models a GPU analyzer.");
+}
+
+void TestCUDAAnalyzerGPU::analyze(edm::StreamID, edm::Event const& iEvent, edm::EventSetup const& iSetup) const {
+ edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::analyze begin event "
+ << iEvent.id().event() << " stream " << iEvent.streamID();
+
+ auto const& in = iEvent.get(srcToken_);
+ cms::cuda::ScopedContextAnalyze ctx{in};
+ cms::cudatest::Thing const& input = ctx.get(in);
+ gpuAlgo_->analyzeAsync(input.get(), ctx.stream());
+
+ edm::LogVerbatim("TestCUDAAnalyzerGPU")
+ << label_ << " TestCUDAAnalyzerGPU::analyze end event " << iEvent.id().event() << " stream " << iEvent.streamID();
+}
+
+void TestCUDAAnalyzerGPU::endJob() {
+ edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::endJob begin";
+
+ auto streamPtr = cms::cuda::getStreamCache().get();
+ auto value = gpuAlgo_->value(streamPtr.get());
+ edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " accumulated value " << value;
+ assert(minValue_ <= value && value <= maxValue_);
+
+ edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::endJob end";
+}
+
+DEFINE_FWK_MODULE(TestCUDAAnalyzerGPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu
new file mode 100644
index 0000000000000..2b3951a2b5cfe
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu
@@ -0,0 +1,46 @@
+#include "TestCUDAAnalyzerGPUKernel.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+namespace {
+ __global__ void analyze(const float *input, float *sum, int numElements) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < numElements) {
+ atomicAdd(sum + i, input[i]);
+ }
+ }
+
+ __global__ void sum(const float *input, float *output, int numElements) {
+ float val = 0.f;
+ for (int i = 0; i < numElements; ++i) {
+ val += input[i];
+ }
+ *output = val;
+ }
+} // namespace
+
+TestCUDAAnalyzerGPUKernel::TestCUDAAnalyzerGPUKernel(cudaStream_t stream) {
+  sum_ = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
+ cms::cuda::memsetAsync(sum_, 0, NUM_VALUES, stream);
+ // better to synchronize since there is no guarantee that the stream
+ // of analyzeAsync() would be otherwise synchronized with this one
+ cudaCheck(cudaStreamSynchronize(stream));
+}
+
+void TestCUDAAnalyzerGPUKernel::analyzeAsync(const float *d_input, cudaStream_t stream) {
+ analyze<<>>(d_input, sum_.get(), NUM_VALUES);
+}
+
+float TestCUDAAnalyzerGPUKernel::value(cudaStream_t stream) const {
+  auto accumulator = cms::cuda::make_device_unique<float>(stream);
+  auto h_accumulator = cms::cuda::make_host_unique<float>(stream);
+ sum<<<1, 1, 0, stream>>>(sum_.get(), accumulator.get(), NUM_VALUES);
+ cms::cuda::copyAsync(h_accumulator, accumulator, stream);
+ // need to synchronize
+ cudaCheck(cudaStreamSynchronize(stream));
+ return *h_accumulator;
+}
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h
new file mode 100644
index 0000000000000..a9a6b962c2cc4
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h
@@ -0,0 +1,23 @@
+#ifndef HeterogeneousCore_CUDACore_TestCUDAAnalyzerGPUKernel_h
+#define HeterogeneousCore_CUDACore_TestCUDAAnalyzerGPUKernel_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include
+
+class TestCUDAAnalyzerGPUKernel {
+public:
+ static constexpr int NUM_VALUES = 4000;
+
+ TestCUDAAnalyzerGPUKernel(cudaStream_t stream);
+ ~TestCUDAAnalyzerGPUKernel() = default;
+
+ // thread safe
+ void analyzeAsync(const float* d_input, cudaStream_t stream);
+ float value(cudaStream_t stream) const;
+
+private:
+  cms::cuda::device::unique_ptr<float[]> sum_;  // all writes are atomic in CUDA
+};
+
+#endif
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc
new file mode 100644
index 0000000000000..c25a44023ebc0
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc
@@ -0,0 +1,67 @@
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+
+#include
+#include
+#include
+
+class TestCUDAProducerCPU : public edm::global::EDProducer<> {
+public:
+ explicit TestCUDAProducerCPU(edm::ParameterSet const& iConfig);
+ ~TestCUDAProducerCPU() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+ void produce(edm::StreamID id, edm::Event& iEvent, edm::EventSetup const& iSetup) const override;
+
+private:
+ std::string const label_;
+  edm::EDGetTokenT<int> srcToken_;
+  edm::EDPutTokenT<int> const dstToken_;
+};
+
+TestCUDAProducerCPU::TestCUDAProducerCPU(edm::ParameterSet const& iConfig)
+    : label_{iConfig.getParameter<std::string>("@module_label")}, dstToken_{produces<int>()} {
+  auto srcTag = iConfig.getParameter<edm::InputTag>("src");
+ if (!srcTag.label().empty()) {
+ srcToken_ = consumes