diff --git a/src/cudadev/CUDACore/AcquireContext.cc b/src/cudadev/CUDACore/AcquireContext.cc
new file mode 100644
index 000000000..dd6e8d1de
--- /dev/null
+++ b/src/cudadev/CUDACore/AcquireContext.cc
@@ -0,0 +1,5 @@
+#include "CUDACore/AcquireContext.h"
+
+namespace cms::cuda {
+  void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); }
+}
diff --git a/src/cudadev/CUDACore/AcquireContext.h b/src/cudadev/CUDACore/AcquireContext.h
new file mode 100644
index 000000000..409320abe
--- /dev/null
+++ b/src/cudadev/CUDACore/AcquireContext.h
@@ -0,0 +1,45 @@
+#ifndef HeterogeneousCore_CUDACore_AcquireContext_h
+#define HeterogeneousCore_CUDACore_AcquireContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/TaskContext.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
+   * - setting the current device
+   * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+   * - synchronizing between CUDA streams if necessary
+   * Users should not, however, construct it explicitly.
+   */
+  class AcquireContext : public impl::EDGetterContextBase {
+  public:
+    explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+        : EDGetterContextBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {}
+    ~AcquireContext() = default;
+
+    template <typename F>
+    void pushNextTask(F&& f) {
+      holderHelper_.pushNextTask(std::forward<F>(f));
+    }
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+    }
+
+    // internal API
+    void commit();
+
+  private:
+    impl::FwkContextHolderHelper holderHelper_;
+  };
+
+  template <typename F>
+  void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) {
+    AcquireContext context(streamID, std::move(holder));
+    func(context);
+    context.commit();
+  }
+}
+
+#endif
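For orientation, a minimal sketch (illustration only, not part of the patch) of what an acquire() body looks like against this API, modelled on the PixelTrackSoAFromCUDA migration further down in this diff; the module name, token and data types are placeholders:

    // Hypothetical ExternalWork module: the framework-facing acquire() calls
    // runAcquire(), which builds the AcquireContext, runs this overload, and then
    // commit()s the context, enqueueing the "work done" callback on the CUDA stream.
    void MyProducer::acquire(edm::Event const& iEvent,
                             edm::EventSetup const& iSetup,
                             cms::cuda::AcquireContext& ctx) {
      // get() re-uses, or synchronizes with, the CUDA stream the product was made on
      auto const& input = ctx.get(iEvent, inputToken_);
      // queue asynchronous work on the context's stream; no blocking here
      hostCopy_ = input.toHostAsync(ctx.stream());
    }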
diff --git a/src/cudadev/CUDACore/AnalyzeContext.h b/src/cudadev/CUDACore/AnalyzeContext.h
new file mode 100644
index 000000000..cbe9a66dd
--- /dev/null
+++ b/src/cudadev/CUDACore/AnalyzeContext.h
@@ -0,0 +1,20 @@
+#ifndef HeterogeneousCore_CUDACore_AnalyzeContext_h
+#define HeterogeneousCore_CUDACore_AnalyzeContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in analyze()
+   * - setting the current device
+   * - synchronizing between CUDA streams if necessary
+   * and enforce that those get done in a proper way in RAII fashion.
+   */
+  class AnalyzeContext : public impl::EDGetterContextBase {
+  public:
+    /// Constructor to (possibly) re-use a CUDA stream
+    explicit AnalyzeContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {}
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/ContextState.h b/src/cudadev/CUDACore/ContextState.h
index 3c73054ab..b0109e33d 100644
--- a/src/cudadev/CUDACore/ContextState.h
+++ b/src/cudadev/CUDACore/ContextState.h
@@ -26,6 +26,9 @@ namespace cms {
       friend class ScopedContextAcquire;
       friend class ScopedContextProduce;
       friend class ScopedContextTask;
+      friend class AcquireContext;
+      friend class ProduceContext;
+      friend class TaskContext;
 
       void set(int device, SharedStreamPtr stream) {
         throwIfStream();
diff --git a/src/cudadev/CUDACore/EDGetterContextBase.cc b/src/cudadev/CUDACore/EDGetterContextBase.cc
new file mode 100644
index 000000000..cfeccb178
--- /dev/null
+++ b/src/cudadev/CUDACore/EDGetterContextBase.cc
@@ -0,0 +1,27 @@
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/cudaCheck.h"
+
+namespace cms::cuda::impl {
+  void EDGetterContextBase::synchronizeStreams(int dataDevice,
+                                               cudaStream_t dataStream,
+                                               bool available,
+                                               cudaEvent_t dataEvent) {
+    if (dataDevice != device()) {
+      // Eventually replace with prefetch to current device (assuming unified memory works)
+      // If we won't go to unified memory, need to figure out something else...
+      throw std::runtime_error("Handling data from multiple devices is not yet supported");
+    }
+
+    if (dataStream != stream()) {
+      // Different streams, need to synchronize
+      if (not available) {
+        // Event not yet occurred, so need to add synchronization
+        // here. Sychronization is done by making the CUDA stream to
+        // wait for an event, so all subsequent work in the stream
+        // will run only after the event has "occurred" (i.e. data
+        // product became available).
+        cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
+      }
+    }
+  }
+}
diff --git a/src/cudadev/CUDACore/EDGetterContextBase.h b/src/cudadev/CUDACore/EDGetterContextBase.h
new file mode 100644
index 000000000..d17814d69
--- /dev/null
+++ b/src/cudadev/CUDACore/EDGetterContextBase.h
@@ -0,0 +1,38 @@
+#ifndef HeterogeneousCore_CUDACore_EDGetterContextBase_h
+#define HeterogeneousCore_CUDACore_EDGetterContextBase_h
+
+#include "CUDACore/FwkContextBase.h"
+#include "CUDACore/Product.h"
+#include "Framework/EDGetToken.h"
+
+namespace cms::cuda::impl {
+  /**
+   * This class is a base class for Context classes that should be
+   * able to read Event Data products
+   */
+  class EDGetterContextBase : public FwkContextBase {
+  public:
+    template <typename T>
+    const T& get(const Product<T>& data) {
+      if (not isInitialized()) {
+        initialize(data);
+      }
+      synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
+      return data.data_;
+    }
+
+    template <typename T>
+    const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
+      return get(iEvent.get(token));
+    }
+
+  protected:
+    template <typename... Args>
+    EDGetterContextBase(Args&&... args) : FwkContextBase(std::forward<Args>(args)...) {}
+
+  private:
+    void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/EDProducer.cc b/src/cudadev/CUDACore/EDProducer.cc
new file mode 100644
index 000000000..2e08ea2d2
--- /dev/null
+++ b/src/cudadev/CUDACore/EDProducer.cc
@@ -0,0 +1,17 @@
+#include "CUDACore/EDProducer.h"
+
+namespace cms::cuda {
+  void EDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
+    runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
+  }
+
+  void SynchronizingEDProducer::acquire(edm::Event const& event,
+                                        edm::EventSetup const& eventSetup,
+                                        edm::WaitingTaskWithArenaHolder holder) {
+    runAcquire(event.streamID(), std::move(holder), [&](auto& ctx) { acquire(event, eventSetup, ctx); });
+  }
+
+  void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
+    runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
+  }
+}  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/EDProducer.h b/src/cudadev/CUDACore/EDProducer.h
new file mode 100644
index 000000000..92529c502
--- /dev/null
+++ b/src/cudadev/CUDACore/EDProducer.h
@@ -0,0 +1,27 @@
+#ifndef HeterogeneousCore_CUDACore_stream_EDProducer_h
+#define HeterogeneousCore_CUDACore_stream_EDProducer_h
+
+#include "Framework/EDProducer.h"
+#include "CUDACore/AcquireContext.h"
+#include "CUDACore/ProduceContext.h"
+
+namespace cms::cuda {
+  class EDProducer : public edm::EDProducer {
+  public:
+    void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
+    virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
+  };
+
+  class SynchronizingEDProducer : public edm::EDProducerExternalWork {
+  public:
+    void acquire(edm::Event const& event,
+                 edm::EventSetup const& eventSetup,
+                 edm::WaitingTaskWithArenaHolder holder) override;
+    void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
+
+    virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0;
+    virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
+  };
+}  // namespace cms::cuda
+
+#endif
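As a usage sketch (illustration only, not part of the patch), a non-ExternalWork producer derived from the new base class would look roughly like this; InputCUDA, OutputCUDA and makeAsync() are placeholders, and the pattern follows the CAHitNtupletCUDA and SiPixelRecHitCUDA migrations below:

    class MyCUDAProducer : public cms::cuda::EDProducer {
    public:
      explicit MyCUDAProducer(edm::ProductRegistry& reg)
          : inputToken_{reg.consumes<cms::cuda::Product<InputCUDA>>()},
            outputToken_{reg.produces<cms::cuda::Product<OutputCUDA>>()} {}

      void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext& ctx) override {
        auto const& input = ctx.get(iEvent, inputToken_);  // synchronizes streams if needed
        // emplace() wraps the result into cms::cuda::Product<OutputCUDA>, registering the
        // device, the shared stream and the event that commit() records
        ctx.emplace(iEvent, outputToken_, makeAsync(input, ctx.stream()));
      }

    private:
      edm::EDGetTokenT<cms::cuda::Product<InputCUDA>> inputToken_;
      edm::EDPutTokenT<cms::cuda::Product<OutputCUDA>> outputToken_;
    };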
diff --git a/src/cudadev/CUDACore/FwkContextBase.cc b/src/cudadev/CUDACore/FwkContextBase.cc
new file mode 100644
index 000000000..47e2d8199
--- /dev/null
+++ b/src/cudadev/CUDACore/FwkContextBase.cc
@@ -0,0 +1,28 @@
+#include "CUDACore/FwkContextBase.h"
+#include "CUDACore/StreamCache.h"
+#include "CUDACore/cudaCheck.h"
+
+#include "chooseDevice.h"
+
+namespace cms::cuda::impl {
+  FwkContextBase::FwkContextBase(edm::StreamID streamID) : FwkContextBase(chooseDevice(streamID)) {}
+
+  FwkContextBase::FwkContextBase(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); }
+
+  FwkContextBase::FwkContextBase(int device, SharedStreamPtr stream)
+      : currentDevice_(device), stream_(std::make_shared<StreamSharingHelper>(std::move(stream))) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+  }
+
+  void FwkContextBase::initialize() { stream_ = std::make_shared<StreamSharingHelper>(getStreamCache().get()); }
+
+  void FwkContextBase::initialize(const ProductBase& data) {
+    SharedStreamPtr stream;
+    if (data.mayReuseStream()) {
+      stream = data.streamPtr();
+    } else {
+      stream = getStreamCache().get();
+    }
+    stream_ = std::make_shared<StreamSharingHelper>(std::move(stream));
+  }
+}
diff --git a/src/cudadev/CUDACore/FwkContextBase.h b/src/cudadev/CUDACore/FwkContextBase.h
new file mode 100644
index 000000000..1b329cb03
--- /dev/null
+++ b/src/cudadev/CUDACore/FwkContextBase.h
@@ -0,0 +1,66 @@
+#ifndef HeterogeneousCore_CUDACore_FwkContextBase_h
+#define HeterogeneousCore_CUDACore_FwkContextBase_h
+
+#include "CUDACore/ProductBase.h"
+#include "CUDACore/SharedStreamPtr.h"
+#include "Framework/Event.h"
+
+namespace cms::cuda::impl {
+  /**
+   * This class is a base class for other Context classes for interacting with the framework
+   */
+  class FwkContextBase {
+  public:
+    FwkContextBase(FwkContextBase const&) = delete;
+    FwkContextBase& operator=(FwkContextBase const&) = delete;
+    FwkContextBase(FwkContextBase&&) = delete;
+    FwkContextBase& operator=(FwkContextBase&&) = delete;
+
+    int device() const { return currentDevice_; }
+
+    cudaStream_t stream() {
+      if (not isInitialized()) {
+        initialize();
+      }
+      return stream_->streamPtr().get();
+    }
+    const SharedStreamPtr& streamPtr() {
+      if (not isInitialized()) {
+        initialize();
+      }
+      return stream_->streamPtr();
+    }
+
+  protected:
+    // The constructors set the current device, but the device
+    // is not set back to the previous value at the destructor. This
+    // should be sufficient (and tiny bit faster) as all CUDA API
+    // functions relying on the current device should be called from
+    // the scope where this context is. The current device doesn't
+    // really matter between modules (or across TBB tasks).
+    explicit FwkContextBase(edm::StreamID streamID);
+
+    explicit FwkContextBase(int device);
+
+    // meant only for testing
+    explicit FwkContextBase(int device, SharedStreamPtr stream);
+
+    bool isInitialized() const { return bool(stream_); }
+
+    void initialize();
+    void initialize(const ProductBase& data);
+
+    const std::shared_ptr<StreamSharingHelper>& streamSharingHelper() {
+      if (not isInitialized()) {
+        initialize();
+      }
+      return stream_;
+    }
+
+  private:
+    int currentDevice_ = -1;
+    std::shared_ptr<StreamSharingHelper> stream_;
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/FwkContextHolderHelper.h b/src/cudadev/CUDACore/FwkContextHolderHelper.h
new file mode 100644
index 000000000..18e465142
--- /dev/null
+++ b/src/cudadev/CUDACore/FwkContextHolderHelper.h
@@ -0,0 +1,25 @@
+#ifndef HeterogeneousCore_CUDACore_FwkContextHolderHelper_h
+#define HeterogeneousCore_CUDACore_FwkContextHolderHelper_h
+
+namespace cms::cuda::impl {
+  class FwkContextHolderHelper {
+  public:
+    FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device)
+        : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {}
+
+    template <typename F>
+    void pushNextTask(F&& f);
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      waitingTaskHolder_ = std::move(waitingTaskHolder);
+    }
+
+    void enqueueCallback(cudaStream_t stream);
+
+  private:
+    edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
+    int device_;
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/ProduceContext.cc b/src/cudadev/CUDACore/ProduceContext.cc
new file mode 100644
index 000000000..0af59c3a7
--- /dev/null
+++ b/src/cudadev/CUDACore/ProduceContext.cc
@@ -0,0 +1,6 @@
+#include "CUDACore/ProduceContext.h"
+#include "CUDACore/cudaCheck.h"
+
+namespace cms::cuda {
+  void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); }
+}
diff --git a/src/cudadev/CUDACore/ProduceContext.h b/src/cudadev/CUDACore/ProduceContext.h
new file mode 100644
index 000000000..980e53377
--- /dev/null
+++ b/src/cudadev/CUDACore/ProduceContext.h
@@ -0,0 +1,52 @@
+#ifndef HeterogeneousCore_CUDACore_ProduceContext_h
+#define HeterogeneousCore_CUDACore_ProduceContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/EventCache.h"
+#include "Framework/EDPutToken.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
+   * - setting the current device
+   * - synchronizing between CUDA streams if necessary
+   * Users should not, however, construct it explicitly.
+   */
+  class ProduceContext : public impl::EDGetterContextBase {
+  public:
+    explicit ProduceContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {}
+
+    ~ProduceContext() = default;
+
+    template <typename T>
+    std::unique_ptr<Product<T>> wrap(T data) {
+      // make_unique doesn't work because of private constructor
+      return std::unique_ptr<Product<T>>(new Product<T>(device(), streamSharingHelper(), event_, std::move(data)));
+    }
+
+    template <typename T, typename... Args>
+    auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
+      return iEvent.emplace(token, device(), streamSharingHelper(), event_, std::forward<Args>(args)...);
+    }
+
+    // internal API
+    void commit();
+
+  private:
+    // This construcor is only meant for testing
+    explicit ProduceContext(int device, SharedStreamPtr stream, SharedEventPtr event)
+        : EDGetterContextBase(device, std::move(stream)), event_{std::move(event)} {}
+
+    // create the CUDA Event upfront to catch possible errors from its creation
+    SharedEventPtr event_ = getEventCache().get();
+  };
+
+  template <typename F>
+  void runProduce(edm::StreamID streamID, F&& func) {
+    ProduceContext context(streamID);
+    func(context);
+    context.commit();
+  }
+}
+
+#endif
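Modules that keep deriving from edm::EDProducer can still use the per-event context through the runProduce() helper; a sketch of that pattern (illustration only, mirroring PixelVertexProducerCUDA::produceOnGPU() and CountValidator below; names are placeholders):

    void MyLegacyProducer::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
      cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) {
        auto const& input = ctx.get(iEvent, inputToken_);
        ctx.emplace(iEvent, outputToken_, algo_.makeAsync(ctx.stream(), input));
      });  // commit() records the CUDA event once the lambda returns
    }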
diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h
index c60e994f0..5d9aaf36e 100644
--- a/src/cudadev/CUDACore/Product.h
+++ b/src/cudadev/CUDACore/Product.h
@@ -13,14 +13,14 @@ namespace edm {
 namespace cms {
   namespace cuda {
     namespace impl {
-      class ScopedContextGetterBase;
-    }
+      class EDGetterContextBase;
+    }  // namespace impl
 
     /**
      * The purpose of this class is to wrap CUDA data to edm::Event in a
      * way which forces correct use of various utilities.
      *
-     * The non-default construction has to be done with cms::cuda::ScopedContext
+     * The non-default construction has to be done with cms::cuda::Context
      * (in order to properly register the CUDA event).
      *
      * The default constructor is needed only for the ROOT dictionary generation.
@@ -41,15 +41,18 @@ namespace cms {
       Product& operator=(Product&&) = default;
 
     private:
-      friend class impl::ScopedContextGetterBase;
-      friend class ScopedContextProduce;
+      friend class impl::EDGetterContextBase;
+      friend class ProduceContext;
       friend class edm::Wrapper<Product<T>>;
 
-      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data)
+      explicit Product(int device, std::shared_ptr<impl::StreamSharingHelper> stream, SharedEventPtr event, T data)
           : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}
 
       template <typename... Args>
-      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args)
+      explicit Product(int device,
+                       std::shared_ptr<impl::StreamSharingHelper> stream,
+                       SharedEventPtr event,
+                       Args&&... args)
           : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
 
       T data_;  //!
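A producer may emplace several products from the same ProduceContext; they then share one StreamSharingHelper (introduced in ProductBase.h below), so they carry the same CUDA stream and event, and only the first consumer that asks is allowed to re-use the stream. A sketch of that pattern, following SiPixelRawToClusterCUDA::produce() further down; getResults() and the tokens are placeholders:

    void MyProducer::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext& ctx) {
      auto results = gpuAlgo_.getResults();  // e.g. a pair of device-side collections
      ctx.emplace(iEvent, digiPutToken_, std::move(results.first));
      ctx.emplace(iEvent, clusterPutToken_, std::move(results.second));
    }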
diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h index cb3fd4db9..e1f04ec12 100644 --- a/src/cudadev/CUDACore/ProductBase.h +++ b/src/cudadev/CUDACore/ProductBase.h @@ -10,8 +10,42 @@ namespace cms { namespace cuda { namespace impl { - class ScopedContextBase; - } + class FwkContextBase; + + /** + * The CUDA stream is shared between all the Event products of + * the EDProducer. If the stream gets re-used, only one consumer + * of all the products should be allowed to use the stream. An + * objects of this class is shared between such Event products + * and takes care of letting only those consumers get the stream. + */ + class StreamSharingHelper { + public: + explicit StreamSharingHelper(SharedStreamPtr stream) : stream_(std::move(stream)) {} + StreamSharingHelper(const StreamSharingHelper&) = delete; + StreamSharingHelper& operator=(const StreamSharingHelper&) = delete; + StreamSharingHelper(StreamSharingHelper&&) = delete; + StreamSharingHelper& operator=(StreamSharingHelper) = delete; + + const SharedStreamPtr& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + + private: + SharedStreamPtr stream_; + + // This flag tells whether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; + }; + } // namespace impl /** * Base class for all instantiations of CUDA to hold the @@ -24,66 +58,45 @@ namespace cms { ProductBase(const ProductBase&) = delete; ProductBase& operator=(const ProductBase&) = delete; - ProductBase(ProductBase&& other) - : stream_{std::move(other.stream_)}, - event_{std::move(other.event_)}, - mayReuseStream_{other.mayReuseStream_.load()}, - device_{other.device_} {} - ProductBase& operator=(ProductBase&& other) { - stream_ = std::move(other.stream_); - event_ = std::move(other.event_); - mayReuseStream_ = other.mayReuseStream_.load(); - device_ = other.device_; - return *this; - } - - bool isValid() const { return stream_.get() != nullptr; } + ProductBase(ProductBase&& other) = default; + ProductBase& operator=(ProductBase&& other) = default; + bool isAvailable() const; int device() const { return device_; } // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::ScopedContext itself + // mutable access is needed even if the ProductBase itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. - cudaStream_t stream() const { return stream_.get(); } + cudaStream_t stream() const { return stream_->streamPtr().get(); } // cudaEvent_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::ScopedContext itself + // mutable access is needed even if the ProductBase itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. 
cudaEvent_t event() const { return event_.get(); } protected: - explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event) + explicit ProductBase(int device, std::shared_ptr stream, SharedEventPtr event) : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} private: - friend class impl::ScopedContextBase; - friend class ScopedContextProduce; - - // The following function is intended to be used only from ScopedContext - const SharedStreamPtr& streamPtr() const { return stream_; } - - bool mayReuseStream() const { - bool expected = true; - bool changed = mayReuseStream_.compare_exchange_strong(expected, false); - // If the current thread is the one flipping the flag, it may - // reuse the stream. - return changed; - } - - // The cudaStream_t is really shared among edm::Event products, so - // using shared_ptr also here - SharedStreamPtr stream_; //! + friend class impl::FwkContextBase; + friend class ProduceContext; + + // The following function is intended to be used only from Context + const SharedStreamPtr& streamPtr() const { return stream_->streamPtr(); } + + bool mayReuseStream() const { return stream_->mayReuseStream(); } + + // Helper shared between all cms::cuda::Product event + // products of an EDProducer + std::shared_ptr stream_; //! + // shared_ptr because of caching in cms::cuda::EventCache SharedEventPtr event_; //! - // This flag tells whether the CUDA stream may be reused by a - // consumer or not. The goal is to have a "chain" of modules to - // queue their work to the same stream. - mutable std::atomic mayReuseStream_ = true; //! - // The CUDA device associated with this product int device_ = -1; //! }; diff --git a/src/cudadev/CUDACore/ScopedContext.cc b/src/cudadev/CUDACore/ScopedContext.cc deleted file mode 100644 index 14bff04eb..000000000 --- a/src/cudadev/CUDACore/ScopedContext.cc +++ /dev/null @@ -1,116 +0,0 @@ -#include "CUDACore/ScopedContext.h" - -#include "CUDACore/StreamCache.h" -#include "CUDACore/cudaCheck.h" - -#include "chooseDevice.h" - -namespace { - struct CallbackData { - edm::WaitingTaskWithArenaHolder holder; - int device; - }; - - void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { - std::unique_ptr guard{reinterpret_cast(data)}; - edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; - int device = guard->device; - if (status == cudaSuccess) { - //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream " - // << streamId << std::endl; - waitingTaskHolder.doneWaiting(nullptr); - } else { - // wrap the exception in a try-catch block to let GDB "catch throw" break on it - try { - auto error = cudaGetErrorName(status); - auto message = cudaGetErrorString(status); - throw std::runtime_error("Callback of CUDA stream " + - std::to_string(reinterpret_cast(streamId)) + " in device " + - std::to_string(device) + " error " + std::string(error) + ": " + std::string(message)); - } catch (std::exception&) { - waitingTaskHolder.doneWaiting(std::current_exception()); - } - } - } -} // namespace - -namespace cms::cuda { - namespace impl { - ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) { - cudaCheck(cudaSetDevice(currentDevice_)); - stream_ = getStreamCache().get(); - } - - ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) { - cudaCheck(cudaSetDevice(currentDevice_)); - if (data.mayReuseStream()) { - stream_ = data.streamPtr(); - } else { 
- stream_ = getStreamCache().get(); - } - } - - ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream) - : currentDevice_(device), stream_(std::move(stream)) { - cudaCheck(cudaSetDevice(currentDevice_)); - } - - //////////////////// - - void ScopedContextGetterBase::synchronizeStreams(int dataDevice, - cudaStream_t dataStream, - bool available, - cudaEvent_t dataEvent) { - if (dataDevice != device()) { - // Eventually replace with prefetch to current device (assuming unified memory works) - // If we won't go to unified memory, need to figure out something else... - throw std::runtime_error("Handling data from multiple devices is not yet supported"); - } - - if (dataStream != stream()) { - // Different streams, need to synchronize - if (not available) { - // Event not yet occurred, so need to add synchronization - // here. Sychronization is done by making the CUDA stream to - // wait for an event, so all subsequent work in the stream - // will run only after the event has "occurred" (i.e. data - // product became available). - cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); - } - } - } - - void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { - cudaCheck( - cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); - } - } // namespace impl - - //////////////////// - - ScopedContextAcquire::~ScopedContextAcquire() { - holderHelper_.enqueueCallback(device(), stream()); - if (contextState_) { - contextState_->set(device(), streamPtr()); - } - } - - void ScopedContextAcquire::throwNoState() { - throw std::runtime_error( - "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with " - "ContextState, but that was not the case"); - } - - //////////////////// - - ScopedContextProduce::~ScopedContextProduce() { - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. - cudaEventRecord(event_.get(), stream()); - } - - //////////////////// - - ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } -} // namespace cms::cuda diff --git a/src/cudadev/CUDACore/ScopedContext.h b/src/cudadev/CUDACore/ScopedContext.h deleted file mode 100644 index 4f6669883..000000000 --- a/src/cudadev/CUDACore/ScopedContext.h +++ /dev/null @@ -1,241 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_ScopedContext_h -#define HeterogeneousCore_CUDACore_ScopedContext_h - -#include - -#include "CUDACore/Product.h" -#include "Framework/WaitingTaskWithArenaHolder.h" -#include "Framework/Event.h" -#include "Framework/EDGetToken.h" -#include "Framework/EDPutToken.h" -#include "CUDACore/ContextState.h" -#include "CUDACore/EventCache.h" -#include "CUDACore/SharedEventPtr.h" -#include "CUDACore/SharedStreamPtr.h" - -namespace cms { - namespace cudatest { - class TestScopedContext; - } - - namespace cuda { - - namespace impl { - // This class is intended to be derived by other ScopedContext*, not for general use - class ScopedContextBase { - public: - int device() const { return currentDevice_; } - - // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the ScopedContext itself - // would be const. Therefore it is ok to return a non-const - // pointer from a const method here. 
- cudaStream_t stream() const { return stream_.get(); } - const SharedStreamPtr& streamPtr() const { return stream_; } - - protected: - // The constructors set the current device, but the device - // is not set back to the previous value at the destructor. This - // should be sufficient (and tiny bit faster) as all CUDA API - // functions relying on the current device should be called from - // the scope where this context is. The current device doesn't - // really matter between modules (or across TBB tasks). - explicit ScopedContextBase(edm::StreamID streamID); - - explicit ScopedContextBase(const ProductBase& data); - - explicit ScopedContextBase(int device, SharedStreamPtr stream); - - private: - int currentDevice_; - SharedStreamPtr stream_; - }; - - class ScopedContextGetterBase : public ScopedContextBase { - public: - template - const T& get(const Product& data) { - synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); - return data.data_; - } - - template - const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { - return get(iEvent.get(token)); - } - - protected: - template - ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {} - - void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); - }; - - class ScopedContextHolderHelper { - public: - ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : waitingTaskHolder_{std::move(waitingTaskHolder)} {} - - template - void pushNextTask(F&& f, ContextState const* state); - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - waitingTaskHolder_ = std::move(waitingTaskHolder); - } - - void enqueueCallback(int device, cudaStream_t stream); - - private: - edm::WaitingTaskWithArenaHolder waitingTaskHolder_; - }; - } // namespace impl - - /** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. 
- */ - class ScopedContextAcquire : public impl::ScopedContextGetterBase { - public: - /// Constructor to create a new CUDA stream (no need for context beyond acquire()) - explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} - - /// Constructor to create a new CUDA stream, and the context is needed after acquire() - explicit ScopedContextAcquire(edm::StreamID streamID, - edm::WaitingTaskWithArenaHolder waitingTaskHolder, - ContextState& state) - : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} - - /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) - explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} - - /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() - explicit ScopedContextAcquire(const ProductBase& data, - edm::WaitingTaskWithArenaHolder waitingTaskHolder, - ContextState& state) - : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} - - ~ScopedContextAcquire(); - - template - void pushNextTask(F&& f) { - if (contextState_ == nullptr) - throwNoState(); - holderHelper_.pushNextTask(std::forward(f), contextState_); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - - private: - void throwNoState(); - - impl::ScopedContextHolderHelper holderHelper_; - ContextState* contextState_ = nullptr; - }; - - /** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. - */ - class ScopedContextProduce : public impl::ScopedContextGetterBase { - public: - /// Constructor to create a new CUDA stream (non-ExternalWork module) - explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {} - - /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module) - explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {} - - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextProduce(ContextState& state) - : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} - - /// Record the CUDA event, all asynchronous work must have been queued before the destructor - ~ScopedContextProduce(); - - template - std::unique_ptr> wrap(T data) { - // make_unique doesn't work because of private constructor - return std::unique_ptr>(new Product(device(), streamPtr(), event_, std::move(data))); - } - - template - auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... 
args) { - return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); - } - - private: - friend class cudatest::TestScopedContext; - - // This construcor is only meant for testing - explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event) - : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} - - // create the CUDA Event upfront to catch possible errors from its creation - SharedEventPtr event_ = getEventCache().get(); - }; - - /** - * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - * and enforce that those get done in a proper way in RAII fashion. - */ - class ScopedContextTask : public impl::ScopedContextBase { - public: - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards - holderHelper_{std::move(waitingTaskHolder)}, - contextState_{state} {} - - ~ScopedContextTask(); - - template - void pushNextTask(F&& f) { - holderHelper_.pushNextTask(std::forward(f), contextState_); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - - private: - impl::ScopedContextHolderHelper holderHelper_; - ContextState const* contextState_; - }; - - /** - * The aim of this class is to do necessary per-event "initialization" in analyze() - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. 
- */ - class ScopedContextAnalyze : public impl::ScopedContextGetterBase { - public: - /// Constructor to (possibly) re-use a CUDA stream - explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {} - }; - - namespace impl { - template - void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) { - replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ - edm::make_waiting_task_with_holder(tbb::task::allocate_root(), - std::move(waitingTaskHolder_), - [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { - func(ScopedContextTask{state, std::move(h)}); - })}); - } - } // namespace impl - } // namespace cuda -} // namespace cms - -#endif diff --git a/src/cudadev/CUDACore/TaskContext.cc b/src/cudadev/CUDACore/TaskContext.cc new file mode 100644 index 000000000..a2c5b0194 --- /dev/null +++ b/src/cudadev/CUDACore/TaskContext.cc @@ -0,0 +1,41 @@ +#include "CUDACore/TaskContext.h" +#include "CUDACore/cudaCheck.h" + +namespace { + struct CallbackData { + edm::WaitingTaskWithArenaHolder holder; + int device; + }; + + void CUDART_CB cudaContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { + std::unique_ptr guard{reinterpret_cast(data)}; + edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; + int device = guard->device; + if (status == cudaSuccess) { + //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream " + // << streamId << std::endl; + waitingTaskHolder.doneWaiting(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw std::runtime_error("Callback of CUDA stream " + + std::to_string(reinterpret_cast(streamId)) + " in device " + + std::to_string(device) + " error " + std::string(error) + ": " + std::string(message)); + } catch (std::exception&) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } + } + } +} // namespace + +namespace cms::cuda { + namespace impl { + void FwkContextHolderHelper::enqueueCallback(cudaStream_t stream) { + cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device_}, 0)); + } + } + + void TaskContext::commit() { holderHelper_.enqueueCallback(stream()); } +} diff --git a/src/cudadev/CUDACore/TaskContext.h b/src/cudadev/CUDACore/TaskContext.h new file mode 100644 index 000000000..121c9165d --- /dev/null +++ b/src/cudadev/CUDACore/TaskContext.h @@ -0,0 +1,71 @@ +#ifndef HeterogeneousCore_CUDACore_TaskContext_h +#define HeterogeneousCore_CUDACore_TaskContext_h + +#include "CUDACore/FwkContextBase.h" +#include "Framework/WaitingTaskWithArenaHolder.h" + +namespace cms::cuda { + namespace impl { + class FwkContextHolderHelper { + public: + FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device) + : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {} + + template + void pushNextTask(F&& f); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + int device_; + }; + } + + /** + * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary 
+ */ + class TaskContext : public impl::FwkContextBase { + public: + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit TaskContext(int device, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : FwkContextBase(device), holderHelper_{std::move(waitingTaskHolder), device} {} + + ~TaskContext() = default; + + template + void pushNextTask(F&& f) { + holderHelper_.pushNextTask(std::forward(f)); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + // Internal API + void commit(); + + private: + impl::FwkContextHolderHelper holderHelper_; + }; + + namespace impl { + template + void FwkContextHolderHelper::pushNextTask(F&& f) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder( + tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [device = device_, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(TaskContext{device, std::move(h)}); + })}); + } + } // namespace impl +} + +#endif diff --git a/src/cudadev/Framework/Event.h b/src/cudadev/Framework/Event.h index 9f952c492..68a04e729 100644 --- a/src/cudadev/Framework/Event.h +++ b/src/cudadev/Framework/Event.h @@ -9,7 +9,32 @@ // type erasure namespace edm { - using StreamID = int; + class Event; + + class StreamID { + public: + ~StreamID() = default; + StreamID() = delete; + StreamID(const StreamID&) = default; + StreamID& operator=(const StreamID&) = default; + + bool operator==(const StreamID& iID) const { return iID.value_ == value_; } + + operator unsigned int() const { return value_; } + + /** \return value ranging from 0 to one less than max number of streams. + */ + unsigned int value() const { return value_; } + + static StreamID invalidStreamID() { return StreamID(0xFFFFFFFFU); } + + private: + ///Only a Event is allowed to create one of these + friend class Event; + explicit StreamID(unsigned int iValue) : value_(iValue) {} + + unsigned int value_; + }; class WrapperBase { public: diff --git a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc index 48badcabf..b9633a856 100644 --- a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc +++ b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc @@ -2,23 +2,22 @@ #include +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDACore/copyAsync.h" #include "CUDACore/host_noncached_unique_ptr.h" #include "CUDADataFormats/BeamSpotCUDA.h" #include "DataFormats/BeamSpotPOD.h" -#include "Framework/EDProducer.h" #include "Framework/Event.h" #include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" -class BeamSpotToCUDA : public edm::EDProducer { +class BeamSpotToCUDA : public cms::cuda::EDProducer { public: explicit BeamSpotToCUDA(edm::ProductRegistry& reg); ~BeamSpotToCUDA() override = default; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; private: const edm::EDPutTokenT> bsPutToken_; @@ -30,11 +29,9 @@ BeamSpotToCUDA::BeamSpotToCUDA(edm::ProductRegistry& reg) : bsPutToken_{reg.produces>()}, bsHost{cms::cuda::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} -void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void BeamSpotToCUDA::produce(edm::Event& iEvent, const 
edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) { *bsHost = iSetup.get(); - cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; - BeamSpotCUDA bsDevice(ctx.stream()); cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream()); diff --git a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc index 408450ea8..c9e0cc5d9 100644 --- a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc @@ -1,27 +1,24 @@ #include +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDACore/HostProduct.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" -#include "CUDACore/ScopedContext.h" // Switch on to enable checks and printout for found tracks #undef PIXEL_DEBUG_PRODUCE -class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork { +class PixelTrackSoAFromCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit PixelTrackSoAFromCUDA(edm::ProductRegistry& reg); ~PixelTrackSoAFromCUDA() override = default; private: - void acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override; edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; @@ -35,15 +32,13 @@ PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg) void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); + cms::cuda::AcquireContext& ctx) { + auto const& inputData = ctx.get(iEvent, tokenCUDA_); soa_ = inputData.toHostAsync(ctx.stream()); } -void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { +void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) { #ifdef PIXEL_DEBUG_PRODUCE auto const& tsoa = *soa_; auto maxTracks = tsoa.stride(); diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc index 94085d784..aca6bd319 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc @@ -1,24 +1,23 @@ #include #include "CUDACore/Product.h" +#include "CUDACore/EDProducer.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/ScopedContext.h" #include "CAHitNtupletGeneratorOnGPU.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" -class CAHitNtupletCUDA : public edm::EDProducer { +class CAHitNtupletCUDA : public cms::cuda::EDProducer { public: explicit CAHitNtupletCUDA(edm::ProductRegistry& reg); ~CAHitNtupletCUDA() override = 
default; private: - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; edm::EDGetTokenT> tokenHitGPU_; edm::EDPutTokenT> tokenTrackGPU_; @@ -31,12 +30,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(edm::ProductRegistry& reg) tokenTrackGPU_{reg.produces>()}, gpuAlgo_(reg) {} -void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { +void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cms::cuda::ProduceContext& ctx) { auto bf = 0.0114256972711507; // 1/fieldInGeV - auto const& phits = iEvent.get(tokenHitGPU_); - cms::cuda::ScopedContextProduce ctx{phits}; - auto const& hits = ctx.get(phits); + auto const& hits = ctx.get(iEvent, tokenHitGPU_); ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc index 723f7eb4c..d4ca2196b 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc @@ -1,12 +1,12 @@ #include #include "CUDACore/Product.h" +#include "CUDACore/ProduceContext.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" #include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/ScopedContext.h" #include "gpuVertexFinder.h" @@ -58,14 +58,13 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(edm::ProductRegistry& reg) } void PixelVertexProducerCUDA::produceOnGPU(edm::Event& iEvent, const edm::EventSetup& iSetup) { - auto const& ptracks = iEvent.get(tokenGPUTrack_); + cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) { + auto const* tracks = ctx.get(iEvent, tokenGPUTrack_).get(); - cms::cuda::ScopedContextProduce ctx{ptracks}; - auto const* tracks = ctx.get(ptracks).get(); + assert(tracks); - assert(tracks); - - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_)); + }); } void PixelVertexProducerCUDA::produceOnCPU(edm::Event& iEvent, const edm::EventSetup& iSetup) { diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc index d709f0c5e..4f38c0ad8 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc @@ -1,25 +1,22 @@ #include +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDACore/HostProduct.h" #include "CUDADataFormats/ZVertexHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/ScopedContext.h" -class PixelVertexSoAFromCUDA : public edm::EDProducerExternalWork { +class PixelVertexSoAFromCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit PixelVertexSoAFromCUDA(edm::ProductRegistry& reg); ~PixelVertexSoAFromCUDA() override = default; private: - void acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; 
+ void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override; edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; @@ -33,15 +30,13 @@ PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(edm::ProductRegistry& reg) void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& inputDataWrapped = iEvent.get(tokenCUDA_); - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); + cms::cuda::AcquireContext& ctx) { + auto const& inputData = ctx.get(iEvent, tokenCUDA_); m_soa = inputData.toHostAsync(ctx.stream()); } -void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { +void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) { // No copies.... iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); } diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index a5229b295..4f06bf540 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -4,8 +4,8 @@ #include // CMSSW includes +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -16,7 +16,6 @@ #include "DataFormats/FEDRawData.h" #include "DataFormats/FEDRawDataCollection.h" #include "DataFormats/SiPixelErrorCompact.h" -#include "Framework/EDProducer.h" #include "Framework/Event.h" #include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" @@ -26,18 +25,14 @@ #include "SiPixelClusterThresholds.h" #include "SiPixelRawToClusterGPUKernel.h" -class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { +class SiPixelRawToClusterCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg); ~SiPixelRawToClusterCUDA() override = default; private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; - - cms::cuda::ContextState ctxState_; + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; edm::EDGetTokenT rawGetToken_; edm::EDPutTokenT> digiPutToken_; @@ -72,9 +67,7 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg) void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; - + cms::cuda::AcquireContext& ctx) { auto const& hgpuMap = iSetup.get(); if (hgpuMap.hasQuality() != useQuality_) { throw std::runtime_error( @@ -170,9 +163,9 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, ctx.stream()); } -void 
SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - cms::cuda::ScopedContextProduce ctx{ctxState_}; - +void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, + const edm::EventSetup& iSetup, + cms::cuda::ProduceContext& ctx) { auto tmp = gpuAlgo_.getResults(); ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); diff --git a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc index 448f4b797..8d1f8b3c4 100644 --- a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc +++ b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc @@ -1,23 +1,20 @@ +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" #include "DataFormats/SiPixelDigisSoA.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" -#include "Framework/EDProducer.h" #include "Framework/PluginFactory.h" -#include "CUDACore/ScopedContext.h" #include "CUDACore/host_unique_ptr.h" -class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork { +class SiPixelDigisSoAFromCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg); ~SiPixelDigisSoAFromCUDA() override = default; private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) override; edm::EDGetTokenT> digiGetToken_; edm::EDPutTokenT digiPutToken_; @@ -36,10 +33,7 @@ SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg) void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - // Do the transfer in a CUDA stream parallel to the computation CUDA stream - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - + cms::cuda::AcquireContext& ctx) { const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); nDigis_ = gpuDigis.nDigis(); @@ -49,7 +43,7 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, clus_ = gpuDigis.clusToHostAsync(ctx.stream()); } -void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) { // The following line copies the data from the pinned host memory to // regular host memory. In principle that feels unnecessary (why not // just use the pinned host memory?). 
There are a few arguments for diff --git a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc index 413982fc3..133fa8eaf 100644 --- a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc +++ b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc @@ -1,6 +1,7 @@ #include #include "CUDADataFormats/BeamSpotCUDA.h" +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -8,19 +9,17 @@ #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" -#include "CUDACore/ScopedContext.h" #include "CondFormats/PixelCPEFast.h" #include "PixelRecHitGPUKernel.h" -class SiPixelRecHitCUDA : public edm::EDProducer { +class SiPixelRecHitCUDA : public cms::cuda::EDProducer { public: explicit SiPixelRecHitCUDA(edm::ProductRegistry& reg); ~SiPixelRecHitCUDA() override = default; private: - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; // The mess with inputs will be cleaned up when migrating to the new framework const edm::EDGetTokenT> tBeamSpot; @@ -36,13 +35,10 @@ SiPixelRecHitCUDA::SiPixelRecHitCUDA(edm::ProductRegistry& reg) tokenDigi_(reg.consumes>()), tokenHit_(reg.produces>()) {} -void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { +void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cms::cuda::ProduceContext& ctx) { PixelCPEFast const& fcpe = es.get(); - auto const& pclusters = iEvent.get(token_); - cms::cuda::ScopedContextProduce ctx{pclusters}; - - auto const& clusters = ctx.get(pclusters); + auto const& clusters = ctx.get(iEvent, token_); auto const& digis = ctx.get(iEvent, tokenDigi_); auto const& bs = ctx.get(iEvent, tBeamSpot); diff --git a/src/cudadev/plugin-Validation/CountValidator.cc b/src/cudadev/plugin-Validation/CountValidator.cc index 23352f5ba..bd9f40989 100644 --- a/src/cudadev/plugin-Validation/CountValidator.cc +++ b/src/cudadev/plugin-Validation/CountValidator.cc @@ -1,5 +1,5 @@ +#include "CUDACore/ProduceContext.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -63,24 +63,24 @@ void CountValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) ss << "Event " << iEvent.eventID() << " "; { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextProduce ctx{pdigis}; - auto const& count = iEvent.get(digiClusterCountToken_); - auto const& digis = ctx.get(iEvent, digiToken_); - auto const& clusters = ctx.get(iEvent, clusterToken_); - - if (digis.nModules() != count.nModules()) { - ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules(); - ok = false; - } - if (digis.nDigis() != count.nDigis()) { - ss << "\n N(digis) is " << digis.nDigis() << " expected " << count.nDigis(); - ok = false; - } - if (clusters.nClusters() != count.nClusters()) { - ss << "\n N(clusters) is " << clusters.nClusters() << " expected " << count.nClusters(); - ok = false; - } + cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) { + auto const& count = iEvent.get(digiClusterCountToken_); + auto const& digis = ctx.get(iEvent, digiToken_); + 
auto const& clusters = ctx.get(iEvent, clusterToken_); + + if (digis.nModules() != count.nModules()) { + ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules(); + ok = false; + } + if (digis.nDigis() != count.nDigis()) { + ss << "\n N(digis) is " << digis.nDigis() << " expected " << count.nDigis(); + ok = false; + } + if (clusters.nClusters() != count.nClusters()) { + ss << "\n N(clusters) is " << clusters.nClusters() << " expected " << count.nClusters(); + ok = false; + } + }); } { diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc index 8a888666b..f8af1773d 100644 --- a/src/cudadev/plugin-Validation/HistoValidator.cc +++ b/src/cudadev/plugin-Validation/HistoValidator.cc @@ -1,5 +1,5 @@ +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -8,22 +8,19 @@ #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "SimpleAtomicHisto.h" #include #include -class HistoValidator : public edm::EDProducerExternalWork { +class HistoValidator : public cms::cuda::SynchronizingEDProducer { public: explicit HistoValidator(edm::ProductRegistry& reg); private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) override; void endJob() override; edm::EDGetTokenT> digiToken_; @@ -88,11 +85,7 @@ HistoValidator::HistoValidator(edm::ProductRegistry& reg) trackToken_(reg.consumes()), vertexToken_(reg.consumes()) {} -void HistoValidator::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextAcquire ctx{pdigis, std::move(waitingTaskHolder)}; +void HistoValidator::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) { auto const& digis = ctx.get(iEvent, digiToken_); auto const& clusters = ctx.get(iEvent, clusterToken_); auto const& hits = ctx.get(iEvent, hitToken_); @@ -113,7 +106,7 @@ void HistoValidator::acquire(const edm::Event& iEvent, h_size = hits.sizeToHostAsync(ctx.stream()); } -void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) { histos["digi_n"].fill(nDigis); for (uint32_t i = 0; i < nDigis; ++i) { histos["digi_adc"].fill(h_adc[i]);
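Putting the pieces together, a device-to-host transfer module migrates to the new interface roughly as follows (a sketch modelled on the PixelVertexSoAFromCUDA and SiPixelDigisSoAFromCUDA changes above; DeviceSoA and HostSoA are placeholder types):

    class MySoAFromCUDA : public cms::cuda::SynchronizingEDProducer {
    public:
      explicit MySoAFromCUDA(edm::ProductRegistry& reg)
          : tokenCUDA_{reg.consumes<cms::cuda::Product<DeviceSoA>>()}, tokenSOA_{reg.produces<HostSoA>()} {}

      void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override {
        auto const& deviceData = ctx.get(iEvent, tokenCUDA_);
        hostData_ = deviceData.toHostAsync(ctx.stream());  // asynchronous copy on the context's stream
      }

      void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override {
        // by the time produce() runs, the callback enqueued by AcquireContext::commit()
        // has fired, so the host-side copy is complete and can be put without wrapping
        iEvent.emplace(tokenSOA_, std::move(hostData_));
      }

    private:
      edm::EDGetTokenT<cms::cuda::Product<DeviceSoA>> tokenCUDA_;
      edm::EDPutTokenT<HostSoA> tokenSOA_;
      HostSoA hostData_;
    };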