[alpaka] Template various alpakatools classes on the Queue type #255

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

fwyzard merged 6 commits into cms-patatrack:master from fwyzard:alpaka_developments

Oct 28, 2021

src/alpaka/AlpakaCore/ContextState.h

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -3,18 +3,28 @@
  
    #include <memory>

    #include "AlpakaCore/alpakaConfig.h"

    #include <alpaka/alpaka.hpp>

    namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {

    namespace cms::alpakatools {

      template <typename TQueue>

      class ScopedContextAcquire;

      template <typename TQueue>

      class ScopedContextProduce;

      template <typename TQueue>

      class ScopedContextTask;

      /**

         * The purpose of this class is to deliver the device and CUDA stream

         * information from ExternalWork's acquire() to produce() via a

         * member/StreamCache variable.

         */

      template <typename TQueue>

      class ContextState {

      public:

        using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue;

        using Queue = TQueue;

        using Device = alpaka::Dev<Queue>;

        ContextState() = default;

    @@ -26,18 +36,26 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
  
        ContextState& operator=(ContextState&& other) = delete;

      private:

        friend class ScopedContextAcquire;

        friend class ScopedContextProduce;

        friend class ScopedContextTask;

        friend class ScopedContextAcquire<TQueue>;

        friend class ScopedContextProduce<TQueue>;

        friend class ScopedContextTask<TQueue>;

        // Take ownership of the given queue pointer and store it in stream_.
        // throwIfStream() is called first, so setting a state that already
        // holds a queue raises std::runtime_error instead of overwriting it.
        void set(std::shared_ptr<Queue> stream) {

          throwIfStream();

          stream_ = std::move(stream);

        }

        Device device() const { return alpaka::getDev(*stream_); }

        // Return the device obtained from the stored queue via alpaka::getDev().
        // throwIfNoStream() guards the dereference: std::runtime_error is thrown
        // if no queue has been set.
        Device device() const {

          throwIfNoStream();

          return alpaka::getDev(*stream_);

        }

        const std::shared_ptr<Queue>& streamPtr() const {

        // Return the stored queue by value (a copy of *stream_).
        // Throws std::runtime_error through throwIfNoStream() if no queue is set.
        Queue stream() const {

          throwIfNoStream();

          return *stream_;

        }

        // Return a const reference to the shared pointer that owns the queue,
        // without copying the pointer.
        // Throws std::runtime_error through throwIfNoStream() if no queue is set.
        std::shared_ptr<Queue> const& streamPtr() const {

          throwIfNoStream();

          return stream_;

        }

    @@ -51,12 +69,21 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
  
          return std::move(stream_);

        }

        void throwIfStream() const;

        void throwIfNoStream() const;

        // Precondition check for set(): throw std::runtime_error when stream_
        // already holds a queue; do nothing otherwise.
        void throwIfStream() const {

          if (stream_) {

            throw std::runtime_error("Trying to set ContextState, but it already had a valid state");

          }

        }

        // Precondition check for the getters: throw std::runtime_error when
        // stream_ is empty; do nothing otherwise.
        void throwIfNoStream() const {

          if (not stream_) {

            throw std::runtime_error("Trying to get ContextState, but it did not have a valid state");

          }

        }

        std::shared_ptr<Queue> stream_;

      };

    }  // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE

    }  // namespace cms::alpakatools

    #endif  // HeterogeneousCore_AlpakaCore_ContextState_h

src/alpaka/AlpakaCore/ESProduct.h

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -8,21 +8,19 @@
  
    #include "AlpakaCore/alpakaConfig.h"

    #include "AlpakaCore/EventCache.h"

    #include "AlpakaCore/currentDevice.h"

    #include "AlpakaCore/deviceCount.h"

    #include "AlpakaCore/eventWorkHasCompleted.h"

    namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {

      template <typename T>

      class ESProduct {

      public:

        using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue;

        using Event = ::ALPAKA_ACCELERATOR_NAMESPACE::Event;

        template <typename T_Acc>

        ESProduct(T_Acc acc) : gpuDataPerDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()) {

        ESProduct() : gpuDataPerDevice_(::ALPAKA_ACCELERATOR_NAMESPACE::devices.size()) {

          for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {

            gpuDataPerDevice_[i].m_event = ::cms::alpakatools::getEventCache<Event>().get(acc);

            gpuDataPerDevice_[i].m_event = ::cms::alpakatools::getEventCache<Event>().get(::ALPAKA_ACCELERATOR_NAMESPACE::devices[i]);

          }

        }

    @@ -32,8 +30,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
  
        // which enqueues asynchronous transfers (possibly kernels as well)

        // to the CUDA stream

        template <typename F>

        const T& dataForCurrentDeviceAsync(::ALPAKA_ACCELERATOR_NAMESPACE::Queue queue, F transferAsync) const {

          auto device = currentDevice();

        const T& dataForDeviceAsync(Queue queue, F transferAsync) const {

          auto device = cms::alpakatools::getDevIndex(alpaka::getDev(queue));

          auto& data = gpuDataPerDevice_[device];

          // If GPU data has already been filled, we can return it

src/alpaka/AlpakaCore/HistoContainer.h

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -56,8 +56,8 @@ namespace cms {
  
        ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchZero(

            Histo *__restrict__ h, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) {

          uint32_t *poff = (uint32_t *)(char *)(&(h->off));

          auto histoOffView = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView<typename Histo::Counter>(

              poff, Histo::totbins());

          auto histoOffView =

              cms::alpakatools::createDeviceView<typename Histo::Counter>(alpaka::getDev(queue), poff, Histo::totbins());

          alpaka::memset(queue, histoOffView, 0, Histo::totbins());

          alpaka::wait(queue);

    @@ -77,16 +77,21 @@ namespace cms {
  
          const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(

              blocksPerGrid, threadsPerBlockOrElementsPerThread);

          alpaka::enqueue(queue,

                          alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(

                              workDiv, ::cms::alpakatools::multiBlockPrefixScanFirstStep<uint32_t>(), poff, poff, num_items));

          const WorkDiv1D &workDivWith1Block = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(

              Vec1D::all(1), threadsPerBlockOrElementsPerThread);

          alpaka::enqueue(

              queue,

              alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(

                  workDivWith1Block, ::cms::alpakatools::multiBlockPrefixScanSecondStep<uint32_t>(), poff, poff, num_items, nblocks));

                  workDiv, ::cms::alpakatools::multiBlockPrefixScanFirstStep<uint32_t>(), poff, poff, num_items));

          const WorkDiv1D &workDivWith1Block = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(

              Vec1D::all(1), threadsPerBlockOrElementsPerThread);

          alpaka::enqueue(queue,

                          alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(

                              workDivWith1Block,

                              ::cms::alpakatools::multiBlockPrefixScanSecondStep<uint32_t>(),

                              poff,

                              poff,

                              num_items,

                              nblocks));

        }

        template <typename Histo, typename T>

    @@ -106,14 +111,14 @@ namespace cms {
  
          const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(

              blocksPerGrid, threadsPerBlockOrElementsPerThread);

          alpaka::enqueue(

              queue,

              alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, countFromVector(), h, nh, v, offsets));

          alpaka::enqueue(queue,

                          alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(

                              workDiv, countFromVector(), h, nh, v, offsets));

          launchFinalize(h, queue);

          alpaka::enqueue(

              queue,

              alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, fillFromVector(), h, nh, v, offsets));

          alpaka::enqueue(queue,

                          alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(

                              workDiv, fillFromVector(), h, nh, v, offsets));

        }

        struct finalizeBulk {

src/alpaka/AlpakaCore/Product.h

-Original file line number
+Diff line change
@@ Expand Up / @@ -11,17 +11,18 @@ namespace edm { @@
       class Wrapper;
     }
-    namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
+    namespace cms::alpakatools {
       namespace impl {
+        template <typename TQueue>
         class ScopedContextGetterBase;
       }
       /**
          * The purpose of this class is to wrap CUDA data to edm::Event in a
          * way which forces correct use of various utilities.
          *
-         * The non-default construction has to be done with ::cms::alpakatools::ScopedContext
+         * The non-default construction has to be done with ScopedContext
          * (in order to properly register the CUDA event).
          *
          * The default constructor is needed only for the ROOT dictionary generation.
@@ Expand All / @@ -31,9 +32,12 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@
          * it. Here is a somewhat natural place. If overhead is too much, we
          * can use them only where synchronization between streams is needed.
          */
-      template <typename T>
-      class Product : public ProductBase {
+      template <typename TQueue, typename T>
+      class Product : public ProductBase<TQueue> {
       public:
+        using Queue = TQueue;
+        using Event = alpaka::Event<Queue>;
         Product() = default;  // Needed only for ROOT dictionary generation
         Product(const Product&) = delete;
@@ Expand All / @@ -42,20 +46,20 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@
         Product& operator=(Product&&) = default;
       private:
-        friend class impl::ScopedContextGetterBase;
-        friend class ScopedContextProduce;
-        friend class edm::Wrapper<Product<T>>;
+        friend class impl::ScopedContextGetterBase<Queue>;
+        friend class ScopedContextProduce<Queue>;
+        friend class edm::Wrapper<Product<Queue, T>>;
         explicit Product(std::shared_ptr<Queue> stream, std::shared_ptr<Event> event, T data)
-            : ProductBase(std::move(stream), std::move(event)), data_(std::move(data)) {}
+            : ProductBase<Queue>(std::move(stream), std::move(event)), data_(std::move(data)) {}
         template <typename... Args>
         explicit Product(std::shared_ptr<Queue> stream, std::shared_ptr<Event> event, Args&&... args)
-            : ProductBase(std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
+            : ProductBase<Queue>(std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
         T data_;  //!
       };
-    }  // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE
+    }  // namespace cms::alpakatools
     #endif  // AlpakaDataFormats_Common_Product_h

src/alpaka/AlpakaCore/ProductBase.h

-Original file line number
+Diff line change
@@ Expand Up / @@ -6,25 +6,43 @@ @@
     #include <alpaka/alpaka.hpp>
-    #include "AlpakaCore/alpakaConfigAcc.h"
-    namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
+    namespace cms::alpakatools {
       namespace impl {
+        template <typename TQueue>
         class ScopedContextBase;
       }
+      template <typename TQueue>
+      class ScopedContextProduce;
       /**
-         * Base class for all instantiations of CUDA<T> to hold the
+         * Base class for all instantiations of Product<TQueue, T> to hold the
          * non-T-dependent members.
          */
+      template <typename TQueue>
       class ProductBase {
       public:
-        using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue;
+        using Queue = TQueue;
         using Event = alpaka::Event<Queue>;
+        using Device = alpaka::Dev<Queue>;
         ProductBase() = default;  // Needed only for ROOT dictionary generation
-        ~ProductBase();
+        ~ProductBase() {
+          // Make sure that the production of the product in the GPU is
+          // complete before destructing the product. This is to make sure
+          // that the EDM stream does not move to the next event before all
+          // asynchronous processing of the current is complete.
+          // TODO: a callback notifying a WaitingTaskHolder (or similar)
+          // would avoid blocking the CPU, but would also require more work.
+          // FIXME: this may throw an exception if the underlying call fails.
+          if (event_) {
+            alpaka::wait(*event_);
+          }
+        }
         ProductBase(const ProductBase&) = delete;
         ProductBase& operator=(const ProductBase&) = delete;
@@ Expand All / @@ -40,18 +58,25 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@
         }
         bool isValid() const { return stream_.get() != nullptr; }
-        bool isAvailable() const;
-        alpaka::Dev<Queue> device() const { return alpaka::getDev(stream()); }
+        bool isAvailable() const {
+          // if default-constructed, the product is not available
+          if (not event_) {
+            return false;
+          }
+          return eventWorkHasCompleted(*(event_.get()));
+        }
+        Device device() const { return alpaka::getDev(stream()); }
         // cudaStream_t is a pointer to a thread-safe object, for which a
-        // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself
+        // mutable access is needed even if the ScopedContext itself
         // would be const. Therefore it is ok to return a non-const
         // pointer from a const method here.
         Queue& stream() const { return *(stream_.get()); }
         // cudaEvent_t is a pointer to a thread-safe object, for which a
-        // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself
+        // mutable access is needed even if the ScopedContext itself
         // would be const. Therefore it is ok to return a non-const
         // pointer from a const method here.
         Event& event() const { return *(event_.get()); }
@@ Expand All / @@ -61,8 +86,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@
             : stream_{std::move(stream)}, event_{std::move(event)} {}
       private:
-        friend class impl::ScopedContextBase;
-        friend class ScopedContextProduce;
+        friend class impl::ScopedContextBase<Queue>;
+        friend class ScopedContextProduce<Queue>;
         // The following function is intended to be used only from ScopedContext
         const std::shared_ptr<Queue>& streamPtr() const { return stream_; }
@@ Expand All / @@ -78,7 +103,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@
         // The cudaStream_t is really shared among edm::Event products, so
         // using shared_ptr also here
         std::shared_ptr<Queue> stream_;  //!
-        // shared_ptr because of caching in ::cms::alpakatools::EventCache
+        // shared_ptr because of caching in EventCache
         std::shared_ptr<Event> event_;  //!
         // This flag tells whether the CUDA stream may be reused by a
@@ Expand All / @@ -87,6 +112,6 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@
         mutable std::atomic<bool> mayReuseStream_ = true;  //!
       };
-    }  // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE
+    }  // namespace cms::alpakatools
     #endif  // AlpakaDataFormats_Common_ProductBase_h

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[alpaka] Template various alpakatools classes on the Queue type #255

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

Uh oh!

Uh oh!