[graphite][compute] Add mechanism for late-bound workgroup memory
naga-generated MSL declares WGSL workgroup shared memory buffers as
entry-point function parameters. This triggers a Metal API validation
error unless they are initialized like a late-bound resource.
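
As an illustration of the shape of that output (a minimal sketch with
hypothetical kernel and variable names; compare the MSL in the new
ComputeTest.cpp case below), the workgroup variable surfaces as a
[[threadgroup(N)]] entry-point parameter:

    #include <metal_stdlib>
    using namespace metal;

    // Sketch: what was a WGSL `var<workgroup>` becomes an entry-point
    // parameter rather than a function-local declaration.
    kernel void zeroScratch(uint3 localId [[thread_position_in_threadgroup]],
                            threadgroup uint& localScratch [[threadgroup(0)]]) {
        if (localId.x == 0u) {
            localScratch = 0u;
        }
    }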

Unlike device-memory buffers and textures, workgroup resources are not
represented by API objects. Metal instead requires that the size of these
buffers be specified by calling
[MTLComputeCommandEncoder setThreadgroupMemoryLength:atIndex:].
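
The host assigns a byte size to each threadgroup binding index before the
dispatch is encoded; a minimal sketch (the `computeEncoder` variable is
assumed):

    // Reserve 4 bytes of threadgroup memory for the [[threadgroup(0)]] binding.
    [computeEncoder setThreadgroupMemoryLength:sizeof(uint32_t) atIndex:0];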

ComputeSteps now allow their subclasses to optionally specify a list of
workgroup buffer sizes and indices separately from buffer and texture
resources. The backend is responsible for making the appropriate API
calls (if any) when it processes a dispatch command submission.
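
For example, a subclass constructor might forward a single-entry workgroup
buffer list like this (a sketch with hypothetical names; the complete
pattern is in the new ComputeTest.cpp case below):

    // Reserves 4 bytes of workgroup memory at threadgroup binding index 0.
    CounterStep() : ComputeStep(
            /*name=*/"CounterStep",
            /*localDispatchSize=*/{256, 1, 1},
            /*resources=*/{},
            /*workgroupBuffers=*/{
                {/*size=*/sizeof(uint32_t), /*index=*/0u}
            },
            /*baseFlags=*/Flags::kSupportsNativeShader) {}
    // nativeShaderSource() and the remaining virtual overrides are elided.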

Bug: b/272520336
Change-Id: I285f3546ebb6745d1ea9bec55dd54d5a9b5e125c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/666636
Reviewed-by: Jim Van Verth <[email protected]>
Commit-Queue: Arman Uguray <[email protected]>
armansito authored and SkCQ committed Apr 28, 2023
1 parent 79052b1 commit a4f6149
Showing 7 changed files with 162 additions and 2 deletions.
4 changes: 3 additions & 1 deletion src/gpu/graphite/compute/ComputeStep.cpp
@@ -28,11 +28,13 @@ static uint32_t next_id() {
ComputeStep::ComputeStep(std::string_view name,
WorkgroupSize localDispatchSize,
SkSpan<const ResourceDesc> resources,
SkSpan<const WorkgroupBufferDesc> workgroupBuffers,
Flags baseFlags)
: fUniqueID(next_id())
, fFlags(baseFlags)
, fName(name)
-, fResources(resources.begin(), resources.end())
+, fResources(resources.data(), resources.size())
+, fWorkgroupBuffers(workgroupBuffers.data(), workgroupBuffers.size())
, fLocalDispatchSize(localDispatchSize) {
#ifdef SK_DEBUG
std::unordered_set<int> slots;
14 changes: 13 additions & 1 deletion src/gpu/graphite/compute/ComputeStep.h
@@ -11,6 +11,7 @@
#include "include/core/SkColorType.h"
#include "include/core/SkSize.h"
#include "include/core/SkSpan.h"
#include "include/private/base/SkTArray.h"
#include "src/core/SkEnumBitMask.h"
#include "src/gpu/graphite/ComputeTypes.h"

@@ -128,6 +129,14 @@ class ComputeStep {
: fType(type), fFlow(flow), fPolicy(policy), fSlot(slot) {}
};

// On platforms that support late-bound workgroup shared resources (e.g. Metal), a ComputeStep
// can optionally provide a list of memory sizes and binding indices.
struct WorkgroupBufferDesc {
// The buffer size in bytes.
size_t size;
// The threadgroup binding index that the buffer should be bound to.
size_t index;
};

virtual ~ComputeStep() = default;

// Returns a complete SkSL compute program. The returned SkSL must constitute a complete compute
@@ -218,6 +227,7 @@ class ComputeStep {
UniformManager*) const;

SkSpan<const ResourceDesc> resources() const { return SkSpan(fResources); }
SkSpan<const WorkgroupBufferDesc> workgroupBuffers() const { return SkSpan(fWorkgroupBuffers); }

// Identifier that can be used as part of a unique key for a compute pipeline state object
// associated with this `ComputeStep`.
@@ -253,6 +263,7 @@ class ComputeStep {
ComputeStep(std::string_view name,
WorkgroupSize localDispatchSize,
SkSpan<const ResourceDesc> resources,
SkSpan<const WorkgroupBufferDesc> workgroupBuffers = {},
Flags baseFlags = Flags::kNone);

private:
@@ -263,7 +274,8 @@
uint32_t fUniqueID;
SkEnumBitMask<Flags> fFlags;
std::string fName;
-std::vector<ResourceDesc> fResources;
+skia_private::TArray<ResourceDesc> fResources;
+skia_private::TArray<WorkgroupBufferDesc> fWorkgroupBuffers;

// TODO(b/240615224): Subclasses should simply specify the workgroup size that they need.
// The ComputeStep constructor should check and reduce that number based on the maximum
5 changes: 5 additions & 0 deletions src/gpu/graphite/compute/DispatchGroup.cpp
@@ -155,6 +155,11 @@ bool Builder::appendStep(const ComputeStep* step,
dispatch.fBindings.push_back({static_cast<BindingIndex>(bindingIndex), dispatchResource});
}

auto wgBufferDescs = step->workgroupBuffers();
if (!wgBufferDescs.empty()) {
dispatch.fWorkgroupBuffers.push_back_n(wgBufferDescs.size(), wgBufferDescs.data());
}

// We need to switch pipelines if this step uses a different pipeline from the previous step.
if (fObj->fPipelineDescs.empty() ||
fObj->fPipelineDescs.back().uniqueID() != step->uniqueID()) {
1 change: 1 addition & 0 deletions src/gpu/graphite/compute/DispatchGroup.h
@@ -56,6 +56,7 @@ class DispatchGroup final {
struct Dispatch {
ComputePassDesc fParams;
skia_private::TArray<ResourceBinding> fBindings;
skia_private::TArray<ComputeStep::WorkgroupBufferDesc> fWorkgroupBuffers;
int fPipelineIndex = 0;
};

4 changes: 4 additions & 0 deletions src/gpu/graphite/mtl/MtlCommandBuffer.mm
@@ -139,6 +139,10 @@
this->bindTexture(group->getTexture(*texIdx), binding.fIndex);
}
}
SkASSERT(fActiveComputeCommandEncoder);
for (const ComputeStep::WorkgroupBufferDesc& wgBuf : dispatch.fWorkgroupBuffers) {
fActiveComputeCommandEncoder->setThreadgroupMemoryLength(wgBuf.size, wgBuf.index);
}
this->dispatchThreadgroups(dispatch.fParams.fGlobalDispatchSize,
dispatch.fParams.fLocalDispatchSize);
}
4 changes: 4 additions & 0 deletions src/gpu/graphite/mtl/MtlComputeCommandEncoder.h
@@ -88,6 +88,10 @@ class MtlComputeCommandEncoder : public Resource {
}
}

void setThreadgroupMemoryLength(NSUInteger length, NSUInteger index) {
[(*fCommandEncoder) setThreadgroupMemoryLength:length atIndex:index];
}

void dispatchThreadgroups(const WorkgroupSize& globalSize, const WorkgroupSize& localSize) {
MTLSize threadgroupCount =
MTLSizeMake(globalSize.fWidth, globalSize.fHeight, globalSize.fDepth);
132 changes: 132 additions & 0 deletions tests/graphite/ComputeTest.cpp
@@ -1512,6 +1512,7 @@ DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal, reporter, c
/*slot=*/0,
}
},
/*workgroupBuffers=*/{},
/*baseFlags=*/Flags::kSupportsNativeShader) {}
~TestComputeStep() override = default;

@@ -1615,3 +1616,134 @@ DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal, reporter, c
kExpectedCount,
result);
}

DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_WorkgroupBufferDescMetal, reporter, context) {
std::unique_ptr<Recorder> recorder = context->makeRecorder();

constexpr uint32_t kWorkgroupCount = 32;
constexpr uint32_t kWorkgroupSize = 1024;

class TestComputeStep : public ComputeStep {
public:
TestComputeStep() : ComputeStep(
/*name=*/"TestAtomicOperationsMetal",
/*localDispatchSize=*/{kWorkgroupSize, 1, 1},
/*resources=*/{
{
/*type=*/ResourceType::kStorageBuffer,
/*flow=*/DataFlow::kShared,
/*policy=*/ResourcePolicy::kMapped,
/*slot=*/0,
}
},
/*workgroupBuffers=*/{
{
/*size=*/sizeof(uint32_t),
/*index=*/0u,
}
},
/*baseFlags=*/Flags::kSupportsNativeShader) {}
~TestComputeStep() override = default;

// This is the same MSL kernel as in Compute_NativeShaderSourceMetal, except `localCounter`
// is an entry-point parameter instead of a local variable. This forces the workgroup
// binding to be encoded explicitly in the command encoder.
NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
SkASSERT(format == NativeShaderFormat::kMSL);
static constexpr std::string_view kSource = R"(
#include <metal_stdlib>
using namespace metal;
kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
device atomic_uint& globalCounter [[buffer(0)]],
threadgroup atomic_uint& localCounter [[threadgroup(0)]]) {
// Initialize the local counter.
if (localId.x == 0u) {
atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
}
// Synchronize the threads in the workgroup so they all see the initial value.
threadgroup_barrier(mem_flags::mem_threadgroup);
// All threads increment the counter.
atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
// Synchronize the threads again to ensure they have all executed the increment
// and the following load reads the same value across all threads in the
// workgroup.
threadgroup_barrier(mem_flags::mem_threadgroup);
// Add the workgroup-only tally to the global counter.
if (localId.x == 0u) {
uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
}
}
)";
return {SkSpan(reinterpret_cast<const uint8_t*>(kSource.data()), kSource.length()),
"atomicCount"};
}

size_t calculateBufferSize(const DrawParams&,
int index,
const ResourceDesc& r) const override {
SkASSERT(index == 0);
SkASSERT(r.fSlot == 0);
SkASSERT(r.fFlow == DataFlow::kShared);
return sizeof(uint32_t);
}

WorkgroupSize calculateGlobalDispatchSize(const DrawParams&) const override {
return WorkgroupSize(kWorkgroupCount, 1, 1);
}

void prepareStorageBuffer(const DrawParams&,
int ssboIndex,
int resourceIndex,
const ResourceDesc& r,
void* buffer,
size_t bufferSize) const override {
SkASSERT(resourceIndex == 0);
*static_cast<uint32_t*>(buffer) = 0;
}
} step;

DispatchGroup::Builder builder(recorder.get());
builder.appendStep(&step, fake_draw_params_for_testing(), 0);

BindBufferInfo info = builder.getSharedBufferResource(0);
if (!info) {
ERRORF(reporter, "shared resource at slot 0 is missing");
return;
}

// Record the compute pass task.
ComputeTask::DispatchGroupList groups;
groups.push_back(builder.finalize());
recorder->priv().add(ComputeTask::Make(std::move(groups)));

// Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
recorder->priv().add(SynchronizeToCpuTask::Make(sk_ref_sp(info.fBuffer)));

// Submit the work and wait for it to complete.
std::unique_ptr<Recording> recording = recorder->snap();
if (!recording) {
ERRORF(reporter, "Failed to make recording");
return;
}

InsertRecordingInfo insertInfo;
insertInfo.fRecording = recording.get();
context->insertRecording(insertInfo);
context->submit(SyncToCpu::kYes);

// Verify the contents of the output buffer.
constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
const uint32_t result = static_cast<const uint32_t*>(map_bind_buffer(info))[0];
REPORTER_ASSERT(reporter,
result == kExpectedCount,
"expected '%d', found '%d'",
kExpectedCount,
result);
}
