[graphite][compute] Add mechanism for late-bound workgroup memory
naga-generated MSL declares WGSL workgroup shared memory buffers as
entry-point function parameters. This triggers a Metal API validation
error unless they are initialized like a late-bound resource.
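
As an illustration of the shape of that output (a minimal sketch with
hypothetical kernel and variable names; compare the MSL in the new
ComputeTest.cpp case below), the workgroup variable surfaces as a
[[threadgroup(N)]] entry-point parameter:

    #include <metal_stdlib>
    using namespace metal;

    // Sketch: what was a WGSL `var<workgroup>` becomes an entry-point
    // parameter rather than a function-local declaration.
    kernel void zeroScratch(uint3 localId [[thread_position_in_threadgroup]],
                            threadgroup uint& localScratch [[threadgroup(0)]]) {
        if (localId.x == 0u) {
            localScratch = 0u;
        }
    }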

Unlike device-memory buffers and textures, workgroup resources are not
represented by API objects. Metal instead requires that the size of these
buffers be specified by calling
[MTLComputeCommandEncoder setThreadgroupMemoryLength:atIndex:].
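
The host assigns a byte size to each threadgroup binding index before the
dispatch is encoded; a minimal sketch (the `computeEncoder` variable is
assumed):

    // Reserve 4 bytes of threadgroup memory for the [[threadgroup(0)]] binding.
    [computeEncoder setThreadgroupMemoryLength:sizeof(uint32_t) atIndex:0];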

ComputeSteps now allow their subclasses to optionally specify a list of
workgroup buffer sizes and indices separately from buffer and texture
resources. The backend is responsible for making the appropriate API
calls (if any) when it processes a dispatch command submission.
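
For example, a subclass constructor might forward a single-entry workgroup
buffer list like this (a sketch with hypothetical names; the complete
pattern is in the new ComputeTest.cpp case below):

    // Reserves 4 bytes of workgroup memory at threadgroup binding index 0.
    CounterStep() : ComputeStep(
            /*name=*/"CounterStep",
            /*localDispatchSize=*/{256, 1, 1},
            /*resources=*/{},
            /*workgroupBuffers=*/{
                {/*size=*/sizeof(uint32_t), /*index=*/0u}
            },
            /*baseFlags=*/Flags::kSupportsNativeShader) {}
    // nativeShaderSource() and the remaining virtual overrides are elided.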

Bug: b/272520336
Change-Id: I285f3546ebb6745d1ea9bec55dd54d5a9b5e125c
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/666636
Reviewed-by: Jim Van Verth <[email protected]>
Commit-Queue: Arman Uguray <[email protected]>
armansito authored and SkCQ committed Apr 28, 2023
1 parent 79052b1 commit a4f6149
Showing 7 changed files with 162 additions and 2 deletions.
4 changes: 3 additions & 1 deletion src/gpu/graphite/compute/ComputeStep.cpp
@@ -28,11 +28,13 @@ static uint32_t next_id() {
ComputeStep::ComputeStep(std::string_view name,
WorkgroupSize localDispatchSize,
SkSpan<const ResourceDesc> resources,
SkSpan<const WorkgroupBufferDesc> workgroupBuffers,
Flags baseFlags)
: fUniqueID(next_id())
, fFlags(baseFlags)
, fName(name)
-, fResources(resources.begin(), resources.end())
+, fResources(resources.data(), resources.size())
+, fWorkgroupBuffers(workgroupBuffers.data(), workgroupBuffers.size())
, fLocalDispatchSize(localDispatchSize) {
#ifdef SK_DEBUG
std::unordered_set<int> slots;
14 changes: 13 additions & 1 deletion src/gpu/graphite/compute/ComputeStep.h
@@ -11,6 +11,7 @@
#include "include/core/SkColorType.h"
#include "include/core/SkSize.h"
#include "include/core/SkSpan.h"
#include "include/private/base/SkTArray.h"
#include "src/core/SkEnumBitMask.h"
#include "src/gpu/graphite/ComputeTypes.h"

@@ -128,6 +129,14 @@ class ComputeStep {
: fType(type), fFlow(flow), fPolicy(policy), fSlot(slot) {}
};

// On platforms that support late-bound workgroup shared resources (e.g. Metal), a ComputeStep
// can optionally provide a list of memory sizes and binding indices.
struct WorkgroupBufferDesc {
// The buffer size in bytes.
size_t size;
// The threadgroup binding index that the buffer should be bound to.
size_t index;
};

virtual ~ComputeStep() = default;

// Returns a complete SkSL compute program. The returned SkSL must constitute a complete compute
@@ -218,6 +227,7 @@ class ComputeStep {
UniformManager*) const;

SkSpan<const ResourceDesc> resources() const { return SkSpan(fResources); }
SkSpan<const WorkgroupBufferDesc> workgroupBuffers() const { return SkSpan(fWorkgroupBuffers); }

// Identifier that can be used as part of a unique key for a compute pipeline state object
// associated with this `ComputeStep`.
@@ -253,6 +263,7 @@ class ComputeStep {
ComputeStep(std::string_view name,
WorkgroupSize localDispatchSize,
SkSpan<const ResourceDesc> resources,
SkSpan<const WorkgroupBufferDesc> workgroupBuffers = {},
Flags baseFlags = Flags::kNone);

private:
@@ -263,7 +274,8 @@
uint32_t fUniqueID;
SkEnumBitMask<Flags> fFlags;
std::string fName;
-std::vector<ResourceDesc> fResources;
+skia_private::TArray<ResourceDesc> fResources;
+skia_private::TArray<WorkgroupBufferDesc> fWorkgroupBuffers;

// TODO(b/240615224): Subclasses should simply specify the workgroup size that they need.
// The ComputeStep constructor should check and reduce that number based on the maximum
5 changes: 5 additions & 0 deletions src/gpu/graphite/compute/DispatchGroup.cpp
@@ -155,6 +155,11 @@ bool Builder::appendStep(const ComputeStep* step,
dispatch.fBindings.push_back({static_cast<BindingIndex>(bindingIndex), dispatchResource});
}

auto wgBufferDescs = step->workgroupBuffers();
if (!wgBufferDescs.empty()) {
dispatch.fWorkgroupBuffers.push_back_n(wgBufferDescs.size(), wgBufferDescs.data());
}

// We need to switch pipelines if this step uses a different pipeline from the previous step.
if (fObj->fPipelineDescs.empty() ||
fObj->fPipelineDescs.back().uniqueID() != step->uniqueID()) {
1 change: 1 addition & 0 deletions src/gpu/graphite/compute/DispatchGroup.h
@@ -56,6 +56,7 @@ class DispatchGroup final {
struct Dispatch {
ComputePassDesc fParams;
skia_private::TArray<ResourceBinding> fBindings;
skia_private::TArray<ComputeStep::WorkgroupBufferDesc> fWorkgroupBuffers;
int fPipelineIndex = 0;
};

4 changes: 4 additions & 0 deletions src/gpu/graphite/mtl/MtlCommandBuffer.mm
@@ -139,6 +139,10 @@
this->bindTexture(group->getTexture(*texIdx), binding.fIndex);
}
}
SkASSERT(fActiveComputeCommandEncoder);
for (const ComputeStep::WorkgroupBufferDesc& wgBuf : dispatch.fWorkgroupBuffers) {
fActiveComputeCommandEncoder->setThreadgroupMemoryLength(wgBuf.size, wgBuf.index);
}
this->dispatchThreadgroups(dispatch.fParams.fGlobalDispatchSize,
dispatch.fParams.fLocalDispatchSize);
}
4 changes: 4 additions & 0 deletions src/gpu/graphite/mtl/MtlComputeCommandEncoder.h
@@ -88,6 +88,10 @@ class MtlComputeCommandEncoder : public Resource {
}
}

void setThreadgroupMemoryLength(NSUInteger length, NSUInteger index) {
[(*fCommandEncoder) setThreadgroupMemoryLength:length atIndex:index];
}

void dispatchThreadgroups(const WorkgroupSize& globalSize, const WorkgroupSize& localSize) {
MTLSize threadgroupCount =
MTLSizeMake(globalSize.fWidth, globalSize.fHeight, globalSize.fDepth);
132 changes: 132 additions & 0 deletions tests/graphite/ComputeTest.cpp
@@ -1512,6 +1512,7 @@ DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal, reporter, c
/*slot=*/0,
}
},
/*workgroupBuffers=*/{},
/*baseFlags=*/Flags::kSupportsNativeShader) {}
~TestComputeStep() override = default;

@@ -1615,3 +1616,134 @@ DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal, reporter, c
kExpectedCount,
result);
}

DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_WorkgroupBufferDescMetal, reporter, context) {
std::unique_ptr<Recorder> recorder = context->makeRecorder();

constexpr uint32_t kWorkgroupCount = 32;
constexpr uint32_t kWorkgroupSize = 1024;

class TestComputeStep : public ComputeStep {
public:
TestComputeStep() : ComputeStep(
/*name=*/"TestAtomicOperationsMetal",
/*localDispatchSize=*/{kWorkgroupSize, 1, 1},
/*resources=*/{
{
/*type=*/ResourceType::kStorageBuffer,
/*flow=*/DataFlow::kShared,
/*policy=*/ResourcePolicy::kMapped,
/*slot=*/0,
}
},
/*workgroupBuffers=*/{
{
/*size=*/sizeof(uint32_t),
/*index=*/0u,
}
},
/*baseFlags=*/Flags::kSupportsNativeShader) {}
~TestComputeStep() override = default;

// This is the same MSL kernel as in Compute_NativeShaderSourceMetal, except `localCounter`
// is an entry-point parameter instead of a local variable. This forces the workgroup
// binding to be encoded explicitly in the command encoder.
NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
SkASSERT(format == NativeShaderFormat::kMSL);
static constexpr std::string_view kSource = R"(
#include <metal_stdlib>
using namespace metal;
kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
device atomic_uint& globalCounter [[buffer(0)]],
threadgroup atomic_uint& localCounter [[threadgroup(0)]]) {
// Initialize the local counter.
if (localId.x == 0u) {
atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
}
// Synchronize the threads in the workgroup so they all see the initial value.
threadgroup_barrier(mem_flags::mem_threadgroup);
// All threads increment the counter.
atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
// Synchronize the threads again to ensure they have all executed the increment
// and the following load reads the same value across all threads in the
// workgroup.
threadgroup_barrier(mem_flags::mem_threadgroup);
// Add the workgroup-only tally to the global counter.
if (localId.x == 0u) {
uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
}
}
)";
return {SkSpan(reinterpret_cast<const uint8_t*>(kSource.data()), kSource.length()),
"atomicCount"};
}

size_t calculateBufferSize(const DrawParams&,
int index,
const ResourceDesc& r) const override {
SkASSERT(index == 0);
SkASSERT(r.fSlot == 0);
SkASSERT(r.fFlow == DataFlow::kShared);
return sizeof(uint32_t);
}

WorkgroupSize calculateGlobalDispatchSize(const DrawParams&) const override {
return WorkgroupSize(kWorkgroupCount, 1, 1);
}

void prepareStorageBuffer(const DrawParams&,
int ssboIndex,
int resourceIndex,
const ResourceDesc& r,
void* buffer,
size_t bufferSize) const override {
SkASSERT(resourceIndex == 0);
*static_cast<uint32_t*>(buffer) = 0;
}
} step;

DispatchGroup::Builder builder(recorder.get());
builder.appendStep(&step, fake_draw_params_for_testing(), 0);

BindBufferInfo info = builder.getSharedBufferResource(0);
if (!info) {
ERRORF(reporter, "shared resource at slot 0 is missing");
return;
}

// Record the compute pass task.
ComputeTask::DispatchGroupList groups;
groups.push_back(builder.finalize());
recorder->priv().add(ComputeTask::Make(std::move(groups)));

// Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
recorder->priv().add(SynchronizeToCpuTask::Make(sk_ref_sp(info.fBuffer)));

// Submit the work and wait for it to complete.
std::unique_ptr<Recording> recording = recorder->snap();
if (!recording) {
ERRORF(reporter, "Failed to make recording");
return;
}

InsertRecordingInfo insertInfo;
insertInfo.fRecording = recording.get();
context->insertRecording(insertInfo);
context->submit(SyncToCpu::kYes);

// Verify the contents of the output buffer.
constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
const uint32_t result = static_cast<const uint32_t*>(map_bind_buffer(info))[0];
REPORTER_ASSERT(reporter,
result == kExpectedCount,
"expected '%d', found '%d'",
kExpectedCount,
result);
}
