[ExecuTorch][WebGPU] GPU timestamp query profiling for SDPA

JCNTH · JCNTH · commit 99ec14f7c9b9 · 2026-06-10T14:37:12.000-07:00
Pull Request resolved: #20167 SDPA-specific instrumentation layered on the general GPU-timestamp infrastructure (companion diff below): tag each fused SDPA dispatch with its `kernel_name` so the `WebGPUQueryPool` can attribute on-GPU time to the attention stage that produced it. `sdpa_with_kv_cache` runs four chained dispatches — cache update (`update_cache`) -> QK (`sdpa_compute_attn_weights`) -> softmax (`sdpa_softmax`) -> AV (`sdpa_compute_out`), where the name in parentheses is the exact `kernel_name` label attached to that dispatch; `WebGPUGraph::execute()` brackets each compute pass with timestamps when the pool is active, and this diff labels each dispatch so the per-pass durations map back to the right stage. Opt-in via the `WEBGPU_TIMESTAMP_QUERY` env var; off by default, so the production `execute()` path is byte-identical. This is the per-kernel hook a forthcoming SDPA kernel benchmark will read; the benchmark itself (and any comparative numbers) is a separate follow-up. Co-authored with Claude. ghstack-source-id: 392093463 @exported-using-ghexport Differential Revision: [D107678235](https://our.internmc.facebook.com/intern/diff/D107678235/)
diff --git a/backends/webgpu/runtime/ops/sdpa/Sdpa.cpp b/backends/webgpu/runtime/ops/sdpa/Sdpa.cpp
@@ -156,7 +156,8 @@ void build_dispatch(
     uint64_t uniform_size,
     uint32_t workgroup_count_x,
     uint32_t wg_size,
-    bool retain_uniform = false) {
+    bool retain_uniform = false,
+    const char* kernel_name = "") {
   WGPUDevice device = graph.device();
 
   WGPUShaderSourceWGSL wgsl_desc = {};
@@ -227,7 +228,7 @@ void build_dispatch(
   bg_desc.entries = bg_entries;
   WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
 
-  graph.add_dispatch({pipeline, bind_group, workgroup_count_x});
+  graph.add_dispatch({pipeline, bind_group, workgroup_count_x, kernel_name});
 
   wgpuShaderModuleRelease(shader);
   wgpuBindGroupLayoutRelease(bgl);
@@ -269,7 +270,8 @@ static WGPUBuffer record_update_cache_dispatch(
       sizeof(uc),
       wgc,
       uc_wg,
-      dynamic_pos);
+      dynamic_pos,
+      "update_cache");
   return ubuf;
 }
 
@@ -473,7 +475,8 @@ void sdpa_with_kv_cache_impl(WebGPUGraph& graph, const std::vector<int>& args) {
         sizeof(p),
         wgc,
         qk_wg,
-        dynamic_pos);
+        dynamic_pos,
+        "sdpa_compute_attn_weights");
     qk_buf = ubuf;
     qk_idx = graph.num_dispatches() - 1;
   }
@@ -496,7 +499,8 @@ void sdpa_with_kv_cache_impl(WebGPUGraph& graph, const std::vector<int>& args) {
         sizeof(p),
         wgc,
         0,
-        dynamic_pos);
+        dynamic_pos,
+        "sdpa_softmax");
     softmax_buf = ubuf;
   }
 
@@ -521,7 +525,8 @@ void sdpa_with_kv_cache_impl(WebGPUGraph& graph, const std::vector<int>& args) {
         sizeof(p),
         wgc,
         av_wg,
-        dynamic_pos);
+        dynamic_pos,
+        "sdpa_compute_out");
     av_buf = ubuf;
   }