Commit 613b6c3

Refactor into ggml_cuda_should_use_topk_moe

1 parent 2141b8b

4 files changed: +45 -40 lines changed


ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 25 deletions
@@ -2835,29 +2835,9 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         }
 
         ggml_tensor * softmax = cgraph->nodes[node_idx];
-
-        float scale    = 1.0f;
-        float max_bias = 0.0f;
-
-        memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
-        memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
-
-        if (scale != 1.0f || max_bias != 0.0f) {
-            return false;
-        }
-
-        // don't fuse when masks or sinks are present
-        if (softmax->src[1] || softmax->src[2]) {
-            return false;
-        }
-
-        const int n_expert = softmax->ne[0];
-        // n_expert must be a power of 2
-        if (n_expert & (n_expert - 1) != 0 || n_expert > 512) {
-            return false;
+        if (ggml_cuda_should_use_topk_moe(softmax)) {
+            return true;
         }
-
-        return true;
     }
 
     if (!ggml_can_fuse(cgraph, node_idx, ops)) {
@@ -2927,8 +2907,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         return true;
     }
 
-
-
     return false;
 }
 
@@ -3010,7 +2988,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
                 continue;
             }
-
         }
 #ifndef NDEBUG
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
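
The scale and max_bias checks that the first hunk moves out of ggml_cuda_can_fuse read their values from softmax->op_params, GGML's per-node parameter storage: ggml_soft_max_ext packs the two floats at the front of that array, and consumers copy them back out with memcpy because op_params is declared as an int32_t array. A minimal sketch of that read pattern, assuming the ggml headers are on the include path; the helper name read_soft_max_params is illustrative and not part of this commit:

#include <string.h>
#include "ggml.h"

// Illustrative only: soft_max stores scale at op_params[0] and max_bias at
// op_params[1]; memcpy copies the float bytes out of the int32_t-typed storage
// instead of dereferencing a reinterpreted float pointer.
static void read_soft_max_params(const ggml_tensor * softmax, float & scale, float & max_bias) {
    scale    = 1.0f;
    max_bias = 0.0f;
    memcpy(&scale,    (const float *) softmax->op_params + 0, sizeof(float));
    memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
}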

ggml/src/ggml-cuda/topk-moe.cu

Lines changed: 39 additions & 13 deletions
@@ -1,3 +1,4 @@
+#include "ggml.h"
 #include "topk-moe.cuh"
 
 /*
@@ -10,10 +11,10 @@
 */
 template <size_t n_experts>
 __global__ void topk_moe_cuda(const float * logits,
-                              float *       weights,
-                              int32_t *     ids,
-                              const int     n_rows,
-                              const int     n_expert_used) {
+                              float *       weights,
+                              int32_t *     ids,
+                              const int     n_rows,
+                              const int     n_expert_used) {
     const int row = blockIdx.x * blockDim.y + threadIdx.y;
     if (row >= n_rows) {
         return;
@@ -94,12 +95,12 @@ __global__ void topk_moe_cuda(const float * logits,
 }
 
 static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
-                                 const float * logits,
-                                 float *       weights,
-                                 int32_t *     ids,
-                                 const int     n_rows,
-                                 const int     n_expert,
-                                 const int     n_expert_used) {
+                                 const float * logits,
+                                 float *       weights,
+                                 int32_t *     ids,
+                                 const int     n_rows,
+                                 const int     n_expert,
+                                 const int     n_expert_used) {
     const int rows_per_block = 4;
     dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
     dim3 block_dims(32, rows_per_block, 1);
@@ -143,9 +144,9 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
 }
 
 void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
-                           ggml_tensor * logits,
-                           ggml_tensor * weights,
-                           ggml_tensor * ids) {
+                           const ggml_tensor * logits,
+                           ggml_tensor *       weights,
+                           ggml_tensor *       ids) {
     GGML_ASSERT(logits->type == GGML_TYPE_F32);
     GGML_ASSERT(weights->type == GGML_TYPE_F32);
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -163,3 +164,28 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
 
     launch_topk_moe_cuda(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
 }
+
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax) {
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
+
+    if (scale != 1.0f || max_bias != 0.0f) {
+        return false;
+    }
+
+    // don't fuse when masks or sinks are present
+    if (softmax->src[1] || softmax->src[2]) {
+        return false;
+    }
+
+    const int n_expert = softmax->ne[0];
+    // n_expert must be a power of 2
+    if ((n_expert & (n_expert - 1)) != 0 || n_expert > 512) {
+        return false;
+    }
+
+    return true;
+}
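
One detail worth noting beyond the code motion: the helper parenthesizes the power-of-two test. The inline version removed from ggml-cuda.cu reads n_expert & (n_expert - 1) != 0, and because != binds tighter than & in C++, that groups as n_expert & ((n_expert - 1) != 0), which only inspects the lowest bit of n_expert, whereas the (n_expert & (n_expert - 1)) != 0 form above is the usual power-of-two check. A standalone sketch, not part of the commit, contrasting the two groupings on a few expert counts:

#include <stdio.h>

// Illustrative only: rejected_old spells out how the un-parenthesized expression
// groups, which rejects odd n_expert but accepts any even value; rejected_new
// matches the check in ggml_cuda_should_use_topk_moe above.
static bool rejected_old(int n_expert) { return (n_expert & ((n_expert - 1) != 0)) || n_expert > 512; }
static bool rejected_new(int n_expert) { return ((n_expert & (n_expert - 1)) != 0) || n_expert > 512; }

int main(void) {
    const int counts[] = { 8, 60, 96, 128 };
    for (int n_expert : counts) {
        printf("n_expert=%3d  old grouping rejects: %d  new check rejects: %d\n",
               n_expert, (int) rejected_old(n_expert), (int) rejected_new(n_expert));
    }
    return 0;
}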

ggml/src/ggml-cuda/topk-moe.cuh

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
 #include "common.cuh"
 
-void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx, ggml_tensor * logits, ggml_tensor * weights, ggml_tensor * top_k);
+void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx, const ggml_tensor * logits, ggml_tensor * weights, ggml_tensor * top_k);
+
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax);
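
Taken together, the two declarations give the backend a gate-then-dispatch pattern: probe a soft_max node with ggml_cuda_should_use_topk_moe and only then run the fused kernel via ggml_cuda_op_topk_moe. A rough usage sketch, assuming the ggml CUDA backend sources are on the include path; the wrapper name try_fused_topk_moe and the assumption that the soft_max node's first source holds the raw router logits are mine, not part of this commit:

#include "ggml.h"
#include "topk-moe.cuh"

// Illustrative wrapper, not upstream code: check eligibility first, then launch
// the fused softmax + top-k path; returning false lets the caller fall back to
// the unfused soft_max / argsort graph.
static bool try_fused_topk_moe(ggml_backend_cuda_context & ctx,
                               ggml_tensor * softmax,
                               ggml_tensor * weights,
                               ggml_tensor * ids) {
    if (!ggml_cuda_should_use_topk_moe(softmax)) {
        return false;
    }
    // assumption: src[0] of the soft_max node is the pre-softmax logits tensor
    ggml_cuda_op_topk_moe(ctx, softmax->src[0], weights, ids);
    return true;
}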

tests/test-backend-ops.cpp

Lines changed: 1 addition & 1 deletion
@@ -4419,7 +4419,7 @@ struct test_topk_moe: public test_case {
 
     std::string op_desc(ggml_tensor * t) override {
         GGML_UNUSED(t);
-        return "TOPK_GATED_MOE";
+        return "TOPK_MOE";
     }
 
     bool run_whole_graph() override { return true; }
