hw-native-sys · ChaoZheng109 · May 13, 2026
diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h
@@ -77,6 +77,18 @@ constexpr int PLATFORM_MAX_AIV_PER_THREAD = PLATFORM_MAX_BLOCKDIM * PLATFORM_AIV
 
 constexpr int PLATFORM_MAX_CORES_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD + PLATFORM_MAX_AIV_PER_THREAD;  // 108
 
+// AICore UB reservation for the legacy SIMT launch path.
+//
+// rtKernelLaunchWithHandleV2 + rtRegisterAllKernel checks
+// kernel->ShareMemSize_() + cfg.localMemorySize against
+// RT_SIMT_REMAIN_UB_SIZE (224 KB = 256 KB UB − 32 KB dcache). The kernel
+// advertises PLATFORM_AICORE_SHARE_MEM_SIZE via the SIMT TLV record
+// injected in onboard/aicore/kernel.cpp; the host passes
+// PLATFORM_AICORE_LOCAL_MEMORY_SIZE through cfg.localMemorySize. They sum
+// to exactly 224 KB; runtime's check is strict >, so equality is accepted.
+constexpr uint32_t PLATFORM_AICORE_SHARE_MEM_SIZE = 8 * 1024;       // 8 KB
+constexpr uint32_t PLATFORM_AICORE_LOCAL_MEMORY_SIZE = 216 * 1024;  // 216 KB
+
 // =============================================================================
 // Performance Profiling Configuration
 // =============================================================================

diff --git a/src/a5/platform/onboard/aicore/CMakeLists.txt b/src/a5/platform/onboard/aicore/CMakeLists.txt
@@ -57,6 +57,7 @@ set(AICORE_FLAGS
     -mllvm -cce-aicore-record-overflow=false \
     -mllvm -cce-aicore-addr-transform \
     -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    -mllvm -cce-dyn-kernel-stack-size=false \
     ${CMAKE_CUSTOM_INCLUDE_DIR_FLAGS}"
 )
 separate_arguments(AICORE_FLAGS)

diff --git a/src/a5/platform/onboard/aicore/kernel.cpp b/src/a5/platform/onboard/aicore/kernel.cpp
@@ -13,6 +13,8 @@
  */
 #include "aicore/aicore.h"
 #include "common/core_type.h"
+#include "common/platform_config.h"
+#include "simt_meta.h"
 
 class Runtime;
 
@@ -33,6 +35,20 @@ class Runtime;
 
 extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type);
 
+// Derive the section name from the same KERNEL_ENTRY macro that mangles the
+// entry symbol, so the meta section name cannot drift if the suffix scheme
+// changes. STRINGIFY needs two levels to expand the macro before stringizing.
+#define SIMPLER_STRINGIFY_(x) #x
+#define SIMPLER_STRINGIFY(x) SIMPLER_STRINGIFY_(x)
+#define KERNEL_META_SECTION(func) ".ascend.meta." SIMPLER_STRINGIFY(KERNEL_ENTRY(func))
+
+#ifdef __DAV_VEC__
+static const FuncLevelMeta func_simt_section __attribute__((used, section(KERNEL_META_SECTION(aicore_kernel)))) = {
+    {{F_TYPE_COMPILER_ALLOC_UB_SIZE, sizeof(unsigned int)}, PLATFORM_AICORE_SHARE_MEM_SIZE},
+    {{F_TYPE_AIV_TYPE_FLAG, sizeof(unsigned int)}, AIV_TYPE_SIMD_SIMT_MIX_VF},
+};
+#endif
+
 /**
  * Kernel entry point with control loop
  *

diff --git a/src/a5/platform/onboard/aicore/simt_meta.h b/src/a5/platform/onboard/aicore/simt_meta.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file simt_meta.h
+ * @brief SIMT metadata TLV records for AICore kernel ELF (onboard / a5)
+ *
+ * The legacy launch path (rtKernelLaunchWithHandleV2 + rtRegisterAllKernel)
+ * requires the kernel ELF to carry two TLV records that runtime reads at
+ * register time:
+ *   - F_TYPE_COMPILER_ALLOC_UB_SIZE (7)  -> Kernel::shareMemSize_
+ *   - F_TYPE_AIV_TYPE_FLAG          (12) -> Kernel::kernelVfType_
+ * bisheng emits these only when it can statically infer the kernel uses
+ * SIMT intrinsics. Our entry is an SU dispatcher (vector ops live in task
+ * .o files invoked through aicore_execute), so the compiler cannot tag it.
+ *
+ * kernel.cpp's CMakeLists.txt pairs the hand-written record with
+ * `-mllvm -cce-dyn-kernel-stack-size=false`, which stops bisheng from
+ * auto-emitting a sibling `.ascend.meta.<funcname>` section. Without that
+ * flag, runtime's parser (kernelInfoMap keyed by section name) would
+ * overwrite our values with bisheng's NO_VF / shareMemSize=0 defaults.
+ *
+ * TLV type IDs mirror RT_FUNCTION_TYPE_COMPILER_ALLOC_UB_SIZE (7) and
+ * RT_FUNCTION_TYPE_AIV_TYPE_FLAG (12) in CANN's runtime/runtime/elf_base.h.
+ * That header is host-side (extern "C", part of the runtime API) so we
+ * re-declare the two values we need rather than pull runtime headers into
+ * an AICore device-side TU.
+ */
+
+#ifndef PLATFORM_A5_AICORE_SIMT_META_H_
+#define PLATFORM_A5_AICORE_SIMT_META_H_
+
+enum FuncMetaType {
+    F_TYPE_COMPILER_ALLOC_UB_SIZE = 7,  // mirrors RT_FUNCTION_TYPE_COMPILER_ALLOC_UB_SIZE
+    F_TYPE_AIV_TYPE_FLAG = 12,          // mirrors RT_FUNCTION_TYPE_AIV_TYPE_FLAG
+};
+
+// AIVType values are not exposed in any CANN C/C++ header. The canonical
+// source is CANN's compiler-side Python script
+// (python/site-packages/tbe/tikcpp/ascendc_identify_meta_section_info.py),
+// which is what bisheng / asc_op_compiler consult when classifying kernels.
+enum AIVType {
+    AIV_TYPE_NO_VF = 1,
+    AIV_TYPE_SIMD_VF_ONLY = 2,
+    AIV_TYPE_SIMT_VF_ONLY = 3,
+    AIV_TYPE_SIMD_SIMT_MIX_VF = 4,
+};
+
+struct TlvHeader {
+    unsigned short type;  // TLV record ID; kernel.cpp stores a FuncMetaType value here
+    unsigned short len;   // payload size in bytes; kernel.cpp stores sizeof(unsigned int)
+};
+
+struct FuncMetaCompilerUbSize {
+    TlvHeader head;
+    unsigned int ub_size;
+};
+
+struct FuncMetaAivTypeFlag {
+    TlvHeader head;
+    unsigned int aiv_type;
+};
+
+struct FuncLevelMeta {
+    FuncMetaCompilerUbSize ub_size_meta;
+    FuncMetaAivTypeFlag aiv_type_meta;
+};
+
+#endif  // PLATFORM_A5_AICORE_SIMT_META_H_
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
@@ -1017,6 +1017,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime *runtime) {
 
     rtTaskCfgInfo_t cfg = {};
     cfg.schemMode = RT_SCHEM_MODE_BATCH;
+    cfg.localMemorySize = PLATFORM_AICORE_LOCAL_MEMORY_SIZE;
 
     rc = rtKernelLaunchWithHandleV2(bin_handle, 0, block_dim_, &rt_args, nullptr, stream, &cfg);
     if (rc != RT_ERROR_NONE) {

diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Minimal SIMT element-scatter kernel (AIV).
+//
+// Distilled from the ptoas-generated mscatter reference. Drops cosmetic
+// noise (v1..v30 names, dummy v4/v5/v6/v7 constants, explicit
+// Layout::ND, the verbose Tile template tail, GM-offset arithmetic that
+// always reduces to zero, the ptoas_auto_sync_tail wrapper).
+//
+// Kept on purpose:
+//   - per-data 3-tile alias pattern (TLOAD binds one tile, MSCATTER
+//     reads from another aliased to the same UB address; a single-tile
+//     form has reproduced golden mismatches on hw)
+//   - `set_mask_norm` / `set_vector_mask` SIMT mask init
+//   - `MTE2 → V` sync before MSCATTER (the ptoas default `MTE2 → MTE3`
+//     silently drops the scatter on a5 hw)
+//   - `__DAV_VEC__` guard so the AIC variant compiles to a no-op
+//
+// Operation: out[idx[r, c]] = src[r, c] for an 8x32 source and 256-slot
+// destination.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+#include "pipe_sync.h"
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+static constexpr int TILE_ROWS = 8;
+static constexpr int TILE_COLS = 32;
+static constexpr int DST_LEN = TILE_ROWS * TILE_COLS;  // 256
+static constexpr int SRC_TILE_BYTES = TILE_ROWS * TILE_COLS * sizeof(float);
+
+static __aicore__ void simt_scatter_impl(__gm__ float *src, __gm__ int32_t *idx, __gm__ float *out) {
+    using SrcTile = Tile<TileType::Vec, float, TILE_ROWS, TILE_COLS, BLayout::RowMajor, -1, -1>;
+    using IdxTile = Tile<TileType::Vec, int32_t, TILE_ROWS, TILE_COLS, BLayout::RowMajor, -1, -1>;
+
+    using TileShape = Shape<1, 1, 1, TILE_ROWS, TILE_COLS>;
+    using TileStride = pto::Stride<DST_LEN, DST_LEN, DST_LEN, TILE_COLS, 1>;
+    using SrcGT = GlobalTensor<float, TileShape, TileStride>;
+    using IdxGT = GlobalTensor<int32_t, TileShape, TileStride>;
+
+    using DstShape = Shape<1, 1, 1, 1, DST_LEN>;
+    using DstStride = pto::Stride<DST_LEN, DST_LEN, DST_LEN, DST_LEN, 1>;
+    using DstGT = GlobalTensor<float, DstShape, DstStride>;
+
+    // Per-data 3-tile alias pattern (all three are bound to one UB location):
+    //   *_loader  — bound directly at the UB offset; only supplies data()
+    //   *_scatter — bound via the loader's data() pointer; consumed by MSCATTER
+    //   *_anchor  — bound to the same offset literal; consumed by TLOAD
+    //               (binding order kept from the original ptoas sequence)
+    constexpr int SRC_UB = 0;
+    constexpr int IDX_UB = SRC_TILE_BYTES;
+
+    SrcTile src_loader(TILE_ROWS, TILE_COLS);
+    TASSIGN(src_loader, SRC_UB);
+    SrcTile src_scatter(TILE_ROWS, TILE_COLS);
+    TASSIGN(src_scatter, reinterpret_cast<uint64_t>(src_loader.data()));
+    SrcTile src_anchor(TILE_ROWS, TILE_COLS);
+    TASSIGN(src_anchor, static_cast<uint64_t>(SRC_UB));
+
+    IdxTile idx_loader(TILE_ROWS, TILE_COLS);
+    TASSIGN(idx_loader, IDX_UB);
+    IdxTile idx_scatter(TILE_ROWS, TILE_COLS);
+    TASSIGN(idx_scatter, reinterpret_cast<uint64_t>(idx_loader.data()));
+    IdxTile idx_anchor(TILE_ROWS, TILE_COLS);
+    TASSIGN(idx_anchor, static_cast<uint64_t>(IDX_UB));
+
+    SrcGT srcGlobal(src);
+    IdxGT idxGlobal(idx);
+    DstGT dstGlobal(out);
+
+    TLOAD(src_anchor, srcGlobal);
+    TLOAD(idx_anchor, idxGlobal);
+
+    // MTE2 → V before MSCATTER (critical: MTE2 → MTE3 silently drops the
+    // scatter on a5 hw).
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+    MSCATTER(dstGlobal, src_scatter, idx_scatter);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset;
+
+    __gm__ Tensor *idx_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ int32_t *idx = reinterpret_cast<__gm__ int32_t *>(idx_tensor->buffer.addr) + idx_tensor->start_offset;
+
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    simt_scatter_impl(src, idx, out);
+}
diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/orchestration/simt_basic_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/orchestration/simt_basic_orch.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SIMT basic orchestration: submit a single AIV SIMT scatter task.
+ *
+ * Args layout: [src, indices, out]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SIMT_SCATTER 0
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 3,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
+    Tensor src = from_tensor_arg(orch_args.tensor(0));
+    Tensor indices = from_tensor_arg(orch_args.tensor(1));
+    Tensor out = from_tensor_arg(orch_args.tensor(2));
+
+    // PTO2_SCOPE ensures rt_submit_aiv_task flushes through the task
+    // ringbuffer before the entry returns. No set_core_num — let the
+    // runtime use the config's block_dim, matching the ptoas-validated
+    // mscatter reference.
+    PTO2_SCOPE() {
+        Arg args;
+        args.add_input(src);
+        args.add_input(indices);
+        args.add_output(out);
+        rt_submit_aiv_task(FUNC_SIMT_SCATTER, args);
+    }
+}
+
+}  // extern "C"
diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SIMT basic element-scatter: minimal AIV scatter kernel that exercises the SIMT launch path.
+
+Config (block_dim=3, aicpu_thread_num=4, sequential identity indices)
+mirrors the ptoas-validated mscatter reference at
+mscatter_fp32_8x32_seq_20260513_140539/test_mscatter.py. Identity
+indices keep the golden trivially src-equals-out so a failure here
+points at the SIMT launch path itself (TLV injection, localMemorySize
+budget, sync) rather than at the scatter index semantics.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+TILE_ROWS = 8
+TILE_COLS = 32
+SRC_ELEMS = TILE_ROWS * TILE_COLS  # 256
+DST_LEN = SRC_ELEMS  # 256
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSimtBasic(SceneTestCase):
+    RTOL = 1e-5
+    ATOL = 1e-5
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/simt_basic_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SIMT_SCATTER",
+                "source": "kernels/aiv/kernel_simt_scatter.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        torch.manual_seed(0)
+        src = torch.randn(SRC_ELEMS, dtype=torch.float32)
+        # Identity indices (0..DST_LEN-1) — matches the ptoas reference and
+        # makes the golden trivially `out == src`. Switch to torch.randperm
+        # later once the baseline launch path is confirmed green.
+        indices = torch.arange(DST_LEN, dtype=torch.int32)
+        out = torch.zeros(DST_LEN, dtype=torch.float32)
+        return TaskArgsBuilder(
+            Tensor("src", src),
+            Tensor("indices", indices),
+            Tensor("out", out),
+        )
+
+    def compute_golden(self, args, params):
+        args.out.zero_()
+        args.out[args.indices.to(torch.int64)] = args.src
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)