diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h index 8adda77fc..e33802ecb 100644 --- a/src/a5/platform/include/common/platform_config.h +++ b/src/a5/platform/include/common/platform_config.h @@ -77,6 +77,18 @@ constexpr int PLATFORM_MAX_AIV_PER_THREAD = PLATFORM_MAX_BLOCKDIM * PLATFORM_AIV constexpr int PLATFORM_MAX_CORES_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD + PLATFORM_MAX_AIV_PER_THREAD; // 108 +// AICore UB reservation for the legacy SIMT launch path. +// +// rtKernelLaunchWithHandleV2 + rtRegisterAllKernel checks +// kernel->ShareMemSize_() + cfg.localMemorySize against +// RT_SIMT_REMAIN_UB_SIZE (224 KB = 256 KB UB − 32 KB dcache). The kernel +// advertises PLATFORM_AICORE_SHARE_MEM_SIZE via the SIMT TLV record +// injected in onboard/aicore/kernel.cpp; the host passes +// PLATFORM_AICORE_LOCAL_MEMORY_SIZE through cfg.localMemorySize. They sum +// to exactly 224 KB; runtime's check is strict >, so equality is accepted. 
+constexpr uint32_t PLATFORM_AICORE_SHARE_MEM_SIZE = 8 * 1024; // 8 KB +constexpr uint32_t PLATFORM_AICORE_LOCAL_MEMORY_SIZE = 216 * 1024; // 216 KB + // ============================================================================= // Performance Profiling Configuration // ============================================================================= diff --git a/src/a5/platform/onboard/aicore/CMakeLists.txt b/src/a5/platform/onboard/aicore/CMakeLists.txt index 770b26530..e2abae61d 100644 --- a/src/a5/platform/onboard/aicore/CMakeLists.txt +++ b/src/a5/platform/onboard/aicore/CMakeLists.txt @@ -57,6 +57,7 @@ set(AICORE_FLAGS -mllvm -cce-aicore-record-overflow=false \ -mllvm -cce-aicore-addr-transform \ -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + -mllvm -cce-dyn-kernel-stack-size=false \ ${CMAKE_CUSTOM_INCLUDE_DIR_FLAGS}" ) separate_arguments(AICORE_FLAGS) diff --git a/src/a5/platform/onboard/aicore/kernel.cpp b/src/a5/platform/onboard/aicore/kernel.cpp index aa3b776eb..dbf91f1e4 100644 --- a/src/a5/platform/onboard/aicore/kernel.cpp +++ b/src/a5/platform/onboard/aicore/kernel.cpp @@ -13,6 +13,8 @@ */ #include "aicore/aicore.h" #include "common/core_type.h" +#include "common/platform_config.h" +#include "simt_meta.h" class Runtime; @@ -33,6 +35,20 @@ class Runtime; extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type); +// Derive the section name from the same KERNEL_ENTRY macro that mangles the +// entry symbol, so the meta section name cannot drift if the suffix scheme +// changes. STRINGIFY needs two levels to expand the macro before stringizing. +#define SIMPLER_STRINGIFY_(x) #x +#define SIMPLER_STRINGIFY(x) SIMPLER_STRINGIFY_(x) +#define KERNEL_META_SECTION(func) ".ascend.meta." 
SIMPLER_STRINGIFY(KERNEL_ENTRY(func)) + +#ifdef __DAV_VEC__ +static const FuncLevelMeta func_simt_section __attribute__((used, section(KERNEL_META_SECTION(aicore_kernel)))) = { + {{F_TYPE_COMPILER_ALLOC_UB_SIZE, sizeof(unsigned int)}, PLATFORM_AICORE_SHARE_MEM_SIZE}, + {{F_TYPE_AIV_TYPE_FLAG, sizeof(unsigned int)}, AIV_TYPE_SIMD_SIMT_MIX_VF}, +}; +#endif + /** * Kernel entry point with control loop * diff --git a/src/a5/platform/onboard/aicore/simt_meta.h b/src/a5/platform/onboard/aicore/simt_meta.h new file mode 100644 index 000000000..3249f67d6 --- /dev/null +++ b/src/a5/platform/onboard/aicore/simt_meta.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file simt_meta.h + * @brief SIMT metadata TLV records for AICore kernel ELF (onboard / a5) + * + * The legacy launch path (rtKernelLaunchWithHandleV2 + rtRegisterAllKernel) + * requires the kernel ELF to carry two TLV records that runtime reads at + * register time: + * - F_TYPE_COMPILER_ALLOC_UB_SIZE (7) -> Kernel::shareMemSize_ + * - F_TYPE_AIV_TYPE_FLAG (12) -> Kernel::kernelVfType_ + * bisheng emits these only when it can statically infer the kernel uses + * SIMT intrinsics. 
Our entry is an SU dispatcher (vector ops live in task
+ * .o files invoked through aicore_execute), so the compiler cannot tag it.
+ *
+ * The aicore CMakeLists.txt that builds kernel.cpp pairs the hand-written
+ * record with `-mllvm -cce-dyn-kernel-stack-size=false`, which stops bisheng
+ * from auto-emitting a sibling `.ascend.meta.<entry>` section. Without that
+ * flag, runtime's parser (kernelInfoMap keyed by section name) would
+ * overwrite our values with bisheng's NO_VF / shareMemSize=0 defaults.
+ *
+ * TLV type IDs mirror RT_FUNCTION_TYPE_COMPILER_ALLOC_UB_SIZE (7) and
+ * RT_FUNCTION_TYPE_AIV_TYPE_FLAG (12) in CANN's runtime/runtime/elf_base.h.
+ * That header is host-side (extern "C", part of the runtime API) so we
+ * re-declare the two values we need rather than pull runtime headers into
+ * an AICore device-side TU.
+ */
+
+#ifndef PLATFORM_A5_AICORE_SIMT_META_H_
+#define PLATFORM_A5_AICORE_SIMT_META_H_
+
+enum FuncMetaType {
+    F_TYPE_COMPILER_ALLOC_UB_SIZE = 7,
+    F_TYPE_AIV_TYPE_FLAG = 12,
+};
+
+// AIVType values are not exposed in any CANN C/C++ header. The canonical
+// source is CANN's compiler-side Python script
+// (python/site-packages/tbe/tikcpp/ascendc_identify_meta_section_info.py),
+// which is what bisheng / asc_op_compiler consult when classifying kernels.
+enum AIVType { + AIV_TYPE_NO_VF = 1, + AIV_TYPE_SIMD_VF_ONLY = 2, + AIV_TYPE_SIMT_VF_ONLY = 3, + AIV_TYPE_SIMD_SIMT_MIX_VF = 4, +}; + +struct TlvHeader { + unsigned short type; + unsigned short len; +}; + +struct FuncMetaCompilerUbSize { + TlvHeader head; + unsigned int ub_size; +}; + +struct FuncMetaAivTypeFlag { + TlvHeader head; + unsigned int aiv_type; +}; + +struct FuncLevelMeta { + FuncMetaCompilerUbSize ub_size_meta; + FuncMetaAivTypeFlag aiv_type_meta; +}; + +#endif // PLATFORM_A5_AICORE_SIMT_META_H_ diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index a84cae24e..9d144d347 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -1017,6 +1017,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime *runtime) { rtTaskCfgInfo_t cfg = {}; cfg.schemMode = RT_SCHEM_MODE_BATCH; + cfg.localMemorySize = PLATFORM_AICORE_LOCAL_MEMORY_SIZE; rc = rtKernelLaunchWithHandleV2(bin_handle, 0, block_dim_, &rt_args, nullptr, stream, &cfg); if (rc != RT_ERROR_NONE) { diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp new file mode 100644 index 000000000..c259716bd --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +// Minimal SIMT element-scatter kernel (AIV). +// +// Distilled from the ptoas-generated mscatter reference. Drops cosmetic +// noise (v1..v30 names, dummy v4/v5/v6/v7 constants, explicit +// Layout::ND, the verbose Tile template tail, GM-offset arithmetic that +// always reduces to zero, the ptoas_auto_sync_tail wrapper). +// +// Kept on purpose: +// - per-data 3-tile alias pattern (TLOAD binds one tile, MSCATTER +// reads from another aliased to the same UB address; a single-tile +// form has reproduced golden mismatches on hw) +// - `set_mask_norm` / `set_vector_mask` SIMT mask init +// - `MTE2 → V` sync before MSCATTER (the ptoas default `MTE2 → MTE3` +// silently drops the scatter on a5 hw) +// - `__DAV_VEC__` guard so the AIC variant compiles to a no-op +// +// Operation: out[idx[r, c]] = src[r, c] for an 8x32 source and 256-slot +// destination. 
+ +#include +#include + +#include "tensor.h" +#include "pipe_sync.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static constexpr int TILE_ROWS = 8; +static constexpr int TILE_COLS = 32; +static constexpr int DST_LEN = TILE_ROWS * TILE_COLS; // 256 +static constexpr int SRC_TILE_BYTES = TILE_ROWS * TILE_COLS * sizeof(float); + +static __aicore__ void simt_scatter_impl(__gm__ float *src, __gm__ int32_t *idx, __gm__ float *out) { + using SrcTile = Tile; + using IdxTile = Tile; + + using TileShape = Shape<1, 1, 1, TILE_ROWS, TILE_COLS>; + using TileStride = pto::Stride; + using SrcGT = GlobalTensor; + using IdxGT = GlobalTensor; + + using DstShape = Shape<1, 1, 1, 1, DST_LEN>; + using DstStride = pto::Stride; + using DstGT = GlobalTensor; + + // Per-data 3-tile alias pattern: + // *_loader — bound directly at the UB offset; consumed by TLOAD + // *_scatter — bound via the loader's data() pointer; consumed by MSCATTER + // *_anchor — bound to the same offset literal; preserves the + // original ptoas binding sequence + constexpr int SRC_UB = 0; + constexpr int IDX_UB = SRC_TILE_BYTES; + + SrcTile src_loader(TILE_ROWS, TILE_COLS); + TASSIGN(src_loader, SRC_UB); + SrcTile src_scatter(TILE_ROWS, TILE_COLS); + TASSIGN(src_scatter, reinterpret_cast(src_loader.data())); + SrcTile src_anchor(TILE_ROWS, TILE_COLS); + TASSIGN(src_anchor, static_cast(SRC_UB)); + + IdxTile idx_loader(TILE_ROWS, TILE_COLS); + TASSIGN(idx_loader, IDX_UB); + IdxTile idx_scatter(TILE_ROWS, TILE_COLS); + TASSIGN(idx_scatter, reinterpret_cast(idx_loader.data())); + IdxTile idx_anchor(TILE_ROWS, TILE_COLS); + TASSIGN(idx_anchor, static_cast(IDX_UB)); + + SrcGT srcGlobal(src); + IdxGT idxGlobal(idx); + DstGT dstGlobal(out); + + TLOAD(src_anchor, srcGlobal); + TLOAD(idx_anchor, idxGlobal); + + // MTE2 → V before MSCATTER (critical: MTE2 → MTE3 silently drops the + // scatter on a5 hw). 
+ set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + MSCATTER(dstGlobal, src_scatter, idx_scatter); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset; + + __gm__ Tensor *idx_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ int32_t *idx = reinterpret_cast<__gm__ int32_t *>(idx_tensor->buffer.addr) + idx_tensor->start_offset; + + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + simt_scatter_impl(src, idx, out); +} diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/orchestration/simt_basic_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/orchestration/simt_basic_orch.cpp new file mode 100644 index 000000000..ad905984e --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/orchestration/simt_basic_orch.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * SIMT basic orchestration: submit a single AIV SIMT scatter task. + * + * Args layout: [src, indices, out] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_SIMT_SCATTER 0 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 3, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) { + Tensor src = from_tensor_arg(orch_args.tensor(0)); + Tensor indices = from_tensor_arg(orch_args.tensor(1)); + Tensor out = from_tensor_arg(orch_args.tensor(2)); + + // PTO2_SCOPE ensures rt_submit_aiv_task flushes through the task + // ringbuffer before the entry returns. No set_core_num — let the + // runtime use the config's block_dim, matching the ptoas-validated + // mscatter reference. + PTO2_SCOPE() { + Arg args; + args.add_input(src); + args.add_input(indices); + args.add_output(out); + rt_submit_aiv_task(FUNC_SIMT_SCATTER, args); + } +} + +} // extern "C" diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py new file mode 100644 index 000000000..d223b2c0a --- /dev/null +++ b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SIMT basic element-scatter: minimal AIV scatter kernel that exercises the SIMT launch path.
+
+Config (aicpu_thread_num=4, sequential identity indices) mirrors the
+ptoas-validated mscatter reference at
+mscatter_fp32_8x32_seq_20260513_140539/test_mscatter.py, except that the
+reference ran block_dim=24 while this test uses block_dim=3 (see CASES).
+Identity indices keep the golden trivially src-equals-out so a failure
+here points at the SIMT launch path itself (TLV injection,
+localMemorySize budget, sync) rather than at the scatter index
+semantics.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+TILE_ROWS = 8
+TILE_COLS = 32
+SRC_ELEMS = TILE_ROWS * TILE_COLS  # 256
+DST_LEN = SRC_ELEMS  # 256
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSimtBasic(SceneTestCase):
+    RTOL = 1e-5
+    ATOL = 1e-5
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/simt_basic_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SIMT_SCATTER",
+                "source": "kernels/aiv/kernel_simt_scatter.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        torch.manual_seed(0)
+        src = torch.randn(SRC_ELEMS, dtype=torch.float32)
+        # Identity indices (0..DST_LEN-1) — matches the ptoas reference and
+        # makes the golden trivially `out == src`. Switch to torch.randperm
+        # later once the baseline launch path is confirmed green.
+        indices = torch.arange(DST_LEN, dtype=torch.int32)
+        out = torch.zeros(DST_LEN, dtype=torch.float32)
+        return TaskArgsBuilder(
+            Tensor("src", src),
+            Tensor("indices", indices),
+            Tensor("out", out),
+        )
+
+    def compute_golden(self, args, params):
+        args.out.zero_()
+        args.out[args.indices.to(torch.int64)] = args.src
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)