Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/a5/platform/include/common/platform_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,18 @@ constexpr int PLATFORM_MAX_AIV_PER_THREAD = PLATFORM_MAX_BLOCKDIM * PLATFORM_AIV

constexpr int PLATFORM_MAX_CORES_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD + PLATFORM_MAX_AIV_PER_THREAD; // 108

// AICore UB reservation for the legacy SIMT launch path.
//
// rtKernelLaunchWithHandleV2 + rtRegisterAllKernel checks
//     kernel->ShareMemSize_() + cfg.localMemorySize
// against RT_SIMT_REMAIN_UB_SIZE (224 KB = 256 KB UB - 32 KB dcache). The
// runtime's check is a strict >, so a sum of exactly 224 KB is accepted.
//
//   - PLATFORM_AICORE_SHARE_MEM_SIZE is what the kernel advertises via the
//     SIMT TLV record injected in onboard/aicore/kernel.cpp.
//   - PLATFORM_AICORE_LOCAL_MEMORY_SIZE is what the host passes through
//     cfg.localMemorySize.
//
// The two values currently sum to exactly 224 KB. The static_assert below
// turns that budget into a compile-time check, so raising either constant on
// its own fails the build instead of failing the launch at run time.
constexpr uint32_t PLATFORM_AICORE_SHARE_MEM_SIZE = 8 * 1024;       // 8 KB
constexpr uint32_t PLATFORM_AICORE_LOCAL_MEMORY_SIZE = 216 * 1024;  // 216 KB
static_assert(
    PLATFORM_AICORE_SHARE_MEM_SIZE + PLATFORM_AICORE_LOCAL_MEMORY_SIZE <= 224u * 1024u,
    "AICore share-mem + local-memory reservation exceeds the 224 KB SIMT UB budget"
);

// =============================================================================
// Performance Profiling Configuration
// =============================================================================
Expand Down
1 change: 1 addition & 0 deletions src/a5/platform/onboard/aicore/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ set(AICORE_FLAGS
-mllvm -cce-aicore-record-overflow=false \
-mllvm -cce-aicore-addr-transform \
-mllvm -cce-aicore-dcci-insert-for-scalar=false \
-mllvm -cce-dyn-kernel-stack-size=false \
${CMAKE_CUSTOM_INCLUDE_DIR_FLAGS}"
)
separate_arguments(AICORE_FLAGS)
Expand Down
16 changes: 16 additions & 0 deletions src/a5/platform/onboard/aicore/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
*/
#include "aicore/aicore.h"
#include "common/core_type.h"
#include "common/platform_config.h"
#include "simt_meta.h"

class Runtime;

Expand All @@ -33,6 +35,20 @@ class Runtime;

extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type);

// Derive the section name from the same KERNEL_ENTRY macro that mangles the
// entry symbol, so the meta section name cannot drift if the suffix scheme
// changes.
//
// SIMPLER_STRINGIFY_ stringizes its argument verbatim. SIMPLER_STRINGIFY adds
// the extra expansion level so that KERNEL_ENTRY(func) is macro-expanded
// before it is stringized. KERNEL_META_SECTION(func) therefore yields the
// string literal ".ascend.meta." followed by the expanded entry symbol name.
#define SIMPLER_STRINGIFY_(x) #x
#define SIMPLER_STRINGIFY(x) SIMPLER_STRINGIFY_(x)
#define KERNEL_META_SECTION(func) ".ascend.meta." SIMPLER_STRINGIFY(KERNEL_ENTRY(func))

// Hand-written function-level SIMT metadata for the aicore_kernel entry,
// emitted only when __DAV_VEC__ is defined. The object is placed in the
// section named by KERNEL_META_SECTION(aicore_kernel) and marked `used` so
// the compiler keeps it even without a reference from code.
//
// Record 1: type F_TYPE_COMPILER_ALLOC_UB_SIZE, len sizeof(unsigned int),
//           value PLATFORM_AICORE_SHARE_MEM_SIZE.
// Record 2: type F_TYPE_AIV_TYPE_FLAG, len sizeof(unsigned int),
//           value AIV_TYPE_SIMD_SIMT_MIX_VF.
#ifdef __DAV_VEC__
static const FuncLevelMeta func_simt_section __attribute__((used, section(KERNEL_META_SECTION(aicore_kernel)))) = {
{{F_TYPE_COMPILER_ALLOC_UB_SIZE, sizeof(unsigned int)}, PLATFORM_AICORE_SHARE_MEM_SIZE},
{{F_TYPE_AIV_TYPE_FLAG, sizeof(unsigned int)}, AIV_TYPE_SIMD_SIMT_MIX_VF},
};
#endif

/**
* Kernel entry point with control loop
*
Expand Down
77 changes: 77 additions & 0 deletions src/a5/platform/onboard/aicore/simt_meta.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/

/**
* @file simt_meta.h
* @brief SIMT metadata TLV records for AICore kernel ELF (onboard / a5)
*
* The legacy launch path (rtKernelLaunchWithHandleV2 + rtRegisterAllKernel)
* requires the kernel ELF to carry two TLV records that runtime reads at
* register time:
* - F_TYPE_COMPILER_ALLOC_UB_SIZE (7) -> Kernel::shareMemSize_
* - F_TYPE_AIV_TYPE_FLAG (12) -> Kernel::kernelVfType_
* bisheng emits these only when it can statically infer the kernel uses
* SIMT intrinsics. Our entry is an SU dispatcher (vector ops live in task
* .o files invoked through aicore_execute), so the compiler cannot tag it.
*
* kernel.cpp's CMakeLists.txt pairs the hand-written record with
* `-mllvm -cce-dyn-kernel-stack-size=false`, which stops bisheng from
* auto-emitting a sibling `.ascend.meta.<funcname>` section. Without that
* flag, runtime's parser (kernelInfoMap keyed by section name) would
* overwrite our values with bisheng's NO_VF / shareMemSize=0 defaults.
*
* TLV type IDs mirror RT_FUNCTION_TYPE_COMPILER_ALLOC_UB_SIZE (7) and
* RT_FUNCTION_TYPE_AIV_TYPE_FLAG (12) in CANN's runtime/runtime/elf_base.h.
* That header is host-side (extern "C", part of the runtime API) so we
* re-declare the two values we need rather than pull runtime headers into
* an AICore device-side TU.
*/

#ifndef PLATFORM_A5_AICORE_SIMT_META_H_
#define PLATFORM_A5_AICORE_SIMT_META_H_

// Type IDs for the function-level TLV records carried in the kernel's meta
// section. Only the two IDs this platform emits are declared here.
enum FuncMetaType {
    // Record whose value is the compiler-allocated UB size.
    F_TYPE_COMPILER_ALLOC_UB_SIZE = 7,

    // Record whose value is the AIV type flag.
    F_TYPE_AIV_TYPE_FLAG = 12,
};

// Values for the F_TYPE_AIV_TYPE_FLAG record.
//
// No CANN C/C++ header exposes these. The canonical source is CANN's
// compiler-side Python script
// (python/site-packages/tbe/tikcpp/ascendc_identify_meta_section_info.py),
// which is what bisheng / asc_op_compiler consult when classifying kernels.
// Every value is spelled out explicitly so the numbering stays pinned to
// that script even if enumerators are reordered here.
enum AIVType {
    AIV_TYPE_NO_VF            = 1,
    AIV_TYPE_SIMD_VF_ONLY     = 2,
    AIV_TYPE_SIMT_VF_ONLY     = 3,
    AIV_TYPE_SIMD_SIMT_MIX_VF = 4,
};

// Header that precedes the value of every TLV record.
struct TlvHeader {
    unsigned short type;  // record type ID (a FuncMetaType value)
    unsigned short len;   // size in bytes of the value that follows the header
};

// TLV record carrying the compiler-allocated UB size.
struct FuncMetaCompilerUbSize {
    TlvHeader head;
    unsigned int ub_size;
};

// TLV record carrying the AIV type flag (an AIVType value).
struct FuncMetaAivTypeFlag {
    TlvHeader head;
    unsigned int aiv_type;
};

// The two records laid out back to back, in the order they are emitted into
// the kernel's meta section.
struct FuncLevelMeta {
    FuncMetaCompilerUbSize ub_size_meta;
    FuncMetaAivTypeFlag aiv_type_meta;
};

// These structs are written verbatim into an ELF section and read back as raw
// bytes, so their sizes are part of the contract. Fail the build rather than
// emit a malformed record if a toolchain picks different type widths or
// inserts padding.
static_assert(sizeof(TlvHeader) == 4, "TlvHeader must be two 16-bit fields with no padding");
static_assert(sizeof(FuncMetaCompilerUbSize) == 8, "UB-size record must be a 4-byte header plus a 4-byte value");
static_assert(sizeof(FuncMetaAivTypeFlag) == 8, "AIV-type record must be a 4-byte header plus a 4-byte value");
static_assert(
    sizeof(FuncLevelMeta) == sizeof(FuncMetaCompilerUbSize) + sizeof(FuncMetaAivTypeFlag),
    "FuncLevelMeta must contain no padding between or after its records"
);

#endif // PLATFORM_A5_AICORE_SIMT_META_H_
1 change: 1 addition & 0 deletions src/a5/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1017,6 +1017,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, Runtime *runtime) {

rtTaskCfgInfo_t cfg = {};
cfg.schemMode = RT_SCHEM_MODE_BATCH;
cfg.localMemorySize = PLATFORM_AICORE_LOCAL_MEMORY_SIZE;

rc = rtKernelLaunchWithHandleV2(bin_handle, 0, block_dim_, &rt_args, nullptr, stream, &cfg);
if (rc != RT_ERROR_NONE) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/

// Minimal SIMT element-scatter kernel (AIV).
//
// Distilled from the ptoas-generated mscatter reference. Drops cosmetic
// noise (v1..v30 names, dummy v4/v5/v6/v7 constants, explicit
// Layout::ND, the verbose Tile template tail, GM-offset arithmetic that
// always reduces to zero, the ptoas_auto_sync_tail wrapper).
//
// Kept on purpose:
// - per-data 3-tile alias pattern (TLOAD binds one tile, MSCATTER
// reads from another aliased to the same UB address; a single-tile
// form has reproduced golden mismatches on hw)
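// - `set_mask_norm` / `set_vector_mask` SIMT mask init
// (NOTE(review): neither call appears in this file; confirm the mask
// init is issued elsewhere, or drop this bullet)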
// - `MTE2 → V` sync before MSCATTER (the ptoas default `MTE2 → MTE3`
// silently drops the scatter on a5 hw)
// - `__DAV_VEC__` guard so the AIC variant compiles to a no-op
//
// Operation: out[idx[r, c]] = src[r, c] for an 8x32 source and 256-slot
// destination.

#include <cstdint>
#include <pto/pto-inst.hpp>

#include "tensor.h"
#include "pipe_sync.h"

using namespace pto;

#ifndef __gm__
#define __gm__
#endif

#ifndef __aicore__
#define __aicore__ [aicore]
#endif

// Tile geometry shared by the source tile and the index tile.
static constexpr int TILE_ROWS = 8;
static constexpr int TILE_COLS = 32;

// Number of destination slots: one per tile element (8 * 32 = 256).
static constexpr int DST_LEN = TILE_ROWS * TILE_COLS;

// Size in bytes of one float tile (256 * sizeof(float)).
static constexpr int SRC_TILE_BYTES = static_cast<int>(DST_LEN * sizeof(float));

/**
 * Load one TILE_ROWS x TILE_COLS tile of values and one of indices from
 * global memory, then issue MSCATTER into a DST_LEN-element destination.
 *
 * @param src  source values in global memory, read as a TILE_ROWS x TILE_COLS tile
 * @param idx  int32 indices in global memory, read with the same tile shape
 * @param out  destination in global memory, viewed as DST_LEN floats
 *
 * The statement order and the tile aliasing below are deliberate (see the
 * notes at the top of this file); do not simplify without re-validating on
 * hardware.
 */
static __aicore__ void simt_scatter_impl(__gm__ float *src, __gm__ int32_t *idx, __gm__ float *out) {
// Tile types for the values and for the indices; both use the same geometry.
using SrcTile = Tile<TileType::Vec, float, TILE_ROWS, TILE_COLS, BLayout::RowMajor, -1, -1>;
using IdxTile = Tile<TileType::Vec, int32_t, TILE_ROWS, TILE_COLS, BLayout::RowMajor, -1, -1>;

// Global-memory view of a source/index tile: TILE_ROWS rows with a row
// stride of TILE_COLS elements and unit element stride.
using TileShape = Shape<1, 1, 1, TILE_ROWS, TILE_COLS>;
using TileStride = pto::Stride<DST_LEN, DST_LEN, DST_LEN, TILE_COLS, 1>;
using SrcGT = GlobalTensor<float, TileShape, TileStride>;
using IdxGT = GlobalTensor<int32_t, TileShape, TileStride>;

// Global-memory view of the destination: a single row of DST_LEN elements.
using DstShape = Shape<1, 1, 1, 1, DST_LEN>;
using DstStride = pto::Stride<DST_LEN, DST_LEN, DST_LEN, DST_LEN, 1>;
using DstGT = GlobalTensor<float, DstShape, DstStride>;

// Per-data 3-tile alias pattern:
// *_loader — bound directly at the UB offset; in this function only its
// data() pointer is used, to bind *_scatter
// *_scatter — bound via the loader's data() pointer; consumed by MSCATTER
// *_anchor — bound to the same offset literal; preserves the
// original ptoas binding sequence, and is the tile TLOAD writes into
// NOTE(review): TLOAD targets *_anchor, not *_loader — confirm this matches
// the ptoas reference binding sequence.
// The index tile is placed immediately after the source tile (offset
// SRC_TILE_BYTES).
constexpr int SRC_UB = 0;
constexpr int IDX_UB = SRC_TILE_BYTES;

SrcTile src_loader(TILE_ROWS, TILE_COLS);
TASSIGN(src_loader, SRC_UB);
SrcTile src_scatter(TILE_ROWS, TILE_COLS);
TASSIGN(src_scatter, reinterpret_cast<uint64_t>(src_loader.data()));
SrcTile src_anchor(TILE_ROWS, TILE_COLS);
TASSIGN(src_anchor, static_cast<uint64_t>(SRC_UB));

IdxTile idx_loader(TILE_ROWS, TILE_COLS);
TASSIGN(idx_loader, IDX_UB);
IdxTile idx_scatter(TILE_ROWS, TILE_COLS);
TASSIGN(idx_scatter, reinterpret_cast<uint64_t>(idx_loader.data()));
IdxTile idx_anchor(TILE_ROWS, TILE_COLS);
TASSIGN(idx_anchor, static_cast<uint64_t>(IDX_UB));

// Wrap the raw global-memory pointers in their typed views.
SrcGT srcGlobal(src);
IdxGT idxGlobal(idx);
DstGT dstGlobal(out);

// Load values and indices into the anchor tiles; the scatter tiles alias the
// same UB offsets.
TLOAD(src_anchor, srcGlobal);
TLOAD(idx_anchor, idxGlobal);

// MTE2 → V before MSCATTER (critical: MTE2 → MTE3 silently drops the
// scatter on a5 hw).
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);

MSCATTER(dstGlobal, src_scatter, idx_scatter);

// pipe_sync() comes from pipe_sync.h; presumably it waits for outstanding
// pipe work before the kernel returns — TODO confirm against that header.
pipe_sync();
}

/**
 * Kernel entry: unpacks the three Tensor descriptors from the argument array
 * and forwards their element pointers to simt_scatter_impl.
 *
 * @param args  args[0], args[1] and args[2] each hold the address of a Tensor
 *              descriptor, in the order src, indices, out.
 */
extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
    // Each argument slot is a raw 64-bit value reinterpreted as a descriptor pointer.
    __gm__ Tensor *const src_desc = reinterpret_cast<__gm__ Tensor *>(args[0]);
    __gm__ Tensor *const idx_desc = reinterpret_cast<__gm__ Tensor *>(args[1]);
    __gm__ Tensor *const out_desc = reinterpret_cast<__gm__ Tensor *>(args[2]);

    // start_offset is added after the cast to the element type, so it is
    // applied in units of elements rather than bytes.
    __gm__ float *const src_data =
        reinterpret_cast<__gm__ float *>(src_desc->buffer.addr) + src_desc->start_offset;
    __gm__ int32_t *const idx_data =
        reinterpret_cast<__gm__ int32_t *>(idx_desc->buffer.addr) + idx_desc->start_offset;
    __gm__ float *const out_data =
        reinterpret_cast<__gm__ float *>(out_desc->buffer.addr) + out_desc->start_offset;

    simt_scatter_impl(src_data, idx_data, out_data);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/

/**
* SIMT basic orchestration: submit a single AIV SIMT scatter task.
*
* Args layout: [src, indices, out]
*/

#include <stddef.h>
#include <stdint.h>

#include "pto_orchestration_api.h"

#define FUNC_SIMT_SCATTER 0

extern "C" {

/**
 * Exported hook that returns this orchestration's configuration.
 *
 * @param orch_args  Unused here.
 * @return A config whose expected_arg_count is 3, matching the
 *         [src, indices, out] argument layout.
 */
__attribute__((visibility("default"))) PTO2OrchestrationConfig
aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
    (void)orch_args;  // NOLINT(readability/casting)
    PTO2OrchestrationConfig config{
        .expected_arg_count = 3,
    };
    return config;
}

__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
Tensor src = from_tensor_arg(orch_args.tensor(0));
Tensor indices = from_tensor_arg(orch_args.tensor(1));
Tensor out = from_tensor_arg(orch_args.tensor(2));

// PTO2_SCOPE ensures rt_submit_aiv_task flushes through the task
// ringbuffer before the entry returns. No set_core_num — let the
// runtime use the config's block_dim, matching the ptoas-validated
// mscatter reference.
PTO2_SCOPE() {
Arg args;
args.add_input(src);
args.add_input(indices);
args.add_output(out);
rt_submit_aiv_task(FUNC_SIMT_SCATTER, args);
}
}

} // extern "C"
82 changes: 82 additions & 0 deletions tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env python3
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""SIMT basic element-scatter: minimal AIV scatter kernel that exercises the SIMT launch path.

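The reference config (block_dim=24, aicpu_thread_num=4, sequential identity
indices) comes from the ptoas-validated mscatter test at
mscatter_fp32_8x32_seq_20260513_140539/test_mscatter.py. Case1 below keeps
aicpu_thread_num=4 and the identity indices but runs with block_dim=3. Identity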
indices keep the golden trivially src-equals-out so a failure here
points at the SIMT launch path itself (TLV injection, localMemorySize
budget, sync) rather than at the scatter index semantics.
"""

import torch
from simpler.task_interface import ArgDirection as D

from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test

TILE_ROWS = 8
TILE_COLS = 32
SRC_ELEMS = TILE_ROWS * TILE_COLS # 256
DST_LEN = SRC_ELEMS # 256


@scene_test(level=2, runtime="tensormap_and_ringbuffer")
class TestSimtBasic(SceneTestCase):
    """Scene test submitting one AIV SIMT scatter task over an 8x32 float tile."""

    # Comparison tolerances.
    RTOL = 1e-5
    ATOL = 1e-5

    # One orchestration entry plus a single AIV in-core kernel (func_id 0),
    # both taking (src, indices) as inputs and out as the output.
    CALLABLE = {
        "orchestration": {
            "source": "kernels/orchestration/simt_basic_orch.cpp",
            "function_name": "aicpu_orchestration_entry",
            "signature": [D.IN, D.IN, D.OUT],
        },
        "incores": [
            {
                "func_id": 0,
                "name": "SIMT_SCATTER",
                "source": "kernels/aiv/kernel_simt_scatter.cpp",
                "core_type": "aiv",
                "signature": [D.IN, D.IN, D.OUT],
            },
        ],
    }

    CASES = [
        {
            "name": "Case1",
            "platforms": ["a5sim", "a5"],
            "config": {"aicpu_thread_num": 4, "block_dim": 3},
            "params": {},
        }
    ]

    def generate_args(self, params):
        """Build the (src, indices, out) tensors for one run.

        ``params`` is not used. The RNG is seeded with 0, so ``src`` is the
        same on every call.
        """
        torch.manual_seed(0)
        source_values = torch.randn(SRC_ELEMS, dtype=torch.float32)
        # Identity indices (0..DST_LEN-1) match the ptoas reference and make
        # the golden trivially `out == src`. Switch to torch.randperm later,
        # once the baseline launch path is confirmed green.
        identity_idx = torch.arange(DST_LEN, dtype=torch.int32)
        zeroed_out = torch.zeros(DST_LEN, dtype=torch.float32)
        named_tensors = (
            Tensor("src", source_values),
            Tensor("indices", identity_idx),
            Tensor("out", zeroed_out),
        )
        return TaskArgsBuilder(*named_tensors)

    def compute_golden(self, args, params):
        """Write the expected scatter result into ``args.out`` in place.

        ``params`` is not used.
        """
        # Advanced indexing needs int64 indices; the test stores them as int32.
        slots = args.indices.to(torch.int64)
        args.out.zero_()
        args.out[slots] = args.src


if __name__ == "__main__":
SceneTestCase.run_module(__name__)
Loading