diff --git a/.agents/evals/minimal_skill_trigger_eval.jsonl b/.agents/evals/minimal_skill_trigger_eval.jsonl new file mode 100644 index 000000000..484ed5f20 --- /dev/null +++ b/.agents/evals/minimal_skill_trigger_eval.jsonl @@ -0,0 +1,17 @@ +{"id":"overview-01","prompt":"解释 npuir 分支里 target=npuir 的完整编译链路,从 lower 到 codegen","expected_skill":"tilelang-npuir-overview"} +{"id":"overview-02","prompt":"Developer 模式和 Expert 模式在 npuir 下有什么区别,怎么切换","expected_skill":"tilelang-npuir-overview"} +{"id":"vector-01","prompt":"帮我写一个 vector add,默认用 vadd 不要 npuir_add","expected_skill":"tilelang-vector-skill"} +{"id":"vector-02","prompt":"实现 sigmoid 的向量版本,用 vexp vadd vrec 这类接口","expected_skill":"tilelang-vector-skill"} +{"id":"cube-01","prompt":"写一个 matmul,要求用 load_nd2nz 和 store_fixpipe","expected_skill":"tilelang-cube-skill"} +{"id":"cube-02","prompt":"Cube 路径下 L1 和 L0C 怎么分配,GEMM initC 怎么控制","expected_skill":"tilelang-cube-skill"} +{"id":"mixcv-01","prompt":"实现 flash attention 的 mixcv 流水,Cube 阶段和 Vector 阶段要同步","expected_skill":"tilelang-mixcv-skill"} +{"id":"mixcv-02","prompt":"sync_block_set 和 sync_block_wait 在融合算子里怎么配对","expected_skill":"tilelang-mixcv-skill"} +{"id":"mlir-01","prompt":"tilelangir-cv-split 和 tilelangir-vectorize 在哪一层执行,如何调试","expected_skill":"tilelang-mlir-skill"} +{"id":"mlir-02","prompt":"帮我定位 MLIR pass pipeline 哪一步把 IR 变坏了","expected_skill":"tilelang-mlir-skill"} +{"id":"debug-01","prompt":"这个 npuir kernel 结果不对,想通过 T.print 和 IR dump 定位","expected_skill":"tilelang-debug-helper"} +{"id":"debug-02","prompt":"需要给 Python 脚本加 GDB 附加点来定位崩溃","expected_skill":"tilelang-debug-helper"} +{"id":"fixer-01","prompt":"编译时报错并且出现段错误,帮我定位并给修复方案","expected_skill":"tilelang-error-fixer"} +{"id":"review-01","prompt":"请对这个 PR 做代码审查,优先看行为回归和测试缺口","expected_skill":"tilelang-review-skill"} +{"id":"review-02","prompt":"提交前帮我跑一遍格式和 lint 检查,看看 CI 风险","expected_skill":"tilelang-review-skill"} +{"id":"github-01","prompt":"我准备把分支提 PR 到 npuir,给我完整 commit push PR 流程","expected_skill":"tilelang-github-operations"} +{"id":"github-02","prompt":"帮我同步 upstream/npuir,rebase 后再发起 PR","expected_skill":"tilelang-github-operations"} diff --git a/.agents/skills/tilelang-cube-skill/SKILL.md b/.agents/skills/tilelang-cube-skill/SKILL.md new file mode 100644 index 000000000..b6d53b531 --- /dev/null +++ b/.agents/skills/tilelang-cube-skill/SKILL.md @@ -0,0 +1,63 @@ +--- +name: tilelang-cube-skill +description: TileLang npuir Cube 算子开发指南。用户提及 GEMM、matmul、batch gemm、L1/L0C、load_nd2nz、store_fixpipe、NZ 格式、Cube scope、矩阵分块与流水优化时必须使用本技能。 +--- + +# TileLang Cube Skill + +## Mandatory routing rule + +Before answering, follow AGENTS.md section "Docs Auto Routing Rules (Mandatory)". + +## Operator baseline rule (Mandatory) + +- Before writing a new cube operator, first check examples/ and testing/npuir/. +- Prefer adapting an existing operator case rather than writing from scratch. + +## Primary use cases + +- matmul and batched matmul kernels +- cube-heavy stages in mixed kernels +- explicit L1 and L0C memory usage + +## Core APIs + +- T.alloc_shared (Developer mode) +- T.alloc_L1 (Expert mode only) +- T.alloc_L0C (Expert mode only) +- T.load_nd2nz (Expert mode only) +- T.gemm +- T.store_fixpipe (Expert mode only) + +## Minimal flow + +1. Partition blocks for M and N +2. Load global tiles with load_nd2nz in Expert mode or T.copy in Developer mode +3. Accumulate with T.gemm(initC controlled by k-loop) +4. Store outputs with store_fixpipe in Expert mode or T.copy in Developer mode + +## NZ format rule + +- NZ format path is Expert mode only. +- In Developer mode kernels, keep ND layout and use T.copy-based data movement. + +## References + +- references/api-cube.md +- references/examples-matmul.md +- references/nz-format.md + +## Official docs to consult + +- docs/Tilelang.language/内存操作/T.alloc_shared.md +- docs/Tilelang.language/线性代数操作/T.gemm.md +- docs/Tilelang.language/内存操作/T.load_nd2nz.md +- docs/Tilelang.language/内存操作/T.store_fixpipe.md +- docs/Tilelang.language/内存操作/T.alloc_L1.md +- docs/Tilelang.language/内存操作/T.alloc_L0C.md + +## Related skills + +- tilelang-vector-skill +- tilelang-mixcv-skill +- tilelang-debug-helper diff --git a/.agents/skills/tilelang-cube-skill/references/api-cube.md b/.agents/skills/tilelang-cube-skill/references/api-cube.md new file mode 100644 index 000000000..57aa24b84 --- /dev/null +++ b/.agents/skills/tilelang-cube-skill/references/api-cube.md @@ -0,0 +1,31 @@ +# Cube API Guide + +## Memory + +- T.alloc_shared(shape, dtype) (Developer mode) +- T.alloc_L1(shape, dtype) (Expert mode only) +- T.alloc_L0C(shape, accum_dtype) (Expert mode only) + +## Data movement + +- T.copy(src, dst) +- T.load_nd2nz(src, dst, size) (Expert mode only) + +## Compute + +- T.gemm(A, B, C, initC=True or False, b_transpose=True or False, size=[M, K, N]) + +## Store path + +- T.copy(C_buf, C_out) +- T.store_fixpipe(C_buf, C_out, size=[M, N], enable_nz2nd=True) (Expert mode only) + +## Scope recommendation + +Use explicit T.Scope("Cube") for cube sections in expert mode kernels. +Don't use explicit T.Scope for cube sections in developer mode kernels. + +## Mode guidance + +- Expert mode: ND -> NZ (load_nd2nz), cube compute, NZ -> ND (store_fixpipe). +- Developer mode: Keep ND tensors and use T.copy without explicit NZ conversion. diff --git a/.agents/skills/tilelang-cube-skill/references/examples-matmul.md b/.agents/skills/tilelang-cube-skill/references/examples-matmul.md new file mode 100644 index 000000000..ddca7e351 --- /dev/null +++ b/.agents/skills/tilelang-cube-skill/references/examples-matmul.md @@ -0,0 +1,14 @@ +# Matmul Pattern + +## Standard K-loop accumulation + +- for each k tile +- load A and B into L1 +- gemm into L0C with initC=(k==0) +- store once final tile is complete + +## Practical notes + +- Keep K tile size aligned with target constraints +- Validate transpose configuration for B path +- Validate numerical tolerance with torch reference diff --git a/.agents/skills/tilelang-cube-skill/references/nz-format.md b/.agents/skills/tilelang-cube-skill/references/nz-format.md new file mode 100644 index 000000000..013b0b7e2 --- /dev/null +++ b/.agents/skills/tilelang-cube-skill/references/nz-format.md @@ -0,0 +1,24 @@ +# NZ Format Notes + +NZ format path is intended for Expert mode kernels only. + +## Why NZ path matters + +Cube compute often benefits from ND to NZ layout conversion for compute-friendly access. + +## Typical path + +- load_nd2nz for input tiles (Expert mode) +- gemm in cube path +- store_fixpipe with enable_nz2nd=True for output conversion (Expert mode) + +## Developer mode note + +- Developer mode kernels should keep ND layout. +- Use T.copy/T.alloc_shared path and do not force NZ conversion. + +## Validation checklist + +- check tile size consistency across load, gemm, and store +- check transpose setting and layout assumptions +- compare outputs with reference implementation diff --git a/.agents/skills/tilelang-debug-helper/SKILL.md b/.agents/skills/tilelang-debug-helper/SKILL.md new file mode 100644 index 000000000..46f319cc8 --- /dev/null +++ b/.agents/skills/tilelang-debug-helper/SKILL.md @@ -0,0 +1,37 @@ +--- +name: tilelang-debug-helper +description: TileLang npuir 调试辅助技能。用户提及调试 npuir kernel、GDB 附加、IR dump、精度异常定位、编译失败定位、pass 阶段定位、T.print 调试、最小复现缩减时必须使用本技能。 +--- + +# TileLang Debug Helper (npuir) + +## Mandatory routing rule + +Before answering, follow AGENTS.md section "Docs Auto Routing Rules (Mandatory)". + +## Debug workflow + +1. Reproduce with minimal script +2. Add process attach window if native debug is needed +3. Capture IR snapshots around transformation boundaries +4. Narrow down failing pass or API misuse + +## For API debugging + +- First verify v-prefix API usage +- Then verify alias compatibility if legacy npuir_xxx appears + +## References + +- references/mlir-dump-guide.md + +## Official docs to consult + +- docs/Tilelang算子调试指南.md +- docs/Tilelang.language/调试操作/T.print.md +- docs/developer/EnvironmentVariables.md + +## Related skills + +- tilelang-mlir-skill +- tilelang-error-fixer diff --git a/.agents/skills/tilelang-debug-helper/references/mlir-dump-guide.md b/.agents/skills/tilelang-debug-helper/references/mlir-dump-guide.md new file mode 100644 index 000000000..0335ddbcd --- /dev/null +++ b/.agents/skills/tilelang-debug-helper/references/mlir-dump-guide.md @@ -0,0 +1,13 @@ +# IR Dump Guide for npuir branch + +## Strategy + +- capture IR before and after major pass stages +- compare operation-level diffs +- correlate with failing runtime behavior + +## Common checkpoints + +- after lower entry +- after tilelangir pass application +- before backend codegen diff --git a/.agents/skills/tilelang-error-fixer/SKILL.md b/.agents/skills/tilelang-error-fixer/SKILL.md new file mode 100644 index 000000000..28a9c9a58 --- /dev/null +++ b/.agents/skills/tilelang-error-fixer/SKILL.md @@ -0,0 +1,62 @@ +--- +name: tilelang-error-fixer +description: TileLang npuir 错误诊断与修复技能。用户提及编译失败、运行错误、pass 异常、结果错误、性能回退、Core Dump、段错误、BishengIR 编译报错、sync 死锁、load/store 维度不一致时必须使用本技能。 +--- + +# TileLang Error Fixer (npuir) + +## Mandatory routing rule + +Before answering, follow AGENTS.md section "Docs Auto Routing Rules (Mandatory)". + +## Scope + +- compile errors in npuir path +- runtime failures and invalid results +- pass pipeline divergence +- performance regressions + +## Diagnosis workflow + +1. Confirm environment and target setting +2. Reproduce with smallest kernel +3. Classify issue type: compile, runtime, pass, precision, performance +4. Capture evidence: logs, IR snapshot, failing stage +5. Propose minimal patch and validate + +## NPUIR-specific checks + +- verify default vector API style uses v-prefix ops +- verify alias callsites are semantically equivalent +- verify load_nd2nz and store_fixpipe size/layout consistency +- verify sync_block_set and sync_block_wait pairing + +## Official docs to consult + +- docs/Tilelang算子调试指南.md +- docs/开发指南.md +- docs/developer/EnvironmentVariables.md +- docs/Tilelang.language/内存操作/T.load_nd2nz.md +- docs/Tilelang.language/内存操作/T.store_fixpipe.md + +## Output template + +## TileLang JIT Issue Report + +### Summary +- Symptom: +- Repro script: +- Impact: + +### Root Cause +- Layer: frontend or pass or codegen or runtime +- Fault pattern: + +### Fix +- Minimal change: +- Why this fixes it: + +### Verification +- Repro after fix: +- Numerical check: +- Regression risk: diff --git a/.agents/skills/tilelang-github-operations/SKILL.md b/.agents/skills/tilelang-github-operations/SKILL.md new file mode 100644 index 000000000..acc1b40be --- /dev/null +++ b/.agents/skills/tilelang-github-operations/SKILL.md @@ -0,0 +1,40 @@ +--- +name: tilelang-github-operations +description: TileLang npuir 分支 GitHub 工作流技能。用户提及 commit、push、PR、rebase、upstream、issue、GitHub Actions、gh CLI、分支同步时必须使用本技能。默认遵循 npuir 分支协作规范并提示 Issue 标题使用 [AscendNPU-IR] 或 [npuir] 前缀。 +--- + +# TileLang GitHub Operations Skill + +## Mandatory routing rule + +Before answering, follow AGENTS.md section "Docs Auto Routing Rules (Mandatory)". + +## Scope + +- branch sync and rebase workflow for npuir +- commit and push sequence +- pull request creation and readiness checks +- issue and PR metadata conventions + +## Workflow baseline + +1. Sync with upstream npuir +2. Run pre-PR format validation from repo root: bash format.sh --files changed_files +3. Commit focused changes +4. Push branch and create PR +5. Verify CI status and address feedback + +## Docs to consult first + +- docs/Tilelang-Ascend贡献指南.md +- docs/developer/EnvironmentVariables.md + +## References + +- references/pr-workflow.txt +- references/issue-template.txt + +## Related skills + +- tilelang-review-skill +- tilelang-error-fixer diff --git a/.agents/skills/tilelang-github-operations/references/issue-template.txt b/.agents/skills/tilelang-github-operations/references/issue-template.txt new file mode 100644 index 000000000..f75a8de5c --- /dev/null +++ b/.agents/skills/tilelang-github-operations/references/issue-template.txt @@ -0,0 +1,11 @@ +Issue template hints for npuir + +Title: +- Prefix with [npuir] or [AscendNPU-IR] + +Body: +- Environment and branch +- Minimal repro script +- Expected vs actual behavior +- Logs or IR snippet +- Impact scope and urgency diff --git a/.agents/skills/tilelang-github-operations/references/pr-workflow.txt b/.agents/skills/tilelang-github-operations/references/pr-workflow.txt new file mode 100644 index 000000000..93b9775ab --- /dev/null +++ b/.agents/skills/tilelang-github-operations/references/pr-workflow.txt @@ -0,0 +1,22 @@ +PR workflow for npuir + +1) Branch sync +- git fetch upstream +- git checkout +- git rebase upstream/npuir + +2) Local validation +- run focused tests for changed area +- run formatting/lint checks if needed +- run mandatory format validation from repo root: + bash format.sh --files changed_files + +3) Commit and push +- write clear commit message +- git push origin + +4) PR creation +- target branch: npuir +- PR title must start with [AscendNPU-IR] +- include repro, scope, and risk notes +- track CI and update quickly on failures diff --git a/.agents/skills/tilelang-mixcv-skill/SKILL.md b/.agents/skills/tilelang-mixcv-skill/SKILL.md new file mode 100644 index 000000000..29c566a76 --- /dev/null +++ b/.agents/skills/tilelang-mixcv-skill/SKILL.md @@ -0,0 +1,55 @@ +--- +name: tilelang-mixcv-skill +description: TileLang npuir 混合 Cube+Vector 算子开发技能。用户提及 flash attention、mixcv、online softmax、流水并行、sync_block_set/wait、Scope("Cube")+Scope("Vector")、PIPE_FIX、跨核 workspace 协同或融合算子性能调优时必须使用本技能。Developer 模式下,只要同一 kernel 同时包含 Cube 中的 T.gemm 与 Vector 中任意一个 v 前缀算子(如 vadd/vmul/vexp/vcast/vbrc),也必须触发本技能。 +--- + +# TileLang MixCV Skill + +## Mandatory routing rule + +Before answering, follow AGENTS.md section "Docs Auto Routing Rules (Mandatory)". + +## Operator baseline rule (Mandatory) + +- Before writing a new MixCV operator, first check examples/ and testing/npuir/. +- Prefer adapting an existing operator case rather than writing from scratch. + +## Focus + +- mixed kernels with both Cube and Vector stages +- staged producer-consumer synchronization +- flash-attention-like patterns + +## Developer mode identification rule (Mandatory) + +- In Developer mode, classify as MixCV when both conditions are true in the same kernel: + 1) Cube-side compute contains T.gemm. + 2) Vector-side compute contains at least one v-prefix op (for example T.vmul, T.vadd, T.vexp, T.vcast, T.vbrc). +- If both conditions hold, route to this skill even if the user does not explicitly say "mixcv". + +## Key primitives + +- T.Scope("Cube") and T.Scope("Vector") +- T.rs("PIPE_FIX") and other pipe regions +- T.sync_block_set and T.sync_block_wait +- Pipelined loops where suitable + +## References + +- references/pipeline.md +- references/flash-attn-pattern.md +- references/flash-attn-dev.md + +## Official docs to consult + +- docs/Tilelang.language/同步管道操作/T.sync_block_set.md +- docs/Tilelang.language/同步管道操作/T.sync_block_wait.md +- docs/Tilelang.language/同步管道操作/T.pipe_barrier.md +- docs/Tilelang.language/线性代数操作/T.gemm.md +- docs/Tilelang.language/数学操作/T.vexp.md + +## Related skills + +- tilelang-cube-skill +- tilelang-vector-skill +- tilelang-debug-helper diff --git a/.agents/skills/tilelang-mixcv-skill/references/flash-attn-dev.md b/.agents/skills/tilelang-mixcv-skill/references/flash-attn-dev.md new file mode 100644 index 000000000..28691c29f --- /dev/null +++ b/.agents/skills/tilelang-mixcv-skill/references/flash-attn-dev.md @@ -0,0 +1,17 @@ +# Flash Attention Pattern (Developer mode) + +## MixCV feature definition (Developer mode) + +- A kernel is treated as MixCV when it has: + - Cube-side T.gemm compute, and + - Vector-side at least one v-prefix op (for example T.vmul, T.vadd, T.vexp, T.vcast, T.vbrc). + +## Typical style + +- allocate with alloc_shared and alloc_fragment +- use v-prefix vector APIs for softmax math +- use Pipelined for staged loops when possible + +## Migration tip + +Start from Developer style for correctness, then migrate hot paths to Expert style blocks. diff --git a/.agents/skills/tilelang-mixcv-skill/references/flash-attn-pattern.md b/.agents/skills/tilelang-mixcv-skill/references/flash-attn-pattern.md new file mode 100644 index 000000000..0b321d574 --- /dev/null +++ b/.agents/skills/tilelang-mixcv-skill/references/flash-attn-pattern.md @@ -0,0 +1,20 @@ +# Flash Attention Pattern (Expert-leaning) + +## Stage A: score computation in Cube scope + +- tile Q and K +- compute score tiles with gemm +- emit intermediates to workspace +- signal readiness via sync_block_set + +## Stage B: softmax and accumulation in Vector scope + +- wait on sync_block_wait +- cast, scale, exp, reduce using v-prefix APIs +- update running max and running sum + +## Stage C: value accumulation and output + +- consume V tiles +- accumulate weighted outputs +- normalize and store final output diff --git a/.agents/skills/tilelang-mixcv-skill/references/pipeline.md b/.agents/skills/tilelang-mixcv-skill/references/pipeline.md new file mode 100644 index 000000000..42e11b401 --- /dev/null +++ b/.agents/skills/tilelang-mixcv-skill/references/pipeline.md @@ -0,0 +1,14 @@ +# MixCV Pipeline Pattern + +## Producer-consumer flow + +1. Cube stage writes intermediate workspace +2. sync_block_set marks stage completion +3. Vector stage waits with sync_block_wait +4. Vector stage consumes intermediate data + +## Good practice + +- keep sync id consistent between set and wait +- minimize workspace footprint by tiling +- isolate stage-specific memory buffers diff --git a/.agents/skills/tilelang-mlir-skill/SKILL.md b/.agents/skills/tilelang-mlir-skill/SKILL.md new file mode 100644 index 000000000..26ea16051 --- /dev/null +++ b/.agents/skills/tilelang-mlir-skill/SKILL.md @@ -0,0 +1,42 @@ +--- +name: tilelang-mlir-skill +description: TileLang npuir 的 TileLangIR 和 MLIR pass 工作流技能。用户提及 tilelangir、mlir、pass pipeline、cv_split、vectorize、IR dump、pass 前后对比、transform 调试、tilelangir-opt 或 BishengIR pass 失败时必须使用本技能。 +--- + +# TileLang MLIR Skill + +## Mandatory routing rule + +Before answering, follow AGENTS.md section "Docs Auto Routing Rules (Mandatory)". + +## What this skill handles + +- tilelangir pass understanding and usage +- pass pipeline composition and isolation +- mlir file level troubleshooting + +## Test baseline (Mandatory) + +- Prioritize examples/ and testing/npuir/ as the primary correctness baseline. +- Do not treat unittest/npuir/mlir_files as the primary validation source. + +## Known pass entry points + +- tilelang/tladapter/transforms/tilelangir.py +- pass names: tilelangir-cv-split and tilelangir-vectorize + +## References + +- references/mlir-test-guide.md +- references/tilelangir-pass.md +- references/tladapter-guide.md + +## Official docs to consult + +- docs/Tilelang算子调试指南.md +- docs/developer/EnvironmentVariables.md + +## Related skills + +- tilelang-debug-helper +- tilelang-error-fixer diff --git a/.agents/skills/tilelang-mlir-skill/references/mlir-test-guide.md b/.agents/skills/tilelang-mlir-skill/references/mlir-test-guide.md new file mode 100644 index 000000000..784a9b104 --- /dev/null +++ b/.agents/skills/tilelang-mlir-skill/references/mlir-test-guide.md @@ -0,0 +1,28 @@ +# MLIR Test Guide + +## Main location + +- examples/ +- testing/npuir/ + +## Deprecated location + +- unittest/npuir/mlir_files is deprecated and should not be used as the primary correctness baseline. + +## Suggested workflow + +1. Reproduce with a minimal kernel under examples/ first. +2. Validate operator behavior with tests under testing/npuir/ (CI-protected baseline). +3. Build or regenerate target MLIR for the failing case. +4. Compare IR structure before and after suspect passes. +5. Isolate the first failing transformation stage. + +## What to inspect + +- operation sequence +- region nesting +- data movement ops and sync ops + +## Optional auxiliary checks + +- testing/mlir/ can be used for MLIR lit-style checks as a supplementary signal. diff --git a/.agents/skills/tilelang-mlir-skill/references/tilelangir-pass.md b/.agents/skills/tilelang-mlir-skill/references/tilelangir-pass.md new file mode 100644 index 000000000..00d0fe9bd --- /dev/null +++ b/.agents/skills/tilelang-mlir-skill/references/tilelangir-pass.md @@ -0,0 +1,18 @@ +# TileLangIR Pass Notes + +## Current pass names + +- tilelangir-cv-split +- tilelangir-vectorize + +## Source locations + +- tilelangir/include/tilelangir/Transforms/Passes.td +- tilelangir/lib/Transforms/CVSplit.cpp +- tilelangir/lib/Transforms/Vectorize.cpp + +## Debug method + +- run pass one by one +- inspect IR after each pass +- narrow down first divergence point diff --git a/.agents/skills/tilelang-mlir-skill/references/tladapter-guide.md b/.agents/skills/tilelang-mlir-skill/references/tladapter-guide.md new file mode 100644 index 000000000..c00fed20a --- /dev/null +++ b/.agents/skills/tilelang-mlir-skill/references/tladapter-guide.md @@ -0,0 +1,13 @@ +# tladapter Guide + +## Key modules + +- tilelang/tladapter/transforms/mlir.py +- tilelang/tladapter/transforms/tilelangir.py +- tilelang/tladapter/utils.py + +## Practical usage + +- compose a small pass pipeline +- execute with minimal reproducible kernel +- dump intermediate representation between passes diff --git a/.agents/skills/tilelang-npuir-overview/SKILL.md b/.agents/skills/tilelang-npuir-overview/SKILL.md new file mode 100644 index 000000000..8a3f91a7a --- /dev/null +++ b/.agents/skills/tilelang-npuir-overview/SKILL.md @@ -0,0 +1,57 @@ +--- +name: tilelang-npuir-overview +description: TileLang npuir 分支总体架构与编译链路指南。用户提及 npuir 分支结构、target=npuir、编译流程、lower/codegen 链路、Developer/Expert 模式、tladapter、tilelangir、bishengir-compile、环境变量时必须使用本技能。 +--- + +# TileLang NPUIR Overview + +## What this skill provides + +- npuir branch architecture map +- compilation pipeline from Python DSL to NPUIR codegen +- mode selection guidance for Developer and Expert +- module role mapping for tilelangir and tladapter + +## Mandatory routing rule + +Before answering, follow AGENTS.md section "Docs Auto Routing Rules (Mandatory)". + +## Architecture map + +- Frontend DSL: tilelang/language +- JIT entry: tilelang/jit/jit_npu.py +- Lowering entry: tilelang/engine/lower.py +- Adapter layer: tilelang/tladapter +- MLIR dialect and passes: tilelangir +- Backend codegen (Expert mode): src/target/codegen_npuir_api.cc and src/target/codegen_npuir_api.h +- Backend codegen (Developer mode): src/target/codegen_npuir_dev.cc and src/target/codegen_npuir_dev.h +- Deprecated backend file: src/target/codegen_npuir.cc + +## Mode selection + +- Developer mode: concise implementation, compiler-managed behavior +- Expert mode: explicit Scope control and fine-grained memory/sync + +Common mode switch: +- os.environ["TILELANG_ASCEND_MODE"] = "Developer" + +## References to read on demand + +- references/arch.md +- references/compile-pipeline.md +- references/modes.md +- references/env-setup.md + +## Official docs to consult + +- docs/快速入门.md +- docs/开发指南.md +- docs/developer/EnvironmentVariables.md +- docs/developer/npu runtime.md + +## Related skills + +- tilelang-vector-skill +- tilelang-cube-skill +- tilelang-mixcv-skill +- tilelang-mlir-skill diff --git a/.agents/skills/tilelang-npuir-overview/references/arch.md b/.agents/skills/tilelang-npuir-overview/references/arch.md new file mode 100644 index 000000000..480534554 --- /dev/null +++ b/.agents/skills/tilelang-npuir-overview/references/arch.md @@ -0,0 +1,19 @@ +# npuir Architecture + +## Core modules + +- tilelang/language: DSL and API surface, including v-prefix aliases +- tilelang/jit/jit_npu.py: target=npuir JIT compile flow +- tilelang/engine/lower.py: high-level lowering pipeline +- tilelang/tladapter: adapter for transformation pipelines +- tilelangir: MLIR dialect, pass definitions, and opt tool +- src/target/codegen_npuir_api.cc and src/target/codegen_npuir_api.h: Expert mode NPUIR codegen implementation +- src/target/codegen_npuir_dev.cc and src/target/codegen_npuir_dev.h: Developer mode NPUIR codegen implementation +- src/target/codegen_npuir.cc: deprecated backend file + +## Key directories + +- tilelangir/include/tilelangir/Transforms/Passes.td +- tilelangir/lib/Transforms/CVSplit.cpp +- tilelangir/lib/Transforms/Vectorize.cpp +- tilelang/tladapter/transforms/tilelangir.py diff --git a/.agents/skills/tilelang-npuir-overview/references/compile-pipeline.md b/.agents/skills/tilelang-npuir-overview/references/compile-pipeline.md new file mode 100644 index 000000000..1054e09f5 --- /dev/null +++ b/.agents/skills/tilelang-npuir-overview/references/compile-pipeline.md @@ -0,0 +1,19 @@ +# NPUIR Compile Pipeline + +## End-to-end flow + +1. Python DSL kernel definition with @tilelang.jit(target="npuir") +2. Lowering through tilelang/engine/lower.py +3. Optional pass orchestration through tilelang/tladapter +4. TileLangIR and MLIR pass application +5. Backend codegen through: + - Expert mode: src/target/codegen_npuir_api.cc and src/target/codegen_npuir_api.h + - Developer mode: src/target/codegen_npuir_dev.cc and src/target/codegen_npuir_dev.h + - Deprecated: src/target/codegen_npuir.cc +6. Runtime launch integration via jit_npu workflow + +## Practical checks + +- Confirm target uses npuir +- Confirm vector ops prefer v-prefix aliases in generated examples +- Confirm pass failures with MLIR dump and pass isolation diff --git a/.agents/skills/tilelang-npuir-overview/references/env-setup.md b/.agents/skills/tilelang-npuir-overview/references/env-setup.md new file mode 100644 index 000000000..baa64febb --- /dev/null +++ b/.agents/skills/tilelang-npuir-overview/references/env-setup.md @@ -0,0 +1,16 @@ +# Environment Setup Notes + +## Recommended install entry + +Use install_npuir.sh for npuir workflow. + +## Basic verification + +- Python environment activated +- NPU toolchain paths available +- target set to npuir in JIT entry + +## Runtime hygiene + +- clear tilelang cache when validating kernel changes +- keep sample scripts minimal and reproducible diff --git a/.agents/skills/tilelang-npuir-overview/references/modes.md b/.agents/skills/tilelang-npuir-overview/references/modes.md new file mode 100644 index 000000000..38a5ec5fb --- /dev/null +++ b/.agents/skills/tilelang-npuir-overview/references/modes.md @@ -0,0 +1,22 @@ +# Developer and Expert Modes + +## Developer mode + +Use for faster implementation and easier maintenance. +Typical style: +- alloc_shared and alloc_fragment +- T.Parallel and reduce helpers +- concise kernel logic + +## Expert mode + +Use for fine control and performance tuning. +Typical style: +- explicit Scope blocks such as Scope("Cube") and Scope("Vector") +- explicit memory hierarchy and synchronization +- load_nd2nz and store_fixpipe in cube pipelines + +## Rule of thumb + +- Start with Developer mode for correctness +- Move hotspots to Expert mode after profiling diff --git a/.agents/skills/tilelang-review-skill/SKILL.md b/.agents/skills/tilelang-review-skill/SKILL.md new file mode 100644 index 000000000..833d72319 --- /dev/null +++ b/.agents/skills/tilelang-review-skill/SKILL.md @@ -0,0 +1,39 @@ +--- +name: tilelang-review-skill +description: TileLang npuir 代码审查与格式校验技能。用户提及 review、代码审查、PR 前检查、lint、format、ruff、clang-format、规范检查、CI 不通过时必须使用本技能。优先识别行为回归、数值风险、同步风险与测试缺口,其次才是风格问题。 +--- + +# TileLang Review Skill + +## Mandatory routing rule + +Before answering, follow AGENTS.md section "Docs Auto Routing Rules (Mandatory)". + +## Scope + +- pre-PR code review for npuir branch +- format and lint checks aligned with CI +- risk-focused review for correctness, performance, and synchronization + +## Review priorities + +1. Behavior regressions +2. Precision and dtype risks +3. Synchronization and pipeline hazards +4. Missing tests +5. Style and format consistency + +## Docs to consult first + +- docs/Tilelang-Ascend贡献指南.md +- docs/Tilelang算子调试指南.md +- docs/开发指南.md + +## References + +- references/checklist.txt + +## Related skills + +- tilelang-error-fixer +- tilelang-debug-helper diff --git a/.agents/skills/tilelang-review-skill/references/checklist.txt b/.agents/skills/tilelang-review-skill/references/checklist.txt new file mode 100644 index 000000000..65b741773 --- /dev/null +++ b/.agents/skills/tilelang-review-skill/references/checklist.txt @@ -0,0 +1,25 @@ +TileLang npuir review checklist + +1) API style +- New vector paths use v-prefix APIs by default. +- Legacy npuir_xxx usage is only for compatibility. + +2) Correctness +- Check boundary/tail logic for tiled loops. +- Check dtype and cast paths for numerical stability. +- Check sync_block_set/wait pairing in mixed pipelines. + +3) Performance +- Check redundant copies and excessive casts. +- Check tile size and loop ordering reasonableness. + +4) Tests +- Verify at least one focused repro or unit test exists. +- Prefer covering both normal and boundary tile shapes. +- Check whether implementation is based on existing patterns in examples/ and testing/npuir/. +- Flag scratch-built operator code when a close existing template is available. + +5) Formatting +- Keep CI style requirements satisfied. +- Before PR, run from repository root: + bash format.sh --files changed_files diff --git a/.agents/skills/tilelang-vector-skill/SKILL.md b/.agents/skills/tilelang-vector-skill/SKILL.md new file mode 100644 index 000000000..cca54e98b --- /dev/null +++ b/.agents/skills/tilelang-vector-skill/SKILL.md @@ -0,0 +1,53 @@ +--- +name: tilelang-vector-skill +description: TileLang npuir Vector 算子开发指南。用户提及逐元素、激活函数、归约、广播、sigmoid、rmsnorm、softmax 子流程、vadd/vmul/vexp/vcast/vbrc、向量精度或向量性能优化时必须使用本技能。默认输出必须优先采用 v 前缀 API,而非 npuir_xxx 形式。 +--- + +# TileLang Vector Skill (npuir) + +## Mandatory routing rule + +Before answering, follow AGENTS.md section "Docs Auto Routing Rules (Mandatory)". + +## Operator baseline rule (Mandatory) + +- Before writing a new vector operator, first check examples/ and testing/npuir/. +- Prefer adapting an existing operator case rather than writing from scratch. + +## API style policy + +Mandatory default style: +- Prefer T.vadd, T.vsub, T.vmul, T.vdiv +- Prefer T.vexp, T.vln, T.vsqrt, T.vrsqrt, T.vrelu, T.vsigmoid +- Prefer T.vcast, T.vbrc, T.vcmp, T.vselect + +Compatibility: +- T.npuir_add and friends are allowed only for compatibility with legacy code. + +## Core workflow + +1. Define shape and block strategy +2. Allocate UB or shared buffers based on mode +3. Copy in, compute with v-prefix APIs, copy out +4. Validate against torch reference + +## References + +- references/api-quickref.md +- references/examples.md +- references/troubleshooting.md + +## Official docs to consult + +- docs/Tilelang.language/数学操作/T.vadd.md +- docs/Tilelang.language/数学操作/T.vmul.md +- docs/Tilelang.language/数学操作/T.vexp.md +- docs/Tilelang.language/数据类型转换操作/T.vcast.md +- docs/Tilelang.language/shape操作/T.vbrc.md +- docs/Tilelang.language/规约操作/T.reduce.md + +## Related skills + +- tilelang-cube-skill +- tilelang-mixcv-skill +- tilelang-debug-helper diff --git a/.agents/skills/tilelang-vector-skill/references/api-quickref.md b/.agents/skills/tilelang-vector-skill/references/api-quickref.md new file mode 100644 index 000000000..8b53ef578 --- /dev/null +++ b/.agents/skills/tilelang-vector-skill/references/api-quickref.md @@ -0,0 +1,38 @@ +# Vector API Quick Reference (v-prefix first) + +## Binary ops + +- T.vadd(A, B, C) +- T.vsub(A, B, C) +- T.vmul(A, B, C) +- T.vdiv(A, B, C) +- T.vmax(A, B, C) +- T.vmin(A, B, C) + +## Unary ops + +- T.vexp(A, B) +- T.vln(A, B) +- T.vsqrt(A, B) +- T.vrsqrt(A, B) +- T.vabs(A, B) +- T.vrelu(A, B) +- T.vsigmoid(A, B) +- T.vtanh(A, B) +- T.verf(A, B) + +## Utility ops + +- T.vcast(src, dst, round_mode="rint") +- T.vbrc(value, dst) +- T.vcmp(a, b, dst, cmp_mode) +- T.vselect(mask, a, b, dst, mode) +- T.reduce(src, dst, dims=[...], reduce_mode="sum|max|min") + +## Compatibility mapping + +- T.vmul == T.npuir_mul +- T.vadd == T.npuir_add +- T.vexp == T.npuir_exp +- T.vcast == T.npuir_cast +- T.vbrc == T.npuir_brc diff --git a/.agents/skills/tilelang-vector-skill/references/examples.md b/.agents/skills/tilelang-vector-skill/references/examples.md new file mode 100644 index 000000000..55da53140 --- /dev/null +++ b/.agents/skills/tilelang-vector-skill/references/examples.md @@ -0,0 +1,18 @@ +# Vector Examples (v-prefix style) + +## Example pattern: elementwise add + +- copy input tiles to local buffer +- call T.vadd +- copy result back + +## Example pattern: normalization pieces + +- square: T.vmul(x, x, tmp) +- reduce: T.reduce(tmp, sum, dims=[1], reduce_mode="sum") +- scale and rsqrt: T.vdiv, T.vadd, T.vrsqrt +- finalize: T.vmul(x, inv_std, y) + +## Rule + +When both forms are valid, always generate the v-prefix form first. diff --git a/.agents/skills/tilelang-vector-skill/references/troubleshooting.md b/.agents/skills/tilelang-vector-skill/references/troubleshooting.md new file mode 100644 index 000000000..7531fcc16 --- /dev/null +++ b/.agents/skills/tilelang-vector-skill/references/troubleshooting.md @@ -0,0 +1,22 @@ +# Vector Troubleshooting + +## Symptom: wrong numerical result + +Checks: +- verify dtype for compute path and accumulation path +- verify tail handling on block boundaries +- verify vcast round mode when converting types + +## Symptom: compile-time op mismatch + +Checks: +- confirm API signature for v-prefix call +- confirm src and dst shapes are compatible +- confirm reduce dims and reduce mode are valid + +## Symptom: performance regression + +Checks: +- reduce redundant copy steps +- avoid unnecessary cast pairs +- profile block sizes and balance kernel launch granularity diff --git a/.github/workflows/ci_npuir.yml b/.github/workflows/ci_npuir.yml index 5becb718d..82df083d8 100644 --- a/.github/workflows/ci_npuir.yml +++ b/.github/workflows/ci_npuir.yml @@ -77,22 +77,23 @@ jobs: name: tilelang-npuir-py3.11-arm64 path: dist/ - - name: Install clang - run: | - apt-get update && apt-get install -y clang - - name: Install tilelang and test dependencies env: TORCH_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/whl/cpu" PYPI_CACHE_URL: "http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple" run: | CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" + sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list pip config set global.index-url http://${CACHING_URL}/pypi/simple pip config set global.trusted-host "${CACHING_URL}" pip install torch==2.7.1 -i ${TORCH_CACHE_URL} --extra-index-url ${PYPI_CACHE_URL} pip install torch-npu==2.7.1 pip install dist/*.whl pip install pytest pytest-xdist pytest-html numpy + + - name: Install clang + run: | + apt-get update && apt-get install -y clang - name: Prepare npuir test workspace run: | diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..972c28cad --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,127 @@ +# TileLang npuir Agent Guide + +This repository uses AGENTS skills for TileLang NPUIR development. + +## Scope + +The skills in .agents/skills are designed for target="npuir" workflows. +They prioritize v-prefix vector APIs such as vadd, vmul, vexp, vcast, vbrc. +Legacy npuir_xxx APIs remain valid as compatibility aliases. + +## API Convention (Mandatory) + +- Prefer v-prefix APIs in new examples and generated code. +- Keep compatibility with npuir_xxx when reading existing code. +- If both forms are available, output should default to v-prefix. + +Examples: +- Prefer: T.vmul(A, B, C) +- Compatible: T.npuir_mul(A, B, C) + +## Skill Index + +1. tilelang-npuir-overview +Purpose: architecture and compile pipeline for npuir branch. + +2. tilelang-vector-skill +Purpose: vector operator generation with v-prefix API style. + +3. tilelang-cube-skill +Purpose: cube operator generation with load_nd2nz and store_fixpipe. + +4. tilelang-mixcv-skill +Purpose: mixed Cube+Vector kernels such as flash attention pipelines. + +5. tilelang-mlir-skill +Purpose: TileLangIR and MLIR pass workflow and debugging. + +6. tilelang-debug-helper +Purpose: GDB + IR dump + pass-level debug workflow for npuir. + +7. tilelang-error-fixer +Purpose: diagnosis and repair workflow for compile/runtime/pass failures. + +8. tilelang-review-skill +Purpose: risk-first code review and CI-aligned format checks. + +9. tilelang-github-operations +Purpose: npuir branch commit/rebase/PR/issue workflow. + +## Trigger Guidance + +Use the matching skill whenever the user asks for: +- npuir kernel writing, performance tuning, vector math, cube gemm, mixed kernels +- pass debugging, IR dump, MLIR transform troubleshooting +- compile/runtime error analysis on npuir branch +- code review, lint/format checks, PR readiness +- commit/push/rebase/upstream sync, PR or issue workflow + +Developer-mode MixCV trigger rule: +- If one kernel contains Cube-side T.gemm and Vector-side at least one v-prefix op (such as T.vadd/T.vmul/T.vexp/T.vcast/T.vbrc), treat it as MixCV and use tilelang-mixcv-skill. + +## Operator Implementation Baseline (Mandatory) + +For operator-writing tasks, always start from existing examples and tests: + +- First consult examples/ and testing/npuir/ for the closest existing pattern. +- Prefer modifying an existing operator case instead of generating a brand-new kernel from scratch. +- If no close template exists, explicitly state that and then build the minimal new kernel. + +## Pre-PR Formatting Rule (Mandatory) + +Before creating or updating a PR, run format validation for changed files from repository root: + +- bash format.sh --files changed_files + +Notes: +- This is a required self-check for clean code and style consistency. +- The changed_files placeholder represents the file list modified in the current branch. + +## Docs Auto Routing Rules (Mandatory) + +When any skill answers technical questions, it must route references by docs directory first. + +### Routing Priority + +1. docs/Tilelang.language/ (API semantics and signatures) +2. docs/Tilelang算子调试指南.md (debug and issue localization) +3. docs/developer/ (runtime and environment variables) +4. docs/开发指南.md and docs/快速入门.md (workflow and onboarding) +5. docs/Tilelang-Ascend贡献指南.md (PR, issue, contribution process) + +### Keyword to Docs Mapping + +- Vector ops (vadd/vmul/vexp/vcast/vbrc/reduce/sigmoid/rmsnorm): + docs/Tilelang.language/数学操作/ + docs/Tilelang.language/数据类型转换操作/ + docs/Tilelang.language/shape操作/ + docs/Tilelang.language/规约操作/ + +- Cube ops (gemm/load_nd2nz/store_fixpipe/L1/L0C/NZ): + docs/Tilelang.language/线性代数操作/ + docs/Tilelang.language/内存操作/ + +- Pipeline and sync (sync_block_set/wait/pipe_barrier/set_flag/wait_flag): + docs/Tilelang.language/同步管道操作/ + +- Debug, compile failure, runtime failure, precision issue: + docs/Tilelang算子调试指南.md + docs/Tilelang.language/调试操作/ + +- MLIR, pass, tilelangir, bishengir-compile: + docs/Tilelang算子调试指南.md + docs/developer/EnvironmentVariables.md + +- Runtime target, mode switch, env setup: + docs/developer/npu runtime.md + docs/developer/EnvironmentVariables.md + docs/安装指南.md + +- PR, rebase, commit, issue, CI workflow: + docs/Tilelang-Ascend贡献指南.md + +### Conflict Resolution + +- If multiple mappings match, select by priority and keep at most 3 primary doc references. +- Always include at least 1 concrete API doc under docs/Tilelang.language/ when the question is API-related. +- If API docs and examples differ, API docs are source of truth and examples are secondary. diff --git a/pyproject.toml b/pyproject.toml index 3028508dd..c73e26067 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ column_limit = 100 indent_width = 4 [tool.codespell] -ignore-words-list = "nd, te, ist, LOD, offen, NotIn, HSA" +ignore-words-list = "nd, te, ist, LOD, offen, NotIn, HSA, cann" skip = [ "build", "3rdparty",