diff --git a/.claude/rules/pass-doc-ordering.md b/.claude/rules/pass-doc-ordering.md index 45e4e0b00..c0f455369 100644 --- a/.claude/rules/pass-doc-ordering.md +++ b/.claude/rules/pass-doc-ordering.md @@ -30,7 +30,7 @@ Developers read pass docs sequentially to understand the compilation pipeline. I | 15 | `15-flatten_tile_nd_to_2d.md` | 15th pass | | 16 | `16-auto_tile_matmul_l0.md` | 16th pass | | 17 | `17-infer_tile_memory_space.md` | 17th pass | -| 18 | `18-resolve_transpose_layout.md` | 18th pass | +| 18 | `18-lower_transpose_load_param_layout.md` | 18th pass (RFC #1300 P6 — replaces ResolveTransposeLayout) | | 19 | `19-resolve_backend_op_layouts.md` | 19th pass | | 20 | `20-expand_mixed_kernel.md` | 20th pass | | 21 | `21-inject_gm_pipe_buffer.md` | Runs immediately after `ExpandMixedKernel` (backend-gated, Ascend910B) | @@ -38,7 +38,7 @@ Developers read pass docs sequentially to understand the compilation pipeline. I | 23 | `23-normalize_return_order.md` | 23rd pass | | 24 | `24-lower_pipeline_loops.md` | 24th pass | | 25 | `25-canonicalize_io_order.md` | 25th pass | -| 26 | `26-materialize_tensor_strides.md` | 26th pass (RFC #1300 P3 — registered, not yet wired into Default; activates with P6/P7) | +| 26 | `26-materialize_tensor_strides.md` | 26th pass (RFC #1300 P3 — wired into Default starting from P6) | | 27 | `27-init_memref.md` | 27th pass | | 28 | `28-memory_reuse.md` | 28th pass | | 29 | `29-legalize_pto_buffer_reuse.md` | 29th pass | diff --git a/CMakeLists.txt b/CMakeLists.txt index bf83c5a7b..6515b61f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,7 +165,7 @@ set(PYPTO_SOURCES src/ir/transforms/pass_context.cpp src/ir/transforms/passes.cpp src/ir/transforms/resolve_backend_op_layouts_pass.cpp - src/ir/transforms/resolve_transpose_layout_pass.cpp + src/ir/transforms/lower_transpose_load_param_layout_pass.cpp src/ir/transforms/python_printer.cpp src/ir/transforms/simplify_pass.cpp src/ir/transforms/inject_gm_pipe_buffer_pass.cpp diff --git a/docs/en/dev/passes/00-pass_manager.md b/docs/en/dev/passes/00-pass_manager.md index e779a9003..e35573126 100644 --- a/docs/en/dev/passes/00-pass_manager.md +++ b/docs/en/dev/passes/00-pass_manager.md @@ -373,7 +373,7 @@ The PTO-oriented tile stage shared by `Default` and `DebugTileOptimization` is: 2. [`FlattenTileNdTo2D`](15-flatten_tile_nd_to_2d.md) 3. [`AutoTileMatmulL0`](16-auto_tile_matmul_l0.md) 4. `InferTileMemorySpace` -5. `ResolveTransposeLayout` +5. [`LowerTransposeLoadParamLayout`](18-lower_transpose_load_param_layout.md) (RFC #1300 P6 — replaces `ResolveTransposeLayout`) 6. [`ResolveBackendOpLayouts`](19-resolve_backend_op_layouts.md) 7. `NormalizeStmtStructure` 8. `ExpandMixedKernel` @@ -382,7 +382,7 @@ The PTO-oriented tile stage shared by `Default` and `DebugTileOptimization` is: 11. `NormalizeReturnOrder` 12. [`LowerPipelineLoops`](24-lower_pipeline_loops.md) 13. [`CanonicalizeIOOrder`](25-canonicalize_io_order.md) -14. [`MaterializeTensorStrides`](26-materialize_tensor_strides.md) — registered, not yet wired into the default pipeline (will activate alongside the codegen cleanup in RFC #1300 P6/P7) +14. [`MaterializeTensorStrides`](26-materialize_tensor_strides.md) — wired into the default pipeline starting from RFC #1300 P6 15. `InitMemRef` 16. `MemoryReuse` 17. 
[`LegalizePTOBufferReuse`](29-legalize_pto_buffer_reuse.md) diff --git a/docs/en/dev/passes/17-infer_tile_memory_space.md b/docs/en/dev/passes/17-infer_tile_memory_space.md index 62d11fade..2c67dac49 100644 --- a/docs/en/dev/passes/17-infer_tile_memory_space.md +++ b/docs/en/dev/passes/17-infer_tile_memory_space.md @@ -15,7 +15,7 @@ After this pass every `TileType` in InCore functions carries a concrete `memory_ - InCore / Orchestration outlining must be done (`SplitIncoreOrch`) - Statement structure must be normalized (`NormalizedStmtStructure`) -**When to use**: Run immediately after `FlattenTileNdTo2D` and before `ResolveTransposeLayout` / `ResolveBackendOpLayouts` / `ExpandMixedKernel`. It is the canonical point at which tile memory becomes a contract that downstream passes (especially `ExpandMixedKernel`'s mixed-kernel detection and `InitMemRef`'s buffer allocation) read. +**When to use**: Run immediately after `FlattenTileNdTo2D` and before `LowerTransposeLoadParamLayout` / `ResolveBackendOpLayouts` / `ExpandMixedKernel`. It is the canonical point at which tile memory becomes a contract that downstream passes (especially `ExpandMixedKernel`'s mixed-kernel detection and `InitMemRef`'s buffer allocation) read. ## API diff --git a/docs/en/dev/passes/18-lower_transpose_load_param_layout.md b/docs/en/dev/passes/18-lower_transpose_load_param_layout.md new file mode 100644 index 000000000..3272bca78 --- /dev/null +++ b/docs/en/dev/passes/18-lower_transpose_load_param_layout.md @@ -0,0 +1,200 @@ +# LowerTransposeLoadParamLayout Pass + +Lowers ``tile.load(..., transpose=True)`` to canonical-form DN parameter layout (RFC #1300 P6). + +## Overview + +Before this pass, ``tile.load(transpose=True)`` is the user's way of saying "I want +the column-major view of this source tensor at the load site". After this pass, that +intent is encoded into the InCore parameter's TensorType itself — the source/load +combo is rewritten to RFC #1300 §3.3 canonical form so codegen, verifier, and +downstream passes consume a single, self-consistent ``(shape, stride, layout)`` triple. + +For each InCore parameter ``p`` loaded via ``tile.load(p, ..., transpose=True)``: + +- ``p``'s TensorType is promoted from ``[..., a, b] ND`` to ``[..., b, a] DN`` — + the trailing-pair shape swap plus the DN layout tag. The new TensorView carries + an empty stride; ``MaterializeTensorStrides`` (which runs later in the default + pipeline, after ``CanonicalizeIOOrder``) fills it with the packed canonical + strides. +- Every ``tile.load(p, offsets, shapes, valid_shapes, ..., transpose=True)`` whose + source is a promoted parameter is rewritten so the three tuples' trailing pair + is swapped to canonical coords and the ``transpose=True`` kwarg is dropped. + ``DeduceTileLoadType`` reads the source's DN layout to derive the Mat tile-view + layout that the legacy ``transpose=True`` swap produced — the two signals are + equivalent (§4.2 canonical pair). +- Every non-InCore call site that targets a promoted callee wraps the promoted + argument in ``tensor.as_layout(arg, DN)`` (RFC #1300 P4). The bridging op is + pure metadata — it emits no PTOAS instruction; ``make_tensor_view`` consumes + the new view directly. 
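A minimal plain-Python sketch of the two rewrites above may help; it models shapes and coordinate tuples as lists (the real pass rewrites IR ``Var`` / ``TensorType`` / ``Call`` nodes, and the helper name here is hypothetical):

```python
def swap_trailing_pair(dims):
    """[..., a, b] -> [..., b, a]; the pass CHECKs rank >= 2."""
    assert len(dims) >= 2
    return dims[:-2] + [dims[-1], dims[-2]]

# Parameter promotion: [32, 128] ND -> [128, 32] DN, stride left empty
# for MaterializeTensorStrides to fill later in the pipeline.
promoted = {"shape": swap_trailing_pair([32, 128]), "layout": "DN", "stride": []}
assert promoted["shape"] == [128, 32]

# Load rewrite: offsets / shapes / valid_shapes all move to canonical
# (DN) coordinates, and the transpose=True kwarg is dropped.
offsets, shapes, valid_shapes = [0, 0], [32, 128], [32, 100]
offsets, shapes, valid_shapes = map(swap_trailing_pair, (offsets, shapes, valid_shapes))
assert (offsets, shapes, valid_shapes) == ([0, 0], [128, 32], [100, 32])
```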
+ +**Requirements**: + +- Input IR must be in SSA form +- InCore functions must already be split out (``SplitIncoreOrch``) +- Tile ops must be present and 2D (``IncoreTileOps``, ``TileOps2D``) +- Promoted parameters must have rank ≥ 2 + +**When to use**: 18th pass in the ``Default`` strategy, after +``InferTileMemorySpace`` and before ``ResolveBackendOpLayouts``. The 2D shape +produced by ``FlattenTileNdTo2D`` is a precondition. ``MaterializeTensorStrides`` +runs later in the pipeline (after ``CanonicalizeIOOrder``) to materialize the +DN-packed canonical strides on the promoted parameters. + +## API + +| C++ | Python | Level | +| --- | ------ | ----- | +| ``pass::LowerTransposeLoadParamLayout()`` | ``passes.lower_transpose_load_param_layout()`` | Program-level | + +**Python usage**: + +```python +from pypto.pypto_core import passes + +p = passes.lower_transpose_load_param_layout() +program_canonical = p(program) +``` + +## Algorithm + +```text +For each InCore function f: + scan body → set P_t = {param idx with tile.load(p, ..., transpose=True)} + set P_nt = {param idx with tile.load(p, ..., transpose=False/absent)} + reject P_t ∩ P_nt (mixed-use) + for each idx in P_t: + promote f.params[idx].type: [..., a, b] ND → [..., b, a] DN (empty stride) + substitute old Var → new Var in body + rewrite each tile.load(promoted_param, off, shp, vs, transpose=True) in body: + swap last two dims of off / shp / vs + drop transpose=True kwarg + +For each non-InCore function: + walk body; for every Call whose op is a GlobalVar of a promoted callee: + wrap each promoted-slot arg with tensor.as_layout(arg, DN) +``` + +**Complexity:** O(N log N) — one body walk per function plus one program-wide call-site +walk. Map lookups (``promotions_by_callee_name``) are ``log N`` per call. + +| Behavior | Trigger | +| -------- | ------- | +| Promote param to ``[..., b, a] DN`` | InCore param is source of ``tile.load(..., transpose=True)`` | +| Skip param | Already DN, or no transposed load | +| Skip whole function | Function is Orchestration / Opaque / Group | +| Wrap call-site arg in ``tensor.as_layout`` | Non-InCore call to a promoted callee | +| Reject | Mixed transpose=True / transpose=False on same param | +| Reject | DN + explicit physical stride source (would compose as double transpose) | + +## Example + +**Before**: + +```python +@pl.program +class Before: + @pl.function(type=pl.FunctionType.InCore) + def matmul_incore( + self, + a: pl.Tensor[[64, 128], pl.FP32], + b: pl.Tensor[[32, 128], pl.FP32], + c: pl.Out[pl.Tensor[[64, 32], pl.FP32]], + ) -> pl.Tensor[[64, 32], pl.FP32]: + tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) + tile_b = pl.load(b, [0, 0], [32, 128], target_memory=pl.MemorySpace.Mat, transpose=True) + ... + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator(self, a, b): + c = pl.create_tensor([64, 32], dtype=pl.FP32) + return self.matmul_incore(a, b, c) +``` + +**After** (semantic — ``tensor.as_layout`` is an internal IR op, not exposed in pl.*): + +```text +@pl.function(type=pl.FunctionType.InCore) +def matmul_incore( + self, + a: pl.Tensor[[64, 128], pl.FP32], + b: pl.Tensor[[128, 32], pl.FP32, pl.DN], # ← shape swapped + DN tag + c: pl.Out[pl.Tensor[[64, 32], pl.FP32]], +) -> pl.Tensor[[64, 32], pl.FP32]: + tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) + tile_b = pl.load(b, [0, 0], [128, 32], target_memory=pl.MemorySpace.Mat) + # ↑ no transpose kwarg + # ↑ shapes swapped to canonical coords + ... 
+ +@pl.function(type=pl.FunctionType.Orchestration) +def orchestrator(self, a, b): + c = pl.create_tensor([64, 32], dtype=pl.FP32) + # b is wrapped in tensor.as_layout to bridge ND → DN at the call site: + bridged_b = tensor.as_layout(b, pl.DN) # type: [128, 32] DN + return self.matmul_incore(a, bridged_b, c) +``` + +``a`` is loaded without transpose, so it is unchanged. ``b`` is promoted in the +InCore signature, all body loads of ``b`` are rewritten to canonical coords with +no transpose, and the orchestrator's call site wraps ``b`` in +``tensor.as_layout`` to bridge ``[32, 128] ND`` → ``[128, 32] DN`` over the same +physical buffer. + +## Implementation + +**Header**: ``include/pypto/ir/transforms/passes.h`` + +**Implementation**: ``src/ir/transforms/lower_transpose_load_param_layout_pass.cpp`` + +**Python binding**: ``python/bindings/modules/passes.cpp`` + +**Tests**: ``tests/ut/ir/transforms/test_lower_transpose_load_param_layout_pass.py`` + +## Pass Properties + +| Property | Value | +| -------- | ----- | +| Required | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | +| Produced | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | +| Invalidated | — | + +## Scope + +| Function type | Action | +| ------------- | ------ | +| InCore (InCore, AIC, AIV) | Scanned, possibly promoted | +| Orchestration / Group / Opaque | Scanned for call sites; promoted-arg wrapped in ``tensor.as_layout`` | + +| Parameter state | Action | +| --------------- | ------ | +| Sourced by ``tile.load(..., transpose=True)``, layout != DN, rank ≥ 2 | Promoted (shape swap + DN tag) | +| Sourced by ``tile.load(..., transpose=True)``, already DN | Idempotent — unchanged | +| Mixed transpose=True / transpose=False on same param | ``CHECK`` failure | +| Not sourced by any transposed load | Unchanged | +| Rank < 2 candidate | ``CHECK`` failure | + +## Interaction with ``tensor.as_layout`` (P4) and ``MaterializeTensorStrides`` (P3) + +This pass is the first real consumer of ``tensor.as_layout`` in the default +pipeline. The bridging op is single-purpose: it flips the layout tag and derives +the new shape from §4.2 canonical pair semantics — callers never write the +target shape, so the call-site rewriter cannot get it wrong. + +Downstream, ``MaterializeTensorStrides`` fills the empty stride slot on each +promoted parameter with the packed canonical DN strides (RFC §2.4). The +combination of P6 + P3 is what gives codegen a self-consistent +``(shape, stride, layout)`` triple — no further ``dn_swap`` / ``get_shape_source_idx`` +fix-ups are needed in the codegen path for promoted parameters. + +## Interaction with ``tensor.transpose`` at Orchestration + +A parameter whose source TensorView carries both ``layout = DN`` *and* an +explicit non-empty ``stride`` is the signature of a ``tensor.transpose`` result. +This pass rejects ``tile.load(transpose=True)`` on such parameters with a +``CHECK`` failure — the two encodings would compose as a double transpose at +codegen time and emit wrong addresses. Slice-derived inputs (explicit strides + +``layout = ND``, attached by ``OptimizeOrchTensors``) are unaffected. + +Workaround for the rejected case: drop one of the two transpose layers in the +source program. 
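For concreteness, a schematic sketch of the rejected composition (DSL spelling follows the examples above; the exact orchestration-level transpose signature is an assumption here):

```python
@pl.function(type=pl.FunctionType.Orchestration)
def orchestrator(self, a, b):
    # tensor.transpose result: layout=DN plus explicit physical strides.
    b_t = pl.transpose(b, -1, -2)
    c = pl.create_tensor([64, 32], dtype=pl.FP32)
    # CHECK failure if matmul_incore loads its `b` parameter with
    # transpose=True: the two transpose encodings would stack.
    return self.matmul_incore(a, b_t, c)
```

Keeping exactly one transpose layer resolves it: either drop ``pl.transpose`` and keep the transposed load, or keep ``pl.transpose`` and load without ``transpose=True``.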
diff --git a/docs/en/dev/passes/18-resolve_transpose_layout.md b/docs/en/dev/passes/18-resolve_transpose_layout.md deleted file mode 100644 index a5d04d705..000000000 --- a/docs/en/dev/passes/18-resolve_transpose_layout.md +++ /dev/null @@ -1,173 +0,0 @@ -# ResolveTransposeLayout Pass - -Annotates InCore tensor parameters that source `tile.load(..., transpose=True)` with the `DN` (column-major) layout. - -## Overview - -When a `tile.load` is issued with `transpose=True`, PTO codegen needs the source tensor to be materialized in column-major (`DN`) layout — the transpose is realized by the layout choice rather than by reshaping data. This pass propagates that layout requirement back from the load site to the function parameter type, so that downstream passes and codegen can rely on the parameter's `TensorType` as the single source of truth for layout. - -The pass annotates the parameter only — **shape is preserved**. `DN` is a layout/codegen hint; the logical tensor dimensions are not swapped. (This is the invariant enforced by the regression test for #606: a partial transpose load on `[128, 128]` must keep the parameter shape at `[128, 128]`, not the load-window shape.) - -**Requirements**: - -- Input IR must be in SSA form -- InCore functions must already be split out (`SplitIncoreOrch`) -- Tile ops must be present and 2D (`IncoreTileOps`, `TileOps2D`) -- Annotated tensor parameters must have rank ≥ 2 - -**When to use**: Run as the 15th pass in the `Default` strategy, after `InferTileMemorySpace` and before `ResolveBackendOpLayouts`. The 2D shape produced by `FlattenTileNdTo2D` is a precondition. - -## API - -| C++ | Python | Level | -| --- | ------ | ----- | -| `pass::ResolveTransposeLayout()` | `passes.resolve_transpose_layout()` | Program-level | - -**Python usage**: - -```python -from pypto.pypto_core import passes - -resolve_pass = passes.resolve_transpose_layout() -program_dn = resolve_pass(program) -``` - -## Algorithm - -For each function in the program: - -1. **Skip non-InCore functions**: Orchestration and Opaque functions are returned unchanged. Only InCore-type functions (InCore, AIC, AIV) are processed. -2. **Scan body for transposed loads**: walk the function body and collect, for each `tile.load` call whose kwarg `transpose=True` and whose first argument is one of the function's parameters, the index of that parameter. Duplicates across multiple load sites are deduplicated. -3. **Rewrite parameters**: for each collected parameter: - - **Skip if already DN**: if the parameter's `TensorType` already carries `TensorView{layout=DN}`, no rewrite is needed (idempotent). - - **Require rank ≥ 2**: a 1D tensor cannot meaningfully be column-major; the pass aborts with a `CHECK` if it sees one. - - Build a new `Var` with the same `name_hint`, span, and shape, but with a new `TensorType` whose `tensor_view_` is `TensorView({}, TensorLayout::DN)`. -4. **Substitute**: rewrite all uses of the old `Var` inside the function body via `Substitute`, then rebuild the function via `MutableCopy` with the new parameter list and body. - -No Orchestration-side rewrite happens. Downstream passes and codegen consume the InCore signature as the layout source of truth. 
- -| Behavior | Trigger | -| -------- | ------- | -| Annotate param with `DN` | InCore function param is the source of `tile.load(..., transpose=True)` | -| Skip param | Already `DN`, or no transposed load reaches it | -| Skip whole function | Function is Orchestration or Opaque | -| `CHECK` failure | Annotated param is not a `TensorType`, or rank < 2 | - -## Example - -**Before**: - -```python -@pl.program -class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[64, 128], pl.FP32], - b: pl.Tensor[[32, 128], pl.FP32], - c: pl.Out[pl.Tensor[[64, 32], pl.FP32]], - ) -> pl.Tensor[[64, 32], pl.FP32]: - tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [32, 128], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[64, 128], pl.FP32], b: pl.Tensor[[32, 128], pl.FP32] - ) -> pl.Tensor[[64, 32], pl.FP32]: - c: pl.Tensor[[64, 32], pl.FP32] = pl.create_tensor([64, 32], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result -``` - -**After**: - -```python -@pl.program -class After: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[64, 128], pl.FP32], - b: pl.Tensor[[32, 128], pl.FP32, pl.DN], - c: pl.Out[pl.Tensor[[64, 32], pl.FP32]], - ) -> pl.Tensor[[64, 32], pl.FP32]: - tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [32, 128], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[64, 128], pl.FP32], b: pl.Tensor[[32, 128], pl.FP32] - ) -> pl.Tensor[[64, 32], pl.FP32]: - c: pl.Tensor[[64, 32], pl.FP32] = pl.create_tensor([64, 32], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result -``` - -`b` is the source of a `tile.load` with `transpose=True`, so the InCore parameter type gains the `pl.DN` layout annotation. The shape `[32, 128]` is unchanged. `a` is loaded without transpose, so it is left alone. The Orchestration `orchestrator` signature is **not** rewritten. - -## Implementation - -**Header**: `include/pypto/ir/transforms/passes.h` - -**Implementation**: `src/ir/transforms/resolve_transpose_layout_pass.cpp` - -**Python binding**: `python/bindings/modules/passes.cpp` - -**Tests**: `tests/ut/ir/transforms/test_resolve_transpose_layout_pass.py` - -## Pass Properties - -| Property | Value | -| -------- | ----- | -| Required | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | -| Produced | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | -| Invalidated | — | - -The pass preserves all input properties: it only rewrites tensor parameter type annotations, not statement structure or SSA form. 
- -## Scope - -| Function type | Action | -| ------------- | ------ | -| InCore (InCore, AIC, AIV) | Scanned and possibly rewritten | -| Orchestration | Unchanged | -| Opaque | Unchanged | - -| Parameter state | Action | -| --------------- | ------ | -| Sourced by `tile.load(..., transpose=True)`, layout != DN, rank ≥ 2 | Rewritten to add `DN` | -| Sourced by `tile.load(..., transpose=True)`, layout already DN | Unchanged (idempotent) | -| Not sourced by any transposed load | Unchanged | -| Rank < 2 candidate | `CHECK` failure | - -The pass is a no-op when no InCore function contains a `tile.load(..., transpose=True)` whose source is a parameter (verified by the `TestResolveTransposeLayoutNoOp` test class). - -## Interaction with `tensor.transpose` at Orchestration - -`tensor.transpose` at the Orchestration layer (issue #1209) records two complementary pieces of information on its result type (in `DeduceTensorTransposeType`): - -1. **Layout tag.** For canonical trailing-two-dim swaps, `layout` toggles between `ND` and `DN`. PTOAS reads this tag to validate kernel boundaries. -2. **Explicit physical strides.** The runtime `Tensor::transpose` is a metadata-only swap of `shapes` / `offsets` — the underlying GM bytes stay in the source's row-major layout. So the post-transpose view's physical strides are the input's strides reordered at `(axis1, axis2)`. `DeduceTensorTransposeType` builds row-major strides over the input shape (`MakeIndexMul` collapses ConstInt chains, so static shapes get plain ConstInt strides; dynamic shapes get symbolic ones) and swaps them at the same axes as the shape. Non-trailing transposes are also supported via this path — they keep `layout = ND` and rely solely on the strides. - -Codegen disambiguates the two callers of the DN tag by checking `tensor_view_->stride`: - -| Source of DN | `stride` | `EmitMakeTensorViews` / partition emit | -| ------------ | -------- | -------------------------------------- | -| This pass (`tile.load(transpose=True)`) | empty | implicit "swap last two dims" — emit `[N, M]` shape with `[1, M]` strides over the IR shape `[M, N]` | -| `tensor.transpose` | non-empty | skip the implicit swap — emit IR shape directly with the recorded strides | - -The codegen rule above lets *non-transposed* downstream consumers (e.g. `tile.load(..., transpose=False)`) read a DN-tagged `tensor.transpose` result via the explicit-stride path without re-applying the implicit shape swap. - -**`tile.load(transpose=True)` directly on a `tensor.transpose` result is not yet supported.** That specific combination — an InCore param that arrives carrying both explicit physical strides AND `layout = DN` (the unique signature of a `tensor.transpose` result) — is rejected by this pass with a `CHECK` failure, because the two encodings would compose as a double transpose at codegen time and emit wrong addresses. Slice-derived inputs (explicit strides + `layout = ND`, attached by `OptimizeOrchTensors`) are unaffected and continue to flow through the standard "promote ND → DN, drop strides" path used by patterns like paged_attention's matmul B^T. Reconciling explicit physical strides with the pass's DN-only output convention requires a separate design — see follow-up tracked from issue #1209. Workaround for the rejected case: do the transpose at the tile level via `tile.load(transpose=True)` directly on the source tensor instead of via orchestration `tensor.transpose`. 
- -**Vec target is supported on a5 only.** Cross-layout `TLOAD(VecTile_RowMajor, GlobalTensor)` is rejected by PTOAS on a2a3 (the static assertion is `TLOAD(VecTile, GlobalTensor) only support ND2ND/DN2DN/NZ2NZ`), and `tile.load` further restricts `transpose=True` to `target_memory=Mat`. a5 (and the `a5sim` simulator) lifts this restriction, so a `pl.transpose` followed by a Vec-target consumer (e.g. `pl.slice` inside a non-matmul `pl.at(level=CORE_GROUP)` block) compiles and runs correctly there. The regression test `tests/st/runtime/test_trans.py::test_transpose_slice_assemble[a5sim]` covers exactly this case. On a2a3, the same DSL pattern compiles to correct IR/.pto but fails at the kernel C++ stage; workarounds are unchanged from before — route the load through a Mat tile (matmul-style), perform an explicit `tile.transpose` after the load, or materialize a contiguous transposed copy at orchestration. diff --git a/docs/en/dev/passes/19-resolve_backend_op_layouts.md b/docs/en/dev/passes/19-resolve_backend_op_layouts.md index c6cc3b979..03047d91d 100644 --- a/docs/en/dev/passes/19-resolve_backend_op_layouts.md +++ b/docs/en/dev/passes/19-resolve_backend_op_layouts.md @@ -1,10 +1,10 @@ # ResolveBackendOpLayouts Pass -Repairs backend-required tile layouts for elementwise ops. `[N, 1]` col-major vectors are reshaped into `[1, N]` row-major views, while general non-row-major tiles are coerced through `tile.move(..., blayout=row_major)`. Runs in the tile-PTO stage between `ResolveTransposeLayout` and the trailing `NormalizeStmtStructure`. +Repairs backend-required tile layouts for elementwise ops. `[N, 1]` col-major vectors are reshaped into `[1, N]` row-major views, while general non-row-major tiles are coerced through `tile.move(..., blayout=row_major)`. Runs in the tile-PTO stage between `LowerTransposeLoadParamLayout` and the trailing `NormalizeStmtStructure`. ## Overview -After `FlattenTileNdTo2D` and `ResolveTransposeLayout`, every tile op is in 2-D form with a known layout. Several PTO elementwise ops (registered in `src/backend/common/pto_ops_common.cpp`) require their tile operands and result to be `row_major`. This pass repairs those local violations at the consumer: +After `FlattenTileNdTo2D` and `LowerTransposeLoadParamLayout`, every tile op is in 2-D form with a known layout. Several PTO elementwise ops (registered in `src/backend/common/pto_ops_common.cpp`) require their tile operands and result to be `row_major`. This pass repairs those local violations at the consumer: 1. For each `AssignStmt` / `EvalStmt` whose RHS is a `Call`, query `Backend::GetTileLayoutSpec(op_name)`. 2. Skip if no spec is registered, or if all constrained tile inputs and output already use `row_major`. @@ -20,7 +20,7 @@ The pass is **backend-driven**: the set of constrained ops and their per-input r - Function must be `InCore` — Orchestration / Group functions are skipped. - A backend must be configured via `BackendConfig::Set(...)`. Otherwise the pass is a no-op. -**When to use**: As part of the `Default` tile-PTO pipeline, after layout-altering passes (`FlattenTileNdTo2D`, `InferTileMemorySpace`, `ResolveTransposeLayout`) and before `NormalizeStmtStructure`. The pass manager already places it in the correct slot. +**When to use**: As part of the `Default` tile-PTO pipeline, after layout-altering passes (`FlattenTileNdTo2D`, `InferTileMemorySpace`, `LowerTransposeLoadParamLayout`) and before `NormalizeStmtStructure`. The pass manager already places it in the correct slot. 
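A hedged plain-Python model of the repair choice described above (hypothetical names and data representation; the real pass queries `Backend::GetTileLayoutSpec` and rewrites IR at the consumer):

```python
def repair_plan(shape, layout):
    """Pick the repair for one constrained 2-D tile operand."""
    if layout == "row_major":
        return "keep"  # constraint already satisfied
    if shape[-1] == 1:
        # [N, 1] col-major vector: a free reshape to a [1, N] row-major view
        return "reshape"
    return "tile.move(..., blayout=row_major)"  # general coercion via copy

assert repair_plan([8, 128], "row_major") == "keep"
assert repair_plan([128, 1], "col_major") == "reshape"
assert repair_plan([64, 64], "col_major") == "tile.move(..., blayout=row_major)"
```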
## API diff --git a/docs/en/dev/passes/26-materialize_tensor_strides.md b/docs/en/dev/passes/26-materialize_tensor_strides.md index ff6d56956..71e6e3988 100644 --- a/docs/en/dev/passes/26-materialize_tensor_strides.md +++ b/docs/en/dev/passes/26-materialize_tensor_strides.md @@ -21,7 +21,7 @@ Codegen needs one machine-readable contract, so `MaterializeTensorStrides` walks - `TensorViewCanonical` — `PassPipeline` auto-verifies after the pass (using the registry's weak-mode verifier) -**When this lands in the default pipeline**: between [`CanonicalizeIOOrder`](25-canonicalize_io_order.md) and [`InitMemRef`](27-init_memref.md). This is the codegen-prep boundary — every layout-mutating pass (`ResolveTransposeLayout`, `ResolveBackendOpLayouts`, `ExpandMixedKernel`, `SplitVectorKernel`) has finished, and `InitMemRef` is the first consumer that needs explicit stride. +**Position in the default pipeline** (active since RFC #1300 P6): between [`CanonicalizeIOOrder`](25-canonicalize_io_order.md) and [`InitMemRef`](27-init_memref.md). This is the codegen-prep boundary — every layout-mutating pass (`LowerTransposeLoadParamLayout`, `ResolveBackendOpLayouts`, `ExpandMixedKernel`, `SplitVectorKernel`) has finished, and `InitMemRef` is the first consumer that needs explicit stride. ## API @@ -112,6 +112,6 @@ Because the pass declares `produced = {... ∪ TensorViewCanonical}`, `PassPipel - [`CanonicalizeIOOrder`](25-canonicalize_io_order.md) — runs immediately before; produces the program state the materialization consumes - [`InitMemRef`](27-init_memref.md) — first downstream consumer that depends on explicit stride -- [`ResolveTransposeLayout`](17-resolve_transpose_layout.md) — current source of empty-stride DN views (will be replaced by `LowerTransposeLoadParamLayout` in P6) +- [`LowerTransposeLoadParamLayout`](18-lower_transpose_load_param_layout.md) — primary source of empty-stride DN views in the default pipeline (active since RFC #1300 P6) - [`tensor_view_semantics.h`](../../../../include/pypto/ir/transforms/utils/tensor_view_semantics.h) — the helpers (`BuildLogicalStridesFromLayout`, `CheckCanonicalView`, `CanonicalizeView`) - RFC [#1300](https://github.com/hw-native-sys/pypto/issues/1300) — Self-consistent IR TensorType layout representation diff --git a/docs/en/user/01-language_guide.md b/docs/en/user/01-language_guide.md index 57c58dc78..642130da5 100644 --- a/docs/en/user/01-language_guide.md +++ b/docs/en/user/01-language_guide.md @@ -558,7 +558,7 @@ The `Default` strategy runs these passes in order: 10. **ConvertTensorToTileOps** — convert tensor operations to tile operations 11. **FlattenTileNdTo2D** — normalize ND tile ops to 2D 12. **InferTileMemorySpace** — infer tile memory spaces -13. **ResolveTransposeLayout** — repair transpose layout handling +13. **LowerTransposeLoadParamLayout** — lower `tile.load(transpose=True)` to canonical-form DN parameter layout 14. **ResolveBackendOpLayouts** — repair backend-constrained tile layouts 15. **ExpandMixedKernel** — split mixed kernels when needed 16. **InitMemRef** — assign memory spaces and insert buffer allocations diff --git a/docs/zh-cn/dev/passes/00-pass_manager.md b/docs/zh-cn/dev/passes/00-pass_manager.md index 9640ce8d9..48c360f9a 100644 --- a/docs/zh-cn/dev/passes/00-pass_manager.md +++ b/docs/zh-cn/dev/passes/00-pass_manager.md @@ -373,7 +373,7 @@ with passes.PassContext([passes.VerificationInstrument(passes.VerificationMode.A 2. [`FlattenTileNdTo2D`](15-flatten_tile_nd_to_2d.md) 3. [`AutoTileMatmulL0`](16-auto_tile_matmul_l0.md) 4. `InferTileMemorySpace` -5. 
`ResolveTransposeLayout` +5. [`LowerTransposeLoadParamLayout`](18-lower_transpose_load_param_layout.md)(RFC #1300 P6 —— 替代 `ResolveTransposeLayout`) 6. [`ResolveBackendOpLayouts`](19-resolve_backend_op_layouts.md) 7. `NormalizeStmtStructure` 8. `ExpandMixedKernel` @@ -382,7 +382,7 @@ with passes.PassContext([passes.VerificationInstrument(passes.VerificationMode.A 11. `NormalizeReturnOrder` 12. [`LowerPipelineLoops`](24-lower_pipeline_loops.md) 13. [`CanonicalizeIOOrder`](25-canonicalize_io_order.md) -14. [`MaterializeTensorStrides`](26-materialize_tensor_strides.md) —— 已注册但尚未接入默认 pipeline(将随 RFC #1300 P6/P7 的 codegen 清理一起启用) +14. [`MaterializeTensorStrides`](26-materialize_tensor_strides.md) —— 自 RFC #1300 P6 起接入默认 pipeline 15. `InitMemRef` 16. `MemoryReuse` 17. [`LegalizePTOBufferReuse`](29-legalize_pto_buffer_reuse.md) diff --git a/docs/zh-cn/dev/passes/17-infer_tile_memory_space.md b/docs/zh-cn/dev/passes/17-infer_tile_memory_space.md index 78cc118b0..a79d7007d 100644 --- a/docs/zh-cn/dev/passes/17-infer_tile_memory_space.md +++ b/docs/zh-cn/dev/passes/17-infer_tile_memory_space.md @@ -15,7 +15,7 @@ - InCore / Orchestration 拆分必须已完成(`SplitIncoreOrch`) - 语句结构必须已规范化(`NormalizedStmtStructure`) -**使用时机**:紧接 `FlattenTileNdTo2D` 之后运行,先于 `ResolveTransposeLayout` / `ResolveBackendOpLayouts` / `ExpandMixedKernel`。它是 tile memory 成为下游契约的标准时点——尤其是 `ExpandMixedKernel` 的混合 kernel 检测和 `InitMemRef` 的缓冲区分配都直接读取该结果。 +**使用时机**:紧接 `FlattenTileNdTo2D` 之后运行,先于 `LowerTransposeLoadParamLayout` / `ResolveBackendOpLayouts` / `ExpandMixedKernel`。它是 tile memory 成为下游契约的标准时点——尤其是 `ExpandMixedKernel` 的混合 kernel 检测和 `InitMemRef` 的缓冲区分配都直接读取该结果。 ## API diff --git a/docs/zh-cn/dev/passes/18-lower_transpose_load_param_layout.md b/docs/zh-cn/dev/passes/18-lower_transpose_load_param_layout.md new file mode 100644 index 000000000..c3c65790a --- /dev/null +++ b/docs/zh-cn/dev/passes/18-lower_transpose_load_param_layout.md @@ -0,0 +1,162 @@ +# LowerTransposeLoadParamLayout Pass + +将 `tile.load(..., transpose=True)` 下沉为 canonical 形式的 DN 参数布局(RFC #1300 P6)。 + +## 概述 + +本 Pass 之前,`tile.load(transpose=True)` 是用户表达"我希望在 load 站点看到源张量的列主序视图"的方式。Pass 之后,这一意图被编码进 InCore 参数的 TensorType 本身 —— 源张量/load 组合被改写为 RFC #1300 §3.3 的 canonical 形式,使 codegen、verifier、下游 Pass 看到一份自洽的 `(shape, stride, layout)` 三元组。 + +对每个被 `tile.load(p, ..., transpose=True)` 加载的 InCore 参数 `p`: + +- `p` 的 TensorType 从 `[..., a, b] ND` 提升为 `[..., b, a] DN` —— 末两维形状互换 + DN 布局标签。新 TensorView 的 stride 为空;`MaterializeTensorStrides`(在默认 pipeline 中位于 `CanonicalizeIOOrder` 之后运行)会把它填为 packed canonical 的 stride。 +- 每个 `tile.load(p, offsets, shapes, valid_shapes, ..., transpose=True)`(源是已提升的参数)被改写为:三个 tuple 的末两维互换以匹配 canonical 坐标,丢弃 `transpose=True` kwarg。`DeduceTileLoadType` 通过源张量的 DN 布局推导出 Mat tile-view 的 layout —— 这两种信号在 §4.2 canonical pair 下是等价的。 +- 每个目标是已提升 callee 的非 InCore 函数调用站点,会把对应实参用 `tensor.as_layout(arg, DN)` 包一层(RFC #1300 P4)。该桥接 op 是纯元数据 —— 不生成 PTOAS 指令;`make_tensor_view` 直接消费新视图。 + +**前置条件**: + +- 输入 IR 必须为 SSA 形式 +- InCore 函数已完成拆分(`SplitIncoreOrch`) +- Tile op 已存在且为 2D(`IncoreTileOps`、`TileOps2D`) +- 被提升的参数 rank ≥ 2 + +**使用时机**:在 `Default` 策略中作为第 18 个 Pass 运行(文档编号 18 对应于 docs/passes/ 中的执行顺序槽位,与 pass_manager.py 中的相对顺序匹配),位于 `InferTileMemorySpace` 之后、`ResolveBackendOpLayouts` 之前。`FlattenTileNdTo2D` 产生的 2D 形状是前置条件。`MaterializeTensorStrides` 在 pipeline 后段运行(在 `CanonicalizeIOOrder` 之后)以物化 DN-packed canonical stride。 + +## API + +| C++ | Python | 级别 | +| --- | ------ | ---- | +| `pass::LowerTransposeLoadParamLayout()` | `passes.lower_transpose_load_param_layout()` | 
Program 级 | + +**Python 用法**: + +```python +from pypto.pypto_core import passes + +p = passes.lower_transpose_load_param_layout() +program_canonical = p(program) +``` + +## 算法 + +```text +对每个 InCore 函数 f: + 扫描 body → 得到 P_t = {tile.load(p, ..., transpose=True) 命中的 param 索引} + 得到 P_nt = {tile.load(p, ..., transpose=False/缺省) 命中的 param 索引} + 拒绝 P_t ∩ P_nt (混用) + 对每个 idx in P_t: + 提升 f.params[idx].type:[..., a, b] ND → [..., b, a] DN(stride 留空) + 在 body 中以新 Var 替换旧 Var + 改写 body 中每个 tile.load(promoted_param, off, shp, vs, transpose=True): + 交换 off / shp / vs 末两维 + 丢弃 transpose=True kwarg + +对每个非 InCore 函数: + 遍历 body;对每个 op 为已提升 callee 的 GlobalVar 的 Call: + 给每个已提升槽位的实参包一层 tensor.as_layout(arg, DN) +``` + +**复杂度:** O(N log N) —— 每个函数一次 body 走查,加一次全程序级调用站点走查。Map 查找(`promotions_by_callee_name`)为每次调用 `log N`。 + +| 行为 | 触发条件 | +| ---- | -------- | +| 提升参数到 `[..., b, a] DN` | InCore 参数是 `tile.load(..., transpose=True)` 的源 | +| 跳过参数 | 已经是 DN,或没有转置 load | +| 整个函数跳过 | 函数为 Orchestration / Opaque / Group | +| 调用站点 wrap `tensor.as_layout` | 非 InCore 函数调用已提升 callee | +| 拒绝 | 同一参数既被 transpose=True 也被 transpose=False 加载 | +| 拒绝 | DN + 显式物理 stride 源(与 tile.load 转置会叠成双重转置) | + +## 示例 + +**前**: + +```python +@pl.program +class Before: + @pl.function(type=pl.FunctionType.InCore) + def matmul_incore( + self, + a: pl.Tensor[[64, 128], pl.FP32], + b: pl.Tensor[[32, 128], pl.FP32], + c: pl.Out[pl.Tensor[[64, 32], pl.FP32]], + ) -> pl.Tensor[[64, 32], pl.FP32]: + tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) + tile_b = pl.load(b, [0, 0], [32, 128], target_memory=pl.MemorySpace.Mat, transpose=True) + ... + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator(self, a, b): + c = pl.create_tensor([64, 32], dtype=pl.FP32) + return self.matmul_incore(a, b, c) +``` + +**后**(语义层面 —— `tensor.as_layout` 是内部 IR op,不在 pl.* 暴露): + +```text +@pl.function(type=pl.FunctionType.InCore) +def matmul_incore( + self, + a: pl.Tensor[[64, 128], pl.FP32], + b: pl.Tensor[[128, 32], pl.FP32, pl.DN], # ← 形状互换 + DN 标签 + c: pl.Out[pl.Tensor[[64, 32], pl.FP32]], +) -> pl.Tensor[[64, 32], pl.FP32]: + tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) + tile_b = pl.load(b, [0, 0], [128, 32], target_memory=pl.MemorySpace.Mat) + # ↑ 没有 transpose kwarg + # ↑ shapes 已互换到 canonical 坐标 + ... 
+ +@pl.function(type=pl.FunctionType.Orchestration) +def orchestrator(self, a, b): + c = pl.create_tensor([64, 32], dtype=pl.FP32) + # b 在调用站点被 tensor.as_layout 包一层做 ND → DN 桥接: + bridged_b = tensor.as_layout(b, pl.DN) # type: [128, 32] DN + return self.matmul_incore(a, bridged_b, c) +``` + +`a` 不转置加载,原样保留。`b` 在 InCore 签名被提升,body 中所有对 `b` 的加载改写到 canonical 坐标且无转置 kwarg,orchestrator 调用站点把 `b` 用 `tensor.as_layout` 包起来,把 `[32, 128] ND` 桥接到 `[128, 32] DN`(同一片物理内存)。 + +## 实现 + +**头文件**:`include/pypto/ir/transforms/passes.h` + +**实现**:`src/ir/transforms/lower_transpose_load_param_layout_pass.cpp` + +**Python 绑定**:`python/bindings/modules/passes.cpp` + +**测试**:`tests/ut/ir/transforms/test_lower_transpose_load_param_layout_pass.py` + +## Pass 属性 + +| 属性 | 值 | +| ---- | -- | +| 必需 | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | +| 产出 | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | +| 失效 | — | + +## 范围 + +| 函数类型 | 行为 | +| -------- | ---- | +| InCore(InCore、AIC、AIV) | 扫描,可能被提升 | +| Orchestration / Group / Opaque | 扫描调用站点;已提升实参 wrap `tensor.as_layout` | + +| 参数状态 | 行为 | +| -------- | ---- | +| 是 `tile.load(..., transpose=True)` 的源,layout != DN,rank ≥ 2 | 提升(形状互换 + DN 标签) | +| 是 `tile.load(..., transpose=True)` 的源,已是 DN | 幂等 —— 保持不变 | +| 同一参数既 transpose=True 又 transpose=False | `CHECK` 失败 | +| 没有转置 load 引用 | 保持不变 | +| Rank < 2 候选 | `CHECK` 失败 | + +## 与 `tensor.as_layout`(P4)和 `MaterializeTensorStrides`(P3)的交互 + +本 Pass 是默认 pipeline 中 `tensor.as_layout` 的第一个真实消费者。该桥接 op 单一职责:翻转 layout 标签,目标 shape 由 §4.2 canonical pair 机械导出 —— 调用方不传 target shape,所以调用站点改写器不会出错。 + +下游的 `MaterializeTensorStrides` 把每个被提升的参数 TensorView 空 stride 填为 packed canonical DN strides(RFC §2.4)。P6 + P3 的组合让 codegen 看到自洽的 `(shape, stride, layout)` 三元组 —— 对被提升的参数,codegen 路径无需再做 `dn_swap` / `get_shape_source_idx` 修正。 + +## 与 Orchestration 层 `tensor.transpose` 的交互 + +源 TensorView 同时携带 `layout = DN` 和非空 `stride` 的参数是 `tensor.transpose` 结果的特征。本 Pass 对这类参数上的 `tile.load(transpose=True)` 直接拒绝(`CHECK` 失败)—— 否则两层转置编码会在 codegen 时叠成双重转置、地址错误。Slice 派生的入参(显式 stride + `layout = ND`,由 `OptimizeOrchTensors` 附加)不受影响。 + +被拒绝场景的绕过:在源程序中去掉两层转置中的一层。 diff --git a/docs/zh-cn/dev/passes/18-resolve_transpose_layout.md b/docs/zh-cn/dev/passes/18-resolve_transpose_layout.md deleted file mode 100644 index 5250a132c..000000000 --- a/docs/zh-cn/dev/passes/18-resolve_transpose_layout.md +++ /dev/null @@ -1,171 +0,0 @@ -# ResolveTransposeLayout Pass - -为作为 `tile.load(..., transpose=True)` 源张量的 InCore 函数参数标注 `DN`(列主序)布局。 - -## 概述 - -当 `tile.load` 使用 `transpose=True` 发起时,PTO codegen 需要源张量以列主序(`DN`)布局物化 —— 转置通过布局选择来实现,而不是通过对数据进行 reshape。该 Pass 把这一布局需求从 load 站点回传到函数参数类型,让下游 Pass 与 codegen 把参数的 `TensorType` 视为布局的唯一权威。 - -该 Pass 只标注参数 —— **形状保持不变**。`DN` 是布局/codegen 提示;逻辑张量维度不会被交换。(这正是 #606 的回归测试所保护的不变量:在 `[128, 128]` 上做窗口转置加载时,参数形状必须保持 `[128, 128]`,而不是 load 窗口的形状。) - -**前置条件**: - -- 输入 IR 必须为 SSA 形式 -- InCore 函数已完成拆分(`SplitIncoreOrch`) -- Tile 操作已存在且为 2D(`IncoreTileOps`、`TileOps2D`) -- 待标注的张量参数必须 rank ≥ 2 - -**使用时机**:在 `Default` 策略中作为第 15 个 Pass 运行,位于 `InferTileMemorySpace` 之后、`ResolveBackendOpLayouts` 之前。`FlattenTileNdTo2D` 产生的 2D 形状是其前置条件。 - -## API - -| C++ | Python | 级别 | -| --- | ------ | ---- | -| `pass::ResolveTransposeLayout()` | `passes.resolve_transpose_layout()` | Program 级 | - -**Python 用法**: - -```python -from pypto.pypto_core import passes - -resolve_pass = passes.resolve_transpose_layout() -program_dn = resolve_pass(program) -``` - -## 算法 - -对程序中每个函数: - -1. **跳过非 InCore 函数**:Orchestration 与 Opaque 函数原样返回。仅处理 InCore 类函数(InCore、AIC、AIV)。 -2. 
**扫描 body 中的转置 load**:遍历函数体,对每个 kwarg `transpose=True` 且第一个参数是该函数某个 parameter 的 `tile.load` 调用,记录该 parameter 的索引。多次出现的同一参数会去重。 -3. **重写参数**:对每个被收集到的参数: - - **若已是 DN 则跳过**:参数的 `TensorType` 已携带 `TensorView{layout=DN}` 时无需重写(幂等)。 - - **要求 rank ≥ 2**:1D 张量谈不上列主序;遇到时通过 `CHECK` 终止。 - - 构造一个新的 `Var`,沿用原有的 `name_hint`、span 与形状,但其 `TensorType` 的 `tensor_view_` 为 `TensorView({}, TensorLayout::DN)`。 -4. **替换**:通过 `Substitute` 把函数体内对旧 `Var` 的所有引用替换为新 `Var`,再用 `MutableCopy` 以新参数列表与新 body 重建函数。 - -不会对 Orchestration 端做任何改写。下游 Pass 与 codegen 把 InCore 签名视为布局的唯一权威。 - -| 行为 | 触发条件 | -| ---- | -------- | -| 给参数加 `DN` | InCore 函数参数是 `tile.load(..., transpose=True)` 的源 | -| 跳过该参数 | 已是 `DN`,或没有任何转置 load 命中它 | -| 跳过整个函数 | 函数为 Orchestration 或 Opaque | -| `CHECK` 失败 | 待标注参数不是 `TensorType`,或 rank < 2 | - -## 示例 - -**之前**: - -```python -@pl.program -class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[64, 128], pl.FP32], - b: pl.Tensor[[32, 128], pl.FP32], - c: pl.Out[pl.Tensor[[64, 32], pl.FP32]], - ) -> pl.Tensor[[64, 32], pl.FP32]: - tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [32, 128], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[64, 128], pl.FP32], b: pl.Tensor[[32, 128], pl.FP32] - ) -> pl.Tensor[[64, 32], pl.FP32]: - c: pl.Tensor[[64, 32], pl.FP32] = pl.create_tensor([64, 32], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result -``` - -**之后**: - -```python -@pl.program -class After: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[64, 128], pl.FP32], - b: pl.Tensor[[32, 128], pl.FP32, pl.DN], - c: pl.Out[pl.Tensor[[64, 32], pl.FP32]], - ) -> pl.Tensor[[64, 32], pl.FP32]: - tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [32, 128], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[64, 128], pl.FP32], b: pl.Tensor[[32, 128], pl.FP32] - ) -> pl.Tensor[[64, 32], pl.FP32]: - c: pl.Tensor[[64, 32], pl.FP32] = pl.create_tensor([64, 32], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result -``` - -`b` 是带 `transpose=True` 的 `tile.load` 的源,因此 InCore 参数类型获得 `pl.DN` 布局标注。形状 `[32, 128]` 不变。`a` 没有转置 load,保持原样。Orchestration `orchestrator` 的签名**不会**被改写。 - -## 实现 - -**头文件**:`include/pypto/ir/transforms/passes.h` - -**实现文件**:`src/ir/transforms/resolve_transpose_layout_pass.cpp` - -**Python 绑定**:`python/bindings/modules/passes.cpp` - -**测试**:`tests/ut/ir/transforms/test_resolve_transpose_layout_pass.py` - -## Pass 属性 - -| 属性 | 值 | -| ---- | -- | -| 所需 | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | -| 产生 | SSAForm, IncoreTileOps, SplitIncoreOrch, TileOps2D | -| 失效 | — | - -该 Pass 保留所有输入属性:仅重写张量参数的类型标注,不改变语句结构或 SSA 形式。 - -## 作用范围 - -| 函数类型 | 处理方式 | -| -------- | -------- | -| InCore(InCore、AIC、AIV) | 扫描并可能改写 | -| Orchestration 
| 原样保留 | -| Opaque | 原样保留 | - -| 参数状态 | 处理方式 | -| -------- | -------- | -| 是 `tile.load(..., transpose=True)` 的源、布局非 DN、rank ≥ 2 | 改写并加上 `DN` | -| 是 `tile.load(..., transpose=True)` 的源、布局已是 DN | 原样保留(幂等) | -| 不是任何转置 load 的源 | 原样保留 | -| 候选参数 rank < 2 | `CHECK` 失败 | - -如果没有任何 InCore 函数包含以参数为源的 `tile.load(..., transpose=True)`,整个 Pass 是 no-op(由 `TestResolveTransposeLayoutNoOp` 测试类验证)。 - -## 与 Orchestration 层 `tensor.transpose` 的衔接 - -Orchestration 层的 `tensor.transpose`(见 issue #1209)在 `DeduceTensorTransposeType` 中,会在结果类型上同时记录两条信息: - -1. **Layout tag。** 对最后两维互换的标准情形,`layout` 在 `ND` 与 `DN` 之间切换。PTOAS 用该 tag 校验 kernel 边界。 -2. **显式物理 strides。** 运行时 `Tensor::transpose` 只交换 `shapes` / `offsets` 等元数据,底层 GM 仍按源 tensor 的行主序排布,所以转置后视图的物理 strides 是输入 strides 在 `(axis1, axis2)` 上 swap 后的结果。`DeduceTensorTransposeType` 用 `MakeIndexMul` 按输入 shape 构造行主序 strides(静态 shape 折叠为 ConstInt;动态 shape 得到符号表达式),再在同样的轴上 swap。非末两维 transpose 也由这条路径覆盖 —— 它们保持 `layout = ND`,完全依赖 strides 描述排布。 - -Codegen 通过检查 `tensor_view_->stride` 是否为空区分 DN tag 的两类来源: - -- **来自本 Pass(`tile.load(transpose=True)`):** `stride` 为空 —— 沿用旧的"末两维隐式 swap"路径,把 IR shape `[M, N]` 发射为 `[N, M]`,strides `[1, M]`。 -- **来自 `tensor.transpose`:** `stride` 非空 —— 跳过隐式 swap,直接按 IR shape 发射,strides 用 IR 上记录的。 - -上述 codegen 规则让 *非转置* 的下游消费者(例如 `tile.load(..., transpose=False)`)能够通过显式 strides 路径正确读取 `tensor.transpose` 的 DN 结果,不会再叠加一次"末两维 swap"造成的双重转置。 - -**`tile.load(transpose=True)` 直接消费 `tensor.transpose` 的产物 —— 目前不支持。** 这种特定组合(InCore 参数同时携带显式物理 strides 且 `layout = DN`,正是 `tensor.transpose` 结果的独有签名)会被本 Pass 用 `CHECK` 拒绝,因为两套编码会在 codegen 中叠加成双重转置并发出错误地址。源自 slice 的输入(显式 strides + `layout = ND`,由 `OptimizeOrchTensors` 附加)不受影响,仍走标准的"提升 ND → DN,丢弃 strides"路径,paged_attention 的 matmul B^T 等模式继续正常工作。把显式物理 strides 与本 Pass 仅输出 DN 的约定调和需要单独设计 —— 跟踪 issue #1209 的 follow-up。被拒绝场景的变通方法:在 tile 层通过 `tile.load(transpose=True)` 直接对源张量做转置,而不是先在 orchestration 用 `tensor.transpose`。 - -**Vec 目标仅在 a5 系列可用。** 跨布局 `TLOAD(VecTile_RowMajor, GlobalTensor)` 在 a2a3 上仍被 PTOAS 拒绝(静态断言 `TLOAD(VecTile, GlobalTensor) only support ND2ND/DN2DN/NZ2NZ`),且 `tile.load` 仍限制 `transpose=True` 仅在 `target_memory=Mat` 时合法。a5(及 `a5sim` 模拟器)解除了该约束,所以 `pl.transpose` 后接 Vec 目标消费者(如非 matmul `pl.at(level=CORE_GROUP)` 块内的 `pl.slice`)在 a5 上可以正确编译并运行。回归测试见 `tests/st/runtime/test_trans.py::test_transpose_slice_assemble[a5sim]`。在 a2a3 上同样的 DSL 现在能产出正确的 IR/.pto,但会在 kernel C++ 阶段失败;变通方法保持不变 —— 通过 Mat tile 走 matmul 风格 load、在 InCore 中显式 `tile.transpose`、或在 Orchestration 层物化一份连续的转置拷贝。 diff --git a/docs/zh-cn/dev/passes/19-resolve_backend_op_layouts.md b/docs/zh-cn/dev/passes/19-resolve_backend_op_layouts.md index 44070aec8..22d02e5c6 100644 --- a/docs/zh-cn/dev/passes/19-resolve_backend_op_layouts.md +++ b/docs/zh-cn/dev/passes/19-resolve_backend_op_layouts.md @@ -1,10 +1,10 @@ # ResolveBackendOpLayouts Pass -为后端有 layout 约束的 elementwise tile op 修复 layout:把 `[N, 1]` 的 col-major 向量 reshape 成 `[1, N]` 的 row-major 视图,并通过 `tile.move(..., blayout=row_major)` 修复一般非 row-major tile。该 Pass 在 tile-PTO 阶段运行,位于 `ResolveTransposeLayout` 之后、收尾的 `NormalizeStmtStructure` 之前。 +为后端有 layout 约束的 elementwise tile op 修复 layout:把 `[N, 1]` 的 col-major 向量 reshape 成 `[1, N]` 的 row-major 视图,并通过 `tile.move(..., blayout=row_major)` 修复一般非 row-major tile。该 Pass 在 tile-PTO 阶段运行,位于 `LowerTransposeLoadParamLayout` 之后、收尾的 `NormalizeStmtStructure` 之前。 ## 概述 -经过 `FlattenTileNdTo2D` 和 `ResolveTransposeLayout` 之后,所有 tile op 都已是 2-D 形式且带有明确的 layout。多个 PTO elementwise op(在 `src/backend/common/pto_ops_common.cpp` 中注册)要求其 tile 操作数与结果均为 `row_major`。本 Pass 
在使用点局部修复这些约束违反: +经过 `FlattenTileNdTo2D` 和 `LowerTransposeLoadParamLayout` 之后,所有 tile op 都已是 2-D 形式且带有明确的 layout。多个 PTO elementwise op(在 `src/backend/common/pto_ops_common.cpp` 中注册)要求其 tile 操作数与结果均为 `row_major`。本 Pass 在使用点局部修复这些约束违反: 1. 对每个 RHS 是 `Call` 的 `AssignStmt` / `EvalStmt`,调用 `Backend::GetTileLayoutSpec(op_name)` 查询约束。 2. 若没有注册约束,或者所有受约束的 tile 输入与输出都已经是 `row_major`,则跳过。 @@ -20,7 +20,7 @@ - 函数必须是 `InCore`;Orchestration / Group 函数被跳过。 - 必须通过 `BackendConfig::Set(...)` 配置后端,否则本 Pass 为 no-op。 -**何时使用**:作为 `Default` tile-PTO pipeline 的一部分,在改变 layout 的若干 Pass(`FlattenTileNdTo2D`、`InferTileMemorySpace`、`ResolveTransposeLayout`)之后、`NormalizeStmtStructure` 之前运行。Pass manager 已经把它放在了正确的位置。 +**何时使用**:作为 `Default` tile-PTO pipeline 的一部分,在改变 layout 的若干 Pass(`FlattenTileNdTo2D`、`InferTileMemorySpace`、`LowerTransposeLoadParamLayout`)之后、`NormalizeStmtStructure` 之前运行。Pass manager 已经把它放在了正确的位置。 ## API diff --git a/docs/zh-cn/dev/passes/26-materialize_tensor_strides.md b/docs/zh-cn/dev/passes/26-materialize_tensor_strides.md index ea8279bb9..9b25e770d 100644 --- a/docs/zh-cn/dev/passes/26-materialize_tensor_strides.md +++ b/docs/zh-cn/dev/passes/26-materialize_tensor_strides.md @@ -21,7 +21,7 @@ PyPTO IR 上 `TensorType.tensor_view_` 当前可以处于两种等价形态: - `TensorViewCanonical` —— `PassPipeline` 在 Pass 之后自动用 registry 中的弱模式 verifier 校验 -**未来在默认 pipeline 中的位置**:[`CanonicalizeIOOrder`](25-canonicalize_io_order.md) 与 [`InitMemRef`](27-init_memref.md) 之间。这是 codegen-prep 边界 —— 所有 layout-mutating pass(`ResolveTransposeLayout` / `ResolveBackendOpLayouts` / `ExpandMixedKernel` / `SplitVectorKernel`)已结束,`InitMemRef` 是第一个依赖显式 stride 的消费者。 +**默认 pipeline 中的位置**(自 RFC #1300 P6 起激活):[`CanonicalizeIOOrder`](25-canonicalize_io_order.md) 与 [`InitMemRef`](27-init_memref.md) 之间。这是 codegen-prep 边界 —— 所有 layout-mutating pass(`LowerTransposeLoadParamLayout` / `ResolveBackendOpLayouts` / `ExpandMixedKernel` / `SplitVectorKernel`)已结束,`InitMemRef` 是第一个依赖显式 stride 的消费者。 ## API @@ -112,6 +112,6 @@ ND 情况下公式退化为标准行主序 packed stride。 - [`CanonicalizeIOOrder`](25-canonicalize_io_order.md) —— 紧邻其前;产生本 Pass 消费的程序状态 - [`InitMemRef`](27-init_memref.md) —— 第一个依赖显式 stride 的下游消费者 -- [`ResolveTransposeLayout`](17-resolve_transpose_layout.md) —— 当前空 stride DN view 的来源(P6 将由 `LowerTransposeLoadParamLayout` 取代) +- [`LowerTransposeLoadParamLayout`](18-lower_transpose_load_param_layout.md) —— 空 stride DN view 的主要来源(P6 起在默认 pipeline 中产生 canonical DN 参数) - [`tensor_view_semantics.h`](../../../../include/pypto/ir/transforms/utils/tensor_view_semantics.h) —— 工具函数(`BuildLogicalStridesFromLayout` / `CheckCanonicalView` / `CanonicalizeView`) - RFC [#1300](https://github.com/hw-native-sys/pypto/issues/1300) —— IR Tensor Layout 自洽表示方案 diff --git a/docs/zh-cn/user/01-language_guide.md b/docs/zh-cn/user/01-language_guide.md index 21d2c5d4d..81a477bac 100644 --- a/docs/zh-cn/user/01-language_guide.md +++ b/docs/zh-cn/user/01-language_guide.md @@ -558,7 +558,7 @@ output_dir = ir.compile( 10. **ConvertTensorToTileOps** —— 将张量操作转换为 tile 操作 11. **FlattenTileNdTo2D** —— 将 ND tile 操作规范化为 2D 12. **InferTileMemorySpace** —— 推断 tile 内存空间 -13. **ResolveTransposeLayout** —— 修复转置布局处理 +13. **LowerTransposeLoadParamLayout** —— 将 `tile.load(transpose=True)` 下沉为 canonical 形式的 DN 参数布局 14. **ResolveBackendOpLayouts** —— 修复 backend 受限的 tile 布局 15. **ExpandMixedKernel** —— 在需要时拆分 mixed kernel 16. 
**InitMemRef** —— 分配内存空间并插入缓冲区分配 diff --git a/include/pypto/ir/transforms/pass_properties.h b/include/pypto/ir/transforms/pass_properties.h index 7c3d6c190..e9a7a7a04 100644 --- a/include/pypto/ir/transforms/pass_properties.h +++ b/include/pypto/ir/transforms/pass_properties.h @@ -133,9 +133,9 @@ inline const PassProperties kInferTileMemorySpaceProperties{ IRProperty::NormalizedStmtStructure}, .produced = {IRProperty::SSAForm, IRProperty::TileMemoryInferred, IRProperty::NormalizedStmtStructure}}; -// -- Resolve transpose layout pass -------------------------------------------- +// -- Lower transpose-load parameter layout pass (RFC #1300 P6) ---------------- -inline const PassProperties kResolveTransposeLayoutProperties{ +inline const PassProperties kLowerTransposeLoadParamLayoutProperties{ .required = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, IRProperty::TileOps2D}, .produced = {IRProperty::SSAForm, IRProperty::IncoreTileOps, IRProperty::SplitIncoreOrch, diff --git a/include/pypto/ir/transforms/passes.h b/include/pypto/ir/transforms/passes.h index 8b05154c1..186db1f7b 100644 --- a/include/pypto/ir/transforms/passes.h +++ b/include/pypto/ir/transforms/passes.h @@ -426,18 +426,32 @@ Pass AutoTileMatmulL0(); Pass InferTileMemorySpace(); /** - * @brief Resolve transpose layout for tile.load with transpose=True - * - * For each InCore function, detects tile.load(..., transpose=True) whose source - * is a function parameter and annotates that parameter's TensorType with the DN - * (column-major) layout. The shape is preserved -- DN is a codegen hint only. - * Orchestration and Opaque functions are returned unchanged. + * @brief Lower ``tile.load(transpose=True)`` to canonical-form parameter layout (RFC #1300 P6) + * + * For each InCore function, detects ``tile.load(..., transpose=True)`` whose source + * is a function parameter and promotes that parameter to canonical-form DN + * (RFC #1300 §3.3 + §4.2): + * + * - Param TensorType: ``[..., a, b] ND`` → ``[..., b, a] DN`` (trailing-pair swap + + * DN layout tag with empty stride; ``MaterializeTensorStrides`` later fills the + * packed canonical strides). + * - Each ``tile.load(p, offsets, shapes, valid_shapes, ..., transpose=True)`` whose + * source ``p`` is a promoted param is rewritten to: offsets / shapes / + * valid_shapes' trailing pair is swapped to canonical coords, and the + * ``transpose=True`` kwarg is dropped — the DN-source + Mat-target signal + * fully encodes the load's tile-view orientation. + * - Every non-InCore call site that targets a promoted callee is wrapped with + * ``tensor.as_layout(arg, DN)`` so the orch-side ``[..., a, b] ND`` runtime + * tensor bridges to the InCore-side ``[..., b, a] DN`` param type. + * + * Mixed-use parameters (same param loaded with both ``transpose=True`` and + * ``transpose=False``) are rejected with ``pypto::ValueError``. 
* * Requirements: * - Input IR must have tile ops (run ConvertTensorToTileOps first) * - Input IR must have InCore scopes outlined (run OutlineIncoreScopes first) */ -Pass ResolveTransposeLayout(); +Pass LowerTransposeLoadParamLayout(); /** * @brief Materialize implicit ND/DN strides on every TensorType (RFC #1300 §2.4) diff --git a/include/pypto/ir/transforms/utils/tensor_view_semantics.h b/include/pypto/ir/transforms/utils/tensor_view_semantics.h index 9318106cf..ae4ba5aba 100644 --- a/include/pypto/ir/transforms/utils/tensor_view_semantics.h +++ b/include/pypto/ir/transforms/utils/tensor_view_semantics.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "pypto/core/dtype.h" diff --git a/python/bindings/modules/passes.cpp b/python/bindings/modules/passes.cpp index 1d3929950..ec05a8a4a 100644 --- a/python/bindings/modules/passes.cpp +++ b/python/bindings/modules/passes.cpp @@ -418,12 +418,16 @@ void BindPass(nb::module_& m) { "tiling; M/N tiling and K%k!=0 cases emit a PerfHint and skip."); passes.def("infer_tile_memory_space", &pass::InferTileMemorySpace, "Create a pass that infers memory_space for TileType variables in InCore functions"); - passes.def("resolve_transpose_layout", &pass::ResolveTransposeLayout, - "Create a pass that resolves transpose layout for tile.load with transpose=True\n\n" + passes.def("lower_transpose_load_param_layout", &pass::LowerTransposeLoadParamLayout, + "Create the LowerTransposeLoadParamLayout pass (RFC #1300 P6).\n\n" "For each InCore function, detects tile.load(..., transpose=True) whose source\n" - "is a function parameter and annotates that parameter's TensorType with the DN\n" - "(column-major) layout. The shape is preserved -- DN is a codegen hint only.\n" - "Orchestration and Opaque functions are returned unchanged."); + "is a function parameter and promotes the parameter to canonical-form DN:\n" + "shape trailing-pair is swapped, the DN layout tag is added, the tile.load\n" + "body call's offsets/shapes/valid_shapes are swapped and the transpose=True\n" + "kwarg dropped, and every non-InCore call site wraps the promoted argument\n" + "in tensor.as_layout(arg, DN) to bridge orch-side ND tensors to InCore DN\n" + "params. Mixed-use params (both transpose=True and transpose=False loads on\n" + "the same param) are rejected."); passes.def("materialize_tensor_strides", &pass::MaterializeTensorStrides, "Create the MaterializeTensorStrides pass (RFC #1300 §2.4).\n\n" "Walks every TensorType reachable from the program and rewrites any\n" diff --git a/python/pypto/ir/op/tensor_ops.py b/python/pypto/ir/op/tensor_ops.py index af84fcc64..598f48ce8 100644 --- a/python/pypto/ir/op/tensor_ops.py +++ b/python/pypto/ir/op/tensor_ops.py @@ -1085,6 +1085,46 @@ def transpose( return _ir_core.create_op_call("tensor.transpose", args, {}, actual_span) +def as_layout( + tensor: Expr, + layout: TensorLayout, + span: Span | None = None, +) -> Call: + """Flip ``tensor``'s layout tag over the same physical memory (RFC #1300 §3.3). + + .. note:: + Internal IR builder — this op is not exposed via ``pypto.language``. + Passes (e.g. ``LowerTransposeLoadParamLayout`` in P6) inject + ``tensor.as_layout`` at orch ↔ InCore call sites to bridge ND ↔ DN views + over the same physical buffer. The op emits no PTOAS instruction; + downstream ``make_tensor_view`` consumes the new view directly. 
+ + The trailing-two-dim shape swap that comes with a cross-layout flip is + mechanical (RFC §4.2: row-major ``[..., a, b]`` ND ≡ ``[..., b, a]`` + DN-packed) and derived from the source — callers don't pass a target + shape. For shape changes, use ``tensor.reshape``. + + Validity (enforced by ``DeduceTensorAsLayoutType``): + + 1. ``layout`` must not be ``NZ`` (NZ is tile-only and fractal). + 2. ``tensor`` must be packed canonical or bare (strided sub-views are + rejected — the §4.2 equivalence only holds for packed forms). + 3. Cross-layout flips require rank ≥ 2. + + Args: + tensor: Source TensorType (packed canonical or bare). + layout: Target ``TensorLayout`` (must not be ``NZ``). + span: Optional source span (auto-captured when omitted). + + Returns: + ``Call`` expression carrying a TensorType with the canonical + ``(shape, stride, layout)`` for the target view. + """ + actual_span = _get_span_or_capture(span) + kwargs: dict[str, Any] = {"layout": layout} + return _ir_core.create_op_call("tensor.as_layout", [tensor], kwargs, actual_span) + + def set_validshape( tensor: Expr, valid_rows: int | Expr, diff --git a/python/pypto/ir/pass_manager.py b/python/pypto/ir/pass_manager.py index 4ae8257b7..a028b3a2b 100644 --- a/python/pypto/ir/pass_manager.py +++ b/python/pypto/ir/pass_manager.py @@ -145,7 +145,7 @@ def _register_passes(cls): ("FlattenTileNdTo2D", lambda: passes.flatten_tile_nd_to_2d()), ("AutoTileMatmulL0", lambda: passes.auto_tile_matmul_l0()), ("InferTileMemorySpace", lambda: passes.infer_tile_memory_space()), - ("ResolveTransposeLayout", lambda: passes.resolve_transpose_layout()), + ("LowerTransposeLoadParamLayout", lambda: passes.lower_transpose_load_param_layout()), ("ResolveBackendOpLayouts", lambda: passes.resolve_backend_op_layouts()), ("NormalizeStmtStructure", lambda: passes.normalize_stmt_structure()), ("ExpandMixedKernel", lambda: passes.expand_mixed_kernel()), @@ -154,13 +154,13 @@ def _register_passes(cls): ("NormalizeReturnOrder", lambda: passes.normalize_return_order()), ("LowerPipelineLoops", lambda: passes.lower_pipeline_loops()), ("CanonicalizeIOOrder", lambda: passes.canonicalize_io_order()), - # NOTE (RFC #1300 §2.4): the MaterializeTensorStrides pass is - # registered (passes.materialize_tensor_strides()) but is NOT yet - # inserted into the default pipeline. It would materialize DN stride - # to the canonical (logical-shape) form, which still conflicts with - # the legacy "source shape + post-emit swap" path in pto codegen - # (`get_shape_source_idx`, `dn_swap`). The pipeline insertion will - # land alongside the codegen cleanup in a later phase (P6/P7). + # MaterializeTensorStrides fills empty stride slots on every + # TensorView with packed canonical strides (RFC #1300 §2.4). + # Active in the default pipeline starting at P6 — by this point + # LowerTransposeLoadParamLayout has produced canonical-form DN + # parameters, so the materialized strides match the IR shape + # without going through the legacy `dn_swap` codegen path. 
+ ("MaterializeTensorStrides", lambda: passes.materialize_tensor_strides()), ("InitMemRef", lambda: passes.init_mem_ref()), ("MemoryReuse", lambda: passes.memory_reuse()), ("LegalizePTOBufferReuse", lambda: passes.legalize_pto_buffer_reuse()), diff --git a/python/pypto/pypto_core/passes.pyi b/python/pypto/pypto_core/passes.pyi index f7554d960..f76630fba 100644 --- a/python/pypto/pypto_core/passes.pyi +++ b/python/pypto/pypto_core/passes.pyi @@ -444,8 +444,19 @@ def auto_tile_matmul_l0() -> Pass: def infer_tile_memory_space() -> Pass: """Create a pass that infers memory_space for TileType variables in InCore functions.""" -def resolve_transpose_layout() -> Pass: - """Create a pass that resolves transpose layout for tile.load with transpose=True.""" +def lower_transpose_load_param_layout() -> Pass: + """Create the LowerTransposeLoadParamLayout pass (RFC #1300 P6). + + For each InCore function, detects ``tile.load(..., transpose=True)`` whose + source is a function parameter and promotes the parameter to canonical-form + DN: shape trailing-pair is swapped, the DN layout tag is added, body + ``tile.load`` calls have offsets/shapes/valid_shapes' trailing pair swapped + and the ``transpose=True`` kwarg dropped, and every non-InCore call site + wraps the promoted argument in ``tensor.as_layout(arg, DN)`` to bridge + orch-side ND tensors to InCore DN params. Mixed-use parameters (both + ``transpose=True`` and ``transpose=False`` loads on the same param) are + rejected. + """ def materialize_tensor_strides() -> Pass: """Create the MaterializeTensorStrides pass (RFC #1300 §2.4). @@ -700,7 +711,7 @@ __all__ = [ "flatten_tile_nd_to_2d", "auto_tile_matmul_l0", "infer_tile_memory_space", - "resolve_transpose_layout", + "lower_transpose_load_param_layout", "materialize_tensor_strides", "resolve_backend_op_layouts", "normalize_return_order", diff --git a/src/backend/common/pto_ops_common.cpp b/src/backend/common/pto_ops_common.cpp index 106e07e85..25d7389cc 100644 --- a/src/backend/common/pto_ops_common.cpp +++ b/src/backend/common/pto_ops_common.cpp @@ -1145,32 +1145,12 @@ static std::string MakeTileLoadCodegenPTO(const CallPtr& op, codegen::CodegenBas std::string tensor_view_type = codegen.GetTensorViewTypeString(tensor_type.get()); std::string tile_buf_type = codegen.GetCurrentResultTileBufTypeString(); - const auto tensor_view_value = tensor_type->tensor_view_.value_or(ir::TensorView{}); - // Apply the implicit DN last-two-dim swap only when the view is DN AND has - // no explicit strides. Explicit strides (e.g. from tensor.transpose) already - // describe the physical layout in the IR shape's coordinate system, and the - // IR slice's offsets/sizes are in that same system — swapping here would - // double-transpose the access pattern. - bool dn_swap = tensor_type->tensor_view_.has_value() && tensor_view_value.layout == ir::TensorLayout::DN && - tensor_view_value.stride.empty(); - - // Use valid_shapes (op arg 3) for partition_view sizes so the DMA copy size - // matches the logical valid region. When valid_shapes equals the physical - // shapes the resulting partition_view is identical to the previous one; when - // they differ (e.g. fillpad-on-partial-block), the partition_view becomes - // dynamic and tload only fetches the valid region from GM, leaving the - // physical padding region in the tile_buf to be written by a downstream - // fillpad. 
For DN layout, swap the last two valid/offset elements so that - // the partition coordinates are in the transposed coordinate system used by - // make_tensor_view. - auto valid_elems = valid_shapes_tuple->elements_; - if (dn_swap && valid_elems.size() >= 2) { - std::iter_swap(valid_elems.rbegin(), valid_elems.rbegin() + 1); - } - auto offset_elems = offsets_tuple->elements_; - if (dn_swap && offset_elems.size() >= 2) { - std::iter_swap(offset_elems.rbegin(), offset_elems.rbegin() + 1); - } + // RFC #1300 P7: the IR's offsets / shapes / valid_shapes are already in + // canonical coordinates (matching the source TensorType's shape). There is + // no implicit dn_swap here — ``LowerTransposeLoadParamLayout`` (P6) is + // responsible for ensuring all coordinate systems match before codegen. + const auto& valid_elems = valid_shapes_tuple->elements_; + const auto& offset_elems = offsets_tuple->elements_; std::string partition_type = MakePartitionTensorViewType(GetDimStrings(valid_elems), dtype_str); std::string partition_view = @@ -1224,11 +1204,10 @@ static std::string MakeTileStoreCodegenPTO(const CallPtr& op, codegen::CodegenBa std::string partition_type; const size_t tensor_rank = tensor_type->shape_.size(); - const auto tensor_view_value = tensor_type->tensor_view_.value_or(ir::TensorView{}); - // See EmitTileLoadPTO for the rationale: apply the DN last-two-dim swap - // only when there are no explicit strides on the view. - bool dn_swap = tensor_type->tensor_view_.has_value() && tensor_view_value.layout == ir::TensorLayout::DN && - tensor_view_value.stride.empty(); + // RFC #1300 P7: the IR's offsets / shapes are already in canonical + // coordinates (matching the source TensorType's shape). No implicit + // dn_swap here — the IR-level lowering passes (P6 + canonical TensorView) + // are responsible for ensuring all coordinate systems match before codegen. // Check if FlattenTileNdTo2D injected an explicit shapes tuple as args[3]. ir::MakeTuplePtr shapes_tuple; @@ -1238,14 +1217,8 @@ static std::string MakeTileStoreCodegenPTO(const CallPtr& op, codegen::CodegenBa if (shapes_tuple) { // N-rank partition path: use the explicit shapes tuple from FlattenTileNdTo2D. - auto shape_elems = shapes_tuple->elements_; - auto offset_elems = offsets_tuple->elements_; - if (dn_swap && shape_elems.size() >= 2) { - std::iter_swap(shape_elems.rbegin(), shape_elems.rbegin() + 1); - } - if (dn_swap && offset_elems.size() >= 2) { - std::iter_swap(offset_elems.rbegin(), offset_elems.rbegin() + 1); - } + const auto& shape_elems = shapes_tuple->elements_; + const auto& offset_elems = offsets_tuple->elements_; partition_type = MakePartitionTensorViewType(GetDimStrings(shape_elems), dtype_str); partition_view = EmitPartitionViewPTO(output_tensor->name_hint_, tensor_view, tensor_view_type, partition_type, GetExprCodes(offset_elems, codegen), diff --git a/src/codegen/orchestration/orchestration_codegen.cpp b/src/codegen/orchestration/orchestration_codegen.cpp index 74b095eb9..88c084c54 100644 --- a/src/codegen/orchestration/orchestration_codegen.cpp +++ b/src/codegen/orchestration/orchestration_codegen.cpp @@ -1174,6 +1174,55 @@ class OrchestrationStmtCodegen : public CodegenBase { std::move(info.inner_callee)}; } + /// Build a "wrapper-internal alias map" — for every AssignStmt in the + /// wrapper body whose RHS is a Call to a no-op view op (currently just + /// ``tensor.as_layout``), record LHS-var → upstream-var. 
This lets + /// ``BuildWrapperReorderedParams`` chase the inner-call's arg back to a + /// wrapper parameter through any orch-side ``tensor.as_layout`` bridge that + /// ``LowerTransposeLoadParamLayout`` may have injected. + std::unordered_map BuildWrapperAliasMap(const FunctionPtr& wrapper_func) { + std::unordered_map alias_map; + class AliasCollector : public IRVisitor { + public: + explicit AliasCollector(std::unordered_map* out) : out_(out) {} + void VisitStmt_(const AssignStmtPtr& op) override { + if (auto call = As(op->value_)) { + // ``tensor.as_layout`` is the canonical orch-side view alias that + // P6 (``LowerTransposeLoadParamLayout``) emits before the kernel + // call. Its runtime lowering is a plain ``Tensor x = src;`` alias, + // so for arg-routing purposes the LHS is interchangeable with the + // RHS's first arg. + if (call->op_ && call->op_->name_ == "tensor.as_layout" && !call->args_.empty()) { + if (auto src = AsVarLike(call->args_[0])) { + (*out_)[op->var_.get()] = src; + } + } + } + IRVisitor::VisitStmt_(op); + } + + private: + std::unordered_map* out_; + }; + if (wrapper_func->body_) { + AliasCollector(&alias_map).VisitStmt(wrapper_func->body_); + } + return alias_map; + } + + /// Resolve ``var`` to its ultimate alias source within the wrapper body + /// (walking through any ``tensor.as_layout`` bindings). Returns ``var`` + /// itself if no alias chain applies. + VarPtr ResolveAliasChain(VarPtr var, const std::unordered_map& alias_map) { + std::unordered_set seen; + while (true) { + auto it = alias_map.find(var.get()); + if (it == alias_map.end()) return var; + if (!seen.insert(var.get()).second) return var; // cycle guard + var = it->second; + } + } + /// Build task params for a wrapper function call, reordered to match the /// inner callee's parameter order. /// @@ -1188,6 +1237,7 @@ class OrchestrationStmtCodegen : public CodegenBase { for (size_t i = 0; i < wrapper_func->params_.size(); ++i) { wrapper_param_to_outer_idx[wrapper_func->params_[i].get()] = i; } + auto alias_map = BuildWrapperAliasMap(wrapper_func); // Phase-5 invariant: the outer Call must carry explicit arg_directions // (populated by DeriveCallDirections). The legacy ParamDirection fallback @@ -1223,6 +1273,16 @@ class OrchestrationStmtCodegen : public CodegenBase { } auto it = wrapper_param_to_outer_idx.find(inner_arg_var.get()); + if (it == wrapper_param_to_outer_idx.end()) { + // The inner-call arg may be a wrapper-local Var bound by a + // ``tensor.as_layout`` AssignStmt (injected by P6 to bridge + // orch-side ND tensors to the InCore-side DN param type). Chase + // the alias chain back to the upstream wrapper parameter. + auto upstream = ResolveAliasChain(inner_arg_var, alias_map); + if (upstream.get() != inner_arg_var.get()) { + it = wrapper_param_to_outer_idx.find(upstream.get()); + } + } if (it == wrapper_param_to_outer_idx.end()) { // Some wrapper-expansion paths can leave inner-call scalar ivs that are // not part of the user-visible wrapper signature. They should not be diff --git a/src/codegen/pto/pto_codegen.cpp b/src/codegen/pto/pto_codegen.cpp index 79dd636f2..cdd04b3dc 100644 --- a/src/codegen/pto/pto_codegen.cpp +++ b/src/codegen/pto/pto_codegen.cpp @@ -569,179 +569,183 @@ void PTOCodegen::BuildVarToMemRefMapping(const FunctionPtr& func) { } void PTOCodegen::EmitMakeTensorViews(const FunctionPtr& func) { + // RFC #1300 P7 (canonical codegen). + // + // Emit ``pto.make_tensor_view`` directly from the IR's canonical + // ``(shape, stride, layout)`` triple. 
There are no implicit swaps or + // post-emit dn_swap path here — every layout-aware transform (RFC §3.3 + // canonical promotion, ``MaterializeTensorStrides``) has already run by the + // time codegen executes, so the IR's TensorView fields can be transcribed + // verbatim. + // + // The one exception is the ``[M, 1]`` column-vector special case: PTOAS + // *infers* DN for shape ``[M, 1]`` with degenerate strides regardless of + // the IR-declared layout, so the codegen forces DN + ``[1, M]`` strides + // here to match what PTOAS expects. for (const auto& param : func->params_) { - if (auto tensor_type = As(param->GetType())) { - // Skip GM slot buffer workspace parameter (raw pointer, no view needed) - if (param->name_hint_ == "__gm_pipe_buffer") continue; + auto tensor_type = As(param->GetType()); + if (!tensor_type) continue; + if (param->name_hint_ == "__gm_pipe_buffer") continue; // GM slot buffer is a raw pointer + + std::string tensor_view = fs_.tensor_to_view.at(GetVarKey(param)); + const size_t rank = tensor_type->shape_.size(); + + // ``[..., M, 1]`` column-vector legacy path: PTOAS infers DN for any + // shape whose innermost dim is constant 1, so the codegen forces DN to + // match what ``tile.load`` produces (memory.cpp DeduceTileLoadType emits + // a ColMajor BLayout tile whenever the load shape ends with a constant 1 + // — see test_tensor_expand_clone[broadcast_dim=2] where input + // ``[B, N, 1]`` is loaded into a ColMajor tile and PTOAS TLoad enforces + // ``tile.BLayout == tensor.Layout``). + bool is_column_vector = false; + if (rank >= 2) { + auto last_dim = As(tensor_type->shape_.back()); + if (last_dim && last_dim->value_ == 1) { + is_column_vector = true; + } + } - std::string tensor_view = fs_.tensor_to_view.at(GetVarKey(param)); + ir::TensorLayout layout = ir::TensorLayout::ND; + if (tensor_type->tensor_view_.has_value()) { + layout = tensor_type->tensor_view_->layout; + } + if (is_column_vector) layout = ir::TensorLayout::DN; - bool layout_DN = false; - if (tensor_type->tensor_view_.has_value()) { - if (tensor_type->tensor_view_.value().layout == ir::TensorLayout::DN) { - layout_DN = true; - } + // Materialize one shape dimension as an MLIR SSA value. + auto get_shape_dim_mlir = [&](size_t dim_idx) -> std::string { + const auto& dim_expr = tensor_type->shape_[dim_idx]; + if (auto const_int = As(dim_expr)) { + return GetOrEmitConstant(const_int->value_, DataType::INDEX); } - - // [M, 1] column vectors: PTOAS always infers DN for shape [M, 1] with - // degenerate strides, so force DN layout and emit strides [1, M]. - bool is_column_vector = false; - if (tensor_type->shape_.size() == 2 || tensor_type->shape_.size() == 3) { - auto last_dim = As(tensor_type->shape_.back()); - if (last_dim && last_dim->value_ == 1) { - is_column_vector = true; - layout_DN = true; - } + return EmitCastToIndex(dim_expr, GetExprAsCode(dim_expr)); + }; + // Materialize a stride ExprPtr as an MLIR SSA value. + auto get_stride_mlir = [&](const ir::ExprPtr& stride_expr) -> std::string { + if (auto const_int = As(stride_expr)) { + return GetOrEmitConstant(const_int->value_, DataType::INDEX); } + return EmitCastToIndex(stride_expr, GetExprAsCode(stride_expr)); + }; + // Precompute shape dim SSA names. Dynamic shape exprs may need cast SSA + // ops (``EmitCastToIndex``) emitted before the ``pto.make_tensor_view`` + // line — materialize them all up-front so the main statement is a single + // contiguous line. 
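+    // Illustrative (SSA names hypothetical): a dynamic dim coming from a
+    // scalar param %n is first materialized as an index-typed SSA value via
+    // EmitCastToIndex, and the single view line then references that name:
+    //   %n_idx  = <cast of %n to index>
+    //   %x_view = pto.make_tensor_view %x, shape = [%c128, %n_idx], ...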
+ std::vector shape_dim_names(rank); + for (size_t j = 0; j < rank; ++j) { + shape_dim_names[j] = get_shape_dim_mlir(j); + } - // Check if tensor_view_ provides explicit strides (e.g. for view output tensors - // whose physical memory layout differs from the view shape). - bool has_explicit_stride = - tensor_type->tensor_view_.has_value() && !tensor_type->tensor_view_->stride.empty(); - const size_t rank = tensor_type->shape_.size(); - - // Materialize one shape dimension as an MLIR SSA value. - auto get_shape_dim_mlir = [&](size_t dim_idx) -> std::string { - const auto& dim_expr = tensor_type->shape_[dim_idx]; - if (auto const_int = As(dim_expr)) { - return GetOrEmitConstant(const_int->value_, DataType::INDEX); - } - return EmitCastToIndex(dim_expr, GetExprAsCode(dim_expr)); - }; - - // DN tensor views keep the original logical shape in IR typing, but the - // emitted make_tensor_view must expose the trailing two visible dimensions - // in DN order so PTOAS interprets shape/stride/layout consistently. - // Column vectors are handled separately. When the IR also carries explicit - // strides (e.g. from tensor.transpose), those strides already describe the - // physical layout in the IR shape's coordinate system — skip the implicit - // last-two-dim swap so the emitted shape matches the strides. - auto get_shape_source_idx = [&](size_t dim_idx) -> size_t { - if (!layout_DN || rank < 2 || is_column_vector || has_explicit_stride) return dim_idx; - if (dim_idx == rank - 2) return rank - 1; - if (dim_idx == rank - 1) return rank - 2; - return dim_idx; - }; - - // Emit one stride multiply and return the resulting SSA. - auto emit_stride_mul = [&](const std::string& lhs, size_t dim_idx, size_t stride_slot) -> std::string { - std::string mul_name = NewNamedTemp(param->name_hint_ + "_s" + std::to_string(stride_slot)); - stream_ << GetIndent() << mul_name << " = arith.muli " << lhs << ", " << get_shape_dim_mlir(dim_idx) - << " : index\n"; - return mul_name; - }; - - std::vector shape_dim_names(rank); + // Emit one stride multiply ``lhs * shape_dim_names[dim_idx]`` and return + // the resulting SSA, used for fallback stride derivation when + // ``tensor_view_->stride`` is empty. + auto emit_stride_mul = [&](const std::string& lhs, size_t dim_idx, size_t stride_slot) -> std::string { + std::string mul_name = NewNamedTemp(param->name_hint_ + "_s" + std::to_string(stride_slot)); + stream_ << GetIndent() << mul_name << " = arith.muli " << lhs << ", " << shape_dim_names[dim_idx] + << " : index\n"; + return mul_name; + }; + + // Build the stride SSA names. Prefer explicit ``tensor_view_->stride``; + // fall back to canonical derivation per ``layout`` when absent + // (``MaterializeTensorStrides`` should normally have populated it by now, + // but the codegen tolerates absent strides for any path that constructs + // IR ad-hoc and skips the pipeline). + std::vector stride_names(rank); + bool has_explicit_stride = + tensor_type->tensor_view_.has_value() && !tensor_type->tensor_view_->stride.empty(); + if (has_explicit_stride) { + const auto& strides = tensor_type->tensor_view_->stride; + CHECK(strides.size() == rank) << "EmitMakeTensorViews: explicit stride rank " << strides.size() + << " does not match tensor shape rank " << rank; for (size_t j = 0; j < rank; ++j) { - shape_dim_names[j] = get_shape_dim_mlir(get_shape_source_idx(j)); + stride_names[j] = get_stride_mlir(strides[j]); } - - // For N-D (N > 2): pre-compute strides as SSA values using arith.muli. - // ND uses standard row-major strides. 
DN keeps the same outer batch/page - // walk as ND, but the trailing two strides must match the visible DN shape: - // for logical [B, N, K], emit visible shape [B, K, N] with strides [N*K, 1, K]. - // Skip when explicit strides are available. - std::vector nd_stride_names; - if (!has_explicit_stride && rank > 2) { - nd_stride_names.resize(rank); - if (layout_DN) { - // For shape [B, N, K], DN strides are [N*K, 1, K]. - nd_stride_names[rank - 2] = GetOrEmitConstant(static_cast(1), DataType::INDEX); - nd_stride_names[rank - 1] = get_shape_dim_mlir(rank - 1); - if (rank > 2) { - nd_stride_names[rank - 3] = emit_stride_mul(nd_stride_names[rank - 1], rank - 2, rank - 3); - for (int j = static_cast(rank) - 4; j >= 0; j--) { - size_t dim = static_cast(j); - nd_stride_names[dim] = emit_stride_mul(nd_stride_names[dim + 1], dim + 1, dim); - } - } - } else { - // Standard row-major strides - nd_stride_names[rank - 1] = GetOrEmitConstant(static_cast(1), DataType::INDEX); - for (int j = static_cast(rank) - 2; j >= 0; j--) { - size_t dim = static_cast(j); - nd_stride_names[dim] = emit_stride_mul(nd_stride_names[dim + 1], dim + 1, dim); - } + } else if (is_column_vector) { + // Forced-DN ``[..., M, 1]`` legacy stride pattern (PTOAS column-vector + // convention): trailing pair degenerates to ``stride[rank-2]=1`` and + // ``stride[rank-1]=shape[rank-1]=1``; outer dims walk row-major over the + // ``M`` extent (``stride[rank-3]=shape[rank-2]``, ``stride[k-1]=stride[k]*shape[k]``). + // For rank 2 this collapses to the legacy ``[1, shape[0]]``. + stride_names[rank - 2] = GetOrEmitConstant(static_cast(1), DataType::INDEX); + if (rank == 2) { + stride_names[rank - 1] = shape_dim_names[0]; + } else { + // rank >= 3: stride[rank-1] = shape[rank-1] (= 1), stride[rank-3] = shape[rank-2]. + stride_names[rank - 1] = shape_dim_names[rank - 1]; + stride_names[rank - 3] = shape_dim_names[rank - 2]; + for (int j = static_cast(rank) - 4; j >= 0; --j) { + size_t dim = static_cast(j); + stride_names[dim] = emit_stride_mul(stride_names[dim + 1], dim + 1, dim); } } - - std::vector explicit_stride_names; - if (has_explicit_stride) { - const auto& strides = tensor_type->tensor_view_->stride; - explicit_stride_names.reserve(strides.size()); - for (const auto& stride_expr : strides) { - if (auto const_int = As(stride_expr)) { - explicit_stride_names.push_back(GetOrEmitConstant(const_int->value_, DataType::INDEX)); - } else { - explicit_stride_names.push_back(EmitCastToIndex(stride_expr, GetExprAsCode(stride_expr))); - } + } else if (layout == ir::TensorLayout::DN) { + CHECK(rank >= 2) << "EmitMakeTensorViews: DN layout requires rank >= 2, got " << rank; + // RFC §2.3 canonical DN: stride[-2]=1, stride[-1]=shape[-2], outer + // strides walk row-major over the DN-block volume. Use direct shape + // references for the trailing pair so 2D DN avoids a spurious + // ``arith.muli %c1, shape`` step. + stride_names[rank - 2] = GetOrEmitConstant(static_cast(1), DataType::INDEX); + stride_names[rank - 1] = shape_dim_names[rank - 2]; + if (rank >= 3) { + // stride[n-3] = shape[n-2] * shape[n-1] (one full DN-block volume). + stride_names[rank - 3] = emit_stride_mul(shape_dim_names[rank - 2], rank - 1, rank - 3); + for (int j = static_cast(rank) - 4; j >= 0; --j) { + size_t dim = static_cast(j); + stride_names[dim] = emit_stride_mul(stride_names[dim + 1], dim + 1, dim); + } + } + } else { + // Canonical ND (row-major): stride[-1]=1, stride[k]=stride[k+1]*shape[k+1]. 
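+    // e.g. (illustrative) rank-3 ND shape [B, M, N] -> strides [M*N, N, 1]:
+    // stride[-2] reuses the shape SSA for N directly, so only stride[0]
+    // needs an arith.muli.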
+      // For rank 2 specifically, stride[0] = shape[1] directly (avoids a
+      // spurious ``arith.muli %c1, shape[1]`` step).
+      stride_names[rank - 1] = GetOrEmitConstant(static_cast<int64_t>(1), DataType::INDEX);
+      if (rank >= 2) {
+        stride_names[rank - 2] = shape_dim_names[rank - 1];
+        for (int j = static_cast<int>(rank) - 3; j >= 0; --j) {
+          size_t dim = static_cast<size_t>(j);
+          stride_names[dim] = emit_stride_mul(stride_names[dim + 1], dim + 1, dim);
+        }
+      }
+    }
-      stream_ << GetIndent() << tensor_view << " = pto.make_tensor_view ";
-      stream_ << GetVarName(param);
+    stream_ << GetIndent() << tensor_view << " = pto.make_tensor_view ";
+    stream_ << GetVarName(param);
-      stream_ << ", shape = [";
-      // DN swaps the last two visible shape dimensions; ND keeps the original order.
-      for (size_t j = 0; j < rank; j++) {
-        if (j > 0) stream_ << ", ";
-        stream_ << shape_dim_names[j];
-      }
-      stream_ << "],";
+    // Emit shape (verbatim from IR — canonical).
+    stream_ << ", shape = [";
+    for (size_t j = 0; j < rank; ++j) {
+      if (j > 0) stream_ << ", ";
+      stream_ << shape_dim_names[j];
+    }
+    stream_ << "],";
-      stream_ << " strides = [";
-      if (has_explicit_stride) {
-        // Use explicit strides from tensor_view_ (e.g. physical memory strides for view tensors)
-        for (size_t j = 0; j < explicit_stride_names.size(); j++) {
-          if (j > 0) stream_ << ", ";
-          stream_ << explicit_stride_names[j];
-        }
-      } else if (tensor_type->shape_.size() == 2) {
-        // For column vector [M, 1]: stride dim is shape[0] (= M) → strides [1, M].
-        // For other 2D: stride dim is shape[1] (= C) → DN [1, C] or ND [C, 1].
-        int stride_idx = is_column_vector ? 0 : 1;
-        const std::string& row_stride =
-            shape_dim_names[get_shape_source_idx(static_cast<size_t>(stride_idx))];
-        if (layout_DN) {
-          stream_ << GetOrEmitConstant(static_cast<int64_t>(1), DataType::INDEX) << ", " << row_stride;
-        } else {
-          stream_ << row_stride << ", " << GetOrEmitConstant(static_cast<int64_t>(1), DataType::INDEX);
-        }
-      } else if (tensor_type->shape_.size() == 1) {
-        stream_ << GetOrEmitConstant(static_cast<int64_t>(1), DataType::INDEX);
-      } else {
-        // Use pre-computed SSA stride names (built above via arith.muli)
-        for (size_t j = 0; j < nd_stride_names.size(); j++) {
-          if (j > 0) stream_ << ", ";
-          stream_ << nd_stride_names[j];
-        }
-      }
-      stream_ << "]";
+    // Emit strides.
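+    // Illustrative emitted form for a 2D DN param (SSA names hypothetical):
+    //   %p_view = pto.make_tensor_view %p, shape = [%d0, %d1],
+    //       strides = [%c1, %d0] {layout = #pto.layout<dn>} : !pto.tensor_view<?x?xf16>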
+ stream_ << " strides = ["; + for (size_t j = 0; j < rank; ++j) { + if (j > 0) stream_ << ", "; + stream_ << stride_names[j]; + } + stream_ << "]"; - std::string layout_str = "nd"; - if (is_column_vector) { + std::string layout_str = "nd"; + switch (layout) { + case ir::TensorLayout::DN: layout_str = "dn"; - } else if (tensor_type->tensor_view_.has_value()) { - switch (tensor_type->tensor_view_.value().layout) { - case ir::TensorLayout::DN: - layout_str = "dn"; - break; - case ir::TensorLayout::NZ: - layout_str = "nz"; - break; - case ir::TensorLayout::ND: - break; - } - } - stream_ << " {layout = #pto.layout<" << layout_str << ">}"; + break; + case ir::TensorLayout::NZ: + layout_str = "nz"; + break; + case ir::TensorLayout::ND: + break; + } + stream_ << " {layout = #pto.layout<" << layout_str << ">}"; - stream_ << ": !pto.tensor_view<"; - for (size_t j = 0; j < rank; j++) { - if (j > 0) stream_ << "x"; - stream_ << "?"; - } - stream_ << "x" << GetTypeString(tensor_type->dtype_) << ">\n"; + stream_ << ": !pto.tensor_view<"; + for (size_t j = 0; j < rank; ++j) { + if (j > 0) stream_ << "x"; + stream_ << "?"; } + stream_ << "x" << GetTypeString(tensor_type->dtype_) << ">\n"; } } diff --git a/src/codegen/tensor_op_codegen.cpp b/src/codegen/tensor_op_codegen.cpp index 90a37d196..ce201fb10 100644 --- a/src/codegen/tensor_op_codegen.cpp +++ b/src/codegen/tensor_op_codegen.cpp @@ -19,6 +19,7 @@ #include "pypto/codegen/codegen_base.h" #include "pypto/codegen/orchestration_op_registry.h" +#include "pypto/core/any_cast.h" #include "pypto/core/logging.h" #include "pypto/ir/expr.h" #include "pypto/ir/kind_traits.h" @@ -348,6 +349,86 @@ REGISTER_ORCHESTRATION_OP(tensor_transpose, ("tensor.transpose")) { return oss.str(); } +REGISTER_ORCHESTRATION_OP(tensor_as_layout, ("tensor.as_layout")) { + // tensor.as_layout(input, layout=...) — metadata reinterpret over the same + // physical buffer (RFC #1300 §3.3). The op is internal-only: passes inject + // it at orch ↔ InCore bridge sites so the downstream callee's IR-declared + // param type carries the new layout / canonical shape. + // + // **Lowering:** + // + // - **Identity flip** (target layout == source layout): emit a plain + // ``Tensor result = input;`` alias. ``DeduceTensorAsLayoutType`` also + // keeps the shape unchanged in this case. + // - **Cross-layout flip** (ND ↔ DN, §4.2 canonical pair): emit an alias + // then swap the **trailing-pair shapes** so the kernel binary's + // PTOAS-generated wrapper reads the right dynamic dim values from + // ``runtime_tensor->shapes[i]`` (those slots are referenced under the + // IR-declared post-swap order). ``raw_shapes`` and ``offsets`` are + // intentionally left in the source (pre-swap) coord system — PTOAS uses + // ``raw_shape``-derived strides plus ``offsets`` to compute + // ``start_offset`` (the byte offset of the view into the physical + // buffer), and that base address must continue to point to the original + // ND-coord region (e.g. paged-attention's ``[block_offset, 0]`` slice + // into ``key_cache``). If ``is_raw_eq_shapes`` is true, materialize + // ``raw_shapes`` from the current ``shapes`` *before* the swap so the + // subsequent ``shapes`` mutation does not pollute the raw_shapes-derived + // stride arithmetic. + // + // We do NOT lower to ``input.transpose(N-2, N-1)``: that runtime helper + // additionally swaps ``raw_shapes`` and ``offsets``, which would shift + // ``start_offset`` by a factor of the raw shape and silently corrupt + // sliced/paged inputs. 
+ CHECK(op->args_.size() == 1) << "tensor.as_layout requires 1 arg (input) plus a 'layout' kwarg"; + + std::string input_name = codegen.TryGetVarName(op->args_[0]); + CHECK(!input_name.empty()) << "tensor.as_layout input must be a variable"; + + auto input_type = As(op->args_[0]->GetType()); + CHECK(input_type) << "tensor.as_layout input must be TensorType"; + + TensorLayout src_layout = + input_type->tensor_view_.has_value() ? input_type->tensor_view_->layout : TensorLayout::ND; + TensorLayout target_layout = src_layout; + for (const auto& [k, v] : op->kwargs_) { + if (k == "layout") { + target_layout = AnyCast(v, "layout"); + break; + } + } + + std::string ext_input_name = codegen.GetExternalTensorName(input_name); + std::string result_var = codegen.GetCurrentResultTarget(); + + std::ostringstream oss; + oss << "Tensor " << result_var << " = " << ext_input_name << ";"; + + if (target_layout != src_layout) { + int64_t ndim = static_cast(input_type->shape_.size()); + INTERNAL_CHECK_SPAN(ndim >= 2, op->span_) + << "Internal error: tensor.as_layout cross-layout flip reached codegen with rank=" << ndim + << "; DeduceTensorAsLayoutType is supposed to reject cross-layout flips below rank 2"; + // Materialize raw_shapes (if currently inferred from shapes) so the + // trailing-pair shape swap below does not also mutate raw_shapes-derived + // stride arithmetic. + oss << "\n if (" << result_var << ".is_raw_eq_shapes) {\n"; + oss << " for (uint32_t _i = 0; _i < " << result_var << ".ndims; ++_i) {\n"; + oss << " " << result_var << ".raw_shapes[_i] = " << result_var << ".shapes[_i];\n"; + oss << " }\n"; + oss << " " << result_var << ".is_raw_eq_shapes = false;\n"; + oss << " }\n"; + // Swap trailing-pair shapes (§4.2 canonical pair). + oss << " {\n"; + oss << " uint32_t _t = " << result_var << ".shapes[" << (ndim - 2) << "];\n"; + oss << " " << result_var << ".shapes[" << (ndim - 2) << "] = " << result_var << ".shapes[" + << (ndim - 1) << "];\n"; + oss << " " << result_var << ".shapes[" << (ndim - 1) << "] = _t;\n"; + oss << " }"; + } + + return oss.str(); +} + REGISTER_ORCHESTRATION_OP(tensor_dim, ("tensor.dim")) { // tensor.dim(tensor, axis) -> extract shape dimension as scalar // Validation already performed by DeduceTensorDimType during type deduction. diff --git a/src/ir/op/tensor_ops/transform.cpp b/src/ir/op/tensor_ops/transform.cpp index af494cd6c..b8159bf3b 100644 --- a/src/ir/op/tensor_ops/transform.cpp +++ b/src/ir/op/tensor_ops/transform.cpp @@ -180,7 +180,7 @@ TypePtr DeduceTensorTransposeType(const std::vector& args, // unchanged because ND/DN only describes the trailing two dims. PTOAS // reads this tag and EmitMakeTensorViews / EmitTileLoadPTO use it to // drive the implicit "swap last two dims" path used by - // tile.load(transpose=True) sources (see ResolveTransposeLayout). + // tile.load(transpose=True) sources (see LowerTransposeLoadParamLayout). // // 2. Explicit strides. tensor.transpose at orchestration level lowers to // runtime Tensor::transpose, a metadata-only swap of shapes / offsets; @@ -280,6 +280,96 @@ TypePtr DeduceTensorTransposeType(const std::vector& args, // Registration Function for Tensor Transform Operations // ============================================================================ +namespace { +// Helper for reading typed kwargs from the deduce-type entry point. +// Mirrors the per-file copies in tile_ops/{memory,reduction,sort}.cpp and +// tensor_ops/reduction.cpp; consider extracting to a shared header in a +// follow-up cleanup. 
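+// e.g. DeduceTensorAsLayoutType below reads its required kwarg as
+//   auto new_layout = GetKwarg<TensorLayout>(kwargs, "layout");
+// with no default supplied, so a missing 'layout' kwarg trips the CHECK.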
+template +T GetKwarg(const std::vector>& kwargs, const std::string& key, + const std::optional& default_value = std::nullopt) { + for (const auto& [k, v] : kwargs) { + if (k == key) { + return AnyCast(v, "kwarg key: " + key); + } + } + CHECK(default_value.has_value()) << "tensor op kwarg '" << key << "' is required but missing"; + return *default_value; +} +} // namespace + +TypePtr DeduceTensorAsLayoutType(const std::vector& args, + const std::vector>& kwargs) { + // tensor.as_layout(src, layout=...) — pure layout-tag flip over the same + // physical memory (RFC #1300 §3.3). Shape changes that come with the flip + // are mechanical (per RFC §4.2 canonical pair: row-major [..,a,b] ND ≡ + // [..,b,a] DN-packed) and derived here; this op never reshapes. + // ``tensor.reshape`` is the right tool for shape changes. + CHECK(args.size() == 1) << "tensor.as_layout requires 1 arg (src) plus a 'layout' kwarg, but got " + << args.size() << " positional args"; + + auto src_type = As(args[0]->GetType()); + CHECK(src_type) << "tensor.as_layout: src must be TensorType, got " << args[0]->GetType()->TypeName(); + + auto new_layout = GetKwarg(kwargs, "layout"); + CHECK(new_layout != TensorLayout::NZ) + << "tensor.as_layout: NZ layout is not allowed on TensorType (NZ is tile-only)"; + + // The source must be packed canonical (or bare = implicit ND-packed). + // Strided sub-views can't be reinterpreted via the §4.2 canonical pair + // because the offset-map equivalence only holds for the packed forms. + TensorLayout src_layout = + src_type->tensor_view_.has_value() ? src_type->tensor_view_->layout : TensorLayout::ND; + CHECK(src_layout != TensorLayout::NZ) + << "tensor.as_layout: src has NZ layout (NZ is tile-only and not allowed on TensorType)"; + if (src_type->tensor_view_.has_value() && !src_type->tensor_view_->stride.empty()) { + auto packed = tensor_view_semantics::BuildLogicalStridesFromLayout(src_type->shape_, src_layout); + bool is_packed = packed.size() == src_type->tensor_view_->stride.size(); + for (size_t i = 0; is_packed && i < packed.size(); ++i) { + auto pc = As(packed[i]); + auto sc = As(src_type->tensor_view_->stride[i]); + // Treat ExprPtr-identity as a match for symbolic dims; otherwise demand + // ConstInt value equality. + bool same = (packed[i].get() == src_type->tensor_view_->stride[i].get()) || + (pc && sc && pc->value_ == sc->value_); + is_packed = is_packed && same; + } + CHECK(is_packed) << "tensor.as_layout: src is a strided sub-view (stride does not match packed canonical " + << "for layout " << TensorLayoutToString(src_layout) + << "). 
Strided reinterprets are not " + << "supported; use tensor.slice or tensor.reshape on the packed parent first."; + } + + // Derive the target shape: + // - same layout (or both effectively ND): identity, shape unchanged + // - cross ND ↔ DN: trailing-two-dim swap (the only canonical pair) + std::vector new_shape = src_type->shape_; + if (src_layout != new_layout) { + CHECK(src_type->shape_.size() >= 2) + << "tensor.as_layout: cross-layout reinterpret requires rank >= 2, got " << src_type->shape_.size(); + std::swap(new_shape[new_shape.size() - 2], new_shape[new_shape.size() - 1]); + } + + auto new_view = tensor_view_semantics::CanonicalizeView(new_shape, new_layout); + // Preserve view-extending metadata (``valid_shape`` / ``pad``) from the + // source — both fields describe element-level semantics that are layout- + // invariant under the §4.2 canonical pair, so dropping them would silently + // make sliced or fill-padded tensors look like fully-valid views. + if (src_type->tensor_view_.has_value()) { + const auto& src_view = src_type->tensor_view_.value(); + if (!src_view.valid_shape.empty()) { + std::vector new_valid_shape = src_view.valid_shape; + if (src_layout != new_layout && new_valid_shape.size() >= 2) { + std::iter_swap(new_valid_shape.end() - 2, new_valid_shape.end() - 1); + } + new_view.valid_shape = std::move(new_valid_shape); + } + new_view.pad = src_view.pad; + } + return std::make_shared(new_shape, src_type->dtype_, src_type->memref_, + std::make_optional(std::move(new_view))); +} + REGISTER_OP("tensor.reshape") .set_op_category("TensorOp") .set_description("Reshape tensor to new shape") @@ -301,6 +391,28 @@ REGISTER_OP("tensor.transpose") return DeduceTensorTransposeType(args, kwargs); }); +REGISTER_OP("tensor.as_layout") + .set_op_category("TensorOp") + .set_description( + "Flip a TensorType's layout tag over the same physical memory (RFC #1300 §3.3). " + "The trailing-two-dim shape swap that comes with a ND ↔ DN flip is mechanical " + "and derived here; this op never reshapes (use tensor.reshape for shape changes). " + "Pure metadata — emits no PTOAS instructions; downstream make_tensor_view " + "consumes the new view directly. Internal-only; passes (e.g. " + "LowerTransposeLoadParamLayout) inject this at orch ↔ InCore call sites.") + .add_argument("input", "Input tensor (TensorType, packed canonical or bare)") + // Inherit the input's MemRef: ``tensor.as_layout`` is a metadata-only + // reinterpret of the same physical buffer, so its result must alias the + // input's allocation. Without this, ``InitMemRef`` would mint a fresh + // MemRef and allocate a separate buffer that the runtime alias + // (``Tensor result = input;``) never writes to, leading to silent + // memory corruption / wrong reads downstream. + .set_output_memory_inherit_input() + .f_deduce_type([](const std::vector& args, + const std::vector>& kwargs) { + return DeduceTensorAsLayoutType(args, kwargs); + }); + TypePtr DeduceTensorConcatType(const std::vector& args, const std::vector>& kwargs) { CHECK(args.size() == 2) << "tensor.concat requires 2 arguments (src0, src1), got " << args.size(); diff --git a/src/ir/op/tile_ops/memory.cpp b/src/ir/op/tile_ops/memory.cpp index 76fdfa8d5..1d2994196 100644 --- a/src/ir/op/tile_ops/memory.cpp +++ b/src/ir/op/tile_ops/memory.cpp @@ -155,12 +155,20 @@ TypePtr DeduceTileLoadType(const std::vector& args, // Nz/Zn layout: only chosen when target_memory is known. 
If it is absent, // the default-constructed view is kept and InferTileMemorySpace rebuilds it // once the memory space is resolved. + // + // Source-DN equivalence (RFC #1300 §3.3 + P6): a DN-tagged source tensor + // describes the same physical bytes as the canonical-pair ND view, so + // ``tile.load`` of a DN source produces the same tile layout as + // ``transpose=True`` on the equivalent ND source. Treat the two signals + // (source layout == DN, transpose kwarg) as an XOR. + bool source_is_dn = + tensor_type->tensor_view_.has_value() && tensor_type->tensor_view_->layout == TensorLayout::DN; TileView tile_view; if (target_memory_opt.has_value()) { if (*target_memory_opt == MemorySpace::Mat) { tile_view.blayout = TileLayout::col_major; tile_view.slayout = TileLayout::row_major; - if (transpose) { + if (transpose != source_is_dn) { std::swap(tile_view.blayout, tile_view.slayout); } } else if (auto last_dim = As(shapes_tuple->elements_.back()); diff --git a/src/ir/transforms/lower_transpose_load_param_layout_pass.cpp b/src/ir/transforms/lower_transpose_load_param_layout_pass.cpp new file mode 100644 index 000000000..b2d3faada --- /dev/null +++ b/src/ir/transforms/lower_transpose_load_param_layout_pass.cpp @@ -0,0 +1,460 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pypto/core/logging.h" +#include "pypto/ir/expr.h" +#include "pypto/ir/function.h" +#include "pypto/ir/kind_traits.h" +#include "pypto/ir/op_registry.h" +#include "pypto/ir/program.h" +#include "pypto/ir/stmt.h" +#include "pypto/ir/transforms/base/mutator.h" +#include "pypto/ir/transforms/base/visitor.h" +#include "pypto/ir/transforms/pass_properties.h" +#include "pypto/ir/transforms/passes.h" +#include "pypto/ir/transforms/utils/mutable_copy.h" +#include "pypto/ir/transforms/utils/transform_utils.h" +#include "pypto/ir/type.h" + +namespace pypto { +namespace ir { + +using transform_utils::Substitute; + +namespace { + +/// Scans an InCore function body for ``tile.load(param, ..., transpose=True)`` +/// where the source tensor is a function parameter. +class TransposeLoadScanner : public IRVisitor { + public: + explicit TransposeLoadScanner(const std::vector& params) { + for (size_t i = 0; i < params.size(); ++i) { + param_ptr_to_index_[params[i].get()] = i; + } + } + + // Returns the set of param indices that need DN promotion. + const std::unordered_set& GetPromoted() const { return promoted_; } + + // Returns the set of param indices whose `tile.load` calls all carry + // `transpose=False` (or absent). Used to reject mixed-use parameters. 
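+  // (An index lands in this set as soon as *any* load of that param is
+  // non-transposed; PromoteInCoreFunction intersects it with GetPromoted()
+  // to detect and reject mixed use.)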
+ const std::unordered_set& GetNonTransposedUses() const { return non_transposed_uses_; } + + void VisitExpr_(const CallPtr& call) override { + if (call && call->op_ && call->op_->name_ == "tile.load" && !call->args_.empty()) { + auto src_var = As(call->args_[0]); + if (src_var) { + auto it = param_ptr_to_index_.find(src_var.get()); + if (it != param_ptr_to_index_.end()) { + const size_t param_idx = it->second; + bool transpose = call->GetKwarg("transpose", false); + if (transpose) { + promoted_.insert(param_idx); + } else { + non_transposed_uses_.insert(param_idx); + } + } + } + } + IRVisitor::VisitExpr_(call); + } + + private: + std::unordered_map param_ptr_to_index_; + std::unordered_set promoted_; + std::unordered_set non_transposed_uses_; +}; + +/// Build the canonical TensorType for an InCore parameter that is loaded via +/// ``tile.load(transpose=True)`` (RFC #1300 §3.3 + §4.2): +/// src ``[..., a, b] ND`` ≡ canonical ``[..., b, a] DN`` +/// +/// The new TensorView carries an empty stride; ``MaterializeTensorStrides`` +/// (P6-b) fills it with the packed canonical strides later in the pipeline. +TensorTypePtr PromoteToCanonicalDN(const TensorTypePtr& src) { + CHECK(src->shape_.size() >= 2) + << "LowerTransposeLoadParamLayout: parameter must have rank >= 2 to apply DN " + "canonical form, got " + << src->shape_.size(); + std::vector new_shape = src->shape_; + std::iter_swap(new_shape.end() - 2, new_shape.end() - 1); + TensorView dn_view(std::vector{}, TensorLayout::DN); + return std::make_shared(new_shape, src->dtype_, src->memref_, + std::make_optional(std::move(dn_view))); +} + +/// Swap the last two elements of a ``MakeTuple`` (offsets / shapes / +/// valid_shapes argument of ``tile.load``). +MakeTuplePtr SwapTrailingPair(const MakeTuplePtr& tuple) { + INTERNAL_CHECK(tuple) << "Internal error: SwapTrailingPair called with null MakeTuple"; + INTERNAL_CHECK_SPAN(tuple->elements_.size() >= 2, tuple->span_) + << "LowerTransposeLoadParamLayout: tile.load tuple needs rank >= 2 to swap " + "trailing pair, got " + << tuple->elements_.size(); + std::vector new_elements = tuple->elements_; + std::iter_swap(new_elements.end() - 2, new_elements.end() - 1); + return std::make_shared(std::move(new_elements), tuple->span_); +} + +/// Rewrite tile.load calls whose first arg is one of the promoted parameters +/// so that: +/// - offsets / shapes / valid_shapes are swapped to canonical coords; +/// - the ``transpose=True`` kwarg is dropped (DN source + Mat target now +/// drives the tile-view swap inside ``DeduceTileLoadType``). +/// All other Calls are passed through unchanged. +class TileLoadBodyRewriter : public IRMutator { + public: + explicit TileLoadBodyRewriter(const std::unordered_map& param_subs) { + for (const auto& [old_ptr, new_var] : param_subs) { + promoted_param_set_.insert(new_var.get()); + } + } + + ExprPtr VisitExpr_(const CallPtr& op) override { + auto base = IRMutator::VisitExpr_(op); + auto call = std::dynamic_pointer_cast(base); + if (!call || !call->op_ || call->op_->name_ != "tile.load") return base; + if (call->args_.empty()) return base; + + auto src_var = As(call->args_[0]); + if (!src_var || promoted_param_set_.find(src_var.get()) == promoted_param_set_.end()) { + return base; + } + if (!call->GetKwarg("transpose", false)) return base; + + // tile.load(tensor, offsets, shapes, valid_shapes, ...) — swap the trailing + // pair of all three tuples so the load is expressed in canonical (DN + // logical) coordinates that match the promoted parameter's new shape. 
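+    // Illustrative rewrite for p promoted [M, K] ND -> [K, M] DN (names
+    // hypothetical):
+    //   before: tile.load(p, (i, j), (tm, tk), (vm, vk), transpose=True)
+    //   after:  tile.load(p, (j, i), (tk, tm), (vk, vm), transpose=False)
+    // The Mat tile-view swap is now driven by the DN source tag via
+    // DeduceTileLoadType's (transpose != source_is_dn) test.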
+ INTERNAL_CHECK_SPAN(call->args_.size() == 4, call->span_) + << "LowerTransposeLoadParamLayout: expected tile.load to have 4 args, got " << call->args_.size(); + auto offsets = As(call->args_[1]); + auto shapes = As(call->args_[2]); + auto valid_shapes = As(call->args_[3]); + INTERNAL_CHECK_SPAN(offsets && shapes && valid_shapes, call->span_) + << "LowerTransposeLoadParamLayout: tile.load offsets/shapes/valid_shapes must be MakeTuple"; + + std::vector new_args = call->args_; + new_args[1] = SwapTrailingPair(offsets); + new_args[2] = SwapTrailingPair(shapes); + new_args[3] = SwapTrailingPair(valid_shapes); + + // Flip transpose=True → transpose=False; the DN-source + Mat-target signal + // is now carried entirely by the source TensorType's layout tag, but the + // kwarg slot is kept so print → reparse round-trips faithfully (the + // tile.load op registers ``transpose`` as a default-false attribute and + // the parser injects it back on reparse). + std::vector> new_kwargs; + new_kwargs.reserve(call->kwargs_.size()); + for (const auto& [k, v] : call->kwargs_) { + if (k == "transpose") { + new_kwargs.emplace_back(k, std::any(false)); + } else { + new_kwargs.emplace_back(k, v); + } + } + + // Rebuild via OpRegistry so DeduceTileLoadType recomputes the TileType + // from the new source layout (DN) + swapped shapes. + return OpRegistry::GetInstance().Create("tile.load", new_args, new_kwargs, call->span_); + } + + private: + std::unordered_set promoted_param_set_; +}; + +/// Result of promoting a single InCore function. +struct PromotionResult { + FunctionPtr func; + std::map promoted_params; // param index → new param Var +}; + +/// Promote an InCore function. Returns the rewritten Function (or the +/// original if no rewrite was needed) and the map of promoted param slots. +/// Throws if any promoted parameter is also loaded without `transpose=True` +/// in the same body (mixed use would corrupt non-transpose loads). +PromotionResult PromoteInCoreFunction(const FunctionPtr& func) { + TransposeLoadScanner scanner(func->params_); + scanner.VisitStmt(func->body_); + const auto& promoted = scanner.GetPromoted(); + const auto& non_transposed = scanner.GetNonTransposedUses(); + if (promoted.empty()) { + return {func, {}}; + } + + std::unordered_map substitutions; + std::vector new_params = func->params_; + std::map promoted_params; + + for (size_t idx : promoted) { + // Mixed-use rejection: a param promoted from `[a, b]` ND → `[b, a]` DN + // would invalidate every non-transpose `tile.load(p, ...)` that still + // expects the original coordinate system. + CHECK(non_transposed.find(idx) == non_transposed.end()) + << "LowerTransposeLoadParamLayout: parameter at index " << idx + << " is loaded both with transpose=True and transpose=False — only one " + "mode is supported per InCore parameter. Split the parameter or unify " + "the load direction."; + + const auto& old_param = func->params_[idx]; + auto old_tensor_type = As(old_param->GetType()); + CHECK(old_tensor_type) << "LowerTransposeLoadParamLayout: promoted parameter at index " << idx + << " must be TensorType"; + + // Reject the (DN view + explicit physical stride) combination — these + // came from `tensor.transpose` and would compose with the load-side + // transpose to produce a double-encoded transpose. 
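+    // e.g. (illustrative) q = tensor.transpose(x, ...) followed by
+    // tile.load(q, ..., transpose=True) is rejected by the CHECK below:
+    // q's TensorType carries DN plus explicit physical strides.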
+ if (old_tensor_type->tensor_view_.has_value()) { + const auto& view = old_tensor_type->tensor_view_.value(); + CHECK(!(view.layout == TensorLayout::DN && !view.stride.empty())) + << "LowerTransposeLoadParamLayout: tile.load(transpose=True) on a " + "tensor.transpose result is not supported (the DN tag and explicit " + "physical strides would compose as a double transpose). Drop one of " + "the two transpose layers in the source program."; + // Param already promoted in a prior round (idempotent): skip. + if (view.layout == TensorLayout::DN) continue; + } + + auto new_tensor_type = PromoteToCanonicalDN(old_tensor_type); + auto new_var = std::make_shared(old_param->name_hint_, new_tensor_type, old_param->span_); + new_params[idx] = new_var; + substitutions[old_param.get()] = new_var; + promoted_params.emplace(idx, new_var); + } + + if (substitutions.empty()) { + return {func, {}}; + } + + // 1) Substitute param Vars in the body. + auto subbed_body = Substitute(func->body_, substitutions); + + // 2) Rewrite each `tile.load(promoted_param, ..., transpose=True)` in the + // body — swap offsets / shapes / valid_shapes trailing pair, drop the + // transpose kwarg. + TileLoadBodyRewriter body_rewriter(substitutions); + auto new_body = body_rewriter.VisitStmt(subbed_body); + + auto new_func = MutableCopy(func); + new_func->params_ = new_params; + new_func->body_ = new_body; + return {new_func, promoted_params}; +} + +/// Walks every non-InCore function in the program and, for each call site +/// targeting a promoted InCore callee, emits an SSA-form binding for each +/// promoted-slot arg: +/// +/// bridged_ = tensor.as_layout(, DN) +/// = (..., bridged_, ...) +/// +/// The binding is emitted as a separate ``AssignStmt`` immediately before the +/// call statement (instead of being inlined inside the call's args), which is +/// what downstream orchestration codegen expects — it consumes a ``Var`` or a +/// constant literal per call arg, not a nested ``Call``. +class CallSiteAsLayoutInjector : public IRMutator { + public: + explicit CallSiteAsLayoutInjector(const std::map>& promotions) + : promotions_(promotions) {} + + StmtPtr VisitStmt_(const SeqStmtsPtr& op) override { + std::vector new_stmts; + new_stmts.reserve(op->stmts_.size()); + bool any_changed = false; + for (const auto& stmt : op->stmts_) { + // Recurse into nested SeqStmts / control-flow first so inner call sites + // get patched too. + auto recursed = IRMutator::VisitStmt(stmt); + bool inserted = false; + auto patched = MaybeInjectBindings(recursed, new_stmts, &inserted); + if (inserted || patched.get() != recursed.get() || recursed.get() != stmt.get()) { + any_changed = true; + } + new_stmts.push_back(patched); + } + if (!any_changed) return op; + return SeqStmts::Flatten(std::move(new_stmts), op->span_); + } + + // Bare (non-SeqStmts) statement bodies — e.g. ``then_body`` of an ``IfStmt`` + // that contains a single ``AssignStmt``. Wrap any injected bindings into + // a fresh SeqStmts so the resulting body stays a single Stmt. 
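+  // Illustrative effect at a promoted call site (callee param ``p``, orch
+  // arg ``x``; names hypothetical):
+  //   out = kernel(x)
+  // becomes
+  //   p_dn_view = tensor.as_layout(x, DN)   // AssignStmt bound just before
+  //   out = kernel(p_dn_view)
+  // wrapped in a fresh SeqStmts when the enclosing body was a bare Stmt.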
+ StmtPtr VisitStmt_(const AssignStmtPtr& op) override { + auto recursed = IRMutator::VisitStmt_(op); + std::vector pre; + bool inserted = false; + auto patched = MaybeInjectBindings(recursed, pre, &inserted); + if (!inserted) return patched; + pre.push_back(patched); + return SeqStmts::Flatten(std::move(pre), op->span_); + } + + StmtPtr VisitStmt_(const EvalStmtPtr& op) override { + auto recursed = IRMutator::VisitStmt_(op); + std::vector pre; + bool inserted = false; + auto patched = MaybeInjectBindings(recursed, pre, &inserted); + if (!inserted) return patched; + pre.push_back(patched); + return SeqStmts::Flatten(std::move(pre), op->span_); + } + + StmtPtr VisitStmt_(const ReturnStmtPtr& op) override { + auto recursed = IRMutator::VisitStmt_(op); + std::vector pre; + bool inserted = false; + auto patched = MaybeInjectBindings(recursed, pre, &inserted); + if (!inserted) return patched; + pre.push_back(patched); + return SeqStmts::Flatten(std::move(pre), op->span_); + } + + private: + /// If ``stmt``'s RHS is a Call to a promoted callee, build the binding + /// AssignStmts (one per promoted slot) and emit them into ``pre``; + /// rewrite the Call to reference the bound Vars. Returns the (possibly + /// rewritten) statement and sets ``*inserted = true`` if any bindings + /// were added. + StmtPtr MaybeInjectBindings(const StmtPtr& stmt, std::vector& pre, bool* inserted) { + auto extract_call = [](const StmtPtr& s) -> std::pair { + if (auto assign = As(s)) { + return {As(assign->value_), assign->var_}; + } + if (auto eval = As(s)) { + return {As(eval->expr_), nullptr}; + } + if (auto ret = As(s)) { + if (ret->value_.size() == 1) { + return {As(ret->value_[0]), nullptr}; + } + } + return {nullptr, nullptr}; + }; + + auto [call, lhs_var] = extract_call(stmt); + if (!call) return stmt; + auto gv = As(call->op_); + if (!gv) return stmt; + auto it = promotions_.find(gv->name_); + if (it == promotions_.end() || it->second.empty()) return stmt; + const auto& slots = it->second; + + std::vector new_args = call->args_; + bool changed = false; + for (const auto& [idx, new_param_var] : slots) { + INTERNAL_CHECK_SPAN(idx < new_args.size(), call->span_) + << "LowerTransposeLoadParamLayout: promoted param index " << idx << " out of range for call to " + << gv->name_; + auto arg = new_args[idx]; + auto arg_tensor = As(arg->GetType()); + if (!arg_tensor) continue; + // Idempotency: an arg already in DN form needs no bridge. + if (arg_tensor->tensor_view_.has_value() && arg_tensor->tensor_view_->layout == TensorLayout::DN) { + continue; + } + // Build the bridge: bridged = tensor.as_layout(arg, DN). 
+ std::vector> kwargs = {{"layout", std::any(TensorLayout::DN)}}; + auto bridge_call = OpRegistry::GetInstance().Create("tensor.as_layout", {arg}, kwargs, arg->span_); + auto bridge_var = + std::make_shared(new_param_var->name_hint_ + "_dn_view", bridge_call->GetType(), arg->span_); + pre.push_back(std::make_shared(bridge_var, bridge_call, arg->span_)); + new_args[idx] = bridge_var; + changed = true; + } + if (!changed) return stmt; + *inserted = true; + + auto new_call = std::make_shared(call->op_, std::move(new_args), call->kwargs_, call->attrs_, + call->GetType(), call->span_); + if (auto assign = As(stmt)) { + return std::make_shared(assign->var_, new_call, assign->span_); + } + if (auto eval = As(stmt)) { + return std::make_shared(new_call, eval->span_); + } + if (auto ret = As(stmt)) { + return std::make_shared(std::vector{new_call}, ret->span_); + } + return stmt; // unreachable — extract_call only returns non-null for the three above + } + + const std::map>& promotions_; +}; + +} // namespace + +namespace pass { + +Pass LowerTransposeLoadParamLayout() { + auto pass_func = [](const ProgramPtr& program) -> ProgramPtr { + // Phase 1: rewrite InCore functions and collect promotion info keyed by + // callee name (callers reference InCore functions through Call->op_'s + // GlobalVar, which is matched on name_). + std::map new_functions; + std::map> promotions_by_callee_name; + bool modified = false; + + for (const auto& [gvar, func] : program->functions_) { + if (!IsInCoreType(func->func_type_)) { + new_functions[gvar] = func; + continue; + } + auto result = PromoteInCoreFunction(func); + new_functions[gvar] = result.func; + if (result.func.get() != func.get()) modified = true; + if (!result.promoted_params.empty()) { + promotions_by_callee_name[gvar->name_] = std::move(result.promoted_params); + } + } + + if (promotions_by_callee_name.empty()) { + return modified ? std::make_shared(std::move(new_functions), program->name_, program->span_) + : program; + } + + // Phase 2: walk non-InCore functions and inject `tensor.as_layout` at + // each call site that targets a promoted callee. + CallSiteAsLayoutInjector injector(promotions_by_callee_name); + for (auto& [gvar, func] : new_functions) { + if (IsInCoreType(func->func_type_)) continue; + if (!func->body_) continue; + auto new_body = injector.VisitStmt(func->body_); + if (new_body.get() != func->body_.get()) { + auto new_func = MutableCopy(func); + new_func->body_ = new_body; + new_functions[gvar] = new_func; + modified = true; + } + } + + if (!modified) return program; + return std::make_shared(std::move(new_functions), program->name_, program->span_); + }; + + return CreateProgramPass(pass_func, "LowerTransposeLoadParamLayout", + kLowerTransposeLoadParamLayoutProperties); +} + +} // namespace pass + +} // namespace ir +} // namespace pypto diff --git a/src/ir/transforms/materialize_tensor_strides_pass.cpp b/src/ir/transforms/materialize_tensor_strides_pass.cpp index d6c145bcc..1c2783b45 100644 --- a/src/ir/transforms/materialize_tensor_strides_pass.cpp +++ b/src/ir/transforms/materialize_tensor_strides_pass.cpp @@ -28,9 +28,11 @@ * The pass is idempotent: re-running it on already-canonical IR is a no-op. 
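+ * e.g. (illustrative) an ND view on shape [M, N] with an empty stride gains
+ * stride [N, 1] on the first run; a second run sees the populated slot and
+ * returns the type unchanged.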
*/ +#include #include #include #include +#include #include #include #include @@ -163,24 +165,60 @@ class MaterializeTensorStridesMutator : public IRMutator { auto new_return_type = MaterializeType(op->GetType()); bool type_changed = new_return_type.get() != op->GetType().get(); - if (!args_changed && !type_changed) return op; - - auto& registry = OpRegistry::GetInstance(); - if (As(op->op_) || !registry.IsRegistered(op->op_->name_)) { - // Direct ctor — must supply a result type. Prefer the materialized one - // so downstream Vars / Calls see the explicit stride. - return std::make_shared(op->op_, std::move(new_args), op->kwargs_, std::move(new_return_type), - op->span_); + // ``manual_dep_edges`` / ``user_manual_dep_edges`` carry VarPtrs that + // reference Vars defined elsewhere in the IR. When this pass mints a + // fresh Var for a Tensor whose view stride is being materialized, the + // attr entries must follow — otherwise they dangle to the pre-pass + // pointer and SSAVerify / orchestration codegen fail. + std::vector> new_attrs; + new_attrs.reserve(op->attrs_.size()); + bool attrs_changed = false; + for (const auto& [k, v] : op->attrs_) { + if (k == kAttrUserManualDepEdges || k == kAttrManualDepEdges) { + if (const auto* edges = std::any_cast>(&v)) { + std::vector new_edges; + new_edges.reserve(edges->size()); + bool any_changed = false; + for (const auto& e : *edges) { + if (!e) { + new_edges.push_back(e); + continue; + } + auto remapped_var = AsVarLike(IRMutator::VisitExpr(e)); + if (!remapped_var) { + new_edges.push_back(e); + continue; + } + if (remapped_var.get() != e.get()) any_changed = true; + new_edges.push_back(std::move(remapped_var)); + } + if (any_changed) { + attrs_changed = true; + new_attrs.emplace_back(k, std::any(std::move(new_edges))); + continue; + } + } + } + new_attrs.emplace_back(k, v); } - // OpRegistry rebuilds the Call's type via DeduceType. If the deduced type - // is bare and the materialized type is more specific (carries explicit - // stride), prefer the materialized one — but in practice most ops produce - // bare TensorType and the deduction agrees with the input. To avoid - // surprising layout regressions we just accept whatever the registry - // returns; if it disagrees with the materialized form, the - // TensorViewCanonical verifier (strict mode) will surface that as a - // diagnostic so we know to add an explicit op rebuild here. - return registry.Create(op->op_->name_, new_args, op->kwargs_, op->span_); + + if (!args_changed && !type_changed && !attrs_changed) return op; + + // Direct ctor — preserve the (materialized) original type and ``attrs_`` + // rather than re-deducing via OpRegistry. + // + // Re-deducing would discard intentional type overrides that earlier passes + // applied. Concrete case: FlattenTileNdTo2D rewrites a rank-3 ``tile.load`` + // result to a rank-2 ``TileType`` while keeping the load's offsets/shapes + // args at rank 3 (the source-window expressions). If we routed back + // through ``OpRegistry::Create`` here, ``DeduceTileLoadType`` would see + // the rank-3 shape args and synthesize a fresh rank-3 ``TileType``, + // silently undoing the 2D flattening. Forwarding ``op->attrs_`` likewise + // preserves call metadata that earlier passes wrote (e.g. arg directions, + // manual-dep edges) — re-deduction would drop those. + auto attrs_to_use = attrs_changed ? 
diff --git a/src/ir/transforms/resolve_transpose_layout_pass.cpp b/src/ir/transforms/resolve_transpose_layout_pass.cpp
deleted file mode 100644
index 7faf4f900..000000000
--- a/src/ir/transforms/resolve_transpose_layout_pass.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-#include <memory>
-#include <optional>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "pypto/core/logging.h"
-#include "pypto/ir/expr.h"
-#include "pypto/ir/function.h"
-#include "pypto/ir/kind_traits.h"
-#include "pypto/ir/program.h"
-#include "pypto/ir/stmt.h"
-#include "pypto/ir/transforms/base/visitor.h"
-#include "pypto/ir/transforms/pass_properties.h"
-#include "pypto/ir/transforms/passes.h"
-#include "pypto/ir/transforms/utils/mutable_copy.h"
-#include "pypto/ir/transforms/utils/transform_utils.h"
-#include "pypto/ir/type.h"
-
-namespace pypto {
-namespace ir {
-
-using transform_utils::Substitute;
-
-namespace {
-
-struct TransposeParamInfo {
-  size_t param_index;
-};
-
-/**
- * Visitor that scans an InCore function body for tile.load calls with
- * transpose=True whose source tensor is a function parameter.
- */
-class TransposeLoadScanner : public IRVisitor {
- public:
-  explicit TransposeLoadScanner(const std::vector<VarPtr>& params) {
-    for (size_t i = 0; i < params.size(); ++i) {
-      param_ptr_to_index_[params[i].get()] = i;
-    }
-  }
-
-  const std::vector<TransposeParamInfo>& GetResults() const { return results_; }
-
-  void VisitExpr_(const CallPtr& call) override {
-    if (!call) return;
-
-    if (call->op_->name_ == "tile.load") {
-      bool transpose = call->GetKwarg<bool>("transpose", false);
-      if (transpose && !call->args_.empty()) {
-        auto src_var = As<Var>(call->args_[0]);
-        if (src_var) {
-          auto it = param_ptr_to_index_.find(src_var.get());
-          if (it != param_ptr_to_index_.end()) {
-            size_t param_idx = it->second;
-            if (visited_params_.count(param_idx) == 0) {
-              visited_params_.insert(param_idx);
-              results_.push_back({param_idx});
-            }
-          }
-        }
-      }
-    }
-
-    IRVisitor::VisitExpr_(call);
-  }
-
- private:
-  std::unordered_map<const Var*, size_t> param_ptr_to_index_;
-  std::unordered_set<size_t> visited_params_;
-  std::vector<TransposeParamInfo> results_;
-};
-
-// Add DN layout annotation to InCore parameters that have transpose tile.load.
-// Shape is preserved (no swap); DN is a codegen hint only.
-FunctionPtr TransformIncoreParams(const FunctionPtr& func) {
-  TransposeLoadScanner scanner(func->params_);
-  scanner.VisitStmt(func->body_);
-
-  const auto& transpose_results = scanner.GetResults();
-  std::unordered_set<size_t> needs_dn;
-  for (const auto& info : transpose_results) {
-    needs_dn.insert(info.param_index);
-  }
-
-  if (needs_dn.empty()) {
-    return func;
-  }
-
-  std::unordered_map<const Var*, ExprPtr> substitutions;
-  std::vector<VarPtr> new_params = func->params_;
-
-  for (size_t idx : needs_dn) {
-    const auto& old_param = func->params_[idx];
-    auto old_tensor_type = As<TensorType>(old_param->GetType());
-    CHECK(old_tensor_type) << "DN candidate param must be TensorType";
-
-    // Reject ONLY the "tensor.transpose result + tile.load(transpose=True)"
-    // combination. tensor.transpose produces a TensorView with both DN layout
-    // AND explicit physical strides; combining that with another transpose at
-    // load time would double-encode the transpose and emit wrong addresses.
-    //
-    // Slice-derived inputs (explicit strides + ND layout, attached by
-    // OptimizeOrchTensors) still flow through the normal "promote ND → DN,
-    // drop strides" path used by matmul B^T patterns like paged_attention.
-    if (old_tensor_type->tensor_view_.has_value()) {
-      const auto& view = old_tensor_type->tensor_view_.value();
-      CHECK(!(view.layout == TensorLayout::DN && !view.stride.empty()))
-          << "tile.load(transpose=True) on a tensor.transpose result is not "
-             "yet supported (the DN tag and explicit physical strides would "
-             "compose as a double transpose). Workaround: do the transpose at "
-             "the tile level via tile.load(transpose=True) directly on the "
-             "source tensor.";
-      if (view.layout == TensorLayout::DN) continue;
-    }
-
-    CHECK(old_tensor_type->shape_.size() >= 2)
-        << "transpose layout resolution requires at least 2D tensors, got " << old_tensor_type->shape_.size()
-        << "D";
-
-    auto new_tensor_type = std::make_shared<TensorType>(
-        old_tensor_type->shape_, old_tensor_type->dtype_, old_tensor_type->memref_,
-        std::optional<TensorView>(TensorView(std::vector<ExprPtr>{}, TensorLayout::DN)));
-
-    auto new_var = std::make_shared<Var>(old_param->name_hint_, new_tensor_type, old_param->span_);
-    new_params[idx] = new_var;
-    substitutions[old_param.get()] = new_var;
-  }
-
-  if (substitutions.empty()) {
-    return func;
-  }
-
-  auto new_body = Substitute(func->body_, substitutions);
-
-  auto new_func = MutableCopy(func);
-  new_func->params_ = new_params;
-  new_func->body_ = new_body;
-  return new_func;
-}
-
-}  // namespace
-
-namespace pass {
-
-Pass ResolveTransposeLayout() {
-  auto pass_func = [](const ProgramPtr& program) -> ProgramPtr {
-    bool modified = false;
-    std::vector<FunctionPtr> functions;
-
-    for (const auto& [gvar, func] : program->functions_) {
-      if (IsInCoreType(func->func_type_)) {
-        auto new_func = TransformIncoreParams(func);
-        if (new_func != func) {
-          modified = true;
-        }
-        functions.push_back(new_func);
-      } else {
-        functions.push_back(func);
-      }
-    }
-
-    if (!modified) {
-      return program;
-    }
-
-    return std::make_shared<Program>(functions, program->name_, program->span_);
-  };
-
-  return CreateProgramPass(pass_func, "ResolveTransposeLayout", kResolveTransposeLayoutProperties);
-}
-
-}  // namespace pass
-
-}  // namespace ir
-}  // namespace pypto
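The Simplify change that follows adds the RFC §3.3 identity rule for `tensor.as_layout`. A compact model of just the decision logic, including the "bare type reads as ND" default, is sketched below; layouts are reduced to an enum and IR nodes to tuples, so this is an illustration of the rule, not the C++ implementation.

```python
# Sketch of the identity-elimination rule (illustration only; the real
# logic is SimplifyAsLayout in simplify_pass.cpp below).
from enum import Enum


class Layout(Enum):
    ND = "ND"
    DN = "DN"


def fold_as_layout(src, src_layout, target_layout):
    """Drop an as_layout call when it is a metadata no-op."""
    src_layout = src_layout or Layout.ND        # bare TensorType reads as ND
    target_layout = target_layout or Layout.ND
    if src_layout == target_layout:
        return src                              # identity reinterpret: fold
    return ("as_layout", src, target_layout)    # substantive flip: keep


assert fold_as_layout("x", None, Layout.ND) == "x"
assert fold_as_layout("x", Layout.ND, Layout.DN) == ("as_layout", "x", Layout.DN)
```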
"pypto/ir/transforms/utils/mutable_copy.h" +#include "pypto/ir/transforms/utils/tensor_view_semantics.h" #include "pypto/ir/transforms/utils/transform_utils.h" #include "pypto/ir/type.h" @@ -154,11 +155,25 @@ class SimplifyMutator : public arith::IRMutatorWithAnalyzer { /// Refresh the Call's result type_ so the in-memory IR matches what a /// fresh parse would produce (needed for roundtrip structural equality). + /// + /// Also drops identity ``tensor.as_layout`` reinterprets per RFC #1300 §3.3: + /// - ``as_layout(x, x.layout)`` → ``x`` (target layout matches source) + /// + /// Chain folding (``as_layout(as_layout(x, L1), L2)`` → ``as_layout(x, L2)``) + /// is intentionally not done at this layer: after SSA conversion the outer + /// Call references its inner result via a Var binding, not inline, so a + /// naive pointer inspection cannot see across the binding. A dedicated + /// SSA-aware chain optimizer can be added if a real pipeline produces such + /// chains. ExprPtr VisitExpr_(const CallPtr& op) override { auto base = IRMutator::VisitExpr_(op); + auto call = std::dynamic_pointer_cast(base); + if (call && call->op_ && call->op_->name_ == "tensor.as_layout") { + base = SimplifyAsLayout(call); + } auto new_type = SimplifyType(base->GetType()); if (new_type.get() == base->GetType().get()) return base; - auto call = std::dynamic_pointer_cast(base); + call = std::dynamic_pointer_cast(base); if (!call) return base; return std::make_shared(call->op_, call->args_, call->kwargs_, call->attrs_, new_type, call->span_); @@ -503,6 +518,39 @@ class SimplifyMutator : public arith::IRMutatorWithAnalyzer { } private: + /// Identity elimination per RFC #1300 §3.3: + /// ``as_layout(x, layout=x.layout)`` → ``x``. + /// + /// Drops a ``tensor.as_layout`` call when the requested target layout + /// matches what the source already carries — the call is then a no-op + /// metadata reinterpret and downstream consumers can use ``src`` directly. + /// (When layouts differ, ``as_layout`` performs the canonical-pair swap; + /// such substantive reinterprets are preserved.) + /// + /// Chain folding (``as_layout(as_layout(x, L1), L2)`` → ``as_layout(x, L2)``) + /// is intentionally not implemented here. After SSA the outer Call's arg is + /// a Var bound to the inner Call (not the inner Call inline), so naive + /// pointer inspection cannot see across the binding. A dedicated SSA-aware + /// chain optimizer can be added if real pipelines produce such chains. + ExprPtr SimplifyAsLayout(const std::shared_ptr& call) { + if (call->args_.size() != 1) return call; + auto src = call->args_[0]; + + auto src_tensor = As(src->GetType()); + auto out_tensor = As(call->GetType()); + if (!src_tensor || !out_tensor) return call; + + // Bare TensorType is implicitly ND-packed. + TensorLayout src_layout = + src_tensor->tensor_view_.has_value() ? src_tensor->tensor_view_->layout : TensorLayout::ND; + TensorLayout target_layout = + out_tensor->tensor_view_.has_value() ? out_tensor->tensor_view_->layout : TensorLayout::ND; + if (src_layout == target_layout) { + return src; + } + return call; + } + /// Compose var-remap (via the base-class `var_remap_`) with analyzer-based /// constant folding — the Analyzer only knows about its own bindings and /// ignores our Var rebuilds, so remap must run first. 
diff --git a/tests/ut/codegen/test_pto_codegen.py b/tests/ut/codegen/test_pto_codegen.py index 3a891f431..3b6f16020 100644 --- a/tests/ut/codegen/test_pto_codegen.py +++ b/tests/ut/codegen/test_pto_codegen.py @@ -1767,11 +1767,18 @@ def kernel( assert "layout = #pto.layout" in row_view -def test_pto_codegen_3d_dn_tensor_view_uses_last_dim_stride(): - """3D DN tensor emits swapped shape and DN strides based on the original last dim. +def test_pto_codegen_3d_dn_tensor_view_uses_canonical_stride(): + """3D DN tensor emits the canonical RFC #1300 (shape, stride, layout) triple. - Regression test for non-square batch transpose cases such as B:[B, N, K] with N != K. - The DN stride for the last dimension must be K (the original last dim), not N. + For ``[B, K, N] DN`` (RFC §2.3 canonical-form interpretation), packed strides are:: + + stride[n-2] = 1 + stride[n-1] = shape[n-2] (= K) + stride[n-3] = shape[n-2] * shape[n-1] (= K * N — the per-batch volume) + + After ``MaterializeTensorStrides`` activates in the default pipeline (RFC #1300 P6), + the codegen reads ``(shape, stride, layout)`` directly from the materialized + TensorView rather than going through the legacy ``dn_swap`` post-emit path. """ @pl.program @@ -1788,16 +1795,19 @@ def kernel( mlir_code = _generate_default_mlir(DN3DProgram) lines = _get_mlir_lines(mlir_code) b_view = _single_line(lines, "pto.make_tensor_view %arg0") - stride_mul_lines = _find_lines(lines, "arith.muli") - assert "shape = [%c2_index, %c64_index, %c48_index]" in b_view - assert "strides = [" in b_view and ", %c1_index, %c64_index]" in b_view, ( - f"3D DN stride must end with [1, 64] for shape [2, 48, 64]: {b_view}" + # Canonical form preserves the user-written logical shape — no swap. + assert "shape = [%c2_index, %c48_index, %c64_index]" in b_view, ( + f"3D DN canonical shape must match the logical IR shape [2, 48, 64]: {b_view}" ) - assert any("%c64_index" in line and "%c48_index" in line for line in stride_mul_lines), ( - "Expected batch stride to be computed from the original last two dims (64 * 48). " - f"Got muli lines: {stride_mul_lines}" + # Canonical DN strides: stride[n-2]=1, stride[n-1]=shape[n-2]=48. + assert "strides = [" in b_view and ", %c1_index, %c48_index]" in b_view, ( + f"3D DN canonical stride must end with [1, 48] for shape [2, 48, 64]: {b_view}" ) + # The batch stride (= 3072 = 48 * 64) is materialized as a single constant + # ``%c3072_index`` by ``MaterializeTensorStrides``, so it should NOT show up + # as an ``arith.muli`` at codegen. + assert "%c3072_index" in b_view, f"Batch stride must be the constant 3072 (= 48 * 64): {b_view}" assert "layout = #pto.layout" in b_view diff --git a/tests/ut/codegen/test_pto_codegen_cross_core.py b/tests/ut/codegen/test_pto_codegen_cross_core.py index 41f8306d7..32481ef5f 100644 --- a/tests/ut/codegen/test_pto_codegen_cross_core.py +++ b/tests/ut/codegen/test_pto_codegen_cross_core.py @@ -295,7 +295,7 @@ def _compile_and_generate(program) -> dict[str, str]: passes.convert_tensor_to_tile_ops, passes.flatten_tile_nd_to_2d, passes.infer_tile_memory_space, - passes.resolve_transpose_layout, + passes.lower_transpose_load_param_layout, passes.resolve_backend_op_layouts, passes.init_mem_ref, passes.memory_reuse, diff --git a/tests/ut/ir/operators/test_tensor_as_layout.py b/tests/ut/ir/operators/test_tensor_as_layout.py new file mode 100644 index 000000000..1238ec0c2 --- /dev/null +++ b/tests/ut/ir/operators/test_tensor_as_layout.py @@ -0,0 +1,197 @@ +# Copyright (c) PyPTO Contributors. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- + +"""Unit tests for ``tensor.as_layout`` op (RFC #1300, P4). + +The op flips a TensorType's layout tag over the same physical memory; it +never reshapes (use ``tensor.reshape`` for that). Target shape is mechanically +derived from the source — callers do not pass a target shape. + +This file covers ``DeduceTensorAsLayoutType`` (type inference + validity); the +Simplify-pass identity-elimination rule is covered in +``tests/ut/ir/transforms/test_simplify_pass.py``. +""" + +import pytest +from pypto import DataType, ir + + +def _span(): + return ir.Span.unknown() + + +def _const(value: int, dtype: DataType = DataType.INDEX): + return ir.ConstInt(value, dtype, _span()) + + +def _tensor_var(shape, dtype=DataType.FP32, view=None, name="t"): + span = _span() + shape_exprs = [_const(d) for d in shape] if isinstance(shape[0], int) else shape + if view is None: + t = ir.TensorType(shape_exprs, dtype) + else: + t = ir.TensorType(shape_exprs, dtype, None, view) + return ir.Var(name, t, span) + + +def _result_view(call): + """Return the TensorView (or None) on the Call's result type.""" + t = call.type + assert isinstance(t, ir.TensorType) + return t.tensor_view + + +def _values_of(exprs): + out = [] + for e in exprs: + assert isinstance(e, ir.ConstInt) + out.append(e.value) + return out + + +# ============================================================================ +# Cross-layout flips — target shape is auto-derived (trailing-2-dim swap) +# ============================================================================ + + +def test_bare_nd_to_dn_flips_trailing_dims(): + """Bare ``[N=8, K=4]`` (implicit ND) → DN auto-swaps to ``[K=4, N=8]`` + DN-packed, the §4.2 canonical pair partner.""" + src = _tensor_var([8, 4]) + call = ir.op.tensor.as_layout(src, ir.TensorLayout.DN) + + assert call.op.name == "tensor.as_layout" + out = call.type + assert isinstance(out, ir.TensorType) + assert _values_of(out.shape) == [4, 8] + view = _result_view(call) + assert view is not None + assert view.layout == ir.TensorLayout.DN + # DN-packed for [4, 8]: stride=[1, 4] + assert _values_of(view.stride) == [1, 4] + + +def test_dn_packed_to_nd_flips_back(): + """``[K=4, N=8] DN-packed`` → ND auto-swaps back to ``[N=8, K=4] ND``.""" + src_view = ir.TensorView([_const(1), _const(4)], ir.TensorLayout.DN) + src = _tensor_var([4, 8], view=src_view) + call = ir.op.tensor.as_layout(src, ir.TensorLayout.ND) + + out = call.type + assert isinstance(out, ir.TensorType) + assert _values_of(out.shape) == [8, 4] + view = _result_view(call) + assert view is not None + assert view.layout == ir.TensorLayout.ND + # ND-packed for [8, 4]: stride=[4, 1] + assert _values_of(view.stride) == [4, 1] + + +def test_3d_nd_to_dn_swaps_trailing_pair_only(): + """Outer batch dim is preserved; only the trailing 2 dims swap.""" + src = _tensor_var([2, 4, 8]) # bare ND 
+ call = ir.op.tensor.as_layout(src, ir.TensorLayout.DN) + + out = call.type + assert isinstance(out, ir.TensorType) + # [2, 4, 8] ND → [2, 8, 4] DN (trailing pair swap) + assert _values_of(out.shape) == [2, 8, 4] + view = _result_view(call) + assert view is not None + # DN-packed for [2, 8, 4]: stride=[8*4, 1, 8] = [32, 1, 8] + assert _values_of(view.stride) == [32, 1, 8] + + +# ============================================================================ +# Identity flips — same layout, shape unchanged (Simplify will fold the call) +# ============================================================================ + + +def test_identity_flip_keeps_shape(): + """``as_layout(x, x.layout)`` produces an identity Call: same shape, + same layout (modulo packed-canonical stride materialization). The Call + survives type inference; the Simplify pass folds it away.""" + src = _tensor_var([8, 4]) # bare ND + call = ir.op.tensor.as_layout(src, ir.TensorLayout.ND) + + out = call.type + assert isinstance(out, ir.TensorType) + assert _values_of(out.shape) == [8, 4] + view = _result_view(call) + assert view is not None + assert view.layout == ir.TensorLayout.ND + assert _values_of(view.stride) == [4, 1] + + +# ============================================================================ +# Validity rejections +# ============================================================================ + + +def test_nz_target_rejected(): + """NZ on TensorType is forbidden (NZ is tile-only / fractal).""" + src = _tensor_var([8, 4]) + with pytest.raises(ValueError, match="NZ layout is not allowed"): + ir.op.tensor.as_layout(src, ir.TensorLayout.NZ) + + +def test_cross_layout_flip_below_rank_2_rejected(): + """ND ↔ DN flip needs at least 2 dims to swap; 1D is rejected.""" + src = _tensor_var([8]) + with pytest.raises(ValueError, match="rank >= 2"): + ir.op.tensor.as_layout(src, ir.TensorLayout.DN) + + +def test_strided_source_rejected(): + """Strided sub-views can't ride the canonical pair; reject them so the + caller routes through ``tensor.slice`` / ``tensor.reshape`` first.""" + # Synthesize a strided ND tensor: stride [16, 1] on shape [4, 8] (parent + # stride preserved on a 4×8 sub-view of an 8×16 row-major buffer). + src_view = ir.TensorView([_const(16), _const(1)], ir.TensorLayout.ND) + src = _tensor_var([4, 8], view=src_view, name="strided") + with pytest.raises(ValueError, match="strided sub-view"): + ir.op.tensor.as_layout(src, ir.TensorLayout.DN) + + +# ============================================================================ +# Symbolic shapes — accepted on the cross-layout flip; shape swap survives +# ============================================================================ + + +def test_symbolic_shape_flips(): + """Symbolic ``[N, K] ND`` → DN swaps to ``[K, N] DN``; ExprPtr identity + is preserved through the swap.""" + span = _span() + n_var = ir.Var("N", ir.ScalarType(DataType.INDEX), span) + k_var = ir.Var("K", ir.ScalarType(DataType.INDEX), span) + src = _tensor_var([n_var, k_var]) + call = ir.op.tensor.as_layout(src, ir.TensorLayout.DN) + + out = call.type + assert isinstance(out, ir.TensorType) + # Trailing pair swap: [N, K] -> [K, N]; ExprPtrs preserved. 
+    assert out.shape[0] is k_var
+    assert out.shape[1] is n_var
+    view = _result_view(call)
+    assert view is not None
+    assert view.layout == ir.TensorLayout.DN
+
+
+# ============================================================================
+# Op-registry sanity
+# ============================================================================
+
+
+def test_op_registered():
+    """``tensor.as_layout`` must be discoverable through the OpRegistry."""
+    assert ir.is_op_registered("tensor.as_layout")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/ut/ir/transforms/test_lower_transpose_load_param_layout_pass.py b/tests/ut/ir/transforms/test_lower_transpose_load_param_layout_pass.py
new file mode 100644
index 000000000..470ef6abb
--- /dev/null
+++ b/tests/ut/ir/transforms/test_lower_transpose_load_param_layout_pass.py
@@ -0,0 +1,483 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+
+"""Unit tests for LowerTransposeLoadParamLayout pass (RFC #1300 P6).
+
+The pass promotes each InCore parameter loaded via ``tile.load(transpose=True)``
+to canonical-form DN (RFC §3.3 + §4.2): the trailing shape pair is swapped,
+the layout tag becomes DN, the body's ``tile.load`` call swaps its
+``offsets`` / ``shapes`` / ``valid_shapes`` trailing pair and resets the
+``transpose=True`` kwarg to ``False``, and every non-InCore call site bridges
+its arg through ``tensor.as_layout(arg, DN)``.
+
+``tensor.as_layout`` is internal-only and not exposed via ``pypto.language``,
+so we cannot write the post-pass IR as ``@pl.program``. Instead we drive the
+pass with ``@pl.program`` ``Before`` programs and assert the resulting IR
+shape programmatically.
+""" + +import pypto.language as pl +import pytest +from pypto import ir, passes + + +def _as_tensor_type(ty: ir.Type) -> ir.TensorType: + """Narrow ``ty`` to ``TensorType`` for type-checker awareness.""" + assert isinstance(ty, ir.TensorType), f"expected TensorType, got {type(ty).__name__}" + return ty + + +def _find_function(program, name): + """Return the Function with the given name from a Program.""" + for _gv, func in program.functions.items(): + if func.name == name: + return func + raise AssertionError(f"function {name!r} not found in program") + + +def _iter_stmts(stmt): + """Yield every statement under ``stmt`` (depth-first).""" + if isinstance(stmt, ir.SeqStmts): + for s in stmt.stmts: + yield from _iter_stmts(s) + else: + yield stmt + for attr in ("body", "then_body", "else_body"): + inner = getattr(stmt, attr, None) + if inner is not None: + yield from _iter_stmts(inner) + + +def _find_tile_loads(func): + """Return every ``tile.load`` Call expression in ``func.body``.""" + loads = [] + for stmt in _iter_stmts(func.body): + value = getattr(stmt, "value", None) + if isinstance(value, ir.Call) and value.op is not None and value.op.name == "tile.load": + loads.append(value) + return loads + + +def _find_calls_to(func, callee_name): + """Return every Call in ``func.body`` whose op is GlobalVar(callee_name).""" + calls = [] + for stmt in _iter_stmts(func.body): + value = getattr(stmt, "value", None) + if isinstance(value, ir.Call) and isinstance(value.op, ir.GlobalVar) and value.op.name == callee_name: + calls.append(value) + return calls + + +def _find_assign_rhs(func, var): + """Return the RHS expression of the ``AssignStmt`` that defines ``var``.""" + for stmt in _iter_stmts(func.body): + if isinstance(stmt, ir.AssignStmt) and stmt.var is var: + return stmt.value + raise AssertionError(f"no AssignStmt defines var {var.name_hint}") + + +def _shape_dims(ty): + """Return ConstInt shape dims as ints (rejects symbolic dims for test fixtures).""" + tensor_type = _as_tensor_type(ty) + out = [] + for dim in tensor_type.shape: + assert isinstance(dim, ir.ConstInt), f"non-constant dim {dim} in test fixture" + out.append(dim.value) + return out + + +def _transpose_kwarg(call): + """Return the value of the ``transpose`` kwarg, or ``None`` if absent.""" + return call.kwargs.get("transpose") + + +class TestBTransposePromotesParam: + """``C = A @ B^T`` with B loaded via ``transpose=True`` — param promoted to DN.""" + + def test_btranspose_basic(self): + M, K, N = 64, 128, 32 + + @pl.program + class Before: + @pl.function(type=pl.FunctionType.InCore) + def matmul_incore( + self, + a: pl.Tensor[[M, K], pl.FP32], + b: pl.Tensor[[N, K], pl.FP32], + c: pl.Out[pl.Tensor[[M, N], pl.FP32]], + ) -> pl.Tensor[[M, N], pl.FP32]: + tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) + tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) + tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) + tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) + tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) + c_store = pl.store(tile_c, [0, 0], c) + return c_store + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] + ) -> pl.Tensor[[M, N], pl.FP32]: + c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) + c_result = self.matmul_incore(a, b, c) + return c_result + + After = passes.lower_transpose_load_param_layout()(Before) + + incore = 
_find_function(After, "matmul_incore") + b_type = _as_tensor_type(incore.params[1].type) + assert _shape_dims(b_type) == [K, N], f"b param shape: {_shape_dims(b_type)}" + assert b_type.tensor_view is not None + assert b_type.tensor_view.layout == ir.TensorLayout.DN + + a_type = _as_tensor_type(incore.params[0].type) + assert _shape_dims(a_type) == [M, K] + assert a_type.tensor_view is None + + loads_by_src = {} + for ld in _find_tile_loads(incore): + assert isinstance(ld.args[0], ir.Var) + loads_by_src[ld.args[0].name_hint] = ld + + load_b = loads_by_src["b"] + shapes_arg = load_b.args[2] + assert isinstance(shapes_arg, ir.MakeTuple) + shape_vals = [el.value for el in shapes_arg.elements if isinstance(el, ir.ConstInt)] + assert shape_vals == [K, N], f"tile.load(b) shapes: {shape_vals}" + assert _transpose_kwarg(load_b) is False, "tile.load(b) transpose kwarg must be False after P6" + + load_a = loads_by_src["a"] + shape_vals_a = [el.value for el in load_a.args[2].elements if isinstance(el, ir.ConstInt)] + assert shape_vals_a == [M, K] + + orch = _find_function(After, "orchestrator") + calls = _find_calls_to(orch, "matmul_incore") + assert len(calls) == 1 + # `b` is bridged via an SSA AssignStmt: the call arg is a Var bound to + # a separately-emitted ``tensor.as_layout(orig_b, DN)`` Call. + b_arg = calls[0].args[1] + assert isinstance(b_arg, ir.Var) + b_def_rhs = _find_assign_rhs(orch, b_arg) + assert isinstance(b_def_rhs, ir.Call) and b_def_rhs.op is not None + assert b_def_rhs.op.name == "tensor.as_layout", ( + f"orch must wrap b in tensor.as_layout, got {b_def_rhs.op.name if b_def_rhs.op else None}" + ) + bridged_t = _as_tensor_type(b_def_rhs.type) + assert _shape_dims(bridged_t) == [K, N] + assert bridged_t.tensor_view is not None + assert bridged_t.tensor_view.layout == ir.TensorLayout.DN + + def test_btranspose_non_square(self): + M, K, N = 128, 64, 32 + + @pl.program + class Before: + @pl.function(type=pl.FunctionType.InCore) + def matmul_incore( + self, + a: pl.Tensor[[M, K], pl.FP32], + b: pl.Tensor[[N, K], pl.FP32], + c: pl.Out[pl.Tensor[[M, N], pl.FP32]], + ) -> pl.Tensor[[M, N], pl.FP32]: + tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) + tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) + tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) + tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) + tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) + c_store = pl.store(tile_c, [0, 0], c) + return c_store + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] + ) -> pl.Tensor[[M, N], pl.FP32]: + c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) + c_result = self.matmul_incore(a, b, c) + return c_result + + After = passes.lower_transpose_load_param_layout()(Before) + incore = _find_function(After, "matmul_incore") + b_type = _as_tensor_type(incore.params[1].type) + assert _shape_dims(b_type) == [K, N] + assert b_type.tensor_view is not None + assert b_type.tensor_view.layout == ir.TensorLayout.DN + + +class TestATransposePromotesParam: + """``C = A^T @ B`` — A param promoted to canonical DN.""" + + def test_atranspose_basic(self): + M, K, N = 64, 128, 32 + + @pl.program + class Before: + @pl.function(type=pl.FunctionType.InCore) + def matmul_incore( + self, + a: pl.Tensor[[K, M], pl.FP32], + b: pl.Tensor[[K, N], pl.FP32], + c: pl.Out[pl.Tensor[[M, N], pl.FP32]], + ) -> pl.Tensor[[M, N], pl.FP32]: + 
tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True) + tile_b = pl.load(b, [0, 0], [K, N], target_memory=pl.MemorySpace.Mat) + tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) + tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) + tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) + c_store = pl.store(tile_c, [0, 0], c) + return c_store + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[K, N], pl.FP32] + ) -> pl.Tensor[[M, N], pl.FP32]: + c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) + c_result = self.matmul_incore(a, b, c) + return c_result + + After = passes.lower_transpose_load_param_layout()(Before) + incore = _find_function(After, "matmul_incore") + a_t = _as_tensor_type(incore.params[0].type) + assert _shape_dims(a_t) == [M, K] + assert a_t.tensor_view is not None + assert a_t.tensor_view.layout == ir.TensorLayout.DN + + b_t = _as_tensor_type(incore.params[1].type) + assert _shape_dims(b_t) == [K, N] + assert b_t.tensor_view is None + + loads = {ld.args[0].name_hint: ld for ld in _find_tile_loads(incore)} + load_a = loads["a"] + shape_vals = [el.value for el in load_a.args[2].elements if isinstance(el, ir.ConstInt)] + assert shape_vals == [M, K] + assert _transpose_kwarg(load_a) is False + + orch = _find_function(After, "orchestrator") + call = _find_calls_to(orch, "matmul_incore")[0] + # `a` is bridged via tensor.as_layout. After P6's SSA refactor (PR + # review fix), the bridge is bound to a fresh Var by a preceding + # AssignStmt, so the call arg is a Var, not the inline Call. Look up + # the binding's RHS. + a_arg = call.args[0] + assert isinstance(a_arg, ir.Var) + a_def_rhs = _find_assign_rhs(orch, a_arg) + assert isinstance(a_def_rhs, ir.Call) and a_def_rhs.op is not None + assert a_def_rhs.op.name == "tensor.as_layout" + # `b` is not promoted, so its arg is the raw Var (no bridge). 
+ assert isinstance(call.args[1], ir.Var) + + +class TestABTransposePromotesBothParams: + """``C = A^T @ B^T`` — both params promoted, both call args wrapped.""" + + def test_abtranspose_basic(self): + M, K, N = 64, 128, 32 + + @pl.program + class Before: + @pl.function(type=pl.FunctionType.InCore) + def matmul_incore( + self, + a: pl.Tensor[[K, M], pl.FP32], + b: pl.Tensor[[N, K], pl.FP32], + c: pl.Out[pl.Tensor[[M, N], pl.FP32]], + ) -> pl.Tensor[[M, N], pl.FP32]: + tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True) + tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) + tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) + tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) + tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) + c_store = pl.store(tile_c, [0, 0], c) + return c_store + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] + ) -> pl.Tensor[[M, N], pl.FP32]: + c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) + c_result = self.matmul_incore(a, b, c) + return c_result + + After = passes.lower_transpose_load_param_layout()(Before) + incore = _find_function(After, "matmul_incore") + a_t = _as_tensor_type(incore.params[0].type) + b_t = _as_tensor_type(incore.params[1].type) + assert _shape_dims(a_t) == [M, K] + assert a_t.tensor_view is not None and a_t.tensor_view.layout == ir.TensorLayout.DN + assert _shape_dims(b_t) == [K, N] + assert b_t.tensor_view is not None and b_t.tensor_view.layout == ir.TensorLayout.DN + + orch = _find_function(After, "orchestrator") + call = _find_calls_to(orch, "matmul_incore")[0] + # Both promoted args are bridged via SSA AssignStmts. + for slot in (0, 1): + arg = call.args[slot] + assert isinstance(arg, ir.Var) + rhs = _find_assign_rhs(orch, arg) + assert isinstance(rhs, ir.Call) and rhs.op is not None + assert rhs.op.name == "tensor.as_layout" + + +class TestNoOpCases: + """Pass is a no-op when no parameter needs promotion.""" + + def test_no_transpose_unchanged(self): + M, K, N = 64, 128, 32 + + @pl.program + class Before: + @pl.function(type=pl.FunctionType.InCore) + def matmul_incore( + self, + a: pl.Tensor[[M, K], pl.FP32], + b: pl.Tensor[[K, N], pl.FP32], + c: pl.Out[pl.Tensor[[M, N], pl.FP32]], + ) -> pl.Tensor[[M, N], pl.FP32]: + tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) + tile_b = pl.load(b, [0, 0], [K, N], target_memory=pl.MemorySpace.Mat) + tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) + tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) + tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) + c_store = pl.store(tile_c, [0, 0], c) + return c_store + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[K, N], pl.FP32] + ) -> pl.Tensor[[M, N], pl.FP32]: + c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) + c_result = self.matmul_incore(a, b, c) + return c_result + + After = passes.lower_transpose_load_param_layout()(Before) + ir.assert_structural_equal(After, Before) + + def test_already_dn_param_idempotent(self): + """A param already carrying the DN tag short-circuits — IR unchanged. + + Mirrors the pre-P6 mid-state where the param has been DN-tagged but the + body's tile.load still has ``transpose=True`` (idempotent re-run of the + legacy pass form). 
The pass detects ``layout == DN`` on the param, + ``continue``s past the promotion, and leaves the body untouched. + """ + M, K, N = 64, 128, 32 + + @pl.program + class Before: + @pl.function(type=pl.FunctionType.InCore) + def matmul_incore( + self, + a: pl.Tensor[[M, K], pl.FP32], + b: pl.Tensor[[N, K], pl.FP32, pl.DN], + c: pl.Out[pl.Tensor[[M, N], pl.FP32]], + ) -> pl.Tensor[[M, N], pl.FP32]: + tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) + tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) + tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) + tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) + tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) + c_store = pl.store(tile_c, [0, 0], c) + return c_store + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32, pl.DN] + ) -> pl.Tensor[[M, N], pl.FP32]: + c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) + c_result = self.matmul_incore(a, b, c) + return c_result + + After = passes.lower_transpose_load_param_layout()(Before) + ir.assert_structural_equal(After, Before) + + +class TestMixedUseRejected: + """A param loaded with both transpose=True and transpose=False is rejected.""" + + def test_mixed_transpose_modes_rejected(self): + M, K, N = 64, 128, 32 + + @pl.program + class Before: + @pl.function(type=pl.FunctionType.InCore) + def matmul_incore( + self, + a: pl.Tensor[[N, K], pl.FP32], + b: pl.Tensor[[N, K], pl.FP32], + c: pl.Out[pl.Tensor[[M, N], pl.FP32]], + ) -> pl.Tensor[[M, N], pl.FP32]: + tile_a = pl.load(a, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) + tile_b = pl.load(a, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat) + tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) + tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) + tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) + c_store = pl.store(tile_c, [0, 0], c) + return c_store + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, a: pl.Tensor[[N, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] + ) -> pl.Tensor[[M, N], pl.FP32]: + c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) + c_result = self.matmul_incore(a, b, c) + return c_result + + with pytest.raises(Exception, match="only one mode is supported per InCore parameter"): + passes.lower_transpose_load_param_layout()(Before) + + +class TestPartialLoadPromotion: + """A param with a partial-window transpose load: param shape swap is based on the + full TensorType shape, not the load window.""" + + def test_partial_load_square_tensor(self): + @pl.program + class Before: + @pl.function(type=pl.FunctionType.InCore) + def kernel( + self, + a: pl.Tensor[[64, 128], pl.BF16], + key_cache: pl.Tensor[[128, 128], pl.BF16], + out: pl.Out[pl.Tensor[[64, 64], pl.FP32]], + ) -> pl.Tensor[[64, 64], pl.FP32]: + tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) + tile_k = pl.load( + key_cache, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat, transpose=True + ) + tile_a_l0 = pl.move(tile_a, target_memory=pl.MemorySpace.Left) + tile_k_l0 = pl.move(tile_k, target_memory=pl.MemorySpace.Right) + tile_c = pl.matmul(tile_a_l0, tile_k_l0) + out_store = pl.store(tile_c, [0, 0], out) + return out_store + + @pl.function(type=pl.FunctionType.Orchestration) + def orchestrator( + self, + a: pl.Tensor[[64, 128], pl.BF16], + key_cache: pl.Tensor[[128, 128], 
pl.BF16], + ) -> pl.Tensor[[64, 64], pl.FP32]: + out: pl.Tensor[[64, 64], pl.FP32] = pl.create_tensor([64, 64], dtype=pl.FP32) + out_result = self.kernel(a, key_cache, out) + return out_result + + After = passes.lower_transpose_load_param_layout()(Before) + incore = _find_function(After, "kernel") + kc_t = _as_tensor_type(incore.params[1].type) + assert _shape_dims(kc_t) == [128, 128] + assert kc_t.tensor_view is not None + assert kc_t.tensor_view.layout == ir.TensorLayout.DN + + loads = {ld.args[0].name_hint: ld for ld in _find_tile_loads(incore)} + load_k = loads["key_cache"] + shape_vals = [el.value for el in load_k.args[2].elements if isinstance(el, ir.ConstInt)] + assert shape_vals == [128, 64] + assert _transpose_kwarg(load_k) is False + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_pass_manager.py b/tests/ut/ir/transforms/test_pass_manager.py index df781133f..f6a6b35c2 100644 --- a/tests/ut/ir/transforms/test_pass_manager.py +++ b/tests/ut/ir/transforms/test_pass_manager.py @@ -39,7 +39,7 @@ "FlattenTileNdTo2D", "AutoTileMatmulL0", "InferTileMemorySpace", - "ResolveTransposeLayout", + "LowerTransposeLoadParamLayout", "ResolveBackendOpLayouts", "NormalizeStmtStructure", "ExpandMixedKernel", @@ -48,6 +48,7 @@ "NormalizeReturnOrder", "LowerPipelineLoops", "CanonicalizeIOOrder", + "MaterializeTensorStrides", "InitMemRef", "MemoryReuse", "LegalizePTOBufferReuse", @@ -71,7 +72,7 @@ "FlattenTileNdTo2D", "AutoTileMatmulL0", "InferTileMemorySpace", - "ResolveTransposeLayout", + "LowerTransposeLoadParamLayout", "ResolveBackendOpLayouts", "NormalizeStmtStructure", "ExpandMixedKernel", @@ -80,6 +81,7 @@ "NormalizeReturnOrder", "LowerPipelineLoops", "CanonicalizeIOOrder", + "MaterializeTensorStrides", "InitMemRef", "MemoryReuse", "LegalizePTOBufferReuse", diff --git a/tests/ut/ir/transforms/test_resolve_transpose_layout_pass.py b/tests/ut/ir/transforms/test_resolve_transpose_layout_pass.py deleted file mode 100644 index e8a1bd707..000000000 --- a/tests/ut/ir/transforms/test_resolve_transpose_layout_pass.py +++ /dev/null @@ -1,631 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- - -"""Unit tests for ResolveTransposeLayout pass.""" - -import pypto.language as pl -import pytest -from pypto import ir, passes - - -class TestResolveTransposeLayoutBTranspose: - """Test B transpose cases: C = A @ B^T.""" - - def test_btranspose_basic(self): - """B stored as [N, K], loaded with transpose=True -> param keeps shape [N, K] + DN.""" - M, K, N = 64, 128, 32 - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[M, K], pl.FP32], - b: pl.Tensor[[N, K], pl.FP32], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[M, K], pl.FP32], - b: pl.Tensor[[N, K], pl.FP32, pl.DN], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Expected) - - def test_btranspose_non_square(self): - """Non-square dimensions: M=128, K=64, N=32.""" - M, K, N = 128, 64, 32 - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[M, K], pl.FP32], - b: pl.Tensor[[N, K], pl.FP32], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - 
return c_result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[M, K], pl.FP32], - b: pl.Tensor[[N, K], pl.FP32, pl.DN], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Expected) - - -class TestResolveTransposeLayoutATranspose: - """Test A transpose cases: C = A^T @ B.""" - - def test_atranspose_basic(self): - """A stored as [K, M], loaded with transpose=True -> param keeps shape [K, M] + DN.""" - M, K, N = 64, 128, 32 - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[K, M], pl.FP32], - b: pl.Tensor[[K, N], pl.FP32], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_b = pl.load(b, [0, 0], [K, N], target_memory=pl.MemorySpace.Mat) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[K, N], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[K, M], pl.FP32, pl.DN], - b: pl.Tensor[[K, N], pl.FP32], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_b = pl.load(b, [0, 0], [K, N], target_memory=pl.MemorySpace.Mat) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[K, N], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Expected) - - -class TestResolveTransposeLayoutABTranspose: - """Test both A and B transposed: C = A^T @ B^T.""" - - def test_abtranspose_basic(self): - """Both A and B transposed -> both params get DN layout.""" - M, K, N = 64, 
128, 32 - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[K, M], pl.FP32], - b: pl.Tensor[[N, K], pl.FP32], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[K, M], pl.FP32, pl.DN], - b: pl.Tensor[[N, K], pl.FP32, pl.DN], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Expected) - - def test_abtranspose_non_square(self): - """Both transposed with non-square dimensions: M=32, K=128, N=64.""" - M, K, N = 32, 128, 64 - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[K, M], pl.FP32], - b: pl.Tensor[[N, K], pl.FP32], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[K, M], pl.FP32, pl.DN], - b: pl.Tensor[[N, K], pl.FP32, pl.DN], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_b = pl.load(b, [0, 0], [N, K], 
target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Expected) - - -class TestResolveTransposeLayoutNoOp: - """Test cases where the pass should be a no-op.""" - - def test_no_transpose_unchanged(self): - """No transpose=True loads -> program unchanged.""" - M, K, N = 64, 128, 32 - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[M, K], pl.FP32], - b: pl.Tensor[[K, N], pl.FP32], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [K, N], target_memory=pl.MemorySpace.Mat) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[K, N], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Before) - - def test_already_dn_layout_unchanged(self): - """Parameter already has DN layout -> pass is idempotent.""" - M, K, N = 64, 128, 32 - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[M, K], pl.FP32], - b: pl.Tensor[[N, K], pl.FP32, pl.DN], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32, pl.DN] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Before) - - def test_elementwise_no_transpose(self): - """Simple elementwise with no transpose -> unchanged.""" - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def add_incore( - self, - x: pl.Tensor[[64, 64], pl.FP32], - out_0: pl.Out[pl.Tensor[[64, 64], pl.FP32]], - ) -> pl.Tensor[[64, 64], pl.FP32]: - x_tile: pl.Tile[[64, 64], pl.FP32] = pl.load(x, [0, 0], [64, 64]) - y_tile: 
pl.Tile[[64, 64], pl.FP32] = pl.tile.add(x_tile, x_tile) - out_0: pl.Tensor[[64, 64], pl.FP32] = pl.store(y_tile, [0, 0], out_0) - return out_0 - - @pl.function(type=pl.FunctionType.Orchestration) - def main(self, x: pl.Tensor[[64, 64], pl.FP32]) -> pl.Tensor[[64, 64], pl.FP32]: - out_0: pl.Tensor[[64, 64], pl.FP32] = pl.create_tensor([64, 64], dtype=pl.FP32) - y: pl.Tensor[[64, 64], pl.FP32] = self.add_incore(x, out_0) - return y - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Before) - - def test_transpose_false_explicit(self): - """Explicit transpose=False -> no change.""" - M, K, N = 64, 128, 32 - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[M, K], pl.FP32], - b: pl.Tensor[[K, N], pl.FP32], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat, transpose=False) - tile_b = pl.load(b, [0, 0], [K, N], target_memory=pl.MemorySpace.Mat, transpose=False) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[K, N], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Before) - - -class TestResolveTransposeLayoutMixed: - """Test mixed scenarios with one transpose and one non-transpose param.""" - - def test_only_second_param_transposed(self): - """Only second param has transpose -> only second param changes.""" - M, K, N = 64, 128, 64 - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[M, K], pl.FP32], - b: pl.Tensor[[N, K], pl.FP32], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = pl.store(tile_c, [0, 0], c) - return c_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32] - ) -> pl.Tensor[[M, N], pl.FP32]: - c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32) - c_result = self.matmul_incore(a, b, c) - return c_result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def matmul_incore( - self, - a: pl.Tensor[[M, K], pl.FP32], - b: pl.Tensor[[N, K], pl.FP32, pl.DN], - c: pl.Out[pl.Tensor[[M, N], pl.FP32]], - ) -> pl.Tensor[[M, N], pl.FP32]: - tile_a = pl.load(a, [0, 0], [M, K], target_memory=pl.MemorySpace.Mat) - tile_b = pl.load(b, [0, 0], [N, K], target_memory=pl.MemorySpace.Mat, transpose=True) - tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0a, tile_b_l0b) - c_store = 
-                c_store = pl.store(tile_c, [0, 0], c)
-                return c_store
-
-            @pl.function(type=pl.FunctionType.Orchestration)
-            def orchestrator(
-                self, a: pl.Tensor[[M, K], pl.FP32], b: pl.Tensor[[N, K], pl.FP32]
-            ) -> pl.Tensor[[M, N], pl.FP32]:
-                c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32)
-                c_result = self.matmul_incore(a, b, c)
-                return c_result
-
-        After = passes.resolve_transpose_layout()(Before)
-        ir.assert_structural_equal(After, Expected)
-
-    def test_only_first_param_transposed(self):
-        """Only first param has transpose -> only first param changes."""
-        M, K, N = 64, 64, 128
-
-        @pl.program
-        class Before:
-            @pl.function(type=pl.FunctionType.InCore)
-            def matmul_incore(
-                self,
-                a: pl.Tensor[[K, M], pl.FP32],
-                b: pl.Tensor[[K, N], pl.FP32],
-                c: pl.Out[pl.Tensor[[M, N], pl.FP32]],
-            ) -> pl.Tensor[[M, N], pl.FP32]:
-                tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True)
-                tile_b = pl.load(b, [0, 0], [K, N], target_memory=pl.MemorySpace.Mat)
-                tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left)
-                tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right)
-                tile_c = pl.matmul(tile_a_l0a, tile_b_l0b)
-                c_store = pl.store(tile_c, [0, 0], c)
-                return c_store
-
-            @pl.function(type=pl.FunctionType.Orchestration)
-            def orchestrator(
-                self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[K, N], pl.FP32]
-            ) -> pl.Tensor[[M, N], pl.FP32]:
-                c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32)
-                c_result = self.matmul_incore(a, b, c)
-                return c_result
-
-        @pl.program
-        class Expected:
-            @pl.function(type=pl.FunctionType.InCore)
-            def matmul_incore(
-                self,
-                a: pl.Tensor[[K, M], pl.FP32, pl.DN],
-                b: pl.Tensor[[K, N], pl.FP32],
-                c: pl.Out[pl.Tensor[[M, N], pl.FP32]],
-            ) -> pl.Tensor[[M, N], pl.FP32]:
-                tile_a = pl.load(a, [0, 0], [K, M], target_memory=pl.MemorySpace.Mat, transpose=True)
-                tile_b = pl.load(b, [0, 0], [K, N], target_memory=pl.MemorySpace.Mat)
-                tile_a_l0a = pl.move(tile_a, target_memory=pl.MemorySpace.Left)
-                tile_b_l0b = pl.move(tile_b, target_memory=pl.MemorySpace.Right)
-                tile_c = pl.matmul(tile_a_l0a, tile_b_l0b)
-                c_store = pl.store(tile_c, [0, 0], c)
-                return c_store
-
-            @pl.function(type=pl.FunctionType.Orchestration)
-            def orchestrator(
-                self, a: pl.Tensor[[K, M], pl.FP32], b: pl.Tensor[[K, N], pl.FP32]
-            ) -> pl.Tensor[[M, N], pl.FP32]:
-                c: pl.Tensor[[M, N], pl.FP32] = pl.create_tensor([M, N], dtype=pl.FP32)
-                c_result = self.matmul_incore(a, b, c)
-                return c_result
-
-        After = passes.resolve_transpose_layout()(Before)
-        ir.assert_structural_equal(After, Expected)
-
-
-class TestResolveTransposeLayoutPartialLoad:
-    """Test cases where tile.load reads a subset of the tensor (partial load)."""
-
-    def test_partial_load_square_tensor(self):
-        """Tensor [128, 128] with partial tile.load [128, 64] transpose -> shape stays [128, 128] + DN.
-
-        Regression test for #606: paged attention key_cache tensor shape was incorrectly
-        changed from [128, 128] to [128, 64] because the pass used the tile load shape
-        instead of transposing the original tensor shape.
- """ - - @pl.program - class Before: - @pl.function(type=pl.FunctionType.InCore) - def kernel( - self, - a: pl.Tensor[[64, 128], pl.BF16], - key_cache: pl.Tensor[[128, 128], pl.BF16], - out: pl.Out[pl.Tensor[[64, 64], pl.FP32]], - ) -> pl.Tensor[[64, 64], pl.FP32]: - tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) - tile_k = pl.load( - key_cache, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat, transpose=True - ) - tile_a_l0 = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_k_l0 = pl.move(tile_k, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0, tile_k_l0) - out_store = pl.store(tile_c, [0, 0], out) - return out_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, - a: pl.Tensor[[64, 128], pl.BF16], - key_cache: pl.Tensor[[128, 128], pl.BF16], - ) -> pl.Tensor[[64, 64], pl.FP32]: - out: pl.Tensor[[64, 64], pl.FP32] = pl.create_tensor([64, 64], dtype=pl.FP32) - out_result = self.kernel(a, key_cache, out) - return out_result - - @pl.program - class Expected: - @pl.function(type=pl.FunctionType.InCore) - def kernel( - self, - a: pl.Tensor[[64, 128], pl.BF16], - key_cache: pl.Tensor[[128, 128], pl.BF16, pl.DN], - out: pl.Out[pl.Tensor[[64, 64], pl.FP32]], - ) -> pl.Tensor[[64, 64], pl.FP32]: - tile_a = pl.load(a, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat) - tile_k = pl.load( - key_cache, [0, 0], [64, 128], target_memory=pl.MemorySpace.Mat, transpose=True - ) - tile_a_l0 = pl.move(tile_a, target_memory=pl.MemorySpace.Left) - tile_k_l0 = pl.move(tile_k, target_memory=pl.MemorySpace.Right) - tile_c = pl.matmul(tile_a_l0, tile_k_l0) - out_store = pl.store(tile_c, [0, 0], out) - return out_store - - @pl.function(type=pl.FunctionType.Orchestration) - def orchestrator( - self, - a: pl.Tensor[[64, 128], pl.BF16], - key_cache: pl.Tensor[[128, 128], pl.BF16], - ) -> pl.Tensor[[64, 64], pl.FP32]: - out: pl.Tensor[[64, 64], pl.FP32] = pl.create_tensor([64, 64], dtype=pl.FP32) - out_result = self.kernel(a, key_cache, out) - return out_result - - After = passes.resolve_transpose_layout()(Before) - ir.assert_structural_equal(After, Expected) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/ut/ir/transforms/test_simplify_pass.py b/tests/ut/ir/transforms/test_simplify_pass.py index e8e3eba75..396e9a308 100644 --- a/tests/ut/ir/transforms/test_simplify_pass.py +++ b/tests/ut/ir/transforms/test_simplify_pass.py @@ -1143,5 +1143,94 @@ def main(self): ir.assert_structural_equal(after, Expected) +# ============================================================================ +# tensor.as_layout folding (RFC #1300 P4-b) +# ============================================================================ + + +class TestAsLayoutFolding: + """Simplify drops identity ``tensor.as_layout`` reinterprets per RFC §3.3. + + ``tensor.as_layout`` is internal-only (not in ``pl.*``), so these tests + drive the pass with hand-built IR rather than ``@pl.program``. + + Layout encoding refresher (RFC §4.2): row-major ``[a, b]`` ND describes + the same physical buffer as ``[b, a]`` DN-packed. The trailing-dim swap + is the canonical pair the validity check accepts. + + Note on chain folding: folding ``as_layout(as_layout(x, ...), ...)`` → + ``as_layout(x, ...)`` is intentionally not implemented at this layer. + After SSA the outer Call references its inner via a Var, not inline, + so naive pointer inspection cannot see across the binding. 
+
+    @staticmethod
+    def _build_program(make_body):
+        """Build a 1-function Program whose body produces a tensor expression.
+
+        ``make_body(x_param)`` returns ``(stmts, return_var)``; the function
+        then returns ``return_var``.
+        """
+        x = ir.Var("x", ir.TensorType([ci(8), ci(4)], DataType.FP32), S)
+        stmts, ret_var = make_body(x)
+        body = wrap_stmts(list(stmts) + [ir.ReturnStmt([ret_var], S)])
+        func = ir.Function("main", [x], [ret_var.type], body, S)
+        return ir.Program([func], "test", S)
+
+    @staticmethod
+    def _iter_stmts(stmt):
+        if isinstance(stmt, ir.SeqStmts):
+            for s in stmt.stmts:
+                yield from TestAsLayoutFolding._iter_stmts(s)
+        else:
+            yield stmt
+
+    def _final_assign(self, program):
+        """Return the function's final AssignStmt (the result-producing one)."""
+        func = program.get_function("main")
+        assert func is not None
+        last = None
+        for stmt in self._iter_stmts(func.body):
+            if isinstance(stmt, ir.AssignStmt):
+                last = stmt
+        assert last is not None, "no AssignStmt in body"
+        return last
+
+    def test_eliminates_identity_as_layout(self):
+        """``as_layout(x, x.layout)`` simplifies to ``x``: target layout
+        matches source layout, so the call is a no-op."""
+
+        def build(x):
+            # x is bare ND [8, 4]; flipping to ND is identity.
+            same = ir.op.tensor.as_layout(x, ir.TensorLayout.ND)
+            same_var = ir.Var("same", same.type, S)
+            return [ir.AssignStmt(same_var, same, S)], same_var
+
+        prog = self._build_program(build)
+        after = passes.simplify()(prog)
+
+        last = self._final_assign(after).value
+        assert isinstance(last, ir.Var) and last.name_hint == "x", (
+            f"expected identity as_layout to simplify to ``x``, got {type(last).__name__} ({last})"
+        )
+
+    def test_preserves_substantive_layout_flip(self):
+        """Genuine ND → DN flip (with the auto trailing-pair swap) survives —
+        Simplify only drops layout-tag identities."""
+
+        def build(x):
+            call = ir.op.tensor.as_layout(x, ir.TensorLayout.DN)
+            v = ir.Var("y", call.type, S)
+            return [ir.AssignStmt(v, call, S)], v
+
+        prog = self._build_program(build)
+        after = passes.simplify()(prog)
+
+        last = self._final_assign(after).value
+        assert isinstance(last, ir.Call) and last.op.name == "tensor.as_layout"
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])