ypapadop-amd · ypapadop-amd · May 19, 2026 · May 19, 2026
diff --git a/src/ggml-hsa/kernels/build_triton.py b/src/ggml-hsa/kernels/build_triton.py
@@ -103,8 +103,10 @@ def compile_triton_kernel(
         verbose: If True, enables verbose compilation output.
 
     """
+    from dataclasses import MISSING
+
     import triton
-    from triton.backends.amd_triton_npu.config import _UNSET, config_context
+    from triton.backends.amd_triton_npu.config import config_context
     from triton.backends.amd_triton_npu.driver import NPUDriver, get_npu_cache_dir
 
     # Determine Triton cache directory
@@ -125,7 +127,7 @@ def compile_triton_kernel(
         TempEnvSet("TRITON_CACHE_DIR", str(cache_dir)),
         config_context(
             compile_only=True,
-            transform_tiling_script=kernel_spec.config.get("transform_script", _UNSET),
+            transform_tiling_script=kernel_spec.config.get("transform_script", MISSING),
             output_format="xclbin",
             debug=1 if verbose else 0,
             target=arch,

diff --git a/src/ggml-hsa/kernels/triton/vecadd_aie2.mlir b/src/ggml-hsa/kernels/triton/vecadd_aie2.mlir
@@ -2,142 +2,36 @@
 // SPDX-License-Identifier: MIT
 
 ////////////////////////////////////////////////////////////////////////////////
-// Transform Script for Vector Addition: Step-by-Step Annotated
-// This script transforms a simple elementwise vector addition IR into a tiled,
-// bufferized, and hardware-friendly form suitable for AIE execution.
-// Each step is annotated with its purpose, assumptions, and relation to the IR.
+// Transform Script for Vector Addition (AIE2)
+// Simple elementwise add: out = a + b
+// Binary op (2 inputs + 1 output). No fusion needed. Vec tile = 16 (AIE2).
+// No type casts needed (bf16 add is native).
+// Uses shared library sequences from transform_library.mlir (auto-injected).
 ////////////////////////////////////////////////////////////////////////////////
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+  transform.named_sequence @__transform_main(
+      %arg1: !transform.any_op {transform.readonly}) {
+
+    // Phases 1 and 2 are skipped for vec-add (no elementwise fusion needed).
+    transform.include @flatten_tile_forall failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @pad_and_promote_binary_bf16 failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @one_shot_bufferize failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @post_bufferize_cleanup failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+
+    transform.include @vectorize_generics_at_16 failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    %vh = transform.include @air_herd_mapping_and_vectorize
+        failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
 
-    // Step 1: Match the main elementwise op (linalg.generic).
-    // Assumption: The IR contains a linalg.generic op representing the elementwise add.
-    // This is the main computation to be transformed.
-        %add = transform.structured.match ops{["linalg.generic"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-    // Step 2: Flatten elementwise op.
-    // Purpose: Converts multi-dimensional elementwise ops into a 1D form for easier tiling/vectorization.
-    // Assumption: The op is elementwise and can be flattened without changing semantics.
-        %add_flattened = transform.structured.flatten_elementwise %add
-        : (!transform.any_op) -> !transform.any_op
-
-    // Step 3: Bufferize result to shared (L2) memory allocation.
-    // Purpose: Allocates the result buffer in memory space 1 (shared/L2), required for AIR/AIE memory hierarchy.
-    // Assumption: The result of the elementwise op will be written to L2/shared memory.
-        %add_res_shared, %new_add = transform.structured.bufferize_to_allocation %add_flattened
-          {memory_space = 1, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 4: Tile the computation using scf.forall with tile size 256.
-    // Purpose: Introduces parallelism and prepares for mapping to AIE columns.
-    // Assumption: The problem size is a multiple of 256, or padding will be handled later.
-        %add_1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %tiled_add_1, %forall_add_1 =
-          transform.structured.tile_using_forall %add_1 tile_sizes [256] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 5: Run canonicalization and CSE.
-    // Purpose: Cleans up the IR after tiling, merges redundant ops, and prepares for further transforms.
-    // Assumption: Canonicalization will simplify the IR and remove dead code.
-        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_2 : !transform.any_op
-
-    // Step 6: Match the (possibly tiled) linalg.generic for further transformation.
-        %add_2 = transform.structured.match ops{["linalg.generic"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-    // Step 7: Pad the operation.
-    // Purpose: Ensures that the computation is aligned to tile sizes, handles boundary conditions.
-    // Assumption: Padding values/types are correct for the op; nofold_flags prevent folding of padding.
-        %padded_add, %pad_add, %__ = transform.structured.pad %add_2 {
-            padding_values=[0.0 : bf16, 0.0 : bf16, 0.0 : bf16],
-            padding_dimensions=[0, 1, 2],
-            nofold_flags=[1, 1, 1],
-            copy_back_op="linalg.copy"
-        } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 8: Rewrite in destination-passing style (DPS).
-    // Purpose: Converts the op to DPS, which is required for bufferization and explicit memory management.
-    // Assumption: The op supports DPS conversion.
-        %pad_dps_add = transform.structured.rewrite_in_destination_passing_style %pad_add : (!transform.any_op) -> !transform.any_op
-
-    // Step 9: Promote the operands to local memory (AIE local, memory_space=2).
-    // Purpose: Moves input operands to fast local memory for efficient AIE execution.
-    // Assumption: The operands are suitable for promotion and local memory is available.
-        %padded_add_lhs = transform.get_producer_of_operand %padded_add[0] : (!transform.any_op) -> (!transform.any_op)
-        %padded_add_lhs_buffer, %padded_add_lhs_new = transform.structured.bufferize_to_allocation %padded_add_lhs
-            {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-        %padded_add_rhs = transform.get_producer_of_operand %padded_add[1] : (!transform.any_op) -> (!transform.any_op)
-        %padded_add_rhs_buffer, %padded_add_rhs_new = transform.structured.bufferize_to_allocation %padded_add_rhs
-            {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 10: Promote the result to local memory (AIE local, memory_space=2).
-    // Purpose: Ensures the result buffer is also in local memory for fast access.
-    // Assumption: The result fits in local memory and can be promoted.
-        %padded_add_result = transform.get_producer_of_operand %padded_add[2] : (!transform.any_op) -> (!transform.any_op)
-        %padded_add_result_buffer, %padded_add_result_new = transform.structured.bufferize_to_allocation %padded_add_result
-            {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 11: Run canonicalization and CSE again.
-    // Purpose: Cleans up after bufferization and promotion, merges redundant allocs/copies.
-    // Assumption: Canonicalization will further simplify the IR.
-        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_3 : !transform.any_op
-
-    // Step 12: One-shot bufferization of the function.
-    // Purpose: Converts all tensors to memrefs, finalizes bufferization for AIR/AIE lowering.
-    // Assumption: The function is now in DPS form and ready for bufferization.
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 13: Final canonicalization and AIR-specific cleanup.
-    // Purpose: Removes redundant memcpy ops, eliminates cascade memcpy patterns, and canonicalizes.
-    // Assumption: AIR passes will further optimize memory ops for hardware.
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %linalg_copies = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %memref_copies = transform.structured.linalg_copy_to_memref %linalg_copies : (!transform.any_op) -> !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-    // Step 14: Tile linalg.add for vectorization (tile size 16).
-    // Purpose: Final tiling to enable vectorized execution on AIE hardware.
-    // Assumption: The innermost dimension is a multiple of 16, or padding has handled the remainder. Vec size 16 for @llvm.aie2.add.accfloat(<8 x i64> %acc1, <8 x i64> %acc2).
-        %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:1 =
-          transform.structured.tile_using_for %linalg_generics tile_sizes [16]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 15: AIR Constructs Mapping
-    // Purpose: Convert high-level parallel constructs to AIE-specific operations for hardware execution.
-    // Convert parallel loops to AIE herd operations for multi-core execution
-        %forall_as_herd = transform.structured.match ops{["scf.forall"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %parallel = transform.loop.forall_to_parallel %forall_as_herd  : (!transform.any_op) -> !transform.any_op
-        %herd = transform.air.par_to_herd %parallel : (!transform.any_op) -> !transform.any_op
-
-    // Convert memory copies to DMA operations for efficient data movement
-        %copies_in_herd = transform.structured.match ops{["memref.copy", "linalg.copy"]} in %herd : (!transform.any_op) -> !transform.any_op
-        %dmas_from_copies = transform.air.copy_to_dma %copies_in_herd : (!transform.any_op) -> !transform.any_op
-
-    // Apply vectorization to optimize for AIE vector units
-        %vectorized_herd = transform.air.herd_vectorize %herd : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }