diff --git a/src/ggml-hsa/kernels/build_triton.py b/src/ggml-hsa/kernels/build_triton.py index a15bf2e942..9119ae34bb 100644 --- a/src/ggml-hsa/kernels/build_triton.py +++ b/src/ggml-hsa/kernels/build_triton.py @@ -103,8 +103,10 @@ def compile_triton_kernel( verbose: If True, enables verbose compilation output. """ + from dataclasses import MISSING + import triton - from triton.backends.amd_triton_npu.config import _UNSET, config_context + from triton.backends.amd_triton_npu.config import config_context from triton.backends.amd_triton_npu.driver import NPUDriver, get_npu_cache_dir # Determine Triton cache directory @@ -125,7 +127,7 @@ def compile_triton_kernel( TempEnvSet("TRITON_CACHE_DIR", str(cache_dir)), config_context( compile_only=True, - transform_tiling_script=kernel_spec.config.get("transform_script", _UNSET), + transform_tiling_script=kernel_spec.config.get("transform_script", MISSING), output_format="xclbin", debug=1 if verbose else 0, target=arch, diff --git a/src/ggml-hsa/kernels/triton/vecadd_aie2.mlir b/src/ggml-hsa/kernels/triton/vecadd_aie2.mlir index b194a70450..b1923052c0 100644 --- a/src/ggml-hsa/kernels/triton/vecadd_aie2.mlir +++ b/src/ggml-hsa/kernels/triton/vecadd_aie2.mlir @@ -2,142 +2,36 @@ // SPDX-License-Identifier: MIT //////////////////////////////////////////////////////////////////////////////// -// Transform Script for Vector Addition: Step-by-Step Annotated -// This script transforms a simple elementwise vector addition IR into a tiled, -// bufferized, and hardware-friendly form suitable for AIE execution. -// Each step is annotated with its purpose, assumptions, and relation to the IR. +// Transform Script for Vector Addition (AIE2) +// Simple elementwise add: out = a + b +// Binary op (2 inputs + 1 output). No fusion needed. Vec tile = 16 (AIE2). +// No type casts needed (bf16 add is native). +// Uses shared library sequences from transform_library.mlir (auto-injected). //////////////////////////////////////////////////////////////////////////////// module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + transform.named_sequence @__transform_main( + %arg1: !transform.any_op {transform.readonly}) { + + // No Phase 1/2 for vec-add (no elementwise fusion needed) + transform.include @flatten_tile_forall failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @pad_and_promote_binary_bf16 failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @one_shot_bufferize failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @post_bufferize_cleanup failures(propagate) + (%arg1) : (!transform.any_op) -> () + + transform.include @vectorize_generics_at_16 failures(propagate) + (%arg1) : (!transform.any_op) -> () + %vh = transform.include @air_herd_mapping_and_vectorize + failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op - // Step 1: Match the main elementwise op (linalg.generic). - // Assumption: The IR contains a linalg.generic op representing the elementwise add. - // This is the main computation to be transformed. - %add = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op - - // Step 2: Flatten elementwise op. - // Purpose: Converts multi-dimensional elementwise ops into a 1D form for easier tiling/vectorization. - // Assumption: The op is elementwise and can be flattened without changing semantics. - %add_flattened = transform.structured.flatten_elementwise %add - : (!transform.any_op) -> !transform.any_op - - // Step 3: Bufferize result to shared (L2) memory allocation. - // Purpose: Allocates the result buffer in memory space 1 (shared/L2), required for AIR/AIE memory hierarchy. - // Assumption: The result of the elementwise op will be written to L2/shared memory. - %add_res_shared, %new_add = transform.structured.bufferize_to_allocation %add_flattened - {memory_space = 1, bufferize_destination_only, emit_dealloc} : !transform.any_op - - // Step 4: Tile the computation using scf.forall with tile size 256. - // Purpose: Introduces parallelism and prepares for mapping to AIE columns. - // Assumption: The problem size is a multiple of 256, or padding will be handled later. - %add_1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %tiled_add_1, %forall_add_1 = - transform.structured.tile_using_forall %add_1 tile_sizes [256] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 5: Run canonicalization and CSE. - // Purpose: Cleans up the IR after tiling, merges redundant ops, and prepares for further transforms. - // Assumption: Canonicalization will simplify the IR and remove dead code. - %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_2 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_2 : !transform.any_op - - // Step 6: Match the (possibly tiled) linalg.generic for further transformation. - %add_2 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op - - // Step 7: Pad the operation. - // Purpose: Ensures that the computation is aligned to tile sizes, handles boundary conditions. - // Assumption: Padding values/types are correct for the op; nofold_flags prevent folding of padding. - %padded_add, %pad_add, %__ = transform.structured.pad %add_2 { - padding_values=[0.0 : bf16, 0.0 : bf16, 0.0 : bf16], - padding_dimensions=[0, 1, 2], - nofold_flags=[1, 1, 1], - copy_back_op="linalg.copy" - } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - - // Step 8: Rewrite in destination-passing style (DPS). - // Purpose: Converts the op to DPS, which is required for bufferization and explicit memory management. - // Assumption: The op supports DPS conversion. - %pad_dps_add = transform.structured.rewrite_in_destination_passing_style %pad_add : (!transform.any_op) -> !transform.any_op - - // Step 9: Promote the operands to local memory (AIE local, memory_space=2). - // Purpose: Moves input operands to fast local memory for efficient AIE execution. - // Assumption: The operands are suitable for promotion and local memory is available. - %padded_add_lhs = transform.get_producer_of_operand %padded_add[0] : (!transform.any_op) -> (!transform.any_op) - %padded_add_lhs_buffer, %padded_add_lhs_new = transform.structured.bufferize_to_allocation %padded_add_lhs - {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op - - %padded_add_rhs = transform.get_producer_of_operand %padded_add[1] : (!transform.any_op) -> (!transform.any_op) - %padded_add_rhs_buffer, %padded_add_rhs_new = transform.structured.bufferize_to_allocation %padded_add_rhs - {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op - - // Step 10: Promote the result to local memory (AIE local, memory_space=2). - // Purpose: Ensures the result buffer is also in local memory for fast access. - // Assumption: The result fits in local memory and can be promoted. - %padded_add_result = transform.get_producer_of_operand %padded_add[2] : (!transform.any_op) -> (!transform.any_op) - %padded_add_result_buffer, %padded_add_result_new = transform.structured.bufferize_to_allocation %padded_add_result - {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op - - // Step 11: Run canonicalization and CSE again. - // Purpose: Cleans up after bufferization and promotion, merges redundant allocs/copies. - // Assumption: Canonicalization will further simplify the IR. - %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - - // Step 12: One-shot bufferization of the function. - // Purpose: Converts all tensors to memrefs, finalizes bufferization for AIR/AIE lowering. - // Assumption: The function is now in DPS form and ready for bufferization. - %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op - - // Step 13: Final canonicalization and AIR-specific cleanup. - // Purpose: Removes redundant memcpy ops, eliminates cascade memcpy patterns, and canonicalizes. - // Assumption: AIR passes will further optimize memory ops for hardware. - %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func6 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func6 : !transform.any_op - transform.apply_patterns to %func6 { - transform.apply_patterns.canonicalization - } : !transform.any_op - %linalg_copies = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %memref_copies = transform.structured.linalg_copy_to_memref %linalg_copies : (!transform.any_op) -> !transform.any_op - %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op - %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op - - // Step 14: Tile linalg.add for vectorization (tile size 16). - // Purpose: Final tiling to enable vectorized execution on AIE hardware. - // Assumption: The innermost dimension is a multiple of 16, or padding has handled the remainder. Vec size 16 for @llvm.aie2.add.accfloat(<8 x i64> %acc1, <8 x i64> %acc2). - %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %inner_most_generics, %vec_loops:1 = - transform.structured.tile_using_for %linalg_generics tile_sizes [16] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 15: AIR Constructs Mapping - // Purpose: Convert high-level parallel constructs to AIE-specific operations for hardware execution. - // Convert parallel loops to AIE herd operations for multi-core execution - %forall_as_herd = transform.structured.match ops{["scf.forall"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %parallel = transform.loop.forall_to_parallel %forall_as_herd : (!transform.any_op) -> !transform.any_op - %herd = transform.air.par_to_herd %parallel : (!transform.any_op) -> !transform.any_op - - // Convert memory copies to DMA operations for efficient data movement - %copies_in_herd = transform.structured.match ops{["memref.copy", "linalg.copy"]} in %herd : (!transform.any_op) -> !transform.any_op - %dmas_from_copies = transform.air.copy_to_dma %copies_in_herd : (!transform.any_op) -> !transform.any_op - - // Apply vectorization to optimize for AIE vector units - %vectorized_herd = transform.air.herd_vectorize %herd : (!transform.any_op) -> !transform.any_op transform.yield } }