diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000..087a0e4699 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,206 @@ +# GitHub Copilot Instructions for GGML + +## Project Overview + +GGML is a tensor library for machine learning with a focus on: +- Low-level cross-platform implementation +- Integer quantization support for efficient model inference +- Broad hardware support (CPU, CUDA, Metal, HIP/HSA, SYCL, Vulkan, WebGPU, OpenCL) +- Automatic differentiation +- Zero memory allocations during runtime +- No third-party dependencies for core functionality + +**Note:** This project is under active development. Core library development primarily happens in the [llama.cpp](https://github.com/ggerganov/llama.cpp) and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repositories. + +## Build System + +### CMake Configuration + +- **Minimum CMake version:** 3.14 +- **Languages:** C (C11), C++ (C++17), Assembly +- **Default build type:** Release (if not specified) +- **Shared libraries:** Default ON (except MINGW/Emscripten/WASM) + +### Building the Project + +```bash +mkdir build && cd build +cmake .. +cmake --build . 
--config Release -j 8 +``` + +### Key CMake Options + +- `BUILD_SHARED_LIBS` - Build shared libraries (default: ON except MINGW/Emscripten/WASM) +- `GGML_BUILD_TESTS` - Build test suite (default: ON when standalone) +- `GGML_BUILD_EXAMPLES` - Build example programs (default: ON when standalone) +- `GGML_CUDA` - Enable CUDA backend +- `GGML_METAL` - Enable Metal backend (default: ON for Apple platforms) +- `GGML_HIP` - Enable HIP backend +- `GGML_HSA` - Enable HSA backend +- `GGML_SYCL` - Enable SYCL backend +- `GGML_VULKAN` - Enable Vulkan backend +- `GGML_BLAS` - Enable BLAS support + +## Coding Standards + +### Code Style + +- **Indentation:** 4 spaces (see `.editorconfig`) +- **Line endings:** LF (Unix-style) +- **Charset:** UTF-8 +- **Final newline:** Required +- **Trailing whitespace:** Remove + +### Formatting Tools + +- A `.clang-format` file exists in `src/ggml-hsa/` based on LLVM style +- **Column limit:** 100 characters +- **Pointer alignment:** Middle (e.g., `int * ptr`) +- **Brace style:** Attach + +### Naming Conventions + +- Public API functions: `ggml_*` prefix +- Backend-specific functions: `ggml_<backend>_*` (e.g., `ggml_cuda_*`, `ggml_metal_*`) +- Types: `struct ggml_*` +- Enums: `GGML_*` (uppercase with underscores) + +## Architecture + +### Directory Structure + +``` +├── include/ # Public headers (ggml.h, ggml-*.h, gguf.h) +├── src/ # Core implementation and backend implementations +│ ├── ggml.c # Core tensor library +│ ├── ggml-cpu/ # CPU-specific optimizations +│ ├── ggml-cuda/ # CUDA backend +│ ├── ggml-metal/ # Metal backend +│ ├── ggml-hip/ # HIP backend +│ ├── ggml-hsa/ # HSA backend +│ └── ... # Other backends +├── examples/ # Example applications (GPT-2, GPT-J, MNIST, SAM, etc.) 
+├── tests/ # Test suite +├── cmake/ # CMake modules +├── scripts/ # Utility scripts +└── docs/ # Documentation (GGUF format spec) +``` + +### Key Components + +- **ggml.h/ggml.c** - Core tensor operations and compute graph +- **ggml-backend.h** - Backend abstraction layer +- **ggml-alloc.h** - Memory allocation utilities +- **gguf.h** - GGUF file format for model serialization +- **Backend implementations** - Hardware-specific optimizations + +## Testing + +### Running Tests + +```bash +cd build +ctest --output-on-failure +``` + +### Test Organization + +- Unit tests in `tests/` directory +- Backend-specific tests in `tests/ggml-<backend>/` +- Test naming: `test-*.c` or `test-*.cpp` +- Use CTest for test execution + +### Writing Tests + +- Follow existing test patterns in `tests/` directory +- Test both correctness and performance where applicable +- Include edge cases and boundary conditions +- Backend tests should verify backend-specific functionality + +## Contributing Guidelines + +⚠️ **Important:** For changes to the core `ggml` library (including CMake build system): +- Open a PR in https://github.com/ggml-org/llama.cpp first +- This ensures better visibility, testing, and review +- See [CONTRIBUTING.md](../CONTRIBUTING.md) for details + +### Pull Request Process + +1. Ensure code follows the established style +2. Add or update tests as needed +3. Verify all tests pass locally +4. Update documentation if changing public APIs +5. Keep changes focused and minimal + +## Common Tasks + +### Adding a New Backend + +1. Create `src/ggml-<backend>/` directory +2. Implement backend interface defined in `ggml-backend.h` +3. Add CMakeLists.txt with appropriate options +4. Create public header `include/ggml-<backend>.h` +5. Add tests in `tests/ggml-<backend>/` +6. Update main CMakeLists.txt with new options + +### Adding New Tensor Operations + +1. Add operation to `enum ggml_op` in `include/ggml.h` +2. Implement forward pass in `src/ggml.c` +3. Implement backward pass (gradient) if needed +4. 
Add operation to backend implementations +5. Add comprehensive tests +6. Update documentation + +### Optimizing Existing Operations + +1. Profile to identify bottlenecks +2. Consider SIMD/vectorization opportunities (see `src/ggml-cpu/`) +3. Implement backend-specific optimizations +4. Add performance tests +5. Verify correctness with existing tests + +## Backend-Specific Notes + +### CUDA Backend +- Use `ggml-cuda.h` for CUDA-specific APIs +- CUDA kernels in `src/ggml-cuda/` + +### Metal Backend +- macOS/iOS GPU acceleration +- Shaders in Metal Shading Language +- Default ON for Apple platforms + +### HIP/HSA Backends +- AMD GPU support (HIP) and AMD XDNA NPU support (HSA) +- Use appropriate compiler flags for ROCm + +### CPU Backend +- SIMD optimizations in `src/ggml-cpu/` +- Multiple implementations for different architectures +- llamafile integration for optimized matrix multiplication + +## Python Bindings + +Python bindings are available in `examples/python/`: +- Auto-generated using CFFI +- Support for quantized tensors with automatic conversion +- See `examples/python/README.md` for usage + +## Resources + +- [Introduction to ggml](https://huggingface.co/blog/introduction-to-ggml) +- [GGUF file format specification](../docs/gguf.md) +- [llama.cpp project](https://github.com/ggerganov/llama.cpp) - Primary development hub +- [whisper.cpp project](https://github.com/ggerganov/whisper.cpp) - Speech recognition with ggml + +## Important Reminders + +1. **Minimal changes**: Make surgical, focused changes +2. **Test early and often**: Run tests after each significant change +3. **Follow existing patterns**: Match the style and structure of existing code +4. **Consider performance**: GGML is performance-critical; profile changes +5. **Cross-platform**: Ensure changes work on Linux, macOS, and Windows +6. **Documentation**: Update comments and docs for public API changes +7. 
**Upstream first**: Core changes should go to llama.cpp repository first diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml new file mode 100644 index 0000000000..691bb72fa3 --- /dev/null +++ b/.github/workflows/format.yml @@ -0,0 +1,74 @@ +name: Format Code + +on: + push: + branches: [ hsa-backend ] + paths: + - 'src/ggml-hsa/**' + pull_request: + branches: [ hsa-backend ] + paths: + - 'src/ggml-hsa/**' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +jobs: + format: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v6 + with: + ref: ${{ github.head_ref || github.ref_name }} + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install formatters + run: | + sudo mkdir -p /etc/apt/keyrings + curl -fsSL https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm-snapshot.gpg + echo "deb [signed-by=/etc/apt/keyrings/llvm-snapshot.gpg] http://apt.llvm.org/$(lsb_release -cs)/ llvm-toolchain-$(lsb_release -cs)-22 main" | sudo tee /etc/apt/sources.list.d/llvm-toolchain-22.list + sudo apt-get update + sudo apt-get install -y clang-format-22 + pip install black + + - name: Format C++ code with clang-format + run: | + find src/ggml-hsa -type f \( -name '*.cpp' -o -name '*.cc' -o -name '*.hpp' -o -name '*.h' \) \ + -exec clang-format-22 -i --style=file:src/ggml-hsa/.clang-format {} + + + - name: Format Python code with black + run: | + black src/ggml-hsa + + - name: Check for changes + id: verify + run: | + if ! 
git diff --exit-code; then + echo "changes=true" >> $GITHUB_OUTPUT + else + echo "changes=false" >> $GITHUB_OUTPUT + fi + + - name: Commit and push formatting changes + if: steps.verify.outputs.changes == 'true' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add src/ggml-hsa + git commit -m "Auto-format code in src/ggml-hsa + + - Format C++ code with clang-format + - Format Python code with black + + Co-Authored-By: github-actions[bot] " + git push diff --git a/CMakeLists.txt b/CMakeLists.txt index 4323afe57b..3f1ad422a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -213,6 +213,7 @@ option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON) option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF) +option(GGML_HSA "ggml: use HSA" OFF) option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF) option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) @@ -319,6 +320,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-cann.h include/ggml-cpp.h include/ggml-cuda.h + include/ggml-hsa.h include/ggml-opt.h include/ggml-metal.h include/ggml-rpc.h diff --git a/README.md b/README.md index aaa7162c46..39fac36f11 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,12 @@ cmake -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc .. 
cmake -DCMAKE_C_COMPILER="$(hipconfig -l)/clang" -DCMAKE_CXX_COMPILER="$(hipconfig -l)/clang++" -DGGML_HIP=ON ``` +## Using HSA + +```bash +cmake -DCMAKE_C_COMPILER="$(hipconfig -l)/clang" -DCMAKE_CXX_COMPILER="$(hipconfig -l)/clang++" -DGGML_HSA=ON +``` + ## Using SYCL ```bash diff --git a/cmake/ggml-config.cmake.in b/cmake/ggml-config.cmake.in index 91c9d5cd34..511e0756bc 100644 --- a/cmake/ggml-config.cmake.in +++ b/cmake/ggml-config.cmake.in @@ -83,6 +83,11 @@ if (NOT GGML_SHARED_LIB) set(GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas) endif() + if (GGML_HSA) + find_package(hsa-runtime64 1.0 REQUIRED) + set(GGML_HSA_INTERFACE_LINK_LIBRARIES hsa-runtime64::hsa-runtime64) + endif() + if (GGML_SYCL) set(GGML_SYCL_INTERFACE_LINK_LIBRARIES "") find_package(DNNL) diff --git a/examples/gpt-2/main-backend.cpp b/examples/gpt-2/main-backend.cpp index 6c68712477..789d3dfa63 100644 --- a/examples/gpt-2/main-backend.cpp +++ b/examples/gpt-2/main-backend.cpp @@ -11,6 +11,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_HSA +#include "ggml-hsa.h" +#endif + #include "common.h" #include "common-ggml.h" @@ -220,6 +224,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & } #endif +#ifdef GGML_USE_HSA + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using HSA backend\n", __func__); + model.backend = ggml_backend_hsa_init(0); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_hsa_init() failed\n", __func__); + } + } +#endif + if (!model.backend) { // fallback to CPU backend fprintf(stderr, "%s: using CPU backend\n", __func__); @@ -231,6 +245,12 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } + ggml_backend_dev_t device = ggml_backend_get_device(model.backend); + size_t total_memory = 0; + size_t free_memory = 0; + ggml_backend_dev_memory(device, &free_memory, &total_memory); + fprintf(stderr, "%s: free memory %zu, total memory %zu\n", __func__, free_memory, 
total_memory); + // create the tensors for the model { const auto & hparams = model.hparams; diff --git a/examples/gpt-2/main-sched.cpp b/examples/gpt-2/main-sched.cpp index 7d789d68ac..b9ad9d003a 100644 --- a/examples/gpt-2/main-sched.cpp +++ b/examples/gpt-2/main-sched.cpp @@ -15,6 +15,11 @@ #include "ggml-blas.h" #endif +#ifdef GGML_USE_HSA +#include "ggml-hsa.h" +#endif + + #include "common.h" #include "common-ggml.h" @@ -145,6 +150,18 @@ void init_backends(gpt2_model & model, const gpt_params & params) { } #endif +#ifdef GGML_USE_HSA + if (params.n_gpu_layers > 0) { + fprintf(stderr, "%s: using HSA backend\n", __func__); + ggml_backend_t hsa_backend = ggml_backend_hsa_init(0); + if (!hsa_backend) { + fprintf(stderr, "%s: ggml_backend_hsa_init() failed\n", __func__); + } else { + model.backends.push_back(hsa_backend); + } + } +#endif + // always add the CPU backend as a fallback ggml_backend_t cpu_backend = ggml_backend_cpu_init(); ggml_backend_cpu_set_n_threads(cpu_backend, params.n_threads); diff --git a/include/ggml-hsa.h b/include/ggml-hsa.h new file mode 100644 index 0000000000..a27d8f3231 --- /dev/null +++ b/include/ggml-hsa.h @@ -0,0 +1,38 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_HSA_NAME "HSA" +#define GGML_HSA_MAX_DEVICES 16 + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_hsa_init(int32_t device); + +GGML_BACKEND_API bool ggml_backend_is_hsa(ggml_backend_t backend); + +// device buffer +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_hsa_buffer_type(int32_t device); + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_hsa_split_buffer_type(int32_t main_device, const float * tensor_split); + +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_hsa_host_buffer_type(void); 
+ +GGML_BACKEND_API int32_t ggml_backend_hsa_get_device_count(void); +GGML_BACKEND_API void ggml_backend_hsa_get_device_description(int32_t device, char * description, size_t description_size); +GGML_BACKEND_API void ggml_backend_hsa_get_device_memory(int32_t device, size_t * free, size_t * total); + +GGML_BACKEND_API bool ggml_backend_hsa_register_host_buffer(void * buffer, size_t size); +GGML_BACKEND_API void ggml_backend_hsa_unregister_host_buffer(void * buffer); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hsa_reg(void); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 265023733e..2b86f0630f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -449,6 +449,7 @@ ggml_add_backend(BLAS) ggml_add_backend(CANN) ggml_add_backend(CUDA) ggml_add_backend(HIP) +ggml_add_backend(HSA) ggml_add_backend(METAL) ggml_add_backend(MUSA) ggml_add_backend(RPC) diff --git a/src/ggml-backend-reg.cpp b/src/ggml-backend-reg.cpp index 311fa5fe36..17848b9d7d 100644 --- a/src/ggml-backend-reg.cpp +++ b/src/ggml-backend-reg.cpp @@ -30,6 +30,10 @@ #include "ggml-cpu.h" #endif +#ifdef GGML_USE_HSA +#include "ggml-hsa.h" +#endif + #ifdef GGML_USE_CUDA #include "ggml-cuda.h" #endif @@ -109,6 +113,9 @@ struct ggml_backend_registry { std::vector devices; ggml_backend_registry() { +#ifdef GGML_USE_HSA + register_backend(ggml_backend_hsa_reg()); +#endif #ifdef GGML_USE_CUDA register_backend(ggml_backend_cuda_reg()); #endif @@ -549,6 +556,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("cann", silent, dir_path); ggml_backend_load_best("cuda", silent, dir_path); ggml_backend_load_best("hip", silent, dir_path); + ggml_backend_load_best("hsa", silent, dir_path); ggml_backend_load_best("metal", silent, dir_path); ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path); diff --git a/src/ggml-hsa/.clang-format b/src/ggml-hsa/.clang-format 
new file mode 100644 index 0000000000..3df317bfef --- /dev/null +++ b/src/ggml-hsa/.clang-format @@ -0,0 +1,275 @@ +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + +--- +BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseArrows: false + AlignCaseColons: false +AlignConsecutiveTableGenBreakingDAGArgColons: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveTableGenCondOperatorColons: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveTableGenDefinitionColons: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowBreakBeforeNoexceptSpecifier: Never +AllowShortBlocksOnASingleLine: Never 
+AllowShortCaseExpressionOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: false +AllowShortCompoundRequirementOnASingleLine: true +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: OnePerLine +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: true + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakAdjacentStringLiterals: true +BreakAfterAttributes: Leave +BreakAfterJavaFieldAnnotations: false +BreakAfterReturnType: None +BreakArrays: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeConceptDeclarations: Always +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: AfterColon +BreakFunctionDefinitionParameters: false +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +BreakTemplateDeclarations: Yes +ColumnLimit: 100 +CommentPragmas: "^ IWYU pragma:" +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - 
Regex: ^"(llvm|llvm-c|clang|clang-c)/ + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: ^(<|"(gtest|gmock|isl|json)/) + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: .* + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: (Test)?$ +IncludeIsMainSourceRegex: "" +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: true +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: None +IndentRequiresClause: true +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLines: + AtEndOfFile: false + AtStartOfBlock: true + AtStartOfFile: true +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: "" +MacroBlockEnd: "" +MainIncludeChar: Quote +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PPIndentWidth: -1 +PackConstructorInitializers: NextLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakScopeResolution: 500 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Middle +QualifierAlignment: Leave +ReferenceAlignment: Middle +ReflowComments: true +RemoveBracesLLVM: false +RemoveParentheses: Leave +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SkipMacroDefinitionBody: false +SortIncludes: 
CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Both +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeJsonColon: false +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDeclarationName: false + AfterFunctionDefinitionName: false + AfterIfMacros: true + AfterOverloadedOperator: true + AfterPlacementOperator: true + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParens: Custom +SpacesInParensOptions: + ExceptDoubleParentheses: false + InConditionalStatements: false + InCStyleCasts: false + InEmptyParentheses: false + Other: false +SpacesInSquareBrackets: false +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +TableGenBreakInsideDAGArg: DontBreak +UseTab: Never +VerilogBreakBetweenInstancePorts: true +WhitespaceSensitiveMacros: + - BOOST_PP_STRINGIZE + - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE +Language: Cpp diff --git a/src/ggml-hsa/AGENTS.md b/src/ggml-hsa/AGENTS.md new file mode 100644 index 0000000000..47181c8172 --- /dev/null +++ b/src/ggml-hsa/AGENTS.md @@ -0,0 +1,427 @@ +# GGML HSA backend AGENTS.md - AI Agent Guidelines for ggml-hsa + +This document provides guidance for AI agents working on the ggml-hsa codebase. 
+ +## Project Overview + +The ggml-hsa backend enables GGML tensor operations to run on AMD XDNA NPUs (AI Engines). It supports: + +- **aie2** architecture (Phoenix, Hawk Point) +- **aie2p** architecture (Strix Halo, Krackan) + +The backend uses a multi-backend kernel compilation system with per-operation dispatch. Currently supported backends: + +- **IRON** (MLIR-AIE framework) - Optimized AIE kernels + +The system supports both JIT and AOT compilation. + +## Codebase Structure + +``` +src/ggml-hsa/ +├── ggml-hsa.cpp # Backend implementation (HSA runtime integration) +├── common.hpp # Common utilities and type definitions +├── host-ops.cpp/hpp # Host-side operation implementations +├── kernel-discovery.cpp/hpp # Runtime kernel discovery and loading +├── aie-kernel.cpp/hpp # AIE kernel abstraction layer +├── aie-kernel-compiler.cpp/hpp # JIT compilation interface +├── type-traits.hpp # GGML type to C++ type mapping +├── kernels/ # AIE kernel implementations (two-layer architecture) +│ ├── __init__.py # Package exports (ggml_compile_op, TensorDesc) +│ ├── build.py # Kernel compilation orchestrator +│ ├── build_iron.py # IRON backend compiler +│ ├── kernel.py # Core types: Backend enum, Kernel, KernelSpec +│ ├── tensor_desc.py # Tensor descriptor dataclass +│ ├── binary_ops.py # Top-level GGML binary op dispatch +│ ├── unary_ops.py # Top-level GGML unary op dispatch +│ ├── scale.py # Top-level scale op dispatch +│ ├── soft_max.py # Top-level softmax op dispatch +│ ├── clamp.py # Top-level clamp op dispatch +│ ├── mul_mat.py # Top-level matrix multiply dispatch +│ ├── argmax.py # Top-level argmax op dispatch +│ ├── count_equal.py # Top-level count_equal op dispatch +│ ├── cross_entropy_loss.py # Top-level cross entropy loss op dispatch +│ └── iron/ # IRON kernel implementations +│ ├── __init__.py # Subpackage init +│ ├── utils.py # Shared utilities (alignment, device mapping) +│ ├── binary_ops.py/cc # Binary ops IRON design + AIE core function +│ ├── unary_ops.py/cc # 
Unary ops IRON design + AIE core function +│ ├── scale.py/cc # Scale IRON design + AIE core function +│ ├── softmax.py/cc # Softmax IRON design + AIE core function +│ ├── clamp.py/cc # Clamp IRON design + AIE core function +│ ├── argmax.py/cc # Argmax IRON design + AIE core function +│ ├── count_equal.py/cc # Count equal IRON design + AIE core function +│ ├── cross_entropy_loss.py/cc # Cross entropy loss IRON design + AIE core function +│ ├── gemm.py # Matrix multiplication IRON design +│ ├── ggml-aie.hpp # Common AIE type definitions +│ ├── aie_kernel_utils.h # AIE kernel utility macros +│ ├── aie_kernel_math.h # AIE math utility functions (vec_exp) +│ ├── aie2/ # aie2-specific core functions (mm.cc, zero.cc) +│ └── aie2p/ # aie2p-specific core functions (mm.cc, zero.cc) +└── cmake/ # CMake utilities +``` + +### Two-Layer Dispatch Architecture + +The kernel build system uses a two-layer dispatch architecture that separates +static operation mapping from runtime backend selection: + +#### Layer 1: Static Mapping (Kernel) + +The `_op_to_kernel_map` in `build.py` maps GGML operation names to `Kernel` objects: + +```python +from kernel import Kernel + +_op_to_kernel_map = { + "ADD": Kernel("ggml_op_add", "binary_ops.py"), + "SCALE": Kernel("ggml_op_scale", "scale.py"), +} +``` + +The `Kernel` dataclass identifies: + +- `name`: The dispatch function name (e.g., `"ggml_op_add"`) +- `source_file`: The Python module containing the dispatch function + +#### Layer 2: Runtime Dispatch (KernelSpec) + +Dispatch functions examine tensor parameters and return a `KernelSpec`: + +```python +from kernel import Backend, KernelSpec +from .iron.scale import scale + +def ggml_op_scale(arch, input_tensors, output_tensor, op_params) -> KernelSpec: + return KernelSpec( + backend=Backend.IRON, + op_name="GGML_OP_SCALE", + arch=arch, + input_tensors=input_tensors, + output_tensor=output_tensor, + op_params=op_params, + function=scale, + ) +``` + +The `KernelSpec` specifies: + +- `backend`: 
Which compilation backend to use (`Backend.IRON`) +- `op_name`: Name of the operation (e.g., `"GGML_OP_SCALE"`) +- `arch`: Target architecture string (`"aie2"` or `"aie2p"`) +- `input_tensors`: List of input tensor descriptors +- `output_tensor`: Output tensor descriptor +- `op_params`: Operation-specific parameters as a bytearray +- `function`: The callable that generates backend-specific IR + +This enables per-invocation backend selection based on tensor shapes, dtypes, +or other runtime parameters. + +### Compilation Pipeline + +The compilation flow in `ggml_compile_op`: + +1. Look up `Kernel` from `_op_to_kernel_map` +2. Dynamically import the dispatch module +3. Call dispatch function to get `KernelSpec` (includes all tensor/op context) +4. Look up compiler function via `get_compiler(backend)` +5. Invoke the backend-specific compiler + +``` +ggml_compile_op("SCALE", ...) + └─> get_kernel("SCALE") -> Kernel("ggml_op_scale", "scale.py") + └─> import_from_path("ggml_op_scale", "scale.py") + └─> ggml_op_scale(...) -> KernelSpec(backend=IRON, function=scale) + └─> get_compiler(Backend.IRON) -> compile_iron_kernel + └─> compile_iron_kernel(kernel_spec, ...) 
+``` + +### Backend Compilers + +Each backend has a dedicated compiler module: + +- **IRON** (`build_iron.py`): Compiles IRON Python designs to PDI/instructions + - Calls the `KernelSpec.function` to generate an MLIR module + - Compiles any C++ core functions to object files + - Produces final `.pdi` and `_insts.bin` files + +Compilers are registered in `build.py`: + +```python +from kernel import Backend +from build_iron import compile_iron_kernel + +_compilers = { + Backend.IRON: compile_iron_kernel, +} +``` + +### IRON Kernel Implementations + +IRON kernels (`kernels/iron/*.py`) define: + +- Data movement via ObjectFifos (input/output streaming) +- Worker placement on AIE tiles +- Runtime sequences for DMA transfers +- External function declarations for C++ core functions + +These are paired with C++ core functions (`kernels/iron/*.cc`) that implement +the actual vectorized computations using the AIE API. + +### Broadcasting Support + +Binary operations (`ADD`, `SUB`, `MUL`, `DIV`) support multi-dimensional broadcasting +following GGML semantics where `src1` can be repeated to fill `dst`: + +- **Validation**: `dst->ne[i] % src1->ne[i] == 0` for all dimensions (per `ggml_can_repeat`) +- **Implementation**: The broadcast kernel receives full `src1` buffer and shape tuples, + then computes per-element `src1` indices via 4D coordinate decomposition and modulo + +Key data structures in `binary_ops.py`: + +```python +@dataclass(frozen=True) +class BroadcastFunctionSpec: + external_function: ExternalFunction + num_elements_out: int + num_elements_src1: int + src1_ne: tuple # (ne0, ne1, ne2, ne3) - src1 shape + dst_ne: tuple # (ne0, ne1, ne2, ne3) - dst shape +``` + +The C++ kernel computes broadcast indices using 32-bit arithmetic only (AIE cores lack +64-bit division runtime): + +```cpp +// Decompose global index g into 4D dst coordinates +int32_t i0 = g % dst_ne0; +int32_t i1 = (g / d1) % dst_ne1; +int32_t i2 = (g / d2) % dst_ne2; +int32_t i3 = g / (d2 * dst_ne2); + 
+// Apply broadcast modulo to get src1 coordinates +int32_t j0 = i0 % src1_ne0; +int32_t j1 = i1 % src1_ne1; +int32_t j2 = i2 % src1_ne2; +int32_t j3 = i3 % src1_ne3; + +// Compute linear src1 index +int32_t idx_src1 = j0 + j1 * s1 + j2 * s2 + j3 * s3; +``` + +## Kernel Development Pattern + +Each kernel consists of three files across two layers: + +### 1. Dispatch Function (e.g., `kernels/unary_ops.py`) + +Returns a `KernelSpec` specifying backend, function, and tensor context: + +- Imports the kernel function from the appropriate backend subpackage +- Provides the standard GGML dispatch signature +- Returns `KernelSpec` with all fields: `backend`, `op_name`, `arch`, `input_tensors`, `output_tensor`, `op_params`, `function` +- May use `functools.partial` to bind operation-specific parameters + +### 2. IRON Design (e.g., `kernels/iron/unary_ops.py`) + +Defines the IRON program structure: + +- Data movement via ObjectFifos (input/output streaming) +- Worker placement on AIE tiles +- Runtime sequences for DMA transfers +- External function declarations for C++ core functions +- Tiling and alignment calculations + +### 3. C++ Core Function (e.g., `kernels/iron/unary_ops.cc`) + +Implements the core computation using the AIE API: + +- Uses `#ifdef GGML_OP_<NAME>` guards for selective compilation +- Uses `INPUT_DTYPE` and `OUTPUT_DTYPE` macros for type flexibility +- Includes `<aie_api/aie.hpp>` for AIE vector intrinsics +- Functions follow naming convention: `ggml_op_<name>` +- Uses `extern "C"` linkage for IRON integration + +## Adding a New Kernel + +1. **Register the operation** in `kernels/build.py`: + + ```python + _op_to_kernel_map = { + "NEW_OP": Kernel("ggml_op_new_op", "new_op.py"), + } + ``` + +2. 
**Create the dispatch function** (`kernels/new_op.py`): + + ```python + """Top-level entry point for GGML_OP_NEW_OP.""" + from .iron.new_op import new_op + from .kernel import Backend, KernelSpec + + def ggml_op_new_op( + arch: str, input_tensors: list, output_tensor, op_params: bytearray + ) -> KernelSpec: + """GGML_OP_NEW_OP implementation.""" + return KernelSpec( + backend=Backend.IRON, + op_name="GGML_OP_NEW_OP", + arch=arch, + input_tensors=input_tensors, + output_tensor=output_tensor, + op_params=op_params, + function=new_op, + ) + ``` + +3. **Create the IRON design** (`kernels/iron/new_op.py`): + - Import from `aie.iron` (ObjectFifo, Program, Runtime, Worker, etc.) + - Import utilities from `.utils` (arch_to_device, align_to_arch, etc.) + - Define the data flow and compute structure + - Create external function specs for the C++ core function + - Function signature: `def new_op(arch, input_tensors, output_tensor, op_params)` + +4. **Create the C++ core function** (`kernels/iron/new_op.cc`): + - Use compile guards: `#ifdef GGML_OP_NEW_OP` + - Implement: `void ggml_op_new_op(const INPUT_DTYPE*, OUTPUT_DTYPE*, int32_t N)` + - Use `extern "C"` linkage + - Include `ggml-aie.hpp` for common type definitions + +5. **Register the file with CMake** + - Add the files in the `kernels/CMakeLists.txt` + +6. (optional) **Add backend support** in `ggml-hsa.cpp`: + - Add to `ggml_hsa_op_supports()` for operation support check + - Add case in `ggml_hsa_compute_forward()` for dispatch + +## Adding a New Compilation Backend + +To add a new backend (e.g., Triton): + +1. **Add to the Backend enum** in `kernels/kernel.py`: + + ```python + class Backend(Enum): + IRON = auto() + TRITON = auto() # New backend + ``` + +2. 
**Create the backend compiler** (`kernels/build_triton.py`): + + ```python + def compile_triton_kernel( + kernel_spec: KernelSpec, + arch: str, + input_tensors: list[TensorDesc], + output_tensor: TensorDesc, + op_params: bytearray, + work_dir: Path, + exported_name: str, + output_directory: Path, + logger: logging.Logger, + verbose: bool, + ) -> None: + # Call kernel_spec.function to generate Triton IR + # Compile to PDI and instructions + pass + ``` + +3. **Register the compiler** in `kernels/build.py`: + + ```python + from build_triton import compile_triton_kernel + + _compilers = { + Backend.IRON: compile_iron_kernel, + Backend.TRITON: compile_triton_kernel, + } + ``` + +4. **Update dispatch functions** to return the new backend when appropriate: + + ```python + def ggml_op_new_op(...) -> KernelSpec: + if some_condition: + return KernelSpec(backend=Backend.TRITON, function=triton_new_op) + return KernelSpec(backend=Backend.IRON, function=iron_new_op) + ``` + +## Code Conventions + +### C++ (Host Code) + +- Use `std::` prefix for standard library types +- Use `GGML_ASSERT()` / `GGML_ABORT()` for error handling +- Check HSA calls with `GGML_HSA_CHECK()` macro +- Follow existing formatting (see `.clang-format`) + +### C++ (Kernel Code) + +- Include `ggml-aie.hpp` for common type definitions +- Use `event0()` / `event1()` for profiling regions +- Prefer vectorized operations from `aie_api/aie.hpp` +- Keep kernels simple and focused on compute +- Follow existing formatting (see `.clang-format`) + +### Python + +- Follow existing patterns in `iron/unary_ops.py` / `iron/binary_ops.py` +- Use `CoreFunctionSpec` dataclass for external function specifications +- Import utilities from `iron/utils.py`: + - `arch_to_device()` - Convert arch string to IRON device object + - `arch_aligned_num_elements()` - Align tensor sizes to architecture requirements + - `align_to_arch()` - Align arbitrary sizes to byte boundaries + - `max_tile_size()` - Calculate optimal tile size for 
vectorization + - `suppress_import_pyxrt_msg()` - Suppress noisy pyxrt import messages +- Top-level wrappers import from the `.iron` subpackage +- Follow existing formatting using `black` +- Add module docstrings to all Python files + +## Data Types + +Supported GGML types and their mappings: + +| GGML Type | Native Support | Notes | +| ----------- | --------------- | ------- | +| `GGML_TYPE_I8` | Yes | Native AIE type | +| `GGML_TYPE_I16` | Yes | Native AIE type | +| `GGML_TYPE_I32` | Yes | Native AIE type | +| `GGML_TYPE_BF16` | Yes | Native AIE type | +| `GGML_TYPE_F16` | Via BF16 | Converted internally | +| `GGML_TYPE_F32` | Emulated | Slower than native | + +## Environment Setup + +```bash +# Set up Python environment with IRON dependencies +source ./env_setup.sh +# Or manually: +python3 -m pip install -r requirements.txt +``` + +## Testing + +- Ensure that an IRON environment is present and active +- Build with `GGML_HSA=ON` and optionally `GGML_HSA_JIT_COMPILE=ON` +- Test files are in `tests` and `tests/ggml-hsa/` +- Ensure kernels work for both `aie2` and `aie2p` architectures +- A specific operation can be tested using `test-backend-ops -o OP` +- **Success:** Look for `N/N tests passed`, where `N` is non-zero. +- **Failure:** Look for `0/0 tests passed` or `Could not create kernel for tensor`. + +## Common Pitfalls + +1. **Tensor alignment**: AIE requires specific alignment (4-byte boundaries) +2. **Tile sizes**: Must evenly divide the total element count +3. **Type casting**: Be explicit with casts in kernel code +4. **Contiguous tensors**: Many operations require contiguous memory layout +5. 
**op_params encoding**: Non-zero op_params are encoded in kernel names + +## Useful Environment Variables + +| Variable | Purpose | +| ---------- | --------- | +| `GGML_HSA_ENABLE_LOG` | Enable debug logging | +| `GGML_HSA_KERNEL_DIR` | Precompiled kernel directory | +| `GGML_HSA_KERNEL_CACHE_DIR` | JIT cache directory | +| `GGML_HSA_JIT_VERBOSE` | Verbose JIT output | diff --git a/src/ggml-hsa/CMakeLists.txt b/src/ggml-hsa/CMakeLists.txt new file mode 100644 index 0000000000..6716226e87 --- /dev/null +++ b/src/ggml-hsa/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved. + +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake) + +if (NOT EXISTS $ENV{ROCM_PATH}) + if (NOT EXISTS /opt/rocm) + set(ROCM_PATH "/usr") + else() + set(ROCM_PATH /opt/rocm) + endif() +else() + set(ROCM_PATH $ENV{ROCM_PATH}) +endif() + +list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) + +# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. +if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) +endif() + +find_package(hsa-runtime64 1.0 REQUIRED) + +option(GGML_HSA_JIT_COMPILE "ggml-hsa: enable JIT compilation of kernels" ON) + +set(GGML_HEADERS_HSA ../../include/ggml-hsa.h) +set(GGML_SOURCES_HSA + aie-kernel.cpp + ggml-hsa.cpp + host-ops.cpp + kernel-discovery.cpp + ) + +if (GGML_HSA_JIT_COMPILE) + include(FetchContent) + + find_package(Python3 COMPONENTS Interpreter Development REQUIRED) + + FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG v3.0 + ) + + set(PYBIND11_FINDPYTHON OFF) + FetchContent_MakeAvailable(pybind11) + + set(GGML_SOURCES_HSA aie-kernel-compiler.cpp ${GGML_SOURCES_HSA}) +endif () + +ggml_add_backend_library(ggml-hsa + ${GGML_HEADERS_HSA} + ${GGML_SOURCES_HSA} + ) + +target_link_libraries(ggml-hsa PRIVATE ggml-base hsa-runtime64::hsa-runtime64) +target_include_directories(ggml-hsa PRIVATE . ..) 
+ +target_sources(ggml-hsa + PRIVATE + aie-kernel.hpp + common.hpp + host-ops.hpp + kernel-discovery.hpp + type-traits.hpp + ) + +if (GGML_HSA_JIT_COMPILE) + target_sources(ggml-hsa PRIVATE aie-kernel-compiler.hpp) + target_compile_definitions(ggml-hsa PRIVATE -DGGML_HSA_JIT_COMPILE) + target_link_libraries(ggml-hsa PRIVATE pybind11::embed) + + add_subdirectory(kernels) +endif () + +add_compile_definitions(GGML_USE_HSA) diff --git a/src/ggml-hsa/README.md b/src/ggml-hsa/README.md new file mode 100644 index 0000000000..b63150df9a --- /dev/null +++ b/src/ggml-hsa/README.md @@ -0,0 +1,155 @@ +# GGML HSA Backend + +The GGML HSA (`ggml-hsa`) backend enables GGML tensor operations to run on AMD XDNA NPUs (AI Engines). + +## Supported Devices + +| Architecture | NPU Generation | Example Platforms | +|--------------|----------------|-------------------------| +| `aie2` | [AMD XDNA] | Phoenix, Hawk Point | +| `aie2p` | [AMD XDNA2] | Strix Halo, Krackan | + +[AMD XDNA]: https://www.amd.com/en/technologies/xdna.html +[AMD XDNA2]: https://www.amd.com/en/technologies/xdna.html#xdna2 + +## Supported Operations + +| Category | Operations | +|-----------|----------------------------------------------------------------| +| Binary | `ADD`, `SUB`, `MUL`, `DIV` (with multi-dimensional broadcast) | +| Unary | `SQR`, `SQRT`, `LOG`, `SIN`, `COS`, `EXP` | +| Unary | `ABS`, `SGN`, `NEG`, `STEP`, `FLOOR`, `CEIL`, `ROUND`, `TRUNC` | +| Unary | `RELU`, `TANH`, `ELU`, `SIGMOID`, `SILU` | +| Unary | `GELU`, `GELU_QUICK`, `GELU_ERF`, `HARDSWISH`, `HARDSIGMOID` | +| Unary | `XIELU` | +| Matrix | `MUL_MAT` | +| Reduction | `ARGMAX`, `COUNT_EQUAL` | +| Loss | `CROSS_ENTROPY_LOSS` | +| Other | `SCALE`, `SOFT_MAX`, `CLAMP` | + +### Broadcasting + +Binary operations support GGML-style broadcasting where `src1` can be repeated to match `dst`: +- `dst->ne[i] % src1->ne[i] == 0` must hold for all dimensions +- Examples: `(10,5,4,3) + (10,5,4,3)` (element-wise), `(20,5,4,3) + (10,5,4,3)` (broadcast in 
dim0) +- Multi-dimensional broadcasting: `(20,10,8,6) + (10,5,4,3)` (broadcast in all dims) + +## Supported Data Types + +| Type | Support | +|------------------|----------------------------------------| +| `GGML_TYPE_I8` | Native `aie2` / `aie2p` datatype | +| `GGML_TYPE_I16` | Native `aie2` / `aie2p` datatype | +| `GGML_TYPE_I32` | Native `aie2` / `aie2p` datatype | +| `GGML_TYPE_BF16` | Native `aie2` / `aie2p` datatype | +| `GGML_TYPE_F16` | Supported via conversion to `BF16` | +| `GGML_TYPE_F32` | Emulated (slower than native types) | + +## Prerequisites + +### Tested Configurations + +| Component | Version | +|-------------|----------------------------------------------------------------------| +| OS | [Ubuntu 24.04.2], [Ubuntu 25.10] | +| ROCm | [7.2.0][ROCm 7.2.0] | +| XDNA Driver | [1.6][XDNA Driver 1.6] | +| MLIR-AIE | [1.2.1][MLIR-AIE 1.2.1] | + +[Ubuntu 24.04.2]: https://releases.ubuntu.com/noble/ +[Ubuntu 25.10]: https://releases.ubuntu.com/questing/ +[ROCm 7.2.0]: https://rocm.docs.amd.com/en/docs-7.2.0/ +[XDNA Driver 1.6]: https://github.com/amd/xdna-driver/tree/1.6 +[MLIR-AIE 1.2.1]: https://github.com/Xilinx/mlir-aie/tree/v1.2.1 + +### ROCm + +`ggml-hsa` requires [ROCm](https://github.com/ROCm/rocm-systems) 7.1.1 or newer. See the [installation instructions](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html). + +Due to ongoing NPU support work in [ROCR](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocr-runtime), it is recommended to compile the latest ROCR from source. Commit [`863ffc1`](https://github.com/ROCm/rocm-systems/commit/863ffc1c07cf56567101fff2c39b66efb4cdb579) is confirmed working. + +### AMD XDNA Driver + +`ggml-hsa` depends on the [AMD XDNA Driver](https://github.com/amd/xdna-driver). 
Installation instructions: + +- Via IRON: [build_drivers.sh](https://github.com/Xilinx/mlir-aie/blob/main/utils/build_drivers.sh) +- Direct: [xdna-driver README](https://github.com/amd/xdna-driver#linux-compilation-and-installation) + +### MLIR-AIE (IRON) + +`ggml-hsa` supports JIT compilation via the [IRON framework](https://github.com/Xilinx/mlir-aie). + +Install dependencies: + +```bash +python3 -m pip install -r src/ggml-hsa/requirements.txt +``` + +Or use the setup script to create a virtual environment: + +```bash +source src/ggml-hsa/env_setup.sh +``` + +> **Note:** IRON environments consume considerable storage. For pre-generated kernels, set `GGML_HSA_KERNEL_DIR` and disable JIT at compile time. + +## Building + +### Basic HSA Build + +```bash +cmake -S . -B build \ + -DGGML_HSA=ON \ + -DGGML_HSA_JIT_COMPILE=ON \ + -Dhsa-runtime64_DIR=/path/to/rocm/lib/cmake/hsa-runtime64 \ + -DCMAKE_BUILD_TYPE=Release + +cmake --build build --config Release -j +``` + +### Combined HSA + HIP Build + +```bash +HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ +cmake -S . -B build \ + -DGGML_HSA=ON \ + -DGGML_HSA_JIT_COMPILE=ON \ + -Dhsa-runtime64_DIR=/path/to/rocm/lib/cmake/hsa-runtime64 \ + -DGGML_HIP=ON \ + -DGPU_TARGETS=gfx1102 \ + -DCMAKE_BUILD_TYPE=Release + +cmake --build build --config Release -j +``` + +## JIT Compilation + +JIT compilation generates kernels on-the-fly. Precompiled kernels in `GGML_HSA_KERNEL_DIR` take precedence. + +**Cache Location** (in order of precedence): + +1. `GGML_HSA_KERNEL_CACHE_DIR` +2. `${XDG_CACHE_HOME}/.ggml/` +3. `$HOME/.cache/ggml` +4. `/tmp/ggml/ggml-hsa` + +> **Warning:** Setting `GGML_HSA_KERNEL_CACHE_CLEAR=1` deletes all files in the cache directory. 
+ +## Reference + +### CMake Options + +| Option | Description | +|------------------------|--------------------------------------------------------------------| +| `GGML_HSA` | Enable HSA backend | +| `GGML_HSA_JIT_COMPILE` | Enable JIT compilation (requires IRON environment) | + +### Environment Variables + +| Variable | Description | +|-------------------------------|-----------------------------------------------------------------| +| `GGML_HSA_ENABLE_LOG` | Enable internal logging (`1`, `true`, or `on`) | +| `GGML_HSA_KERNEL_DIR` | Precompiled kernel directory path | +| `GGML_HSA_KERNEL_CACHE_DIR` | JIT cache directory | +| `GGML_HSA_KERNEL_CACHE_CLEAR` | Clear JIT cache on startup (`1`, `true`, or `on`) | +| `GGML_HSA_JIT_VERBOSE` | Verbose JIT output (`1`, `true`, or `on`) | diff --git a/src/ggml-hsa/aie-kernel-compiler.cpp b/src/ggml-hsa/aie-kernel-compiler.cpp new file mode 100644 index 0000000000..196ef17b08 --- /dev/null +++ b/src/ggml-hsa/aie-kernel-compiler.cpp @@ -0,0 +1,133 @@ +// Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All Rights Reserved. + +#include "ggml-hsa/aie-kernel-compiler.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include + +#include + +#include "ggml-hsa/common.hpp" +#include "ggml-impl.h" + +namespace fs = std::filesystem; +namespace py = pybind11; + +/// @brief If @c true, JIT compilation will print verbose output. +static const bool verbose_compilation = [] { + const char * env = std::getenv("GGML_HSA_JIT_VERBOSE"); + return env != nullptr && ggml_hsa_string_to_bool(env); +}(); + +/// @brief Path to the shared library directory. 
+static const std::filesystem::path ggml_hsa_library_dir = [] { + // retrieve the shared library path + Dl_info info; + if (dladdr(static_cast(&ggml_hsa_library_dir), &info) == 0) { + GGML_ABORT("Could not retrieve library directory\n"); + } + return std::filesystem::path{info.dli_fname}.parent_path(); +}(); + +/// @brief Path to AIE kernels. +static const fs::path kernel_path = ggml_hsa_library_dir / "kernels"; + +/// @brief Python interpreter initialization guard. +static py::scoped_interpreter python_interpreter_guard = [] { + py::scoped_interpreter guard; + auto sys = py::module_::import("sys"); + sys.attr("path").attr("append")(kernel_path.string()); + return guard; +}(); + +/** + * @brief Creates a @p py::tuple from the tensor shape. + */ +static py::tuple ggml_hsa_tensor_ne_as_pytuple(const ggml_tensor & tensor) { + auto ne = py::tuple(GGML_MAX_DIMS); + for (auto i = 0; i < GGML_MAX_DIMS; ++i) { + ne[i] = py::int_(tensor.ne[i]); + } + return ne; +} + +/** + * @brief Creates a @p py::tuple from the tensor strides. 
+ */ +static py::tuple ggml_hsa_tensor_nb_as_pytuple(const ggml_tensor & tensor) { + auto nb = py::tuple(GGML_MAX_DIMS); + for (auto i = 0; i < GGML_MAX_DIMS; ++i) { + nb[i] = py::int_(tensor.nb[i]); + } + return nb; +} + +ggml_status ggml_hsa_compile_aie_kernel(const ggml_hsa_device_info::device_info & dev_info, + const ggml_tensor & tensor, + const std::string & op_name, + const std::string & kernel_name, + const std::filesystem::path & output_path) { + using namespace py::literals; + + const auto output_directory = output_path / dev_info.name; + + try { + // convert a GGML tensor to input and output TensorDesc objects + auto tensor_desc_mod = py::module_::import("tensor_desc"); + auto create_tensor_desc = tensor_desc_mod.attr("ggml_tensor_to_tensordesc"); + const auto src_tensor_count = ggml_hsa_nsrcs(tensor); + auto input_tensors = py::list(src_tensor_count); + for (auto i = 0; i < src_tensor_count; ++i) { + const auto src_tensor = tensor.src[i]; + input_tensors[i] = + create_tensor_desc("dtype"_a = ggml_type_name(src_tensor->type), + "ne"_a = ggml_hsa_tensor_ne_as_pytuple(*src_tensor), + "nb"_a = ggml_hsa_tensor_nb_as_pytuple(*src_tensor), + "contiguous"_a = ggml_is_contiguous(src_tensor)); + } + auto output_tensor = create_tensor_desc("dtype"_a = ggml_type_name(tensor.type), + "ne"_a = ggml_hsa_tensor_ne_as_pytuple(tensor), + "nb"_a = ggml_hsa_tensor_nb_as_pytuple(tensor), + "contiguous"_a = ggml_is_contiguous(&tensor)); + + auto op_params = py::bytearray(reinterpret_cast(tensor.op_params), + sizeof(tensor.op_params)); + + // compile the kernel + auto build_mod = py::module_::import("build"); + auto compile_kernel = build_mod.attr("ggml_compile_op"); + compile_kernel("op_name"_a = op_name, "arch"_a = dev_info.name, + "input_tensors"_a = std::move(input_tensors), + "output_tensor"_a = std::move(output_tensor), + "op_params"_a = std::move(op_params), "exported_name"_a = kernel_name, + "output_directory"_a = output_directory.string(), + "verbose"_a = 
verbose_compilation); + } catch (const py::error_already_set & ex) { + GGML_HSA_LOG_INFO("%s: failed to compile kernel %s for tensor \"%s\" (%s): %s", __func__, + kernel_name.c_str(), tensor.name, op_name.c_str(), ex.what()); + return GGML_STATUS_FAILED; + } + + GGML_HSA_LOG_INFO("%s: generated kernel %s in %s for tensor \"%s\" (%s)", __func__, + kernel_name.c_str(), output_directory.c_str(), tensor.name, op_name.c_str()); + + return GGML_STATUS_SUCCESS; +} + +ggml_status ggml_hsa_compile_aie_kernel(const ggml_hsa_device_info::device_info & dev_info, + const ggml_tensor & tensor, + const std::string & kernel_name, + const std::filesystem::path & output_path) { + return ggml_hsa_compile_aie_kernel(dev_info, tensor, ggml_op_desc(&tensor), kernel_name, + output_path); +} diff --git a/src/ggml-hsa/aie-kernel-compiler.hpp b/src/ggml-hsa/aie-kernel-compiler.hpp new file mode 100644 index 0000000000..bb40c106f8 --- /dev/null +++ b/src/ggml-hsa/aie-kernel-compiler.hpp @@ -0,0 +1,36 @@ +// Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All Rights Reserved. + +#pragma once + +#include + +#include "ggml-hsa/common.hpp" +#include "ggml.h" + +/** + * @brief Compiles an AIE kernel for the operation in @p tensor. + * + * @param[in] dev_info device information + * @param[in] tensor tensor to compile a kernel for + * @param[in] op_name operation name; overrides the @c ggml_op_desc of @p tensor + * @param[in] kernel_name kernel name + * @param[in] output_path directory to write kernel to + */ +ggml_status ggml_hsa_compile_aie_kernel(const ggml_hsa_device_info::device_info & dev_info, + const ggml_tensor & tensor, + const std::string & op_name, + const std::string & kernel_name, + const std::filesystem::path & output_path); + +/** + * @brief Compiles an AIE kernel for the operation in @p tensor. 
+ * + * @param[in] dev_info device information + * @param[in] tensor tensor to compile a kernel for + * @param[in] kernel_name kernel name + * @param[in] output_path directory to write kernel to + */ +ggml_status ggml_hsa_compile_aie_kernel(const ggml_hsa_device_info::device_info & dev_info, + const ggml_tensor & tensor, + const std::string & kernel_name, + const std::filesystem::path & output_path); diff --git a/src/ggml-hsa/aie-kernel.cpp b/src/ggml-hsa/aie-kernel.cpp new file mode 100644 index 0000000000..29e07b09d7 --- /dev/null +++ b/src/ggml-hsa/aie-kernel.cpp @@ -0,0 +1,108 @@ +// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved. + +#include "ggml-hsa/aie-kernel.hpp" + +#include +#include +#include + +#include "ggml-impl.h" + +/** + * @brief Dispatches a packet to an AIE agent queue. + * + * @todo @p ctx.dispatch_signal is not used yet. + * + * @param[in] ctx backend context + * @param[in] payload packet payload + * @param[in] payload_size payload size in dwords + */ +static void ggml_hsa_aie_dispatch_packet(ggml_backend_hsa_context & ctx, + hsa_amd_aie_ert_start_kernel_data_t * payload, + std::size_t payload_size) { + hsa_amd_aie_ert_packet_t pkt{}; + pkt.header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; + pkt.header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT; + pkt.state = HSA_AMD_AIE_ERT_STATE_NEW; + pkt.count = payload_size; + pkt.opcode = HSA_AMD_AIE_ERT_START_CU; + pkt.payload_data = reinterpret_cast(payload); + // TODO add pkt.completion_signal = ctx.dispatch_signal + + auto queue = ctx.queue; + + // Queue is full when (write_index - read_index) >= queue->size. Wait until there is space. 
+ const std::uint64_t wr_idx = hsa_queue_add_write_index_relaxed(queue, 1); + while (wr_idx - hsa_queue_load_read_index_scacquire(queue) >= queue->size) { + ggml_hsa_wait_dispatches(ctx); + } + + const std::uint64_t packet_id = wr_idx % queue->size; + *(static_cast(queue->base_address) + packet_id) = pkt; + + hsa_signal_store_screlease(queue->doorbell_signal, wr_idx); +} + +/** + * @brief Builds the command payload for this kernel and dispatches it to the AIE agent queue. + * + * Payload layout in dwords: transaction opcode (2, not counted in the packet size), instructions + * address and size (3), the address of every source tensor and of the destination (2 each), then + * the byte size of every source tensor and of the destination (1 each). + * + * The payload allocation is recorded in @c ctx.pending_payloads. + * + * @param[in] ctx backend context + * @param[in] src_tensors source tensors + * @param[in] num_src_tensors number of source tensors + * @param[out] dst_tensor destination tensor + * @return @c GGML_STATUS_ALLOC_FAILED if the payload cannot be allocated, @c GGML_STATUS_SUCCESS + * otherwise + */ +ggml_status ggml_hsa_aie_kernel::dispatch(ggml_backend_hsa_context & ctx, + ggml_tensor * src_tensors[], + std::size_t num_src_tensors, + ggml_tensor & dst_tensor) const { + const auto & dev_info = ggml_hsa_get_device_info(ctx.device); + const std::size_t packet_dwords = + 3 /* instructions */ + (num_src_tensors + 1) * 3 /* source and destination tensors */; + + // The payload holds the struct header (pdi_addr), the 2 uncounted transaction opcode dwords and + // packet_dwords dwords. With an 8-byte header and 32-bit dwords, a fixed 64-byte request only + // fits up to two source tensors, so size the allocation from the actual dword count and keep + // 64 bytes as the minimum. + const std::size_t payload_bytes = + sizeof(hsa_amd_aie_ert_start_kernel_data_t) + (packet_dwords + 2) * sizeof(std::uint32_t); + const std::size_t alloc_bytes = payload_bytes < 64 ? 64 : payload_bytes; + + void * ptr = nullptr; + if (auto status = hsa_amd_memory_pool_allocate(dev_info.kernarg_memory.memory_pool, + alloc_bytes, 0, &ptr); + status != HSA_STATUS_SUCCESS) { + GGML_HSA_LOG_ERROR("%s: failed to allocate hsa_queue packet storage (%s)", __func__, + ggml_hsa_get_status_string(status)); + return GGML_STATUS_ALLOC_FAILED; + } + ctx.pending_payloads.emplace_back(ptr); + + auto cmd_payload = static_cast(ptr); + + assert(pdi.data() != nullptr); + cmd_payload->pdi_addr = + const_cast(static_cast(pdi.data())); // PDI to use with this command + + // transaction opcode; not counted in packet_dwords (see assert below) + cmd_payload->data[0] = 0x3; + cmd_payload->data[1] = 0x0; + + std::size_t dword_idx = 2; + + // instructions; 3 dwords + assert(insts.data() != nullptr); + cmd_payload->data[dword_idx] = reinterpret_cast(insts.data()) & 0xFFFFFFFF; + cmd_payload->data[dword_idx + 1] = reinterpret_cast(insts.data()) >> 32; + cmd_payload->data[dword_idx + 2] = static_cast(insts.size()); + dword_idx += 3; + + // sources; 2 dwords each + for (std::size_t src_idx = 0; src_idx < num_src_tensors; ++src_idx, dword_idx += 2) { + assert(src_tensors[src_idx]->data != nullptr); + cmd_payload->data[dword_idx] = + reinterpret_cast(src_tensors[src_idx]->data) & 
0xFFFFFFFF; + cmd_payload->data[dword_idx + 1] = + reinterpret_cast(src_tensors[src_idx]->data) >> 32; + } + + // destination; 2 dwords + assert(dst_tensor.data != nullptr); + cmd_payload->data[dword_idx] = reinterpret_cast(dst_tensor.data) & 0xFFFFFFFF; + cmd_payload->data[dword_idx + 1] = reinterpret_cast(dst_tensor.data) >> 32; + dword_idx += 2; + + // sizes; 1 dword per tensor + for (std::size_t src_idx = 0; src_idx < num_src_tensors; ++src_idx, ++dword_idx) { + cmd_payload->data[dword_idx] = ggml_nbytes(src_tensors[src_idx]); + } + cmd_payload->data[dword_idx] = ggml_nbytes(&dst_tensor); + + assert(dword_idx == packet_dwords + 1); // 2 extra uncounted dwords (transaction opcode) + + ggml_hsa_aie_dispatch_packet(ctx, cmd_payload, packet_dwords); + + return GGML_STATUS_SUCCESS; +} diff --git a/src/ggml-hsa/aie-kernel.hpp b/src/ggml-hsa/aie-kernel.hpp new file mode 100644 index 0000000000..d53690f264 --- /dev/null +++ b/src/ggml-hsa/aie-kernel.hpp @@ -0,0 +1,64 @@ +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + +#pragma once + +#include "ggml-hsa/common.hpp" +#include "ggml.h" + +#include +#include + +/** + * @brief PDI buffer for AIE agent kernels. + */ +class ggml_hsa_pdi_buffer { + ggml_hsa_unique_ptr m_data; + + public: + constexpr ggml_hsa_pdi_buffer() = default; + explicit ggml_hsa_pdi_buffer(std::uint64_t * data) : m_data{data} {} + + std::uint64_t * data() { return m_data.get(); } + const std::uint64_t * data() const { return m_data.get(); } +}; + +/** + * @brief Instructions buffer for AIE agent kernels. 
+ */ +class ggml_hsa_insts_buffer { + ggml_hsa_unique_ptr m_data; + std::size_t m_size{}; + + public: + constexpr ggml_hsa_insts_buffer() = default; + ggml_hsa_insts_buffer(std::uint32_t * data, std::size_t size) : m_data{data}, m_size{size} {} + + ggml_hsa_insts_buffer(ggml_hsa_insts_buffer && other) : + m_data{std::exchange(other.m_data, nullptr)}, m_size{std::exchange(other.m_size, 0)} {} + + ~ggml_hsa_insts_buffer() = default; + + ggml_hsa_insts_buffer & operator=(ggml_hsa_insts_buffer && other) { + m_data = std::exchange(other.m_data, nullptr); + m_size = std::exchange(other.m_size, 0); + return *this; + } + + std::size_t size() const { return m_size; } + std::uint32_t * data() { return m_data.get(); } + const std::uint32_t * data() const { return m_data.get(); } +}; + +/** + * @brief Kernel for AIE agents. + */ +class ggml_hsa_aie_kernel : public ggml_hsa_kernel { + public: + ggml_hsa_pdi_buffer pdi; + ggml_hsa_insts_buffer insts; + + ggml_status dispatch(ggml_backend_hsa_context & ctx, + ggml_tensor * src_tensors[], + std::size_t num_src_tensors, + ggml_tensor & dst_tensor) const override; +}; diff --git a/src/ggml-hsa/cmake/ggml_hsa_utils.cmake b/src/ggml-hsa/cmake/ggml_hsa_utils.cmake new file mode 100644 index 0000000000..0d86e52675 --- /dev/null +++ b/src/ggml-hsa/cmake/ggml_hsa_utils.cmake @@ -0,0 +1,32 @@ +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ +# Creates a target with name TARGET_NAME which copies files to build directory +# +# Arguments: +# TARGET_NAME (string): target +# DESTINATION (string): destination directory +# FILES (string): files to copy +# +function(ggml_hsa_copy_files TARGET_NAME) + set(oneValueArgs DESTINATION) + set(multiValueArgs FILES) + + cmake_parse_arguments(PARSE_ARGV 0 arg + "" "${oneValueArgs}" "${multiValueArgs}") + + foreach(FILE IN LISTS arg_FILES) + get_filename_component(FILE_NAME "${FILE}" NAME) + set(DESTINATION_FILE "${arg_DESTINATION}/${FILE_NAME}") + + add_custom_command( + OUTPUT ${DESTINATION_FILE} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${FILE} ${DESTINATION_FILE} + DEPENDS ${FILE} + COMMENT "Copying ${FILE} to build directory ${arg_DESTINATION}" + ) + + list(APPEND COPIED_FILES "${DESTINATION_FILE}") + endforeach() + + add_custom_target(${TARGET_NAME} ALL DEPENDS ${COPIED_FILES}) +endfunction() diff --git a/src/ggml-hsa/common.hpp b/src/ggml-hsa/common.hpp new file mode 100644 index 0000000000..3b74542ea9 --- /dev/null +++ b/src/ggml-hsa/common.hpp @@ -0,0 +1,354 @@ +// Copyright (c) 2024-2026 Advanced Micro Devices, Inc. All Rights Reserved. + +#pragma once + +#include "ggml-hsa.h" +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ggml-common.h" + +#if defined(__clang__) || defined(__GNUC__) +// Optimize for the execution path that is more or less likely than the alternative. +#define LIKELY(ex) __builtin_expect(!!(ex), 1) +#define UNLIKELY(ex) __builtin_expect(!!(ex), 0) +#else +#define LIKELY(ex) (ex) +#define UNLIKELY(ex) (ex) +#endif + +/// @brief @c true if logging is enabled. +extern bool g_ggml_hsa_verbose; + +/** + * @brief Logs errors. + */ +#define GGML_HSA_LOG_ERROR(MSG, ...) \ + do { \ + if (UNLIKELY(g_ggml_hsa_verbose)) \ + GGML_LOG_ERROR(MSG "\n", __VA_ARGS__); \ + } while (false) + +/** + * @brief Logs warnings. 
+ */ +#define GGML_HSA_LOG_WARN(MSG, ...) \ + do { \ + if (UNLIKELY(g_ggml_hsa_verbose)) \ + GGML_LOG_WARN(MSG "\n", __VA_ARGS__); \ + } while (false) + +/** + * @brief Logs information. + */ +#define GGML_HSA_LOG_INFO(MSG, ...) \ + do { \ + if (UNLIKELY(g_ggml_hsa_verbose)) \ + GGML_LOG_INFO(MSG "\n", __VA_ARGS__); \ + } while (false) + +/** + * @brief Returns if @p s evaluates to `true` or `false`. + */ +bool ggml_hsa_string_to_bool(std::string_view s); + +/** + * @brief Returns the description of @p status as a string. + */ +const char * ggml_hsa_get_status_string(hsa_status_t status); + +/** + * @brief Prints an error message based on the status and aborts. + * + * @param[in] stmt statement that caused the error + * @param[in] func function in which the error occurred + * @param[in] file file in which the error occurred + * @param[in] line line number where the error occurred + * @param[in] status error code + */ +[[noreturn]] +void ggml_hsa_error( + const char * stmt, const char * func, const char * file, int line, hsa_status_t status); + +/** + * @brief Checks if @p status is an error code, prints an error message and aborts. + */ +#define GGML_HSA_CHECK_ABORT(status) \ + do { \ + auto status_ = (status); \ + if (status_ != HSA_STATUS_SUCCESS) \ + ggml_hsa_error(#status, __func__, __FILE__, __LINE__, status_); \ + } while (false) + +/** + * @brief Checks if @p status is an error code and throws an exception. + */ +#define GGML_HSA_CHECK_THROW(status) \ + do { \ + auto status_ = (status); \ + if (status_ != HSA_STATUS_SUCCESS) \ + throw std::runtime_error{ggml_hsa_get_status_string(status_)}; \ + } while (false) + +/** + * @brief Returns the number of sources of @p tensor. + */ +std::int64_t ggml_hsa_nsrcs(const ggml_tensor & tensor); + +/** + * @brief Creates a string representation of the tensor shape. + * + * For a 3D tensor with dimensions `[3,3,4,1]`, the default representation is of the form `3x3x4`. 
+ * + * @param[in] tensor tensor to output shape for + * @param[out] os output stream + * @param[in] delim delimiter + */ +template +void ggml_hsa_output_tensor_shape(const ggml_tensor & tensor, OutputStream & os, char delim = 'x') { + const auto ndims = ggml_n_dims(&tensor); + os << tensor.ne[0]; + for (std::int32_t i = 1; i < ndims; ++i) { + os << delim << tensor.ne[i]; + } +} + +/** + * @brief Creates a string representation of the tensor stride. + * + * For a 3D tensor with dimensions `[3,3,4,1]`, the default representation is of the form `X,Y,Z`, + * where X, Y, Z are the stride in bytes in the first, second, and third dimensions, respectively. + * + * @param[in] tensor tensor to output stride for + * @param[out] os output stream + * @param[in] delim delimiter + */ +template +void ggml_hsa_output_tensor_stride(const ggml_tensor & tensor, + OutputStream & os, + char delim = ',') { + const auto ndims = ggml_n_dims(&tensor); + os << tensor.nb[0]; + for (std::int32_t i = 1; i < ndims; ++i) { + os << delim << tensor.nb[i]; + } +} + +/** + * @brief Creates a string representation of the tensor. + * + * The representation is of the form `DimsDatatypeModifiers`, e.g., `3x3x4f32` for a contiguous 3D + * tensor with dimensions `[3,3,4]`. + * + * @param[in] tensor tensor to output + * @param[out] os output stream + */ +template +void ggml_hsa_output_tensor(const ggml_tensor & tensor, OutputStream & os) { + ggml_hsa_output_tensor_shape(tensor, os); + os << ggml_type_name(tensor.type); + if (!ggml_is_contiguous(&tensor)) { + os << 'n'; + } +} + +/** + * @brief Creates a string representation of the tensor's op_params using a hash. 
+ * + * @param[in] tensor tensor to output + * @param[out] os output stream + */ +template +void ggml_hsa_encode_op_params(const ggml_tensor & tensor, OutputStream & os) { + // Hash all GGML_MAX_OP_PARAMS raw bytes of op_params and write the hash in hexadecimal. + // NOTE(review): std::hex is a sticky manipulator, so `os` stays in hexadecimal mode after this + // call; confirm that no caller streams integers into `os` afterwards. + // NOTE(review): the C++ standard only requires std::hash results to be stable within a single + // program execution; if this value ends up in persisted kernel names, confirm that is acceptable. + std::string_view bytes(reinterpret_cast(tensor.op_params), GGML_MAX_OP_PARAMS); + std::size_t hash_value = std::hash{}(bytes); + os << std::hex << hash_value; +} + +/** + * @brief Frees memory allocated using HSA. + */ +template +struct ggml_hsa_delete { + static_assert(!std::is_array_v, "ggml_hsa_delete does not support arrays"); + + void operator()(T * ptr) const { + if (ptr) { + // Run the destructor first (skipped for void), then return the storage to HSA. + if constexpr (!std::is_void_v) { + std::destroy_at(ptr); + } + GGML_HSA_CHECK_ABORT(hsa_amd_memory_pool_free(ptr)); + } + } +}; + +/// @brief HSA allocated managed memory. +template +using ggml_hsa_unique_ptr = std::unique_ptr>; + +struct ggml_backend_hsa_context; + +/** + * @brief Base class for HSA kernels. + */ +class ggml_hsa_kernel { + public: + virtual ~ggml_hsa_kernel() = default; + + /** + * @brief Dispatches the kernel. + * + * @param[in] ctx backend context + * @param[in] src_tensors source tensors + * @param[in] num_src_tensors number of source tensors + * @param[out] dst_tensor destination tensor + */ + virtual ggml_status dispatch(ggml_backend_hsa_context & ctx, + ggml_tensor * src_tensors[], + std::size_t num_src_tensors, + ggml_tensor & dst_tensor) const = 0; +}; + +/** + * @brief Device information. + */ +struct ggml_hsa_device_info { + std::int32_t device_count{}; ///< Number of devices, up to @ref GGML_HSA_MAX_DEVICES. + + /** + * @brief Information about a single HSA memory pool. + */ + struct memory_pool_info { + hsa_amd_memory_pool_t memory_pool{}; ///< HSA memory pool object. + std::size_t size{}; ///< Memory available to the pool in bytes. + std::size_t alignment{}; ///< Memory pool alignment. + std::size_t max_alloc_size{}; ///< Memory pool maximum allocation. + }; + + /** + * @brief Information about a single HSA device. + */ + struct device_info { + std::int32_t device{}; ///< Device ID. 
+ hsa_agent_t agent{}; ///< HSA agent associated with the device. + hsa_device_type_t type{}; ///< Agent type. + std::string name; ///< Agent name. + memory_pool_info dev_memory{}; ///< Kernel memory pool. + memory_pool_info kernarg_memory{}; ///< Kernel arguments memory pool. + memory_pool_info data_memory{}; ///< Data memory pool. + std::size_t alignment{64}; ///< Memory alignment requirement for buffers. + bool substitute_fp16_bf16{false}; ///< Use BF16 when FP16 is requested. + std::unordered_map> + kernels; ///< Cached device kernels. + }; + + std::array devices = {}; +}; + +/** + * @brief Returns the HSA device information. + * + * This function returns a reference to a structure containing the HSA device + * information. HSA and the information is initialized once and reused on all + * subsequent calls. + * + * @return structure with device information + */ +const ggml_hsa_device_info & ggml_hsa_info(); + +/** + * @brief Returns the device info associated with @p device_id. + */ +const ggml_hsa_device_info::device_info & ggml_hsa_get_device_info(std::int32_t device_id); + +/** + * @brief Tensor metadata. + * + * This class contains metadata about a ggml_tensor, called a parent tensor, that is used by the HSA + * backend to create an alternative graph representation that will be used at run-time. + * + * A copy of the parent tensor metadata is made, along with a copy for all the parent's source + * tensors' metadata. + * + * Those copies have a number of transformations applied to them, such as making them contiguous, + * flattening them etc. + */ +struct ggml_backend_hsa_tensor_extra { + /// @brief Internal graph node. + struct node_t { + ggml_tensor tensor{}; ///< Transformed tensor. + std::size_t buffer_size{}; ///< Temporary storage size in bytes. + bool convert_dtype{}; ///< True if data conversion is necessary. + }; + + std::int64_t nsrcs{}; ///< Number of source tensors. + node_t node{}; ///< Internal graph node. 
+ std::array src_nodes{}; ///< Internal graph node sources. + std::shared_ptr kernel; ///< Kernel associated with the tensor. + ggml_hsa_unique_ptr buffer; ///< Temporary storage for tensor data. + bool requires_sync{false}; ///< True if CPU tensor transformations are necessary. + + ggml_backend_hsa_tensor_extra(const ggml_hsa_device_info::device_info & dev_info, + const ggml_tensor & parent_tensor); + ggml_backend_hsa_tensor_extra(const ggml_backend_hsa_tensor_extra &) = delete; + ggml_backend_hsa_tensor_extra(ggml_backend_hsa_tensor_extra &&) = delete; + + ~ggml_backend_hsa_tensor_extra() = default; + + ggml_backend_hsa_tensor_extra & operator=(const ggml_backend_hsa_tensor_extra &) = delete; + ggml_backend_hsa_tensor_extra & operator=(ggml_backend_hsa_tensor_extra &&) = delete; + + /** + * @brief Allocates storage for the internal tensor. + */ + ggml_status allocate_internal_storage(const ggml_hsa_device_info::device_info & dev_info); +}; + +/** + * @brief Context for HSA backend operations. + */ +struct ggml_backend_hsa_context { + std::int32_t device{}; ///< Device ID. + std::string name; ///< Device name. + hsa_queue_t * queue{}; ///< HSA queue. + hsa_signal_t dispatch_signal{}; ///< Signal for packet completion. + std::vector> + pending_payloads; ///< Packet payloads since last synchronization. + + explicit ggml_backend_hsa_context(const ggml_hsa_device_info::device_info & dev_info); + + ggml_backend_hsa_context(const ggml_backend_hsa_context &) = delete; + ggml_backend_hsa_context(ggml_backend_hsa_context &&) = delete; + + ~ggml_backend_hsa_context(); + + ggml_backend_hsa_context & operator=(const ggml_backend_hsa_context &) = delete; + ggml_backend_hsa_context & operator=(ggml_backend_hsa_context &&) = delete; + + /** + * @brief Frees all memory associated with pending packets. + * + * @warning This function assumes that packets have been processed. + */ + void free_pending_payloads(); +}; + +/** + * @brief Waits for all dispatched kernels to finish. 
+ * + * @param[in] ctx backend context + */ +void ggml_hsa_wait_dispatches(ggml_backend_hsa_context & ctx); diff --git a/src/ggml-hsa/env_setup.sh b/src/ggml-hsa/env_setup.sh new file mode 100755 index 0000000000..95577a6236 --- /dev/null +++ b/src/ggml-hsa/env_setup.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All Rights Reserved. + +SCRIPT_DIR_NAME=$(dirname -- "${BASH_SOURCE[0]}") +VENV_NAME=.venv + +python3 -m venv ${VENV_NAME} +source ${VENV_NAME}/bin/activate +python3 -m pip install --upgrade pip +python3 -m pip install -r ${SCRIPT_DIR_NAME}/requirements.txt diff --git a/src/ggml-hsa/ggml-hsa.cpp b/src/ggml-hsa/ggml-hsa.cpp new file mode 100644 index 0000000000..bc33c1d510 --- /dev/null +++ b/src/ggml-hsa/ggml-hsa.cpp @@ -0,0 +1,1641 @@ +// Copyright (c) 2024-2026 Advanced Micro Devices, Inc. All Rights Reserved. + +#include "ggml-hsa.h" +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +#include "ggml-hsa/common.hpp" +#include "ggml-hsa/host-ops.hpp" +#include "ggml-hsa/kernel-discovery.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +bool g_ggml_hsa_verbose = [] { + if (const char * verbose = std::getenv("GGML_HSA_ENABLE_LOG"); verbose != nullptr) { + return ggml_hsa_string_to_bool(verbose); + } +#if defined(NDEBUG) + return false; +#else + return true; +#endif +}(); + +/// @brief Last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses. 
+#define MATRIX_ROW_PADDING 512 + +#define NOT_IMPLEMENTED() \ + do { \ + GGML_ABORT("(%s:%d) %s not implemented\n", __FILE__, __LINE__, __PRETTY_FUNCTION__); \ + } while (false) + +bool ggml_hsa_string_to_bool(std::string_view s) { + return s == "1" || s == "true" || s == "True" || s == "TRUE" || s == "yes" || s == "Yes" || + s == "YES" || s == "on" || s == "On" || s == "ON"; +} + +const char * ggml_hsa_get_status_string(hsa_status_t status) { + const char * msg = nullptr; + if (hsa_status_string(status, &msg) != HSA_STATUS_SUCCESS) { + return "unknown"; + } + return msg; +} + +[[noreturn]] +void ggml_hsa_error( + const char * stmt, const char * func, const char * file, int line, hsa_status_t status) { + GGML_LOG_ERROR("HSA error (%s) in function %s at %s:%d: %s\n", + ggml_hsa_get_status_string(status), func, file, line, stmt); + // abort with GGML_ABORT to get a stack trace + GGML_ABORT("HSA error"); +} + +std::int64_t ggml_hsa_nsrcs(const ggml_tensor & tensor) { + std::int64_t nsrcs = 0; + for (; (nsrcs < GGML_MAX_SRC) && (tensor.src[nsrcs] != nullptr); ++nsrcs) + ; + return nsrcs; +} + +/** + * @brief Checks whether all operation parameters of a tensor are zero. + * + * This function inspects the tensor's op_params array and + * determines if every 32-bit element is zero. + * + * @param[in] tensor Tensor whose operation parameters are to be checked. + * + * @return `true` if all elements of op_params are zero; + * `false` otherwise. + */ +static bool ggml_hsa_op_params_all_zero(const ggml_tensor & tensor) { + const std::int32_t * params = tensor.op_params; + const std::size_t num_elements = GGML_MAX_OP_PARAMS / sizeof(std::int32_t); + return std::all_of(params, params + num_elements, [](int32_t x) { return x == 0; }); +} + +/** + * @brief Returns if @p op is a unary operation. 
+ */ +constexpr bool ggml_hsa_is_unary_op(ggml_op op) { + return (op == GGML_OP_UNARY) || (op == GGML_OP_SQR) || (op == GGML_OP_SQRT) || + (op == GGML_OP_LOG) || (op == GGML_OP_SIN) || (op == GGML_OP_COS) || + (op == GGML_OP_SILU_BACK) || (op == GGML_OP_LEAKY_RELU); +} + +/** + * @brief Returns a kernel name for @p tensor using @p op_name as the operations name if it is not + * empty. + */ +static std::string ggml_hsa_create_kernel_name(const ggml_tensor & tensor, + std::string op_name = "") { + if ((tensor.op < GGML_OP_NONE) || (tensor.op >= GGML_OP_COUNT)) { + throw std::runtime_error{std::string("Tensor \"") + .append(ggml_get_name(&tensor)) + .append("\" operation index out of bounds: ") + .append(std::to_string(tensor.op)) + .append(" not in [0, GGML_OP_COUNT)")}; + } + + // no operation name supplied - use the tensor operation name + if (op_name.empty()) { + op_name = ggml_op_desc(&tensor); + } + + std::ostringstream oss; + + // convert name in lowercase + std::transform(op_name.begin(), op_name.end(), std::ostreambuf_iterator(oss), + [&](char c) { return std::tolower(c); }); + + // output tensor + oss << '-'; + ggml_hsa_output_tensor(tensor, oss); + + // input tensors + for (std::int32_t i = 0; i < GGML_MAX_SRC; ++i) { + if (tensor.src[i] == nullptr) { + break; + } + oss << '-'; + ggml_hsa_output_tensor(*(tensor.src[i]), oss); + } + + // determine if op_params need to be encoded in the kernel name + if (!ggml_hsa_is_unary_op(tensor.op) && !ggml_hsa_op_params_all_zero(tensor)) { + oss << '-'; + ggml_hsa_encode_op_params(tensor, oss); + } + + return oss.str(); +} + +/** + * @brief Returns if @p op is an element-wise operation. + */ +constexpr bool ggml_hsa_is_elementwise_op(ggml_op op) { + return (op == GGML_OP_ADD) || (op == GGML_OP_SUB) || (op == GGML_OP_MUL) || + (op == GGML_OP_DIV) || (op == GGML_OP_SCALE); +} + +/** + * @brief Returns if @p op can be flattened. 
+ * + * An operation can be flattened if it independent of the tensor's dimensions, such as element wise + * operations where the shapes and strides of the input and output tensors match. + */ +static bool ggml_hsa_can_flatten(const ggml_tensor & op) { + // operations with non-contiguously allocated tensors cannot be flattened + if (!ggml_is_contiguously_allocated(&op)) { + return false; + } + for (auto src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) { + if (op.src[src_idx] == nullptr) { + break; + } + if (!ggml_is_contiguously_allocated(op.src[src_idx])) { + return false; + } + } + + if (ggml_hsa_is_unary_op(op.op)) { + // unary operations can be flattened independently of the tensors' shape + return true; + } + + if (ggml_hsa_is_elementwise_op(op.op)) { + // element-wise operations can be flattened only if the shapes match + for (auto src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) { + if (op.src[src_idx] == nullptr) { + break; + } + if (!ggml_are_same_shape(op.src[src_idx], &op)) { + return false; + } + } + + return true; + } + + return false; +} + +/** + * @brief Creates a device name from the device index @p device. + */ +static std::string ggml_hsa_format_name(std::int32_t device) { + return GGML_HSA_NAME + std::to_string(device); +} + +/** + * @brief Retrieves the agent info for the given agent @p agent. + */ +static std::string ggml_hsa_agent_name(hsa_agent_t agent) { + constexpr std::size_t agent_name_size = 64; + char agent_name[agent_name_size]; + GGML_HSA_CHECK_THROW(hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, &agent_name)); + return std::string{agent_name}; +} + +/** + * @brief Returns the minimum queue size. + */ +static std::uint32_t ggml_hsa_get_agent_min_queue_size(hsa_agent_t agent) { + std::uint32_t min_queue_size = 0; + GGML_HSA_CHECK_THROW(hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &min_queue_size)); + return min_queue_size; +} + +/** + * @brief Populates the information in @p info from @p pool. 
+ */ +static hsa_status_t ggml_hsa_get_memory_pool_info(hsa_amd_memory_pool_t pool, + ggml_hsa_device_info::memory_pool_info & info) { + bool alloc_allowed = true; + if (auto status = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &alloc_allowed); + (status != HSA_STATUS_SUCCESS) || !alloc_allowed) { + // ignore pools that we can't allocate from + return status; + } + + std::size_t size = 0; + if (auto status = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size); + status != HSA_STATUS_SUCCESS) { + return status; + } + + std::size_t alignment = 0; + if (auto status = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &alignment); + status != HSA_STATUS_SUCCESS) { + return status; + } + + std::size_t max_alloc_size = 0; + if (auto status = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE, + &max_alloc_size); + status != HSA_STATUS_SUCCESS) { + return status; + } + + info.memory_pool = pool; + info.size = size; + info.alignment = alignment; + info.max_alloc_size = max_alloc_size; + + return HSA_STATUS_SUCCESS; +} + +/** + * @brief Memory pool discovery information. + */ +struct ggml_hsa_find_memory_pool_data_t { + /// Expected memory pool flags. + hsa_amd_memory_pool_global_flag_t expected_flags = + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED; + /// @c true if allocation is expected from the pool. + bool expected_allocatable = true; + /// Retrieved memory pool information. + ggml_hsa_device_info::memory_pool_info mem_info; +}; + +/** + * @brief Find a pool with the required flags. 
+ */ +static hsa_status_t ggml_hsa_find_memory_pool(hsa_amd_memory_pool_t pool, void * data) { + // query only global segments + hsa_amd_segment_t segment_type = {}; + if (auto status = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); + (status != HSA_STATUS_SUCCESS) || (segment_type != HSA_AMD_SEGMENT_GLOBAL)) { + return status; + } + + hsa_amd_memory_pool_global_flag_t pool_flags = {}; + if (auto status = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &pool_flags); + status != HSA_STATUS_SUCCESS) { + return status; + } + + // check if flags satisfied + auto & mem_pool_data = *static_cast(data); + if ((pool_flags & mem_pool_data.expected_flags) == 0x0) { + return HSA_STATUS_SUCCESS; + } + + // check if allocation satisfied + std::size_t alloc_rec_granule = 0; + if (auto status = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE, &alloc_rec_granule); + status != HSA_STATUS_SUCCESS) { + return status; + } + const bool allocable = (alloc_rec_granule != 0); + if (mem_pool_data.expected_allocatable != allocable) { + return HSA_STATUS_SUCCESS; + } + + if (auto status = ggml_hsa_get_memory_pool_info(pool, mem_pool_data.mem_info); + status != HSA_STATUS_SUCCESS) { + return status; + } + return HSA_STATUS_INFO_BREAK; +} + +/** + * @brief Discovers HSA agents. 
+ */ +static hsa_status_t ggml_hsa_find_hsa_agents(hsa_agent_t agent, void * data) { + hsa_device_type_t type = {}; + if (auto status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + status != HSA_STATUS_SUCCESS) { + return status; + } + + switch (type) { + case HSA_DEVICE_TYPE_AIE: + break; + default: + // only consider AIE agents for now + return HSA_STATUS_SUCCESS; + } + + auto & info = *static_cast(data); + if (info.device_count == GGML_HSA_MAX_DEVICES - 1) { + GGML_ABORT("%s: exceeded GGML_HSA_MAX_DEVICES limit (%d)", __func__, GGML_HSA_MAX_DEVICES); + } + + // populate device information (agent, type, name, etc.) + auto & dev_info = info.devices[info.device_count]; + dev_info.device = info.device_count; + dev_info.agent = agent; + dev_info.type = type; + + char name[64] = {}; + if (auto status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); + status != HSA_STATUS_SUCCESS) { + return status; + } + dev_info.name = std::string(name); + + if (dev_info.name == "aie2" || dev_info.name == "aie2p") { + dev_info.substitute_fp16_bf16 = true; + GGML_ASSERT(dev_info.alignment % 4 == 0); + } else { + GGML_ABORT("%s: unknown agent \"%s\"\n", __func__, dev_info.name.c_str()); + } + + // find dev memory pool (only for AIE agents) + if (type == HSA_DEVICE_TYPE_AIE) { + // XDNA dev heap is coarse-grained with alloc_rec_granule == 0 + ggml_hsa_find_memory_pool_data_t mem_pool_data = {}; + mem_pool_data.expected_flags = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED; + mem_pool_data.expected_allocatable = false; + auto status = + hsa_amd_agent_iterate_memory_pools(agent, ggml_hsa_find_memory_pool, &mem_pool_data); + switch (status) { + case HSA_STATUS_INFO_BREAK: + dev_info.dev_memory = mem_pool_data.mem_info; + break; + case HSA_STATUS_SUCCESS: + // iteration finished with no errors, but no pool found + return static_cast(HSA_STATUS_ERROR_NOT_SUPPORTED); + default: + // iteration aborted with errors + return status; + } + } + + // find data pool + { + 
ggml_hsa_find_memory_pool_data_t mem_pool_data = {}; + mem_pool_data.expected_flags = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED; + mem_pool_data.expected_allocatable = true; + auto status = + hsa_amd_agent_iterate_memory_pools(agent, ggml_hsa_find_memory_pool, &mem_pool_data); + switch (status) { + case HSA_STATUS_INFO_BREAK: + dev_info.data_memory = mem_pool_data.mem_info; + break; + case HSA_STATUS_SUCCESS: + // iteration finished with no errors, but no pool found + return static_cast(HSA_STATUS_ERROR_NOT_SUPPORTED); + default: + // iteration aborted with errors + return status; + } + } + + // find kernarg pool + { + ggml_hsa_find_memory_pool_data_t mem_pool_data = {}; + mem_pool_data.expected_flags = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; + mem_pool_data.expected_allocatable = true; + auto status = + hsa_amd_agent_iterate_memory_pools(agent, ggml_hsa_find_memory_pool, &mem_pool_data); + switch (status) { + case HSA_STATUS_INFO_BREAK: + dev_info.kernarg_memory = mem_pool_data.mem_info; + break; + case HSA_STATUS_SUCCESS: + // iteration finished with no errors, but no pool found; use data pool + dev_info.kernarg_memory = dev_info.data_memory; + break; + default: + // iteration aborted with errors + return status; + } + } + + // add device to known devices + ++info.device_count; + + return HSA_STATUS_SUCCESS; +} + +/** + * @brief Initialize HSA device information. + * + * This function initializes HSA and retrieves all the appropriate agents and + * memory pools. 
+ */ +static ggml_hsa_device_info ggml_hsa_init() { + GGML_HSA_CHECK_THROW(hsa_init()); + + ggml_hsa_device_info info = {}; + GGML_HSA_CHECK_THROW(hsa_iterate_agents(ggml_hsa_find_hsa_agents, &info)); + + return info; +} + +/// @copydoc ggml_hsa_info +static ggml_hsa_device_info & ggml_hsa_info_mut() { + static ggml_hsa_device_info info = ggml_hsa_init(); + return info; +} + +const ggml_hsa_device_info & ggml_hsa_info() { return ggml_hsa_info_mut(); } + +const ggml_hsa_device_info::device_info & ggml_hsa_get_device_info(std::int32_t device_id) { + const auto & info = ggml_hsa_info(); + const auto & dev_info = info.devices[device_id]; + return dev_info; +} + +/** + * @brief Caches the @p new_kernel for the tensor @p tensor.name on the device @p device_id. + */ +static void ggml_hsa_cache_kernel(std::string kernel_name, + std::int32_t device_id, + std::shared_ptr kernel) { + auto & info = ggml_hsa_info_mut(); + auto & dev_info = info.devices[device_id]; + auto & kernels = dev_info.kernels; + auto result = kernels.emplace(std::move(kernel_name), std::move(kernel)); + if (!result.second) { + GGML_ABORT("%s: kernel %s already exists on device %d\n", __func__, kernel_name.c_str(), + device_id); + } +} + +/** + * @brief Returns the cached kernel for @p kernel_name for the device @p device_id if it exists. + */ +static std::shared_ptr +ggml_hsa_get_cached_kernel(const std::string & kernel_name, + const ggml_hsa_device_info::device_info & dev_info) { + const auto & kernels = dev_info.kernels; + auto it = kernels.find(kernel_name); + if (it != kernels.end()) { + return it->second; + } + return nullptr; +} + +/** + * @brief Deletes all unused cached kernels. 
+ */ +static void ggml_hsa_purge_unused_cached_kernels(std::int32_t device_id) { + auto & info = ggml_hsa_info_mut(); + auto & dev_info = info.devices[device_id]; + auto & kernels = dev_info.kernels; + for (auto it = kernels.begin(); it != kernels.end();) { + if (it->second.use_count() == 1) { + it = kernels.erase(it); + } else { + ++it; + } + } +} + +/** + * @brief Returns if @p tensor has a trivial layout. + * + * A tensor with a trivial layout is contiguously allocated and is not permuted. + */ +static bool ggml_hsa_has_trivial_layout(const ggml_tensor & tensor) { + return ggml_is_contiguously_allocated(&tensor) && !ggml_is_permuted(&tensor); +} + +/** + * @brief Updates the strides of @p tensor so that it has a trivial layout. + */ +static void ggml_hsa_force_unpermuted(ggml_tensor & tensor) { + tensor.nb[0] = ggml_type_size(tensor.type); + tensor.nb[1] = tensor.nb[0] * (tensor.ne[0] / ggml_blck_size(tensor.type)); + for (std::int32_t i = 2; i < GGML_MAX_DIMS; ++i) { + tensor.nb[i] = tensor.nb[i - 1] * tensor.ne[i - 1]; + } +} + +/** + * @brief Flattens @p tensor. + */ +static void ggml_hsa_flatten_tensor(ggml_tensor & tensor) { + const auto nelements = ggml_nelements(&tensor); + tensor.ne[0] = nelements; + std::fill_n(std::next(tensor.ne), GGML_MAX_DIMS - 1, 1); + tensor.nb[0] = ggml_type_size(tensor.type); + tensor.nb[1] = tensor.nb[0] * (tensor.ne[0] / ggml_blck_size(tensor.type)); + for (std::int32_t i = 2; i < GGML_MAX_DIMS; ++i) { + tensor.nb[i] = tensor.nb[i - 1] * tensor.ne[i - 1]; + } +} + +ggml_backend_hsa_tensor_extra::ggml_backend_hsa_tensor_extra( + const ggml_hsa_device_info::device_info & dev_info, const ggml_tensor & parent_tensor) : + nsrcs{ggml_hsa_nsrcs(parent_tensor)} { + + // View tensors are generally not supported, but some operations like GGML_OP_CLAMP + // are created as views in GGML even though they can be treated as non-in-place. + // We allow these specific operations to proceed. 
+ if (ggml_is_view(&parent_tensor) && parent_tensor.op != GGML_OP_CLAMP) { + throw std::runtime_error{"View tensor is not supported."}; + } + + // initialize internal nodes + node.tensor = parent_tensor; + for (auto src_idx = 0; src_idx < nsrcs; ++src_idx) { + src_nodes[src_idx].tensor = *parent_tensor.src[src_idx]; + node.tensor.src[src_idx] = &src_nodes[src_idx].tensor; + } + assert(ggml_hsa_nsrcs(node.tensor) == nsrcs); + + // early exit if operation does not require a kernel + if (ggml_op_is_empty(node.tensor.op)) { + return; + } + + switch (node.tensor.op) { + // implemented as host kernels; nothing to be done + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + return; + default: + break; + } + + std::array update_src_buffer_size = {}; + + // convert tensor data types if needed + if (dev_info.substitute_fp16_bf16) { + // output tensor can be converted in-place + if (node.tensor.type == GGML_TYPE_F16) { + node.tensor.type = GGML_TYPE_BF16; + node.convert_dtype = true; + } + + // inputs require temporary storage as they may be shared among tensors + for (auto src_idx = 0; src_idx < nsrcs; ++src_idx) { + auto & src_node = src_nodes[src_idx]; + if (src_node.tensor.type == GGML_TYPE_F16) { + update_src_buffer_size[src_idx] = true; + src_node.tensor.type = GGML_TYPE_BF16; + src_node.convert_dtype = true; + } + } + } + + // make tensor layouts trivial; tensors that do not have a trivial layout will need + // temporary storage + if (!ggml_hsa_has_trivial_layout(node.tensor)) { + throw std::runtime_error{"Output tensor does not have trivial layout."}; + } + for (auto src_idx = 0; src_idx < nsrcs; ++src_idx) { + auto & src_node = src_nodes[src_idx]; + if (!ggml_hsa_has_trivial_layout(src_node.tensor)) { + update_src_buffer_size[src_idx] = true; + ggml_hsa_force_unpermuted(src_node.tensor); + } + } + + // flatten tensors to reuse kernels + if (ggml_hsa_can_flatten(node.tensor)) { + ggml_hsa_flatten_tensor(node.tensor); + for (auto src_idx = 0; src_idx < nsrcs; 
++src_idx) { + ggml_hsa_flatten_tensor(src_nodes[src_idx].tensor); + } + } + + // update required tensor sizes + for (auto src_idx = 0; src_idx < nsrcs; ++src_idx) { + if (update_src_buffer_size[src_idx]) { + auto & src_node = src_nodes[src_idx]; + src_node.tensor.data = nullptr; + src_node.buffer_size = GGML_PAD(ggml_nbytes(&src_node.tensor), dev_info.alignment); + requires_sync = true; + } + } + + // create a kernel for the operation + auto kernel_name = ggml_hsa_create_kernel_name(node.tensor); + kernel = ggml_hsa_get_cached_kernel(kernel_name, dev_info); + if (kernel == nullptr) { + // kernel not in cache; create a new one and store it in the cache + if (ggml_hsa_create_kernel(dev_info, kernel_name, node.tensor, kernel) != + GGML_STATUS_SUCCESS) { + throw std::runtime_error{std::string{"Could not create kernel for tensor \""} + .append(node.tensor.name) + .append("\" (") + .append(ggml_op_desc(&node.tensor)) + .append(")")}; + } + ggml_hsa_cache_kernel(std::move(kernel_name), dev_info.device, kernel); + } +} + +ggml_status ggml_backend_hsa_tensor_extra::allocate_internal_storage( + const ggml_hsa_device_info::device_info & dev_info) { + if (buffer != nullptr) { + // already allocated + return GGML_STATUS_ABORTED; + } + + std::size_t buffer_size = 0; + for (auto src_idx = 0; src_idx < nsrcs; ++src_idx) { + buffer_size += src_nodes[src_idx].buffer_size; + } + + if (buffer_size == 0) { + // no temporary storage needed + return GGML_STATUS_SUCCESS; + } + + // allocate storage for all tensors + void * ptr = nullptr; + if (auto status = hsa_amd_memory_pool_allocate(dev_info.data_memory.memory_pool, buffer_size, + /* flags = */ 0, &ptr); + status != HSA_STATUS_SUCCESS) { + GGML_HSA_LOG_ERROR("%s: failed to allocate %.2f MiB on device %s (%s)", __func__, + (buffer_size / 1024.0 / 1024.0), dev_info.name.c_str(), + ggml_hsa_get_status_string(status)); + return GGML_STATUS_ALLOC_FAILED; + } + buffer.reset(static_cast(ptr)); + + auto buffer_ptr = buffer.get(); + for (auto 
src_idx = 0; src_idx < nsrcs; ++src_idx) { + auto & src_node = src_nodes[src_idx]; + if (src_node.buffer_size > 0) { + assert(src_node.tensor.data == nullptr); + src_node.tensor.data = buffer_ptr; + buffer_ptr += src_node.buffer_size; + } + } + + GGML_HSA_LOG_INFO("%s: created temporary storage for tensor %s (%s)", __func__, + node.tensor.name, ggml_op_desc(&node.tensor)); + + return GGML_STATUS_SUCCESS; +} + +ggml_backend_hsa_context::ggml_backend_hsa_context( + const ggml_hsa_device_info::device_info & dev_info) : + device{dev_info.device}, name{ggml_hsa_format_name(device)} { + hsa_agent_t agent = dev_info.agent; + + // create queue + const std::uint32_t min_queue_size = ggml_hsa_get_agent_min_queue_size(agent); + if (auto status = hsa_queue_create(agent, min_queue_size, HSA_QUEUE_TYPE_SINGLE, nullptr, + nullptr, 0, 0, &queue); + status != HSA_STATUS_SUCCESS) { + throw std::runtime_error{std::string("Could not create hsa_queue (") + .append(ggml_hsa_get_status_string(status)) + .append(")")}; + } + + // create signal to wait for packets + if (auto status = hsa_signal_create(0, 0, nullptr, &dispatch_signal); + status != HSA_STATUS_SUCCESS) { + throw std::runtime_error{std::string("Could not create hsa_signal (") + .append(ggml_hsa_get_status_string(status)) + .append(")")}; + } +} + +ggml_backend_hsa_context::~ggml_backend_hsa_context() { + ggml_hsa_purge_unused_cached_kernels(device); + GGML_HSA_CHECK_ABORT(hsa_signal_destroy(dispatch_signal)); + GGML_HSA_CHECK_ABORT(hsa_queue_destroy(queue)); +} + +void ggml_backend_hsa_context::free_pending_payloads() { pending_payloads.clear(); } + +void ggml_hsa_wait_dispatches(ggml_backend_hsa_context & ctx) { + if (auto val = hsa_signal_wait_scacquire(ctx.dispatch_signal, HSA_SIGNAL_CONDITION_EQ, 0, + UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + val != 0) { + GGML_ABORT("%s: unexpected signal value (%ld)\n", __func__, val); + } + + ctx.free_pending_payloads(); +} + +// HSA buffer + +/** + * @brief Context for managing a HSA 
buffer associated with a specific device. + */ +struct ggml_backend_hsa_buffer_context { + std::int32_t device{}; ///< Device ID associated with this buffer context. + ggml_hsa_unique_ptr dev_ptr; ///< Pointer to the device memory. + std::vector> tensor_extras; + + ggml_backend_hsa_buffer_context(std::int32_t device, ggml_hsa_unique_ptr dev_ptr) : + device{device}, dev_ptr{std::move(dev_ptr)} {} +}; + +/** + * @brief Frees resources associated with @p buffer. + */ +static void ggml_backend_hsa_buffer_free_buffer(ggml_backend_buffer_t buffer) { + auto * buf_ctx = static_cast(buffer->context); + delete buf_ctx; +} + +/** + * @brief Returns if @p buffer is a HSA buffer. + */ +static bool ggml_backend_buffer_is_hsa(ggml_backend_buffer_t buffer) { + return buffer->iface.free_buffer == ggml_backend_hsa_buffer_free_buffer; +} + +/** + * @brief Returns the base pointer of @p buffer. + */ +static void * ggml_backend_hsa_buffer_get_base(ggml_backend_buffer_t buffer) { + auto & buf_ctx = *static_cast(buffer->context); + return buf_ctx.dev_ptr.get(); +} + +/** + * @brief Initializes the tensor. + */ +static enum ggml_status ggml_backend_hsa_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor) { + // View tensors generally don't need initialization, but some operations like CLAMP + // are created as views in GGML even though they have actual compute work. + // These need tensor_extra for kernel dispatch. 
+ if (ggml_is_view(tensor) && tensor->op != GGML_OP_CLAMP) { + // no further initialization needed for views + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + return GGML_STATUS_SUCCESS; + } + + assert(tensor->extra == nullptr); + + auto & buf_ctx = *static_cast(buffer->context); + const auto & dev_info = ggml_hsa_get_device_info(buf_ctx.device); + + try { + // initialize tensor extra + auto tensor_extra = std::make_unique(dev_info, *tensor); + if (auto status = tensor_extra->allocate_internal_storage(dev_info); + status != GGML_STATUS_SUCCESS) { + return status; + } + // register tensor extra with the buffer context and the tensor + buf_ctx.tensor_extras.push_back(std::move(tensor_extra)); + tensor->extra = buf_ctx.tensor_extras.back().get(); + } catch (const std::exception & ex) { + GGML_HSA_LOG_ERROR("%s: exception caught: %s", __func__, ex.what()); + return GGML_STATUS_FAILED; + } + + return GGML_STATUS_SUCCESS; +} + +/** + * @brief Set tensor data to a specific value @p value. + * + * @param buffer tensor storage + * @param tensor destination tensor + * @param value value to set to the tensor + * @param offset offset in tensor + * @param size size of data to set, in bytes + */ +static void ggml_backend_hsa_buffer_memset_tensor(ggml_backend_buffer_t /* buffer */, + ggml_tensor * tensor, + uint8_t value, + size_t offset, + size_t size) { + std::memset(static_cast(tensor->data) + offset, value, size); +} + +/** + * @brief Set tensor data. + * + * @param buffer tensor storage + * @param tensor destination tensor + * @param data source data + * @param offset offset in source data + * @param size size of source data, in bytes + */ +static void ggml_backend_hsa_buffer_set_tensor(ggml_backend_buffer_t /* buffer */, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + std::memcpy(static_cast(tensor->data) + offset, data, size); +} + +/** + * @brief Get tensor data. 
+ * + * @param buffer tensor storage + * @param tensor source tensor + * @param data pointer to destination buffer + * @param offset offset in source tensor data + * @param size size of source data, in bytes + */ +static void ggml_backend_hsa_buffer_get_tensor(ggml_backend_buffer_t /* buffer */, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + std::memcpy(data, static_cast(tensor->data) + offset, size); +} + +/** + * @brief Copy tensor data between buffers if possible. + * + * The size of the data to be copied is inferred by the source tensor @p src. + * + * @param buffer tensor storage + * @param src source tensor + * @param dst destination tensor + * @return true if the copy operation succeeded, false otherwise. + */ +static bool ggml_backend_hsa_buffer_cpy_tensor(ggml_backend_buffer_t /* buffer */, + const ggml_tensor * src, + ggml_tensor * dst) { + if (ggml_backend_buffer_is_hsa(src->buffer)) { + std::memcpy(dst->data, src->data, ggml_nbytes(dst)); + return true; + } + return false; +} + +/** + * @brief Clear buffer @p buffer by setting all its memory to @p value. + */ +static void ggml_backend_hsa_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + auto & buf_ctx = *static_cast(buffer->context); + std::memset(buf_ctx.dev_ptr.get(), value, buffer->size); +} + +/** + * @brief Interface for HSA buffers. 
+ */ +static const ggml_backend_buffer_i ggml_backend_hsa_buffer_interface = { + /* .free_buffer = */ ggml_backend_hsa_buffer_free_buffer, + /* .get_base = */ ggml_backend_hsa_buffer_get_base, + /* .init_tensor = */ ggml_backend_hsa_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_hsa_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_hsa_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_hsa_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_hsa_buffer_cpy_tensor, + /* .clear = */ ggml_backend_hsa_buffer_clear, + /* .reset = */ nullptr, +}; + +// HSA buffer type + +/** + * @brief Context information for HSA backend buffer type. + */ +struct ggml_backend_hsa_buffer_type_context { + std::int32_t device; ///< ID of the device associated with this buffer type context. + std::string name; ///< Name of the buffer type context. + + explicit ggml_backend_hsa_buffer_type_context(std::int32_t device) : + device(device), name(ggml_hsa_format_name(device)) {} +}; + +/** + * @brief Returns the name associated with the buffer type @p buft. + */ +static const char * ggml_backend_hsa_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + const auto & buft_ctx = *static_cast(buft->context); + return buft_ctx.name.c_str(); +} + +/** + * @brief Returns if the buffer type @p buft is a HSA buffer type. + */ +static bool ggml_backend_buft_is_hsa(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_hsa_buffer_type_get_name; +} + +/** + * @brief Allocates a buffer in @p buft of size @p size. 
+ */ +static ggml_backend_buffer_t +ggml_backend_hsa_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + const auto & buft_ctx = *static_cast(buft->context); + const auto & dev_info = ggml_hsa_get_device_info(buft_ctx.device); + + void * buffer = nullptr; + if (auto status = hsa_amd_memory_pool_allocate(dev_info.data_memory.memory_pool, size, + /* flags = */ 0, &buffer); + status != HSA_STATUS_SUCCESS) { + GGML_HSA_LOG_ERROR("%s: failed to allocate %.2f MiB on device %s (%s)", __func__, + (size / 1024.0 / 1024.0), dev_info.name.c_str(), + ggml_hsa_get_status_string(status)); + return nullptr; + } + + try { + auto * buf_ctx = + new ggml_backend_hsa_buffer_context(buft_ctx.device, ggml_hsa_unique_ptr{buffer}); + return ggml_backend_buffer_init(buft, ggml_backend_hsa_buffer_interface, buf_ctx, size); + } catch (const std::exception & ex) { + GGML_HSA_LOG_ERROR("%s: exception caught: %s", __func__, ex.what()); + return nullptr; + } +} + +/** + * @brief Returns the memory alignment requirement for buffer type @p buft in bytes. + */ +static size_t ggml_backend_hsa_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + const auto & buft_ctx = *static_cast(buft->context); + const auto & dev_info = ggml_hsa_get_device_info(buft_ctx.device); + return dev_info.alignment; +} + +/** + * @brief Returns the maximum allocation size for buffer type @p buft in bytes. + */ +static size_t ggml_backend_hsa_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + const auto & buft_ctx = *static_cast(buft->context); + const auto & dev_info = ggml_hsa_get_device_info(buft_ctx.device); + return dev_info.data_memory.max_alloc_size; +} + +/** + * @brief Returns the size required for tensor @p tensor in buffer type @p buft. 
+ */ +static size_t ggml_backend_hsa_buffer_type_get_alloc_size(ggml_backend_buffer_type_t /* buft */, + const ggml_tensor * tensor) { + std::size_t size = ggml_nbytes(tensor); + + if (ggml_is_quantized(tensor->type)) { + const auto ne0 = tensor->ne[0]; + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return size; +} + +/** + * @brief Interface for managing HSA buffer types. + */ +static const ggml_backend_buffer_type_i ggml_backend_hsa_buffer_type_interface = { + /* .get_name = */ ggml_backend_hsa_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_hsa_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_hsa_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_hsa_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_hsa_buffer_type_get_alloc_size, + /* .is_host = */ nullptr, +}; + +/** + * @brief HSA buffer types. + */ +static struct { + ggml_backend_buffer_type type[GGML_HSA_MAX_DEVICES]; + std::once_flag flag; +} ggml_backend_hsa_buffer_type_metadata; + +ggml_backend_buffer_type_t ggml_backend_hsa_buffer_type(std::int32_t device) { + const auto device_count = ggml_backend_hsa_get_device_count(); + + if (device >= device_count) { + return nullptr; + } + + try { + std::call_once(ggml_backend_hsa_buffer_type_metadata.flag, [&device_count] { + for (std::int32_t i = 0; i < device_count; ++i) { + ggml_backend_hsa_buffer_type_metadata.type[i] = { + /* .iface = */ ggml_backend_hsa_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_hsa_reg(), i), + /* .context = */ new ggml_backend_hsa_buffer_type_context{i}, + }; + } + }); + + return &ggml_backend_hsa_buffer_type_metadata.type[device]; + } catch (const std::exception & ex) { + GGML_HSA_LOG_ERROR("%s: exception caught: %s", __func__, ex.what()); + return nullptr; + } +} + +// HSA split buffer + +// TODO + +// HSA split buffer type + +/** + * @brief Returns if @p buft 
is a split buffer. + */ +static bool ggml_backend_buft_is_hsa_split(ggml_backend_buffer_type_t /* buft */) { return false; } + +// host buffer type + +static const char * ggml_backend_hsa_host_buffer_type_name(ggml_backend_buffer_type_t /* buft */) { + return GGML_HSA_NAME "_Host"; +} + +static void ggml_backend_hsa_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + // TODO free buffer + NOT_IMPLEMENTED(); +} + +static void * ggml_hsa_host_malloc(size_t size) { + // TODO allocate pinned memory + NOT_IMPLEMENTED(); + return nullptr; +} + +static ggml_backend_buffer_t +ggml_backend_hsa_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_hsa_host_malloc(size); + + if (ptr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + auto buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_hsa_host_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_hsa_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_hsa_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_hsa_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_hsa_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_hsa_reg(), 0), + /* .context = */ nullptr, + }; + + return &ggml_backend_hsa_buffer_type_host; +} + +//////////////////////////////////////////////////////////////////////////////// + +// backend + +/** + * @brief Returns the name of backend @p backend. 
+ */ +static const char * ggml_backend_hsa_get_name(ggml_backend_t backend) { + const auto & ctx = *static_cast(backend->context); + return ctx.name.c_str(); +} + +/** + * @brief Frees the resources associated with @p backend. + */ +static void ggml_backend_hsa_free(ggml_backend_t backend) { + auto * ctx = static_cast(backend->context); + delete ctx; + delete backend; +} + +/** + * @brief Returns the buffer type of the buffer of tensor @p tensor. + */ +static ggml_backend_buffer_type_t ggml_backend_hsa_get_tensor_buft(const ggml_tensor * tensor) { + return (ggml_is_view(tensor) ? tensor->view_src->buffer : tensor->buffer)->buft; +} + +/** + * @brief Set tensor data asynchronously. + * + * @param backend backend + * @param tensor destination tensor + * @param data source data + * @param offset offset in source data + * @param size size of source data, in bytes + */ +static void ggml_backend_hsa_set_tensor_async( + ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT((ggml_backend_hsa_get_tensor_buft(tensor) == + ggml_backend_dev_buffer_type(backend->device)) && + "unsupported buffer type"); + std::memcpy(static_cast(tensor->data) + offset, data, size); + GGML_UNUSED(backend); +} + +/** + * @brief Get tensor data asynchronously. + * + * @param backend backend + * @param tensor source tensor + * @param data pointer to destination buffer + * @param offset offset in source tensor data + * @param size size of source data, in bytes + */ +static void ggml_backend_hsa_get_tensor_async( + ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT((ggml_backend_hsa_get_tensor_buft(tensor) == + ggml_backend_dev_buffer_type(backend->device)) && + "unsupported buffer type"); + std::memcpy(data, static_cast(tensor->data) + offset, size); + GGML_UNUSED(backend); +} + +/** + * @brief Copy tensor data between buffers if possible. 
+ * + * The size of the data to be copied is inferred by the source tensor @p src. + * + * @param backend_src source backend + * @param backend_dst destination backend + * @param src source tensor + * @param dst destination tensor + * @return true if the copy operation succeeded, false otherwise. + */ +static bool ggml_backend_hsa_cpy_tensor_async(ggml_backend_t backend_src, + ggml_backend_t backend_dst, + const ggml_tensor * src, + ggml_tensor * dst) { + if (!ggml_backend_is_hsa(backend_src) || !ggml_backend_is_hsa(backend_dst)) { + return false; + } + if (!ggml_backend_buffer_is_hsa(src->buffer) || !ggml_backend_buffer_is_hsa(dst->buffer)) { + return false; + } + if (!ggml_is_contiguous(src) || !ggml_is_contiguous(dst)) { + return false; // only contiguous tensors supported + } + std::memcpy(dst->data, src->data, ggml_nbytes(dst)); + return true; +} + +static void ggml_backend_hsa_synchronize(ggml_backend_t backend) { + auto & ctx = *static_cast(backend->context); + ggml_hsa_wait_dispatches(ctx); +} + +static enum ggml_status ggml_backend_hsa_graph_compute(ggml_backend_t backend, + ggml_cgraph * cgraph) { + auto & ctx = *static_cast(backend->context); + ggml_status status = GGML_STATUS_SUCCESS; + + const std::int32_t node_count = ggml_graph_n_nodes(cgraph); + for (std::int32_t i = 0; (i < node_count) && (status == GGML_STATUS_SUCCESS); ++i) { + ggml_tensor * node = ggml_graph_node(cgraph, i); + + // early exit if operation does not require a dispatch + if (ggml_op_is_empty(node->op) || ggml_is_empty(node)) { + continue; + } + + switch (node->op) { + // implemented as host kernels, so no dispatch required + case GGML_OP_DUP: + status = ggml_hsa_compute_dup(ctx, node); + continue; + case GGML_OP_CPY: + status = ggml_hsa_compute_cpy(ctx, node); + continue; + case GGML_OP_CONT: + status = ggml_hsa_compute_cont(ctx, node); + continue; + default: + break; + } + + auto & tensor_extra = *static_cast(node->extra); + ggml_tensor & internal_node = tensor_extra.node.tensor; 
+ + if (tensor_extra.requires_sync) { + ggml_hsa_wait_dispatches(ctx); + for (auto src_idx = 0; src_idx < tensor_extra.nsrcs; ++src_idx) { + if (tensor_extra.src_nodes[src_idx].buffer_size == 0) { + continue; + } + // change layout and/or convert datatypes + if (status = ggml_hsa_copy_tensor(node->src[src_idx], internal_node.src[src_idx]); + status != GGML_STATUS_SUCCESS) { + GGML_HSA_LOG_ERROR("%s: failed to copy source %i for tensor \"%s (%s)\"", + __func__, src_idx, node->name, ggml_op_desc(node)); + return status; + } + } + } + + if (status = tensor_extra.kernel->dispatch(ctx, internal_node.src, tensor_extra.nsrcs, + internal_node); + status != GGML_STATUS_SUCCESS) { + GGML_HSA_LOG_ERROR("%s: failed to dispatch kernel for tensor \"%s\" (%s)", __func__, + node->name, ggml_op_desc(node)); + return status; + } + + if (tensor_extra.node.convert_dtype) { + // change layout and/or convert datatypes + ggml_hsa_wait_dispatches(ctx); + if (status = ggml_hsa_copy_tensor(&internal_node, node); + status != GGML_STATUS_SUCCESS) { + GGML_HSA_LOG_ERROR("%s: failed to copy back for tensor \"%s\" (%s)", __func__, + node->name, ggml_op_desc(node)); + return status; + } + } + } + + return status; +} + +static void ggml_backend_hsa_event_record(ggml_backend_t backend, ggml_backend_event_t event) { + NOT_IMPLEMENTED(); +} + +static void ggml_backend_hsa_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + NOT_IMPLEMENTED(); +} + +/** + * @brief Interface for managing HSA backends. 
+ */ +static const ggml_backend_i ggml_backend_hsa_interface = { + /* .get_name = */ ggml_backend_hsa_get_name, + /* .free = */ ggml_backend_hsa_free, + /* .set_tensor_async = */ ggml_backend_hsa_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_hsa_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_hsa_cpy_tensor_async, + /* .synchronize = */ ggml_backend_hsa_synchronize, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_update = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_hsa_graph_compute, + /* .event_record = */ ggml_backend_hsa_event_record, + /* .event_wait = */ ggml_backend_hsa_event_wait, + /* .graph_optimize = */ nullptr, +}; + +/** + * @brief Returns the unique identifier of the HSA backend. + * + * @note The identifier is a UUID v4 that was randomly generated. + */ +static ggml_guid_t ggml_backend_hsa_guid() { + static ggml_guid guid = {0xa2, 0xe9, 0xa0, 0x84, 0x2c, 0xf6, 0x4d, 0xa1, + 0xb3, 0xb2, 0xb1, 0xdc, 0x5d, 0x59, 0x21, 0x95}; + return &guid; +} + +/** + * @brief Returns if @p backend is an HSA backend. + */ +bool ggml_backend_is_hsa(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_hsa_guid()); +} + +/** + * @brief Returns if the number of devices (i.e., HSA agents) associated with the HSA backend. + */ +std::int32_t ggml_backend_hsa_get_device_count() { return ggml_hsa_info().device_count; } + +/** + * @brief Returns the device description of device @p device. + */ +void ggml_backend_hsa_get_device_description(std::int32_t device, + char * description, + size_t description_size) { + const auto & dev_info = ggml_hsa_get_device_info(device); + snprintf(description, description_size, "%s", dev_info.name.data()); +} + +/** + * @brief Returns the free and total memory in @p free and @p total respectively for device + * @p dev. 
+ */ +void ggml_backend_hsa_get_device_memory(std::int32_t device, size_t * free, size_t * total) { + const auto & dev_info = ggml_hsa_get_device_info(device); + *total = dev_info.data_memory.size; + // HSA does not report free memory, set it to total + *free = *total; +} + +bool ggml_backend_hsa_register_host_buffer(void * buffer, size_t size) { + NOT_IMPLEMENTED(); + return false; +} + +void ggml_backend_hsa_unregister_host_buffer(void * buffer) { NOT_IMPLEMENTED(); } + +// backend device + +/** + * @brief HSA device context. + */ +struct ggml_backend_hsa_device_context { + std::int32_t device; + std::string name; + std::string description; + + ggml_backend_hsa_device_context(std::int32_t device, hsa_agent_t agent) : + device(device), + name(ggml_hsa_format_name(device)), + description(ggml_hsa_agent_name(agent)) {} +}; + +static const char * ggml_backend_hsa_device_get_name(ggml_backend_dev_t dev) { + const auto & dev_ctx = *static_cast(dev->context); + return dev_ctx.name.c_str(); +} + +static const char * ggml_backend_hsa_device_get_description(ggml_backend_dev_t dev) { + const auto & dev_ctx = *static_cast(dev->context); + return dev_ctx.description.c_str(); +} + +/** + * @brief Returns the free and total memory in @p free and @p total respectively for device + * @p dev. + */ +static void +ggml_backend_hsa_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + const auto & dev_ctx = *static_cast(dev->context); + const auto & dev_info = ggml_hsa_get_device_info(dev_ctx.device); + *total = dev_info.data_memory.size; + // HSA does not report free memory, set it to total + *free = *total; +} + +/** + * @brief Returns the device type of @p dev. 
+ */ +static enum ggml_backend_dev_type ggml_backend_hsa_device_get_type(ggml_backend_dev_t dev) { + const auto & dev_ctx = *static_cast(dev->context); + const auto & dev_info = ggml_hsa_get_device_info(dev_ctx.device); + switch (dev_info.type) { + case HSA_DEVICE_TYPE_CPU: + return GGML_BACKEND_DEVICE_TYPE_CPU; + case HSA_DEVICE_TYPE_GPU: + return GGML_BACKEND_DEVICE_TYPE_GPU; + case HSA_DEVICE_TYPE_DSP: + case HSA_DEVICE_TYPE_AIE: + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + default: + GGML_ABORT("%s: unknown HSA device type %d", __func__, dev_info.type); + } +} + +static void ggml_backend_hsa_device_get_props(ggml_backend_dev_t dev, + ggml_backend_dev_props * props) { + props->name = ggml_backend_hsa_device_get_name(dev); + props->description = ggml_backend_hsa_device_get_description(dev); + props->type = ggml_backend_hsa_device_get_type(dev); + ggml_backend_hsa_device_get_memory(dev, &props->memory_free, &props->memory_total); + + props->caps = { + /* .async = */ true, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_hsa_device_init_backend(ggml_backend_dev_t dev, + const char * /*params*/) { + const auto & dev_ctx = *static_cast(dev->context); + return ggml_backend_hsa_init(dev_ctx.device); +} + +static ggml_backend_buffer_type_t ggml_backend_hsa_device_get_buffer_type(ggml_backend_dev_t dev) { + const auto & dev_ctx = *static_cast(dev->context); + return ggml_backend_hsa_buffer_type(dev_ctx.device); +} + +static ggml_backend_buffer_type_t +ggml_backend_hsa_device_get_host_buffer_type(ggml_backend_dev_t /*dev*/) { + return ggml_backend_hsa_host_buffer_type(); +} + +/** + * @brief Returns if the operation in tensor @p op is supported by device @p dev. 
+ */ +static bool ggml_backend_hsa_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + // early exit if operation does not require a kernel + if (ggml_op_is_empty(op->op)) { + return true; + } + + switch (op->op) { + // implemented as host kernels + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + return true; + default: + break; + } + + // check if tensor is already initialized with a valid kernel + if ((op->extra != nullptr) && + (static_cast(op->extra)->kernel != nullptr)) { + return true; + } + + // check if compilation artifacts exist or if the kernel can be compiled + const auto & dev_ctx = *static_cast(dev->context); + const auto & dev_info = ggml_hsa_get_device_info(dev_ctx.device); + try { + ggml_backend_hsa_tensor_extra tensor_extra{dev_info, *op}; + return (tensor_extra.kernel != nullptr); + } catch (const std::exception & ex) { + // exception is not fatal, it means that the op is not supported + GGML_HSA_LOG_WARN("%s: exception caught: %s", __func__, ex.what()); + return false; + } +} + +static bool ggml_backend_hsa_device_supports_buft(ggml_backend_dev_t dev, + ggml_backend_buffer_type_t buft) { + return (ggml_backend_buft_is_hsa(buft) || ggml_backend_buft_is_hsa_split(buft)) && + buft->device == dev; +} + +static std::int64_t get_op_batch_size(const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_GET_ROWS: + return 0; + case GGML_OP_MUL_MAT: + return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + case GGML_OP_ROPE: + return op->ne[2]; + default: + return ggml_nrows(op); + } +} + +static bool ggml_backend_hsa_device_offload_op(ggml_backend_dev_t /* dev */, + const ggml_tensor * op) { + const std::int64_t min_batch_size = 32; + return get_op_batch_size(op) >= min_batch_size; +} + +static ggml_backend_event_t ggml_backend_hsa_device_event_new(ggml_backend_dev_t dev) { + NOT_IMPLEMENTED(); + return nullptr; +} + +static void ggml_backend_hsa_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { + 
NOT_IMPLEMENTED(); +} + +static void ggml_backend_hsa_device_event_synchronize(ggml_backend_dev_t dev, + ggml_backend_event_t event) { + NOT_IMPLEMENTED(); +} + +/** + * @brief Interface for managing HSA devices. + */ +static const ggml_backend_device_i ggml_backend_hsa_device_interface = { + /* .get_name = */ ggml_backend_hsa_device_get_name, + /* .get_description = */ ggml_backend_hsa_device_get_description, + /* .get_memory = */ ggml_backend_hsa_device_get_memory, + /* .get_type = */ ggml_backend_hsa_device_get_type, + /* .get_props = */ ggml_backend_hsa_device_get_props, + /* .init_backend = */ ggml_backend_hsa_device_init_backend, + /* .get_buffer_type = */ ggml_backend_hsa_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_hsa_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ nullptr, + /* .supports_op = */ ggml_backend_hsa_device_supports_op, + /* .supports_buft = */ ggml_backend_hsa_device_supports_buft, + /* .offload_op = */ ggml_backend_hsa_device_offload_op, + /* .event_new = */ ggml_backend_hsa_device_event_new, + /* .event_free = */ ggml_backend_hsa_device_event_free, + /* .event_synchronize = */ ggml_backend_hsa_device_event_synchronize, +}; + +// backend reg + +/** + * @brief HSA registration context. 
+ */ +struct ggml_backend_hsa_reg_context { + static inline const char * name = GGML_HSA_NAME; + std::vector devices; + std::vector features; +}; + +static const char * ggml_backend_hsa_reg_get_name(ggml_backend_reg_t /* reg */) { + return ggml_backend_hsa_reg_context::name; +} + +static size_t ggml_backend_hsa_reg_get_device_count(ggml_backend_reg_t reg) { + const auto & reg_ctx = *static_cast(reg->context); + return reg_ctx.devices.size(); +} + +static ggml_backend_dev_t ggml_backend_hsa_reg_get_device(ggml_backend_reg_t reg, size_t index) { + const auto & reg_ctx = *static_cast(reg->context); + GGML_ASSERT(index < reg_ctx.devices.size()); + return reg_ctx.devices[index]; +} + +static ggml_backend_feature * ggml_backend_hsa_get_features(ggml_backend_reg_t reg) { + auto & reg_ctx = *static_cast(reg->context); + return reg_ctx.features.data(); +} + +static void * ggml_backend_hsa_reg_get_proc_address(ggml_backend_reg_t /* reg */, + const char * name) { + if (strcmp(name, "ggml_backend_register_host_buffer") == 0) { + return reinterpret_cast(ggml_backend_hsa_register_host_buffer); + } + if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { + return reinterpret_cast(ggml_backend_hsa_unregister_host_buffer); + } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return reinterpret_cast(ggml_backend_hsa_get_features); + } + return nullptr; +} + +/** + * @brief Interface for managing HSA registration. 
+ */ +static const ggml_backend_reg_i ggml_backend_hsa_reg_interface = { + /* .get_name = */ ggml_backend_hsa_reg_get_name, + /* .get_device_count = */ ggml_backend_hsa_reg_get_device_count, + /* .get_device = */ ggml_backend_hsa_reg_get_device, + /* .get_proc_address = */ ggml_backend_hsa_reg_get_proc_address, +}; + +// backend registry + +static struct { + ggml_backend_reg reg; + std::once_flag flag; +} ggml_backend_hsa_reg_metadata; + +ggml_backend_reg_t ggml_backend_hsa_reg() try { + std::call_once(ggml_backend_hsa_reg_metadata.flag, [] { + const auto & info = ggml_hsa_info(); + + auto * reg_ctx = new ggml_backend_hsa_reg_context; + + reg_ctx->devices.reserve(info.device_count); + for (std::int32_t i = 0; i < info.device_count; i++) { + auto * dev_ctx = new ggml_backend_hsa_device_context{i, info.devices[i].agent}; + + auto dev = new ggml_backend_device{/* .iface = */ ggml_backend_hsa_device_interface, + /* .reg = */ &ggml_backend_hsa_reg_metadata.reg, + /* .context = */ dev_ctx}; + reg_ctx->devices.push_back(dev); + } + + ggml_backend_hsa_reg_metadata.reg = + ggml_backend_reg{/* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_hsa_reg_interface, + /* .context = */ reg_ctx}; + }); + + return &ggml_backend_hsa_reg_metadata.reg; +} catch (const std::exception & ex) { + GGML_HSA_LOG_ERROR("%s: exception caught: %s", __func__, ex.what()); + return nullptr; +} + +ggml_backend_t ggml_backend_hsa_init(std::int32_t device) { + const auto & info = ggml_hsa_info(); + + if (device < 0 || device >= info.device_count) { + GGML_HSA_LOG_ERROR("%s: invalid device ID %d", __func__, device); + return nullptr; + } + + try { + auto * ctx = new ggml_backend_hsa_context{info.devices[device]}; + + ggml_backend_t hsa_backend = new ggml_backend{ + /* .guid = */ ggml_backend_hsa_guid(), + /* .interface = */ ggml_backend_hsa_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_hsa_reg(), device), + /* .context = */ ctx, + }; + + return hsa_backend; 
+ } catch (const std::exception & ex) { + GGML_HSA_LOG_ERROR("%s: exception caught: %s", __func__, ex.what()); + return nullptr; + } +} + +GGML_BACKEND_DL_IMPL(ggml_backend_hsa_reg) diff --git a/src/ggml-hsa/host-ops.cpp b/src/ggml-hsa/host-ops.cpp new file mode 100644 index 0000000000..df82e6da72 --- /dev/null +++ b/src/ggml-hsa/host-ops.cpp @@ -0,0 +1,257 @@ +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + +#include "ggml-hsa/host-ops.hpp" + +#include +#include +#include + +#include "ggml-hsa/common.hpp" +#include "ggml-hsa/type-traits.hpp" + +/** + * @brief Copies the data from the source tensor to a destination tensor with the same shape. + * + * This function handles different types of tensors and performs necessary conversions + * based on the type traits defined for each tensor type. + */ +struct ggml_hsa_copy_same_shape_tensors_f { + template + ggml_status operator()(const ggml_tensor * src, ggml_tensor * dst) { + assert(ggml_are_same_shape(src, dst)); + + using src_traits = ggml_hsa_type_traits; + using dst_traits = ggml_hsa_type_traits; + + using src_type = typename src_traits::type; + using dst_type = typename dst_traits::type; + + for (std::int64_t i03 = 0; i03 < src->ne[3]; ++i03) { + for (std::int64_t i02 = 0; i02 < src->ne[2]; ++i02) { + for (std::int64_t i01 = 0; i01 < src->ne[1]; ++i01) { + for (std::int64_t i00 = 0; i00 < src->ne[0]; ++i00) { + auto src_ptr = std::launder(reinterpret_cast( + static_cast(src->data) + + (i00 * src->nb[0] + i01 * src->nb[1] + i02 * src->nb[2] + + i03 * src->nb[3]))); + auto dst_ptr = std::launder( + reinterpret_cast(static_cast(dst->data) + + (i00 * dst->nb[0] + i01 * dst->nb[1] + + i02 * dst->nb[2] + i03 * dst->nb[3]))); + + if constexpr (SrcT == DstT) { + // no conversion needed + *dst_ptr = *src_ptr; + } else if constexpr (src_traits::is_fundamental && + dst_traits::is_fundamental) { + // trivial conversion based on fundamental types + *dst_ptr = static_cast(*src_ptr); + } else if 
constexpr (src_traits::is_fundamental) { + // conversion using promotion of source type to fp32 + auto src_v = static_cast(*src_ptr); + *dst_ptr = dst_traits::from_fp32(src_v); + } else if constexpr (dst_traits::is_fundamental) { + // conversion using promotion of destination type to fp32 + auto src_v = src_traits::to_fp32(*src_ptr); + *dst_ptr = static_cast(src_v); + } else { + // conversion using promotion of source and destination types to fp32 + auto src_v = src_traits::to_fp32(*src_ptr); + *dst_ptr = dst_traits::from_fp32(src_v); + } + } + } + } + } + return GGML_STATUS_SUCCESS; + } +}; + +/** + * @brief Copies the data from the source tensor to a contiguous destination tensor. + * + * This function handles different types of tensors and performs necessary conversions + * based on the type traits defined for each tensor type. + */ +struct ggml_hsa_copy_tensor_to_cont_tensor_f { + template + ggml_status operator()(const ggml_tensor * src, ggml_tensor * dst) { + assert((ggml_nelements(src) == ggml_nelements(dst)) && ggml_is_contiguous(dst)); + + using src_traits = ggml_hsa_type_traits; + using dst_traits = ggml_hsa_type_traits; + + using src_type = typename src_traits::type; + using dst_type = typename dst_traits::type; + + auto dst_ptr = std::launder(static_cast(dst->data)); + + std::int64_t id = 0; + for (std::int64_t i03 = 0; i03 < src->ne[3]; ++i03) { + for (std::int64_t i02 = 0; i02 < src->ne[2]; ++i02) { + for (std::int64_t i01 = 0; i01 < src->ne[1]; ++i01) { + for (std::int64_t i00 = 0; i00 < src->ne[0]; ++i00) { + auto src_ptr = std::launder(reinterpret_cast( + static_cast(src->data) + + (i00 * src->nb[0] + i01 * src->nb[1] + i02 * src->nb[2] + + i03 * src->nb[3]))); + if constexpr (SrcT == DstT) { + // no conversion needed + dst_ptr[id] = *src_ptr; + } else if constexpr (src_traits::is_fundamental && + dst_traits::is_fundamental) { + // trivial conversion based on fundamental types + dst_ptr[id] = static_cast(*src_ptr); + } else if constexpr 
(src_traits::is_fundamental) { + // conversion using promotion of source type to fp32 + auto src_v = static_cast(*src_ptr); + dst_ptr[id] = dst_traits::from_fp32(src_v); + } else if constexpr (dst_traits::is_fundamental) { + // conversion using promotion of destination type to fp32 + auto src_v = src_traits::to_fp32(*src_ptr); + dst_ptr[id] = static_cast(src_v); + } else { + // conversion using promotion of source and destination types to fp32 + auto src_v = src_traits::to_fp32(*src_ptr); + dst_ptr[id] = dst_traits::from_fp32(src_v); + } + ++id; + } + } + } + } + return GGML_STATUS_SUCCESS; + } +}; + +/** + * @brief Assigns @p src to @p dst using @p f as the copy operation. + */ +template +ggml_status ggml_hsa_assign(F && f, const ggml_tensor * src, ggml_tensor * dst) { + switch (src->type) { + case GGML_TYPE_F32: + switch (dst->type) { + case GGML_TYPE_F32: + return std::forward(f).template operator()(src, dst); + case GGML_TYPE_F16: + return std::forward(f).template operator()( + src, dst); + case GGML_TYPE_BF16: + return std::forward(f).template operator()( + src, dst); + default: + GGML_HSA_LOG_ERROR("%s: unsupported type for destination tensor \"%s\" (%s)", + __func__, dst->name, ggml_type_name(dst->type)); + return GGML_STATUS_FAILED; + } + case GGML_TYPE_F16: + switch (dst->type) { + case GGML_TYPE_F32: + return std::forward(f).template operator()( + src, dst); + case GGML_TYPE_F16: + return std::forward(f).template operator()(src, dst); + case GGML_TYPE_BF16: + return std::forward(f).template operator()( + src, dst); + default: + GGML_HSA_LOG_ERROR("%s: unsupported type for destination tensor \"%s\" (%s)", + __func__, dst->name, ggml_type_name(dst->type)); + return GGML_STATUS_FAILED; + } + case GGML_TYPE_I16: + switch (dst->type) { + case GGML_TYPE_I8: + return std::forward(f).template operator()(src, + dst); + case GGML_TYPE_I16: + return std::forward(f).template operator()(src, dst); + case GGML_TYPE_I32: + return std::forward(f).template operator()( + 
src, dst); + default: + GGML_HSA_LOG_ERROR("%s: unsupported type for destination tensor \"%s\" (%s)", + __func__, dst->name, ggml_type_name(dst->type)); + return GGML_STATUS_FAILED; + } + case GGML_TYPE_BF16: + switch (dst->type) { + case GGML_TYPE_F32: + return std::forward(f).template operator()( + src, dst); + case GGML_TYPE_F16: + return std::forward(f).template operator()( + src, dst); + case GGML_TYPE_BF16: + return std::forward(f).template operator()(src, dst); + default: + GGML_HSA_LOG_ERROR("%s: unsupported type for destination tensor \"%s\" (%s)", + __func__, dst->name, ggml_type_name(dst->type)); + return GGML_STATUS_FAILED; + } + default: + GGML_HSA_LOG_ERROR("%s: unsupported type for source tensor \"%s\" (%s)", __func__, + src->name, ggml_type_name(src->type)); + return GGML_STATUS_FAILED; + } +} + +ggml_status ggml_hsa_copy_tensor(const ggml_tensor * src, ggml_tensor * dst) { + if (ggml_is_contiguous(dst)) { + return ggml_hsa_assign(ggml_hsa_copy_tensor_to_cont_tensor_f{}, src, dst); + } + + if (ggml_are_same_shape(src, dst)) { + return ggml_hsa_assign(ggml_hsa_copy_same_shape_tensors_f{}, src, dst); + } + + GGML_HSA_LOG_ERROR("%s: unsupported tensor combination between source \"%s\" (%s) and " + "destination tensors \"%s\" (%s)", + __func__, src->name, ggml_op_desc(src), dst->name, ggml_op_desc(dst)); + return GGML_STATUS_FAILED; +} + +ggml_status ggml_hsa_compute_dup(ggml_backend_hsa_context & ctx, ggml_tensor * t) { + assert((ggml_hsa_nsrcs(*t) == 1) && (t->type == t->src[0]->type) && + ggml_are_same_shape(t, t->src[0])); + + auto * src = t->src[0]; + auto * dst = t; + + if (dst->view_src == src) { + // destination tensor is a view of the source tensor + return GGML_STATUS_SUCCESS; + } + + ggml_hsa_wait_dispatches(ctx); + + if (ggml_is_contiguous(dst)) { + return ggml_hsa_assign(ggml_hsa_copy_tensor_to_cont_tensor_f{}, src, dst); + } + + return ggml_hsa_assign(ggml_hsa_copy_same_shape_tensors_f{}, src, dst); +} + +ggml_status 
ggml_hsa_compute_cpy(ggml_backend_hsa_context & ctx, ggml_tensor * t) { + assert((ggml_hsa_nsrcs(*t) == 2) && (ggml_nelements(t->src[0]) == ggml_nelements(t->src[1]))); + + auto * src = t->src[0]; + auto * dst = t->src[1]; + + ggml_hsa_wait_dispatches(ctx); + + return ggml_hsa_copy_tensor(src, dst); +} + +ggml_status ggml_hsa_compute_cont(ggml_backend_hsa_context & ctx, ggml_tensor * t) { + assert((ggml_hsa_nsrcs(*t) == 1) && (t->type == t->src[0]->type) && + (ggml_nelements(t) == ggml_nelements(t->src[0])) && ggml_is_contiguous(t)); + + auto * src = t->src[0]; + auto * dst = t; + + ggml_hsa_wait_dispatches(ctx); + + return ggml_hsa_assign(ggml_hsa_copy_tensor_to_cont_tensor_f{}, src, dst); +} diff --git a/src/ggml-hsa/host-ops.hpp b/src/ggml-hsa/host-ops.hpp new file mode 100644 index 0000000000..66b218eee6 --- /dev/null +++ b/src/ggml-hsa/host-ops.hpp @@ -0,0 +1,40 @@ +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + +#pragma once + +#include "ggml-hsa/common.hpp" + +#include "ggml.h" + +/** + * @brief Copy the contents of tensor @p src to tensor @p dst. + * + * @warning This function does not synchronize with any device. + * + * @param[in] src tensor to copy from + * @param[in,out] dst tensor to copy to + */ +ggml_status ggml_hsa_copy_tensor(const ggml_tensor * src, ggml_tensor * dst); + +/** + * @brief Duplicate tensor @c t->src[0] to @p t without changing the shape or the datatype of the + * tensor. + * + * @note @p t may be a view of @c t->src[0], in which case the operation is a no-op. + */ +ggml_status ggml_hsa_compute_dup(ggml_backend_hsa_context & ctx, ggml_tensor * t); + +/** + * @brief Copy tensor @c t->src[0] to @c t->src[1] without changing the layout of the tensor. + * + * @note This operation may change the datatype between @c t->src[0] and @c t->src[1] (e.g., + * @ref ggml_cast is a special case of this operation).
+ */ +ggml_status ggml_hsa_compute_cpy(ggml_backend_hsa_context & ctx, ggml_tensor * t); + +/** + * @brief Store a copy of @c t->src[0] to @p t, where @p t has contiguous storage. + * + * @note This operation may change the layout of @c t->src[0] but it does not change the datatype. + */ +ggml_status ggml_hsa_compute_cont(ggml_backend_hsa_context & ctx, ggml_tensor * t); diff --git a/src/ggml-hsa/kernel-discovery.cpp b/src/ggml-hsa/kernel-discovery.cpp new file mode 100644 index 0000000000..dbe1ec5729 --- /dev/null +++ b/src/ggml-hsa/kernel-discovery.cpp @@ -0,0 +1,267 @@ +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + +#include "ggml-hsa/kernel-discovery.hpp" + +#include +#include +#include +#include +#include +#include + +#include "ggml-hsa/aie-kernel.hpp" +#include "ggml-impl.h" +#ifdef GGML_HSA_JIT_COMPILE +#include "ggml-hsa/aie-kernel-compiler.hpp" +#endif + +namespace fs = std::filesystem; + +/** + * @brief Returns the precompiled kernel directory. + */ +static fs::path ggml_hsa_precompiled_kernel_dir() { + if (const char * kernel_dir = std::getenv("GGML_HSA_KERNEL_DIR"); kernel_dir != nullptr) { + auto dir = fs::path(kernel_dir); + if (!fs::is_directory(dir)) { + GGML_ABORT("%s: GGML_HSA_KERNEL_DIR (%s) is not a valid directory.\n", __func__, + dir.c_str()); + } + return dir; + } + GGML_HSA_LOG_INFO("%s: no pregenerated kernel directory defined.", __func__); + return fs::path{}; +} + +/// Precompiled kernel directory. +static const fs::path kernel_dir = ggml_hsa_precompiled_kernel_dir(); + +/** + * @brief Returns the cached kernel directory and clears it if requested. + * + * Cached kernels are stored in the following directories: + * 1. GGML_HSA_KERNEL_CACHE_DIR if defined, or + * 2. $XDG_CACHE_HOME/ggml if XDG_CACHE_HOME is defined, or, + * 3. $HOME/.cache/ggml if HOME is defined, or + * 4. /tmp/ggml/ggml-hsa otherwise. 
+ */ +static fs::path ggml_hsa_cached_kernel_dir() { + fs::path cache_dir; + if (const char * base_dir = std::getenv("GGML_HSA_KERNEL_CACHE_DIR"); base_dir != nullptr) { + cache_dir = fs::path(base_dir); + } else if (const char * base_dir = std::getenv("XDG_CACHE_HOME"); base_dir != nullptr) { + cache_dir = fs::path(base_dir) / "ggml"; + } else if (const char * base_dir = std::getenv("HOME"); base_dir != nullptr) { + cache_dir = fs::path(base_dir) / ".cache/ggml"; + } else { + cache_dir = fs::path("/tmp/ggml/ggml-hsa"); + } + GGML_HSA_LOG_INFO("%s: cached kernels in %s", __func__, cache_dir.c_str()); + + if (const char * clear_cache = std::getenv("GGML_HSA_KERNEL_CACHE_CLEAR"); + clear_cache != nullptr && ggml_hsa_string_to_bool(clear_cache)) { + GGML_HSA_LOG_INFO("%s: clearing kernel cache in %s", __func__, cache_dir.c_str()); + fs::remove_all(cache_dir); + } + + return cache_dir; +} + +/// Cached (i.e., JIT compiled) kernel directory. +static const fs::path cached_kernel_dir = ggml_hsa_cached_kernel_dir(); + +/// PDI file suffix. +static constexpr std::string_view pdi_file_suffix = ".pdi"; + +/// Binary instructions file suffix. +static constexpr std::string_view inst_file_suffix = "_insts.bin"; + +/** + * @brief Returns true if @p p is a regular file or a symbolic link. + */ +static bool ggml_hsa_is_file(const fs::path & p) { + return fs::is_regular_file(p) || fs::is_symlink(p); +} + +/** + * @brief Returns true if the files for a @ref ggml_hsa_aie_kernel exist in any of the directories.
+ */ +static bool ggml_hsa_find_aie_kernel_files(const std::string & device_name, + const std::string & kernel_name, + fs::path & pdi_path, + fs::path & insts_path) { + const auto partial_path = fs::path(device_name).append(kernel_name); + const auto partial_pdi_path = fs::path(partial_path).concat(pdi_file_suffix); + const auto partial_insts_path = fs::path(partial_path).concat(inst_file_suffix); + + if (!kernel_dir.empty()) { + // find kernel in pregenerated kernel directory + auto tmp_pdi_path = kernel_dir / partial_pdi_path; + auto tmp_insts_path = kernel_dir / partial_insts_path; + if (ggml_hsa_is_file(tmp_pdi_path) && ggml_hsa_is_file(tmp_insts_path)) { + pdi_path = std::move(tmp_pdi_path); + insts_path = std::move(tmp_insts_path); + return true; + } + } + + // find kernel in cached kernel directory + auto tmp_pdi_path = cached_kernel_dir / partial_pdi_path; + auto tmp_insts_path = cached_kernel_dir / partial_insts_path; + if (ggml_hsa_is_file(tmp_pdi_path) && ggml_hsa_is_file(tmp_insts_path)) { + pdi_path = std::move(tmp_pdi_path); + insts_path = std::move(tmp_insts_path); + return true; + } + + // kernel not found + return false; +} + +/** + * @brief Reads a PDI file from @p path and returns its contents and size in bytes in @p buffer. 
+ */ +static ggml_status +ggml_hsa_load_pdi(hsa_amd_memory_pool_t pool, const fs::path & path, ggml_hsa_pdi_buffer & buffer) { + std::ifstream is(path, std::ios::binary | std::ios::ate | std::ios::in); + if (is.fail()) { + GGML_HSA_LOG_ERROR("%s: could not open file %s", __func__, path.c_str()); + return GGML_STATUS_FAILED; + } + + const std::size_t size = is.tellg(); + if (!is.seekg(0, std::ios::beg) || (size == 0)) { + GGML_HSA_LOG_ERROR("%s: could not get file size for %s", __func__, path.c_str()); + return GGML_STATUS_FAILED; + } + + void * ptr = nullptr; + if (auto status = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr); + status != HSA_STATUS_SUCCESS) { + GGML_HSA_LOG_ERROR("%s: failed to allocate %zu bytes (%s)", __func__, size, + ggml_hsa_get_status_string(status)); + return GGML_STATUS_ALLOC_FAILED; + } + + buffer = ggml_hsa_pdi_buffer{reinterpret_cast(ptr)}; + + is.read(reinterpret_cast(buffer.data()), size); + + return GGML_STATUS_SUCCESS; +} + +/** + * @brief Reads an instruction file from @p path and returns its contents and number of instructions + * in @p buffer. 
+ */ +static ggml_status ggml_hsa_load_insts(hsa_amd_memory_pool_t pool, + const fs::path & path, + ggml_hsa_insts_buffer & buffer) { + std::ifstream is(path, std::ios::binary | std::ios::ate | std::ios::in); + if (is.fail()) { + GGML_HSA_LOG_ERROR("%s: could not open file %s", __func__, path.c_str()); + return GGML_STATUS_FAILED; + } + + const std::size_t size = is.tellg(); + if (!is.seekg(0, std::ios::beg) || (size == 0)) { + GGML_HSA_LOG_ERROR("%s: could not get file size for %s", __func__, path.c_str()); + return GGML_STATUS_FAILED; + } + + if (size % sizeof(std::uint32_t) != 0) { + GGML_HSA_LOG_ERROR("%s: file size %zu bytes is not a multiple of %zu bytes", __func__, size, + sizeof(std::uint32_t)); + return GGML_STATUS_FAILED; + } + + void * ptr = nullptr; + if (auto status = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr); + status != HSA_STATUS_SUCCESS) { + GGML_HSA_LOG_ERROR("%s: failed to allocate %zu bytes (%s)", __func__, size, + ggml_hsa_get_status_string(status)); + return GGML_STATUS_ALLOC_FAILED; + } + + buffer = ggml_hsa_insts_buffer{reinterpret_cast(ptr), + (size / sizeof(std::uint32_t))}; + + is.read(reinterpret_cast(buffer.data()), size); + + return GGML_STATUS_SUCCESS; +} + +/** + * @brief Creates the kernel for the tensor's operation. + * + * This function will try the following until one succeeds in order of priority: + * -# load the kernel from a precompiled kernel directory, + * -# load the kernel from a cached kernel directory, + * -# compile the kernel, store it to the cached kernel directory, and load it. + * If none of the above succeeds, an error status is returned.
+ * + * @param[in] dev_info device information + * @param[in] kernel_name kernel name + * @param[in] tensor tensor to find the kernel for + * @param[out] kernel kernel for the operation of @p tensor + */ +static ggml_status ggml_hsa_create_aie_kernel(const ggml_hsa_device_info::device_info & dev_info, + const std::string & kernel_name, + const ggml_tensor & tensor, + std::shared_ptr & kernel) { + fs::path pdi_path; + fs::path insts_path; + + // search for kernel files + if (!ggml_hsa_find_aie_kernel_files(dev_info.name, kernel_name, pdi_path, insts_path)) { +#ifdef GGML_HSA_JIT_COMPILE + // kernel files not found, compile kernel + if (auto status = + ggml_hsa_compile_aie_kernel(dev_info, tensor, kernel_name, cached_kernel_dir); + status != GGML_STATUS_SUCCESS) { + return status; + } + + // search for kernel files after compilation + if (!ggml_hsa_find_aie_kernel_files(dev_info.name, kernel_name, pdi_path, insts_path)) { + return GGML_STATUS_FAILED; + } +#else + GGML_HSA_LOG_INFO("%s: JIT compilation is disabled, kernel cannot be compiled", __func__); + return GGML_STATUS_FAILED; +#endif + } + + auto aie_kernel = std::make_shared(); + + // load PDI and instructions + if (auto status = ggml_hsa_load_pdi(dev_info.dev_memory.memory_pool, pdi_path, aie_kernel->pdi); + status != GGML_STATUS_SUCCESS) { + return status; + } + + if (auto status = + ggml_hsa_load_insts(dev_info.dev_memory.memory_pool, insts_path, aie_kernel->insts); + status != GGML_STATUS_SUCCESS) { + return status; + } + + kernel = std::move(aie_kernel); + + return GGML_STATUS_SUCCESS; +} + +ggml_status ggml_hsa_create_kernel(const ggml_hsa_device_info::device_info & dev_info, + const std::string & kernel_name, + const ggml_tensor & tensor, + std::shared_ptr & kernel) { + switch (dev_info.type) { + case HSA_DEVICE_TYPE_AIE: + return ggml_hsa_create_aie_kernel(dev_info, kernel_name, tensor, kernel); + + // unsupported device types + default: + GGML_HSA_LOG_ERROR("%s: unsupported device %s", __func__, 
dev_info.name.c_str()); + return GGML_STATUS_FAILED; + } +} diff --git a/src/ggml-hsa/kernel-discovery.hpp b/src/ggml-hsa/kernel-discovery.hpp new file mode 100644 index 0000000000..74b9cacd88 --- /dev/null +++ b/src/ggml-hsa/kernel-discovery.hpp @@ -0,0 +1,27 @@ +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + +#pragma once + +#include "ggml-hsa/common.hpp" +#include "ggml.h" + +#include + +/** + * @brief Creates the kernel for the tensor's operation. + * + * This function may try different approaches until one succeeds: + * -# load the kernel from a precompiled kernel directory, + * -# load the kernel from a cached kernel directory, + * -# compile the kernel, store it to the cached kernel directory, and load it. + * If none of the above succeeds, an error status is returned. + * + * @param[in] dev_info device information + * @param[in] kernel_name kernel name + * @param[in] tensor tensor to find the kernel for + * @param[out] kernel kernel for the operation of @p tensor + */ +ggml_status ggml_hsa_create_kernel(const ggml_hsa_device_info::device_info & dev_info, + const std::string & kernel_name, + const ggml_tensor & tensor, + std::shared_ptr & kernel); diff --git a/src/ggml-hsa/kernels/CMakeLists.txt b/src/ggml-hsa/kernels/CMakeLists.txt new file mode 100644 index 0000000000..6c638a1347 --- /dev/null +++ b/src/ggml-hsa/kernels/CMakeLists.txt @@ -0,0 +1,39 @@ +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+ +include(ggml_hsa_utils) + +if (GGML_HSA_JIT_COMPILE) + add_subdirectory(iron) + + # copy root package files + set(JIT_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/__init__.py + ${CMAKE_CURRENT_SOURCE_DIR}/argmax.py + ${CMAKE_CURRENT_SOURCE_DIR}/binary_ops.py + ${CMAKE_CURRENT_SOURCE_DIR}/build_iron.py + ${CMAKE_CURRENT_SOURCE_DIR}/build.py + ${CMAKE_CURRENT_SOURCE_DIR}/clamp.py + ${CMAKE_CURRENT_SOURCE_DIR}/count_equal.py + ${CMAKE_CURRENT_SOURCE_DIR}/cross_entropy_loss.py + ${CMAKE_CURRENT_SOURCE_DIR}/kernel.py + ${CMAKE_CURRENT_SOURCE_DIR}/mul_mat.py + ${CMAKE_CURRENT_SOURCE_DIR}/scale.py + ${CMAKE_CURRENT_SOURCE_DIR}/soft_max.py + ${CMAKE_CURRENT_SOURCE_DIR}/tensor_desc.py + ${CMAKE_CURRENT_SOURCE_DIR}/unary_ops.py + ) + ggml_hsa_copy_files(ggml_hsa_copy_jit_files + FILES + ${JIT_FILES} + DESTINATION + ${CMAKE_CURRENT_BINARY_DIR} + ) + add_dependencies(ggml-hsa ggml_hsa_copy_jit_files) + install( + FILES + ${JIT_FILES} + DESTINATION + lib/kernels + ) +endif () + diff --git a/src/ggml-hsa/kernels/__init__.py b/src/ggml-hsa/kernels/__init__.py new file mode 100644 index 0000000000..7e60dfd6c9 --- /dev/null +++ b/src/ggml-hsa/kernels/__init__.py @@ -0,0 +1,20 @@ +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +""" +GGML HSA Kernels package. + +This package provides IRON-based kernel implementations for GGML operations +targeting AMD AIE (AI Engine) devices. It exposes the main compilation function +and tensor descriptor utilities needed for JIT kernel compilation. 
+""" + +from .build import ggml_compile_op +from .kernel import Kernel +from .tensor_desc import TensorDesc, ggml_tensor_to_tensordesc + +__all__ = [ + "Kernel", + "ggml_compile_op", + "TensorDesc", + "ggml_tensor_to_tensordesc", +] diff --git a/src/ggml-hsa/kernels/argmax.py b/src/ggml-hsa/kernels/argmax.py new file mode 100644 index 0000000000..6363403692 --- /dev/null +++ b/src/ggml-hsa/kernels/argmax.py @@ -0,0 +1,50 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +""" +Top-level entry point for the GGML argmax operation (GGML_OP_ARGMAX). + +Returns a KernelSpec specifying the compilation backend and kernel function. +""" + +from .iron.argmax import argmax_op +from .kernel import Backend, KernelSpec + + +def ggml_op_argmax( + arch: str, input_tensors: list, output_tensor, op_params: bytearray +) -> KernelSpec: + """ + GGML_OP_ARGMAX dispatch function. + + Finds the index of the maximum value along the first dimension (ne0) of each row. + For a tensor with shape [ne0, ne1, ne2, ne3], computes argmax over ne0 for each + of the ne1 * ne2 * ne3 rows, producing an I32 output tensor with shape [ne1, ne2, ne3]. + + Parameters: + arch (str): Target architecture (e.g., "aie2" for Phoenix/Hawk Point, + "aie2p" for Strix Halo/Krackan). + input_tensors (list[TensorDesc]): List containing exactly one input tensor + descriptor. The tensor must be F32 type and contiguous in memory. + output_tensor (TensorDesc): Output tensor descriptor of type I32. Shape is + the input shape with the first dimension removed. + op_params (bytearray): Operation parameters as a 64-byte buffer (unused + for ARGMAX, but required by the dispatch interface). 
+ + Returns: + KernelSpec: Kernel specification with backend=IRON and the argmax_op function + for generating the MLIR module. + """ + return KernelSpec( + backend=Backend.IRON, + op_name="GGML_OP_ARGMAX", + arch=arch, + input_tensors=input_tensors, + output_tensor=output_tensor, + op_params=op_params, + function=argmax_op, + ) diff --git a/src/ggml-hsa/kernels/binary_ops.py b/src/ggml-hsa/kernels/binary_ops.py new file mode 100644 index 0000000000..269ed5a7d7 --- /dev/null +++ b/src/ggml-hsa/kernels/binary_ops.py @@ -0,0 +1,161 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +""" +Top-level entry points for GGML binary operations (GGML_OP_ADD, GGML_OP_SUB, +GGML_OP_MUL, GGML_OP_DIV). +""" + +from functools import partial + +from .iron.binary_ops import binary_op +from .kernel import Backend, KernelSpec + + +def _iron_binary_kernel( + op_name: str, + arch: str, + input_tensors: list, + output_tensor, + op_params: bytearray, +): + """ + Wrapper for IRON binary operations matching the KernelFunction protocol. + + Parameters: + op_name (str): Name of the binary operation. + arch (str): Target architecture. + input_tensors (list): List of two input tensors. + output_tensor (TensorDesc): Output tensor. + op_params (bytearray): Operation parameters (unused for binary ops). + + Returns: + MLIR module for the binary operation. + """ + return binary_op( + arch=arch, + op_name=op_name, + input_tensors=input_tensors, + output_tensor=output_tensor, + ) + + +def _make_binary_kernel_spec( + arch: str, + input_tensors: list, + output_tensor, + op_params: bytearray, + op_name: str, +) -> KernelSpec: + """ + Create a KernelSpec for a binary operation. + + Parameters: + arch (str): Target architecture. + input_tensors (list): List of two input tensors. 
+ output_tensor (TensorDesc): Output tensor. + op_params (bytearray): Operation parameters. + op_name (str): Name of the operation. + + Returns: + KernelSpec configured for IRON backend. + + Raises: + ValueError: If input_tensors does not contain exactly two tensors. + """ + if len(input_tensors) != 2: + raise ValueError("Operation requires exactly two input tensors.") + + return KernelSpec( + backend=Backend.IRON, + op_name=op_name, + arch=arch, + input_tensors=input_tensors, + output_tensor=output_tensor, + op_params=op_params, + function=partial(_iron_binary_kernel, op_name=op_name), + ) + + +def ggml_op_add( + arch: str, input_tensors: list, output_tensor, op_params: bytearray +) -> KernelSpec: + """ + GGML_OP_ADD implementation. + + Parameters: + arch (str): Target architecture. + input_tensors (list): List of two input tensors. + output_tensor (TensorDesc): Output tensor. + op_params (bytearray): Operation parameters. + + Returns: + KernelSpec for the ADD operation. + """ + return _make_binary_kernel_spec( + arch, input_tensors, output_tensor, op_params, "GGML_OP_ADD" + ) + + +def ggml_op_sub( + arch: str, input_tensors: list, output_tensor, op_params: bytearray +) -> KernelSpec: + """ + GGML_OP_SUB implementation. + + Parameters: + arch (str): Target architecture. + input_tensors (list): List of two input tensors. + output_tensor (TensorDesc): Output tensor. + op_params (bytearray): Operation parameters. + + Returns: + KernelSpec for the SUB operation. + """ + return _make_binary_kernel_spec( + arch, input_tensors, output_tensor, op_params, "GGML_OP_SUB" + ) + + +def ggml_op_mul( + arch: str, input_tensors: list, output_tensor, op_params: bytearray +) -> KernelSpec: + """ + GGML_OP_MUL implementation. + + Parameters: + arch (str): Target architecture. + input_tensors (list): List of two input tensors. + output_tensor (TensorDesc): Output tensor. + op_params (bytearray): Operation parameters. + + Returns: + KernelSpec for the MUL operation. 
+ """ + return _make_binary_kernel_spec( + arch, input_tensors, output_tensor, op_params, "GGML_OP_MUL" + ) + + +def ggml_op_div( + arch: str, input_tensors: list, output_tensor, op_params: bytearray +) -> KernelSpec: + """ + GGML_OP_DIV implementation. + + Parameters: + arch (str): Target architecture. + input_tensors (list): List of two input tensors. + output_tensor (TensorDesc): Output tensor. + op_params (bytearray): Operation parameters. + + Returns: + KernelSpec for the DIV operation. + """ + return _make_binary_kernel_spec( + arch, input_tensors, output_tensor, op_params, "GGML_OP_DIV" + ) diff --git a/src/ggml-hsa/kernels/build.py b/src/ggml-hsa/kernels/build.py new file mode 100644 index 0000000000..be0a519b09 --- /dev/null +++ b/src/ggml-hsa/kernels/build.py @@ -0,0 +1,416 @@ +# Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All Rights Reserved. + +""" +GGML HSA backend kernel build system. + +This module provides the infrastructure for compiling kernels to executable code +for AMD XDNA / XDNA2 devices. It handles mapping GGML operations to their corresponding +kernel implementations, dynamic module loading, and orchestrating the compilation +pipeline. + +The build system supports multiple compilation backends with per-operation dispatch +based on compilation parameters. + +Usage: + As a module: + from kernels import ggml_compile_op, TensorDesc + ggml_compile_op(op_name="ADD", arch="aie2", ...) + + As a script: + python build.py --op_name ADD --arch aie2 --input_tensors "(1024,1,1,1)/f32" ...
+""" + +import importlib.util +import logging +from collections.abc import Callable +from pathlib import Path +import sys +import types + +from kernel import Kernel, KernelSpec, Backend +from tensor_desc import TensorDesc +from build_iron import compile_iron_kernel + +# Compiler registry mapping Backend enum to compile functions +_compilers: dict[Backend, Callable] = { + Backend.IRON: compile_iron_kernel, +} + +# Mapping of GGML operations to kernel source files. +# Each entry maps an operation name to a Kernel that identifies the dispatch module. +_op_to_kernel_map: dict[str, Kernel] = { + # unary operation to kernel source mapping + "ABS": Kernel("ggml_unary_op_abs", "unary_ops.py"), + "SGN": Kernel("ggml_unary_op_sgn", "unary_ops.py"), + "NEG": Kernel("ggml_unary_op_neg", "unary_ops.py"), + "STEP": Kernel("ggml_unary_op_step", "unary_ops.py"), + "TANH": Kernel("ggml_unary_op_tanh", "unary_ops.py"), + "ELU": Kernel("ggml_unary_op_elu", "unary_ops.py"), + "RELU": Kernel("ggml_unary_op_relu", "unary_ops.py"), + "SIGMOID": Kernel("ggml_unary_op_sigmoid", "unary_ops.py"), + "GELU": Kernel("ggml_unary_op_gelu", "unary_ops.py"), + "GELU_QUICK": Kernel("ggml_unary_op_gelu_quick", "unary_ops.py"), + "SILU": Kernel("ggml_unary_op_silu", "unary_ops.py"), + "HARDSWISH": Kernel("ggml_unary_op_hardswish", "unary_ops.py"), + "HARDSIGMOID": Kernel("ggml_unary_op_hardsigmoid", "unary_ops.py"), + "EXP": Kernel("ggml_unary_op_exp", "unary_ops.py"), + "GELU_ERF": Kernel("ggml_unary_op_gelu_erf", "unary_ops.py"), + "XIELU": Kernel("ggml_unary_op_xielu", "unary_ops.py"), + "FLOOR": Kernel("ggml_unary_op_floor", "unary_ops.py"), + "CEIL": Kernel("ggml_unary_op_ceil", "unary_ops.py"), + "ROUND": Kernel("ggml_unary_op_round", "unary_ops.py"), + "TRUNC": Kernel("ggml_unary_op_trunc", "unary_ops.py"), + # operation to kernel source mapping + "ADD": Kernel("ggml_op_add", "binary_ops.py"), + "SUB": Kernel("ggml_op_sub", "binary_ops.py"), + "MUL": Kernel("ggml_op_mul", "binary_ops.py"), + 
"DIV": Kernel("ggml_op_div", "binary_ops.py"), + "SQR": Kernel("ggml_op_sqr", "unary_ops.py"), + "SQRT": Kernel("ggml_op_sqrt", "unary_ops.py"), + "LOG": Kernel("ggml_op_log", "unary_ops.py"), + "SIN": Kernel("ggml_op_sin", "unary_ops.py"), + "COS": Kernel("ggml_op_cos", "unary_ops.py"), + "MUL_MAT": Kernel("ggml_op_mul_mat", "mul_mat.py"), + "SCALE": Kernel("ggml_op_scale", "scale.py"), + "SOFT_MAX": Kernel("ggml_op_soft_max", "soft_max.py"), + "CLAMP": Kernel("ggml_op_clamp", "clamp.py"), + "ARGMAX": Kernel("ggml_op_argmax", "argmax.py"), + "COUNT_EQUAL": Kernel("ggml_op_count_equal", "count_equal.py"), + "CROSS_ENTROPY_LOSS": Kernel("ggml_op_cross_entropy_loss", "cross_entropy_loss.py"), +} + + +def get_compiler(backend: Backend) -> Callable: + """ + Get the compiler function for the given backend. + + Parameters: + backend: The compilation backend to use. + + Returns: + The compiler function for the specified backend. + + Raises: + NotImplementedError: If the backend is not implemented. + + Note: + Uses backend.name for lookup to handle the case where Backend enums + from dynamically imported modules have different identity than those + in this module. + """ + # Lookup by name to handle different enum class identities from dynamic imports + for registered_backend, compiler in _compilers.items(): + if registered_backend.name == backend.name: + return compiler + raise NotImplementedError(f"Backend {backend.name} not implemented.") + + +def get_kernel(op_name: str) -> Kernel: + """ + Get the kernel for the given operation. + + Parameters: + op_name: Operation name. + + Returns: + The Kernel object associated with the operation. + + Raises: + NotImplementedError: If the Kernel is not found. + """ + kernel = _op_to_kernel_map.get(op_name) + if kernel is None: + raise NotImplementedError(f"Operation {op_name} not implemented.") + return kernel + + +def import_from_path(module_name: str, path: str | Path): + """ + Import a module by name from the specified file path. 
+ + This function handles the complexity of importing Python modules dynamically, + including setting up the package structure for relative imports. + + Parameters: + module_name: Name of the module to import. + path: Path to the Python file containing the module. + + Returns: + The imported module object. + + Raises: + ImportError: If the module cannot be found or loaded. + """ + path = Path(path).resolve() + parent_dir = path.parent + grandparent_dir = parent_dir.parent + + # Add grandparent directory to sys.path so package imports work + grandparent_str = str(grandparent_dir) + if grandparent_str not in sys.path: + sys.path.insert(0, grandparent_str) + + # Create a package name from the directory for relative imports + package_name = parent_dir.name + + # Ensure the parent package exists in sys.modules + parent_dir_str = str(parent_dir) + if package_name not in sys.modules: + pkg = types.ModuleType(package_name) + pkg.__path__ = [parent_dir_str] + pkg.__package__ = package_name + sys.modules[package_name] = pkg + + # Create spec with submodule_search_locations for package support + full_module_name = f"{package_name}.{module_name}" + spec = importlib.util.spec_from_file_location( + full_module_name, + path, + submodule_search_locations=[parent_dir_str], + ) + if spec is None: + raise ImportError(f"Cannot find module spec for {module_name} at path {path}") + if spec.loader is None: + raise ImportError(f"Cannot find loader for module {module_name} at path {path}") + module = importlib.util.module_from_spec(spec) + # Set __package__ to enable relative imports + module.__package__ = package_name + sys.modules[full_module_name] = module + spec.loader.exec_module(module) + return module + + +def ggml_compile_op( + op_name: str, + arch: str, + input_tensors: list[TensorDesc], + output_tensor: TensorDesc, + op_params: bytearray, + exported_name: str, + output_directory: str | Path, + verbose: bool = False, +): + """ + Compile a GGML operation kernel to PDI and 
instruction files. + + This is the main entry point for kernel compilation. It: + 1. Looks up the kernel dispatch module for the operation + 2. Calls the dispatch function to get a KernelSpec (backend + function) + 3. Invokes the appropriate backend compiler + + Parameters: + op_name: Operation name (e.g., "ADD", "MUL_MAT"). + arch: Target architecture (e.g., "aie2", "aie2p"). + input_tensors: List of input tensor descriptions. + output_tensor: Output tensor description. + op_params: Operation-specific parameters as a bytearray. + exported_name: Name to export the compiled kernel as. + output_directory: Directory to save the compiled PDI and instruction files. + verbose: If True, enables verbose logging output. + + Raises: + NotImplementedError: If the operation or the selected backend is not implemented. + ValueError: If the dispatch function rejects its inputs (e.g., binary ops require two). + """ + # Setup logging + logger = logging.getLogger(__name__) + # remove all existing handlers + for handler in logger.handlers.copy(): + try: + logger.removeHandler(handler) + except ValueError: + # ignore double removals + pass + if verbose: + logger.setLevel(logging.DEBUG) + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter("%(levelname)s: %(message)s") + ch.setFormatter(formatter) + logger.addHandler(ch) + + # Get kernel mapping + kernel = get_kernel(op_name) + + # Load dispatch module and get dispatch function + kernel_source_file = Path(__file__).resolve().parent / kernel.source_file + module = import_from_path(kernel.name, kernel_source_file) + dispatch_fn = getattr(module, kernel.name) + + # Dispatch to get KernelSpec (determines backend and function) + kernel_spec: KernelSpec = dispatch_fn( + arch=arch, + input_tensors=input_tensors, + output_tensor=output_tensor, + op_params=op_params, + ) + + # Create output and work directories + output_dir = Path(output_directory) + output_dir.mkdir(parents=True, exist_ok=True) + work_dir = output_dir / f"{exported_name}-artifacts" +
work_dir.mkdir(parents=True, exist_ok=True) + + logger.info( + ( + "Compiling op: %s for arch %s\n" + " Backend: %s\n" + " Op name: %s\n" + " Kernel source: %s\n" + " Input tensors: %s\n" + " Output tensor: %s\n" + " Operation parameters: %s\n" + " Exported name: %s\n" + " Output directory: %s\n" + " Working directory: %s" + ), + op_name, + arch, + kernel_spec.backend.name, + kernel_spec.op_name, + str(kernel_source_file), + kernel_spec.input_tensors, + kernel_spec.output_tensor, + kernel_spec.op_params, + exported_name, + str(output_dir), + str(work_dir), + ) + + # Get compiler for the selected backend and compile + compile_fn = get_compiler(kernel_spec.backend) + compile_fn( + kernel_spec=kernel_spec, + work_dir=work_dir, + exported_name=exported_name, + output_directory=output_dir, + logger=logger, + verbose=verbose, + ) + + logger.info( + "Finished compilation for kernel %s in %s", kernel.name, output_directory + ) + + +def to_tuple_of_ints(string: str) -> tuple[int, int, int, int]: + """ + Convert a string of the form "(x,y,z,w)" to a tuple of integers. + + Parameters: + string: String representation of a 4-element tuple. + + Returns: + A tuple of 4 integers. + + Raises: + ValueError: If the string does not represent exactly 4 integers. + """ + string = string.replace("(", "").replace(")", "").strip(",") + ints = map(int, string.split(",")) + t = tuple(ints) + if len(t) != 4: + raise ValueError(f"Shape must have 4 dimensions, got {len(t)}.") + return t + + +def to_tensordesc(string: str) -> TensorDesc: + """ + Create a TensorDesc from a string representation. + + Parameters: + string: String of the form "(shape)/dtype", e.g., "(1024,1,1,1)/f32". + + Returns: + A TensorDesc instance with the specified shape and dtype. + """ + shape, dtype = string.split("/") + shape = to_tuple_of_ints(shape) + return TensorDesc(dtype=dtype, shape=shape, stride=None) + + +def file_path(string: str): + """ + Validate that a string represents an existing file path. 
+ + Parameters: + string: The file path to validate. + + Returns: + The validated file path string. + + Raises: + FileNotFoundError: If the file does not exist. + """ + if not Path(string).is_file(): + raise FileNotFoundError(string) + return string + + +def main(): + """Main entry point for command-line AOT compilation.""" + from argparse import ArgumentParser + + parser = ArgumentParser( + prog="build.py", + description="Compiles GGML HSA kernels for AMD XDNA / XDNA2 devices", + ) + parser.add_argument( + "--op_name", + type=str, + required=True, + help="GGML operation name, e.g., MUL_MAT, ADD, RELU, etc.", + ) + parser.add_argument( + "--arch", + type=str, + required=True, + help="Target architecture", + ) + parser.add_argument( + "--input_tensors", + type=to_tensordesc, + nargs="+", + required=True, + help="Input kernel tensor shapes and datatypes", + ) + parser.add_argument( + "--output_tensor", + type=to_tensordesc, + required=True, + help="Output kernel tensor shape and datatype", + ) + parser.add_argument( + "--exported_name", + type=str, + required=True, + help="Kernel exported name", + ) + parser.add_argument( + "--output_directory", + type=str, + required=True, + help="Output directory", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Verbose output", + ) + args = parser.parse_args() + + ggml_compile_op( + op_name=args.op_name, + arch=args.arch, + input_tensors=args.input_tensors, + output_tensor=args.output_tensor, + op_params=bytearray(), + exported_name=args.exported_name, + output_directory=args.output_directory, + verbose=args.verbose, + ) + + +if __name__ == "__main__": + main() diff --git a/src/ggml-hsa/kernels/build_iron.py b/src/ggml-hsa/kernels/build_iron.py new file mode 100644 index 0000000000..ef8e57dee6 --- /dev/null +++ b/src/ggml-hsa/kernels/build_iron.py @@ -0,0 +1,118 @@ +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +""" +IRON backend compiler for GGML HSA kernels. 
def _compile_aie_core_kernels(
    arch: str,
    functions: Iterable[ExternalFunction],
    work_dir: Path,
    verbose: bool,
) -> None:
    """
    Build an object file for every external AIE core function.

    Each function's C++ source is compiled with that function's own include
    directories and compile flags; the target architecture, working directory
    and verbosity are the same for all of them.

    Parameters:
        arch: Target architecture (e.g., "aie2", "aie2p").
        functions: Iterable of ExternalFunction objects to compile.
        work_dir: Working directory for intermediate files.
        verbose: If True, enables verbose compilation output.
    """
    # Settings that do not vary from one function to the next.
    shared_settings = {
        "target_arch": arch,
        "cwd": str(work_dir),
        "verbose": verbose,
    }
    for external_fn in functions:
        compile_cxx_core_function(
            source_path=external_fn._source_file,
            output_path=external_fn.bin_name,
            include_dirs=external_fn._include_dirs,
            compile_args=external_fn._compile_flags,
            **shared_settings,
        )
def compile_iron_kernel(
    kernel_spec: KernelSpec,
    work_dir: Path,
    exported_name: str,
    output_directory: Path,
    logger: logging.Logger,
    verbose: bool,
) -> None:
    """
    Run the IRON compilation pipeline for one kernel.

    Steps, in order:
    1. Call the kernel's Python function to obtain an MLIR module.
    2. Compile the external C++ core functions registered during step 1.
    3. Save the MLIR text in the working directory.
    4. Compile the MLIR module into a PDI file and an instructions file.

    Parameters:
        kernel_spec: The KernelSpec containing the IRON kernel function.
        work_dir: Working directory for intermediate files; receives
            "<exported_name>.mlir".
        exported_name: Base name for the generated files.
        output_directory: Receives "<exported_name>.pdi" and
            "<exported_name>_insts.bin".
        logger: Logger for status messages.
        verbose: If True, enables verbose compilation output.
    """
    # Empty the registry first so functions left over from an earlier
    # compilation are not built again.
    ExternalFunction._instances.clear()

    # Calling the kernel function produces the MLIR module and, as a side
    # effect, fills ExternalFunction._instances.
    generator_inputs = {
        "arch": kernel_spec.arch,
        "input_tensors": kernel_spec.input_tensors,
        "output_tensor": kernel_spec.output_tensor,
        "op_params": kernel_spec.op_params,
    }
    mlir_module = kernel_spec.function(**generator_inputs)

    _compile_aie_core_kernels(
        arch=kernel_spec.arch,
        functions=ExternalFunction._instances,
        work_dir=work_dir,
        verbose=verbose,
    )

    # The registry is no longer needed once the object files exist.
    ExternalFunction._instances.clear()

    # Keep a copy of the MLIR text for debugging/inspection.
    mlir_path = work_dir / f"{exported_name}.mlir"
    logger.info(
        "Writing MLIR module for operation %s in %s", kernel_spec.op_name, mlir_path
    )
    mlir_path.write_text(str(mlir_module), encoding="utf-8")

    # Final artifacts are written to the output directory.
    artifacts = {
        "pdi_path": str(output_directory / f"{exported_name}.pdi"),
        "insts_path": str(output_directory / f"{exported_name}_insts.bin"),
    }
    compile_mlir_module(
        mlir_module=mlir_module,
        options=["--alloc-scheme=basic-sequential"],
        verbose=verbose,
        work_dir=str(work_dir),
        **artifacts,
    )
def ggml_op_clamp(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Describe how to build the GGML_OP_CLAMP kernel.

    The operation limits each input element to the range [min_val, max_val]:
        output[i] = max(min_val, min(input[i], max_val))

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters containing min and max values.

    Returns:
        KernelSpec for the CLAMP operation, using the IRON backend and the
        `clamp` kernel function.
    """
    # All arguments are forwarded untouched; only backend, name and kernel
    # function are fixed here.
    spec_fields = {
        "backend": Backend.IRON,
        "op_name": "GGML_OP_CLAMP",
        "arch": arch,
        "input_tensors": input_tensors,
        "output_tensor": output_tensor,
        "op_params": op_params,
        "function": clamp,
    }
    return KernelSpec(**spec_fields)
def ggml_op_count_equal(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Describe how to build the GGML_OP_COUNT_EQUAL kernel.

    The operation counts the positions at which two input tensors hold equal
    values. Both inputs must have the same shape and be of type I32; the
    result is a single I64 scalar.

    Parameters:
        arch (str): Target architecture (e.g., "aie2" for Phoenix/Hawk Point,
            "aie2p" for Strix Halo/Krackan).
        input_tensors (list[TensorDesc]): Exactly two input tensor
            descriptors. Both tensors must be I32 type and contiguous in memory.
        output_tensor (TensorDesc): Output tensor descriptor of type I64 with
            shape [1, 1, 1, 1] containing the count of equal elements.
        op_params (bytearray): Operation parameters as a 64-byte buffer (unused
            for COUNT_EQUAL, but required by the dispatch interface).

    Returns:
        KernelSpec: Kernel specification with backend=IRON and the
        count_equal_op function for generating the MLIR module.
    """
    # None of the requirements above are checked here; the arguments are
    # forwarded untouched.
    spec_fields = {
        "backend": Backend.IRON,
        "op_name": "GGML_OP_COUNT_EQUAL",
        "arch": arch,
        "input_tensors": input_tensors,
        "output_tensor": output_tensor,
        "op_params": op_params,
        "function": count_equal_op,
    }
    return KernelSpec(**spec_fields)
def ggml_op_cross_entropy_loss(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Describe how to build the GGML_OP_CROSS_ENTROPY_LOSS kernel.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of 2 input tensors:
            - input_tensors[0]: Logits tensor (predictions before softmax)
            - input_tensors[1]: Labels tensor (ground truth, often one-hot encoded)
        output_tensor: Output scalar tensor containing the loss value.
        op_params (bytearray): Operation parameters (currently unused).

    Returns:
        KernelSpec for the CROSS_ENTROPY_LOSS operation, using the IRON
        backend and the `cross_entropy_loss` kernel function.
    """
    # All arguments are forwarded untouched; only backend, name and kernel
    # function are fixed here.
    spec_fields = {
        "backend": Backend.IRON,
        "op_name": "GGML_OP_CROSS_ENTROPY_LOSS",
        "arch": arch,
        "input_tensors": input_tensors,
        "output_tensor": output_tensor,
        "op_params": op_params,
        "function": cross_entropy_loss,
    }
    return KernelSpec(**spec_fields)
# Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved.

# Stages the IRON kernel sources (Python generators plus the C++ kernels and
# headers they reference) next to the build output and installs the same
# files. Three groups are handled the same way: the package root, the aie2
# kernels and the aie2p kernels.
#
# NOTE(review): ggml_hsa_copy_files is defined elsewhere in the project;
# from its use here it presumably creates a target named by its first argument
# that copies FILES into DESTINATION - confirm against its definition.

# copy root package files
set(IRON_FILES
    ${CMAKE_CURRENT_SOURCE_DIR}/__init__.py
    ${CMAKE_CURRENT_SOURCE_DIR}/aie_kernel_math.h
    ${CMAKE_CURRENT_SOURCE_DIR}/aie_kernel_utils.h
    ${CMAKE_CURRENT_SOURCE_DIR}/argmax.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/argmax.py
    ${CMAKE_CURRENT_SOURCE_DIR}/binary_ops.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/binary_ops.py
    ${CMAKE_CURRENT_SOURCE_DIR}/clamp.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/clamp.py
    ${CMAKE_CURRENT_SOURCE_DIR}/count_equal.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/count_equal.py
    ${CMAKE_CURRENT_SOURCE_DIR}/cross_entropy_loss.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/cross_entropy_loss.py
    ${CMAKE_CURRENT_SOURCE_DIR}/gemm.py
    ${CMAKE_CURRENT_SOURCE_DIR}/ggml-aie.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/scale.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/scale.py
    ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/softmax.py
    ${CMAKE_CURRENT_SOURCE_DIR}/unary_ops.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/unary_ops.py
    ${CMAKE_CURRENT_SOURCE_DIR}/utils.py
    )
ggml_hsa_copy_files(ggml_hsa_copy_iron_files
    FILES
        ${IRON_FILES}
    DESTINATION
        ${CMAKE_CURRENT_BINARY_DIR}
    )
# Make the ggml-hsa target depend on the copy step.
add_dependencies(ggml-hsa ggml_hsa_copy_iron_files)
install(
    FILES
        ${IRON_FILES}
    DESTINATION
        lib/kernels/iron
    )

# copy aie2 kernel files
set(IRON_AIE2_KERNEL_FILES
    ${CMAKE_CURRENT_SOURCE_DIR}/aie2/mm.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/aie2/zero.cc
    )
ggml_hsa_copy_files(ggml_hsa_copy_iron_aie2_kernel_files
    FILES
        ${IRON_AIE2_KERNEL_FILES}
    DESTINATION
        ${CMAKE_CURRENT_BINARY_DIR}/aie2
    )
add_dependencies(ggml-hsa ggml_hsa_copy_iron_aie2_kernel_files)
install(
    FILES
        ${IRON_AIE2_KERNEL_FILES}
    DESTINATION
        lib/kernels/iron/aie2
    )

# copy aie2p kernel files
set(IRON_AIE2P_KERNEL_FILES
    ${CMAKE_CURRENT_SOURCE_DIR}/aie2p/mm.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/aie2p/zero.cc
    )
ggml_hsa_copy_files(ggml_hsa_copy_iron_aie2p_kernel_files
    FILES
        ${IRON_AIE2P_KERNEL_FILES}
    DESTINATION
        ${CMAKE_CURRENT_BINARY_DIR}/aie2p
    )
add_dependencies(ggml-hsa ggml_hsa_copy_iron_aie2p_kernel_files)
install(
    FILES
        ${IRON_AIE2P_KERNEL_FILES}
    DESTINATION
        lib/kernels/iron/aie2p
    )
+""" diff --git a/src/ggml-hsa/kernels/iron/aie2/mm.cc b/src/ggml-hsa/kernels/iron/aie2/mm.cc new file mode 100644 index 0000000000..c74d698532 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/aie2/mm.cc @@ -0,0 +1,1093 @@ +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * @file mm.cc + * @brief Matrix multiplication kernels for AIE2 architecture. + * + * This file provides scalar and vectorized matrix multiplication kernels + * optimized for AIE2. The vectorized kernels use the aie::mmul class with + * various expansion factors (2x2, 4x2, 4x4) to maximize accumulator usage + * and achieve high SIMD efficiency. + */ + +#define NOCPP + +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +#include "../aie_kernel_utils.h" +#include "zero.cc" + +#include + +/** + * @brief Scalar matrix multiplication kernel for reference/verification. + * + * Computes C += A * B using scalar operations. Supports configurable memory + * layouts for matrices B and C via template parameters. + * + * @tparam T_in Input element type for matrices A and B. + * @tparam T_out Output element type for matrix C. + * @tparam rowA Number of rows in matrix A (and C). + * @tparam colA Number of columns in A (rows in B). + * @tparam colB Number of columns in B (and C). + * @tparam b_row_maj If true, B is row-major; if false, column-major. + * @tparam c_row_maj If true, C is row-major; if false, column-major. + * + * @param[in] a Pointer to matrix A (rowA x colA, row-major). + * @param[in] b Pointer to matrix B (colA x colB, layout per b_row_maj). + * @param[in,out] c Pointer to matrix C (rowA x colB, layout per c_row_maj). + * Results are accumulated into C. 
+ */ +template +static inline void matmul_scalar(T_in * a, T_in * b, T_out * c) { + event0(); + for (int row = 0; row < rowA; row++) { + for (int col = 0; col < colB; col++) { + T_out running_sum = 0; + for (int i = 0; i < colA; i++) { + T_in a_val = a[row * colA + i]; + T_in b_val; + if constexpr (b_row_maj) { + b_val = b[i * colB + col]; + } else { + b_val = b[i + col * colA]; + } + running_sum += a_val * b_val; + } + T_out * c_ptr; + if constexpr (c_row_maj) { + c_ptr = &c[row * colB + col]; + } else { + c_ptr = &c[row + col * rowA]; + } + *c_ptr += running_sum; + } + } + event1(); +} + +/** + * @brief Vectorized matrix multiplication with 2x2 mmul expansion. + * + * Blocked MatMul kernel utilizing the aie::mmul class. Matrices are assumed + * to be pre-tiled with shapes: A => rxs, B => sxt, C => rxt. + * + * This kernel expands the aie::mmul 2x in both A (m dimension) and B (n dimension), + * resulting in a 2x2 expansion in output C (C00, C01, C10, C11). This expansion + * maximizes accumulator register usage for high SIMD efficiency. + * + * Data layout: tiles are row-major, and data within tiles is row-major: + * @verbatim + * <-s-> + * _ ________________________ + * r | 1 | 2 | 3 | ... + * _ |____|____|____| + * | x | x+1| x+2| ... + * @endverbatim + * + * @tparam T_in Input element type. + * @tparam T_out Output element type. + * @tparam rowA Number of tile rows in A (in units of r). + * @tparam colA Number of tile columns in A / rows in B (in units of s). + * @tparam colB Number of tile columns in B (in units of t). + * @tparam r mmul M dimension. + * @tparam s mmul K dimension. + * @tparam t mmul N dimension. + * @tparam b_row_maj If true, B tiles are row-major; if false, column-major. + * @tparam c_row_maj If true, C tiles are row-major; if false, column-major. + * + * @param[in] pA Pointer to pre-tiled matrix A. + * @param[in] pB Pointer to pre-tiled matrix B. + * @param[in,out] pC Pointer to pre-tiled matrix C (accumulates results). 
+ * + * @see https://xilinx.github.io/aie_api/group__group__mmul.html + */ +template +static inline void matmul_vectorized_2x2_mmul(const T_in * __restrict pA, + const T_in * __restrict pB, + T_out * __restrict pC) { + + using MMUL = aie::mmul; + + event0(); + + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(4) + for (unsigned z = 0; z < rowA; z += 2) { + T_out * __restrict pC1; + T_out * __restrict pC2; + if constexpr (c_row_maj) { + pC1 = pC + (z * colB) * MMUL::size_C; + pC2 = pC + ((z + 1) * colB) * MMUL::size_C; + } + + for (unsigned j = 0; j < colB; j += 2) +#ifdef OPT_PERF_ENABLED + AIE_LOOP_FLATTEN +#endif + { + + if constexpr (!c_row_maj) { + pC1 = pC + j * rowA * MMUL::size_C + z * MMUL::size_C; + pC2 = pC + (j + 1) * rowA * MMUL::size_C + z * MMUL::size_C; + } + const T_in * __restrict pA1 = pA + (z * colA) * MMUL::size_A; + const T_in * __restrict pA2 = pA + ((z + 1) * colA) * MMUL::size_A; + const T_in * __restrict pB1; + const T_in * __restrict pB2; + if constexpr (b_row_maj) { + pB1 = pB + (j)*MMUL::size_B; + pB2 = pB + (j + 1) * MMUL::size_B; + } else { + pB1 = pB + (j * colA) * MMUL::size_B; + pB2 = pB + ((j + 1) * colA) * MMUL::size_B; + } + + aie::vector A0; + aie::vector A1; + aie::vector B0; + aie::vector B1; + + // Load partial results from C buffer for accumulation in-place. 
The + // zero.cc function handles the zeroing of data when a new + // accumulation is needed (after the 'K' reduction dimension) + aie::vector acc_C00; + aie::vector acc_C01; + aie::vector acc_C10; + aie::vector acc_C11; + if constexpr (c_row_maj) { + acc_C00 = aie::load_v(pC1); + acc_C01 = aie::load_v(pC1 + MMUL::size_C); + acc_C10 = aie::load_v(pC2); + acc_C11 = aie::load_v(pC2 + MMUL::size_C); + } else { + acc_C00 = aie::transpose(aie::load_v(pC1), t, r); + acc_C01 = aie::transpose(aie::load_v(pC2), t, r); + acc_C10 = aie::transpose(aie::load_v(pC1 + MMUL::size_C), t, r); + acc_C11 = aie::transpose(aie::load_v(pC2 + MMUL::size_C), t, r); + } + + MMUL C00(acc_C00); + MMUL C01(acc_C01); + MMUL C10(acc_C10); + MMUL C11(acc_C11); + + for (unsigned i = 0; i < colA; ++i) +#ifdef OPT_PERF_ENABLED + AIE_LOOP_FLATTEN +#endif + { + A0 = aie::load_v(pA1); + pA1 += MMUL::size_A; + A1 = aie::load_v(pA2); + pA2 += MMUL::size_A; + if constexpr (b_row_maj) { + B0 = aie::load_v(pB1); + pB1 += MMUL::size_B * colB; + B1 = aie::load_v(pB2); + pB2 += MMUL::size_B * colB; + } else { + B0 = aie::transpose(aie::load_v(pB1), t, s); + pB1 += MMUL::size_B; + B1 = aie::transpose(aie::load_v(pB2), t, s); + pB2 += MMUL::size_B; + } + + C00.mac(A0, B0); + C01.mac(A0, B1); + C10.mac(A1, B0); + C11.mac(A1, B1); + } + + // TODO make shift right here to keep most significat bits + // when lowering the output + // example below shows how to shift right 10 bits + // #define SHIFT 10 + // aie::store_v(pC1, C00.template to_vector(SHIFT)); + + if constexpr (c_row_maj) { + aie::store_v(pC1, C00.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC1, C01.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC2, C10.template to_vector()); + pC2 += MMUL::size_C; + aie::store_v(pC2, C11.template to_vector()); + pC2 += MMUL::size_C; + } else { + aie::store_v(pC1, aie::transpose(C00.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C01.template 
to_vector(), r, t)); + pC2 += MMUL::size_C; + aie::store_v(pC1, aie::transpose(C10.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C11.template to_vector(), r, t)); + pC2 += MMUL::size_C; + } + } + } + + event1(); +} + +/** + * @brief Vectorized matrix multiplication with 4x2 mmul expansion. + * + * Similar to matmul_vectorized_2x2_mmul but expands A by 4x in the m dimension + * and B by 2x in the n dimension. This configuration is optimal for some + * precisions (e.g., int8) where more accumulator utilization improves efficiency. + * + * @tparam T_in Input element type. + * @tparam T_out Output element type. + * @tparam rowA Number of tile rows in A (must be divisible by 4). + * @tparam colA Number of tile columns in A / rows in B. + * @tparam colB Number of tile columns in B (must be divisible by 2). + * @tparam r mmul M dimension. + * @tparam s mmul K dimension. + * @tparam t mmul N dimension. + * @tparam b_row_maj If true, B tiles are row-major; if false, column-major. + * @tparam c_row_maj If true, C tiles are row-major; if false, column-major. + * + * @param[in] pA Pointer to pre-tiled matrix A. + * @param[in] pB Pointer to pre-tiled matrix B. + * @param[in,out] pC Pointer to pre-tiled matrix C (accumulates results). 
+ */ +template +static inline void matmul_vectorized_4x2_mmul(const T_in * __restrict pA, + const T_in * __restrict pB, + T_out * __restrict pC) { + + using MMUL = aie::mmul; + + event0(); + + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(4) + for (unsigned z = 0; z < rowA; z += 4) { + T_out * __restrict pC1; + T_out * __restrict pC2; + T_out * __restrict pC3; + T_out * __restrict pC4; + + if constexpr (c_row_maj) { + pC1 = pC + (z * colB + 0) * MMUL::size_C; + pC2 = pC + ((z + 1) * colB + 0) * MMUL::size_C; + pC3 = pC + ((z + 2) * colB + 0) * MMUL::size_C; + pC4 = pC + ((z + 3) * colB + 0) * MMUL::size_C; + } + + for (unsigned j = 0; j < colB; j += 2) +#ifdef OPT_PERF_ENABLED + AIE_LOOP_FLATTEN +#endif + { + if constexpr (!c_row_maj) { + pC1 = pC + j * rowA * MMUL::size_C + z * MMUL::size_C; + pC2 = pC + (j + 1) * rowA * MMUL::size_C + z * MMUL::size_C; + } + + const T_in * __restrict pA1 = pA + (z * colA + 0) * MMUL::size_A; + const T_in * __restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A; + const T_in * __restrict pA3 = pA + ((z + 2) * colA + 0) * MMUL::size_A; + const T_in * __restrict pA4 = pA + ((z + 3) * colA + 0) * MMUL::size_A; + + const T_in * __restrict pB1; + const T_in * __restrict pB2; + if constexpr (b_row_maj) { + pB1 = pB + (j)*MMUL::size_B; + pB2 = pB + (j + 1) * MMUL::size_B; + } else { + pB1 = pB + (j * colA) * MMUL::size_B; + pB2 = pB + ((j + 1) * colA) * MMUL::size_B; + } + + aie::vector A01; + aie::vector A11; + aie::vector A21; + aie::vector A31; + aie::vector B0; + aie::vector B1; + + aie::vector acc_C00; + aie::vector acc_C01; + aie::vector acc_C10; + aie::vector acc_C11; + aie::vector acc_C20; + aie::vector acc_C21; + aie::vector acc_C30; + aie::vector acc_C31; + + if constexpr (c_row_maj) { + acc_C00 = aie::load_v(pC1); + acc_C01 = aie::load_v(pC1 + MMUL::size_C); + acc_C10 = aie::load_v(pC2); + acc_C11 = aie::load_v(pC2 + MMUL::size_C); + acc_C20 = aie::load_v(pC3); + acc_C21 = aie::load_v(pC3 + MMUL::size_C); + 
acc_C30 = aie::load_v(pC4); + acc_C31 = aie::load_v(pC4 + MMUL::size_C); + } else { + acc_C00 = aie::transpose(aie::load_v(pC1), t, r); + acc_C01 = aie::transpose(aie::load_v(pC2), t, r); + acc_C10 = aie::transpose(aie::load_v(pC1 + MMUL::size_C), t, r); + acc_C11 = aie::transpose(aie::load_v(pC2 + MMUL::size_C), t, r); + acc_C20 = + aie::transpose(aie::load_v(pC1 + 2 * MMUL::size_C), t, r); + acc_C21 = + aie::transpose(aie::load_v(pC2 + 2 * MMUL::size_C), t, r); + acc_C30 = + aie::transpose(aie::load_v(pC1 + 3 * MMUL::size_C), t, r); + acc_C31 = + aie::transpose(aie::load_v(pC2 + 3 * MMUL::size_C), t, r); + } + + MMUL C00(acc_C00); + MMUL C01(acc_C01); + MMUL C10(acc_C10); + MMUL C11(acc_C11); + MMUL C20(acc_C20); + MMUL C21(acc_C21); + MMUL C30(acc_C30); + MMUL C31(acc_C31); + + for (unsigned i = 0; i < colA; i += 1) +#ifdef OPT_PERF_ENABLED + AIE_LOOP_FLATTEN +#endif + { + A01 = aie::load_v(pA1); + pA1 += MMUL::size_A; + A11 = aie::load_v(pA2); + pA2 += MMUL::size_A; + A21 = aie::load_v(pA3); + pA3 += MMUL::size_A; + A31 = aie::load_v(pA4); + pA4 += MMUL::size_A; + if constexpr (b_row_maj) { + B0 = aie::load_v(pB1); + pB1 += (MMUL::size_B * colB); + B1 = aie::load_v(pB2); + pB2 += (MMUL::size_B * colB); + } else { + B0 = aie::transpose(aie::load_v(pB1), t, s); + pB1 += MMUL::size_B; + B1 = aie::transpose(aie::load_v(pB2), t, s); + pB2 += MMUL::size_B; + } + + C00.mac(A01, B0); + C01.mac(A01, B1); + C10.mac(A11, B0); + C11.mac(A11, B1); + C20.mac(A21, B0); + C21.mac(A21, B1); + C30.mac(A31, B0); + C31.mac(A31, B1); + } + + if constexpr (c_row_maj) { + aie::store_v(pC1, C00.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC1, C01.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC2, C10.template to_vector()); + pC2 += MMUL::size_C; + aie::store_v(pC2, C11.template to_vector()); + pC2 += MMUL::size_C; + aie::store_v(pC3, C20.template to_vector()); + pC3 += MMUL::size_C; + aie::store_v(pC3, C21.template to_vector()); + pC3 += MMUL::size_C; + 
aie::store_v(pC4, C30.template to_vector()); + pC4 += MMUL::size_C; + aie::store_v(pC4, C31.template to_vector()); + pC4 += MMUL::size_C; + } else { + aie::store_v(pC1, aie::transpose(C00.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C01.template to_vector(), r, t)); + pC2 += MMUL::size_C; + aie::store_v(pC1, aie::transpose(C10.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C11.template to_vector(), r, t)); + pC2 += MMUL::size_C; + aie::store_v(pC1, aie::transpose(C20.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C21.template to_vector(), r, t)); + pC2 += MMUL::size_C; + aie::store_v(pC1, aie::transpose(C30.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C31.template to_vector(), r, t)); + pC2 += MMUL::size_C; + } + } + } + + event1(); +} + +/** + * @brief Vectorized matrix multiplication with 4x4 mmul expansion. + * + * Expands A by 4x in the m dimension and B by 4x in the n dimension for + * maximum accumulator usage. This configuration achieves highest efficiency + * for some precisions (e.g., bf16). + * + * @tparam T_in Input element type. + * @tparam T_out Output element type. + * @tparam rowA Number of tile rows in A (must be divisible by 4). + * @tparam colA Number of tile columns in A / rows in B. + * @tparam colB Number of tile columns in B (must be divisible by 4). + * @tparam r mmul M dimension. + * @tparam s mmul K dimension. + * @tparam t mmul N dimension. + * @tparam b_row_maj If true, B tiles are row-major; if false, column-major. + * @tparam c_row_maj If true, C tiles are row-major; if false, column-major. + * + * @param[in] pA Pointer to pre-tiled matrix A. + * @param[in] pB Pointer to pre-tiled matrix B. + * @param[in,out] pC Pointer to pre-tiled matrix C (accumulates results). 
+ */ +template +static inline void matmul_vectorized_4x4(const T_in * __restrict pA, + const T_in * __restrict pB, + T_out * __restrict pC) { + + using MMUL = aie::mmul; + + event0(); + + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(2) + for (unsigned z = 0; z < rowA; z += 4) { + T_out * __restrict pC1; + T_out * __restrict pC2; + T_out * __restrict pC3; + T_out * __restrict pC4; + + if constexpr (c_row_maj) { + pC1 = pC + (z * colB) * MMUL::size_C; + pC2 = pC + ((z + 1) * colB) * MMUL::size_C; + pC3 = pC + ((z + 2) * colB) * MMUL::size_C; + pC4 = pC + ((z + 3) * colB) * MMUL::size_C; + } + + for (unsigned j = 0; j < colB; j += 4) +#ifdef OPT_PERF_ENABLED + AIE_LOOP_FLATTEN +#endif + { + if constexpr (!c_row_maj) { + pC1 = pC + j * rowA * MMUL::size_C + z * MMUL::size_C; + pC2 = pC + (j + 1) * rowA * MMUL::size_C + z * MMUL::size_C; + pC3 = pC + (j + 2) * rowA * MMUL::size_C + z * MMUL::size_C; + pC4 = pC + (j + 3) * rowA * MMUL::size_C + z * MMUL::size_C; + } + const T_in * __restrict pA1 = pA + (z * colA + 0) * MMUL::size_A; + const T_in * __restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A; + const T_in * __restrict pA3 = pA + ((z + 2) * colA + 0) * MMUL::size_A; + const T_in * __restrict pA4 = pA + ((z + 3) * colA + 0) * MMUL::size_A; + + const T_in * __restrict pB1; + const T_in * __restrict pB2; + const T_in * __restrict pB3; + const T_in * __restrict pB4; + if constexpr (b_row_maj) { + pB1 = pB + (j)*MMUL::size_B; + pB2 = pB + (j + 1) * MMUL::size_B; + pB3 = pB + (j + 2) * MMUL::size_B; + pB4 = pB + (j + 3) * MMUL::size_B; + } else { + pB1 = pB + (j * colA) * MMUL::size_B; + pB2 = pB + ((j + 1) * colA) * MMUL::size_B; + pB3 = pB + ((j + 2) * colA) * MMUL::size_B; + pB4 = pB + ((j + 3) * colA) * MMUL::size_B; + } + + aie::vector A0; + aie::vector A1; + aie::vector A2; + aie::vector A3; + aie::vector B0; + aie::vector B1; + aie::vector B2; + aie::vector B3; + + aie::vector acc_C00; + aie::vector acc_C01; + aie::vector acc_C02; + aie::vector 
acc_C03; + + aie::vector acc_C10; + aie::vector acc_C11; + aie::vector acc_C12; + aie::vector acc_C13; + + aie::vector acc_C20; + aie::vector acc_C21; + aie::vector acc_C22; + aie::vector acc_C23; + + aie::vector acc_C30; + aie::vector acc_C31; + aie::vector acc_C32; + aie::vector acc_C33; + + if constexpr (c_row_maj) { + acc_C00 = aie::load_v(pC1); + acc_C01 = aie::load_v(pC1 + MMUL::size_C); + acc_C02 = aie::load_v(pC1 + 2 * MMUL::size_C); + acc_C03 = aie::load_v(pC1 + 3 * MMUL::size_C); + + acc_C10 = aie::load_v(pC2); + acc_C11 = aie::load_v(pC2 + MMUL::size_C); + acc_C12 = aie::load_v(pC2 + 2 * MMUL::size_C); + acc_C13 = aie::load_v(pC2 + 3 * MMUL::size_C); + + acc_C20 = aie::load_v(pC3); + acc_C21 = aie::load_v(pC3 + MMUL::size_C); + acc_C22 = aie::load_v(pC3 + 2 * MMUL::size_C); + acc_C23 = aie::load_v(pC3 + 3 * MMUL::size_C); + + acc_C30 = aie::load_v(pC4); + acc_C31 = aie::load_v(pC4 + MMUL::size_C); + acc_C32 = aie::load_v(pC4 + 2 * MMUL::size_C); + acc_C33 = aie::load_v(pC4 + 3 * MMUL::size_C); + } else { + acc_C00 = aie::transpose(aie::load_v(pC1), t, r); + acc_C01 = aie::transpose(aie::load_v(pC2), t, r); + acc_C02 = aie::transpose(aie::load_v(pC3), t, r); + acc_C03 = aie::transpose(aie::load_v(pC4), t, r); + + acc_C10 = aie::transpose(aie::load_v(pC1 + MMUL::size_C), t, r); + acc_C11 = aie::transpose(aie::load_v(pC2 + MMUL::size_C), t, r); + acc_C12 = aie::transpose(aie::load_v(pC3 + MMUL::size_C), t, r); + acc_C13 = aie::transpose(aie::load_v(pC4 + MMUL::size_C), t, r); + + acc_C20 = + aie::transpose(aie::load_v(pC1 + 2 * MMUL::size_C), t, r); + acc_C21 = + aie::transpose(aie::load_v(pC2 + 2 * MMUL::size_C), t, r); + acc_C22 = + aie::transpose(aie::load_v(pC3 + 2 * MMUL::size_C), t, r); + acc_C23 = + aie::transpose(aie::load_v(pC4 + 2 * MMUL::size_C), t, r); + + acc_C30 = + aie::transpose(aie::load_v(pC1 + 3 * MMUL::size_C), t, r); + acc_C31 = + aie::transpose(aie::load_v(pC2 + 3 * MMUL::size_C), t, r); + acc_C32 = + aie::transpose(aie::load_v(pC3 + 3 
* MMUL::size_C), t, r); + acc_C33 = + aie::transpose(aie::load_v(pC4 + 3 * MMUL::size_C), t, r); + } + + MMUL C00(acc_C00); + MMUL C01(acc_C01); + MMUL C02(acc_C02); + MMUL C03(acc_C03); + + MMUL C10(acc_C10); + MMUL C11(acc_C11); + MMUL C12(acc_C12); + MMUL C13(acc_C13); + + MMUL C20(acc_C20); + MMUL C21(acc_C21); + MMUL C22(acc_C22); + MMUL C23(acc_C23); + + MMUL C30(acc_C30); + MMUL C31(acc_C31); + MMUL C32(acc_C32); + MMUL C33(acc_C33); + + for (unsigned i = 0; i < colA; ++i) +#ifdef OPT_PERF_ENABLED + AIE_LOOP_FLATTEN +#endif + { + A0 = aie::load_v(pA1); + pA1 += MMUL::size_A; + A1 = aie::load_v(pA2); + pA2 += MMUL::size_A; + A2 = aie::load_v(pA3); + pA3 += MMUL::size_A; + A3 = aie::load_v(pA4); + pA4 += MMUL::size_A; + + if constexpr (b_row_maj) { + B0 = aie::load_v(pB1); + pB1 += MMUL::size_B * colB; + B1 = aie::load_v(pB2); + pB2 += MMUL::size_B * colB; + B2 = aie::load_v(pB3); + pB3 += MMUL::size_B * colB; + B3 = aie::load_v(pB4); + pB4 += MMUL::size_B * colB; + } else { + B0 = aie::transpose(aie::load_v(pB1), t, s); + pB1 += MMUL::size_B; + B1 = aie::transpose(aie::load_v(pB2), t, s); + pB2 += MMUL::size_B; + B2 = aie::transpose(aie::load_v(pB3), t, s); + pB3 += MMUL::size_B; + B3 = aie::transpose(aie::load_v(pB4), t, s); + pB4 += MMUL::size_B; + } + + C00.mac(A0, B0); + C01.mac(A0, B1); + C10.mac(A1, B0); + C11.mac(A1, B1); + + C02.mac(A0, B2); + C03.mac(A0, B3); + C12.mac(A1, B2); + C13.mac(A1, B3); + + C20.mac(A2, B0); + C21.mac(A2, B1); + C30.mac(A3, B0); + C31.mac(A3, B1); + + C22.mac(A2, B2); + C23.mac(A2, B3); + C32.mac(A3, B2); + C33.mac(A3, B3); + } + + if constexpr (c_row_maj) { + aie::store_v(pC1, C00.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC1, C01.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC1, C02.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC1, C03.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC2, C10.template to_vector()); + pC2 += MMUL::size_C; + aie::store_v(pC2, 
C11.template to_vector()); + pC2 += MMUL::size_C; + aie::store_v(pC2, C12.template to_vector()); + pC2 += MMUL::size_C; + aie::store_v(pC2, C13.template to_vector()); + pC2 += MMUL::size_C; + aie::store_v(pC3, C20.template to_vector()); + pC3 += MMUL::size_C; + aie::store_v(pC3, C21.template to_vector()); + pC3 += MMUL::size_C; + aie::store_v(pC3, C22.template to_vector()); + pC3 += MMUL::size_C; + aie::store_v(pC3, C23.template to_vector()); + pC3 += MMUL::size_C; + aie::store_v(pC4, C30.template to_vector()); + pC4 += MMUL::size_C; + aie::store_v(pC4, C31.template to_vector()); + pC4 += MMUL::size_C; + aie::store_v(pC4, C32.template to_vector()); + pC4 += MMUL::size_C; + aie::store_v(pC4, C33.template to_vector()); + pC4 += MMUL::size_C; + } else { + aie::store_v(pC1, aie::transpose(C00.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C01.template to_vector(), r, t)); + pC2 += MMUL::size_C; + aie::store_v(pC3, aie::transpose(C02.template to_vector(), r, t)); + pC3 += MMUL::size_C; + aie::store_v(pC4, aie::transpose(C03.template to_vector(), r, t)); + pC4 += MMUL::size_C; + + aie::store_v(pC1, aie::transpose(C10.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C11.template to_vector(), r, t)); + pC2 += MMUL::size_C; + aie::store_v(pC3, aie::transpose(C12.template to_vector(), r, t)); + pC3 += MMUL::size_C; + aie::store_v(pC4, aie::transpose(C13.template to_vector(), r, t)); + pC4 += MMUL::size_C; + + aie::store_v(pC1, aie::transpose(C20.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C21.template to_vector(), r, t)); + pC2 += MMUL::size_C; + aie::store_v(pC3, aie::transpose(C22.template to_vector(), r, t)); + pC3 += MMUL::size_C; + aie::store_v(pC4, aie::transpose(C23.template to_vector(), r, t)); + pC4 += MMUL::size_C; + + aie::store_v(pC1, aie::transpose(C30.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, 
aie::transpose(C31.template to_vector(), r, t)); + pC2 += MMUL::size_C; + aie::store_v(pC3, aie::transpose(C32.template to_vector(), r, t)); + pC3 += MMUL::size_C; + aie::store_v(pC4, aie::transpose(C33.template to_vector(), r, t)); + pC4 += MMUL::size_C; + } + } + } + + event1(); +} + +#ifdef B_COL_MAJ +constexpr bool is_b_row_maj = false; +#else +constexpr bool is_b_row_maj = true; +#endif + +#ifdef C_COL_MAJ +constexpr bool is_c_row_maj = false; +#else +constexpr bool is_c_row_maj = true; +#endif + +// The rounding mode can be set for bfloat16 mmul to improve accuracy +#ifdef ROUND_CONV_EVEN +constexpr aie::rounding_mode round_mode = aie::rounding_mode::conv_even; +#else +constexpr aie::rounding_mode round_mode = aie::rounding_mode::floor; // default +#endif + +/** + * @name AIE2-Optimized MatMul Wrappers + * @brief Type-specific matrix multiplication kernels optimized for AIE2. + * + * These wrappers select optimal mmul shapes and expansion factors for each + * data type combination on AIE2 architecture. + * + * Available shapes: https://xilinx.github.io/aie_api/group__group__mmul.html + * + * Each wrapper validates dimension divisibility via static_assert. + * @{ + */ + +/** + * @brief int16 -> int16 matrix multiply using 4x4x4 mmul shape with 2x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 8). + * @tparam k Tile K dimension (must be divisible by 4). + * @tparam n Tile N dimension (must be divisible by 8). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). 
+ */ +template +static inline void matmul_vectorized_4x4x4_i16_i16(const int16 * __restrict pA, + const int16 * __restrict pB, + int16 * __restrict pC) { + constexpr int r = 4; + constexpr int s = 4; + constexpr int t = 4; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief int16 -> int32 matrix multiply using 4x4x4 mmul shape with 2x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 8). + * @tparam k Tile K dimension (must be divisible by 4). + * @tparam n Tile N dimension (must be divisible by 8). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). + */ +template +static inline void matmul_vectorized_4x4x4_i16_i32(const int16 * __restrict pA, + const int16 * __restrict pB, + int32 * __restrict pC) { + constexpr int r = 4; + constexpr int s = 4; + constexpr int t = 4; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief bfloat16 -> bfloat16 matrix multiply using 4x8x4 mmul shape with 4x4 expansion. + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). 
+ */ +template +static inline void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 * __restrict pA, + const bfloat16 * __restrict pB, + bfloat16 * __restrict pC) { + constexpr int r = 4; + constexpr int s = 8; + constexpr int t = 4; + + static_assert(m % (4 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (4 * t) == 0); + + ::aie::set_rounding(round_mode); + + return matmul_vectorized_4x4(pA, pB, pC); +} + +/** + * @brief bfloat16 -> float32 matrix multiply using 4x8x4 mmul shape with 4x4 expansion. + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). + */ +template +static inline void matmul_vectorized_4x8x4_bf16_f32(const bfloat16 * __restrict pA, + const bfloat16 * __restrict pB, + float * __restrict pC) { + constexpr int r = 4; + constexpr int s = 8; + constexpr int t = 4; + + static_assert(m % (4 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (4 * t) == 0); + + ::aie::set_rounding(round_mode); + + return matmul_vectorized_4x4(pA, pB, pC); +} + +/** + * @brief int8 -> int8 matrix multiply using 4x8x8 mmul shape with 4x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). 
+ */ +template +static inline void matmul_vectorized_4x8x8_i8_i8(const int8 * __restrict pA, + const int8 * __restrict pB, + int8 * __restrict pC) { + constexpr int r = 4; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (4 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_4x2_mmul(pA, pB, pC); +} + +/** + * @brief int8 -> int16 matrix multiply using 4x8x8 mmul shape with 4x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). + */ +template +static inline void matmul_vectorized_4x8x8_i8_i16(const int8 * __restrict pA, + const int8 * __restrict pB, + int16 * __restrict pC) { + constexpr int r = 4; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (4 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_4x2_mmul(pA, pB, pC); +} + +/** + * @brief int8 -> int32 matrix multiply using 4x8x8 mmul shape with 4x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). 
+ */ +template +static inline void matmul_vectorized_4x8x8_i8_i32(const int8 * __restrict pA, + const int8 * __restrict pB, + int32 * __restrict pC) { + constexpr int r = 4; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (4 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_4x2_mmul(pA, pB, pC); +} + +extern "C" { + +// If you want to compile microkernels with different inner tile sizes, +// define DIM_M, DIM_K and DIM_N at compile time using -DDIM_M=32 etc. +// These dimensions must be divisible by the r, s, t dimensions used in +// the kernels. + +#ifndef DIM_M +#define DIM_M 64 +#endif + +#ifndef DIM_K +#define DIM_K 64 +#endif + +#ifndef DIM_N +#define DIM_N 64 +#endif + +#ifdef i8_i8_ONLY +#define combos(X) X(int8, i8, int8, i8, 4, 8, 8) +#endif + +#ifdef i8_i16_ONLY +#define combos(X) X(int8, i8, int16, i16, 4, 8, 8) +#endif + +#ifdef i8_i32_ONLY +#define combos(X) X(int8, i8, int32, i32, 4, 8, 8) +#endif + +#ifdef i16_i16_ONLY +#define combos(X) X(int16, i16, int16, i16, 4, 4, 4) +#endif + +#ifdef i16_i32_ONLY +#define combos(X) X(int16, i16, int32, i32, 4, 4, 4) +#endif + +#ifdef bf16_bf16_ONLY +#define combos(X) X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4) +#endif + +#ifdef bf16_f32_ONLY +#define combos(X) X(bfloat16, bf16, float, f32, 4, 8, 4) +#endif + +#ifdef f32_f32_ONLY +// f32 input has no vectorized MAC support on AIE2, use scalar only +#define combos(X) X(float, f32, float, f32, 1, 1, 1) +#define SCALAR_ONLY +#endif + +#ifndef combos +#define combos(X) \ + X(int8, i8, int8, i8, 4, 8, 8) \ + X(int16, i16, int16, i16, 4, 4, 4) \ + X(int16, i16, int32, i32, 4, 4, 4) \ + X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4) \ + X(bfloat16, bf16, float, f32, 4, 8, 4) +#endif + +#define matmul_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, r, s, t) \ + void matmul_##mlir_type_in##_##mlir_type_out(ctype_in * a_in, ctype_in * b_in, \ + ctype_out * c_out) { \ +
matmul_vectorized_##r##x##s##x##t##_##mlir_type_in##_##mlir_type_out( \ + a_in, b_in, c_out); \ + } + +#define matmul_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, r, s, t) \ + void matmul_scalar_##mlir_type_in##_##mlir_type_out(ctype_in * a_in, ctype_in * b_in, \ + ctype_out * c_out) { \ + matmul_scalar( \ + a_in, b_in, c_out); \ + } + +#define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, r, s, t) \ + void zero_##mlir_type_out(ctype_out * c_out) { \ + zero_vectorized(c_out); \ + } + +#define zero_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, r, s, t) \ + void zero_scalar_##mlir_type_out(ctype_out * c_out) { \ + zero_scalar(c_out); \ + } + +#ifndef SCALAR_ONLY +combos(matmul_vectorized_c_func) combos(zero_vectorized_c_func) +#endif +#ifndef VECTORIZED_ONLY + combos(matmul_scalar_c_func) combos(zero_scalar_c_func) +#endif + +} // extern "C" \ No newline at end of file diff --git a/src/ggml-hsa/kernels/iron/aie2/zero.cc b/src/ggml-hsa/kernels/iron/aie2/zero.cc new file mode 100644 index 0000000000..ec8c3b3e6f --- /dev/null +++ b/src/ggml-hsa/kernels/iron/aie2/zero.cc @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * @file zero.cc + * @brief Zero-initialization kernels for AIE2 matrix buffers. + * + * Provides scalar and vectorized functions to zero-initialize output matrices + * before matrix multiplication accumulation. + */ + +#ifndef ZERO_CC +#define ZERO_CC + +#include + +/** + * @brief Scalar zero-initialization of a matrix buffer. + * + * Sets all M*N elements of the output buffer to zero using scalar stores. + * + * @tparam T Element type of the matrix. + * @tparam M Number of rows. + * @tparam N Number of columns. + * + * @param[out] c Output buffer of M*N elements to be zeroed. 
+ */ +template +void zero_scalar(T * __restrict c) { + for (int i = 0; i < M * N; i++) { + c[i] = 0; + } +} + +/** + * @brief Vectorized zero-initialization of a matrix buffer. + * + * Sets all M*N elements of the output buffer to zero using 256-bit vector stores. + * More efficient than scalar version for AIE2. + * + * @tparam T Element type of the matrix. + * @tparam M Number of rows (M*N must be divisible by vector width). + * @tparam N Number of columns (M*N must be divisible by vector width). + * + * @param[out] c Output buffer of M*N elements to be zeroed. + */ +template +void zero_vectorized(T * __restrict c) { + constexpr int r = 256 / (sizeof(T) * 8); // one 256 bit store unit + static_assert((M * N) % r == 0); + const aie::vector zeros = aie::zeros(); + const T * __restrict c_end = c + M * N; + event0(); + for (; c < c_end; c += r) { + aie::store_v(c, zeros); + } + event1(); +} + +#endif \ No newline at end of file diff --git a/src/ggml-hsa/kernels/iron/aie2p/mm.cc b/src/ggml-hsa/kernels/iron/aie2p/mm.cc new file mode 100644 index 0000000000..d2cf74ca43 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/aie2p/mm.cc @@ -0,0 +1,648 @@ +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * @file mm.cc + * @brief Matrix multiplication kernels for AIE2P architecture. + * + * This file provides scalar and vectorized matrix multiplication kernels + * optimized for AIE2P. The vectorized kernels use the aie::mmul class with + * 2x2 expansion and AIE2P-specific mmul shapes for optimal performance. + */ + +#define NOCPP + +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +#include "zero.cc" + +#include + +/** + * @brief Scalar matrix multiplication kernel for reference/verification. + * + * Computes C += A * B using scalar operations. Supports configurable memory + * layouts for matrices B and C via template parameters. 
+ * + * @tparam T_in Input element type for matrices A and B. + * @tparam T_out Output element type for matrix C. + * @tparam rowA Number of rows in matrix A (and C). + * @tparam colA Number of columns in A (rows in B). + * @tparam colB Number of columns in B (and C). + * @tparam b_row_maj If true, B is row-major; if false, column-major. + * @tparam c_row_maj If true, C is row-major; if false, column-major. + * + * @param[in] a Pointer to matrix A (rowA x colA, row-major). + * @param[in] b Pointer to matrix B (colA x colB, layout per b_row_maj). + * @param[in,out] c Pointer to matrix C (rowA x colB, layout per c_row_maj). + * Results are accumulated into C. + */ +template +static inline void matmul_scalar(T_in * a, T_in * b, T_out * c) { + event0(); + for (int row = 0; row < rowA; row++) { + for (int col = 0; col < colB; col++) { + T_out running_sum = 0; + for (int i = 0; i < colA; i++) { + T_in a_val = a[row * colA + i]; + T_in b_val; + if constexpr (b_row_maj) { + b_val = b[i * colB + col]; + } else { + b_val = b[i + col * colA]; + } + running_sum += a_val * b_val; + } + T_out * c_ptr; + if constexpr (c_row_maj) { + c_ptr = &c[row * colB + col]; + } else { + c_ptr = &c[row + col * rowA]; + } + *c_ptr += running_sum; + } + } + event1(); +} + +/** + * @brief Vectorized matrix multiplication with 2x2 mmul expansion for AIE2P. + * + * Blocked MatMul kernel utilizing the aie::mmul class. Matrices are assumed + * to be pre-tiled with shapes: A => rxs, B => sxt, C => rxt. + * + * This kernel expands the aie::mmul 2x in both A (m dimension) and B (n dimension), + * resulting in a 2x2 expansion in output C (C00, C01, C10, C11). This expansion + * maximizes accumulator register usage for high SIMD efficiency. + * + * Data layout: tiles are row-major, and data within tiles is row-major: + * @verbatim + * <-s-> + * _ ________________________ + * r | 1 | 2 | 3 | ... + * _ |____|____|____| + * | x | x+1| x+2| ... + * @endverbatim + * + * @tparam T_in Input element type. 
+ * @tparam T_out Output element type. + * @tparam rowA Number of tile rows in A (in units of r). + * @tparam colA Number of tile columns in A / rows in B (in units of s). + * @tparam colB Number of tile columns in B (in units of t). + * @tparam r mmul M dimension. + * @tparam s mmul K dimension. + * @tparam t mmul N dimension. + * @tparam b_row_maj If true, B tiles are row-major; if false, column-major. + * @tparam c_row_maj If true, C tiles are row-major; if false, column-major. + * + * @param[in] pA Pointer to pre-tiled matrix A. + * @param[in] pB Pointer to pre-tiled matrix B. + * @param[in,out] pC Pointer to pre-tiled matrix C (accumulates results). + * + * @see https://xilinx.github.io/aie_api/group__group__mmul.html + */ +template +static inline void matmul_vectorized_2x2_mmul(const T_in * __restrict pA, + const T_in * __restrict pB, + T_out * __restrict pC) { + + using MMUL = aie::mmul; + + event0(); + + for (unsigned z = 0; z < rowA; z += 2) + chess_prepare_for_pipelining chess_loop_range(4, ) { + + T_out * __restrict pC1; + T_out * __restrict pC2; + if constexpr (c_row_maj) { + pC1 = pC + (z * colB) * MMUL::size_C; + pC2 = pC + ((z + 1) * colB) * MMUL::size_C; + } + + for (unsigned j = 0; j < colB; j += 2) +#ifdef OPT_PERF_ENABLED + chess_flatten_loop +#endif + { + + if constexpr (!c_row_maj) { + pC1 = pC + j * rowA * MMUL::size_C + z * MMUL::size_C; + pC2 = pC + (j + 1) * rowA * MMUL::size_C + z * MMUL::size_C; + } + const T_in * __restrict pA1 = pA + (z * colA) * MMUL::size_A; + const T_in * __restrict pA2 = pA + ((z + 1) * colA) * MMUL::size_A; + const T_in * __restrict pB1; + const T_in * __restrict pB2; + if constexpr (b_row_maj) { + pB1 = pB + (j)*MMUL::size_B; + pB2 = pB + (j + 1) * MMUL::size_B; + } else { + pB1 = pB + (j * colA) * MMUL::size_B; + pB2 = pB + ((j + 1) * colA) * MMUL::size_B; + } + aie::vector A0; + aie::vector A1; + aie::vector B0; + aie::vector B1; + + // Load partial results from C buffer for accumulation in-place. 
The + // zero.cc function handles the zeroing of data when a new + // accumulation is needed (after the 'K' reduction dimension) + aie::vector acc_C00; + aie::vector acc_C01; + aie::vector acc_C10; + aie::vector acc_C11; + if constexpr (c_row_maj) { + acc_C00 = aie::load_v(pC1); + acc_C01 = aie::load_v(pC1 + MMUL::size_C); + acc_C10 = aie::load_v(pC2); + acc_C11 = aie::load_v(pC2 + MMUL::size_C); + } else { + acc_C00 = aie::transpose(aie::load_v(pC1), t, r); + acc_C01 = aie::transpose(aie::load_v(pC2), t, r); + acc_C10 = + aie::transpose(aie::load_v(pC1 + MMUL::size_C), t, r); + acc_C11 = + aie::transpose(aie::load_v(pC2 + MMUL::size_C), t, r); + } + + MMUL C00(acc_C00); + MMUL C01(acc_C01); + MMUL C10(acc_C10); + MMUL C11(acc_C11); + + for (unsigned i = 0; i < colA; ++i) +#ifdef OPT_PERF_ENABLED + chess_flatten_loop +#endif + { + A0 = aie::load_v(pA1); + pA1 += MMUL::size_A; + A1 = aie::load_v(pA2); + pA2 += MMUL::size_A; + if constexpr (b_row_maj) { + B0 = aie::load_v(pB1); + pB1 += MMUL::size_B * colB; + B1 = aie::load_v(pB2); + pB2 += MMUL::size_B * colB; + } else { + B0 = aie::transpose(aie::load_v(pB1), t, s); + pB1 += MMUL::size_B; + B1 = aie::transpose(aie::load_v(pB2), t, s); + pB2 += MMUL::size_B; + } + + C00.mac(A0, B0); + C01.mac(A0, B1); + C10.mac(A1, B0); + C11.mac(A1, B1); + } + + // TODO make shift right here to keep most significat bits + // when lowering the output + // example below shows how to shift right 10 bits + // #define SHIFT 10 + // aie::store_v(pC1, C00.template to_vector(SHIFT)); + + if constexpr (c_row_maj) { + aie::store_v(pC1, C00.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC1, C01.template to_vector()); + pC1 += MMUL::size_C; + aie::store_v(pC2, C10.template to_vector()); + pC2 += MMUL::size_C; + aie::store_v(pC2, C11.template to_vector()); + pC2 += MMUL::size_C; + } else { + aie::store_v(pC1, aie::transpose(C00.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C01.template 
to_vector(), r, t)); + pC2 += MMUL::size_C; + aie::store_v(pC1, aie::transpose(C10.template to_vector(), r, t)); + pC1 += MMUL::size_C; + aie::store_v(pC2, aie::transpose(C11.template to_vector(), r, t)); + pC2 += MMUL::size_C; + } + } + } + + event1(); +} + +#ifdef B_COL_MAJ +constexpr bool is_b_row_maj = false; +#else +constexpr bool is_b_row_maj = true; +#endif + +#ifdef C_COL_MAJ +constexpr bool is_c_row_maj = false; +#else +constexpr bool is_c_row_maj = true; +#endif + +// The rounding mode can be set for bfloat16 mmul to improve accuracy +#ifdef ROUND_CONV_EVEN +constexpr aie::rounding_mode round_mode = aie::rounding_mode::conv_even; +#else +constexpr aie::rounding_mode round_mode = aie::rounding_mode::floor; // default +#endif + +/** + * @name AIE2P-Optimized MatMul Wrappers + * @brief Type-specific matrix multiplication kernels optimized for AIE2P. + * + * These wrappers select optimal mmul shapes for each data type combination + * on AIE2P architecture. All use 2x2 mmul expansion. + * + * Available shapes: https://xilinx.github.io/aie_api/group__group__mmul.html + * + * Each wrapper validates dimension divisibility via static_assert. + * @{ + */ + +/** + * @brief int16 -> int16 matrix multiply using 4x4x8 mmul shape with 2x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 8). + * @tparam k Tile K dimension (must be divisible by 4). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). 
+ */ +template +static inline void matmul_vectorized_4x4x8_i16_i16(const int16 * __restrict pA, + const int16 * __restrict pB, + int16 * __restrict pC) { + constexpr int r = 4; + constexpr int s = 4; + constexpr int t = 8; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief int16 -> int32 matrix multiply using 4x4x8 mmul shape with 2x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 8). + * @tparam k Tile K dimension (must be divisible by 4). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). + */ +template +static inline void matmul_vectorized_4x4x8_i16_i32(const int16 * __restrict pA, + const int16 * __restrict pB, + int32 * __restrict pC) { + constexpr int r = 4; + constexpr int s = 4; + constexpr int t = 8; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief bfloat16 -> bfloat16 matrix multiply using 4x8x8 mmul shape with 2x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 8). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). 
+ */ +template +static inline void matmul_vectorized_4x8x8_bf16_bf16(const bfloat16 * __restrict pA, + const bfloat16 * __restrict pB, + bfloat16 * __restrict pC) { + constexpr int r = 4; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + ::aie::set_rounding(round_mode); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief bfloat16 -> bfloat16 matrix multiply using 8x8x8 mmul shape with 2x2 expansion. + * + * @note This shape is only available when using bfp16 emulation + * (AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16). + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). + */ +template +static inline void matmul_vectorized_8x8x8_bf16_bf16(const bfloat16 * __restrict pA, + const bfloat16 * __restrict pB, + bfloat16 * __restrict pC) { + constexpr int r = 8; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + ::aie::set_rounding(round_mode); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief bfloat16 -> float32 matrix multiply using 4x8x8 mmul shape with 2x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 8). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). 
+ */ +template +static inline void matmul_vectorized_4x8x8_bf16_f32(const bfloat16 * __restrict pA, + const bfloat16 * __restrict pB, + float * __restrict pC) { + constexpr int r = 4; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + ::aie::set_rounding(round_mode); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief bfloat16 -> float32 matrix multiply using 8x8x8 mmul shape with 2x2 expansion. + * + * @note This shape is only available when using bfp16 emulation + * (AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16). + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). + */ +template +static inline void matmul_vectorized_8x8x8_bf16_f32(const bfloat16 * __restrict pA, + const bfloat16 * __restrict pB, + float * __restrict pC) { + constexpr int r = 8; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + ::aie::set_rounding(round_mode); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief int8 -> int8 matrix multiply using 8x8x8 mmul shape with 2x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). 
+ */ +template +static inline void matmul_vectorized_8x8x8_i8_i8(const int8 * __restrict pA, + const int8 * __restrict pB, + int8 * __restrict pC) { + constexpr int r = 8; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief int8 -> int16 matrix multiply using 8x8x8 mmul shape with 2x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). + */ +template +static inline void matmul_vectorized_8x8x8_i8_i16(const int8 * __restrict pA, + const int8 * __restrict pB, + int16 * __restrict pC) { + constexpr int r = 8; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +/** + * @brief int8 -> int32 matrix multiply using 8x8x8 mmul shape with 2x2 expansion. + * + * @tparam m Tile M dimension (must be divisible by 16). + * @tparam k Tile K dimension (must be divisible by 8). + * @tparam n Tile N dimension (must be divisible by 16). + * + * @param[in] pA Input matrix A. + * @param[in] pB Input matrix B. + * @param[in,out] pC Output matrix C (accumulated). 
+ */ +template +static inline void matmul_vectorized_8x8x8_i8_i32(const int8 * __restrict pA, + const int8 * __restrict pB, + int32 * __restrict pC) { + constexpr int r = 8; + constexpr int s = 8; + constexpr int t = 8; + + static_assert(m % (2 * r) == 0); + static_assert(k % s == 0); + static_assert(n % (2 * t) == 0); + + return matmul_vectorized_2x2_mmul(pA, pB, pC); +} + +extern "C" { + +// If you want to compile microkernels with different inner tile sizes, +// define DIM_M, DIM_K and DIM_N at compile time using -DDIM_M=32 etc. +// These dimensions must be divisible by the r, s, t dimensions used in +// the kernels. + +#ifndef DIM_M +#define DIM_M 64 +#endif + +#ifndef DIM_K +#define DIM_K 64 +#endif + +#ifndef DIM_N +#define DIM_N 64 +#endif + +#ifdef i8_i8_ONLY +#define combos(X) X(int8, i8, int8, i8, 8, 8, 8) +#endif + +#ifdef i8_i16_ONLY +#define combos(X) X(int8, i8, int16, i16, 8, 8, 8) +#endif + +#ifdef i8_i32_ONLY +#define combos(X) X(int8, i8, int32, i32, 8, 8, 8) +#endif + +#ifdef i16_i16_ONLY +#define combos(X) X(int16, i16, int16, i16, 4, 4, 8) +#endif + +#ifdef i16_i32_ONLY +#define combos(X) X(int16, i16, int32, i32, 4, 4, 8) +#endif + +// The emulation of bf16 changes the available shapes for matrix multiplication +#ifdef bf16_bf16_ONLY +#ifdef AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 +#define combos(X) X(bfloat16, bf16, bfloat16, bf16, 8, 8, 8) +#else +#define combos(X) X(bfloat16, bf16, bfloat16, bf16, 4, 8, 8) +#endif +#endif + +#ifdef bf16_f32_ONLY +#ifdef AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 +#define combos(X) X(bfloat16, bf16, float, f32, 8, 8, 8) +#else +#define combos(X) X(bfloat16, bf16, float, f32, 4, 8, 8) +#endif +#endif + +#ifdef f32_f32_ONLY +// f32 input has no vectorized MAC support on AIE2p, use scalar only +#define combos(X) X(float, f32, float, f32, 1, 1, 1) +#define SCALAR_ONLY +#endif + +#ifndef combos +#ifdef AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 +#define combos(X) \ + X(int8, i8, int8, i8, 8, 8, 8) \ + X(int16, i16,
int16, i16, 4, 4, 8) \ + X(int16, i16, int32, i32, 4, 4, 8) \ + X(bfloat16, bf16, bfloat16, bf16, 8, 8, 8) \ + X(bfloat16, bf16, float, f32, 8, 8, 8) +#else +#define combos(X) \ + X(int8, i8, int8, i8, 8, 8, 8) \ + X(int16, i16, int16, i16, 4, 4, 8) \ + X(int16, i16, int32, i32, 4, 4, 8) \ + X(bfloat16, bf16, bfloat16, bf16, 4, 8, 8) \ + X(bfloat16, bf16, float, f32, 4, 8, 8) +#endif +#endif + +#define matmul_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, r, s, t) \ + void matmul_##mlir_type_in##_##mlir_type_out(ctype_in * a_in, ctype_in * b_in, \ + ctype_out * c_out) { \ + matmul_vectorized_##r##x##s##x##t##_##mlir_type_in##_##mlir_type_out( \ + a_in, b_in, c_out); \ + } + +#define matmul_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, r, s, t) \ + void matmul_scalar_##mlir_type_in##_##mlir_type_out(ctype_in * a_in, ctype_in * b_in, \ + ctype_out * c_out) { \ + matmul_scalar( \ + a_in, b_in, c_out); \ + } + +#define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, r, s, t) \ + void zero_##mlir_type_out(ctype_out * c_out) { \ + zero_vectorized(c_out); \ + } + +#define zero_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, r, s, t) \ + void zero_scalar_##mlir_type_out(ctype_out * c_out) { \ + zero_scalar(c_out); \ + } + +combos(matmul_vectorized_c_func) combos(matmul_scalar_c_func) combos(zero_vectorized_c_func) + combos(zero_scalar_c_func) + +} // extern "C" \ No newline at end of file diff --git a/src/ggml-hsa/kernels/iron/aie2p/zero.cc b/src/ggml-hsa/kernels/iron/aie2p/zero.cc new file mode 100644 index 0000000000..782e963739 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/aie2p/zero.cc @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * @file zero.cc + * @brief Zero-initialization kernels for AIE2P matrix buffers. 
+ * + * Provides scalar and vectorized functions to zero-initialize output matrices + * before matrix multiplication accumulation. AIE2P uses 512-bit vector stores. + */ + +#ifndef ZERO_CC +#define ZERO_CC + +#include +#include +#include +#include +#include + +/** + * @brief Scalar zero-initialization of a matrix buffer. + * + * Sets all M*N elements of the output buffer to zero using scalar stores. + * + * @tparam T Element type of the matrix. + * @tparam M Number of rows. + * @tparam N Number of columns. + * + * @param[out] c Output buffer of M*N elements to be zeroed. + */ +template +void zero_scalar(T * __restrict c) { + for (int i = 0; i < M * N; i++) { + c[i] = 0; + } +} + +/** + * @brief Vectorized zero-initialization of a matrix buffer. + * + * Sets all M*N elements of the output buffer to zero using 512-bit vector stores. + * More efficient than scalar version for AIE2P. + * + * @tparam T Element type of the matrix. + * @tparam M Number of rows (M*N must be divisible by vector width). + * @tparam N Number of columns (M*N must be divisible by vector width). + * + * @param[out] c Output buffer of M*N elements to be zeroed. + */ +template +void zero_vectorized(T * __restrict c) { + constexpr int r = 512 / (sizeof(T) * 8); // 512 bit store units for AIE2P + static_assert((M * N) % r == 0); + const aie::vector zeros = aie::zeros(); + const T * __restrict c_end = c + M * N; + event0(); + for (; c < c_end; c += r) { + aie::store_v(c, zeros); + } + event1(); +} + +#endif \ No newline at end of file diff --git a/src/ggml-hsa/kernels/iron/aie_kernel_math.h b/src/ggml-hsa/kernels/iron/aie_kernel_math.h new file mode 100644 index 0000000000..2780956074 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/aie_kernel_math.h @@ -0,0 +1,320 @@ +/* + Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. 
/**
 * @brief Computes the exponential function using range reduction.
 *
 * Implements exp(x) = 2^(x * log2(e)) = 2^n * 2^f, where n = floor(x * log2(e))
 * and f is the fractional part in [0, 1). The 2^f term is evaluated as
 * exp(f * ln2) with a degree-6 Taylor polynomial, and 2^n is built directly
 * from the IEEE 754 binary32 exponent field.
 *
 * @param[in] x Input value. Clamped to [-88, 88] before evaluation so that the
 *              result stays representable in float32. NaN is returned as-is.
 * @return exp(x), never smaller than 1e-38f.
 */
inline float scalar_exp(float x) {
    // NaN fails every ordered comparison, so std::clamp would hand it through
    // unchanged and the float -> int32_t conversion below would be undefined.
    if (x != x) {
        return x;
    }

    // Clamp to avoid overflow/underflow
    x = std::clamp(x, -88.0f, 88.0f);

    // Range reduction: exp(x) = 2^(x * log2(e)) = 2^n * 2^f
    constexpr float log2e = 1.4426950408889634f;
    const float t = x * log2e;

    // n = floor(t): the cast truncates toward zero, so step down once when the
    // truncated value landed above t (negative, non-integer t).
    int32_t n = static_cast<int32_t>(t);
    if (t < static_cast<float>(n)) {
        n--;
    }
    const float f = t - static_cast<float>(n);

    // Convert fractional part back to natural base: 2^f = exp(f * ln2)
    constexpr float ln2 = 0.6931471805599453f;
    const float r = f * ln2;

    // Taylor series for exp(r) where r is in [0, ln2), Horner form
    const float poly =
        1.0f +
        r * (1.0f +
             r * (0.5f +
                  r * (0.166666667f +
                       r * (0.041666667f + r * (0.008333333f + r * 0.001388889f)))));

    // Compute 2^n via bit manipulation: biased exponent (127 + n) in bits 23..30,
    // zero mantissa. n is limited to the normal exponent range first.
    n = (n < -126) ? -126 : ((n > 127) ? 127 : n);
    const int32_t bits = (127 + n) << 23;
    float scale;
    std::memcpy(&scale, &bits, sizeof(float));

    const float result = poly * scale;
    return (result < 1e-38f) ? 1e-38f : result;
}
/**
 * @brief Computes the natural logarithm using IEEE 754 range reduction.
 *
 * Implements ln(x) = ln(m * 2^e) = ln(m) + e * ln(2), where m is the mantissa
 * normalized to [1, 2). ln(m) is evaluated as 2 * atanh(z) with
 * z = (m - 1) / (m + 1), using the odd series z * (1 + z^2/3 + ... + z^10/11).
 *
 * Subnormal inputs are multiplied by 2^23 before decoding (and 23 is
 * subtracted from the exponent afterwards), because their exponent field is
 * zero and their mantissa has no implicit leading one.
 *
 * @param[in] x The input value (must be positive).
 *
 * @return The natural logarithm of x. Returns -88.0f for x <= 0.
 */
inline float scalar_log(float x) {
    static_assert(sizeof(float) == sizeof(int32_t), "binary32 float expected");

    if (x <= 0.0f) {
        return -88.0f;
    }

    // Bring subnormals into the normal range so the bit decoding below is valid.
    float e_adjust = 0.0f;
    if (x < 1.17549435e-38f) {  // smallest normal float
        x *= 8388608.0f;        // 2^23, exact scaling
        e_adjust = -23.0f;
    }

    // Read the bit pattern with memcpy (same approach as scalar_exp) rather
    // than casting between unrelated types.
    int32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));

    // Unbiased exponent from bits 23..30
    const int32_t e_int = ((bits >> 23) & 0xFF) - 127;
    const float e = static_cast<float>(e_int) + e_adjust;

    // Keep the 23 mantissa bits and force the exponent field to 127 -> m in [1, 2)
    const int32_t m_bits = (bits & 0x007FFFFF) | 0x3F800000;
    float m;
    std::memcpy(&m, &m_bits, sizeof(m));

    const float z = (m - 1.0f) / (m + 1.0f);
    const float z2 = z * z;

    // Horner evaluation of 1 + z^2/3 + z^4/5 + z^6/7 + z^8/9 + z^10/11
    float poly = 0.0909090909f;        // 1/11
    poly = poly * z2 + 0.1111111111f;  // 1/9
    poly = poly * z2 + 0.1428571429f;  // 1/7
    poly = poly * z2 + 0.2000000000f;  // 1/5
    poly = poly * z2 + 0.3333333333f;  // 1/3
    poly = poly * z2 + 1.0f;

    const float ln_m = 2.0f * (z * poly);

    constexpr float ln2 = 0.6931471805599453f;
    return ln_m + (e * ln2);
}
+ */ +template +aie::vector vec_exp(aie::vector & x) { + constexpr float log2e = 1.4426950408889634f; // log2(e) + // Cody-Waite split of ln(2) for accurate range reduction + constexpr float ln2_hi = 6.93145751953125e-1f; // upper bits, exact in float + constexpr float ln2_lo = 1.4286068203094172321e-6f; // residual + + // Clamp to representable range of exp() in float32 + x = aie::max(x, aie::broadcast(-88.0f)); + x = aie::min(x, aie::broadcast(88.0f)); + + // Compute t = x * log2(e) + aie::accum t_acc = aie::mul(x, log2e); + aie::vector t = t_acc.template to_vector(); + + // n = floor(t) via magic-number rounding + constexpr float magic = 12582912.0f; // 1.5 * 2^23 + aie::vector v_magic = aie::broadcast(magic); + aie::vector n_f = aie::sub(aie::add(t, v_magic), v_magic); + + // Adjust round-to-nearest -> floor: if n_f > t, we rounded up + auto overshot = aie::lt(t, n_f); + n_f = aie::sub(n_f, aie::select(aie::broadcast(0.0f), + aie::broadcast(1.0f), overshot)); + + // r = x - n * ln(2) with Cody-Waite precision + aie::accum hi_acc = aie::mul(n_f, ln2_hi); + aie::vector r = aie::sub(x, hi_acc.template to_vector()); + aie::accum lo_acc = aie::mul(n_f, ln2_lo); + r = aie::sub(r, lo_acc.template to_vector()); + + // Evaluate exp(r) using Horner's method (degree 13) + // + // exp(r) ~ 1 + r + r^2/2! + r^3/3! + ... + r^13/13! + // Coefficients in high-to-low order for Horner evaluation: + constexpr float exp_coeffs[] = { + 0.0000000001605904f, // 1/13! + 0.0000000020876757f, // 1/12! + 0.0000000250521084f, // 1/11! + 0.0000002755731922f, // 1/10! + 0.0000027557319224f, // 1/9! + 0.0000248015873016f, // 1/8! + 0.0001984126984127f, // 1/7! + 0.0013888888888889f, // 1/6! + 0.0083333333333333f, // 1/5! + 0.0416666666666667f, // 1/4! + 0.1666666666666667f, // 1/3! + 0.5f, // 1/2! + 1.0f, // 1/1! + 1.0f // 1/0! 
/**
 * @brief Computes 2^x using range reduction for improved precision.
 *
 * x is split into i = floor(x) and f = x - i in [0, 1), so 2^x = 2^i * 2^f.
 * 2^f is evaluated as exp(f * ln(2)) with a degree-10 Taylor polynomial in
 * Horner form; 2^i is built from the IEEE 754 binary32 exponent field.
 *
 * The input is saturated to [-126, 128) so that the biased exponent 127 + i
 * always lies in the normal range [1, 254]; without this, out-of-range inputs
 * would shift a negative or oversized value into the exponent field.
 *
 * @param[in] x The exponent value. NaN is returned as-is.
 *
 * @return The computed value of 2^x (saturated for x outside [-126, 128)).
 */
inline float pow2(float x) {
    if (x != x) {
        return x;  // NaN: the float -> int conversion below would be undefined
    }
    if (x < -126.0f) {
        x = -126.0f;
    } else if (x >= 128.0f) {
        x = 127.99999f;  // just below 128, so floor(x) == 127
    }

    // split x into integer and fractional parts (floor semantics)
    int32_t i = static_cast<int32_t>(x);
    if (x < static_cast<float>(i)) {
        i--;
    }
    const float f = x - static_cast<float>(i);

    constexpr float pow2_coeffs[] = {
        0.0000000070549116f, // ln(2)^10 / 10!
        0.0000001017808600f, // ln(2)^9 / 9!
        0.0000013215486790f, // ln(2)^8 / 8!
        0.0000152525277765f, // ln(2)^7 / 7!
        0.0001540353039338f, // ln(2)^6 / 6!
        0.0013333558146428f, // ln(2)^5 / 5!
        0.0096181291076285f, // ln(2)^4 / 4!
        0.0555041086648216f, // ln(2)^3 / 3!
        0.2402265069591007f, // ln(2)^2 / 2!
        0.6931471805599453f, // ln(2)^1 / 1!
        1.0f                 // ln(2)^0 / 0!
    };
    constexpr int NUM_POW2_COEFFS = sizeof(pow2_coeffs) / sizeof(pow2_coeffs[0]);

    // compute 2^f using Horner's method for Taylor series of exp(f * ln(2))
    float exp_f = pow2_coeffs[0];

#pragma unroll
    for (int j = 1; j < NUM_POW2_COEFFS; ++j) {
        exp_f = exp_f * f + pow2_coeffs[j];
    }

    // IEEE 754 float: 2^i is represented as exponent = 127 + i, mantissa = 0.
    // Copy the bit pattern into a float with memcpy (as scalar_exp does).
    const int32_t bits = (127 + i) << 23;
    float scale;
    std::memcpy(&scale, &bits, sizeof(scale));

    return exp_f * scale;
}

/**
 * @brief Computes floor(log2(x)) for positive integers.
 *
 * Counts how many times x can be shifted right by one before it drops to 1,
 * i.e. the position of the most significant set bit.
 *
 * @param[in] x The input value (must be > 0 for a meaningful result).
 *
 * @return The floor of log base 2 of x. Returns 0 for x <= 1.
 */
inline uint32_t floor_log2(uint32_t x) {
    uint32_t shifts = 0;
    for (; x > 1; x >>= 1) {
        ++shifts;
    }
    return shifts;
}

/**
 * @brief Computes the ALiBi (Attention with Linear Biases) slope for a head.
 *
 * Let P be the largest power of two not exceeding n_head (stored in
 * n_head_log2 below). With m0 = 2^(-max_bias / P) and
 * m1 = 2^(-(max_bias / 2) / P):
 *
 *   - head_idx <  P: slope = m0^(head_idx + 1)
 *   - head_idx >= P: slope = m1^(2 * (head_idx - P) + 1)
 *
 * Powers are formed by repeated multiplication.
 *
 * @param[in] max_bias Maximum bias value. If <= 0, returns 1.0.
 * @param[in] n_head   Total number of attention heads (expected >= 1).
 * @param[in] head_idx Index of the current head (0-based). A negative index is
 *                     outside the contract; it takes the first branch and
 *                     yields m0 instead of wrapping to a huge unsigned
 *                     exponent.
 *
 * @return The computed ALiBi slope for this head.
 */
inline float compute_alibi_slope(float max_bias, int32_t n_head, int32_t head_idx) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }

    const uint32_t n_head_log2 = 1u << floor_log2(static_cast<uint32_t>(n_head));

    // compute base values m0 and m1
    const float m0 = pow2(-max_bias / n_head_log2);
    const float m1 = pow2(-(max_bias / 2.0f) / n_head_log2);

    // Test the sign before converting: comparing a negative int32_t directly
    // against an unsigned value would promote it to a very large number.
    if (head_idx < 0 || static_cast<uint32_t>(head_idx) < n_head_log2) {
        // slope = m0^(head_idx+1) via repeated multiplication
        float slope = m0;
        for (int32_t j = 0; j < head_idx; ++j) {
            slope *= m0;
        }
        return slope;
    }

    // slope = m1^(2*(head_idx - n_head_log2) + 1)
    const uint32_t exponent = 2u * (static_cast<uint32_t>(head_idx) - n_head_log2) + 1u;
    float slope = m1;
    for (uint32_t j = 1; j < exponent; ++j) {
        slope *= m1;
    }
    return slope;
}
+ * @param x The unroll factor (number of iterations to unroll). + */ +#define AIE_LOOP_UNROLL(x) [[chess::unroll_loop(x)]] + +/** + * @def AIE_LOOP_UNROLL_FULL + * @brief Fully unrolls a loop (all iterations). + */ +#define AIE_LOOP_UNROLL_FULL [[chess::unroll_loop()]] + +/** + * @def AIE_LOOP_NO_UNROLL + * @brief Prevents the compiler from unrolling a loop. + */ +#define AIE_LOOP_NO_UNROLL [[chess::no_unroll]] + +/** + * @def AIE_LOOP_MIN_ITERATION_COUNT(x) + * @brief Hints the minimum number of loop iterations to the compiler. + * @param x The minimum iteration count. + */ +#define AIE_LOOP_MIN_ITERATION_COUNT(x) [[chess::min_loop_count(x)]] + +/** + * @def AIE_LOOP_MAX_ITERATION_COUNT(x) + * @brief Hints the maximum number of loop iterations to the compiler. + * @param x The maximum iteration count. + */ +#define AIE_LOOP_MAX_ITERATION_COUNT(x) [[chess::max_loop_count(x)]] + +/** + * @def AIE_LOOP_RANGE(a, ...) + * @brief Hints both minimum and optionally maximum loop iteration counts. + * @param a The minimum iteration count. + * @param ... Optional maximum iteration count. + */ +#define AIE_LOOP_RANGE(a, ...) \ + [[chess::min_loop_count(a)]] __VA_OPT__([[chess::max_loop_count(__VA_ARGS__)]]) + +/** + * @def AIE_PREPARE_FOR_PIPELINING + * @brief Prepares a loop for software pipelining optimization. + */ +#define AIE_PREPARE_FOR_PIPELINING [[chess::prepare_for_pipelining]] + +/** + * @def AIE_NO_PREPARE_FOR_PIPELINING + * @brief Prevents software pipelining preparation for a loop. + */ +#define AIE_NO_PREPARE_FOR_PIPELINING [[chess::no_prepare_for_pipelining]] + +/** + * @def AIE_MODULO_SCHEDULING_BUDGET_RATIO(x) + * @brief Sets the modulo scheduling budget ratio for pipelining. + * @param x The budget ratio value. + */ +#define AIE_MODULO_SCHEDULING_BUDGET_RATIO(x) [[chess::modulo_scheduling_budget_ratio(x)]] + +/** + * @def AIE_KEEP_SW_LOOP + * @brief Keeps the loop as a software loop (prevents hardware loop conversion). 
+ */ +#define AIE_KEEP_SW_LOOP [[chess::keep_sw_loop]] + +/** + * @def AIE_PEEL_PIPELINED_LOOP(x) + * @brief Peels iterations from a pipelined loop. + * @param x The number of iterations to peel. + */ +#define AIE_PEEL_PIPELINED_LOOP(x) [[chess::peel_pipelined_loop(x)]] + +/** + * @def AIE_KEEP_FREE_FOR_PIPELINING(x) + * @brief Reserves resources for pipelining. + * @param x The resource specification. + */ +#define AIE_KEEP_FREE_FOR_PIPELINING(x) [[chess::keep_free_for_pipelining(x)]] + +/** + * @def AIE_ALLOCATE(x) + * @brief Specifies register allocation hints. + * @param x The allocation specification. + */ +#define AIE_ALLOCATE(x) [[chess::allocate(x)]] + +/** + * @def AIE_NO_HW_LOOP + * @brief Prevents conversion to a hardware loop. + */ +#define AIE_NO_HW_LOOP [[chess::no_hw_loop]] + +/** + * @def AIE_TRY_INITIATION_INTERVAL(x) + * @brief Attempts to achieve a specific initiation interval for pipelining. + * @param x The target initiation interval. + * @note No-op on Chess compiler; effective on AIECC. + */ +#define AIE_TRY_INITIATION_INTERVAL(x) + +/** + * @def AIE_PREPARE_FOR_POSTPIPELINING + * @brief Prepares for post-pipelining optimization. + * @note No-op on Chess compiler; effective on AIECC. + */ +#define AIE_PREPARE_FOR_POSTPIPELINING + +/** + * @def AIE_LOOP_FLATTEN + * @brief Flattens nested loops for optimization. + */ +#define AIE_LOOP_FLATTEN chess_flatten_loop + +/* AIECC compiler (Clang-based) - uses Clang pragmas */ +#elif defined(__AIECC__) +#ifndef __STRINGIFY +#define __STRINGIFY(a) #a +#endif +#define AIE_LOOP_UNROLL(x) _Pragma(__STRINGIFY(clang loop unroll_count(x))) +#define AIE_LOOP_UNROLL_FULL _Pragma("clang loop unroll(full)") +#define AIE_LOOP_NO_UNROLL _Pragma("clang loop unroll(disable)") +#define AIE_LOOP_MIN_ITERATION_COUNT(x) _Pragma(__STRINGIFY(clang loop min_iteration_count(x))) +#define AIE_LOOP_MAX_ITERATION_COUNT(x) _Pragma(__STRINGIFY(clang loop max_iteration_count(x))) +#define AIE_LOOP_RANGE(a, ...) 
\ + AIE_LOOP_MIN_ITERATION_COUNT(a) \ + __VA_OPT__(AIE_LOOP_MAX_ITERATION_COUNT(__VA_ARGS__)) +#define AIE_PREPARE_FOR_PIPELINING +#define AIE_NO_PREPARE_FOR_PIPELINING +#define AIE_MODULO_SCHEDULING_BUDGET_RATIO(x) +#define AIE_KEEP_SW_LOOP +#define AIE_PEEL_PIPELINED_LOOP(x) +#define AIE_KEEP_FREE_FOR_PIPELINING(x) +#define AIE_ALLOCATE(x) +#define AIE_NO_HW_LOOP +#define AIE_TRY_INITIATION_INTERVAL(x) \ + _Pragma(__STRINGIFY(clang loop pipeline_initiation_interval(x))) +#define AIE_PREPARE_FOR_POSTPIPELINING _Pragma("clang loop pipeline(disable)") +#define AIE_LOOP_FLATTEN + +/* Fallback for other compilers - all macros expand to no-ops */ +#else +#define AIE_LOOP_UNROLL(x) +#define AIE_LOOP_UNROLL_FULL +#define AIE_LOOP_NO_UNROLL +#define AIE_LOOP_MIN_ITERATION_COUNT(x) +#define AIE_LOOP_MAX_ITERATION_COUNT(x) +#define AIE_LOOP_RANGE(a, ...) +#define AIE_PREPARE_FOR_PIPELINING +#define AIE_NO_PREPARE_FOR_PIPELINING +#define AIE_MODULO_SCHEDULING_BUDGET_RATIO(x) +#define AIE_KEEP_SW_LOOP +#define AIE_PEEL_PIPELINED_LOOP(x) +#define AIE_KEEP_FREE_FOR_PIPELINING(x) +#define AIE_ALLOCATE(x) +#define AIE_NO_HW_LOOP +#define AIE_TRY_INITIATION_INTERVAL(x) +#define AIE_PREPARE_FOR_POSTPIPELINING +#define AIE_LOOP_FLATTEN +#endif + +/** @} */ /* End of loop_macros group */ + +#endif /* _AIE_KERNEL_UTILS_ */ \ No newline at end of file diff --git a/src/ggml-hsa/kernels/iron/argmax.cc b/src/ggml-hsa/kernels/iron/argmax.cc new file mode 100644 index 0000000000..1e70fcb2fa --- /dev/null +++ b/src/ggml-hsa/kernels/iron/argmax.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved. + +/** + * @file argmax.cc + * @brief Argmax operation for AIE kernels. + */ + +#include "ggml-aie.hpp" + +extern "C" { + +/** + * @brief Finds the index of the maximum value in an input array. + * + * Single-pass algorithm that tracks both the maximum value and its index. 
+ * If multiple elements have the same maximum value, returns the index of + * the first occurrence. + * + * @param[in] in Input array of N elements. + * @param[out] out Output array containing the index of the maximum element. + * Only the first element (out[0]) is written. + * @param[in] N Number of elements to search. If N <= 0, no output is written. + */ +void ggml_op_argmax(const INPUT_DTYPE * __restrict in, OUTPUT_DTYPE * __restrict out, int32_t N) { + event0(); + + if (N > 0) { + auto max_val = in[0]; + int32_t argmax_idx = 0; + + for (int32_t i = 1; i < N; i++) { + if (in[i] > max_val) { + max_val = in[i]; + argmax_idx = i; + } + } + + out[0] = static_cast(argmax_idx); + } + + event1(); +} + +} // extern "C" diff --git a/src/ggml-hsa/kernels/iron/argmax.py b/src/ggml-hsa/kernels/iron/argmax.py new file mode 100644 index 0000000000..6f9e1700e0 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/argmax.py @@ -0,0 +1,176 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +""" +IRON kernel implementation for the argmax operation. + +Finds the index of the maximum value along the first dimension (columns) for each row. +""" + +from pathlib import Path + +import numpy as np + +from .softmax import get_softmax_dimensions +from .utils import ( + arch_to_device, + suppress_import_pyxrt_msg, +) + +suppress_import_pyxrt_msg() + +from aie.iron import ( + ExternalFunction, + ObjectFifo, + Program, + Runtime, + Worker, + dtype_to_str, +) +from aie.iron.controlflow import range_ +from aie.iron.placers import SequentialPlacer + + +def argmax_op(arch: str, input_tensors: list, output_tensor, op_params: bytearray): + """ + IRON design for argmax. + + Computes the index of the maximum value along the first dimension for each row. 
+ Uses row-by-row processing where each kernel invocation processes one row and + outputs a single I32 index. + + Parameters: + arch (str): Target architecture (e.g., "aie2", "aie2p"). + input_tensors (list[TensorDesc]): List containing exactly one input tensor. + The tensor must be F32 with shape [ne0, ne1, ne2, ne3] where ne0 is the + row length (dimension over which argmax is computed) and the product + ne1 * ne2 * ne3 is the number of rows. + output_tensor (TensorDesc): Output tensor of type I32 with shape [ne1, ne2, ne3] + containing one index per row indicating the position of the maximum value. + op_params (bytearray): Operation parameters (unused for ARGMAX). + + Returns: + MLIR module representing the IRON program for argmax. + + Raises: + ValueError: If input_tensors does not contain exactly one tensor. + ValueError: If input or output tensors are not contiguous in memory. + ValueError: If output tensor size does not match the number of input rows. + ValueError: If output tensor dtype is not int32. + """ + + if len(input_tensors) != 1: + raise ValueError("Operation requires exactly one input tensor.") + + input_tensor = input_tensors[0] + + if not input_tensor.contiguous: + raise ValueError("Input tensor must be contiguous in memory.") + if not output_tensor.contiguous: + raise ValueError("Output tensor must be contiguous in memory.") + + row_length, num_rows = get_softmax_dimensions(input_tensor) + + if output_tensor.numel() != num_rows: + raise ValueError( + f"Output tensor size ({output_tensor.numel()}) does not match the number " + f"of input rows ({num_rows})." + ) + + if output_tensor.dtype != np.int32: + raise ValueError( + f"Output tensor dtype must be int32, got {output_tensor.dtype}." 
+ ) + + function = _create_external_function( + arch=arch, + op_name="GGML_OP_ARGMAX", + input_tensor=input_tensor, + output_tensor=output_tensor, + row_length=row_length, + ) + + # AIE-array data movement with object fifos + # Input: one row at a time (F32) + input_tile_ty = np.ndarray[(row_length,), np.dtype[input_tensor.dtype]] + # Output: one index per row (I32) + output_tile_ty = np.ndarray[(1,), np.dtype[output_tensor.dtype]] + + of_in = ObjectFifo(input_tile_ty, name="in") + of_out = ObjectFifo(output_tile_ty, name="out") + + # Task for the core to perform with an external function + def ext_core_fn(of_in, of_out, function): + for _ in range_(num_rows): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + function(elem_in, elem_out, row_length) + of_in.release(1) + of_out.release(1) + + # Create a worker to run the task on a compute tile + worker = Worker(ext_core_fn, fn_args=[of_in.cons(), of_out.prod(), function]) + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + num_elements_in = row_length * num_rows + input_tensor_ty = np.ndarray[(num_elements_in,), np.dtype[input_tensor.dtype]] + output_tensor_ty = np.ndarray[(num_rows,), np.dtype[output_tensor.dtype]] + + with rt.sequence(input_tensor_ty, output_tensor_ty) as (a_in, b_out): + rt.start(worker) + rt.fill(of_in.prod(), a_in) + rt.drain(of_out.cons(), b_out, wait=True) + + # Place program components and generate an MLIR module + return Program(arch_to_device(arch), rt).resolve_program(SequentialPlacer()) + + +def _create_external_function( + arch: str, + op_name: str, + input_tensor, + output_tensor, + row_length: int, +) -> ExternalFunction: + """ + Creates an ExternalFunction specification for argmax. + + The external function wraps the C++ kernel that performs the actual argmax + computation on the AIE tile. The kernel receives one row of input data and + outputs a single I32 index. + + Parameters: + arch (str): Target architecture (e.g., "aie2", "aie2p"). 
+ op_name (str): Operation name used for function naming and compile flags + (e.g., "GGML_OP_ARGMAX"). + input_tensor (TensorDesc): Input tensor descriptor providing dtype information. + output_tensor (TensorDesc): Output tensor descriptor providing dtype information. + row_length (int): Number of elements per row (ne0 dimension). + + Returns: + ExternalFunction: Configured external function specification that references + the argmax.cc source file with appropriate compile flags for dtype and + vector size configuration. + """ + + current_dir = Path(__file__).resolve().parent + func = ExternalFunction( + name=f"{op_name.lower()}", + object_file_name=f"{op_name.lower()}_core_function.o", + source_file=str(current_dir / "argmax.cc"), + arg_types=[ + np.ndarray[(row_length,), np.dtype[input_tensor.dtype]], + np.ndarray[(1,), np.dtype[output_tensor.dtype]], + np.int32, # row_length (N) + ], + compile_flags=[ + f"-DINPUT_DTYPE={dtype_to_str(input_tensor.dtype)}", + f"-DOUTPUT_DTYPE={dtype_to_str(output_tensor.dtype)}", + ], + ) + return func diff --git a/src/ggml-hsa/kernels/iron/binary_ops.cc b/src/ggml-hsa/kernels/iron/binary_ops.cc new file mode 100644 index 0000000000..02046be440 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/binary_ops.cc @@ -0,0 +1,357 @@ +// Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved. + +/** + * @file binary_ops.cc + * @brief Element-wise binary operations for AIE kernels. + * + * This file implements binary operations (add, sub, mul, div) with both + * element-wise and broadcasting variants. + */ + +#include "ggml-aie.hpp" + +/** + * @brief Applies a binary operation element-wise to two input arrays. + * + * @tparam T0 Element type of the first input array. + * @tparam T1 Element type of the second input array. + * @tparam TOut Element type of the output array. + * @tparam Size Integer type for the count parameter. + * @tparam BinaryOp Callable type taking two elements and returning the result. 
+ * + * @param[in] in0 First input array of count elements. + * @param[in] in1 Second input array of count elements. + * @param[in] count Number of elements to process. + * @param[out] out Output array of count elements. + * @param[in] op Binary operation to apply: out[i] = op(in0[i], in1[i]). + */ +template +void transform_binary_n(const T0 * __restrict in0, + const T1 * __restrict in1, + Size count, + TOut * __restrict out, + BinaryOp op) { + event0(); + for (Size i = 0; i < count; ++i) { + out[i] = op(in0[i], in1[i]); + } + event1(); +} + +/** + * @brief Applies a binary operation with NumPy-style broadcasting. + * + * Handles broadcasting of src1 (in1) to match the shape of src0/dst (in0/out). + * Tiles are processed sequentially; the global element index is computed from + * tile_idx and tile_size to determine the appropriate src1 index via modulo. + * + * @tparam T0 Element type of the first input array. + * @tparam T1 Element type of the second input array (broadcasted). + * @tparam TOut Element type of the output array. + * @tparam Size Integer type for size/index parameters. + * @tparam BinaryOp Callable type taking two elements and returning the result. + * + * @param[in] in0 First input tile (tile_size elements, contiguous from src0). + * @param[in] in1 Second input array (full broadcasted tensor). + * @param[out] out Output tile (tile_size elements). + * @param[in] tile_size Number of elements in this tile. + * @param[in] tile_idx Index of the current tile (0-based). + * @param[in] src1_ne0 src1 dimension 0 (innermost). + * @param[in] src1_ne1 src1 dimension 1. + * @param[in] src1_ne2 src1 dimension 2. + * @param[in] src1_ne3 src1 dimension 3 (outermost). + * @param[in] dst_ne0 dst dimension 0 (innermost). + * @param[in] dst_ne1 dst dimension 1. + * @param[in] dst_ne2 dst dimension 2. + * @param[in] op Binary operation to apply: out[i] = op(in0[i], in1[broadcast_idx]). 
+ */ +template +void transform_binary_broadcast_n(const T0 * __restrict in0, + const T1 * __restrict in1, + TOut * __restrict out, + Size tile_size, + Size tile_idx, + Size src1_ne0, + Size src1_ne1, + Size src1_ne2, + Size src1_ne3, + Size dst_ne0, + Size dst_ne1, + Size dst_ne2, + BinaryOp op) { + event0(); + + auto global_offset = tile_idx * tile_size; + + // src1 strides (contiguous layout) + auto s1 = src1_ne0; + auto s2 = src1_ne0 * src1_ne1; + auto s3 = src1_ne0 * src1_ne1 * src1_ne2; + + // dst strides for coordinate decomposition + auto d1 = dst_ne0; + auto d2 = dst_ne0 * dst_ne1; + + for (auto i = 0; i < tile_size; ++i) { + auto g = global_offset + i; + + // Decompose into 4D dst coordinates + auto i0 = g % dst_ne0; + auto i1 = (g / d1) % dst_ne1; + auto i2 = (g / d2) % dst_ne2; + auto i3 = g / (d2 * dst_ne2); + + // Apply broadcast modulo + auto j0 = i0 % src1_ne0; + auto j1 = i1 % src1_ne1; + auto j2 = i2 % src1_ne2; + auto j3 = i3 % src1_ne3; + + // src1 index + auto idx_src1 = j0 + j1 * s1 + j2 * s2 + j3 * s3; + + out[i] = op(in0[i], in1[idx_src1]); + } + + event1(); +} + +extern "C" { + +#ifdef GGML_OP_ADD + +/** + * @brief Element-wise addition: out[i] = in0[i] + in1[i]. + * + * @param[in] in0 First input array of N elements. + * @param[in] in1 Second input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_op_add(const INPUT0_DTYPE * __restrict in0, + const INPUT1_DTYPE * __restrict in1, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_binary_n(in0, in1, N, out, [](auto a, auto b) -> OUTPUT_DTYPE { return a + b; }); +} + +#endif // GGML_OP_ADD + +#ifdef GGML_OP_SUB + +/** + * @brief Element-wise subtraction: out[i] = in0[i] - in1[i]. + * + * @param[in] in0 First input array of N elements. + * @param[in] in1 Second input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. 
+ */ +void ggml_op_sub(const INPUT0_DTYPE * __restrict in0, + const INPUT1_DTYPE * __restrict in1, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_binary_n(in0, in1, N, out, [](auto a, auto b) -> OUTPUT_DTYPE { return a - b; }); +} + +#endif // GGML_OP_SUB + +#ifdef GGML_OP_MUL + +/** + * @brief Element-wise multiplication: out[i] = in0[i] * in1[i]. + * + * @param[in] in0 First input array of N elements. + * @param[in] in1 Second input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_op_mul(const INPUT0_DTYPE * __restrict in0, + const INPUT1_DTYPE * __restrict in1, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_binary_n(in0, in1, N, out, [](auto a, auto b) -> OUTPUT_DTYPE { return a * b; }); +} + +#endif // GGML_OP_MUL + +#ifdef GGML_OP_DIV + +/** + * @brief Element-wise division: out[i] = in0[i] / in1[i]. + * + * @param[in] in0 First input array of N elements (dividend). + * @param[in] in1 Second input array of N elements (divisor). + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_op_div(const INPUT0_DTYPE * __restrict in0, + const INPUT1_DTYPE * __restrict in1, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_binary_n(in0, in1, N, out, [](auto a, auto b) -> OUTPUT_DTYPE { return a / b; }); +} + +#endif // GGML_OP_DIV + +#ifdef GGML_OP_ADD_BROADCAST + +/** + * @brief Addition with broadcasting: out[i] = in0[i] + in1[broadcast_idx]. + * + * Broadcasts in1 to match in0's shape using NumPy-style broadcasting rules. + * + * @param[in] in0 First input tile (tile_size elements). + * @param[in] in1 Second input array (broadcasted, may be smaller). + * @param[out] out Output tile (tile_size elements). + * @param[in] tile_size Number of elements in this tile. + * @param[in] tile_idx Index of the current tile (0-based). + * @param[in] src1_ne0 src1 dimension 0. 
+ * @param[in] src1_ne1 src1 dimension 1. + * @param[in] src1_ne2 src1 dimension 2. + * @param[in] src1_ne3 src1 dimension 3. + * @param[in] dst_ne0 dst dimension 0. + * @param[in] dst_ne1 dst dimension 1. + * @param[in] dst_ne2 dst dimension 2. + */ +void ggml_op_add_broadcast(const INPUT0_DTYPE * __restrict in0, + const INPUT1_DTYPE * __restrict in1, + OUTPUT_DTYPE * __restrict out, + int32_t tile_size, + int32_t tile_idx, + int32_t src1_ne0, + int32_t src1_ne1, + int32_t src1_ne2, + int32_t src1_ne3, + int32_t dst_ne0, + int32_t dst_ne1, + int32_t dst_ne2) { + transform_binary_broadcast_n( + in0, in1, out, tile_size, tile_idx, src1_ne0, src1_ne1, src1_ne2, src1_ne3, dst_ne0, + dst_ne1, dst_ne2, + [](auto a, auto b) -> OUTPUT_DTYPE { return static_cast(a + b); }); +} + +#endif // GGML_OP_ADD_BROADCAST + +#ifdef GGML_OP_SUB_BROADCAST + +/** + * @brief Subtraction with broadcasting: out[i] = in0[i] - in1[broadcast_idx]. + * + * Broadcasts in1 to match in0's shape using NumPy-style broadcasting rules. + * + * @param[in] in0 First input tile (tile_size elements). + * @param[in] in1 Second input array (broadcasted, may be smaller). + * @param[out] out Output tile (tile_size elements). + * @param[in] tile_size Number of elements in this tile. + * @param[in] tile_idx Index of the current tile (0-based). + * @param[in] src1_ne0 src1 dimension 0. + * @param[in] src1_ne1 src1 dimension 1. + * @param[in] src1_ne2 src1 dimension 2. + * @param[in] src1_ne3 src1 dimension 3. + * @param[in] dst_ne0 dst dimension 0. + * @param[in] dst_ne1 dst dimension 1. + * @param[in] dst_ne2 dst dimension 2. 
+ */ +void ggml_op_sub_broadcast(const INPUT0_DTYPE * __restrict in0, + const INPUT1_DTYPE * __restrict in1, + OUTPUT_DTYPE * __restrict out, + int32_t tile_size, + int32_t tile_idx, + int32_t src1_ne0, + int32_t src1_ne1, + int32_t src1_ne2, + int32_t src1_ne3, + int32_t dst_ne0, + int32_t dst_ne1, + int32_t dst_ne2) { + transform_binary_broadcast_n( + in0, in1, out, tile_size, tile_idx, src1_ne0, src1_ne1, src1_ne2, src1_ne3, dst_ne0, + dst_ne1, dst_ne2, + [](auto a, auto b) -> OUTPUT_DTYPE { return static_cast(a - b); }); +} + +#endif // GGML_OP_SUB_BROADCAST + +#ifdef GGML_OP_MUL_BROADCAST + +/** + * @brief Multiplication with broadcasting: out[i] = in0[i] * in1[broadcast_idx]. + * + * Broadcasts in1 to match in0's shape using NumPy-style broadcasting rules. + * + * @param[in] in0 First input tile (tile_size elements). + * @param[in] in1 Second input array (broadcasted, may be smaller). + * @param[out] out Output tile (tile_size elements). + * @param[in] tile_size Number of elements in this tile. + * @param[in] tile_idx Index of the current tile (0-based). + * @param[in] src1_ne0 src1 dimension 0. + * @param[in] src1_ne1 src1 dimension 1. + * @param[in] src1_ne2 src1 dimension 2. + * @param[in] src1_ne3 src1 dimension 3. + * @param[in] dst_ne0 dst dimension 0. + * @param[in] dst_ne1 dst dimension 1. + * @param[in] dst_ne2 dst dimension 2. 
+ */ +void ggml_op_mul_broadcast(const INPUT0_DTYPE * __restrict in0, + const INPUT1_DTYPE * __restrict in1, + OUTPUT_DTYPE * __restrict out, + int32_t tile_size, + int32_t tile_idx, + int32_t src1_ne0, + int32_t src1_ne1, + int32_t src1_ne2, + int32_t src1_ne3, + int32_t dst_ne0, + int32_t dst_ne1, + int32_t dst_ne2) { + transform_binary_broadcast_n( + in0, in1, out, tile_size, tile_idx, src1_ne0, src1_ne1, src1_ne2, src1_ne3, dst_ne0, + dst_ne1, dst_ne2, + [](auto a, auto b) -> OUTPUT_DTYPE { return static_cast(a * b); }); +} + +#endif // GGML_OP_MUL_BROADCAST + +#ifdef GGML_OP_DIV_BROADCAST + +/** + * @brief Division with broadcasting: out[i] = in0[i] / in1[broadcast_idx]. + * + * Broadcasts in1 to match in0's shape using NumPy-style broadcasting rules. + * + * @param[in] in0 First input tile (dividend, tile_size elements). + * @param[in] in1 Second input array (divisor, broadcasted). + * @param[out] out Output tile (tile_size elements). + * @param[in] tile_size Number of elements in this tile. + * @param[in] tile_idx Index of the current tile (0-based). + * @param[in] src1_ne0 src1 dimension 0. + * @param[in] src1_ne1 src1 dimension 1. + * @param[in] src1_ne2 src1 dimension 2. + * @param[in] src1_ne3 src1 dimension 3. + * @param[in] dst_ne0 dst dimension 0. + * @param[in] dst_ne1 dst dimension 1. + * @param[in] dst_ne2 dst dimension 2. 
+ */ +void ggml_op_div_broadcast(const INPUT0_DTYPE * __restrict in0, + const INPUT1_DTYPE * __restrict in1, + OUTPUT_DTYPE * __restrict out, + int32_t tile_size, + int32_t tile_idx, + int32_t src1_ne0, + int32_t src1_ne1, + int32_t src1_ne2, + int32_t src1_ne3, + int32_t dst_ne0, + int32_t dst_ne1, + int32_t dst_ne2) { + transform_binary_broadcast_n( + in0, in1, out, tile_size, tile_idx, src1_ne0, src1_ne1, src1_ne2, src1_ne3, dst_ne0, + dst_ne1, dst_ne2, + [](auto a, auto b) -> OUTPUT_DTYPE { return static_cast(a / b); }); +} + +#endif // GGML_OP_DIV_BROADCAST + +} // extern "C" diff --git a/src/ggml-hsa/kernels/iron/binary_ops.py b/src/ggml-hsa/kernels/iron/binary_ops.py new file mode 100644 index 0000000000..6415489a36 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/binary_ops.py @@ -0,0 +1,452 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +""" +IRON kernel implementation for binary element-wise operations. +""" + +from dataclasses import dataclass +from pathlib import Path + +import numpy as np + +from .utils import ( + suppress_import_pyxrt_msg, + arch_aligned_num_elements, + arch_to_device, + max_tile_size, +) + +suppress_import_pyxrt_msg() + +from aie.iron import ( + ObjectFifo, + Program, + Runtime, + Worker, + dtype_to_str, + ExternalFunction, +) +from aie.iron.placers import SequentialPlacer +from aie.iron.controlflow import range_ +from aie.dialects.arith import index_cast +from aie.ir import IntegerType + + +def _ggml_can_repeat(t0_shape: tuple, t1_shape: tuple) -> bool: + """Python reimplementation of ggml_can_repeat. + + Checks if tensor t0 can be repeated to fill tensor t1. + This is the GGML broadcast semantic: t1->ne[i] % t0->ne[i] == 0 for all dims. 
+ + From ggml.c: + bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + return (t1->ne[0]%t0->ne[0] == 0) && + (t1->ne[1]%t0->ne[1] == 0) && + (t1->ne[2]%t0->ne[2] == 0) && + (t1->ne[3]%t0->ne[3] == 0); + } + + Parameters: + t0_shape: Shape of the smaller tensor to be repeated. + t1_shape: Shape of the larger tensor to fill. + + Returns: + True if t0 can be repeated to fill t1. + """ + for i in range(4): + if t1_shape[i] % t0_shape[i] != 0: + return False + return True + + +@dataclass(frozen=True) +class CoreFunctionSpec: + """Specification for a core function to be used in binary operations. + + Attributes: + external_function (ExternalFunction): The external function to be called for the binary operation. + num_elements (int): The total number of elements in the input/output tensors. + """ + + external_function: ExternalFunction + num_elements: int + + @property + def tile_size(self) -> int: + """Returns the tile size used by the external function.""" + return self.external_function.tile_size(0) + + +def _binary_op( + arch: str, + input_tensors: list, + function_spec: CoreFunctionSpec, + output_tensor, +): + """ + Implements output_tensor = op(*input_tensors) + + Parameters: + arch (str): Target architecture. + input_tensors (list): Input tensors. + function_spec (CoreFunctionSpec): Binary operator specification. + output_tensor: Output tensor. + """ + + # Tile size and number of tiles + num_elements = function_spec.num_elements + tile_size = function_spec.tile_size + num_tiles = num_elements // tile_size + if num_elements % tile_size != 0: + raise ValueError( + f"Number of elements ({num_elements}) must be divisible by tile size ({tile_size})." 
+ ) + + # AIE-array data movement with object fifos + input_tile_tys = [ + (np.ndarray[(tile_size,), np.dtype[input_tensor.dtype]]) + for input_tensor in input_tensors + ] + of_ins = [ + ObjectFifo(input_tile_ty, name=f"in{index}") + for index, input_tile_ty in enumerate(input_tile_tys) + ] + output_tile_ty = np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]] + of_out = ObjectFifo(output_tile_ty, name="out") + + # Create a worker to run the task on a compute tile + worker = None + function = function_spec.external_function + + # Task for the core to perform with an external function + def ext_core_fn(of_in0, of_in1, of_out, function): + # Number of sub-vector "tile" iterations + for _ in range_(num_tiles): + elem_in0 = of_in0.acquire(1) + elem_in1 = of_in1.acquire(1) + elem_out = of_out.acquire(1) + function(elem_in0, elem_in1, elem_out, tile_size) + of_in0.release(1) + of_in1.release(1) + of_out.release(1) + + worker = Worker( + ext_core_fn, + fn_args=[x.cons() for x in of_ins] + [of_out.prod(), function], + ) + + # Runtime operations to move data to/from the AIE-array + input_tensor_tys = [ + np.ndarray[(num_elements,), np.dtype[input_tensor.dtype]] + for input_tensor in input_tensors + ] + output_tensor_ty = np.ndarray[(num_elements,), np.dtype[output_tensor.dtype]] + rt = Runtime() + with rt.sequence(*input_tensor_tys, output_tensor_ty) as t: + rt.start(worker) + [rt.fill(of_in.prod(), t[i]) for i, of_in in enumerate(of_ins)] + rt.drain(of_out.cons(), t[-1], wait=True) + + # Place program components (assign them resources on the device) and generate an MLIR module + return Program(arch_to_device(arch), rt).resolve_program(SequentialPlacer()) + + +def _create_external_function( + arch: str, + op_name: str, + input_tensors: list, + output_tensor, +) -> CoreFunctionSpec: + """ + Creates a specification for binary ops. + + Parameters: + arch (str): Target architecture. + op_name (str): Name of the operation. + input_tensors (list): List of input tensors. 
+ output_tensor: Output tensor. + + Returns: + CoreFunctionSpec: Specification for the core function to be used in binary ops. + """ + + num_elements = arch_aligned_num_elements(arch=arch, tensor=output_tensor) + tile_size = max_tile_size(arch, output_tensor.dtype, num_elements) + + current_dir = Path(__file__).resolve().parent + func = ExternalFunction( + name=op_name.lower(), + object_file_name=f"{op_name.lower()}_core_function.o", + source_file=str(current_dir / "binary_ops.cc"), + arg_types=[ + np.ndarray[(tile_size,), np.dtype[input_tensors[0].dtype]], + np.ndarray[(tile_size,), np.dtype[input_tensors[1].dtype]], + np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]], + np.int32, + ], + compile_flags=[ + f"-D{op_name}=1", + f"-DINPUT0_DTYPE={dtype_to_str(input_tensors[0].dtype)}", + f"-DINPUT1_DTYPE={dtype_to_str(input_tensors[1].dtype)}", + f"-DOUTPUT_DTYPE={dtype_to_str(output_tensor.dtype)}", + ], + ) + return CoreFunctionSpec(external_function=func, num_elements=num_elements) + + +@dataclass(frozen=True) +class BroadcastFunctionSpec: + """Specification for a broadcast binary operation. + + Attributes: + external_function (ExternalFunction): The external function for broadcast op. + num_elements_out (int): Total number of elements in output (and src0). + num_elements_src1 (int): Total number of elements in src1 (smaller). + src1_ne (tuple): Shape of src1 as 4-element tuple (ne0, ne1, ne2, ne3). + dst_ne (tuple): Shape of dst as 4-element tuple (ne0, ne1, ne2, ne3). 
+ """ + + external_function: ExternalFunction + num_elements_out: int + num_elements_src1: int + src1_ne: tuple # (ne0, ne1, ne2, ne3) + dst_ne: tuple # (ne0, ne1, ne2, ne3) + + @property + def tile_size(self) -> int: + """Returns the tile size used by the external function.""" + return self.external_function.tile_size(0) + + +def _create_broadcast_external_function( + arch: str, + op_name: str, + input_tensors: list, + output_tensor, +) -> BroadcastFunctionSpec: + """ + Creates a specification for broadcast binary ops. + + In broadcast mode, src1 is smaller than src0/dst and gets repeated. + The kernel receives the full src1 buffer and uses modulo indexing. + + Parameters: + arch (str): Target architecture. + op_name (str): Name of the operation. + input_tensors (list): List of input tensors [src0, src1]. + output_tensor: Output tensor. + + Returns: + BroadcastFunctionSpec: Specification for broadcast binary ops. + """ + num_elements_out = arch_aligned_num_elements(arch=arch, tensor=output_tensor) + num_elements_src1 = arch_aligned_num_elements(arch=arch, tensor=input_tensors[1]) + tile_size = max_tile_size(arch, output_tensor.dtype, num_elements_out) + + # Extract shapes as 4-element tuples for multi-dimensional broadcast indexing + src1_ne = input_tensors[1].shape + dst_ne = output_tensor.shape + + current_dir = Path(__file__).resolve().parent + func = ExternalFunction( + name=f"{op_name.lower()}_broadcast", + object_file_name=f"{op_name.lower()}_broadcast_core_function.o", + source_file=str(current_dir / "binary_ops.cc"), + arg_types=[ + np.ndarray[(tile_size,), np.dtype[input_tensors[0].dtype]], # src0 tile + np.ndarray[ + (num_elements_src1,), np.dtype[input_tensors[1].dtype] + ], # full src1 + np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]], # output tile + np.int32, # tile_size + np.int32, # tile_idx + np.int32, # src1_ne[0] + np.int32, # src1_ne[1] + np.int32, # src1_ne[2] + np.int32, # src1_ne[3] + np.int32, # dst_ne[0] + np.int32, # dst_ne[1] + 
np.int32, # dst_ne[2] + ], + compile_flags=[ + f"-D{op_name}_BROADCAST=1", + f"-DINPUT0_DTYPE={dtype_to_str(input_tensors[0].dtype)}", + f"-DINPUT1_DTYPE={dtype_to_str(input_tensors[1].dtype)}", + f"-DOUTPUT_DTYPE={dtype_to_str(output_tensor.dtype)}", + ], + ) + return BroadcastFunctionSpec( + external_function=func, + num_elements_out=num_elements_out, + num_elements_src1=num_elements_src1, + src1_ne=src1_ne, + dst_ne=dst_ne, + ) + + +def _binary_op_broadcast( + arch: str, + input_tensors: list, + function_spec: BroadcastFunctionSpec, + output_tensor, +): + """ + Binary op with broadcasting - src1 loaded fully once, src0 streamed in tiles. + + Parameters: + arch (str): Target architecture. + input_tensors (list): Input tensors [src0, src1]. + function_spec (BroadcastFunctionSpec): Broadcast operation specification. + output_tensor: Output tensor. + """ + num_elements_out = function_spec.num_elements_out + num_elements_src1 = function_spec.num_elements_src1 + tile_size = function_spec.tile_size + num_tiles = num_elements_out // tile_size + src1_ne = function_spec.src1_ne + dst_ne = function_spec.dst_ne + + if num_elements_out % tile_size != 0: + raise ValueError( + f"Number of elements ({num_elements_out}) must be divisible by tile size ({tile_size})." 
+ ) + + # ObjectFifos for data movement + src0_tile_ty = np.ndarray[(tile_size,), np.dtype[input_tensors[0].dtype]] + src1_full_ty = np.ndarray[(num_elements_src1,), np.dtype[input_tensors[1].dtype]] + out_tile_ty = np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]] + + of_src0 = ObjectFifo(src0_tile_ty, name="src0") + of_src1 = ObjectFifo(src1_full_ty, depth=1, name="src1") # depth=1, load once + of_out = ObjectFifo(out_tile_ty, name="out") + + function = function_spec.external_function + + def ext_core_fn(of_src0, of_src1, of_out, function): + # Acquire src1 once (full buffer) + src1_buf = of_src1.acquire(1) + + for tile_idx in range_(num_tiles): + src0_tile = of_src0.acquire(1) + out_tile = of_out.acquire(1) + + tile_idx_i32 = index_cast(IntegerType.get_signless(32), tile_idx) + # Pass shape elements as individual scalars (compile-time constants) + function( + src0_tile, + src1_buf, + out_tile, + tile_size, + tile_idx_i32, + src1_ne[0], + src1_ne[1], + src1_ne[2], + src1_ne[3], + dst_ne[0], + dst_ne[1], + dst_ne[2], + ) + + of_src0.release(1) + of_out.release(1) + + of_src1.release(1) + + worker = Worker( + ext_core_fn, + fn_args=[of_src0.cons(), of_src1.cons(), of_out.prod(), function], + ) + + # Runtime operations to move data to/from the AIE-array + src0_ty = np.ndarray[(num_elements_out,), np.dtype[input_tensors[0].dtype]] + src1_ty = np.ndarray[(num_elements_src1,), np.dtype[input_tensors[1].dtype]] + out_ty = np.ndarray[(num_elements_out,), np.dtype[output_tensor.dtype]] + + rt = Runtime() + with rt.sequence(src0_ty, src1_ty, out_ty) as (a, b, c): + rt.start(worker) + rt.fill(of_src0.prod(), a) + rt.fill(of_src1.prod(), b) + rt.drain(of_out.cons(), c, wait=True) + + return Program(arch_to_device(arch), rt).resolve_program(SequentialPlacer()) + + +def binary_op( + arch: str, + op_name: str, + input_tensors: list, + output_tensor, +): + """ + IRON generic design for binary operations. 
+ + Supports both element-wise operations (same shape) and broadcasting + (src1 smaller, gets repeated to match src0/dst). + + Parameters: + arch (str): Target architecture. + op_name (str): Name of the operation. + input_tensors (list): List of two input tensors [src0, src1]. + output_tensor: Output tensor. + """ + + if len(input_tensors) != 2: + raise ValueError("Operation requires exactly two input tensors.") + + if ( + any(t.contiguous is False for t in input_tensors) + or output_tensor.contiguous is False + ): + raise ValueError("Input and output tensors must be contiguous in memory.") + + src0_shape = input_tensors[0].shape + src1_shape = input_tensors[1].shape + dst_shape = output_tensor.shape + + # src0 must match output shape + if src0_shape != dst_shape: + raise ValueError(f"src0 shape must match output: {src0_shape} != {dst_shape}") + + # Check if broadcasting is needed + needs_broadcast = src1_shape != dst_shape + + if needs_broadcast: + # Validate broadcasting is supported per GGML semantics + # ggml_can_repeat(src1, dst) checks if src1 can be repeated to fill dst + if not _ggml_can_repeat(src1_shape, dst_shape): + raise ValueError(f"Cannot broadcast: {src1_shape} -> {dst_shape}") + + function_spec = _create_broadcast_external_function( + arch=arch, + op_name=op_name, + input_tensors=input_tensors, + output_tensor=output_tensor, + ) + + return _binary_op_broadcast( + arch=arch, + input_tensors=input_tensors, + function_spec=function_spec, + output_tensor=output_tensor, + ) + else: + # Non-broadcast path: standard element-wise operation + function_spec = _create_external_function( + arch=arch, + op_name=op_name, + input_tensors=input_tensors, + output_tensor=output_tensor, + ) + + return _binary_op( + arch=arch, + input_tensors=input_tensors, + function_spec=function_spec, + output_tensor=output_tensor, + ) diff --git a/src/ggml-hsa/kernels/iron/clamp.cc b/src/ggml-hsa/kernels/iron/clamp.cc new file mode 100644 index 0000000000..2b060af13e --- /dev/null 
+++ b/src/ggml-hsa/kernels/iron/clamp.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved. + +/** + * @file clamp.cc + * @brief Clamp operation for AIE kernels. + */ + +#include "ggml-aie.hpp" + +extern "C" { + +/** + * @brief Clamps each element to a specified range: out[i] = clamp(in[i], min, max). + * + * For each element, if the value is less than min_val, it is set to min_val. + * If greater than max_val, it is set to max_val. Otherwise, it is unchanged. + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + * @param[in] min_val Minimum allowed value (inclusive). + * @param[in] max_val Maximum allowed value (inclusive). + */ +void ggml_op_clamp( + const INPUT_DTYPE * in, OUTPUT_DTYPE * out, int32_t N, float min_val, float max_val) { + for (int32_t i = 0; i < N; ++i) { + if (in[i] < static_cast(min_val)) { + out[i] = static_cast(min_val); + } else if (in[i] > static_cast(max_val)) { + out[i] = static_cast(max_val); + } else { + out[i] = static_cast(in[i]); + } + } +} + +} // extern "C" diff --git a/src/ggml-hsa/kernels/iron/clamp.py b/src/ggml-hsa/kernels/iron/clamp.py new file mode 100644 index 0000000000..48a6572c16 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/clamp.py @@ -0,0 +1,151 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +""" +IRON kernel implementation for the clamp operation. 
+""" + +import struct +from pathlib import Path +from typing import Tuple + +import numpy as np + +from .utils import ( + suppress_import_pyxrt_msg, + arch_aligned_num_elements, + arch_to_device, + max_tile_size, +) + +suppress_import_pyxrt_msg() + +from aie.iron import ( + ObjectFifo, + Program, + Runtime, + Worker, + dtype_to_str, + ExternalFunction, +) +from aie.iron.placers import SequentialPlacer +from aie.iron.controlflow import range_ + + +def _create_external_function( + arch: str, + op_name: str, + input_tensor, + output_tensor, +) -> Tuple[ExternalFunction, int, int]: + """ + Creates an ExternalFunction specification for the clamp operation. + + Parameters: + arch (str): Target architecture (e.g., "aie2", "aie2p"). + op_name (str): Operation name used for function naming and compile flags. + input_tensor (TensorDesc): Input tensor descriptor providing dtype and size. + output_tensor (TensorDesc): Output tensor descriptor providing dtype. + + Returns: + Tuple[ExternalFunction, int, int]: A tuple containing: + - func: The configured ExternalFunction specification. + - num_elements: Architecture-aligned number of elements. + - tile_size: Size of each processing tile. 
+ """ + + num_elements = arch_aligned_num_elements(arch=arch, tensor=input_tensor) + tile_size = max_tile_size(arch, input_tensor.dtype, num_elements) + + current_dir = Path(__file__).resolve().parent + func = ExternalFunction( + name=op_name.lower(), + object_file_name=f"{op_name.lower()}_core_function.o", + source_file=str(current_dir / "clamp.cc"), + arg_types=[ + np.ndarray[(tile_size,), np.dtype[input_tensor.dtype]], + np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]], + np.int32, + np.float32, + np.float32, + ], + compile_flags=[ + f"-DINPUT_DTYPE={dtype_to_str(input_tensor.dtype)}", + f"-DOUTPUT_DTYPE={dtype_to_str(output_tensor.dtype)}", + ], + ) + return func, num_elements, tile_size + + +def clamp(arch: str, input_tensors: list, output_tensor, op_params: bytearray): + """ + IRON design for clamp. + + Clamps each element of the input tensor to the range [min_val, max_val]. + output[i] = max(min_val, min(input[i], max_val)) + + Parameters: + arch (str): Target architecture. + input_tensors (list): List of one input tensor. + output_tensor: Output tensor. + op_params (bytearray): Operation parameters containing min and max values. 
+ """ + + if len(input_tensors) != 1: + raise ValueError("Operation requires exactly one input tensor.") + + if input_tensors[0].contiguous is False or output_tensor.contiguous is False: + raise ValueError("Input and output tensors must be contiguous in memory.") + + if input_tensors[0].shape != output_tensor.shape: + raise ValueError("Input and output tensors must have the same shape.") + + input_tensor = input_tensors[0] + + min_val = struct.unpack_from("f", op_params, 0)[0] + max_val = struct.unpack_from("f", op_params, 4)[0] + + function, num_elements, tile_size = _create_external_function( + arch=arch, + op_name="GGML_OP_CLAMP", + input_tensor=input_tensor, + output_tensor=output_tensor, + ) + + num_tiles = num_elements // tile_size + assert num_elements % tile_size == 0 + + # AIE-array data movement with object fifos + input_tile_ty = np.ndarray[(tile_size,), np.dtype[input_tensor.dtype]] + output_tile_ty = np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]] + of_in = ObjectFifo(input_tile_ty, name="in") + of_out = ObjectFifo(output_tile_ty, name="out") + + # Task for the core to perform with an external function + def ext_core_fn(of_in, of_out, function): + # Number of sub-vector "tile" iterations + for _ in range_(num_tiles): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + function(elem_in, elem_out, tile_size, min_val, max_val) + of_in.release(1) + of_out.release(1) + + # Create a worker to run the task on a compute tile + worker = Worker(ext_core_fn, fn_args=[of_in.cons(), of_out.prod(), function]) + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + input_tensor_ty = np.ndarray[(num_elements,), np.dtype[input_tensor.dtype]] + output_tensor_ty = np.ndarray[(num_elements,), np.dtype[output_tensor.dtype]] + with rt.sequence(input_tensor_ty, output_tensor_ty) as (a_in, b_out): + rt.start(worker) + rt.fill(of_in.prod(), a_in) + rt.drain(of_out.cons(), b_out, wait=True) + + # Place program components (assign them 
resources on the device) and generate an MLIR module + return Program(arch_to_device(arch), rt).resolve_program(SequentialPlacer()) diff --git a/src/ggml-hsa/kernels/iron/count_equal.cc b/src/ggml-hsa/kernels/iron/count_equal.cc new file mode 100644 index 0000000000..3447385c69 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/count_equal.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved. + +/** + * @file count_equal.cc + * @brief Count equal operation for AIE kernels. + */ + +#include <cstring> + +#include "ggml-aie.hpp" + +extern "C" { + +/** + * @brief Counts elements that are equal between two input tensors. + * + * Processes data in tiles for streaming execution. On the first tile + * (tile_idx == 0), initializes the output buffer to 0. Each tile + * reads the accumulated count, adds its local count, and writes back. + * + * Uses vectorized comparison where possible for better performance. + * + * @note The output buffer is passed as int32_t[2] because IRON does not + * support int64_t in ObjectFifos. It is accessed as int64_t internally. + * + * @param[in] in0 First input tile of tile_size elements. + * @param[in] in1 Second input tile of tile_size elements. + * @param[in,out] out Output buffer (2 x int32 = 1 x int64) used as + * running accumulator across tiles. + * @param[in] tile_size Number of elements in this tile. + * @param[in] tile_idx Current tile index (0-based). Tile 0 initializes + * the accumulator. 
+ */ +void ggml_op_count_equal(const INPUT_DTYPE * __restrict in0, + const INPUT_DTYPE * __restrict in1, + int32_t * __restrict out, // Actually int64_t, cast due to IRON limitations + int32_t tile_size, + int32_t tile_idx) { + event0(); + + // Initialize accumulator on first tile + if (tile_idx == 0) { + out[0] = 0; + out[1] = 0; + } + + // Count equal elements using vectorized comparison where possible + constexpr int VEC_SIZE = 16; // 16 x int32 = 512 bits + const int num_full_iters = tile_size / VEC_SIZE; + const int tail_start = num_full_iters * VEC_SIZE; + + int32_t local_count = 0; + + // Vectorized loop + const INPUT_DTYPE * __restrict p0 = in0; + const INPUT_DTYPE * __restrict p1 = in1; + + for (int i = 0; i < num_full_iters; i++) { + aie::vector v0 = aie::load_v(p0); + aie::vector v1 = aie::load_v(p1); + p0 += VEC_SIZE; + p1 += VEC_SIZE; + + // Compare vectors - returns mask where elements are equal + auto mask = aie::eq(v0, v1); + + // Count set bits in mask (number of equal elements) + for (int j = 0; j < VEC_SIZE; j++) { + if (mask.test(j)) { + local_count++; + } + } + } + + // Scalar tail + for (int i = tail_start; i < tile_size; i++) { + if (in0[i] == in1[i]) { + local_count++; + } + } + + // Accumulate into output buffer + int64_t out64 = 0; + std::memcpy(&out64, out, sizeof(int64_t)); // Read current count (as int64_t) + out64 += local_count; // Add local count + std::memcpy(out, &out64, sizeof(int64_t)); // Write back + + event1(); +} + +} // extern "C" diff --git a/src/ggml-hsa/kernels/iron/count_equal.py b/src/ggml-hsa/kernels/iron/count_equal.py new file mode 100644 index 0000000000..55a843d245 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/count_equal.py @@ -0,0 +1,232 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. 
# or its affiliates

"""
IRON kernel implementation for the count_equal operation.

Counts the number of elements that are equal between two I32 input tensors.
The output is a single I64 value, but since IRON doesn't support I64 in ObjectFifos,
we use two I32 values (low and high parts) for the transfer.
"""

from pathlib import Path

import numpy as np

from .utils import (
    arch_to_device,
    max_tile_size,
    suppress_import_pyxrt_msg,
)

suppress_import_pyxrt_msg()

from aie.dialects.arith import index_cast
from aie.ir import IntegerType
from aie.iron import (
    ExternalFunction,
    ObjectFifo,
    Program,
    Runtime,
    Worker,
    dtype_to_str,
)
from aie.iron.controlflow import range_
from aie.iron.placers import SequentialPlacer


def count_equal_op(arch: str, input_tensors: list, output_tensor, op_params: bytearray):
    """
    IRON design for count_equal.

    Compares two I32 input tensors element by element and produces one I64
    scalar holding the number of equal elements. The inputs are processed in
    tiles.

    IRON does not support I64 element types in ObjectFifos, so the count is
    moved as a buffer of two I32 values whose bytes together form the single
    I64 result.

    Parameters:
        arch (str): Target architecture (e.g., "aie2", "aie2p").
        input_tensors (list[TensorDesc]): List containing exactly two input tensors.
            Both tensors must be I32 with the same shape.
        output_tensor (TensorDesc): Output tensor of type I64 with shape [1,1,1,1]
            containing the count of equal elements.
        op_params (bytearray): Operation parameters (unused for COUNT_EQUAL).

    Returns:
        MLIR module representing the IRON program for count_equal.

    Raises:
        ValueError: If input_tensors does not contain exactly two tensors.
        ValueError: If input tensors have different shapes.
        ValueError: If input or output tensors are not contiguous in memory.
        ValueError: If input tensor dtype is not int32.
        ValueError: If output tensor dtype is not int64.
        ValueError: If the output tensor is not a [1, 1, 1, 1] scalar.
    """

    if len(input_tensors) != 2:
        raise ValueError("Operation requires exactly two input tensors.")

    input_tensor0 = input_tensors[0]
    input_tensor1 = input_tensors[1]

    # Contiguity is checked in the order: first input, second input, output
    contiguity_checks = (
        (input_tensor0, "First input tensor must be contiguous in memory."),
        (input_tensor1, "Second input tensor must be contiguous in memory."),
        (output_tensor, "Output tensor must be contiguous in memory."),
    )
    for tensor, message in contiguity_checks:
        if not tensor.contiguous:
            raise ValueError(message)

    if input_tensor0.shape != input_tensor1.shape:
        raise ValueError(
            f"Input tensor shapes must match: {input_tensor0.shape} != {input_tensor1.shape}"
        )

    # Both inputs must be int32
    if input_tensor0.dtype != np.int32:
        raise ValueError(
            f"First input tensor dtype must be int32, got {input_tensor0.dtype}."
        )
    if input_tensor1.dtype != np.int32:
        raise ValueError(
            f"Second input tensor dtype must be int32, got {input_tensor1.dtype}."
        )

    # The result must be int64
    if output_tensor.dtype != np.int64:
        raise ValueError(
            f"Output tensor dtype must be int64, got {output_tensor.dtype}."
        )

    # The output has to be a single element ...
    if output_tensor.numel() != 1:
        raise ValueError(
            "Output tensor must be a single-element I64 scalar (shape [1, 1, 1, 1]), "
            f"but has {output_tensor.numel()} elements."
        )

    # ... laid out as a four-dimensional GGML scalar
    shape = output_tensor.shape
    is_ggml_scalar = len(shape) == 4 and all(dim == 1 for dim in shape)
    if not is_ggml_scalar:
        raise ValueError(
            "Output tensor must have GGML scalar shape [1, 1, 1, 1], "
            f"but has shape {shape}."
        )

    total_elements = input_tensor0.numel()

    # Use max_tile_size to find a tile size that evenly divides total_elements.
    # This avoids padding/alignment issues with DMA transfers.
+ tile_size = max_tile_size(arch, input_tensor0.dtype, total_elements) + num_tiles = total_elements // tile_size + + function = _create_external_function( + arch=arch, + op_name="GGML_OP_COUNT_EQUAL", + input_tensor=input_tensor0, + tile_size=tile_size, + ) + + # AIE-array data movement with object fifos + # Input: tiles of I32 elements from both tensors + input_tile_ty = np.ndarray[(tile_size,), np.dtype[input_tensor0.dtype]] + # Output: Two I32 values representing the I64 count (low and high parts) + # This is needed because IRON doesn't support I64 in ObjectFifos + output_tile_ty = np.ndarray[(2,), np.dtype[np.int32]] + + of_in0 = ObjectFifo(input_tile_ty, name="in0") + of_in1 = ObjectFifo(input_tile_ty, name="in1") + of_out = ObjectFifo(output_tile_ty, name="out") + + # Task for the core to perform with an external function + def ext_core_fn(of_in0, of_in1, of_out, function, num_tiles): + # Acquire output buffer once at the start + elem_out = of_out.acquire(1) + + # Process all tiles + for tile_idx in range_(num_tiles): + elem_in0 = of_in0.acquire(1) + elem_in1 = of_in1.acquire(1) + # Convert tile_idx from index type to i32 + tile_idx_i32 = index_cast(IntegerType.get_signless(32), tile_idx) + function(elem_in0, elem_in1, elem_out, tile_size, tile_idx_i32) + of_in0.release(1) + of_in1.release(1) + + of_out.release(1) + + # Create a worker to run the task on a compute tile + worker = Worker( + ext_core_fn, + fn_args=[ + of_in0.cons(), + of_in1.cons(), + of_out.prod(), + function, + num_tiles, + ], + ) + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + input_tensor_ty = np.ndarray[(total_elements,), np.dtype[input_tensor0.dtype]] + # Output: 2 x I32 = 8 bytes = 1 x I64 + output_tensor_ty = np.ndarray[(2,), np.dtype[np.int32]] + + with rt.sequence(input_tensor_ty, input_tensor_ty, output_tensor_ty) as ( + a_in0, + a_in1, + b_out, + ): + rt.start(worker) + rt.fill(of_in0.prod(), a_in0) + rt.fill(of_in1.prod(), a_in1) + 
def _create_external_function(
    arch: str,
    op_name: str,
    input_tensor,
    tile_size: int,
) -> ExternalFunction:
    """
    Build the ExternalFunction descriptor for the count_equal AIE kernel.

    The descriptor references ``count_equal.cc`` in the same directory as this
    module and declares the kernel's argument types so the IRON worker can
    call it once per tile.

    Parameters:
        arch (str): Target architecture (e.g., "aie2", "aie2p"). Accepted for
            interface symmetry; this helper does not read it.
        op_name (str): Operation name (e.g., "GGML_OP_COUNT_EQUAL"). Its
            lower-cased form is used as the kernel symbol name and as the
            prefix of the object file name.
        input_tensor (TensorDesc): Input tensor descriptor; only ``dtype`` is read.
        tile_size (int): Number of elements in each input tile.

    Returns:
        ExternalFunction: Specification whose arguments are
        (in0 tile, in1 tile, 2 x int32 output, tile_size, tile_idx) and which
        is compiled with ``-DINPUT_DTYPE=<dtype string>``.
    """
    kernel_name = op_name.lower()

    # Both inputs share the same tile type; the output carries the 64-bit
    # count as two int32 words.
    input_tile_ty = np.ndarray[(tile_size,), np.dtype[input_tensor.dtype]]
    count_words_ty = np.ndarray[(2,), np.dtype[np.int32]]

    kernel_source = Path(__file__).resolve().parent / "count_equal.cc"

    return ExternalFunction(
        name=kernel_name,
        object_file_name=f"{kernel_name}_core_function.o",
        source_file=str(kernel_source),
        arg_types=[
            input_tile_ty,  # in0
            input_tile_ty,  # in1
            count_words_ty,  # out (count as 2 x I32)
            np.int32,  # tile_size
            np.int32,  # tile_idx
        ],
        compile_flags=[f"-DINPUT_DTYPE={dtype_to_str(input_tensor.dtype)}"],
    )
+ +#include "aie_kernel_math.h" +#include "ggml-aie.hpp" + +extern "C" { + +/** + * @brief Computes cross-entropy loss using numerically stable log-softmax. + * + * Computes: loss = -sum(labels * log_softmax(logits)) + * where: log_softmax(x_i) = (x_i - max) - log(sum(exp(x_j - max))) + * + * Three-pass algorithm for numerical stability: + * 1. Find max(logits) to prevent overflow in exp(). + * 2. Compute sum_exp = sum(exp(logits - max)). + * 3. Compute loss = -sum(labels * ((logits - max) - log(sum_exp))). + * + * @param[in] logits Input logits array of N elements (unnormalized scores). + * @param[in] labels Target labels array of N elements (typically one-hot or probabilities). + * @param[out] loss_out Single-element output array receiving the total loss. + * @param[in] N Number of elements. + */ +void ggml_op_cross_entropy_loss(const float * __restrict logits, + const float * __restrict labels, + float * __restrict loss_out, + int32_t N) { + event0(); + + constexpr int32_t VEC_SIZE = KERN_VEC_SIZE; + const int32_t num_full_iters = N / VEC_SIZE; + const int32_t tail_start = num_full_iters * VEC_SIZE; + + // --------------------------------------------------------- + // Find max logit for numerical stability + // --------------------------------------------------------- + auto it_max_in = aie::cbegin_vector(logits); + aie::vector v_max = aie::broadcast(-3.4028235e+38f); + + for (int32_t i = 0; i < num_full_iters; i++) { + aie::vector logit_vec = *it_max_in++; + v_max = aie::max(v_max, logit_vec); + } + + float global_max = aie::reduce_max(v_max); + + // Scalar tail loop for remaining elements + for (int32_t i = tail_start; i < N; i++) { + if (logits[i] > global_max) + global_max = logits[i]; + } + + aie::vector v_global_max = aie::broadcast(global_max); + + // --------------------------------------------------------- + // Compute sum_exp = sum(exp(logits - max)) + // --------------------------------------------------------- + auto it_logits = 
aie::cbegin_vector(logits); + aie::accum v_sum_exp_accum = aie::zeros(); + + for (int32_t i = 0; i < num_full_iters; i++) { + aie::vector logit_vec = *it_logits++; + aie::vector x = aie::sub(logit_vec, v_global_max); + aie::vector exp_val = vec_exp(x); + v_sum_exp_accum = aie::add(v_sum_exp_accum, exp_val); + } + + aie::vector v_sum_exp = v_sum_exp_accum.to_vector(); + float sum_exp = aie::reduce_add(v_sum_exp); + + // Scalar tail loop for remaining elements + for (int32_t i = tail_start; i < N; i++) { + sum_exp += scalar_exp(logits[i] - global_max); + } + + // --------------------------------------------------------- + // Compute log(sum_exp) using range-reduced scalar log + // --------------------------------------------------------- + float log_sum_exp = scalar_log(sum_exp); + + // --------------------------------------------------------- + // Compute cross entropy loss in log-space + // + // log_softmax(x_i) = (x_i - max) - log(sum_exp) + // loss = -sum( labels * log_softmax ) + // --------------------------------------------------------- + it_logits = aie::cbegin_vector(logits); + auto it_labels = aie::cbegin_vector(labels); + + aie::accum v_loss_accum = aie::zeros(); + aie::vector v_log_sum_exp = aie::broadcast(log_sum_exp); + + for (int32_t i = 0; i < num_full_iters; i++) { + aie::vector logit_vec = *it_logits++; + aie::vector label_vec = *it_labels++; + + // log_softmax = (logits - max) - log(sum_exp) + aie::vector log_softmax = aie::sub(logit_vec, v_global_max); + log_softmax = aie::sub(log_softmax, v_log_sum_exp); + + // Accumulate: labels * log_softmax + aie::accum product = aie::mul(log_softmax, label_vec); + v_loss_accum = aie::add(v_loss_accum, product.to_vector()); + } + + // Reduce to scalar loss + aie::vector v_loss = v_loss_accum.to_vector(); + float total_loss = aie::reduce_add(v_loss); + + // Scalar tail loop for remaining elements + for (int32_t i = tail_start; i < N; i++) { + float log_softmax_i = (logits[i] - global_max) - log_sum_exp; + 
total_loss += labels[i] * log_softmax_i; + } + + // Store negated loss (cross entropy is -sum) + loss_out[0] = -total_loss; + + event1(); +} + +} // extern "C" diff --git a/src/ggml-hsa/kernels/iron/cross_entropy_loss.py b/src/ggml-hsa/kernels/iron/cross_entropy_loss.py new file mode 100644 index 0000000000..7b4df3b1b1 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/cross_entropy_loss.py @@ -0,0 +1,309 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +""" +IRON kernel implementation for the cross entropy loss operation. +""" + +from os import path +from typing import Tuple + +import numpy as np + +from aie.iron.controlflow import range_ +from aie.iron.placers import SequentialPlacer + +from .utils import ( + arch_to_device, + suppress_import_pyxrt_msg, +) + +suppress_import_pyxrt_msg() + +from aie.dialects import arith as arith_dialect +from aie.dialects import memref as memref_dialect +from aie.ir import F32Type, FloatAttr, IndexType, IntegerAttr +from aie.iron import ( + ExternalFunction, + ObjectFifo, + Program, + Runtime, + Worker, +) + + +def get_cross_entropy_loss_dimensions(tensor) -> Tuple[int, int]: + """ + Extract cross entropy loss dimensions from tensor shape. + + GGML convention: cross entropy loss is computed over dimension 0 (ne00). + GGML shape ordering: (ne00, ne01, ne02, ne03) where ne00 is innermost. + + Parameters: + tensor: Input tensor with shape in GGML order. 
def cross_entropy_loss(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
):
    """
    IRON design for GGML_OP_CROSS_ENTROPY_LOSS implementation.

    Cross entropy loss computes: -sum(labels * log(softmax(logits))) / num_rows
    where the softmax is computed with numerical stability.

    Each row (GGML dimension 0) is sent to the kernel as one tile; the per-row
    losses are reduced on-tile into a single scalar.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of 2 input tensors:
            - input_tensors[0]: Logits tensor (predictions before softmax)
            - input_tensors[1]: Labels tensor (ground truth, often one-hot encoded)
        output_tensor: Output scalar tensor containing the loss value.
        op_params (bytearray): Operation parameters (currently unused).

    Returns:
        The resolved program produced by ``create_reduction_program``.

    Raises:
        ValueError: If the number of inputs is not 2, any tensor is not
            contiguous, the input shapes differ, any tensor is not float32,
            the logits tensor is empty, or the output is not a single element.
    """

    if len(input_tensors) != 2:
        raise ValueError(
            f"Cross entropy loss requires 2 input tensors: {len(input_tensors)}"
        )

    logits_tensor = input_tensors[0]
    labels_tensor = input_tensors[1]

    if not logits_tensor.contiguous:
        raise ValueError("Logits tensor must be contiguous in memory.")
    if not labels_tensor.contiguous:
        raise ValueError("Labels tensor must be contiguous in memory.")
    if not output_tensor.contiguous:
        raise ValueError("Output tensor must be contiguous in memory.")

    if logits_tensor.shape != labels_tensor.shape:
        raise ValueError("Logits and labels tensors must have the same shape.")

    # The C++ kernel signature is (const float*, const float*, float*, int32_t),
    # so any other dtype would be reinterpreted as float on the device.
    for role, tensor in (
        ("Logits", logits_tensor),
        ("Labels", labels_tensor),
        ("Output", output_tensor),
    ):
        if tensor.dtype != np.float32:
            raise ValueError(
                f"{role} tensor dtype must be float32, got {tensor.dtype}."
            )

    # The worker releases, and the runtime drains, exactly one output element.
    if output_tensor.numel() != 1:
        raise ValueError(
            "Output tensor must be a single-element scalar, "
            f"but has {output_tensor.numel()} elements."
        )

    row_length, num_rows = get_cross_entropy_loss_dimensions(logits_tensor)

    # A zero-length row or zero rows would produce zero-sized tiles and a
    # division by zero in the on-tile average.
    if row_length <= 0 or num_rows <= 0:
        raise ValueError(
            f"Logits tensor must not be empty, got shape {logits_tensor.shape}."
        )

    # Tile size equals row length; the kernel handles non-aligned sizes
    # via scalar tail loops after vectorized processing.
    tile_size = row_length

    # Create external function
    function = create_external_function(
        arch=arch,
        logits_tensor=logits_tensor,
        labels_tensor=labels_tensor,
        output_tensor=output_tensor,
        tile_size=tile_size,
    )

    # Create the program with on-tile reduction
    return create_reduction_program(
        arch=arch,
        function=function,
        logits_tensor=logits_tensor,
        labels_tensor=labels_tensor,
        output_tensor=output_tensor,
        tile_size=tile_size,
        num_rows=num_rows,
    )
For each row: save accumulated value, call kernel (which overwrites + the buffer with this row's loss), then add accumulated + row loss + and store back. + 3. After all rows: divide by num_rows and release. + 4. DMA drains exactly 1 float to the host. + + Parameters: + arch (str): Target architecture (e.g., "aie2", "aie2p"). + function (ExternalFunction): The external function for per-row loss. + logits_tensor (TensorDesc): Logits tensor descriptor. + labels_tensor (TensorDesc): Labels tensor descriptor. + output_tensor (TensorDesc): Output tensor descriptor. + tile_size (int): Number of elements per tile (row length). + num_rows (int): Number of rows to process. + + Returns: + MLIR module representing the cross entropy loss program. + """ + num_tiles = num_rows + + logits_tile_ty = np.ndarray[(tile_size,), np.dtype[logits_tensor.dtype]] + labels_tile_ty = np.ndarray[(tile_size,), np.dtype[labels_tensor.dtype]] + # Each tile of output is a single scalar loss value + output_tile_ty = np.ndarray[(1,), np.dtype[output_tensor.dtype]] + + of_logits = ObjectFifo(logits_tile_ty, name="logits") + of_labels = ObjectFifo(labels_tile_ty, name="labels") + of_out = ObjectFifo(output_tile_ty, name="out") + + def ext_core_fn(of_logits, of_labels, of_out, function): + # Acquire the output buffer ONCE — we will accumulate into it + # across all rows and release it only after the final result + # is computed. This produces exactly 1 scalar for the DMA to drain. 
+ elem_out = of_out.acquire(1) + + # Create constants for memref indexing and arithmetic + c0_index = arith_dialect.ConstantOp( + IndexType.get(), IntegerAttr.get(IndexType.get(), 0) + ).result + zero_f32 = arith_dialect.ConstantOp( + F32Type.get(), FloatAttr.get(F32Type.get(), 0.0) + ).result + nr_f32 = arith_dialect.ConstantOp( + F32Type.get(), FloatAttr.get(F32Type.get(), float(num_rows)) + ).result + + # Initialize accumulated loss to zero + memref_dialect.StoreOp(zero_f32, elem_out, [c0_index]) + + for _ in range_(num_tiles): + elem_logits = of_logits.acquire(1) + elem_labels = of_labels.acquire(1) + + # Load accumulated loss BEFORE the kernel overwrites the buffer + prev_loss = memref_dialect.LoadOp(elem_out, [c0_index]).result + + # Kernel computes this row's loss -> writes to elem_out[0] + # loss_out[0] = -sum(labels * log_softmax(logits)) + function(elem_logits, elem_labels, elem_out, tile_size) + + # Load per-row loss that the kernel just wrote + row_loss = memref_dialect.LoadOp(elem_out, [c0_index]).result + + # Accumulate: new_total = prev_loss + row_loss + new_total = arith_dialect.AddFOp(prev_loss, row_loss).result + memref_dialect.StoreOp(new_total, elem_out, [c0_index]) + + of_logits.release(1) + of_labels.release(1) + + # Divide accumulated loss by num_rows to get the average, + # matching the CPU reference: dp[0] = -sum_all_rows / num_rows + total_loss = memref_dialect.LoadOp(elem_out, [c0_index]).result + avg_loss = arith_dialect.DivFOp(total_loss, nr_f32).result + memref_dialect.StoreOp(avg_loss, elem_out, [c0_index]) + + # Release the single output element — DMA drains this 1 float + of_out.release(1) + + worker = Worker( + ext_core_fn, + fn_args=[of_logits.cons(), of_labels.cons(), of_out.prod(), function], + ) + + rt = Runtime() + logits_tensor_ty = np.ndarray[ + (tile_size * num_rows,), np.dtype[logits_tensor.dtype] + ] + labels_tensor_ty = np.ndarray[ + (tile_size * num_rows,), np.dtype[labels_tensor.dtype] + ] + + output_scalar_ty = 
def create_external_function(
    arch: str,
    logits_tensor,
    labels_tensor,
    output_tensor,
    tile_size: int,
):
    """
    Build the ExternalFunction descriptor for the cross entropy loss kernel.

    The descriptor references ``cross_entropy_loss.cc`` next to this module.
    That kernel computes one row's loss,
    ``-sum(labels * log_softmax(logits))``, using max subtraction for
    numerical stability.

    Parameters:
        arch (str): Target architecture (e.g., "aie2", "aie2p"). Not read by
            this helper.
        logits_tensor (TensorDesc): Logits tensor descriptor providing dtype.
        labels_tensor (TensorDesc): Labels tensor descriptor providing dtype.
        output_tensor (TensorDesc): Output tensor descriptor providing dtype.
        tile_size (int): Number of elements per tile (equals row length).

    Returns:
        ExternalFunction: Specification for ``ggml_op_cross_entropy_loss``
        compiled with ``-DKERN_VEC_SIZE=<KERN_VEC_SIZE>``.
    """
    logits_tile_ty = np.ndarray[(tile_size,), np.dtype[logits_tensor.dtype]]
    labels_tile_ty = np.ndarray[(tile_size,), np.dtype[labels_tensor.dtype]]
    scalar_out_ty = np.ndarray[(1,), np.dtype[output_tensor.dtype]]

    kernel_dir = path.dirname(path.realpath(__file__))
    kernel_source = path.join(kernel_dir, "cross_entropy_loss.cc")

    return ExternalFunction(
        name="ggml_op_cross_entropy_loss",
        object_file_name="cross_entropy_loss_core_function.o",
        source_file=kernel_source,
        arg_types=[
            logits_tile_ty,  # logits
            labels_tile_ty,  # labels
            scalar_out_ty,  # output (scalar)
            np.int32,  # tile_size (N)
        ],
        compile_flags=[f"-DKERN_VEC_SIZE={KERN_VEC_SIZE}"],
    )
def main():
    """
    Command-line entry point for generating matrix multiplication MLIR.

    Parses command-line arguments, builds the design with ``my_matmul`` inside
    an MLIR module context, and prints the resulting module to stdout.

    Returns:
        The value returned by ``my_matmul`` when ``--generate-taps`` is given
        (TensorAccessSequence objects for A, B and C); otherwise None.
    """

    def parse_flag(text):
        # argparse's ``type=bool`` treats every non-empty string (including
        # "0" and "False") as True, so boolean options are parsed explicitly.
        lowered = str(text).strip().lower()
        if lowered in ("1", "true", "yes", "on"):
            return True
        if lowered in ("0", "false", "no", "off"):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value (0/1/true/false), got {text!r}"
        )

    argparser = argparse.ArgumentParser(
        prog="AIE Matrix Multiplication MLIR Design (Whole Array)",
        description="Emits MLIR code for a matrix multiplication design of the given input size",
    )
    argparser.add_argument("--dev", type=str, choices=["npu", "npu2"], default="npu")
    argparser.add_argument("-M", type=int, default=512)
    argparser.add_argument("-K", type=int, default=512)
    argparser.add_argument("-N", type=int, default=512)
    argparser.add_argument("-m", type=int, default=64)
    argparser.add_argument("-k", type=int, default=64)
    argparser.add_argument("-n", type=int, default=32)
    argparser.add_argument("--n-aie-cols", type=int, choices=[1, 2, 4, 8], default=4)
    # my_matmul requires the row count; it was previously never supplied.
    argparser.add_argument("--n-aie-rows", type=int, choices=[1, 4], default=4)
    argparser.add_argument("--b-col-maj", type=int, choices=[0, 1], default=0)
    argparser.add_argument("--c-col-maj", type=int, choices=[0, 1], default=0)
    # Whether to use the scalar kernel; this is slow, but can be useful for debugging smaller sizes
    argparser.add_argument("--scalar", type=parse_flag, default=False)
    argparser.add_argument(
        "--emulate-bf16-mmul-with-bfp16", type=parse_flag, default=False
    )
    argparser.add_argument(
        "--dtype_in", type=str, choices=["bf16", "i8", "i16"], default="i16"
    )
    argparser.add_argument(
        "--dtype_out",
        type=str,
        choices=["bf16", "i8", "i16", "f32", "i32"],
        default="i16",
    )
    argparser.add_argument("--trace_size", type=int, default=0)
    argparser.add_argument(
        "--generate-taps",
        action="store_true",
        help="Generate TensorAccessPatterns, a Python object to represent each data transfer "
        "of the input/output matrices. These objects can be used for visualization.",
    )
    args = argparser.parse_args()

    with mlir_mod_ctx() as ctx:
        # Keyword arguments keep the call aligned with my_matmul's signature;
        # the earlier positional call omitted n_aie_rows (shifting every later
        # argument by one) and passed the matmul/zero kernel names swapped.
        # args.dtype_in / args.dtype_out are already dtype strings, so they
        # are used directly in the kernel symbol names.
        maybe_taps = my_matmul(
            dev=args.dev,
            M=args.M,
            K=args.K,
            N=args.N,
            m=args.m,
            k=args.k,
            n=args.n,
            n_aie_cols=args.n_aie_cols,
            n_aie_rows=args.n_aie_rows,
            dtype_in_str=args.dtype_in,
            dtype_out_str=args.dtype_out,
            b_col_maj=args.b_col_maj,
            c_col_maj=args.c_col_maj,
            use_scalar=args.scalar,
            emulate_bf16_mmul_with_bfp16=args.emulate_bf16_mmul_with_bfp16,
            trace_size=args.trace_size,
            zero_fn=f"zero_{args.dtype_out}",
            matmul_fn=f"matmul_{args.dtype_in}_{args.dtype_out}",
            object_file=f"mm_{args.m}x{args.k}x{args.n}.o",
            generate_taps=args.generate_taps,
        )
        # print(ctx.module.operation.verify())
        print(ctx.module)

    if args.generate_taps:
        return maybe_taps
+ c_col_maj (bool): If True, matrix C is in column-major layout. + use_scalar (bool): If True, use scalar kernels (for debugging small sizes). + emulate_bf16_mmul_with_bfp16 (bool): If True, use bfp16 emulation for bf16. + trace_size (int): Size of trace buffer (0 to disable tracing). + zero_fn (str): Name of the zero initialization function. + matmul_fn (str): Name of the matrix multiply accumulate function. + object_file (str): Name of the compiled object file containing kernels. + generate_taps (bool): If True, return TensorAccessPattern objects for visualization. + + Returns: + If generate_taps is True, returns a tuple of TensorAccessSequence objects + for A, B, and C matrices. Otherwise returns None. + """ + n_aie_cores = n_aie_rows * n_aie_cols + + dtype_in = str_to_dtype(dtype_in_str) + dtype_out = str_to_dtype(dtype_out_str) + + if np.issubdtype(dtype_in, np.integer) != np.issubdtype(dtype_out, np.integer): + raise ValueError( + f"Input dtype ({dtype_in}) and output dtype ({dtype_out}) must either both be integral or both be float" + ) + if np.dtype(dtype_out).itemsize < np.dtype(dtype_in).itemsize: + raise ValueError( + f"Output dtype ({dtype_out}) must be equal or larger to input dtype ({dtype_in})" + ) + + # r, s, t are the dimensions required by the microkernel MAC instructions. 
+ # Skip MAC dimension lookup for scalar mode (e.g., f32 has no vectorized MAC) + if not use_scalar: + mac_dims = microkernel_mac_dim_map[dev][dtype_in_str] + if dev == "npu2" and dtype_in_str == "bf16": + r, s, t = mac_dims[emulate_bf16_mmul_with_bfp16] + else: + r, s, t = mac_dims + else: + # Scalar mode doesn't use MAC dimensions + r, s, t = 1, 1, 1 + + # npu is a 4 row x 4 col array + if dev == "npu" and n_aie_cols > 4: + raise AssertionError("Invalid configuration: NPU (Phoenix/Hawk) has 4 columns") + # npu2 is a 4 row x 8 col array + if dev == "npu2" and n_aie_cols > 8: + raise AssertionError( + "Invalid configuration: NPU2 (Strix/Strix Halo/Krackan) has 8 columns" + ) + + # Input matrix A: + # Conceptually, we divide input A into (m * n_rows, k)-sized blocks. These + # blocks are _broadcast_ across AIE core columns, then _distributed_ across + # rows, s.t. each of the n_rows compute cores in a column receives a + # contiguous (m, k)-sized block of A. + if M % (m * n_aie_rows) != 0: + raise ValueError( + f"A must be tileable into (m * n_aie_rows, k)-sized blocks: " + f"M={M} is not divisible by m*n_aie_rows={m}*{n_aie_rows}={m * n_aie_rows}" + ) + + # Both A and B are tiled in the K dimension into size k. + if K % k != 0: + raise ValueError(f"K={K} must be divisible by tile size k={k}") + + # Input matrix B: + # Conceptually, we do the same as with A, but instead of broadcasting + # across columns we broadcast across rows and distribute across columns. + if N % (n * n_aie_cols) != 0: + raise ValueError( + f"B must be tileable into (k, n * n_aie_cols)-sized blocks: " + f"N={N} is not divisible by n*n_aie_cols={n}*{n_aie_cols}={n * n_aie_cols}" + ) + + # r, s, t are the dimensions required by the microkernel MAC instructions. 
+ if not use_scalar: + if m % r != 0: + raise ValueError(f"Tile size m={m} must be divisible by MAC dim r={r}") + if k % s != 0: + raise ValueError(f"Tile size k={k} must be divisible by MAC dim s={s}") + if n % t != 0: + raise ValueError(f"Tile size n={n} must be divisible by MAC dim t={t}") + + # If you get errors during CDO generation due to running out of program + # memory, it may be because too much code is generated due to ObjectFIFO + # loop unrollings. Reducing the depth to 1 here will work around that at + # a big performance cost. + fifo_depth = 2 + + n_tiles_per_core = (M // m) * (N // n) // n_aie_cores + + # When using more AIE columns than n_aie_rows (4) (applicable to NPU2), + # restrict the number of shim/mem tiles to n_aie_rows, + # since we have only n_aie_rows row tiles for matrix A + if n_aie_cols > n_aie_rows: + n_shim_mem_A = n_aie_rows + # When using n_aie_rows (4) or less AIE columns (both NPU and NPU2), + # the number of shim/mem tiles are equal to n_aie_cols. + # We use the distribute pattern in object FIFO (see linking for A below), + # since we have n_aie_rows (4) row tiles for matrix A + else: + n_shim_mem_A = n_aie_cols + + # Integer division when n_aie_cols < 4, otherwise set to 1 + n_A_tiles_per_shim = n_aie_rows // n_aie_cols if n_aie_cols < 4 else 1 + + if dev == "npu": + if n_aie_cols == 1: + dev_ty = AIEDevice.npu1_1col + elif n_aie_cols == 2: + dev_ty = AIEDevice.npu1_2col + elif n_aie_cols == 4: + dev_ty = AIEDevice.npu1 + else: + dev_ty = AIEDevice.npu2 + + # These will hold TensorAccessPattern objects that represent the runtime + # npu_dma_memcpy_nd operations of this design. 
They are only used if generate_taps is true + A_taps = [] + B_taps = [] + C_taps = [] + + @device(dev_ty) + def device_body(): + A_l2_ty = np.ndarray[(m * k * n_A_tiles_per_shim,), np.dtype[dtype_in]] + B_l2_ty = np.ndarray[(k * n,), np.dtype[dtype_in]] + C_l2_ty = np.ndarray[(m * n * n_aie_rows,), np.dtype[dtype_out]] + A_l1_ty = np.ndarray[(m, k), np.dtype[dtype_in]] + B_l1_ty = np.ndarray[(k, n), np.dtype[dtype_in]] + C_l1_ty = np.ndarray[(m, n), np.dtype[dtype_out]] + + # AIE Core Function declarations + zero = external_func(zero_fn, inputs=[C_l1_ty]) + matmul = external_func(matmul_fn, inputs=[A_l1_ty, B_l1_ty, C_l1_ty]) + + # Tile declarations as tile[row][col] + tiles = [ + [tile(col, row) for col in range(0, n_aie_cols)] for row in range(0, 6) + ] + shim_tiles = tiles[0] + mem_tiles = tiles[1] + core_tiles = tiles[2:] + + # AIE-array data movement with object fifos + A_l3l2_fifos = [None] * n_shim_mem_A + A_l2l1_fifos = [None] * n_aie_rows + + B_l3l2_fifos = [None] * n_aie_cols + B_l2l1_fifos = [None] * n_aie_cols + + C_l1l2_fifos = [[None] * n_aie_cols for _ in range(n_aie_rows)] + C_l2l3_fifos = [None] * n_aie_cols + + # Input A + # L3 -> L2 data movement + for i in range(n_shim_mem_A): + A_l3l2_fifos[i] = object_fifo( + f"A_L3L2_{i}", + ( + shim_tiles[2 * i] + if n_aie_cols == 8 + else shim_tiles[i] # alternate columns in full 4x8 NPU2 case + ), + mem_tiles[2 * i] if n_aie_cols == 8 else mem_tiles[i], + fifo_depth, + A_l2_ty, + ) + + # L2 -> L1 data movement + for row in range(n_aie_rows): + A_l2l1_fifos[row] = object_fifo( + f"A_L2L1_{row}", + ( + mem_tiles[2 * row] + if n_aie_cols == 8 + else mem_tiles[row // n_A_tiles_per_shim] + ), + core_tiles[row][0:n_aie_cols], # broadcast along one row + fifo_depth, + A_l1_ty, + ( + [ + (m // r, r * k), + (k // s, s), + (r, k), + (s, 1), + ] + if not use_scalar + else [] + ), + ) + + # A_l3_l2 and A_l2_l1 object FIFO linking + for i in range(n_shim_mem_A): + # If n_shim_mem_A == n_rows, n_A_tiles_per_shim is 1 
and + # this simply links a_l3l2_fifos[i] to a_l2l1_fifos[i] directly, + # If n_shim_mem_A < n_rows, each column receives multiple rows of + # tiles; distribute it along rows of AIE cores. + start_row = i * n_A_tiles_per_shim + stop_row = start_row + n_A_tiles_per_shim + if stop_row - start_row > 1: + of_offsets = [m * k * j for j in range(stop_row - start_row)] + else: + of_offsets = [] + object_fifo_link( + A_l3l2_fifos[i], + [A_l2l1_fifos[j] for j in range(start_row, stop_row)], + [], + of_offsets, + ) + + # Input B + for col in range(n_aie_cols): + # L3 -> L2 data movement + B_l3l2_fifos[col] = object_fifo( + f"B_L3L2_{col}", + shim_tiles[col], + mem_tiles[col], + fifo_depth, + B_l2_ty, + ) + # L2 -> L1 data movement + B_l2l1_fifos[col] = object_fifo( + f"B_L2L1_{col}", + mem_tiles[col], + [ + core_tiles[j][col] for j in range(n_aie_rows) + ], # broadcast along one column + fifo_depth, + B_l1_ty, + ( + ( + [ + (k // s, s * n), + (n // t, t), + (s, n), + (t, 1), + ] + if not b_col_maj + else [ + (n // t, t * k), + (k // s, s), + (t, k), + (s, 1), + ] + ) + if not use_scalar + else [] + ), + ) + # B_l3_l2 and B_l2_l1 object FIFO linking + object_fifo_link(B_l3l2_fifos[col], B_l2l1_fifos[col]) + + # Output C + for col in range(n_aie_cols): + for row in range(n_aie_rows): + C_l1l2_fifos[row][col] = object_fifo( + f"C_L1L2_{col}_{row}", + core_tiles[row][col], + mem_tiles[col], + fifo_depth, + C_l1_ty, + ) + C_l2l3_fifos[col] = object_fifo( + f"C_L2L3_{col}", + mem_tiles[col], + shim_tiles[col], + fifo_depth, + C_l2_ty, + ( + ( + [ + (m // r, r * n), + (r, t), + (n // t, r * t), + (t, 1), + ] + if not c_col_maj + else [(n // t, t * m), (t, r), (m // r, r * t), (r, 1)] + ) + if not use_scalar + else [] + ), + ) + if n_aie_rows > 1: + of_offsets = [m * n * i for i in range(n_aie_rows)] + else: + of_offsets = [] + object_fifo_link( + [C_l1l2_fifos[j][col] for j in range(n_aie_rows)], + C_l2l3_fifos[col], + of_offsets, + [], + ) # join along one column + + # Set up 
compute tiles + for row in range(n_aie_rows): + for col in range(n_aie_cols): + # The stack size choice is a workaround explained here: + # https://github.com/Xilinx/mlir-aie/pull/2391#issuecomment-2967432485 + # In summary, the Peano compiler uses a stack size greater than the default one used by this kernel + # (default is 0x400, chess' stack size is smaller). This is only necessary for bf16 through bfp16 emulation on npu2. + # Exceding the stack size leads to wrong results from the kernel, but no error is triggered. + # Stack usage can be checked as explained here: + # https://github.com/Xilinx/llvm-aie/issues/487#issuecomment-2969438585 + @core(core_tiles[row][col], object_file, stack_size=0xD00) + def core_body(): + for _ in range_(0xFFFFFFFF): + loop = ( + range_(n_tiles_per_core) + if n_tiles_per_core > 1 + else range(1) + ) # Workaround for issue #1547 + for _ in loop: + elem_out = C_l1l2_fifos[row][col].acquire( + ObjectFifoPort.Produce, 1 + ) + zero(elem_out) + + for _ in range_(K // k): + elem_in_a = A_l2l1_fifos[row].acquire( + ObjectFifoPort.Consume, 1 + ) + elem_in_b = B_l2l1_fifos[col].acquire( + ObjectFifoPort.Consume, 1 + ) + matmul(elem_in_a, elem_in_b, elem_out) + A_l2l1_fifos[row].release(ObjectFifoPort.Consume, 1) + B_l2l1_fifos[col].release(ObjectFifoPort.Consume, 1) + + C_l1l2_fifos[row][col].release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + @runtime_sequence( + np.ndarray[(M * K,), np.dtype[dtype_in]], + np.ndarray[(K * N,), np.dtype[dtype_in]], + np.ndarray[(M * N,), np.dtype[dtype_out]], + ) + def sequence(A, B, C): + # We are limited in the number of BDs. After synchronizing, we can reuse BDs. + # We only transfer 4 rows of tiles at once before starting a new transfer block. 
+ # tb = transfer block; block of transfers before sync call + tb_max_n_rows = 4 if not c_col_maj else 2 + for tb in range(ceildiv(M // m // n_aie_rows, tb_max_n_rows)): + for pingpong in [0, 1]: + M // m // n_aie_rows // tb_max_n_rows + row_base = tb * tb_max_n_rows + pingpong * tb_max_n_rows // 2 + bd_id_base = 8 * pingpong + tb_n_rows = min( + [tb_max_n_rows // 2, M // m // n_aie_rows - row_base] + ) + if tb_n_rows <= 0: + # for small input sizes, we may not even need a "pong" iteration + break + for col in range(n_aie_cols): + # C Output Transfer: + # The smallest transfer unit is a (m*n_aie_rows)-x-(n)-sized sub-tile of the matrix. + # Transfer one such tile for every (n_aie_cols)-th column, evenly spaced, + # then repeat that (tb_n_rows) times for the next contiguous blocks of rows. + # Each shim will start at a different column offset, transferring interleaved + # columns. For example, shim 0 may transfer the blocks marked 0 below, and shim 1 + # may transfer the blocks marked 1. + # + # N + # ---------------- + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # M |0011 0011 | + # | | + # | | + # | | + # | | + # ---------------- + if not c_col_maj: + C_row_offset = row_base * m * n_aie_rows * N + C_col_offset = col * n + C_offset = C_col_offset + C_row_offset + C_sizes = [ + tb_n_rows, + N // n // n_aie_cols, + m * n_aie_rows, + n, + ] + C_strides = [m * n_aie_rows * N, n * n_aie_cols, N, 1] + else: + C_row_offset = row_base * m * n_aie_rows + C_col_offset = col * n * M + C_offset = C_col_offset + C_row_offset + C_sizes = [N // n // n_aie_cols, n_aie_rows, n, m] + C_strides = [M * n * n_aie_cols, m, M, 1] + npu_dma_memcpy_nd( + metadata=C_l2l3_fifos[col], + bd_id=bd_id_base, + mem=C, + offsets=[0, 0, 0, C_offset], + sizes=C_sizes, + strides=C_strides, + ) + # Use the calculated sizes/strides/offsets to record the data movement + # caused by the above call to npu_dma_memcpy_nd. + # This line does not change MLIR output at all. 
+ if generate_taps: + C_taps.append( + TensorAccessPattern( + (M, N), + offset=C_offset, + sizes=C_sizes, + strides=C_strides, + ) + ) + + for tile_row in range(tb_n_rows): + # A input transfer: + # + # The smallest transfer unit is a (m*n_A_tiles_per_shim)-sized sub-tile of the input matrix. + # Transfer one such tile for every column, contiguously. + # Repeat this transfer with identical tiles a total of (N//n//n_aie_cols) times. + # Each shim transfers the tiles for separate rows. For example, shim 0 may transfer the + # tiles marked 0 below, and shim 1 may transfer the tiles marked 1. + # K + # ---------------- + # |0000000000000000| (repeated N//n//n_aie_cols times) + # |0000000000000000| + # |1111111111111111| + # M |1111111111111111| + # | | + # | | + # | | + # | | + # ---------------- + A_block_offset = ( + (row_base + tile_row) * n_aie_rows * m * K + ) # base address for this transfer block for all BDs + A_row_offset = ( + col * n_A_tiles_per_shim * m * K + ) # base address for the shim in this column + A_offset = A_block_offset + A_row_offset + A_sizes = [ + N // n // n_aie_cols, + K // k, + m * n_A_tiles_per_shim, + k, + ] + A_strides = [0, k, K, 1] + + # always equal to n_aie_rows since we have n_aie_rows row tiles for matrix A + if col < n_aie_rows: + npu_dma_memcpy_nd( + metadata=A_l3l2_fifos[col], + bd_id=bd_id_base + 2 * tile_row + 1, + mem=A, + offsets=[0, 0, 0, A_offset], + sizes=A_sizes, + strides=A_strides, + ) + # # Use the calculated sizes/strides/offsets to record the data movement + # # caused by the above call to npu_dma_memcpy_nd. + # # This line does not change MLIR output at all. + if generate_taps: + A_taps.append( + TensorAccessPattern( + (M, K), + offset=A_offset, + sizes=A_sizes, + strides=A_strides, + ) + ) + + # B input transfer: + # Transfer the first a (n)-wide block of columns of B, + # Then transfer the (n_aie_columns)-th such block, and so on. + # Each shim will start at a different column offset. 
+ # For example, shim 0 may transfer the tiles marked 0 below, + # and shim 1 may transfer the tiles marked 1. + # + # N + # ---------------- + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # K |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # |0011 0011 | + # ---------------- + B_col_offset = col * n if not b_col_maj else col * n * K + if not b_col_maj: + B_sizes = [N // n // n_aie_cols, K // k, k, n] + B_strides = [n * n_aie_cols, k * N, N, 1] + else: + B_sizes = [N // n // n_aie_cols, K // k, n, k] + B_strides = [n * n_aie_cols * K, k, K, 1] + + npu_dma_memcpy_nd( + metadata=B_l3l2_fifos[col], + bd_id=bd_id_base + 2 * tile_row + 2, + mem=B, + offsets=[0, 0, 0, B_col_offset], + sizes=B_sizes, + strides=B_strides, + ) + # # Use the calculated sizes/strides/offsets to record the data movement + # # caused by the above call to npu_dma_memcpy_nd. + # # This line does not change MLIR output at all. + if generate_taps: + B_taps.append( + TensorAccessPattern( + (K, N), + offset=B_col_offset, + sizes=B_sizes, + strides=B_strides, + ) + ) + if tb > 0 or (tb == 0 and pingpong > 0): + dma_wait(*C_l2l3_fifos) + dma_wait(*C_l2l3_fifos) + + if generate_taps: + # If generate_taps is true, return a representation of tensor tiles + # representing all the npu_dma_memcpy_nd runtime sequence operations per input/output tensor. + return ( + TensorAccessSequence.from_taps(A_taps), + TensorAccessSequence.from_taps(B_taps), + TensorAccessSequence.from_taps(C_taps), + ) + + +if __name__ == "__main__": + main() + + +# Maximum number of DMA transfer iterations per dimension (hardware limit) +MAX_DMA_ITERATIONS = 64 + + +def _find_tile_size(dim: int, max_tile: int, n_cores: int = 1) -> int: + """ + Find tile size that divides dim evenly, respecting multi-core distribution + and DMA transfer size constraints. + + We need: + 1. dim % (tile * n_cores) == 0 (for even distribution) + 2. dim / tile / n_cores <= MAX_DMA_ITERATIONS (for DMA hardware limit) + 3. 
tile <= max_tile (for memory constraints) + + Args: + dim: The dimension to tile. + max_tile: Maximum allowed tile size (based on memory constraints). + n_cores: Number of cores that will share the dimension. + + Returns: + Tile size satisfying the constraints when one exists; otherwise a best-effort value (max_tile, a smaller divisor, or 1) that the caller must validate. + """ + # Calculate minimum tile size to satisfy DMA iteration limit + # We need: dim / (tile * n_cores) <= MAX_DMA_ITERATIONS + # Therefore: tile >= dim / (MAX_DMA_ITERATIONS * n_cores) + min_tile_for_dma = (dim + MAX_DMA_ITERATIONS * n_cores - 1) // ( + MAX_DMA_ITERATIONS * n_cores + ) + + # If min_tile_for_dma > max_tile, there's no valid tile size + if min_tile_for_dma > max_tile: + # No tile satisfies both memory and DMA constraints + # Return max_tile and let the caller handle the error + return max_tile + + # Find the largest tile <= max_tile and >= min_tile_for_dma such that (tile * n_cores) divides dim + for tile in range(max_tile, min_tile_for_dma - 1, -1): + if dim % (tile * n_cores) == 0: + return tile + + # If no tile in the preferred range works, try smaller tiles down to 1 + # (this may violate DMA constraints but let caller check) + for tile in range(min_tile_for_dma - 1, 0, -1): + if dim % (tile * n_cores) == 0: + return tile + + # Fallback + return 1 + + +def create_mat_mul_external_functions( + arch: str, + input_tensors: list, + output_tensor, + M: int, + K: int, + N: int, +): + """ + Returns the parameters and names of the external functions for matrix multiplication. + + Args: + arch (str): Target architecture. + input_tensors: List of two input tensors. + output_tensor: Output tensor. + M: Number of rows in output matrix. + K: Shared dimension (columns of A, rows of B). + N: Number of columns in output matrix. + + Returns: + A tuple containing: + - m: The block size in the M dimension. + - n: The block size in the N dimension. + - k: The block size in the K dimension. + - use_scalar: Boolean indicating if scalar multiplication is used. 
+ - num_cols, n_aie_rows: The number of AIE columns and rows to use. + - zero_fn, matmul_fn: The ExternalFunction objects for the zeroing and matrix multiplication kernels. + """ + dtype_in_str = dtype_to_str(input_tensors[0].dtype) + use_scalar = dtype_in_str in SCALAR_ONLY_DTYPES + scalar_suffix = "_scalar" if use_scalar else "" + + if arch == "aie2": + num_cols = 4 + n_aie_rows = 4 + default_m, default_n, default_k = 8, 8, 8 + elif arch == "aie2p": + num_cols = 8 + n_aie_rows = 4 + default_m, default_n, default_k = 16, 16, 16 + else: + raise ValueError(f"Unsupported architecture: {arch}") + + if use_scalar: + # f32 scalar matmul on AIE has severe constraints due to lack of vectorized + # MAC and limited tile memory. Only small matrices can be supported. + # For larger matrices, CPU fallback is recommended. + dtype_size = np.dtype(input_tensors[0].dtype).itemsize + + # AIE2 memtile has ~512KB, AIE2 core tile has ~64KB + # We need buffers for A, B, C tiles plus double-buffering + # Conservative limit: keep total tile buffer usage under 32KB + MAX_TILE_BUFFER_BYTES = 32 * 1024 + + # For scalar mode, use single core (1 row × 1 column) to: + # 1. Avoid M divisibility constraints (M just needs to divide by m) + # 2. 
Reduce DMA transfer sizes to stay within hardware limits + num_cols = 1 + n_aie_rows = 1 + + # Calculate maximum tile sizes that fit in memory + # A tile: m × k, B tile: k × n, C tile: m × n + # With double buffering: 2 × (m×k + k×n + m×n) × dtype_size <= MAX_TILE_BUFFER_BYTES + # Simplified: assume m = n = k = tile_size + # 2 × 3 × tile_size² × dtype_size <= MAX_TILE_BUFFER_BYTES + # tile_size <= sqrt(MAX_TILE_BUFFER_BYTES / (6 × dtype_size)) + import math + + max_tile_by_memory = int(math.sqrt(MAX_TILE_BUFFER_BYTES / (6 * dtype_size))) + + # Find tile sizes that satisfy all constraints + # For scalar mode, use max_tile_by_memory directly (not default_m/n/k which are for vectorized mode) + m = _find_tile_size(M, max_tile_by_memory, n_aie_rows) + n = _find_tile_size(N, max_tile_by_memory, num_cols) + k = _find_tile_size(K, max_tile_by_memory, 1) + + # Calculate actual buffer sizes + a_tile_bytes = m * k * dtype_size + b_tile_bytes = k * n * dtype_size + c_tile_bytes = m * n * dtype_size + total_tile_bytes = 2 * (a_tile_bytes + b_tile_bytes + c_tile_bytes) + + if total_tile_bytes > MAX_TILE_BUFFER_BYTES: + raise ValueError( + f"Matrix multiplication {M}×{K}×{N} with dtype {dtype_in_str} requires " + f"{total_tile_bytes} bytes of tile buffer, exceeding AIE limit of " + f"{MAX_TILE_BUFFER_BYTES} bytes. Use CPU backend for this operation." + ) + + # Verify the constraints can be satisfied + if M % (m * n_aie_rows) != 0: + raise ValueError( + f"Matrix dimension M={M} not compatible with AIE tiling " + f"(requires M divisible by {n_aie_rows}*m for some m). " + "Use CPU backend for this operation." 
+ ) + else: + m, n, k = default_m, default_n, default_k + + current_dir = Path(__file__).resolve().parent + source_file = str(current_dir / arch / "mm.cc") + compile_args = [ + f"-DDIM_M={m}", + f"-DDIM_N={n}", + f"-DDIM_K={k}", + f"-D{dtype_to_str(input_tensors[0].dtype)}_{dtype_to_str(output_tensor.dtype)}_ONLY", + "-DB_COL_MAJ", + "-DC_COL_MAJ", + ] + object_file_name = "matmul_core_functions.o" + + zero_fn = ExternalFunction( + name=f"zero{scalar_suffix}_{dtype_to_str(output_tensor.dtype)}", + object_file_name=object_file_name, + source_file=source_file, + arg_types=[np.ndarray[(m, n), np.dtype[output_tensor.dtype]]], + compile_flags=compile_args, + ) + + matmul_fn = ExternalFunction( + name=f"matmul{scalar_suffix}_{dtype_to_str(input_tensors[0].dtype)}_{dtype_to_str(output_tensor.dtype)}", + object_file_name=object_file_name, + source_file=source_file, + arg_types=[ + np.ndarray[(m, k), np.dtype[input_tensors[0].dtype]], + np.ndarray[(k, n), np.dtype[input_tensors[0].dtype]], + np.ndarray[(m, n), np.dtype[output_tensor.dtype]], + ], + compile_flags=compile_args, + ) + + return ( + m, + n, + k, + use_scalar, + num_cols, + n_aie_rows, + zero_fn, + matmul_fn, + ) + + +def gemm(arch: str, input_tensors: list, output_tensor, op_params: bytearray): + """ + IRON design for matrix multiplication. + + Args: + arch (str): Target architecture (e.g., "aie2", "aie2p"). + input_tensors (list): List of two input tensors (A and B). + output_tensor: Output tensor (C). + op_params (bytearray): Operation-specific parameters as a bytearray. + + Returns: + The MLIR module representing the matrix multiplication operation. 
+ """ + if len(input_tensors) != 2: + raise ValueError("Requires two input tensors") + + A = input_tensors[0] # MxK = A.shape(1) x A.shape(0) + B = input_tensors[1] # KxN = B.shape(0) x B.shape(1) + C = output_tensor # MxN = C.shape(0) x C.shape(1) + + # Check for batched operations (ne[2] > 1 or ne[3] > 1) + # The current kernel only supports 2D matrix multiplication, not batched. + if A.shape[2] > 1 or A.shape[3] > 1: + raise ValueError( + f"Batched matrix multiplication not supported: A has batch dims {A.shape[2]}x{A.shape[3]}. " + "Use CPU backend for batched operations." + ) + if B.shape[2] > 1 or B.shape[3] > 1: + raise ValueError( + f"Batched matrix multiplication not supported: B has batch dims {B.shape[2]}x{B.shape[3]}. " + "Use CPU backend for batched operations." + ) + if C.shape[2] > 1 or C.shape[3] > 1: + raise ValueError( + f"Batched matrix multiplication not supported: C has batch dims {C.shape[2]}x{C.shape[3]}. " + "Use CPU backend for batched operations." + ) + + if not A.contiguous or not B.contiguous or not C.contiguous: + raise ValueError("Tensors must be contiguous") + + if A.shape[1] != C.shape[0]: + raise ValueError(f"Incompatible M for A and C: {A.shape[1]} != {C.shape[0]}") + + if B.shape[1] != C.shape[1]: + raise ValueError(f"Incompatible N for B and C: {B.shape[1]} != {C.shape[1]}") + + if A.shape[0] != B.shape[0]: + raise ValueError(f"Incompatible K for A and B: {A.shape[0]} != {B.shape[0]}") + + if arch == "aie2": + dev = "npu" + elif arch == "aie2p": + dev = "npu2" + else: + raise ValueError(f"Unsupported architecture: {arch}") + + ( + m, + n, + k, + use_scalar, + num_cols, + num_rows, + zero_fn, + matmul_fn, + ) = create_mat_mul_external_functions( + arch=arch, + input_tensors=input_tensors, + output_tensor=output_tensor, + M=A.shape[1], + K=A.shape[0], + N=B.shape[1], + ) + + with mlir_mod_ctx() as ctx: + my_matmul( + dev=dev, + M=A.shape[1], + N=B.shape[1], + K=A.shape[0], + m=m, + n=n, + k=k, + n_aie_cols=num_cols, + 
n_aie_rows=num_rows, + dtype_in_str=dtype_to_str(A.dtype), + dtype_out_str=dtype_to_str(C.dtype), + b_col_maj=True, + c_col_maj=True, + use_scalar=use_scalar, + emulate_bf16_mmul_with_bfp16=False, + trace_size=0, + zero_fn=zero_fn._name, + matmul_fn=matmul_fn._name, + object_file=matmul_fn.bin_name, + ) + return ctx.module diff --git a/src/ggml-hsa/kernels/iron/ggml-aie.hpp b/src/ggml-hsa/kernels/iron/ggml-aie.hpp new file mode 100644 index 0000000000..f31866d868 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/ggml-aie.hpp @@ -0,0 +1,51 @@ +// Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All Rights Reserved. + +#pragma once + +/** + * @file ggml-aie.hpp + * @brief Common type definitions and utilities for AIE kernels. + * + * This header provides type aliases and type traits used across AIE kernels. + */ + +#include +#include + +#include "aie_api/aie.hpp" + +using i8 = std::int8_t; ///< Signed 8-bit integer type alias. +using i16 = std::int16_t; ///< Signed 16-bit integer type alias. +using i32 = std::int32_t; ///< Signed 32-bit integer type alias. +using bf16 = bfloat16; ///< Brain floating-point 16-bit type alias. +using f32 = float; ///< 32-bit floating-point type alias. + +/** + * @brief Type trait to check if a type is a floating-point type. + * + * This extends std::is_floating_point to also recognize bfloat16 as a + * floating-point type, which is commonly used in AIE computations. + * + * @tparam T The type to check. + * + * Usage: + * @code + * static_assert(is_floating_point::value); // true + * static_assert(is_floating_point::value); // true + * static_assert(!is_floating_point::value); // true + * @endcode + */ +template +struct is_floating_point + : public std::integral_constant || std::is_same_v> {}; + +/** + * @brief Helper variable template for is_floating_point. + * + * @tparam T The type to check. + * + * @return true if T is a floating-point type (including bfloat16), false otherwise. 
+ */ +template +constexpr bool is_floating_point_v = is_floating_point::value; diff --git a/src/ggml-hsa/kernels/iron/scale.cc b/src/ggml-hsa/kernels/iron/scale.cc new file mode 100644 index 0000000000..0dd3df8c69 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/scale.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + +/** + * @file scale.cc + * @brief Scale and bias operation for AIE kernels. + */ + +#include "ggml-aie.hpp" + +extern "C" { + +/** + * @brief Applies scale and bias to each element: out[i] = in[i] * scale + bias. + * + * @param[in] in Input array of N float elements. + * @param[out] out Output array of N float elements. + * @param[in] N Number of elements to process. + * @param[in] scale Multiplicative scale factor. + * @param[in] bias Additive bias term. + */ +void ggml_op_scale( + const float * __restrict in, float * __restrict out, int32_t N, float scale, float bias) { + for (int i = 0; i < N; ++i) { + out[i] = in[i] * scale + bias; + } +} + +} // extern "C" \ No newline at end of file diff --git a/src/ggml-hsa/kernels/iron/scale.py b/src/ggml-hsa/kernels/iron/scale.py new file mode 100644 index 0000000000..751f5cee0c --- /dev/null +++ b/src/ggml-hsa/kernels/iron/scale.py @@ -0,0 +1,148 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +""" +IRON kernel implementation for the scale operation. 
+""" + +import struct +from pathlib import Path +from typing import Tuple + +import numpy as np + +from .utils import ( + suppress_import_pyxrt_msg, + arch_aligned_num_elements, + arch_to_device, + max_tile_size, +) + +suppress_import_pyxrt_msg() + +from aie.iron import ( + ObjectFifo, + Program, + Runtime, + Worker, + dtype_to_str, + ExternalFunction, +) +from aie.iron.placers import SequentialPlacer +from aie.iron.controlflow import range_ + + +def scale(arch: str, input_tensors: list, output_tensor, op_params: bytearray): + """ + IRON design for scale. + + Parameters: + arch (str): Target architecture. + input_tensors (list): List of one input tensor. + output_tensor: Output tensor. + op_params (bytearray): Operation parameters (scale and bias, packed as two floats). + """ + + if len(input_tensors) != 1: + raise ValueError("Operation requires exactly one input tensor.") + + if input_tensors[0].contiguous is False or output_tensor.contiguous is False: + raise ValueError("Input and output tensors must be contiguous in memory.") + + if input_tensors[0].shape != output_tensor.shape: + raise ValueError("Input and output tensors must have the same shape.") + + input_tensor = input_tensors[0] + + s = struct.unpack_from("f", op_params, 0)[0] + b = struct.unpack_from("f", op_params, 4)[0] + + function, num_elements, tile_size = _create_external_function( + arch=arch, + op_name="GGML_OP_SCALE", + input_tensor=input_tensor, + output_tensor=output_tensor, + ) + + num_tiles = num_elements // tile_size + assert num_elements % tile_size == 0 + + # AIE-array data movement with object fifos + input_tile_ty = np.ndarray[(tile_size,), np.dtype[input_tensor.dtype]] + output_tile_ty = np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]] + of_in = ObjectFifo(input_tile_ty, name="in") + of_out = ObjectFifo(output_tile_ty, name="out") + + # Task for the core to perform with an external function + def ext_core_fn(of_in, of_out, function): + # Number of sub-vector "tile" iterations + for _ in range_(num_tiles): + elem_in = 
of_in.acquire(1) + elem_out = of_out.acquire(1) + function(elem_in, elem_out, tile_size, s, b) + of_in.release(1) + of_out.release(1) + + # Create a worker to run the task on a compute tile + worker = Worker(ext_core_fn, fn_args=[of_in.cons(), of_out.prod(), function]) + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + input_tensor_ty = np.ndarray[(num_elements,), np.dtype[input_tensor.dtype]] + output_tensor_ty = np.ndarray[(num_elements,), np.dtype[output_tensor.dtype]] + with rt.sequence(input_tensor_ty, output_tensor_ty) as (a_in, b_out): + rt.start(worker) + rt.fill(of_in.prod(), a_in) + rt.drain(of_out.cons(), b_out, wait=True) + + # Place program components (assign them resources on the device) and generate an MLIR module + return Program(arch_to_device(arch), rt).resolve_program(SequentialPlacer()) + + +def _create_external_function( + arch: str, + op_name: str, + input_tensor, + output_tensor, +) -> Tuple[ExternalFunction, int, int]: + """ + Creates an ExternalFunction specification for the scale operation. + + Parameters: + arch (str): Target architecture (e.g., "aie2", "aie2p"). + op_name (str): Operation name used for function naming and compile flags. + input_tensor (TensorDesc): Input tensor descriptor providing dtype and size. + output_tensor (TensorDesc): Output tensor descriptor providing dtype. + + Returns: + Tuple[ExternalFunction, int, int]: A tuple containing: + - func: The configured ExternalFunction specification. + - num_elements: Architecture-aligned number of elements. + - tile_size: Size of each processing tile. 
+ """ + + num_elements = arch_aligned_num_elements(arch=arch, tensor=input_tensor) + tile_size = max_tile_size(arch, input_tensor.dtype, num_elements) + + current_dir = Path(__file__).resolve().parent + func = ExternalFunction( + name=f"{op_name.lower()}", + object_file_name=f"{op_name.lower()}_core_function.o", + source_file=str(current_dir / "scale.cc"), + arg_types=[ + np.ndarray[(tile_size,), np.dtype[input_tensor.dtype]], + np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]], + np.int32, + np.float32, + np.float32, + ], + compile_flags=[ + f"-DINPUT_DTYPE={dtype_to_str(input_tensor.dtype)}", + f"-DOUTPUT_DTYPE={dtype_to_str(output_tensor.dtype)}", + ], + ) + return func, num_elements, tile_size diff --git a/src/ggml-hsa/kernels/iron/softmax.cc b/src/ggml-hsa/kernels/iron/softmax.cc new file mode 100644 index 0000000000..e283e17d6b --- /dev/null +++ b/src/ggml-hsa/kernels/iron/softmax.cc @@ -0,0 +1,342 @@ +// Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved. + +#include "aie_kernel_math.h" +#include "ggml-aie.hpp" +#include + +extern "C" { +#ifdef GGML_OP_SOFT_MAX +/** + * @brief Computes softmax without mask or sink tensors. + * + * Implements the numerically stable softmax: softmax(x_i) = exp(x_i - max) / sum(exp(x_j - max)) + * + * Algorithm: + * 1. Find global maximum of scaled inputs for numerical stability. + * 2. Compute exp(scale*x - max) for each element and accumulate sum. + * 3. Normalize by dividing each exp value by the sum. + * + * @param[in] in Input tensor of size N elements. + * @param[out] out Output tensor of size N elements (can alias in). + * @param[in] N Number of elements in the row (must be divisible by KERN_VEC_SIZE). + * @param[in] scale Scale factor applied to input before softmax. + * @param[in] max_bias Unused in this variant (kept for API consistency). 
+ */ +void ggml_op_soft_max(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N, + float scale, + float max_bias) { + event0(); + + constexpr int VEC_SIZE = KERN_VEC_SIZE; + const int num_iters = N / VEC_SIZE; + + auto it_in = aie::cbegin_vector((float *)in); + auto it_exp_out = aie::begin_vector((float *)out); + auto it_scale_in = aie::cbegin_restrict_vector((float *)out); + auto it_soft_out = aie::begin_restrict_vector((float *)out); + + // find max value for numerical stability + + auto it_max_in = aie::cbegin_vector((float *)in); + aie::vector v_max = aie::broadcast(-3.4028235e+38f); + + for (int i = 0; i < num_iters; i++) { + aie::vector input_vec = *it_max_in++; + aie::accum scaled_accum = aie::mul(input_vec, scale); + aie::vector scaled_input = scaled_accum.to_vector(); + v_max = aie::max(v_max, scaled_input); + } + + float global_max = aie::reduce_max(v_max); + aie::vector v_global_max = aie::broadcast(global_max); + + // compute exp(x - max) and sum + + aie::accum v_sum_accum = aie::zeros(); + + for (int i = 0; i < num_iters; i++) { + aie::vector input_vec = *it_in++; + + // apply scale + aie::accum scaled_accum = aie::mul(input_vec, scale); + aie::vector scaled_input = scaled_accum.to_vector(); + + // subtract max for numerical stability + aie::vector x = aie::sub(scaled_input, v_global_max); + + // compute exp(x) + aie::vector exp_val = vec_exp(x); + + // accumulate sum + v_sum_accum = aie::add(v_sum_accum, exp_val); + + // store exp values + *it_exp_out++ = exp_val; + } + + // normalize by dividing by sum + + aie::vector v_sum_vec = v_sum_accum.to_vector(); + float sum_total = aie::reduce_add(v_sum_vec); + float sum_inv = aie::inv(sum_total); + + for (int i = 0; i < num_iters; i++) { + aie::vector in_elems = *it_scale_in++; + aie::accum out_accum = aie::mul(in_elems, sum_inv); + *it_soft_out++ = out_accum.to_vector(); + } + + event1(); +} + +#endif // GGML_OP_SOFT_MAX + +#ifdef GGML_OP_SOFT_MAX_WITH_MASK +/** + * @brief 
Computes softmax with a mask tensor and ALiBi position biases. + * + * Implements: softmax(scale*x + slope*mask) where slope is computed via ALiBi. + * + * Algorithm: + * 1. Compute ALiBi slope for current head based on tile_idx and rows_per_head. + * 2. Find global maximum of (scale*input + slope*mask) for numerical stability. + * 3. Compute exp(scale*x + slope*mask - max) and accumulate sum. + * 4. Normalize by dividing each exp value by the sum. + * + * @param[in] in Input tensor of size N elements. + * @param[in] mask Mask tensor of size N elements (e.g., causal attention mask). + * @param[out] out Output tensor of size N elements (can alias in). + * @param[in] N Number of elements in the row (must be divisible by KERN_VEC_SIZE). + * @param[in] scale Scale factor applied to input. + * @param[in] max_bias Maximum ALiBi bias value. + * @param[in] n_head Total number of attention heads. + * @param[in] tile_idx Current tile index (used to determine head index). + * @param[in] rows_per_head Number of rows per attention head. 
+ */ +void ggml_op_soft_max_with_mask(const INPUT_DTYPE * __restrict in, + const MASK_DTYPE * __restrict mask, + OUTPUT_DTYPE * __restrict out, + int32_t N, + float scale, + float max_bias, + int32_t n_head, + int32_t tile_idx, + int32_t rows_per_head) { + event0(); + + constexpr int VEC_SIZE = KERN_VEC_SIZE; + const int num_iters = N / VEC_SIZE; + + // compute ALiBi slope + uint32_t head_idx = (uint32_t)(tile_idx / rows_per_head); + float slope = compute_alibi_slope(max_bias, (uint32_t)n_head, head_idx); + + auto it_in = aie::cbegin_vector((float *)in); + auto it_mask = aie::cbegin_vector((float *)mask); + auto it_exp_out = aie::begin_vector((float *)out); + auto it_scale_in = aie::cbegin_restrict_vector((float *)out); + auto it_soft_out = aie::begin_restrict_vector((float *)out); + + // find max(scale * in + slope * mask) + + auto it_max_in = aie::cbegin_vector((float *)in); + auto it_max_mask = aie::cbegin_vector((float *)mask); + aie::vector v_max = aie::broadcast(-3.4028235e+38f); + + for (int i = 0; i < num_iters; i++) { + aie::vector input_vec = *it_max_in++; + aie::vector mask_vec = *it_max_mask++; + + // scaled_input = in * scale + aie::accum scaled_accum = aie::mul(input_vec, scale); + aie::vector scaled_input = scaled_accum.to_vector(); + + // scaled_mask = mask * slope (ALiBi) + aie::accum mask_accum = aie::mul(mask_vec, slope); + aie::vector scaled_mask = mask_accum.to_vector(); + + // masked_input = scaled_input + scaled_mask + aie::vector masked_input = aie::add(scaled_input, scaled_mask); + + v_max = aie::max(v_max, masked_input); + } + + float global_max = aie::reduce_max(v_max); + aie::vector v_global_max = aie::broadcast(global_max); + + // compute exp(scale * in + slope * mask - max) and accumulate sum + + aie::accum v_sum_accum = aie::zeros(); + + for (int i = 0; i < num_iters; i++) { + aie::vector input_vec = *it_in++; + aie::vector mask_vec = *it_mask++; + + // scaled_input = in * scale + aie::accum scaled_accum = aie::mul(input_vec, scale); 
+ aie::vector scaled_input = scaled_accum.to_vector(); + + // scaled_mask = mask * slope (ALiBi) + aie::accum mask_accum = aie::mul(mask_vec, slope); + aie::vector scaled_mask = mask_accum.to_vector(); + + // masked_input = scaled_input + scaled_mask + aie::vector masked_input = aie::add(scaled_input, scaled_mask); + + // x = masked_input - max (numerical stability) + aie::vector x = aie::sub(masked_input, v_global_max); + + // exp_val = exp(x) + aie::vector exp_val = vec_exp(x); + + // accumulate sum + v_sum_accum = aie::add(v_sum_accum, exp_val); + + // store exp values for normalization pass + *it_exp_out++ = exp_val; + } + + // normalize by dividing by sum + + aie::vector v_sum_vec = v_sum_accum.to_vector(); + float sum_total = aie::reduce_add(v_sum_vec); + float sum_inv = aie::inv(sum_total); + + for (int i = 0; i < num_iters; i++) { + aie::vector in_elems = *it_scale_in++; + aie::accum out_accum = aie::mul(in_elems, sum_inv); + *it_soft_out++ = out_accum.to_vector(); + } + + event1(); +} + +#endif // GGML_OP_SOFT_MAX_WITH_MASK + +#ifdef GGML_OP_SOFT_MAX_WITH_MASK_AND_SINKS +/** + * @brief Computes softmax with mask and sink (attention sink) tensors. + * + * This variant includes a "sink" value per attention head that participates + * in the softmax normalization but is not stored in the output. Used for + * streaming attention where early tokens act as attention sinks. + * + * Algorithm: + * 1. Get sink value for current head based on tile_idx. + * 2. Find global maximum including both (scale*input + mask) and sink. + * 3. Compute exp(scale*x + mask - max) and accumulate sum. + * 4. Add exp(sink - max) to sum for proper normalization. + * 5. Normalize output by dividing each exp value by total sum. + * + * @param[in] in Input tensor of size N elements. + * @param[in] mask Mask tensor of size N elements. + * @param[in] sinks Per-head sink values array (indexed by head_idx). + * @param[out] out Output tensor of size N elements (can alias in). 
+ * @param[in] N Number of elements in the row (must be divisible by KERN_VEC_SIZE). + * @param[in] tile_idx Current tile index (used to determine head index). + * @param[in] rows_per_head Number of rows per attention head. + * @param[in] scale Scale factor applied to input. + * @param[in] max_bias Unused in this variant. + */ +void ggml_op_soft_max_with_mask_and_sinks(const INPUT_DTYPE * __restrict in, + const MASK_DTYPE * __restrict mask, + const SINK_DTYPE * __restrict sinks, + OUTPUT_DTYPE * __restrict out, + int32_t N, + int32_t tile_idx, + int32_t rows_per_head, + float scale, + float max_bias) { + event0(); + + constexpr int VEC_SIZE = KERN_VEC_SIZE; + const int num_iters = N / VEC_SIZE; + + // calculate which head this tile belongs to and get the sink value + int32_t head_idx = tile_idx / rows_per_head; + float sink_val = sinks[head_idx]; + + auto it_in = aie::cbegin_vector((float *)in); + auto it_mask = aie::cbegin_vector((float *)mask); + auto it_exp_out = aie::begin_vector((float *)out); + auto it_scale_in = aie::cbegin_restrict_vector((float *)out); + auto it_soft_out = aie::begin_restrict_vector((float *)out); + + // find max value for numerical stability + auto it_max_in = aie::cbegin_vector((float *)in); + auto it_max_mask = aie::cbegin_vector((float *)mask); + aie::vector v_max = aie::broadcast(-3.4028235e+38f); + + for (int i = 0; i < num_iters; i++) { + aie::vector input_vec = *it_max_in++; + aie::vector mask_vec = *it_max_mask++; + + // scaled_input = in * scale + aie::accum scaled_accum = aie::mul(input_vec, scale); + aie::vector scaled_input = scaled_accum.to_vector(); + + // masked_input = scaled_input + mask + aie::vector masked_input = aie::add(scaled_input, mask_vec); + + v_max = aie::max(v_max, masked_input); + } + + // reduce to scalar max, then include sink in max calculation + float global_max = aie::reduce_max(v_max); + global_max = (global_max > sink_val) ? 
global_max : sink_val; + aie::vector v_global_max = aie::broadcast(global_max); + + // compute exp(scale * in + mask - max) and accumulate sum + aie::accum v_sum_accum = aie::zeros(); + + for (int i = 0; i < num_iters; i++) { + aie::vector input_vec = *it_in++; + aie::vector mask_vec = *it_mask++; + + // scaled_input = in * scale + aie::accum scaled_accum = aie::mul(input_vec, scale); + aie::vector scaled_input = scaled_accum.to_vector(); + + // masked_input = scaled_input + mask + aie::vector masked_input = aie::add(scaled_input, mask_vec); + + // x = masked_input - max (for numerical stability) + aie::vector x = aie::sub(masked_input, v_global_max); + + // exp_val = exp(x) + aie::vector exp_val = vec_exp(x); + + // accumulate sum + v_sum_accum = aie::add(v_sum_accum, exp_val); + + // store exp values for normalization pass + *it_exp_out++ = exp_val; + } + + // reduce sum across vector lanes + aie::vector v_sum_vec = v_sum_accum.to_vector(); + float sum_total = aie::reduce_add(v_sum_vec); + + // compute exp(sink - max) using vec_exp and add to sum + float sink_shifted = sink_val - global_max; + aie::vector sink_vec = aie::broadcast(sink_shifted); + aie::vector sink_exp_vec = vec_exp(sink_vec); + float sink_exp = sink_exp_vec.get(0); + + sum_total += sink_exp; + float sum_inv = aie::inv(sum_total); + + // normalize by multiplying with 1/sum + for (int i = 0; i < num_iters; i++) { + aie::vector in_elems = *it_scale_in++; + aie::accum out_accum = aie::mul(in_elems, sum_inv); + *it_soft_out++ = out_accum.to_vector(); + } + + event1(); +} +#endif // GGML_OP_SOFT_MAX_WITH_MASK_AND_SINKS + +} // extern "C" diff --git a/src/ggml-hsa/kernels/iron/softmax.py b/src/ggml-hsa/kernels/iron/softmax.py new file mode 100644 index 0000000000..c48144112c --- /dev/null +++ b/src/ggml-hsa/kernels/iron/softmax.py @@ -0,0 +1,539 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +""" +IRON kernel implementation for the softmax operation. +""" + +import struct +from pathlib import Path +from typing import Any, Optional, Tuple + +import numpy as np + +from .utils import ( + align_to_arch, + arch_to_device, + suppress_import_pyxrt_msg, +) + +suppress_import_pyxrt_msg() + +from aie.dialects.arith import index_cast +from aie.ir import IntegerType +from aie.iron import ( + ExternalFunction, + ObjectFifo, + Program, + Runtime, + Worker, + dtype_to_str, +) +from aie.iron.controlflow import range_ +from aie.iron.placers import SequentialPlacer + + +def get_softmax_dimensions(tensor) -> Tuple[int, int]: + """ + Extract softmax dimensions from tensor shape. + + GGML convention: softmax is over dimension 0 (ne00). + GGML shape ordering: (ne00, ne01, ne02, ne03) where ne00 is innermost. + + Parameters: + tensor: Input tensor with shape in GGML order. + + Returns: + Tuple of (row_length, num_rows) where: + - row_length = ne00 (dimension over which softmax is computed) + - num_rows = ne01 * ne02 * ne03 (number of independent rows) + """ + shape = tensor.shape + + if len(shape) == 1: + # shape = (ne00,) + return shape[0], 1 + elif len(shape) == 2: + # shape = (ne00, ne01) + return shape[0], shape[1] + elif len(shape) == 3: + # shape = (ne00, ne01, ne02) + return shape[0], shape[1] * shape[2] + elif len(shape) == 4: + # shape = (ne00, ne01, ne02, ne03) + return shape[0], shape[1] * shape[2] * shape[3] + else: + raise ValueError(f"Unsupported tensor rank: {len(shape)}") + + +# Vector size for AIE kernel vector operations. +# This must match the vector width used in the C++ kernel implementation +# and constrains row lengths to be multiples of this value. +KERN_VEC_SIZE = 8 + + +def softmax(arch: str, input_tensors: list, output_tensor, op_params: bytearray): + """ + IRON design for softmax. 
+ + Parameters: + arch (str): Target architecture. + input_tensors (list): List of 1-3 input tensors: + - input_tensors[0]: Input tensor (required) + - input_tensors[1]: Mask tensor (optional) + - input_tensors[2]: Sink tensor (optional) + output_tensor: Output tensor. + op_params (bytearray): Operation parameters (scale, max_bias). + """ + + input_tensor_count = len(input_tensors) + + if input_tensor_count < 1 or input_tensor_count > 3: + raise ValueError(f"Operation requires 1, 2, or 3 tensors: {input_tensor_count}") + + input_tensor = input_tensors[0] + mask_tensor = input_tensors[1] if input_tensor_count >= 2 else None + sink_tensor = input_tensors[2] if input_tensor_count >= 3 else None + + if not input_tensor.contiguous: + raise ValueError("Input tensor must be contiguous in memory.") + if not output_tensor.contiguous: + raise ValueError("Output tensor must be contiguous in memory.") + if mask_tensor is not None and not mask_tensor.contiguous: + raise ValueError("Mask tensor must be contiguous in memory.") + + if sink_tensor is not None: + raise ValueError( + "Softmax with sink tensor is not supported on AIE. " + "AIE tiles are limited to 2 input DMA channels, but softmax with " + "mask and sink requires 3 input streams." + ) + # Uncomment when the DMA-channel issue is resolved + # if sink_tensor is not None and not sink_tensor.contiguous: + # raise ValueError("Sink tensor must be contiguous in memory.") + + # Currently f16 mask is not supported as we use f32 vector instructions. 
+ if mask_tensor is not None and mask_tensor.dtype == np.dtype("bfloat16"): + raise ValueError("Softmax with bfloat16 mask is not supported on AIE.") + + if input_tensor.shape != output_tensor.shape: + raise ValueError("Input and output tensors must have the same shape.") + + # Unpack op_params: scale and max_bias + scale = struct.unpack_from("f", op_params, 0)[0] + max_bias = struct.unpack_from("f", op_params, 4)[0] + + op_name = "GGML_OP_SOFT_MAX" + + if input_tensor_count == 1: + return create_unary_program( + arch, op_name, input_tensor, output_tensor, scale, max_bias + ) + elif input_tensor_count == 2: + return create_binary_program( + arch, op_name, input_tensor, mask_tensor, output_tensor, scale, max_bias + ) + else: # input_tensor_count == 3 + return create_ternary_program( + arch, + op_name, + input_tensor, + mask_tensor, + sink_tensor, + output_tensor, + scale, + max_bias, + ) + + +def create_unary_program(arch, op_name, input_tensor, output_tensor, scale, max_bias): + """ + Creates an IRON program for basic softmax without mask or sink tensors. + + Parameters: + arch (str): Target architecture. + op_name (str): Operation name for the external function. + input_tensor: Input tensor description. + output_tensor: Output tensor description. + scale (float): Scaling factor applied before exponentiation. + max_bias (float): Maximum bias (unused in unary variant). + + Returns: + MLIR module representing the softmax program. 
+ """ + function, num_elements, tile_size = _create_external_function( + arch=arch, + op_name=op_name, + input_tensor=input_tensor, + mask_tensor=None, + sink_tensor=None, + output_tensor=output_tensor, + ) + + num_tiles = num_elements // tile_size + assert num_elements % tile_size == 0 + + input_tile_ty = np.ndarray[(tile_size,), np.dtype[input_tensor.dtype]] + output_tile_ty = np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]] + + of_in = ObjectFifo(input_tile_ty, name="in") + of_out = ObjectFifo(output_tile_ty, name="out") + + def ext_core_fn(of_in, of_out, function): + for _ in range_(num_tiles): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + function(elem_in, elem_out, tile_size, scale, max_bias) + of_in.release(1) + of_out.release(1) + + worker = Worker(ext_core_fn, fn_args=[of_in.cons(), of_out.prod(), function]) + + rt = Runtime() + input_tensor_ty = np.ndarray[(num_elements,), np.dtype[input_tensor.dtype]] + output_tensor_ty = np.ndarray[(num_elements,), np.dtype[output_tensor.dtype]] + + with rt.sequence(input_tensor_ty, output_tensor_ty) as (a_in, b_out): + rt.start(worker) + rt.fill(of_in.prod(), a_in) + rt.drain(of_out.cons(), b_out, wait=True) + + return Program(arch_to_device(arch), rt).resolve_program(SequentialPlacer()) + + +def create_binary_program( + arch, op_name, input_tensor, mask_tensor, output_tensor, scale, max_bias +): + """ + Creates an IRON program for softmax with a mask tensor. + + This variant supports attention masking where the mask is added to the input + before computing softmax. It also supports ALiBi positional encoding when + max_bias > 0. + + Parameters: + arch (str): Target architecture. + op_name (str): Operation name for the external function. + input_tensor: Input tensor description. + mask_tensor: Mask tensor description (added to input before softmax). + output_tensor: Output tensor description. + scale (float): Scaling factor applied before exponentiation. 
+ max_bias (float): Maximum bias for ALiBi positional encoding. + + Returns: + MLIR module representing the masked softmax program. + """ + func_result = _create_external_function( + arch=arch, + op_name=op_name, + input_tensor=input_tensor, + mask_tensor=mask_tensor, + sink_tensor=None, + output_tensor=output_tensor, + ) + function = func_result[0] + num_elements_in = func_result[1] + tile_size_in = func_result[2] + tile_size_mask = func_result[3] + func_result[4] + num_elements_mask = func_result[5] + n_head = func_result[6] + rows_per_head = func_result[7] + + num_tiles_in = num_elements_in // tile_size_in + num_tiles_mask = num_elements_mask // tile_size_mask + + assert num_elements_in % tile_size_in == 0 + assert num_elements_mask % tile_size_mask == 0 + assert num_elements_in == num_elements_mask + assert num_tiles_in == num_tiles_mask + + input_tile_ty = np.ndarray[(tile_size_in,), np.dtype[input_tensor.dtype]] + mask_tile_ty = np.ndarray[(tile_size_mask,), np.dtype[mask_tensor.dtype]] + output_tile_ty = np.ndarray[(tile_size_in,), np.dtype[output_tensor.dtype]] + + of_in = ObjectFifo(input_tile_ty, name="in") + of_mask = ObjectFifo(mask_tile_ty, name="mask") + of_out = ObjectFifo(output_tile_ty, name="out") + + def ext_core_fn(of_in, of_mask, of_out, function): + for tile_idx in range_(num_tiles_in): + elem_in = of_in.acquire(1) + elem_mask = of_mask.acquire(1) + elem_out = of_out.acquire(1) + + tile_idx_i32 = index_cast(IntegerType.get_signless(32), tile_idx) + + function( + elem_in, + elem_mask, + elem_out, + tile_size_in, + scale, + max_bias, + n_head, + tile_idx_i32, + rows_per_head, + ) + of_in.release(1) + of_mask.release(1) + of_out.release(1) + + worker = Worker( + ext_core_fn, fn_args=[of_in.cons(), of_mask.cons(), of_out.prod(), function] + ) + + rt = Runtime() + + input_tensor_ty = np.ndarray[(num_elements_in,), np.dtype[input_tensor.dtype]] + mask_tensor_ty = np.ndarray[(num_elements_mask,), np.dtype[mask_tensor.dtype]] + output_tensor_ty = 
np.ndarray[(num_elements_in,), np.dtype[output_tensor.dtype]] + + with rt.sequence(input_tensor_ty, mask_tensor_ty, output_tensor_ty) as ( + a_in, + a_mask, + b_out, + ): + rt.start(worker) + rt.fill(of_in.prod(), a_in) + rt.fill(of_mask.prod(), a_mask) + rt.drain(of_out.cons(), b_out, wait=True) + + return Program(arch_to_device(arch), rt).resolve_program(SequentialPlacer()) + + +def create_ternary_program( + arch, + op_name, + input_tensor, + mask_tensor, + sink_tensor, + output_tensor, + scale, + max_bias, +): + """ + Softmax with mask tensor and sink tensor. + + Sink tensor contains one value per head. The kernel receives the full + sink array and indexes into it based on tile_idx and rows_per_head. + """ + func_result = _create_external_function( + arch=arch, + op_name=op_name, + input_tensor=input_tensor, + mask_tensor=mask_tensor, + sink_tensor=sink_tensor, + output_tensor=output_tensor, + ) + + function = func_result[0] + num_elements_in = func_result[1] + tile_size_in = func_result[2] + tile_size_mask = func_result[3] + func_result[4] + num_elements_mask = func_result[5] + num_sinks = func_result[6] + rows_per_head = func_result[7] + + num_tiles_in = num_elements_in // tile_size_in + num_tiles_mask = num_elements_mask // tile_size_mask + + assert num_elements_in % tile_size_in == 0 + assert num_elements_mask % tile_size_mask == 0 + assert num_elements_in == num_elements_mask + assert num_tiles_in == num_tiles_mask + + input_tile_ty = np.ndarray[(tile_size_in,), np.dtype[input_tensor.dtype]] + mask_tile_ty = np.ndarray[(tile_size_mask,), np.dtype[mask_tensor.dtype]] + output_tile_ty = np.ndarray[(tile_size_in,), np.dtype[output_tensor.dtype]] + + # entire sink array passed once, not tiled + sink_array_ty = np.ndarray[(num_sinks,), np.dtype[sink_tensor.dtype]] + + of_in = ObjectFifo(input_tile_ty, name="in") + of_mask = ObjectFifo(mask_tile_ty, name="mask") + of_sink = ObjectFifo(sink_array_ty, name="sink", depth=1) # Single buffer + of_out = 
ObjectFifo(output_tile_ty, name="out") + + def ext_core_fn(of_in, of_mask, of_sink, of_out, function): + # acquire sink array once at the start + sink_array = of_sink.acquire(1) + + for tile_idx in range_(num_tiles_in): + elem_in = of_in.acquire(1) + elem_mask = of_mask.acquire(1) + elem_out = of_out.acquire(1) + + # convert tile_idx from index type to i32 + tile_idx_i32 = index_cast(IntegerType.get_signless(32), tile_idx) + + function( + elem_in, + elem_mask, + sink_array, + elem_out, + tile_size_in, + tile_idx_i32, + rows_per_head, + scale, + max_bias, + ) + + of_in.release(1) + of_mask.release(1) + of_out.release(1) + + # release sink array after all tiles processed + of_sink.release(1) + + worker = Worker( + ext_core_fn, + fn_args=[of_in.cons(), of_mask.cons(), of_sink.cons(), of_out.prod(), function], + ) + + rt = Runtime() + + input_tensor_ty = np.ndarray[(num_elements_in,), np.dtype[input_tensor.dtype]] + mask_tensor_ty = np.ndarray[(num_elements_mask,), np.dtype[mask_tensor.dtype]] + sink_tensor_ty = np.ndarray[(num_sinks,), np.dtype[sink_tensor.dtype]] + output_tensor_ty = np.ndarray[(num_elements_in,), np.dtype[output_tensor.dtype]] + + with rt.sequence( + input_tensor_ty, mask_tensor_ty, sink_tensor_ty, output_tensor_ty + ) as (a_in, a_mask, a_sink, b_out): + rt.start(worker) + rt.fill(of_in.prod(), a_in) + rt.fill(of_mask.prod(), a_mask) + rt.fill(of_sink.prod(), a_sink) + rt.drain(of_out.cons(), b_out, wait=True) + + return Program(arch_to_device(arch), rt).resolve_program(SequentialPlacer()) + + +def _create_external_function( + arch: str, + op_name: str, + input_tensor: Any, + mask_tensor: Optional[Any], + sink_tensor: Optional[Any], + output_tensor: Any, +) -> Tuple: + """ + Creates an external function specification for softmax variants. 
+ + Returns: + If no mask or sink tensor: + (func, num_elements_in, tile_size_in) + If mask tensor only: + (func, num_elements_in, tile_size_in, tile_size_mask, num_rows_mask, num_elements_mask, n_head, rows_per_head) + If mask and sink tensor: + (func, num_elements_in, tile_size_in, tile_size_mask, num_rows_mask, num_elements_mask, num_sinks, rows_per_head) + """ + row_length_in, num_rows_in = get_softmax_dimensions(input_tensor) + + # Probably aligning doesn't make sense as we already do not allow unaligned + # inputs but keeping this as we'll need it in the future + tile_size_in = align_to_arch(arch, row_length_in, input_tensor.dtype, KERN_VEC_SIZE) + + # Currently we do not support unaligned row sizes as we use vector + # instructions with a fixed length. + if row_length_in % KERN_VEC_SIZE != 0: + raise ValueError( + f"Input row length ({row_length_in}) must be a multiple of {KERN_VEC_SIZE}. " + ) + num_elements_in = tile_size_in * num_rows_in + + arg_types = [np.ndarray[(tile_size_in,), np.dtype[input_tensor.dtype]]] + compile_flags = [ + f"-DINPUT_DTYPE={dtype_to_str(input_tensor.dtype)}", + f"-DKERN_VEC_SIZE={KERN_VEC_SIZE}", + ] + + result_extra = [] + + if mask_tensor is not None: + row_length_mask, num_rows_mask = get_softmax_dimensions(mask_tensor) + tile_size_mask = align_to_arch( + arch, row_length_mask, mask_tensor.dtype, KERN_VEC_SIZE + ) + tile_size_mask = align_to_arch( + arch, row_length_mask, mask_tensor.dtype, KERN_VEC_SIZE + ) + num_elements_mask = tile_size_mask * num_rows_mask + + if row_length_mask % KERN_VEC_SIZE != 0: + raise ValueError( + f"Mask row length ({row_length_mask}) must be a multiple of {KERN_VEC_SIZE}. 
" + ) + + arg_types.append(np.ndarray[(tile_size_mask,), np.dtype[mask_tensor.dtype]]) + compile_flags.append(f"-DMASK_DTYPE={dtype_to_str(mask_tensor.dtype)}") + result_extra.extend([tile_size_mask, num_rows_mask, num_elements_mask]) + + input_shape = input_tensor.shape + if len(input_shape) >= 3: + n_head = input_shape[2] # ne02 + elif len(input_shape) == 2: + n_head = 1 + else: + n_head = 1 + + rows_per_head = num_rows_in // n_head if n_head > 0 else 1 + result_extra.extend([n_head, rows_per_head]) + + if sink_tensor is not None: + # sink is 1D: one value per head + num_sinks = sink_tensor.shape[0] + rows_per_head = num_rows_in // num_sinks if num_sinks > 0 else 1 + + arg_types.append(np.ndarray[(num_sinks,), np.dtype[sink_tensor.dtype]]) + compile_flags.append(f"-DSINK_DTYPE={dtype_to_str(sink_tensor.dtype)}") + if mask_tensor is not None: + result_extra = result_extra[:-2] + result_extra.extend([num_sinks, rows_per_head]) + + # output tensor + arg_types.append(np.ndarray[(tile_size_in,), np.dtype[output_tensor.dtype]]) + compile_flags.append(f"-DOUTPUT_DTYPE={dtype_to_str(output_tensor.dtype)}") + + arg_types.append(np.int32) # tile_size + + # additional arguments for sink variant + if sink_tensor is not None: + arg_types.append(np.int32) # tile_idx + arg_types.append(np.int32) # rows_per_head + + arg_types.append(np.float32) # scale + arg_types.append(np.float32) # max_bias + + # add ALiBi parameters for mask variant (without sink) + if mask_tensor is not None and sink_tensor is None: + arg_types.append(np.int32) # n_head + arg_types.append(np.int32) # tile_idx (passed dynamically) + arg_types.append(np.int32) # rows_per_head + # determine function name and compile directive + function_name = op_name.lower() + if mask_tensor is not None and sink_tensor is not None: + function_name = function_name + "_with_mask_and_sinks" + compile_flags.append(f"-D{op_name}_WITH_MASK_AND_SINKS=1") + elif mask_tensor is not None: + function_name = function_name + "_with_mask" 
+ compile_flags.append(f"-D{op_name}_WITH_MASK=1") + else: + compile_flags.append(f"-D{op_name}=1") + + current_dir = Path(__file__).resolve().parent + func = ExternalFunction( + name=function_name, + object_file_name=f"{function_name}_core_function.o", + source_file=str(current_dir / "softmax.cc"), + arg_types=arg_types, + compile_flags=compile_flags, + ) + + return (func, num_elements_in, tile_size_in, *result_extra) diff --git a/src/ggml-hsa/kernels/iron/unary_ops.cc b/src/ggml-hsa/kernels/iron/unary_ops.cc new file mode 100644 index 0000000000..2e8d0dadff --- /dev/null +++ b/src/ggml-hsa/kernels/iron/unary_ops.cc @@ -0,0 +1,302 @@ +// Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All Rights Reserved. + +/** + * @file unary_ops.cc + * @brief Scalar unary operations for AIE kernels. + * + * This file implements various element-wise unary operations such as + * sqr, sqrt, abs, sgn, neg, step, relu, hardsigmoid, hardswish, + * floor, ceil, round, and trunc. + */ + +#include "ggml-aie.hpp" + +/** + * @brief Applies a unary operation to each element of an input array. + * + * @tparam T Element type of the input and output arrays. + * @tparam Size Integer type for the count parameter. + * @tparam UnaryOp Callable type that takes a single element and returns the transformed value. + * + * @param[in] in Input array of count elements. + * @param[in] count Number of elements to process. + * @param[out] out Output array of count elements. + * @param[in] op Unary operation to apply to each element. + */ +template +void transform_n(const T * __restrict in, Size count, T * __restrict out, UnaryOp op) { + event0(); + for (Size i = 0; i < count; ++i) { + out[i] = op(in[i]); + } + event1(); +} + +extern "C" { + +#ifdef GGML_OP_SQR + +/** + * @brief Computes the square of each element: out[i] = in[i]^2. + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. 
+ */ +void ggml_op_sqr(const INPUT_DTYPE * __restrict in, OUTPUT_DTYPE * __restrict out, int32_t N) { + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { return v * v; }); +} + +#endif // GGML_OP_SQR + +#ifdef GGML_OP_SQRT + +/** + * @brief Computes the square root of each element: out[i] = sqrt(in[i]). + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_op_sqrt(const INPUT_DTYPE * __restrict in, OUTPUT_DTYPE * __restrict out, int32_t N) { + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { return aie::sqrt(v); }); +} + +#endif // GGML_OP_SQRT + +#ifdef GGML_UNARY_OP_ABS + +/** + * @brief Computes the absolute value of each element: out[i] = |in[i]|. + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_abs(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_n(in, N, out, + [](auto v) -> OUTPUT_DTYPE { return v < static_cast(0) ? -v : v; }); +} + +#endif // GGML_UNARY_OP_ABS + +#ifdef GGML_UNARY_OP_SGN + +/** + * @brief Computes the sign of each element: out[i] = sgn(in[i]). + * + * Returns 1 for positive values, -1 for negative values, and 0 for zero. + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_sgn(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { + return (v > static_cast(0)) + ? static_cast(1) + : ((v < static_cast(0)) ? static_cast(-1) + : static_cast(0)); + }); +} + +#endif // GGML_UNARY_OP_SGN + +#ifdef GGML_UNARY_OP_NEG + +/** + * @brief Negates each element: out[i] = -in[i]. + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. 
+ * @param[in] N Number of elements to process. + */ +void ggml_unary_op_neg(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { return -v; }); +} + +#endif // GGML_UNARY_OP_NEG + +#ifdef GGML_UNARY_OP_STEP + +/** + * @brief Computes the Heaviside step function: out[i] = (in[i] > 0) ? 1 : 0. + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_step(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { return v > 0; }); +} + +#endif // GGML_UNARY_OP_STEP + +#ifdef GGML_UNARY_OP_RELU + +/** + * @brief Applies ReLU activation: out[i] = max(0, in[i]). + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_relu(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { return std::max(v, 0); }); +} + +#endif // GGML_UNARY_OP_RELU + +#ifdef GGML_UNARY_OP_HARDSIGMOID + +/** + * @brief Applies hard sigmoid activation: out[i] = clamp((in[i] + 3) / 6, 0, 1). + * + * A piecewise linear approximation of the sigmoid function. + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_hardsigmoid(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { + return std::min(1, std::max(0, (v + 3) / 6)); + }); +} + +#endif // GGML_UNARY_OP_HARDSIGMOID + +#ifdef GGML_UNARY_OP_HARDSWISH + +/** + * @brief Applies hard swish activation: out[i] = in[i] * hardsigmoid(in[i]). 
+ * + * Computes: x * clamp((x + 3) / 6, 0, 1) + * + * @param[in] in Input array of N elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_hardswish(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { + return v * std::min(1, std::max(0, (v + 3) / 6)); + }); +} + +#endif // GGML_UNARY_OP_HARDSWISH + +#ifdef GGML_UNARY_OP_FLOOR + +/** + * @brief Computes the floor of each element: out[i] = floor(in[i]). + * + * Returns the largest integer less than or equal to the input. + * Input type must be a floating-point type. + * + * @param[in] in Input array of N floating-point elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_floor(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + static_assert(is_floating_point_v, "Input type must be a floating point type"); + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { + if (v == static_cast(v)) { + return static_cast(v); + } + return (v >= static_cast(0)) ? static_cast(v) + : static_cast(v) - 1; + }); +} + +#endif // GGML_UNARY_OP_FLOOR + +#ifdef GGML_UNARY_OP_CEIL + +/** + * @brief Computes the ceiling of each element: out[i] = ceil(in[i]). + * + * Returns the smallest integer greater than or equal to the input. + * Input type must be a floating-point type. + * + * @param[in] in Input array of N floating-point elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_ceil(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + static_assert(is_floating_point_v, "Input type must be a floating point type"); + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { + if (v == static_cast(v)) { + return static_cast(v); + } + return (v >= static_cast(0)) ? 
static_cast(v) + 1 + : static_cast(v); + }); +} + +#endif // GGML_UNARY_OP_CEIL + +#ifdef GGML_UNARY_OP_ROUND + +/** + * @brief Rounds each element to the nearest integer: out[i] = round(in[i]). + * + * Uses round-half-away-from-zero: 0.5 rounds to 1, -0.5 rounds to -1. + * Input type must be a floating-point type. + * + * @param[in] in Input array of N floating-point elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_round(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + static_assert(is_floating_point_v, "Input type must be a floating point type"); + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { + return (v >= static_cast(0)) + ? static_cast(v + static_cast(.5)) + : static_cast(v - static_cast(.5)); + }); +} + +#endif // GGML_UNARY_OP_ROUND + +#ifdef GGML_UNARY_OP_TRUNC + +/** + * @brief Truncates each element toward zero: out[i] = trunc(in[i]). + * + * Returns the integer part by removing the fractional digits. + * Input type must be a floating-point type. + * + * @param[in] in Input array of N floating-point elements. + * @param[out] out Output array of N elements. + * @param[in] N Number of elements to process. + */ +void ggml_unary_op_trunc(const INPUT_DTYPE * __restrict in, + OUTPUT_DTYPE * __restrict out, + int32_t N) { + static_assert(is_floating_point_v, "Input type must be a floating point type"); + transform_n(in, N, out, [](auto v) -> OUTPUT_DTYPE { return static_cast(v); }); +} + +#endif // GGML_UNARY_OP_TRUNC + +} // extern "C" diff --git a/src/ggml-hsa/kernels/iron/unary_ops.py b/src/ggml-hsa/kernels/iron/unary_ops.py new file mode 100644 index 0000000000..22545c580c --- /dev/null +++ b/src/ggml-hsa/kernels/iron/unary_ops.py @@ -0,0 +1,200 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates + +""" +IRON kernel implementation for unary element-wise operations. +""" + +from dataclasses import dataclass +from pathlib import Path + +import numpy as np + +from .utils import ( + suppress_import_pyxrt_msg, + arch_aligned_num_elements, + arch_to_device, + max_tile_size, +) + +suppress_import_pyxrt_msg() + +from aie.iron import ( + ObjectFifo, + Program, + Runtime, + Worker, + dtype_to_str, + ExternalFunction, +) +from aie.iron.placers import SequentialPlacer +from aie.iron.controlflow import range_ + + +@dataclass(frozen=True) +class CoreFunctionSpec: + """Specification for a core function to be used in unary operations. + + Attributes: + external_function (ExternalFunction): The external function to be called for the unary operation. + num_elements (int): The total number of elements in the input/output tensors. + """ + + external_function: ExternalFunction + num_elements: int + + @property + def tile_size(self) -> int: + """Returns the tile size used by the external function.""" + return self.external_function.tile_size(0) + + +def _unary_op( + arch: str, + input_tensors: list, + function_spec: CoreFunctionSpec, + output_tensor, +): + """ + Implements output_tensor = op(input_tensors[0]) + + Parameters: + arch (str): Target architecture. + input_tensors (list): Input tensors. + function_spec (CoreFunctionSpec): Unary operator specification. + output_tensor: Output tensor. 
+ """ + + input_tensor = input_tensors[0] + + # Tile size and number of tiles + num_elements = function_spec.num_elements + tile_size = function_spec.tile_size + num_tiles = num_elements // tile_size + if num_elements % tile_size != 0: + raise ValueError( + f"num_elements ({num_elements}) must be divisible by tile_size ({tile_size}) " + "for correct tiling" + ) + + # AIE-array data movement with object fifos + input_tile_ty = np.ndarray[(tile_size,), np.dtype[input_tensor.dtype]] + output_tile_ty = np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]] + of_in = ObjectFifo(input_tile_ty, name="in") + of_out = ObjectFifo(output_tile_ty, name="out") + + # Create a worker to run the task on a compute tile + worker = None + function = function_spec.external_function + + # Task for the core to perform with an external function + def ext_core_fn(of_in, of_out, function): + # Number of sub-vector "tile" iterations + for _ in range_(num_tiles): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + function(elem_in, elem_out, tile_size) + of_in.release(1) + of_out.release(1) + + worker = Worker(ext_core_fn, fn_args=[of_in.cons(), of_out.prod(), function]) + + # Runtime operations to move data to/from the AIE-array + input_tensor_ty = np.ndarray[(num_elements,), np.dtype[input_tensor.dtype]] + output_tensor_ty = np.ndarray[(num_elements,), np.dtype[output_tensor.dtype]] + rt = Runtime() + with rt.sequence(input_tensor_ty, output_tensor_ty) as t: + rt.start(worker) + rt.fill(of_in.prod(), t[0]) + rt.drain(of_out.cons(), t[-1], wait=True) + + # Place program components (assign them resources on the device) and generate an MLIR module + return Program(arch_to_device(arch), rt).resolve_program(SequentialPlacer()) + + +def _create_external_function( + arch: str, + op_name: str, + input_tensor, + output_tensor, +) -> CoreFunctionSpec: + """ + Creates a specification for unary ops. + + Parameters: + arch (str): Target architecture. + op_name (str): Name of the operation. 
+ input_tensor: Input tensor. + output_tensor: Output tensor. + + Returns: + CoreFunctionSpec: Specification for the core function to be used in unary ops. + """ + + num_elements = arch_aligned_num_elements(arch=arch, tensor=input_tensor) + tile_size = max_tile_size(arch, input_tensor.dtype, num_elements) + + current_dir = Path(__file__).resolve().parent + func = ExternalFunction( + name=op_name.lower(), + object_file_name=f"{op_name.lower()}_core_function.o", + source_file=str(current_dir / "unary_ops.cc"), + arg_types=[ + np.ndarray[(tile_size,), np.dtype[input_tensor.dtype]], + np.ndarray[(tile_size,), np.dtype[output_tensor.dtype]], + np.int32, + ], + compile_flags=[ + f"-D{op_name}=1", + f"-DINPUT_DTYPE={dtype_to_str(input_tensor.dtype)}", + f"-DOUTPUT_DTYPE={dtype_to_str(output_tensor.dtype)}", + ], + ) + return CoreFunctionSpec(external_function=func, num_elements=num_elements) + + +def unary_op( + arch: str, + op_name: str, + input_tensors: list, + output_tensor, +): + """ + IRON design for unary operations. + + Parameters: + arch (str): Target architecture. + op_name (str): Name of the unary operation. + input_tensors (list): List of one input tensor. + output_tensor: Output tensor. 
+ """ + + if len(input_tensors) != 1: + raise ValueError("Operation requires exactly one input tensor.") + + if input_tensors[0].contiguous is False or output_tensor.contiguous is False: + raise ValueError("Input and output tensors must be contiguous in memory.") + + if input_tensors[0].shape != output_tensor.shape: + raise ValueError("Input and output tensors must have the same shape.") + + if output_tensor.shape[1:4] != (1, 1, 1): + raise ValueError(f"Unsupported shape ({output_tensor.shape}).") + + function_spec = _create_external_function( + arch=arch, + op_name=op_name, + input_tensor=input_tensors[0], + output_tensor=output_tensor, + ) + + return _unary_op( + arch=arch, + input_tensors=input_tensors, + function_spec=function_spec, + output_tensor=output_tensor, + ) diff --git a/src/ggml-hsa/kernels/iron/utils.py b/src/ggml-hsa/kernels/iron/utils.py new file mode 100644 index 0000000000..e0da74c119 --- /dev/null +++ b/src/ggml-hsa/kernels/iron/utils.py @@ -0,0 +1,124 @@ +# suppress stderr from aie imports until https://github.com/Xilinx/mlir-aie/issues/2833 +# is resolved + +""" +Utility functions for IRON kernel implementations. +""" + +import contextlib +import os + +import numpy as np + +from aie.iron.device import NPU1, NPU2 + +with open(os.devnull, "w", encoding="utf-8") as _ggml_hsa_devnull: + with contextlib.redirect_stderr(_ggml_hsa_devnull): + import aie.utils as _ggml_hsa_aie_utils + + +def suppress_import_pyxrt_msg(): + """Return the pre-imported aie.utils module with pyxrt message suppressed. + + The aie.utils module is imported once at module import time with stderr + suppressed to avoid noisy pyxrt not found messages. This function is + retained for backward compatibility and simply returns the cached module. + """ + + return _ggml_hsa_aie_utils + + +def align_to_arch( + arch: str, size: int, dtype: np.dtype, alignment_bytes: int = 4 +) -> int: + """ + Align a size to architecture requirements. + + Parameters: + arch (str): Target architecture. 
def max_tile_size(arch: str, dtype: np.dtype, num_elements: int) -> int:
    """
    Returns the maximum tile size based on device, data type and number of elements.

    The starting tile size is 512 divided by the element size; it is then
    halved until it evenly divides `num_elements` (or reaches 1).

    Parameters:
        arch (str): Target architecture ("aie2" or "aie2p").
        dtype (np.dtype): Data type of the tensor elements.
        num_elements (int): Total number of elements in the tensor.

    Returns:
        int: Largest tile size of the form (512 // itemsize) / 2**k that
        divides `num_elements`.

    Raises:
        ValueError: If the architecture is not supported.
    """
    if arch not in ("aie2", "aie2p"):
        raise ValueError(f"Unsupported architecture: {arch}")

    # NOTE(review): the original labels this width as bits, yet it is divided
    # by `itemsize`, which is in bytes. The resulting tile sizes are kept
    # unchanged here; confirm which unit is intended.
    vector_register_width = 512

    # Floor division keeps the computation in integers instead of going
    # through a float and truncating it back.
    tile_size = vector_register_width // dtype.itemsize

    while num_elements % tile_size != 0 and tile_size > 1:
        tile_size //= 2

    assert (
        num_elements % tile_size == 0
    ), f"Number of elements ({num_elements}) must be a multiple of tile size ({tile_size})."

    return tile_size
+ + Returns: + NPU1 for "aie2", NPU2 for "aie2p", or the input if already a device object. + + Raises: + ValueError: If the architecture string is not supported. + """ + if isinstance(device, str): + if device == "aie2": + return NPU1() + elif device == "aie2p": + return NPU2() + else: + raise ValueError(f"Unsupported device: {device}") + return device diff --git a/src/ggml-hsa/kernels/kernel.py b/src/ggml-hsa/kernels/kernel.py new file mode 100644 index 0000000000..59d5334234 --- /dev/null +++ b/src/ggml-hsa/kernels/kernel.py @@ -0,0 +1,92 @@ +# Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved. + +""" +Kernel specification types for the GGML HSA backend. + +This module defines the core data structures used for kernel dispatch and +compilation backend selection. The two-layer architecture separates: + +1. Static mapping (Kernel): Maps GGML operation names to dispatch modules +2. Runtime dispatch (KernelSpec): Returned by dispatch functions to specify + which backend and function to use for compilation + +Example: + # In op_to_kernel_map (static) + "ADD": Kernel("ggml_op_add", "binary_ops.py") + + # At runtime, ggml_op_add() returns: + KernelSpec(backend=Backend.IRON, function=iron_add_fn) +""" + +from collections.abc import Callable +from dataclasses import dataclass +from enum import Enum, auto +from pathlib import Path +from typing import Any + + +class Backend(Enum): + """ + Supported kernel compilation backends. + + Each backend has its own compilation pipeline: + - IRON: Uses MLIR-AIE/IRON framework for optimized AIE kernels + """ + + IRON = auto() + + +@dataclass(frozen=True) +class Kernel: + """ + Static mapping entry from GGML operation to dispatch module. + + This dataclass represents an entry in op_to_kernel_map. It identifies + which Python module contains the dispatch function for a given operation. + + Attributes: + name (str): Name of the dispatch function to call (e.g., "ggml_op_add"). 
+ source_file (str | Path): Path to the Python module containing the + dispatch function. + """ + + name: str + source_file: str | Path + + +@dataclass(frozen=True) +class KernelSpec: + """ + Specification returned by kernel dispatch functions. + + When a kernel dispatch function (e.g., ggml_op_add) is called, it examines + the input parameters and returns a KernelSpec that tells the build system: + 1. Which backend to use for compilation + 2. Which function to call to generate the IR + + This enables per-invocation backend selection based on tensor shapes, + dtypes, and other runtime parameters. + + Attributes: + backend (Backend): The compilation backend to use. + op_name (str): Name of the operation. + arch (str): Target architecture for the kernel. + input_tensors (list): List of input tensors for the operation. + output_tensor (Any): Output tensor for the operation. + op_params (bytearray): Operation parameters. + function (Callable): Callable that generates the backend-specific IR. + """ + + backend: Backend + op_name: str + arch: str + input_tensors: list + output_tensor: Any + op_params: bytearray + function: Callable[..., Any] + + def __post_init__(self): + """Validate that backend is a Backend enum instance.""" + if not isinstance(self.backend, Backend): + backend_type = type(self.backend).__name__ + raise TypeError(f"backend must be a Backend enum, got {backend_type}") diff --git a/src/ggml-hsa/kernels/mul_mat.py b/src/ggml-hsa/kernels/mul_mat.py new file mode 100644 index 0000000000..5921181b06 --- /dev/null +++ b/src/ggml-hsa/kernels/mul_mat.py @@ -0,0 +1,39 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 AMD Inc. + +""" +Top-level entry point for the GGML matrix multiplication operation (GGML_OP_MUL_MAT). 
def ggml_op_mul_mat(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_OP_MUL_MAT.

    Always selects the IRON backend with `gemm` as the IR generator; the
    arguments are stored in the spec unchanged.

    Parameters:
        arch (str): Target architecture (e.g., "aie2", "aie2p").
        input_tensors (list): List of two input tensors (A and B).
        output_tensor (TensorDesc): Output tensor (C).
        op_params (bytearray): Operation-specific parameters.

    Returns:
        KernelSpec for the MUL_MAT operation.
    """
    spec_fields = {
        "backend": Backend.IRON,
        "op_name": "GGML_OP_MUL_MAT",
        "arch": arch,
        "input_tensors": input_tensors,
        "output_tensor": output_tensor,
        "op_params": op_params,
        "function": gemm,
    }
    return KernelSpec(**spec_fields)
def ggml_op_soft_max(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_OP_SOFT_MAX.

    Always selects the IRON backend with `softmax` as the IR generator; the
    arguments are stored in the spec unchanged.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of 1-3 input tensors:
            - input_tensors[0]: Input tensor (required)
            - input_tensors[1]: Mask tensor (optional)
            - input_tensors[2]: Sink tensor (optional)
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters (scale, max_bias).

    Returns:
        KernelSpec for the SOFT_MAX operation.
    """
    spec_fields = {
        "backend": Backend.IRON,
        "op_name": "GGML_OP_SOFT_MAX",
        "arch": arch,
        "input_tensors": input_tensors,
        "output_tensor": output_tensor,
        "op_params": op_params,
        "function": softmax,
    }
    return KernelSpec(**spec_fields)
+ +""" +Tensor descriptor for GGML HSA kernel operations. + +This module provides the TensorDesc dataclass used to describe tensors passed +to kernels. It captures the essential properties needed for kernel +compilation: data type, shape, stride, and contiguity information. + +The tensor dimensions follow GGML conventions where dimensions are ordered +from innermost to outermost (reverse of PyTorch). +""" + +from dataclasses import dataclass +import numpy as np + +from aie.iron import str_to_dtype + +# Mapping for dtypes not natively supported by IRON but still valid GGML types. +# These tensors can still be described, but kernels need to have special handling for them. +_FALLBACK_DTYPE_MAP = { + "i64": np.int64, + "u64": np.uint64, + "f64": np.float64, +} + + +@dataclass(frozen=True) +class TensorDesc: + """ + ggml_tensor description. + + Attributes: + dtype: Data type of the tensor. + shape (tuple): Shape of the tensor as a tuple of integers. Dimensions are from + innermost to outermost (reverse of PyTorch). + stride (tuple): Stride of the tensor as a tuple of integers, or None if not + specified. Dimensions are from innermost to outermost (reverse of PyTorch). + contiguous (bool): Indicates if the tensor is contiguous in memory. 
+ """ + + dtype: np.dtype | str + shape: tuple[int, int, int, int] + stride: tuple[int, int, int, int] | None = None + contiguous: bool = True + + def __post_init__(self): + # convert dtype to np.dtype if it's a string + if isinstance(self.dtype, str): + # First try AIE-supported dtypes, then fall back to numpy for others + try: + object.__setattr__(self, "dtype", np.dtype(str_to_dtype(self.dtype))) + except ValueError: + # dtype not supported by AIE - use numpy dtype for fallback + if self.dtype in _FALLBACK_DTYPE_MAP: + object.__setattr__( + self, "dtype", np.dtype(_FALLBACK_DTYPE_MAP[self.dtype]) + ) + else: + raise + + # compute stride if not provided as if the tensor is contiguous + if self.stride is None: + stride = [0, 0, 0, 0] + stride[0] = self.dtype.itemsize + stride[1] = stride[0] * self.shape[0] + for i in range(2, len(self.shape)): + stride[i] = stride[i - 1] * self.shape[i - 1] + object.__setattr__(self, "stride", tuple(stride)) + + @property + def size(self): + """ + Returns the number of elements in the tensor. + + Returns: + int: The total number of elements in the tensor. + """ + return int(np.prod(self.shape)) + + def numel(self): + """ + Returns the number of elements in the tensor. + + Returns: + int: The total number of elements in the tensor. + """ + return self.size + + +def ggml_tensor_to_tensordesc( + dtype: str, + ne: tuple[int, int, int, int], + nb: tuple[int, int, int, int], + contiguous: bool, +) -> TensorDesc: + """ + Creates a TensorDesc from the ggml_tensor parameters. + + Parameters: + dtype: Tensor data type. + ne (tuple[int, int, int, int]): Number of elements in each dimension. Dimensions + are from innermost to outermost (reverse of PyTorch). + nb (tuple[int, int, int, int]): Tensor stride in bytes for each dimension. + Dimensions are from innermost to outermost (reverse of PyTorch). + contiguous (bool): Indicates if the tensor is contiguous in memory. + + Returns: + TensorDesc: A new TensorDesc instance. 
def _iron_unary_kernel(
    op_name: str,
    arch: str,
    input_tensors: list,
    output_tensor,
    op_params: bytearray,
):
    """
    Wrapper for IRON unary operations matching the KernelFunction protocol.

    Parameters:
        op_name (str): Name of the unary operation.
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters (unused for unary ops).

    Returns:
        The result of `unary_op` for the given operation.
    """
    # op_params is accepted only to keep the common kernel signature; it is not
    # forwarded because `unary_op` takes no operation parameters.
    return unary_op(
        arch=arch,
        input_tensors=input_tensors,
        output_tensor=output_tensor,
        op_name=op_name,
    )


def _make_unary_kernel_spec(
    arch: str,
    input_tensors: list,
    output_tensor,
    op_params: bytearray,
    op_name: str,
) -> KernelSpec:
    """
    Create a KernelSpec for a unary operation.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.
        op_name (str): Name of the unary operation.

    Returns:
        KernelSpec configured for IRON backend whose `function` accepts
        (arch, input_tensors, output_tensor, op_params) either positionally
        or by keyword.
    """
    # Bind op_name positionally. It is the first positional parameter of
    # _iron_unary_kernel, so binding it by keyword would make any positional
    # call of the resulting callable fail with "multiple values for op_name".
    kernel_function = partial(_iron_unary_kernel, op_name)

    return KernelSpec(
        backend=Backend.IRON,
        op_name=op_name,
        arch=arch,
        input_tensors=input_tensors,
        output_tensor=output_tensor,
        op_params=op_params,
        function=kernel_function,
    )
def ggml_op_cos(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_OP_COS.

    No kernel is wired up for this operation; every call raises.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the COS operation (currently never returned).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()


def ggml_unary_op_abs(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_ABS.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the ABS operation.
    """
    return _make_unary_kernel_spec(
        arch=arch,
        input_tensors=input_tensors,
        output_tensor=output_tensor,
        op_params=op_params,
        op_name="GGML_UNARY_OP_ABS",
    )


def ggml_unary_op_sgn(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_SGN.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the SGN operation.
    """
    return _make_unary_kernel_spec(
        arch=arch,
        input_tensors=input_tensors,
        output_tensor=output_tensor,
        op_params=op_params,
        op_name="GGML_UNARY_OP_SGN",
    )
def ggml_unary_op_step(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_STEP.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the STEP operation.
    """
    return _make_unary_kernel_spec(
        arch=arch,
        input_tensors=input_tensors,
        output_tensor=output_tensor,
        op_params=op_params,
        op_name="GGML_UNARY_OP_STEP",
    )


def ggml_unary_op_tanh(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_TANH.

    No kernel is wired up for this operation; every call raises.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the TANH operation (currently never returned).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()


def ggml_unary_op_elu(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_ELU.

    No kernel is wired up for this operation; every call raises.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the ELU operation (currently never returned).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
def ggml_unary_op_sigmoid(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_SIGMOID.

    No kernel is wired up for this operation; every call raises.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the SIGMOID operation (currently never returned).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()


def ggml_unary_op_gelu(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_GELU.

    No kernel is wired up for this operation; every call raises.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the GELU operation (currently never returned).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()


def ggml_unary_op_gelu_quick(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_GELU_QUICK.

    No kernel is wired up for this operation; every call raises.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the GELU_QUICK operation (currently never returned).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
def ggml_unary_op_hardswish(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_HARDSWISH.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the HARDSWISH operation.
    """
    return _make_unary_kernel_spec(
        arch=arch,
        input_tensors=input_tensors,
        output_tensor=output_tensor,
        op_params=op_params,
        op_name="GGML_UNARY_OP_HARDSWISH",
    )


def ggml_unary_op_hardsigmoid(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_HARDSIGMOID.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the HARDSIGMOID operation.
    """
    return _make_unary_kernel_spec(
        arch=arch,
        input_tensors=input_tensors,
        output_tensor=output_tensor,
        op_params=op_params,
        op_name="GGML_UNARY_OP_HARDSIGMOID",
    )


def ggml_unary_op_exp(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_EXP.

    No kernel is wired up for this operation; every call raises.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the EXP operation (currently never returned).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
def ggml_unary_op_xielu(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_XIELU.

    No kernel is wired up for this operation; every call raises.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the XIELU operation (currently never returned).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()


def ggml_unary_op_floor(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_FLOOR.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the FLOOR operation.
    """
    return _make_unary_kernel_spec(
        arch=arch,
        input_tensors=input_tensors,
        output_tensor=output_tensor,
        op_params=op_params,
        op_name="GGML_UNARY_OP_FLOOR",
    )


def ggml_unary_op_ceil(
    arch: str, input_tensors: list, output_tensor, op_params: bytearray
) -> KernelSpec:
    """
    Dispatch entry for GGML_UNARY_OP_CEIL.

    Parameters:
        arch (str): Target architecture.
        input_tensors (list): List of one input tensor.
        output_tensor (TensorDesc): Output tensor.
        op_params (bytearray): Operation parameters.

    Returns:
        KernelSpec for the CEIL operation.
    """
    return _make_unary_kernel_spec(
        arch=arch,
        input_tensors=input_tensors,
        output_tensor=output_tensor,
        op_params=op_params,
        op_name="GGML_UNARY_OP_CEIL",
    )
+ """ + return _make_unary_kernel_spec( + arch, input_tensors, output_tensor, op_params, "GGML_UNARY_OP_ROUND" + ) + + +def ggml_unary_op_trunc( + arch: str, input_tensors: list, output_tensor, op_params: bytearray +) -> KernelSpec: + """ + GGML_UNARY_OP_TRUNC implementation. + + Parameters: + arch (str): Target architecture. + input_tensors (list): List of one input tensor. + output_tensor (TensorDesc): Output tensor. + op_params (bytearray): Operation parameters. + + Returns: + KernelSpec for the TRUNC operation. + """ + return _make_unary_kernel_spec( + arch, input_tensors, output_tensor, op_params, "GGML_UNARY_OP_TRUNC" + ) diff --git a/src/ggml-hsa/requirements.txt b/src/ggml-hsa/requirements.txt new file mode 100644 index 0000000000..0f8a84fba2 --- /dev/null +++ b/src/ggml-hsa/requirements.txt @@ -0,0 +1,12 @@ +# Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All Rights Reserved. + +# The MLIR-AIE and LLVM-AIE (Peano) packages are not on the official PyPI, so +# we add them as additional indices here. +--extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 +--extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly +--extra-index-url https://pypi.org/simple + +mlir_aie==1.2.1 +llvm-aie + +black diff --git a/src/ggml-hsa/type-traits.hpp b/src/ggml-hsa/type-traits.hpp new file mode 100644 index 0000000000..7e8a4abbde --- /dev/null +++ b/src/ggml-hsa/type-traits.hpp @@ -0,0 +1,60 @@ +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + +#pragma once + +#include + +#include "ggml-impl.h" +#include "ggml.h" + +/** + * @brief @ref ggml_type traits. 
+ */ +template +struct ggml_hsa_type_traits; + +template <> +struct ggml_hsa_type_traits { + static constexpr ggml_type ggml_type_v = GGML_TYPE_F32; + using type = float; + static constexpr bool is_fundamental = true; +}; + +template <> +struct ggml_hsa_type_traits { + static constexpr ggml_type ggml_type_v = GGML_TYPE_F16; + using type = ggml_fp16_t; + static constexpr bool is_fundamental = false; + static constexpr auto to_fp32 = [](ggml_fp16_t v) -> float { return GGML_FP16_TO_FP32(v); }; + static constexpr auto from_fp32 = [](float v) -> ggml_fp16_t { return GGML_FP32_TO_FP16(v); }; +}; + +template <> +struct ggml_hsa_type_traits { + static constexpr ggml_type ggml_type_v = GGML_TYPE_I8; + using type = std::int8_t; + static constexpr bool is_fundamental = true; +}; + +template <> +struct ggml_hsa_type_traits { + static constexpr ggml_type ggml_type_v = GGML_TYPE_I16; + using type = std::int16_t; + static constexpr bool is_fundamental = true; +}; + +template <> +struct ggml_hsa_type_traits { + static constexpr ggml_type ggml_type_v = GGML_TYPE_I32; + using type = std::int32_t; + static constexpr bool is_fundamental = true; +}; + +template <> +struct ggml_hsa_type_traits { + static constexpr ggml_type ggml_type_v = GGML_TYPE_BF16; + using type = ggml_bf16_t; + static constexpr bool is_fundamental = false; + static constexpr auto to_fp32 = [](ggml_bf16_t v) -> float { return GGML_BF16_TO_FP32(v); }; + static constexpr auto from_fp32 = [](float v) -> ggml_bf16_t { return GGML_FP32_TO_BF16(v); }; +}; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7c28e344c5..04914c45b6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -353,4 +353,11 @@ if (NOT GGML_BACKEND_DL) target_link_libraries(${TEST_TARGET} PRIVATE ggml) add_test(NAME ${TEST_TARGET} COMMAND $) set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + + # + # ggml-hsa tests + if (GGML_HSA) + add_subdirectory(ggml-hsa) + endif() + endif() 
diff --git a/tests/ggml-hsa/CMakeLists.txt b/tests/ggml-hsa/CMakeLists.txt new file mode 100644 index 0000000000..f27e7165c8 --- /dev/null +++ b/tests/ggml-hsa/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# test-mul-mat-hsa + +set(TEST_TARGET test-mul-mat-hsa) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-vector-hsa + +set(TEST_TARGET test-vector-hsa) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") \ No newline at end of file diff --git a/tests/ggml-hsa/test-mul-mat-hsa.cpp b/tests/ggml-hsa/test-mul-mat-hsa.cpp new file mode 100644 index 0000000000..bbf1189c2c --- /dev/null +++ b/tests/ggml-hsa/test-mul-mat-hsa.cpp @@ -0,0 +1,376 @@ +#include "ggml.h" +#include "ggml-cpu.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_HSA +#include "ggml-hsa.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef GGML_USE_HSA +#define USE_NPU 1 +#endif + +template +struct fundamental_to_ggml_type; + +template<> +struct fundamental_to_ggml_type { + inline static const auto ggml_type = GGML_TYPE_I16; +}; + +template<> +struct fundamental_to_ggml_type { + inline static const auto ggml_type = GGML_TYPE_F32; +}; + +static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[0] == t1->ne[0]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); +} + +struct ggml_tensor * 
ggml_mul_mat_i16( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_mul_mat(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); + + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I16, 4, ne); + + result->op = GGML_OP_MUL_MAT; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct test_model { + ggml_tensor * a; + ggml_tensor * b; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer; + ggml_context * ctx; + std::vector buf; +}; + +void print_ggml_tensor(const ggml_tensor * tensor) { + printf("%s (%s):\n\tne=[%3ld, %3ld, %3ld, %3ld], nb=[%3ld, %3ld, %3ld, %3ld], type=%s\n", + tensor->name, + ggml_op_desc(tensor), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3], + ggml_type_name(tensor->type)); +} + +void print_ggml_tensor_data(const ggml_tensor * tensor) { + for (int64_t ne03 = 0; ne03 < tensor->ne[3]; ++ne03) { + for (int64_t ne02 = 0; ne02 < tensor->ne[2]; ++ne02) { + for (int64_t ne01 = 0; ne01 < tensor->ne[1]; ++ne01) { + for (int64_t ne00 = 0; ne00 < tensor->ne[0]; ++ne00) { + const size_t idx = + (ne00 * tensor->nb[0]) + + (ne01 * tensor->nb[1]) + + (ne02 * tensor->nb[2]) + + (ne03 * tensor->nb[3]); + auto p = static_cast(tensor->data) + idx; + switch (tensor->type) { + case GGML_TYPE_I16: { + const auto value = *reinterpret_cast(p); + printf("%d ", value); + break; + } + case GGML_TYPE_F32: { + const auto value = *reinterpret_cast(p); + printf("%f ", value); + break; + } + default: + fprintf(stderr, "Unsupported type %s\n", ggml_type_name(tensor->type)); + return; + } + } + printf("\n"); + } + printf("\n"); + } + printf("\n"); + } +} + +template +void load_model(test_model & model, T * a, T * b, int M, int N, int K, bool use_accelerator) { + const auto ggml_type = fundamental_to_ggml_type::ggml_type; + + size_t buffer_size = 
0; + buffer_size += (M * N) * ggml_type_size(ggml_type); // tensor a + buffer_size += (N * K) * ggml_type_size(ggml_type); // tensor b + buffer_size += 1024; // overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: backend buffer size = %d bytes\n", __func__, (int) buffer_size); + + int num_tensors = 2; + ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + // initialize the backend +#ifdef GGML_USE_HSA + if (use_accelerator && !model.backend) { + fprintf(stderr, "%s: using HSA backend\n", __func__); + model.backend = ggml_backend_hsa_init(0); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_hsa_init() failed\n", __func__); + } + } +#endif + +#ifdef GGML_USE_CUDA + if (use_accelerator && !model.backend) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(0); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + + if(!model.backend) { + // fallback to CPU backend + model.backend = ggml_backend_cpu_init(); + } + + model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size); + + // create context + model.ctx = ggml_init(params); + + // create tensors + model.a = ggml_new_tensor_2d(model.ctx, ggml_type, K, M); + model.b = ggml_new_tensor_2d(model.ctx, ggml_type, N, K); + + ggml_set_name(model.a, "a"); + ggml_set_name(model.b, "b"); + + // create a allocator + ggml_tallocr alloc = ggml_tallocr_new(model.buffer); + + // alloc memory + ggml_tallocr_alloc(&alloc, model.a); + ggml_tallocr_alloc(&alloc, model.b); + + // copy data directly to device + ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a)); + ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); +} + + +ggml_cgraph * build_graph(test_model& model) { + const size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + 
ggml_graph_overhead(); + model.buf.resize(buf_size); + + ggml_init_params params0 = { + /*.mem_size =*/ model.buf.size(), + /*.mem_buffer =*/ model.buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() + }; + + // create a context to build the graph + ggml_context * ctx = ggml_init(params0); + + ggml_cgraph * gf = ggml_new_graph(ctx); + + // zT = x @ yT + ggml_tensor * b_transposed_vw = ggml_transpose(ctx, model.b); + ggml_tensor * b_transposed = ggml_cont(ctx, b_transposed_vw); + + +#if USE_NPU + ggml_tensor * c_transposed = ggml_mul_mat_i16(ctx, model.a, b_transposed); +#else + ggml_tensor * c_transposed = ggml_mul_mat(ctx, model.a, b_transposed); +#endif + ggml_set_name(c_transposed, "c"); + + ggml_tensor * c_vw = ggml_transpose(ctx, c_transposed); + ggml_tensor * c = ggml_cont(ctx, c_vw); + + print_ggml_tensor(model.a); + print_ggml_tensor(model.b); + print_ggml_tensor(b_transposed_vw); + print_ggml_tensor(b_transposed); + print_ggml_tensor(c_transposed); + print_ggml_tensor(c_vw); + print_ggml_tensor(c); + + // z = (zT)T + ggml_build_forward_expand(gf, c); + + // delete the context used to build the graph + ggml_free(ctx); + + return gf; +} + +ggml_tensor* compute(test_model & model, ggml_gallocr_t allocr) { + ggml_cgraph * gf = build_graph(model); + + // allocate tensors + if (!ggml_gallocr_alloc_graph(allocr, gf)) { + fprintf(stderr, "%s: ggml_gallocr_alloc_graph() failed\n", __func__); + std::exit(-1); + } + + ggml_graph_print(gf); + + size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f); + + int n_threads = 1; + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } + + if (ggml_backend_graph_compute(model.backend, gf) != GGML_STATUS_SUCCESS) { + fprintf(stderr, "%s: ggml_backend_graph_compute() failed\n", __func__); + std::exit(-1); + } + + // in this case, 
the output tensor is the last one in the graph + return ggml_graph_node(gf, -1); +} + +template +void print_matrix(const T * matrix, int rows, int cols) { + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + std::cout << matrix[i * cols + j] << ' '; + } + std::cout << '\n'; + } +} + +template +void make_eye(T * matrix, int rows, int cols) { + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + matrix[i * cols + j] = (i==j); + } + } +} + +template +void gemm(int M, int N, int K, + const U * A, + const U * B, + T * C) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + for (int k = 0; k < K; ++k) { + C[m * N + n] += A[m * K + k] * B[k * N + n]; + } + } + } +} + +int main() +{ +#if USE_NPU + using value_type = int16_t; + const bool use_npu = true; +#else + using value_type = float; + const bool use_npu = false; +#endif + const bool dump_matrices = false; + + const int64_t M = 256, N = 128, K = 64; + + ggml_time_init(); + + // matrix A + value_type matrixA[M * K] = {}; + std::iota(matrixA, std::next(matrixA, M * K), 0); + + // matrix B + value_type matrixB[K * N] = {}; + make_eye(matrixB, K, N); + + matrixB[0] = 10; + + // C = A * B + value_type matrixC_naive[M * N] = {}; + gemm(M, N, K, matrixA, matrixB, matrixC_naive); + + printf("Matrix A: [%ld, %ld]\n", M, K); + if (dump_matrices) { + print_matrix(matrixA, M, K); + } + printf("Matrix B: [%ld, %ld]\n", K, N); + if (dump_matrices) { + print_matrix(matrixB, K, N); + } + + printf("Matrix C: [%ld, %ld]\n", M, N); + if (dump_matrices) { + print_matrix(matrixC_naive, M, N); + } + + test_model model; + load_model(model, matrixA, matrixB, M, N, K, use_npu); + + ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + + ggml_tensor * result = compute(model, allocr); + + if (dump_matrices) { + print_ggml_tensor_data(model.a); + print_ggml_tensor_data(model.b); + print_ggml_tensor_data(result); + } + + std::vector 
matrixC(ggml_nelements(result)); + ggml_backend_tensor_get(result, matrixC.data(), 0, ggml_nbytes(result)); + + printf("Performing ggml_mul_mat test:\n"); + + bool passed = true; + for(int i = 0; i < M * N; i++) { + if(matrixC_naive[i] != matrixC[i]) { + passed = false; + break; + } + } + + printf("ggml_mul_mat (%d): %s\n", (int) ggml_nelements(result), passed && (ggml_nelements(result) == M * N) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m"); + + // free memory + ggml_free(model.ctx); + + ggml_backend_buffer_free(model.buffer); + ggml_backend_free(model.backend); + ggml_gallocr_free(allocr); + return 0; +} diff --git a/tests/ggml-hsa/test-vector-hsa.cpp b/tests/ggml-hsa/test-vector-hsa.cpp new file mode 100644 index 0000000000..7a2eb9ed86 --- /dev/null +++ b/tests/ggml-hsa/test-vector-hsa.cpp @@ -0,0 +1,153 @@ +// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved. + +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_HSA +#include "ggml-hsa.h" +#endif + +template +auto create_data(std::size_t N, T value) { + std::vector v(N); + std::iota(std::begin(v), std::end(v), value); + return v; +} + +template +std::ostream& operator<<(std::ostream& os, const std::vector& v) { + os << "["; + for (const auto & t : v) { + os << ' ' << t; + } + return os << " ]"; +} + +int main(int argc, char* argv[]) { + std::size_t N = 32; + const char* op = "+"; + if (argc > 1) { + N = std::atoi(argv[1]); + } + if (argc > 2) { + op = argv[2]; + } + + std::cout << "Creating arrays of " << N + << " elements and doing A " << op << " B.\n"; + + // create data + using value_type = std::int32_t; + constexpr auto ggml_type = GGML_TYPE_I32; + const std::vector A = create_data(N, 10); + const std::vector B = create_data(N, 2); + + // initialize GGML backend and allocators + ggml_backend_t backend = {}; + +#ifdef GGML_USE_HSA + std::cout << "Using HSA 
backend\n"; + backend = ggml_backend_hsa_init(0); +#endif + +#ifdef GGML_USE_CUDA + if (!backend) { + std::cout << "Using CUDA backend\n"; + backend = ggml_backend_cuda_init(0); // init device 0 + } +#endif + + if (backend == nullptr) { + std::cerr << "Could not create backend\n"; + return EXIT_FAILURE; + } + const std::size_t alignment = ggml_backend_get_alignment(backend); + const std::size_t tensor_count = 3; + const std::size_t buffer_size = tensor_count * GGML_PAD((N * sizeof(value_type)), alignment); + ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, buffer_size); + ggml_tallocr alloc = ggml_tallocr_new(buffer); + ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + + // allocate tensors on HSA memory + const std::size_t ctx_size = + tensor_count * ggml_tensor_overhead() + ggml_graph_overhead_custom(tensor_count, false); + ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true }; + ggml_context * ctx = ggml_init(params); + ggml_tensor * tensor_a = ggml_new_tensor_1d(ctx, ggml_type, N); + ggml_tensor * tensor_b = ggml_new_tensor_1d(ctx, ggml_type, N); + if ((ggml_tallocr_alloc(&alloc, tensor_a) != GGML_STATUS_SUCCESS) || + (ggml_tallocr_alloc(&alloc, tensor_b) != GGML_STATUS_SUCCESS)) { + std::cerr << "Could not allocate tensor\n"; + return EXIT_FAILURE; + } + + // create graph + ggml_cgraph * gf = ggml_new_graph_custom(ctx, tensor_count, /*grads*/ false); + + // add operation + ggml_tensor * tensor_result = nullptr; + if (std::strcmp(op, "+") == 0) { + tensor_result = ggml_add(ctx, tensor_a, tensor_b); + } + else if (std::strcmp(op, "-") == 0) { + tensor_result = ggml_sub(ctx, tensor_a, tensor_b); + } + else if (std::strcmp(op, "*") == 0) { + tensor_result = ggml_mul(ctx, tensor_a, tensor_b); + } + else if (std::strcmp(op, "/") == 0) { + tensor_result = ggml_div(ctx, tensor_a, tensor_b); + } + else { + std::cerr << "Unknown operation \"" << op << 
"\".\n"; + return EXIT_FAILURE; + } + + if (!ggml_backend_supports_op(backend, tensor_result)) { + std::cerr << "Operation not supported\n"; + return EXIT_FAILURE; + } + ggml_build_forward_expand(gf, tensor_result); + if (!ggml_gallocr_alloc_graph(galloc, gf)) { + std::cerr << "Could not allocate graph\n"; + return EXIT_FAILURE; + } + + // copy data in (can be avoided if data created directly in tensors) + ggml_backend_tensor_set(tensor_a, std::data(A), 0, ggml_nbytes(tensor_a)); + ggml_backend_tensor_set(tensor_b, std::data(B), 0, ggml_nbytes(tensor_b)); + + // execute + if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) { + std::cerr << "Execution failed\n"; + return EXIT_FAILURE; + } + + // copy data out and print + std::vector result(N); + ggml_backend_tensor_get(tensor_result, std::data(result), 0, ggml_nbytes(tensor_result)); + std::cout << "A = " << A << '\n' + << "B = " << B << '\n' + << "A " << op << " B = " << result << '\n'; + + // free resources + ggml_free(ctx); + ggml_gallocr_free(galloc); + ggml_backend_free(backend); + + return EXIT_SUCCESS; +}