-
Notifications
You must be signed in to change notification settings - Fork 26
Add MHC (Multi-Head Computation) kernels for Ascend A5 #104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| cmake_minimum_required(VERSION 3.16) | ||
| project(mhc_kernels) | ||
|
|
||
| set(CMAKE_CXX_STANDARD 17) | ||
| set(CMAKE_CXX_STANDARD_REQUIRED ON) | ||
| set(CMAKE_POSITION_INDEPENDENT_CODE ON) | ||
| set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) | ||
| set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) | ||
|
|
||
| if(NOT DEFINED ENV{ASCEND_HOME_PATH}) | ||
| message(FATAL_ERROR "Cannot find ASCEND_HOME_PATH, please run set_env.sh.") | ||
| else() | ||
| set(ASCEND_HOME_PATH $ENV{ASCEND_HOME_PATH}) | ||
| endif() | ||
|
|
||
| set(ASCEND_DRIVER_PATH /usr/local/Ascend/driver) | ||
| set(CMAKE_COMPILER bisheng) | ||
| set(CMAKE_C_COMPILER ${CMAKE_COMPILER}) | ||
| set(CMAKE_CXX_COMPILER ${CMAKE_COMPILER}) | ||
|
|
||
| add_compile_options(-D_FORTIFY_SOURCE=2 -O2 -std=c++17 -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong) | ||
| add_link_options(-s -Wl,-z,relro -Wl,-z,now) | ||
|
|
||
| set(CMAKE_CCE_COMPILE_OPTIONS | ||
| -xcce -Xhost-start -Xhost-end | ||
| "SHELL:-mllvm -cce-aicore-stack-size=0x8000" | ||
| "SHELL:-mllvm -cce-aicore-function-stack-size=0x8000" | ||
| "SHELL:-mllvm -cce-aicore-record-overflow=true" | ||
| "SHELL:-mllvm -cce-aicore-addr-transform" | ||
| "SHELL:-mllvm -cce-aicore-dcci-insert-for-scalar=false" | ||
| ) | ||
|
|
||
| set(CMAKE_CPP_COMPILE_OPTIONS -xc++ "SHELL:-include stdint.h" "SHELL:-include stddef.h") | ||
|
|
||
| include_directories( | ||
| ${PROJECT_SOURCE_DIR}/../../../../include/ | ||
| ${ASCEND_HOME_PATH}/include | ||
| ${ASCEND_HOME_PATH}/pkg_inc/runtime | ||
| ${ASCEND_DRIVER_PATH}/kernel/inc | ||
| ) | ||
|
|
||
| # Build all 15 kernel .so files | ||
| set(MHC_KERNELS | ||
| expand_to_mhc_fwd expand_to_mhc_bwd | ||
| head_compute_mix_fwd head_compute_mix_bwd | ||
| pre_split_mixes_fwd pre_split_mixes_bwd | ||
| pre_apply_mix_fwd pre_apply_mix_bwd | ||
| pre_norm_fn_fwd | ||
| fn_normw_merge_fwd fn_normw_merge_bwd | ||
| post_fwd post_bwd | ||
| sinkhorn_normalize_fwd sinkhorn_normalize_bwd | ||
| ) | ||
|
|
||
| foreach(KERNEL ${MHC_KERNELS}) | ||
| add_library(${KERNEL}_kernel SHARED ${KERNEL}.cpp) | ||
| target_compile_options(${KERNEL}_kernel PRIVATE ${CMAKE_CCE_COMPILE_OPTIONS} --npu-arch=dav-3510 -DMEMORY_BASE) | ||
| endforeach() | ||
|
Comment on lines
+54
to
+57
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The kernels are being compiled into individual shared libraries here, but |
||
|
|
||
| # Build caller .so (wraps expand_to_mhc_fwd for the test) | ||
| add_library(mhc_caller SHARED caller.cpp) | ||
| target_compile_options(mhc_caller PRIVATE ${CMAKE_CCE_COMPILE_OPTIONS} --npu-arch=dav-3510 -DMEMORY_BASE) | ||
|
|
||
| # Build host test executable | ||
| add_executable(mhc_test main.cpp) | ||
| target_compile_options(mhc_test PRIVATE ${CMAKE_CPP_COMPILE_OPTIONS}) | ||
| target_include_directories(mhc_test PRIVATE ${PROJECT_SOURCE_DIR}/../../../../tests/common) | ||
| target_link_directories(mhc_test PUBLIC ${ASCEND_HOME_PATH}/lib64) | ||
| target_link_libraries(mhc_test PRIVATE mhc_caller ascendcl stdc++ m pthread) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,49 @@ | ||
| # MHC (Multi-Head Computation) Kernels | ||
|
|
||
| PTO-ISA kernels for the MHC architecture from [DeepSeek TileKernels](https://github.com/deepseek-ai/TileKernels). | ||
|
|
||
| ## Overview | ||
|
|
||
| MHC extends the standard Transformer residual connection from a single stream to multiple parallel heads with learnable mixing: | ||
|
|
||
| ``` | ||
| x[m] = Σ_in comb[in, m] * residual[in] + post_mix[m] * up(F(down(x))) | ||
| ``` | ||
|
|
||
| This directory contains 15 kernels (7 forward + 8 backward) that implement the full MHC forward and backward pass: | ||
|
|
||
| | Kernel | Description | Data types | | ||
| |--------|-------------|------------| | ||
| | `expand_to_mhc_fwd/bwd` | Broadcast x to multi-head / reduce gradient | bf16 | | ||
| | `pre_apply_mix_fwd/bwd` | Weighted sum across heads | bf16 + f32 | | ||
| | `pre_norm_fn_fwd` | RMSNorm + FN weight projection | bf16 → f32 | | ||
| | `fn_normw_merge_fwd/bwd` | Fuse norm weight with FN weight | f32 | | ||
| | `head_compute_mix_fwd/bwd` | Sigmoid activation for head mix | f32 | | ||
| | `pre_split_mixes_fwd/bwd` | Split raw params into pre/post/comb | f32 | | ||
| | `sinkhorn_normalize_fwd/bwd` | Sinkhorn iteration for doubly-stochastic comb matrix | f32 | | ||
| | `post_fwd/bwd` | Final multi-head residual combination | bf16 + f32 | | ||
|
|
||
| ## Generation | ||
|
|
||
| These kernels were generated from [PTO-DSL](https://github.com/huawei-csl/pto-dsl) Python source via the following pipeline: | ||
|
|
||
| ``` | ||
| PTO-DSL Python → MLIR IR (.pto) → ptoas assembler → PTO-ISA C++ (.cpp) | ||
| ``` | ||
|
|
||
| Source: [PTO-Gym PR #7](https://github.com/PTO-ISA/PTO-Gym/pull/7) (`tilekernels_ptodsl/mhc/`) | ||
|
|
||
| Two post-processing steps were applied for Ascend A5 (dav-3510) compatibility: | ||
| 1. Tile shapes padded to 32-byte alignment (via DSL `_meta_data` modification) | ||
| 2. `pipe_barrier(PIPE_V)` replaced with `pipe_barrier(PIPE_ALL)` | ||
|
|
||
| ## Build & Run | ||
|
|
||
| ```bash | ||
| source /usr/local/Ascend/ascend-toolkit/latest/set_env.sh | ||
| bash run.sh | ||
| ``` | ||
|
|
||
| ## Parameters | ||
|
|
||
| All kernels use `mhc_mult=4` and `hidden_size=1280` (matching DeepSeek-V3 configuration). |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| /** | ||
| Copyright (c) 2026 Huawei Technologies Co., Ltd. | ||
| CANN Open Software License Agreement Version 2.0 | ||
|
|
||
| Caller wrappers for MHC kernels. Each wrapper exports a C function that | ||
| launches the corresponding __global__ kernel with the <<<>>> syntax. | ||
| */ | ||
| #include "expand_to_mhc_fwd.cpp" | ||
| #include <cstdint> | ||
|
|
||
| extern "C" void call_expand_fwd(uint32_t blockDim, void *stream, | ||
| uint8_t *x, uint8_t *out, int32_t tokens, int32_t hidden) { | ||
| tilekernels_mhc_expand_to_mhc_fwd_m4<<<blockDim, nullptr, stream>>>( | ||
| (bfloat16_t *)x, (bfloat16_t *)out, tokens, hidden); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,164 @@ | ||
| #include "pto/pto-inst.hpp" | ||
| using namespace pto; | ||
|
|
||
| enum class PTOAutoSyncTailMode : int { | ||
| kBarrierAll = 0, | ||
| kSetWaitMte3ToSEvent0 = 1, | ||
| }; | ||
|
|
||
| static AICORE inline void ptoas_auto_sync_tail( | ||
| PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { | ||
| switch (mode) { | ||
| case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: | ||
| set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); | ||
| wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); | ||
| break; | ||
| case PTOAutoSyncTailMode::kBarrierAll: | ||
| default: | ||
| pipe_barrier(PIPE_ALL); | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| __global__ AICORE void tilekernels_mhc_expand_to_mhc_bwd_m4(__gm__ bfloat16_t* v1, __gm__ bfloat16_t* v2, int32_t v3, int32_t v4) { | ||
| RoundMode v5 = RoundMode::CAST_RINT; | ||
| unsigned v6 = 0; | ||
| const int32_t v7 = 4; | ||
| const int32_t v8 = 1024; | ||
| const int32_t v9 = 1; | ||
| const int32_t v10 = 0; | ||
| const int32_t v11 = 2; | ||
| const int32_t v12 = 3; | ||
| const int64_t v13 = 0; | ||
| const int64_t v14 = 16384; | ||
| const int64_t v15 = 49152; | ||
| const int64_t v16 = 81920; | ||
| using T = float; | ||
| size_t v17 = (size_t) v9; | ||
| int32_t v18 = (int32_t) ((uint32_t) v3 * (uint32_t) v7); | ||
|
|
||
| #if defined(__DAV_VEC__) | ||
| set_mask_norm(); | ||
| set_vector_mask(-1, -1); | ||
| int64_t v19 = get_block_idx(); | ||
| int64_t v20 = get_block_num(); | ||
| int32_t v21 = (int32_t) ((int64_t) v20); | ||
| int32_t v22 = v3 / v21; | ||
| int32_t v23 = v3 % v21 != v10 && v3 < v10 == v21 < v10 ? v22 + v9 : v22; | ||
| int32_t v24 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v19) * (uint32_t) v23); | ||
| int32_t v25 = (int32_t) ((uint32_t) v24 + (uint32_t) v23); | ||
| int32_t v26 = v4 / v8; | ||
| set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); | ||
| set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); | ||
| set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); | ||
| set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); | ||
| for (size_t v27 = (size_t) v24; v27 < ((size_t) ((uint32_t) v25 < (uint32_t) v3 ? v25 : v3)); v27 += v17) { | ||
| int32_t v28 = (int32_t) v27; | ||
| for (size_t v29 = (size_t) v10; v29 < ((size_t) (v4 % v8 != v10 && v4 < v10 == v8 < v10 ? v26 + v9 : v26)); v29 += v17) { | ||
| int32_t v30 = (int32_t) ((uint32_t) ((int32_t) v29) * (uint32_t) v8); | ||
| int32_t v31 = (int32_t) ((uint32_t) v4 - (uint32_t) v30); | ||
| int32_t v32 = (uint32_t) v31 < (uint32_t) v8 ? v31 : v8; | ||
| Tile<TileType::Vec, bfloat16_t, 8, 1024, BLayout::RowMajor, 8, 1024, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v33; | ||
| TASSIGN(v33, v13); | ||
| Tile<TileType::Vec, bfloat16_t, 8, 1024, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v34 = Tile<TileType::Vec, bfloat16_t, 8, 1024, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null>(v32); | ||
| __ubuf__ bfloat16_t* v35 = v33.data(); | ||
| uint64_t v36 = reinterpret_cast<uint64_t>(v35); | ||
| TASSIGN(v34, v36); | ||
| Tile<TileType::Vec, float, 8, 1024, BLayout::RowMajor, 8, 1024, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v37; | ||
| TASSIGN(v37, v14); | ||
| Tile<TileType::Vec, float, 8, 1024, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v38 = Tile<TileType::Vec, float, 8, 1024, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null>(v32); | ||
| __ubuf__ float* v39 = v37.data(); | ||
| uint64_t v40 = reinterpret_cast<uint64_t>(v39); | ||
| TASSIGN(v38, v40); | ||
| Tile<TileType::Vec, float, 8, 1024, BLayout::RowMajor, 8, 1024, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v41; | ||
| TASSIGN(v41, v15); | ||
| Tile<TileType::Vec, float, 8, 1024, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v42 = Tile<TileType::Vec, float, 8, 1024, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null>(v32); | ||
| __ubuf__ float* v43 = v41.data(); | ||
| uint64_t v44 = reinterpret_cast<uint64_t>(v43); | ||
| TASSIGN(v42, v44); | ||
| Tile<TileType::Vec, bfloat16_t, 8, 1024, BLayout::RowMajor, 8, 1024, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v45; | ||
| TASSIGN(v45, v16); | ||
| Tile<TileType::Vec, bfloat16_t, 8, 1024, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v46 = Tile<TileType::Vec, bfloat16_t, 8, 1024, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null>(v32); | ||
| __ubuf__ bfloat16_t* v47 = v45.data(); | ||
| uint64_t v48 = reinterpret_cast<uint64_t>(v47); | ||
| TASSIGN(v46, v48); | ||
| int32_t v49 = (int32_t) ((uint32_t) v28 * (uint32_t) v7); | ||
| unsigned v50 = (unsigned) v32; | ||
| unsigned v51 = (unsigned) v4; | ||
| pto::Shape<1, 1, 1, 1, -1> v52 = pto::Shape<1, 1, 1, 1, -1>(v32); | ||
| pto::Stride<-1, -1, -1, -1, 1> v53 = pto::Stride<-1, -1, -1, -1, 1>(v51, v51, v51, v51); | ||
| GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v54 = GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v49 * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v52, v53); | ||
| wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); | ||
| TLOAD(v34, v54); | ||
| set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); | ||
| wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); | ||
| TCVT(v38, v34, v5); | ||
| set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); | ||
| pipe_barrier(PIPE_ALL); | ||
| TMOV(v42, v38); | ||
| unsigned v55 = (unsigned) v32; | ||
| unsigned v56 = (unsigned) v4; | ||
| pto::Shape<1, 1, 1, 1, -1> v57 = pto::Shape<1, 1, 1, 1, -1>(v32); | ||
| pto::Stride<-1, -1, -1, -1, 1> v58 = pto::Stride<-1, -1, -1, -1, 1>(v56, v56, v56, v56); | ||
| GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v59 = GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) ((int32_t) (uint32_t) v49 + (uint32_t) v9) * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v57, v58); | ||
| wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); | ||
| TLOAD(v34, v59); | ||
| set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); | ||
| wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); | ||
| pipe_barrier(PIPE_ALL); | ||
| TCVT(v38, v34, v5); | ||
| set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); | ||
| pipe_barrier(PIPE_ALL); | ||
| TADD(v42, v42, v38); | ||
| unsigned v60 = (unsigned) v32; | ||
| unsigned v61 = (unsigned) v4; | ||
| pto::Shape<1, 1, 1, 1, -1> v62 = pto::Shape<1, 1, 1, 1, -1>(v32); | ||
| pto::Stride<-1, -1, -1, -1, 1> v63 = pto::Stride<-1, -1, -1, -1, 1>(v61, v61, v61, v61); | ||
| GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v64 = GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) ((int32_t) (uint32_t) v49 + (uint32_t) v11) * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v62, v63); | ||
| wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); | ||
| TLOAD(v34, v64); | ||
| set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); | ||
| wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); | ||
| pipe_barrier(PIPE_ALL); | ||
| TCVT(v38, v34, v5); | ||
| set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); | ||
| pipe_barrier(PIPE_ALL); | ||
| TADD(v42, v42, v38); | ||
| unsigned v65 = (unsigned) v32; | ||
| unsigned v66 = (unsigned) v4; | ||
| pto::Shape<1, 1, 1, 1, -1> v67 = pto::Shape<1, 1, 1, 1, -1>(v32); | ||
| pto::Stride<-1, -1, -1, -1, 1> v68 = pto::Stride<-1, -1, -1, -1, 1>(v66, v66, v66, v66); | ||
| GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v69 = GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) ((int32_t) (uint32_t) v49 + (uint32_t) v12) * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v67, v68); | ||
| wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); | ||
| TLOAD(v34, v69); | ||
| set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); | ||
| wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); | ||
| pipe_barrier(PIPE_ALL); | ||
| TCVT(v38, v34, v5); | ||
| set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); | ||
| pipe_barrier(PIPE_ALL); | ||
| TADD(v42, v42, v38); | ||
| pipe_barrier(PIPE_ALL); | ||
| wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); | ||
| TCVT(v46, v42, v5); | ||
| set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); | ||
| unsigned v70 = (unsigned) v32; | ||
| unsigned v71 = (unsigned) v4; | ||
| pto::Shape<1, 1, 1, 1, -1> v72 = pto::Shape<1, 1, 1, 1, -1>(v32); | ||
| pto::Stride<-1, -1, -1, -1, 1> v73 = pto::Stride<-1, -1, -1, -1, 1>(v71, v71, v71, v71); | ||
| GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v74 = GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v28 * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v72, v73); | ||
| wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); | ||
| pipe_barrier(PIPE_MTE3); | ||
| TSTORE(v74, v46); | ||
| set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); | ||
| }; | ||
| } | ||
| wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); | ||
| wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); | ||
| wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); | ||
| wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); | ||
| #endif // __DAV_VEC__ | ||
|
|
||
| ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); | ||
| return; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The path to the Ascend driver is hardcoded to
/usr/local/Ascend/driver. This can cause build failures on environments where the driver is installed in a different location. It is recommended to use an environment variable or a CMake cache variable to allow users to specify the path.