tile-ai · Calaweh · May 6, 2026 · May 6, 2026 · coderabbitai · May 6, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -149,13 +149,43 @@ option(TILELANG_USE_HIP_STUBS
 unset(_TILELANG_USE_HIP_STUBS_DEFAULT)
 # Configs end
 
+# --- LLVM Detection for Hexagon/CPU Backend ---
+if(NOT DEFINED USE_LLVM)
+  set(USE_LLVM OFF CACHE STRING "Build with LLVM support (required for Hexagon)")
+endif()
+
+if(USE_LLVM STREQUAL "ON")
+  find_program(LLVM_CONFIG_PATH NAMES llvm-config-18 llvm-config-17 llvm-config)
+  if(LLVM_CONFIG_PATH)
+    set(USE_LLVM ${LLVM_CONFIG_PATH} CACHE STRING "Path to llvm-config" FORCE)
+  else()
+    message(WARNING "USE_LLVM is ON but llvm-config was not found in PATH.")
+  endif()
+endif()
+
+set(_TILELANG_USE_LLVM_SAVED ${USE_LLVM})
+
+# Load TVM sources and base configs
 include(cmake/load_tvm.cmake)
 
 if(EXISTS ${TVM_SOURCE}/cmake/config.cmake)
   include(${TVM_SOURCE}/cmake/config.cmake)
 else()
   message(FATAL_ERROR "Nor tvm provided or submodule checkout-ed.")
 endif()
+
+# Restore USE_LLVM — TVM's config.cmake may have reset it
+if(_TILELANG_USE_LLVM_SAVED AND NOT _TILELANG_USE_LLVM_SAVED STREQUAL "OFF")
+  set(USE_LLVM ${_TILELANG_USE_LLVM_SAVED} CACHE STRING "LLVM config path" FORCE)
+  message(STATUS "TileLang: restored USE_LLVM=${USE_LLVM}")
+endif()
+unset(_TILELANG_USE_LLVM_SAVED)
+
+if(USE_LLVM AND NOT USE_LLVM STREQUAL "OFF")
+  message(STATUS "TileLang Build with LLVM: ${USE_LLVM}")
+  add_definitions(-DTILELANG_HEXAGON_ENABLED)
+endif()
+
 # Re-apply TileLang's preferred backend settings after TVM's config may have
 # overridden the USE_* cache entries.
 foreach(BACKEND IN LISTS TILELANG_BACKENDS)
@@ -170,6 +200,11 @@ set(USE_GTEST OFF)
 # Include directories for TileLang
 set(TILE_LANG_INCLUDES ${TVM_INCLUDES})
 
+# Include TVM 'src' directory to resolve "tvm/runtime/hexagon/hexagon_htp.h"
+list(APPEND TILE_LANG_INCLUDES
+  ${TVM_SOURCE}/src
+)
+
 # Collect source files
 file(GLOB TILE_LANG_SRCS
   src/*.cc
@@ -188,6 +223,7 @@ file(GLOB TILE_LANG_SRCS
 # Always include CPU-safe runtime helpers
 list(APPEND TILE_LANG_SRCS
   src/runtime/error_helpers.cc
+  src/runtime/hexagon_runtime.cc
 )
 
 set(TILELANG_OUTPUT_TARGETS tilelang tvm)

diff --git a/src/runtime/hexagon_runtime.cc b/src/runtime/hexagon_runtime.cc
@@ -0,0 +1,38 @@
+#include <tvm/ffi/function.h>
+#include <tvm/ffi/reflection/registry.h>
+
+#ifdef TILELANG_HEXAGON_ENABLED
+
+#include <runtime/hexagon/hexagon_htp.h>
+
+namespace tvm {
+namespace tilelang {
+
+TVM_FFI_STATIC_INIT_BLOCK() {
+  namespace refl = tvm::ffi::reflection;
+
+  refl::GlobalDef().def_packed(
+      "tilelang.hexagon.hmx_kernel_launch",
+      [](ffi::PackedArgs args, ffi::Any *rv) {
+        // args[0] is the kernel Function; remaining args are forwarded to it.
+        // AnyView supports .cast<T>() for type-safe extraction.
+        ffi::Function kernel = args[0].cast<ffi::Function>();
+
+        // PackedArgs(const AnyView* data, int32_t size) — slice past the first
+        // arg. args.data() returns const AnyView*, args.size() returns int32_t.
+        ffi::PackedArgs kernel_args(args.data() + 1, args.size() - 1);
-      [](ffi::PackedArgs args, ffi::Any *rv) {
-        // args[0] is the kernel Function; remaining args are forwarded to it.
-        // AnyView supports .cast<T>() for type-safe extraction.
-        ffi::Function kernel = args[0].cast<ffi::Function>();
-
-        // PackedArgs(const AnyView* data, int32_t size) — slice past the first
-        // arg. args.data() returns const AnyView*, args.size() returns int32_t.
-        ffi::PackedArgs kernel_args(args.data() + 1, args.size() - 1);
+      [](ffi::PackedArgs args, ffi::Any *rv) {
+        ICHECK_GE(args.size(), 1)
+            << "tilelang.hexagon.hmx_kernel_launch expects a kernel function as arg0";
+        // args[0] is the kernel Function; remaining args are forwarded to it.
+        // AnyView supports .cast<T>() for type-safe extraction.
+        ffi::Function kernel = args[0].cast<ffi::Function>();
+
+        // PackedArgs(const AnyView* data, int32_t size) — slice past the first
+        // arg. args.data() returns const AnyView*, args.size() returns int32_t.
+        ffi::PackedArgs kernel_args(args.data() + 1, args.size() - 1);
-      [](ffi::PackedArgs args, ffi::Any *rv) {
-        // args[0] is the kernel Function; remaining args are forwarded to it.
-        // AnyView supports .cast<T>() for type-safe extraction.
-        ffi::Function kernel = args[0].cast<ffi::Function>();
-
-        // PackedArgs(const AnyView* data, int32_t size) — slice past the first
-        // arg. args.data() returns const AnyView*, args.size() returns int32_t.
-        ffi::PackedArgs kernel_args(args.data() + 1, args.size() - 1);
+      [](ffi::PackedArgs args, ffi::Any *rv) {
+        ICHECK_GE(args.size(), 1)
+            << "tilelang.hexagon.hmx_kernel_launch expects a kernel function as arg0";
+        // args[0] is the kernel Function; remaining args are forwarded to it.
+        // AnyView supports .cast<T>() for type-safe extraction.
+        ffi::Function kernel = args[0].cast<ffi::Function>();
+
+        // PackedArgs(const AnyView* data, int32_t size) — slice past the first
+        // arg. args.data() returns const AnyView*, args.size() returns int32_t.
+        ffi::PackedArgs kernel_args(args.data() + 1, args.size() - 1);
+
+        // RAII: powers on HMX on construction, releases on scope exit.
+        tvm::runtime::hexagon::HexagonHtp htp;
+
+        kernel.CallPacked(kernel_args, rv);
+      });
+}
+
+} // namespace tilelang
+} // namespace tvm
+
+#else
+// Hexagon runtime support disabled.
+// Build with -DUSE_LLVM=ON to enable HMX kernel launch support.
+#endif // TILELANG_HEXAGON_ENABLED
diff --git a/src/transform/lower_hexagon_intrinsics.cc b/src/transform/lower_hexagon_intrinsics.cc
@@ -0,0 +1,99 @@
+#include <tvm/ffi/reflection/registry.h>
+#include <tvm/target/target_info.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/op.h>
+#include <tvm/tir/stmt_functor.h>
+#include <tvm/tir/transform.h>
+
+namespace tvm {
+namespace tilelang {
+
+using namespace tir;
+using tvm::ffi::Array;
+
+class HexagonIntrinsicLowerer : public StmtExprMutator {
+public:
+  HexagonIntrinsicLowerer() {}
+
+  Stmt Run(Stmt stmt) { return this->VisitStmt(stmt); }
+
+  Stmt VisitStmt_(const EvaluateNode *op) override {
+    if (const CallNode *call = op->value.as<CallNode>()) {
+      if (call->op.same_as(builtin::call_extern())) {
+        if (const StringImmNode *func_name =
+                call->args[0].as<StringImmNode>()) {
+
+          // Lower HMX MMA placeholder
+          if (func_name->value == "hmx_mma_placeholder") {
+            Array<PrimExpr> new_args;
+            new_args.push_back(StringImm("HexKL_mma_i8acc32"));
+            new_args.push_back(
+                call->args[3]); // C_acc (accumulator — first arg to HexKL)
+            new_args.push_back(call->args[1]); // A_vtcm
+            new_args.push_back(call->args[2]); // B_vtcm
+            return Evaluate(
-          if (func_name->value == "hmx_mma_placeholder") {
-            Array<PrimExpr> new_args;
-            new_args.push_back(StringImm("HexKL_mma_i8acc32"));
-            new_args.push_back(
-                call->args[3]); // C_acc (accumulator — first arg to HexKL)
-            new_args.push_back(call->args[1]); // A_vtcm
-            new_args.push_back(call->args[2]); // B_vtcm
-            return Evaluate(
+          if (func_name->value == "hmx_mma_placeholder") {
+            ICHECK_EQ(call->args.size(), 4)
+                << "hmx_mma_placeholder expects exactly 3 operands";
+            Array<PrimExpr> new_args;
+            new_args.push_back(StringImm("HexKL_mma_i8acc32"));
+            new_args.push_back(
+                call->args[3]); // C_acc (accumulator — first arg to HexKL)
+            new_args.push_back(call->args[1]); // A_vtcm
+            new_args.push_back(call->args[2]); // B_vtcm
+            return Evaluate(
-          if (func_name->value == "hmx_mma_placeholder") {
-            Array<PrimExpr> new_args;
-            new_args.push_back(StringImm("HexKL_mma_i8acc32"));
-            new_args.push_back(
-                call->args[3]); // C_acc (accumulator — first arg to HexKL)
-            new_args.push_back(call->args[1]); // A_vtcm
-            new_args.push_back(call->args[2]); // B_vtcm
-            return Evaluate(
+          if (func_name->value == "hmx_mma_placeholder") {
+            ICHECK_EQ(call->args.size(), 4)
+                << "hmx_mma_placeholder expects exactly 3 operands";
+            Array<PrimExpr> new_args;
+            new_args.push_back(StringImm("HexKL_mma_i8acc32"));
+            new_args.push_back(
+                call->args[3]); // C_acc (accumulator — first arg to HexKL)
+            new_args.push_back(call->args[1]); // A_vtcm
+            new_args.push_back(call->args[2]); // B_vtcm
+            return Evaluate(
+                Call(DataType::Int(32), builtin::call_extern(), new_args));
+          }
+
+          // HexagonDmaCopy is not yet available in HexKL v73.
+          // to-do: LowerHexagonDMA pass.
+        }
+      }
+    }
+    return StmtExprMutator::VisitStmt_(op);
+  }
+};
+
+namespace transform {
+
+tvm::transform::Pass LowerHexagonIntrinsics() {
+  auto pass_func = [=](PrimFunc f, IRModule m,
+                       tvm::transform::PassContext ctx) {
+    auto *n = f.CopyOnWrite();
+    n->body = HexagonIntrinsicLowerer().Run(std::move(n->body));
+    return f;
+  };
+  return tvm::tir::transform::CreatePrimFuncPass(
+      pass_func, 0, "tilelang.transform.LowerHexagonIntrinsics", {});
+}
+
+// Memory scope descriptors
+// These are queried by TVM's storage analysis to understand capacity/alignment.
+// Fields confirmed from tvm/target/target_info.h:
+//   unit_bits     — addressable unit size in bits
+//   max_num_bits  — total memory capacity in bits
+//   max_simd_bits — widest SIMD operation in bits (HVX = 1024-bit)
+//   head_address  — base address PrimExpr (IntImm 0 = no fixed mapping)
+
+static MemoryInfo GetHmxAccMem() {
+  auto n = tvm::ffi::make_object<MemoryInfoNode>();
+  // HMX accumulator register file: 32×32 int32 = 32768 bits
+  n->unit_bits = 32;                // 32-bit int32 elements
+  n->max_num_bits = 32LL * 32 * 32; // 32768 bits total
+  n->max_simd_bits = 1024;          // HVX vector width
+  n->head_address = IntImm(DataType::Int(32), 0);
+  return MemoryInfo(n);
+}
+
+static MemoryInfo GetVtcmMem() {
+  auto n = tvm::ffi::make_object<MemoryInfoNode>();
+  // VTCM on Hexagon v73: 8 MB
+  n->unit_bits = 8;                        // byte-addressable
+  n->max_num_bits = 8LL * 1024 * 1024 * 8; // 8 MB in bits
+  n->max_simd_bits = 1024;                 // HVX vector width
+  n->head_address = IntImm(DataType::Int(32), 0);
+  return MemoryInfo(n);
+}
+
+TVM_FFI_STATIC_INIT_BLOCK() {
+  namespace refl = tvm::ffi::reflection;
+  refl::GlobalDef()
+      .def("tilelang.transform.LowerHexagonIntrinsics", LowerHexagonIntrinsics)
+      .def("tvm.info.mem.global.hmx.acc", GetHmxAccMem)
+      .def("tvm.info.mem.global.vtcm", GetVtcmMem);
+}
+
+} // namespace transform
+} // namespace tilelang
+} // namespace tvm
diff --git a/testing/python/hexagon/diagnose_hmx.py b/testing/python/hexagon/diagnose_hmx.py
@@ -0,0 +1,141 @@
+import pytest
+from tilelang import tvm as tvm
+import tilelang as tl
+import tilelang.language as T
+from tilelang.intrinsics.hexagon import hmx
+
+
+def has_hexagon_codegen():
+    try:
+        if not tvm.runtime.enabled("llvm"):
+            return False
+        tvm.target.Target("llvm -mtriple=hexagon -mcpu=hexagonv73")
+        return True
+    except Exception:
+        return False
+
+
+def build_hmx_matmul(M, N, K):
+    @T.prim_func
+    def main(
+        A_host: T.Tensor((M, K), "int8"),
+        B_host: T.Tensor((K, N), "int8"),
+        C_host: T.Tensor((M, N), "int32"),
+    ):
+        A_vtcm = T.alloc_fragment((M, K), "int8", scope="global.vtcm")
+        B_vtcm = T.alloc_fragment((K, N), "int8", scope="global.vtcm")
+        C_acc = T.alloc_fragment((M, N), "int32", scope="global.hmx.acc")
+
+        for i, k in T.grid(M, K):
+            A_vtcm[i, k] = A_host[i, k]
+        for k, j in T.grid(K, N):
+            B_vtcm[k, j] = B_host[k, j]
+        for i, j in T.grid(M, N):
+            C_acc[i, j] = T.cast(0, "int32")
+
+        hmx.mma(A_vtcm, B_vtcm, C_acc)
+
+        for i, j in T.grid(M, N):
+            C_host[i, j] = C_acc[i, j]
+
+    return main
+
+
+# Diagnostics (always run, no skipif)
+def test_000_environment():
+    """Report the full environment so we know exactly what we're working with."""
+    print("\n")
+    print("=" * 60)
+    print("ENVIRONMENT REPORT")
+    print("=" * 60)
+    print(f"  tvm.__file__         : {tvm.__file__}")
+    print(f"  tvm.__version__      : {tvm.__version__}")
+    print(f"  llvm enabled         : {tvm.runtime.enabled('llvm')}")
+    print(f"  has_hexagon_codegen(): {has_hexagon_codegen()}")
+
+    try:
+        t = tvm.target.Target("llvm -mtriple=hexagon -mcpu=hexagonv73")
+        print(f"  hexagon target       : OK → {t}")
+    except Exception as e:
+        print(f"  hexagon target       : FAILED → {e}")
+    print("=" * 60)
+
+
+@pytest.mark.skipif(not has_hexagon_codegen(), reason="Hexagon LLVM not available")
+def test_001_ir_dump():
+    """Dump the full kernel_source so we can see what was actually generated."""
+    M, N, K = 32, 32, 32
+    func = build_hmx_matmul(M, N, K)
+    target = tvm.target.Target("llvm -mtriple=hexagon -mcpu=hexagonv73")
+    kernel = tl.compile(func, target=target)
+    ir = kernel.kernel_source
+
+    print("\n")
+    print("=" * 60)
+    print("FULL KERNEL SOURCE")
+    print("=" * 60)
+    print(ir)
+    print("=" * 60)
+
+    # Report which assertions would pass/fail without actually asserting
+    checks = {
+        'target triple = "hexagon"': "target triple",
+        "A_vtcm": "VTCM alloc A",
+        "B_vtcm": "VTCM alloc B",
+        "C_acc": "HMX accumulator",
+        "hmx_mma_placeholder": "placeholder NOT lowered (bad)",
+        "HexKL_mma_i8acc32": "HexKL intrinsic (good)",
+        "HexKL_mma_i8i32": "HexKL alt spelling",
+        "call_extern": "any call_extern",
+        "llvm.hexagon": "LLVM hexagon intrinsic",
+    }
+
+    print("\nASSERTION PROBE RESULTS:")
+    for needle, label in checks.items():
+        found = needle in ir
+        status = "✓ FOUND  " if found else "✗ MISSING"
+        print(f"  {status}  [{label}]  '{needle}'")
+
+    print()
+    # Only hard-assert on things we're sure about
+    assert 'target triple = "hexagon"' in ir, "Not even targeting Hexagon — target string is wrong or codegen didn't run"
+
+
+@pytest.mark.skipif(not has_hexagon_codegen(), reason="Hexagon LLVM not available")
+def test_002_hmx_lowering_status():
+    """Specifically check whether HMX intrinsics were lowered or are still placeholders."""
+    M, N, K = 32, 32, 32
+    func = build_hmx_matmul(M, N, K)
+    target = tvm.target.Target("llvm -mtriple=hexagon -mcpu=hexagonv73")
+    kernel = tl.compile(func, target=target)
+    ir = kernel.kernel_source
+
+    placeholder_present = "hmx_mma_placeholder" in ir
+    hexkl_present = any(
+        s in ir
+        for s in [
+            "HexKL_mma_i8acc32",
+            "HexKL_mma_i8i32",
+            "HexKL_mma",
+        ]
+    )
+    llvm_intrin_present = "llvm.hexagon" in ir
+
+    print(f"\n  placeholder still in IR : {placeholder_present}")
+    print(f"  HexKL intrinsic in IR   : {hexkl_present}")
+    print(f"  llvm.hexagon in IR      : {llvm_intrin_present}")
+
+    if placeholder_present:
+        pytest.fail(
+            "hmx_mma_placeholder was NOT lowered.\n"
+            "_lower_hexagon_intrinsics is not wired into the compile pipeline.\n"
+            "Check lower() in tilelang/engine/lower.py"
+        )
+    elif not hexkl_present and not llvm_intrin_present:
+        pytest.fail(
+            "HMX placeholder is gone but no HexKL/llvm.hexagon intrinsic was emitted.\n"
+            "The lowering pass may be silently dropping the MMA op.\n"
+            "Check LowerHMXIntrinsics implementation."
+        )
+    else:
+        print("  ✓ HMX intrinsics correctly lowered")