diff --git a/docs/runtime_internals/stubs.md b/docs/runtime_internals/stubs.md index ee4c628e79..37d5a73fb2 100644 --- a/docs/runtime_internals/stubs.md +++ b/docs/runtime_internals/stubs.md @@ -27,7 +27,8 @@ ROCm: ## Implementation -The stubs in `src/target/stubs/` implement a lazy-loading mechanism: +The CUDA stubs in `src/backend/cuda/stubs/` and ROCm stubs in +`src/backend/rocm/stubs/` implement a lazy-loading mechanism: - **Lazy Loading**: Libraries are loaded via `dlopen` only upon the first API call. - **Global Symbol Reuse**: For `cudart` and `nvrtc`, the stubs first check the global namespace (`RTLD_DEFAULT`) to use any already loaded symbols (e.g., from PyTorch). diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index 6eac59fca2..40ac455411 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -49,7 +49,7 @@ if(TILELANG_USE_CUDA_STUBS) # - cuModuleLoadData, cuLaunchKernel, cuMemsetD32_v2, etc. # These can be called directly without any wrapper macros. # ============================================================================ - add_library(cuda_stub SHARED src/backend/cuda/codegen/stubs/cuda.cc) + add_library(cuda_stub SHARED src/backend/cuda/stubs/cuda.cc) target_include_directories(cuda_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) # Export symbols with visibility="default" when building target_compile_definitions(cuda_stub PRIVATE TILELANG_CUDA_STUB_EXPORTS) @@ -77,7 +77,7 @@ if(TILELANG_USE_CUDA_STUBS) # The stub exports a minimal set of CUDA Runtime API entrypoints used by TVM # and lazily loads libcudart at runtime on first API call. # ============================================================================ - add_library(cudart_stub SHARED src/backend/cuda/codegen/stubs/cudart.cc) + add_library(cudart_stub SHARED src/backend/cuda/stubs/cudart.cc) target_include_directories(cudart_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) target_compile_definitions(cudart_stub PRIVATE TILELANG_CUDART_STUB_EXPORTS ${TILELANG_CUDA_TOOLKIT_VERSION_DEFINITIONS}) @@ -111,7 +111,7 @@ if(TILELANG_USE_CUDA_STUBS) # The stub exports a minimal set of NVRTC C API entrypoints used by TVM and # lazily loads libnvrtc at runtime on first API call. # ============================================================================ - add_library(nvrtc_stub SHARED src/backend/cuda/codegen/stubs/nvrtc.cc) + add_library(nvrtc_stub SHARED src/backend/cuda/stubs/nvrtc.cc) target_include_directories(nvrtc_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) target_compile_definitions(nvrtc_stub PRIVATE TILELANG_NVRTC_STUB_EXPORTS ${TILELANG_CUDA_TOOLKIT_VERSION_DEFINITIONS}) diff --git a/src/backend/cuda/codegen/stubs/cuda.cc b/src/backend/cuda/stubs/cuda.cc similarity index 100% rename from src/backend/cuda/codegen/stubs/cuda.cc rename to src/backend/cuda/stubs/cuda.cc diff --git a/src/backend/cuda/codegen/stubs/cuda.h b/src/backend/cuda/stubs/cuda.h similarity index 99% rename from src/backend/cuda/codegen/stubs/cuda.h rename to src/backend/cuda/stubs/cuda.h index c67afd0448..7c04431d1d 100644 --- a/src/backend/cuda/codegen/stubs/cuda.h +++ b/src/backend/cuda/stubs/cuda.h @@ -25,7 +25,7 @@ * exported global functions with C linkage: * * ```cpp - * #include "target/stubs/cuda.h" + * #include "backend/cuda/stubs/cuda.h" * CUresult result = cuModuleLoadData(&mod, image); * ``` * diff --git a/src/backend/cuda/codegen/stubs/cudart.cc b/src/backend/cuda/stubs/cudart.cc similarity index 100% rename from src/backend/cuda/codegen/stubs/cudart.cc rename to src/backend/cuda/stubs/cudart.cc diff --git a/src/backend/cuda/codegen/stubs/dynlib.h b/src/backend/cuda/stubs/dynlib.h similarity index 100% rename from src/backend/cuda/codegen/stubs/dynlib.h rename to src/backend/cuda/stubs/dynlib.h diff --git a/src/backend/cuda/codegen/stubs/nvrtc.cc b/src/backend/cuda/stubs/nvrtc.cc similarity index 100% rename from src/backend/cuda/codegen/stubs/nvrtc.cc rename to src/backend/cuda/stubs/nvrtc.cc diff --git a/src/backend/cuda/codegen/stubs/vendor/cuda.h b/src/backend/cuda/stubs/vendor/cuda.h similarity index 99% rename from src/backend/cuda/codegen/stubs/vendor/cuda.h rename to src/backend/cuda/stubs/vendor/cuda.h index 534831f227..cd24d451c8 100644 --- a/src/backend/cuda/codegen/stubs/vendor/cuda.h +++ b/src/backend/cuda/stubs/vendor/cuda.h @@ -49,8 +49,8 @@ // Guard to ensure this header is only included by the stub library #ifndef _TILELANG_CUDA_STUB_INCLUDE_GUARD -#error "vendor/cuda.h should only be included by target/stubs/cuda.h. " \ - "Do not include this file directly and use target/stubs/cuda.h instead." +#error "vendor/cuda.h should only be included by backend/cuda/stubs/cuda.h. " \ + "Do not include this file directly and use backend/cuda/stubs/cuda.h instead." #endif #ifndef __cuda_cuda_h__ diff --git a/src/backend/rocm/CMakeLists.txt b/src/backend/rocm/CMakeLists.txt index 903bb2fab2..b8b557eaca 100644 --- a/src/backend/rocm/CMakeLists.txt +++ b/src/backend/rocm/CMakeLists.txt @@ -23,7 +23,7 @@ if(TILELANG_USE_HIP_STUBS) # It also provides minimal HSA wrappers (hsa_init / hsa_shut_down) to avoid a # hard DT_NEEDED dependency on libhsa-runtime64 in ROCm-enabled wheels. # ============================================================================ - add_library(hip_stub SHARED src/backend/rocm/codegen/stubs/hip.cc) + add_library(hip_stub SHARED src/backend/rocm/stubs/hip.cc) target_include_directories(hip_stub PRIVATE ${ROCM_INCLUDE_DIRS}) target_compile_definitions(hip_stub PRIVATE TILELANG_HIP_STUB_EXPORTS) target_link_libraries(hip_stub PRIVATE ${CMAKE_DL_LIBS}) @@ -40,7 +40,7 @@ if(TILELANG_USE_HIP_STUBS) # This library provides a minimal HIPRTC API surface and lazily loads # libhiprtc.so at runtime. # ============================================================================ - add_library(hiprtc_stub SHARED src/backend/rocm/codegen/stubs/hiprtc.cc) + add_library(hiprtc_stub SHARED src/backend/rocm/stubs/hiprtc.cc) target_include_directories(hiprtc_stub PRIVATE ${ROCM_INCLUDE_DIRS}) target_compile_definitions(hiprtc_stub PRIVATE TILELANG_HIPRTC_STUB_EXPORTS) target_link_libraries(hiprtc_stub PRIVATE ${CMAKE_DL_LIBS}) diff --git a/src/backend/rocm/codegen/stubs/hip.cc b/src/backend/rocm/stubs/hip.cc similarity index 99% rename from src/backend/rocm/codegen/stubs/hip.cc rename to src/backend/rocm/stubs/hip.cc index 131b123f57..2166b69f72 100644 --- a/src/backend/rocm/codegen/stubs/hip.cc +++ b/src/backend/rocm/stubs/hip.cc @@ -6,7 +6,7 @@ * wrapper functions that serve as drop-in replacements for the HIP runtime / * module APIs used by TVM/TileLang. * - * The implementation mirrors src/target/stubs/cuda.cc: + * The implementation mirrors src/backend/cuda/stubs/cuda.cc: * - Resolve symbols via dlopen/dlsym on first use. * - Prefer RTLD_DEFAULT/RTLD_NEXT when HIP is already loaded by another * framework (e.g. PyTorch ROCm). diff --git a/src/backend/rocm/codegen/stubs/hip.h b/src/backend/rocm/stubs/hip.h similarity index 99% rename from src/backend/rocm/codegen/stubs/hip.h rename to src/backend/rocm/stubs/hip.h index 5030e2e7dc..4dca43c3b5 100644 --- a/src/backend/rocm/codegen/stubs/hip.h +++ b/src/backend/rocm/stubs/hip.h @@ -2,7 +2,7 @@ * \file hip.h * \brief Stub library header for lazy loading ROCm/HIP libraries at runtime. * - * This mirrors the existing CUDA stubs in src/target/stubs/: + * This mirrors the existing CUDA stubs in src/backend/cuda/stubs/: * - Instead of linking against libamdhip64.so at build time, TileLang can link * against a small stub library (libhip_stub.so) that resolves HIP symbols via * dlopen()/dlsym() on first use. @@ -13,7 +13,7 @@ * 3. Building a single wheel that can run across environments. * * Usage: - * #include "target/stubs/hip.h" + * #include "backend/rocm/stubs/hip.h" * hipError_t e = hipSetDevice(0); */ diff --git a/src/backend/rocm/codegen/stubs/hiprtc.cc b/src/backend/rocm/stubs/hiprtc.cc similarity index 97% rename from src/backend/rocm/codegen/stubs/hiprtc.cc rename to src/backend/rocm/stubs/hiprtc.cc index 664f0fb2c6..964233fd52 100644 --- a/src/backend/rocm/codegen/stubs/hiprtc.cc +++ b/src/backend/rocm/stubs/hiprtc.cc @@ -2,8 +2,9 @@ * \file hiprtc.cc * \brief HIPRTC stub library for lazy loading libhiprtc.so at runtime. * - * Similar to src/target/stubs/nvrtc.cc, this stub exports a minimal subset of - * the HIPRTC C API and resolves the real implementation with dlopen()/dlsym(). + * Similar to src/backend/cuda/stubs/nvrtc.cc, this stub exports a minimal + * subset of the HIPRTC C API and resolves the real implementation with + * dlopen()/dlsym(). * * This allows a ROCm-enabled TileLang build to be imported on machines without * ROCm installed, and avoids hard DT_NEEDED dependencies on libhiprtc. diff --git a/src/backend/rocm/codegen/stubs/vendor/hip_runtime.h b/src/backend/rocm/stubs/vendor/hip_runtime.h similarity index 89% rename from src/backend/rocm/codegen/stubs/vendor/hip_runtime.h rename to src/backend/rocm/stubs/vendor/hip_runtime.h index 43b3ff01c9..51455e25b2 100644 --- a/src/backend/rocm/codegen/stubs/vendor/hip_runtime.h +++ b/src/backend/rocm/stubs/vendor/hip_runtime.h @@ -1,18 +1,21 @@ // Minimal HIP runtime/driver API declarations for TileLang's HIP stub library. // -// This file exists to allow building the stub (src/target/stubs/hip.cc) without -// requiring a full ROCm SDK at build time. When ROCm headers are available, -// target/stubs/hip.h prefers including . +// This file exists to allow building the stub (src/backend/rocm/stubs/hip.cc) +// without requiring a full ROCm SDK at build time. When ROCm headers are +// available, backend/rocm/stubs/hip.h prefers including +// . // // IMPORTANT: // - This header is NOT a complete HIP API. // - Types that are passed by pointer are kept opaque/incomplete on purpose. -// - Do not include this file directly; include target/stubs/hip.h instead. +// - Do not include this file directly; include backend/rocm/stubs/hip.h +// instead. // Guard to ensure this header is only included by the stub wrapper header. #ifndef _TILELANG_HIP_STUB_INCLUDE_GUARD -#error "vendor/hip_runtime.h should only be included by target/stubs/hip.h. " \ - "Do not include this file directly; include target/stubs/hip.h instead." +#error \ + "vendor/hip_runtime.h should only be included by backend/rocm/stubs/hip.h. " \ + "Do not include this file directly; include backend/rocm/stubs/hip.h instead." #endif #pragma once diff --git a/src/op/builtin.cc b/src/op/builtin.cc index 3ff0df96fa..508f90c654 100644 --- a/src/op/builtin.cc +++ b/src/op/builtin.cc @@ -10,7 +10,7 @@ #include #include -#include "backend/cuda/codegen/stubs/cuda.h" +#include "backend/cuda/stubs/cuda.h" #include "target/utils.h" namespace tvm { diff --git a/src/op/utils.h b/src/op/utils.h index 83370ed48e..752c8b989c 100644 --- a/src/op/utils.h +++ b/src/op/utils.h @@ -7,7 +7,7 @@ #define TVM_TL_OP_UTILS_H_ #include "./operator.h" -#include "backend/cuda/codegen/stubs/cuda.h" +#include "backend/cuda/stubs/cuda.h" #include "region.h" #include "tvm/runtime/base.h" #include diff --git a/src/runtime/runtime.cc b/src/runtime/runtime.cc index c59dbb4299..f6112d8420 100644 --- a/src/runtime/runtime.cc +++ b/src/runtime/runtime.cc @@ -6,7 +6,7 @@ #include "runtime.h" -#include "backend/cuda/codegen/stubs/cuda.h" +#include "backend/cuda/stubs/cuda.h" #include #include #include diff --git a/testing/python/amd/test_tilelang_hip_codegen.py b/testing/python/amd/test_tilelang_hip_codegen.py index d042a3d08e..9bb1099ff1 100644 --- a/testing/python/amd/test_tilelang_hip_codegen.py +++ b/testing/python/amd/test_tilelang_hip_codegen.py @@ -611,8 +611,10 @@ def kernel( # --------------------------------------------------------------------------- -# Fix 5 — src/target/codegen_hip.cc, src/target/rt_mod_hip.cc, -# src/target/stubs/hip.cc, src/target/stubs/hip.h +# Fix 5 — src/backend/rocm/codegen/codegen_hip.cc, +# src/backend/rocm/codegen/rt_mod_hip.cc, +# src/backend/rocm/stubs/hip.cc, +# src/backend/rocm/stubs/hip.h # T.sync_grid() → cooperative_groups::this_grid().sync() # # Symptom: tl::sync_grid() had no handler → same assertion / link failure.