Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/runtime_internals/stubs.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ ROCm:

## Implementation

The stubs in `src/target/stubs/` implement a lazy-loading mechanism:
The CUDA stubs in `src/backend/cuda/stubs/` and ROCm stubs in
`src/backend/rocm/stubs/` implement a lazy-loading mechanism:

- **Lazy Loading**: Libraries are loaded via `dlopen` only upon the first API call.
- **Global Symbol Reuse**: For `cudart` and `nvrtc`, the stubs first check the global namespace (`RTLD_DEFAULT`) to use any already loaded symbols (e.g., from PyTorch).
Expand Down
6 changes: 3 additions & 3 deletions src/backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ if(TILELANG_USE_CUDA_STUBS)
# - cuModuleLoadData, cuLaunchKernel, cuMemsetD32_v2, etc.
# These can be called directly without any wrapper macros.
# ============================================================================
add_library(cuda_stub SHARED src/backend/cuda/codegen/stubs/cuda.cc)
add_library(cuda_stub SHARED src/backend/cuda/stubs/cuda.cc)
target_include_directories(cuda_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
# Export symbols with visibility="default" when building
target_compile_definitions(cuda_stub PRIVATE TILELANG_CUDA_STUB_EXPORTS)
Expand Down Expand Up @@ -77,7 +77,7 @@ if(TILELANG_USE_CUDA_STUBS)
# The stub exports a minimal set of CUDA Runtime API entrypoints used by TVM
# and lazily loads libcudart at runtime on first API call.
# ============================================================================
add_library(cudart_stub SHARED src/backend/cuda/codegen/stubs/cudart.cc)
add_library(cudart_stub SHARED src/backend/cuda/stubs/cudart.cc)
target_include_directories(cudart_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_compile_definitions(cudart_stub PRIVATE TILELANG_CUDART_STUB_EXPORTS
${TILELANG_CUDA_TOOLKIT_VERSION_DEFINITIONS})
Expand Down Expand Up @@ -111,7 +111,7 @@ if(TILELANG_USE_CUDA_STUBS)
# The stub exports a minimal set of NVRTC C API entrypoints used by TVM and
# lazily loads libnvrtc at runtime on first API call.
# ============================================================================
add_library(nvrtc_stub SHARED src/backend/cuda/codegen/stubs/nvrtc.cc)
add_library(nvrtc_stub SHARED src/backend/cuda/stubs/nvrtc.cc)
target_include_directories(nvrtc_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_compile_definitions(nvrtc_stub PRIVATE TILELANG_NVRTC_STUB_EXPORTS
${TILELANG_CUDA_TOOLKIT_VERSION_DEFINITIONS})
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
* exported global functions with C linkage:
*
* ```cpp
* #include "target/stubs/cuda.h"
* #include "backend/cuda/stubs/cuda.h"
* CUresult result = cuModuleLoadData(&mod, image);
* ```
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@

// Guard to ensure this header is only included by the stub library
#ifndef _TILELANG_CUDA_STUB_INCLUDE_GUARD
#error "vendor/cuda.h should only be included by target/stubs/cuda.h. " \
"Do not include this file directly and use target/stubs/cuda.h instead."
#error "vendor/cuda.h should only be included by backend/cuda/stubs/cuda.h. " \
"Do not include this file directly and use backend/cuda/stubs/cuda.h instead."
#endif

#ifndef __cuda_cuda_h__
Expand Down
4 changes: 2 additions & 2 deletions src/backend/rocm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ if(TILELANG_USE_HIP_STUBS)
# It also provides minimal HSA wrappers (hsa_init / hsa_shut_down) to avoid a
# hard DT_NEEDED dependency on libhsa-runtime64 in ROCm-enabled wheels.
# ============================================================================
add_library(hip_stub SHARED src/backend/rocm/codegen/stubs/hip.cc)
add_library(hip_stub SHARED src/backend/rocm/stubs/hip.cc)
target_include_directories(hip_stub PRIVATE ${ROCM_INCLUDE_DIRS})
target_compile_definitions(hip_stub PRIVATE TILELANG_HIP_STUB_EXPORTS)
target_link_libraries(hip_stub PRIVATE ${CMAKE_DL_LIBS})
Expand All @@ -40,7 +40,7 @@ if(TILELANG_USE_HIP_STUBS)
# This library provides a minimal HIPRTC API surface and lazily loads
# libhiprtc.so at runtime.
# ============================================================================
add_library(hiprtc_stub SHARED src/backend/rocm/codegen/stubs/hiprtc.cc)
add_library(hiprtc_stub SHARED src/backend/rocm/stubs/hiprtc.cc)
target_include_directories(hiprtc_stub PRIVATE ${ROCM_INCLUDE_DIRS})
target_compile_definitions(hiprtc_stub PRIVATE TILELANG_HIPRTC_STUB_EXPORTS)
target_link_libraries(hiprtc_stub PRIVATE ${CMAKE_DL_LIBS})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* wrapper functions that serve as drop-in replacements for the HIP runtime /
* module APIs used by TVM/TileLang.
*
* The implementation mirrors src/target/stubs/cuda.cc:
* The implementation mirrors src/backend/cuda/stubs/cuda.cc:
* - Resolve symbols via dlopen/dlsym on first use.
* - Prefer RTLD_DEFAULT/RTLD_NEXT when HIP is already loaded by another
* framework (e.g. PyTorch ROCm).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* \file hip.h
* \brief Stub library header for lazy loading ROCm/HIP libraries at runtime.
*
* This mirrors the existing CUDA stubs in src/target/stubs/:
* This mirrors the existing CUDA stubs in src/backend/cuda/stubs/:
* - Instead of linking against libamdhip64.so at build time, TileLang can link
* against a small stub library (libhip_stub.so) that resolves HIP symbols via
* dlopen()/dlsym() on first use.
Expand All @@ -13,7 +13,7 @@
* 3. Building a single wheel that can run across environments.
*
* Usage:
* #include "target/stubs/hip.h"
* #include "backend/rocm/stubs/hip.h"
* hipError_t e = hipSetDevice(0);
*/

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
* \file hiprtc.cc
* \brief HIPRTC stub library for lazy loading libhiprtc.so at runtime.
*
* Similar to src/target/stubs/nvrtc.cc, this stub exports a minimal subset of
* the HIPRTC C API and resolves the real implementation with dlopen()/dlsym().
* Similar to src/backend/cuda/stubs/nvrtc.cc, this stub exports a minimal
* subset of the HIPRTC C API and resolves the real implementation with
* dlopen()/dlsym().
*
* This allows a ROCm-enabled TileLang build to be imported on machines without
* ROCm installed, and avoids hard DT_NEEDED dependencies on libhiprtc.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
// Minimal HIP runtime/driver API declarations for TileLang's HIP stub library.
//
// This file exists to allow building the stub (src/target/stubs/hip.cc) without
// requiring a full ROCm SDK at build time. When ROCm headers are available,
// target/stubs/hip.h prefers including <hip/hip_runtime_api.h>.
// This file exists to allow building the stub (src/backend/rocm/stubs/hip.cc)
// without requiring a full ROCm SDK at build time. When ROCm headers are
// available, backend/rocm/stubs/hip.h prefers including
// <hip/hip_runtime_api.h>.
//
// IMPORTANT:
// - This header is NOT a complete HIP API.
// - Types that are passed by pointer are kept opaque/incomplete on purpose.
// - Do not include this file directly; include target/stubs/hip.h instead.
// - Do not include this file directly; include backend/rocm/stubs/hip.h
// instead.

// Guard to ensure this header is only included by the stub wrapper header.
#ifndef _TILELANG_HIP_STUB_INCLUDE_GUARD
#error "vendor/hip_runtime.h should only be included by target/stubs/hip.h. " \
"Do not include this file directly; include target/stubs/hip.h instead."
#error \
"vendor/hip_runtime.h should only be included by backend/rocm/stubs/hip.h. " \
"Do not include this file directly; include backend/rocm/stubs/hip.h instead."
#endif

#pragma once
Expand Down
2 changes: 1 addition & 1 deletion src/op/builtin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <tvm/tir/op.h>
#include <tvm/tir/op_attr_types.h>

#include "backend/cuda/codegen/stubs/cuda.h"
#include "backend/cuda/stubs/cuda.h"
#include "target/utils.h"

namespace tvm {
Expand Down
2 changes: 1 addition & 1 deletion src/op/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#define TVM_TL_OP_UTILS_H_

#include "./operator.h"
#include "backend/cuda/codegen/stubs/cuda.h"
#include "backend/cuda/stubs/cuda.h"
#include "region.h"
#include "tvm/runtime/base.h"
#include <tvm/tir/buffer.h>
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/runtime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

#include "runtime.h"

#include "backend/cuda/codegen/stubs/cuda.h"
#include "backend/cuda/stubs/cuda.h"
#include <cstdint>
#include <sstream>
#include <tvm/ffi/function.h>
Expand Down
6 changes: 4 additions & 2 deletions testing/python/amd/test_tilelang_hip_codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,8 +611,10 @@ def kernel(


# ---------------------------------------------------------------------------
# Fix 5 — src/target/codegen_hip.cc, src/target/rt_mod_hip.cc,
# src/target/stubs/hip.cc, src/target/stubs/hip.h
# Fix 5 — src/backend/rocm/codegen/codegen_hip.cc,
# src/backend/rocm/codegen/rt_mod_hip.cc,
# src/backend/rocm/stubs/hip.cc,
# src/backend/rocm/stubs/hip.h
# T.sync_grid() → cooperative_groups::this_grid().sync()
#
# Symptom: tl::sync_grid() had no handler → same assertion / link failure.
Expand Down
Loading