diff --git a/CMakeLists.txt b/CMakeLists.txt index 26af62a550..212b1f67ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,6 +190,8 @@ list(APPEND TILE_LANG_SRCS src/runtime/error_helpers.cc ) +set(TILELANG_OUTPUT_TARGETS tilelang tvm) + # Track if the user explicitly selected a backend via cache options. set(TILELANG_BACKEND_USER_SELECTED OFF) foreach(BACKEND IN LISTS TILELANG_BACKENDS) @@ -222,201 +224,11 @@ if(NOT TILELANG_BACKEND_USER_SELECTED) endif() endif() -if(USE_METAL) - if(NOT APPLE) - # On non-Apple platforms USE_METAL=ON enables only codegen (Metal source - # generation) without requiring the Metal/Foundation frameworks. - message(STATUS "Metal backend on non-Apple: enabling codegen-only mode (no Metal runtime)") - set(USE_METAL OFF) - endif() - file(GLOB TILE_LANG_METAL_SRCS - src/target/rt_mod_metal.cc - ) - list(APPEND TILE_LANG_SRCS ${TILE_LANG_METAL_SRCS}) - # FIXME: CIBW failed with backtrace, why??? - set(TVM_FFI_USE_LIBBACKTRACE OFF) -elseif(USE_ROCM) - set(CMAKE_HIP_STANDARD 17) - include(${TVM_SOURCE}/cmake/utils/FindROCM.cmake) - find_rocm(${USE_ROCM}) - add_compile_definitions(__HIP_PLATFORM_AMD__ __HIP_PLATFORM_HCC__=1) - - if(TILELANG_USE_HIP_STUBS) - if(WIN32 AND NOT CYGWIN) - message(FATAL_ERROR "TILELANG_USE_HIP_STUBS=ON is not supported on Windows. " - "Please configure with -DTILELANG_USE_HIP_STUBS=OFF.") - endif() - - # ============================================================================ - # HIP Stub Library (libhip_stub.so) - # ============================================================================ - # This library provides drop-in replacements for HIP runtime/module APIs by - # lazily loading libamdhip64.so at runtime. - # - # It also provides minimal HSA wrappers (hsa_init / hsa_shut_down) to avoid a - # hard DT_NEEDED dependency on libhsa-runtime64 in ROCm-enabled wheels. - # ============================================================================ - add_library(hip_stub SHARED src/target/stubs/hip.cc) - target_include_directories(hip_stub PRIVATE ${ROCM_INCLUDE_DIRS}) - target_compile_definitions(hip_stub PRIVATE TILELANG_HIP_STUB_EXPORTS) - target_link_libraries(hip_stub PRIVATE ${CMAKE_DL_LIBS}) - set_target_properties(hip_stub PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - OUTPUT_NAME "hip_stub" - ) - - # ============================================================================ - # HIPRTC Stub Library (libhiprtc_stub.so) - # ============================================================================ - # This library provides a minimal HIPRTC API surface and lazily loads - # libhiprtc.so at runtime. - # ============================================================================ - add_library(hiprtc_stub SHARED src/target/stubs/hiprtc.cc) - target_include_directories(hiprtc_stub PRIVATE ${ROCM_INCLUDE_DIRS}) - target_compile_definitions(hiprtc_stub PRIVATE TILELANG_HIPRTC_STUB_EXPORTS) - target_link_libraries(hiprtc_stub PRIVATE ${CMAKE_DL_LIBS}) - set_target_properties(hiprtc_stub PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - OUTPUT_NAME "hiprtc_stub" - ) - - # Make TVM link against our HIP stub instead of the real libamdhip64.so. - # - # NOTE: TVM's `find_rocm()` calls `find_library(ROCM_HIPHCC_LIBRARY amdhip64 ...)`. - # `find_library()` will not override an already-cached variable, so setting it - # here ensures TVM doesn't record a DT_NEEDED on libamdhip64. - set(ROCM_HIPHCC_LIBRARY hip_stub CACHE STRING "HIP runtime library to link against" FORCE) - - # Prevent TVM from recording a DT_NEEDED on libhsa-runtime64. - # The few HSA entrypoints used by TVM are stubbed by hip_stub and resolved - # lazily when available. - set(ROCM_HSA_LIBRARY ROCM_HSA_LIBRARY-NOTFOUND CACHE STRING - "HSA runtime library to link against" FORCE) - endif() - - file(GLOB TILE_LANG_HIP_SRCS - src/target/codegen_hip.cc - src/target/rt_mod_hip.cc - ) - list(APPEND TILE_LANG_SRCS ${TILE_LANG_HIP_SRCS}) - list(APPEND TILE_LANG_INCLUDES ${ROCM_INCLUDE_DIRS}) -elseif(USE_CUDA) - set(CMAKE_CUDA_STANDARD 17) - find_package(CUDAToolkit REQUIRED) - set(CMAKE_CUDA_COMPILER "${CUDAToolkit_BIN_DIR}/nvcc") - add_compile_definitions("CUDA_MAJOR_VERSION=${CUDAToolkit_VERSION_MAJOR}") - - # Set `USE_CUDA=/usr/local/cuda-x.y` - cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH USE_CUDA) - - if(TILELANG_USE_CUDA_STUBS) - if(WIN32 AND NOT CYGWIN) - message(FATAL_ERROR "TILELANG_USE_CUDA_STUBS=ON is not supported on Windows. " - "Please configure with -DTILELANG_USE_CUDA_STUBS=OFF.") - endif() - - # ============================================================================ - # CUDA Driver Stub Library (libcuda_stub.so) - # ============================================================================ - # This library provides drop-in replacements for CUDA driver API functions. - # Instead of linking directly against libcuda.so (which would fail on - # CPU-only machines), we link against this stub which loads libcuda.so - # lazily at runtime on first API call. - # - # The stub exports global C functions matching the CUDA driver API: - # - cuModuleLoadData, cuLaunchKernel, cuMemsetD32_v2, etc. - # These can be called directly without any wrapper macros. - # ============================================================================ - add_library(cuda_stub SHARED src/target/stubs/cuda.cc) - target_include_directories(cuda_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) - # Export symbols with visibility="default" when building - target_compile_definitions(cuda_stub PRIVATE TILELANG_CUDA_STUB_EXPORTS) - # Use dlopen/dlsym for runtime library loading - target_link_libraries(cuda_stub PRIVATE ${CMAKE_DL_LIBS}) - set_target_properties(cuda_stub PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - # Use consistent naming - OUTPUT_NAME "cuda_stub" - ) - - # ============================================================================ - # CUDA Runtime Stub Library (libcudart_stub.so) - # ============================================================================ - # libcudart's SONAME includes its major version (e.g. libcudart.so.11.0 / .12 / .13). - # Link against this stub instead of the real libcudart so a single wheel can - # run in environments that provide different libcudart major versions. - # - # The stub exports a minimal set of CUDA Runtime API entrypoints used by TVM - # and lazily loads libcudart at runtime on first API call. - # ============================================================================ - add_library(cudart_stub SHARED src/target/stubs/cudart.cc) - target_include_directories(cudart_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) - target_compile_definitions(cudart_stub PRIVATE TILELANG_CUDART_STUB_EXPORTS) - target_link_libraries(cudart_stub PRIVATE ${CMAKE_DL_LIBS}) - set_target_properties(cudart_stub PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - OUTPUT_NAME "cudart_stub" - ) - - # Make TVM link against our CUDA Runtime stub instead of the real libcudart. - # - # NOTE: TVM's `find_cuda()` calls `find_library(CUDA_CUDART_LIBRARY cudart ...)`. - # `find_library()` will not override an already-cached variable, so setting it - # here ensures TVM doesn't record a DT_NEEDED on `libcudart.so.`. - set(CUDA_CUDART_LIBRARY cudart_stub CACHE STRING "CUDART library to link against" FORCE) - - # ============================================================================ - # NVRTC Stub Library (libnvrtc_stub.so) - # ============================================================================ - # NVRTC's SONAME includes its major version (e.g. libnvrtc.so.11.2 / .12 / .13). - # Link against this stub instead of the real NVRTC library so a single wheel - # can run in environments that provide different NVRTC major versions. - # - # The stub exports a minimal set of NVRTC C API entrypoints used by TVM and - # lazily loads libnvrtc at runtime on first API call. - # ============================================================================ - add_library(nvrtc_stub SHARED src/target/stubs/nvrtc.cc) - target_include_directories(nvrtc_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) - target_compile_definitions(nvrtc_stub PRIVATE TILELANG_NVRTC_STUB_EXPORTS) - target_link_libraries(nvrtc_stub PRIVATE ${CMAKE_DL_LIBS}) - set_target_properties(nvrtc_stub PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - OUTPUT_NAME "nvrtc_stub" - ) - - # Make TVM link against our NVRTC stub instead of the real libnvrtc. - # - # NOTE: TVM's `find_cuda()` calls `find_library(CUDA_NVRTC_LIBRARY nvrtc ...)`. - # `find_library()` will not override an already-cached variable, so setting it - # here ensures TVM doesn't record a DT_NEEDED on `libnvrtc.so.`. - set(CUDA_NVRTC_LIBRARY nvrtc_stub CACHE STRING "NVRTC library to link against" FORCE) - endif() - - file(GLOB TILE_LANG_CUDA_SRCS - src/runtime/runtime.cc - src/target/ptx.cc - src/target/codegen_cuda.cc - src/target/codegen_py.cc - src/target/codegen_utils.cc - src/target/codegen_cutedsl.cc - src/target/rt_mod_cuda.cc - src/target/rt_mod_cutedsl.cc - ) - list(APPEND TILE_LANG_SRCS ${TILE_LANG_CUDA_SRCS}) - - list(APPEND TILE_LANG_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) - link_directories(${CUDAToolkit_LIBRARY_DIR} ${CUDAToolkit_LIBRARY_DIR}/stubs) -endif() +# Backend-local CMake files own native source lists, stubs, include paths, and +# compile definitions. Top-level CMake only selects and delegates. +include("${CMAKE_CURRENT_SOURCE_DIR}/src/backend/cuda/CMakeLists.txt") +include("${CMAKE_CURRENT_SOURCE_DIR}/src/backend/rocm/CMakeLists.txt") +include("${CMAKE_CURRENT_SOURCE_DIR}/src/backend/metal/CMakeLists.txt") set(USE_Z3 ON CACHE STRING "Use Z3 SMT solver for TileLang optimizations") set(USE_PYPI_Z3 ON CACHE BOOL "Use Z3 provided by PyPI z3-solver package") @@ -521,28 +333,15 @@ if(USE_Z3 AND USE_PYPI_Z3) endif() endif() -set(TILELANG_OUTPUT_TARGETS tilelang tvm) - -if(USE_CUDA AND TILELANG_USE_CUDA_STUBS) - # Link against CUDA stub library instead of libcuda.so - # This enables lazy loading of libcuda.so at runtime, allowing - # `import tilelang` to succeed on CPU-only machines. +if(DEFINED TILELANG_ACTIVE_BACKEND_STUB_LINK) foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) - target_link_libraries(${target} PUBLIC cuda_stub) + target_link_libraries(${target} PUBLIC ${TILELANG_ACTIVE_BACKEND_STUB_LINK}) endforeach() - # Include CUDA stubs in output targets for RPATH configuration - list(APPEND TILELANG_OUTPUT_TARGETS cuda_stub cudart_stub nvrtc_stub) endif() -if(USE_ROCM AND TILELANG_USE_HIP_STUBS) - # Link against HIP stub library instead of libamdhip64.so - # This enables lazy loading of libamdhip64.so at runtime, allowing - # `import tilelang` to succeed on CPU-only machines. - foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) - target_link_libraries(${target} PUBLIC hip_stub) - endforeach() - # Include HIP stubs in output targets for RPATH configuration / installation - list(APPEND TILELANG_OUTPUT_TARGETS hip_stub hiprtc_stub) +# Append stub targets after the linking loop so they don't link to themselves +if(DEFINED TILELANG_ACTIVE_BACKEND_STUB_TARGETS) + list(APPEND TILELANG_OUTPUT_TARGETS ${TILELANG_ACTIVE_BACKEND_STUB_TARGETS}) endif() unset(PATCHELF_EXECUTABLE CACHE) @@ -558,8 +357,8 @@ elseif(UNIX) if(USE_Z3 AND USE_PYPI_Z3) string(APPEND TILELANG_INSTALL_RPATH ":\$ORIGIN/../../z3/lib") endif() - if(USE_CUDA) - string(APPEND TILELANG_INSTALL_RPATH ":\$ORIGIN/../../nvidia/cu${CUDAToolkit_VERSION_MAJOR}/lib") + if(DEFINED TILELANG_ACTIVE_BACKEND_RPATH_EXTRA) + string(APPEND TILELANG_INSTALL_RPATH "${TILELANG_ACTIVE_BACKEND_RPATH_EXTRA}") endif() find_program(PATCHELF_EXECUTABLE patchelf) if (NOT PATCHELF_EXECUTABLE) @@ -577,25 +376,23 @@ foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) ) endforeach() -# Exclude libcuda.so to allow importing on a CPU-only machine -if(USE_CUDA AND TILELANG_USE_CUDA_STUBS AND PATCHELF_EXECUTABLE) - # Run `patchelf` on built libraries to remove libcuda.so dependency. - # Use `install(CODE ...)` instead of `add_custom_command(... POST_BUILD ...)` - # to avoid race conditions during linking. +# Strip backend runtime dependencies for portable wheels +if(DEFINED TILELANG_ACTIVE_BACKEND_PATCHELF_REMOVE AND PATCHELF_EXECUTABLE) + foreach(_needed IN LISTS TILELANG_ACTIVE_BACKEND_PATCHELF_REMOVE) + set(_patchelf_remove_args "${_patchelf_remove_args} --remove-needed ${_needed}") + endforeach() foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) install(CODE " execute_process( - COMMAND ${PATCHELF_EXECUTABLE} - --remove-needed libcuda.so.1 - --remove-needed libcuda.so + COMMAND ${PATCHELF_EXECUTABLE}${_patchelf_remove_args} \"$\" WORKING_DIRECTORY \"${CMAKE_INSTALL_PREFIX}\" RESULT_VARIABLE patchelf_result ) if(patchelf_result EQUAL 0) - message(STATUS \"`patchelf` successfully removed dependency `libcuda.so` from $\") + message(STATUS \"patchelf: removed dependencies from $\") else() - message(WARNING \"`patchelf` failed to remove dependency `libcuda.so` from $\") + message(WARNING \"patchelf failed for $\") endif() ") endforeach() diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt new file mode 100644 index 0000000000..2a68eb8b6d --- /dev/null +++ b/src/backend/cuda/CMakeLists.txt @@ -0,0 +1,130 @@ +# CUDA backend: toolchain, stub libraries, source files, and build configuration. +if(NOT USE_CUDA) + return() +endif() + +set(CMAKE_CUDA_STANDARD 17) +find_package(CUDAToolkit REQUIRED) +set(CMAKE_CUDA_COMPILER "${CUDAToolkit_BIN_DIR}/nvcc") +add_compile_definitions("CUDA_MAJOR_VERSION=${CUDAToolkit_VERSION_MAJOR}") + +# Set `USE_CUDA=/usr/local/cuda-x.y` +cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH USE_CUDA) + +if(TILELANG_USE_CUDA_STUBS) + if(WIN32 AND NOT CYGWIN) + message(FATAL_ERROR "TILELANG_USE_CUDA_STUBS=ON is not supported on Windows. " + "Please configure with -DTILELANG_USE_CUDA_STUBS=OFF.") + endif() + + # ============================================================================ + # CUDA Driver Stub Library (libcuda_stub.so) + # ============================================================================ + # This library provides drop-in replacements for CUDA driver API functions. + # Instead of linking directly against libcuda.so (which would fail on + # CPU-only machines), we link against this stub which loads libcuda.so + # lazily at runtime on first API call. + # + # The stub exports global C functions matching the CUDA driver API: + # - cuModuleLoadData, cuLaunchKernel, cuMemsetD32_v2, etc. + # These can be called directly without any wrapper macros. + # ============================================================================ + add_library(cuda_stub SHARED src/target/stubs/cuda.cc) + target_include_directories(cuda_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + # Export symbols with visibility="default" when building + target_compile_definitions(cuda_stub PRIVATE TILELANG_CUDA_STUB_EXPORTS) + # Use dlopen/dlsym for runtime library loading + target_link_libraries(cuda_stub PRIVATE ${CMAKE_DL_LIBS}) + set_target_properties(cuda_stub PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + # Use consistent naming + OUTPUT_NAME "cuda_stub" + ) + + # ============================================================================ + # CUDA Runtime Stub Library (libcudart_stub.so) + # ============================================================================ + # libcudart's SONAME includes its major version (e.g. libcudart.so.11.0 / .12 / .13). + # Link against this stub instead of the real libcudart so a single wheel can + # run in environments that provide different libcudart major versions. + # + # The stub exports a minimal set of CUDA Runtime API entrypoints used by TVM + # and lazily loads libcudart at runtime on first API call. + # ============================================================================ + add_library(cudart_stub SHARED src/target/stubs/cudart.cc) + target_include_directories(cudart_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + target_compile_definitions(cudart_stub PRIVATE TILELANG_CUDART_STUB_EXPORTS) + target_link_libraries(cudart_stub PRIVATE ${CMAKE_DL_LIBS}) + set_target_properties(cudart_stub PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + OUTPUT_NAME "cudart_stub" + ) + + # Make TVM link against our CUDA Runtime stub instead of the real libcudart. + # + # NOTE: TVM's `find_cuda()` calls `find_library(CUDA_CUDART_LIBRARY cudart ...)`. + # `find_library()` will not override an already-cached variable, so setting it + # here ensures TVM doesn't record a DT_NEEDED on `libcudart.so.`. + set(CUDA_CUDART_LIBRARY cudart_stub CACHE STRING "CUDART library to link against" FORCE) + + # ============================================================================ + # NVRTC Stub Library (libnvrtc_stub.so) + # ============================================================================ + # NVRTC's SONAME includes its major version (e.g. libnvrtc.so.11.2 / .12 / .13). + # Link against this stub instead of the real NVRTC library so a single wheel + # can run in environments that provide different NVRTC major versions. + # + # The stub exports a minimal set of NVRTC C API entrypoints used by TVM and + # lazily loads libnvrtc at runtime on first API call. + # ============================================================================ + add_library(nvrtc_stub SHARED src/target/stubs/nvrtc.cc) + target_include_directories(nvrtc_stub PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + target_compile_definitions(nvrtc_stub PRIVATE TILELANG_NVRTC_STUB_EXPORTS) + target_link_libraries(nvrtc_stub PRIVATE ${CMAKE_DL_LIBS}) + set_target_properties(nvrtc_stub PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + OUTPUT_NAME "nvrtc_stub" + ) + + # Make TVM link against our NVRTC stub instead of the real libnvrtc. + # + # NOTE: TVM's `find_cuda()` calls `find_library(CUDA_NVRTC_LIBRARY nvrtc ...)`. + # `find_library()` will not override an already-cached variable, so setting it + # here ensures TVM doesn't record a DT_NEEDED on `libnvrtc.so.`. + set(CUDA_NVRTC_LIBRARY nvrtc_stub CACHE STRING "NVRTC library to link against" FORCE) +endif() + +file(GLOB TILE_LANG_CUDA_SRCS + src/runtime/runtime.cc + src/target/ptx.cc + src/target/codegen_cuda.cc + src/target/codegen_py.cc + src/target/codegen_utils.cc + src/target/codegen_cutedsl.cc + src/target/rt_mod_cuda.cc + src/target/rt_mod_cutedsl.cc +) +list(APPEND TILE_LANG_SRCS ${TILE_LANG_CUDA_SRCS}) + +list(APPEND TILE_LANG_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) +link_directories(${CUDAToolkit_LIBRARY_DIR} ${CUDAToolkit_LIBRARY_DIR}/stubs) + +# Register stubs for linking and install +if(TILELANG_USE_CUDA_STUBS) + set(TILELANG_ACTIVE_BACKEND_STUB_LINK cuda_stub) + set(TILELANG_ACTIVE_BACKEND_STUB_TARGETS cuda_stub cudart_stub nvrtc_stub) +endif() + +# Register additional RPATH for CUDA toolkit lib directory +if(UNIX) + set(TILELANG_ACTIVE_BACKEND_RPATH_EXTRA ":\$ORIGIN/../../nvidia/cu${CUDAToolkit_VERSION_MAJOR}/lib") +endif() + +# Register patchelf removals (SONAMEs to strip for portable wheels) +set(TILELANG_ACTIVE_BACKEND_PATCHELF_REMOVE "libcuda.so.1;libcuda.so") diff --git a/src/backend/metal/CMakeLists.txt b/src/backend/metal/CMakeLists.txt new file mode 100644 index 0000000000..9dbf33204a --- /dev/null +++ b/src/backend/metal/CMakeLists.txt @@ -0,0 +1,18 @@ +# Metal backend: source files and build configuration. +if(NOT USE_METAL) + return() +endif() + +if(NOT APPLE) + # On non-Apple platforms USE_METAL=ON enables only codegen (Metal source + # generation) without requiring the Metal/Foundation frameworks. + message(STATUS "Metal backend on non-Apple: enabling codegen-only mode (no Metal runtime)") + set(USE_METAL OFF) +endif() + +file(GLOB TILE_LANG_METAL_SRCS + src/target/rt_mod_metal.cc +) +list(APPEND TILE_LANG_SRCS ${TILE_LANG_METAL_SRCS}) +# FIXME: CIBW failed with backtrace, why??? +set(TVM_FFI_USE_LIBBACKTRACE OFF) diff --git a/src/backend/rocm/CMakeLists.txt b/src/backend/rocm/CMakeLists.txt new file mode 100644 index 0000000000..49da4bf80c --- /dev/null +++ b/src/backend/rocm/CMakeLists.txt @@ -0,0 +1,79 @@ +# ROCm backend: toolchain, stub libraries, source files, and build configuration. +if(NOT USE_ROCM) + return() +endif() + +set(CMAKE_HIP_STANDARD 17) +include(${TVM_SOURCE}/cmake/utils/FindROCM.cmake) +find_rocm(${USE_ROCM}) +add_compile_definitions(__HIP_PLATFORM_AMD__ __HIP_PLATFORM_HCC__=1) + +if(TILELANG_USE_HIP_STUBS) + if(WIN32 AND NOT CYGWIN) + message(FATAL_ERROR "TILELANG_USE_HIP_STUBS=ON is not supported on Windows. " + "Please configure with -DTILELANG_USE_HIP_STUBS=OFF.") + endif() + + # ============================================================================ + # HIP Stub Library (libhip_stub.so) + # ============================================================================ + # This library provides drop-in replacements for HIP runtime/module APIs by + # lazily loading libamdhip64.so at runtime. + # + # It also provides minimal HSA wrappers (hsa_init / hsa_shut_down) to avoid a + # hard DT_NEEDED dependency on libhsa-runtime64 in ROCm-enabled wheels. + # ============================================================================ + add_library(hip_stub SHARED src/target/stubs/hip.cc) + target_include_directories(hip_stub PRIVATE ${ROCM_INCLUDE_DIRS}) + target_compile_definitions(hip_stub PRIVATE TILELANG_HIP_STUB_EXPORTS) + target_link_libraries(hip_stub PRIVATE ${CMAKE_DL_LIBS}) + set_target_properties(hip_stub PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + OUTPUT_NAME "hip_stub" + ) + + # ============================================================================ + # HIPRTC Stub Library (libhiprtc_stub.so) + # ============================================================================ + # This library provides a minimal HIPRTC API surface and lazily loads + # libhiprtc.so at runtime. + # ============================================================================ + add_library(hiprtc_stub SHARED src/target/stubs/hiprtc.cc) + target_include_directories(hiprtc_stub PRIVATE ${ROCM_INCLUDE_DIRS}) + target_compile_definitions(hiprtc_stub PRIVATE TILELANG_HIPRTC_STUB_EXPORTS) + target_link_libraries(hiprtc_stub PRIVATE ${CMAKE_DL_LIBS}) + set_target_properties(hiprtc_stub PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + OUTPUT_NAME "hiprtc_stub" + ) + + # Make TVM link against our HIP stub instead of the real libamdhip64.so. + # + # NOTE: TVM's `find_rocm()` calls `find_library(ROCM_HIPHCC_LIBRARY amdhip64 ...)`. + # `find_library()` will not override an already-cached variable, so setting it + # here ensures TVM doesn't record a DT_NEEDED on libamdhip64. + set(ROCM_HIPHCC_LIBRARY hip_stub CACHE STRING "HIP runtime library to link against" FORCE) + + # Prevent TVM from recording a DT_NEEDED on libhsa-runtime64. + # The few HSA entrypoints used by TVM are stubbed by hip_stub and resolved + # lazily when available. + set(ROCM_HSA_LIBRARY ROCM_HSA_LIBRARY-NOTFOUND CACHE STRING + "HSA runtime library to link against" FORCE) +endif() + +file(GLOB TILE_LANG_HIP_SRCS + src/target/codegen_hip.cc + src/target/rt_mod_hip.cc +) +list(APPEND TILE_LANG_SRCS ${TILE_LANG_HIP_SRCS}) +list(APPEND TILE_LANG_INCLUDES ${ROCM_INCLUDE_DIRS}) + +# Register stubs for linking and install +if(TILELANG_USE_HIP_STUBS) + set(TILELANG_ACTIVE_BACKEND_STUB_LINK hip_stub) + set(TILELANG_ACTIVE_BACKEND_STUB_TARGETS hip_stub hiprtc_stub) +endif()