Merged

Commits (25)
0e0d0a0  [slimtensor] integration into backend (Gasoonjia, Jan 13, 2026)
58b70ce  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 13, 2026)
40cf5ea  make cmake work (Gasoonjia, Jan 13, 2026)
814ddf0  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 14, 2026)
7374bee  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 14, 2026)
029540a  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 14, 2026)
e38cc02  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 16, 2026)
3ad7636  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 22, 2026)
64bb069  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 22, 2026)
86d7e43  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 23, 2026)
8c32492  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 23, 2026)
c225e32  parakeet works (Gasoonjia, Jan 26, 2026)
030a931  parakeet works (Gasoonjia, Jan 26, 2026)
4fa4dfc  whisper works (Gasoonjia, Jan 27, 2026)
5e9f654  parakeet works - 2 (Gasoonjia, Jan 27, 2026)
512a3e4  remove nonnecessary debug info (Gasoonjia, Jan 27, 2026)
18afded  polish cuda backend.cpp comment (Gasoonjia, Jan 27, 2026)
75287c4  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 27, 2026)
ff05337  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 27, 2026)
f5af4a8  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 29, 2026)
6ccb691  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 29, 2026)
7d6a571  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 29, 2026)
cfde842  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 29, 2026)
c5eea82  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 29, 2026)
f8a812e  Update on "[slimtensor] integration into backend" (Gasoonjia, Jan 30, 2026)
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -123,6 +123,10 @@ if(EXECUTORCH_ENABLE_BUNDLE_IO)
add_definitions(-DET_BUNDLE_IO_ENABLED)
endif()

if(EXECUTORCH_BUILD_CUDA)
add_definitions(-DCUDA_AVAILABLE=1)
endif()

# -ffunction-sections -fdata-sections: breaks function and data into sections so
# they can be properly gc'd. -s: strip symbol.
if(WIN32)
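For context, a compile definition such as `CUDA_AVAILABLE=1` is normally consumed through preprocessor guards in sources that need a GPU-specific path. A minimal sketch of that pattern, assuming only the macro and the standard CUDA runtime API; the helper below is hypothetical and not a file in this PR:

```cpp
// Illustrative only -- not part of this PR. Shows the usual way a
// -DCUDA_AVAILABLE=1 compile definition is consumed from C++ sources.
#include <cstddef>
#include <cstring>

#ifdef CUDA_AVAILABLE
#include <cuda_runtime.h>
#endif

// Hypothetical helper: copy nbytes from src to dst, going through the CUDA
// runtime when the build defined CUDA_AVAILABLE, plain memcpy otherwise.
inline bool copy_buffer(void* dst, const void* src, std::size_t nbytes) {
#ifdef CUDA_AVAILABLE
  // cudaMemcpyDefault lets the driver infer the host/device direction.
  return cudaMemcpy(dst, src, nbytes, cudaMemcpyDefault) == cudaSuccess;
#else
  std::memcpy(dst, src, nbytes);
  return true;
#endif
}
```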
44 changes: 43 additions & 1 deletion backends/aoti/CMakeLists.txt
@@ -25,7 +25,10 @@ endif()
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
find_package_torch()

# Common AOTI functionality - combines all AOTI common components
# ==============================================================================
# AOTI common shims using ETensor (for Metal backend)
# TODO(gasoonjia): Remove this after metal migration
# ==============================================================================
set(_aoti_common_sources common_shims.cpp)
add_library(aoti_common STATIC ${_aoti_common_sources})
target_include_directories(
@@ -59,3 +62,42 @@ install(
EXPORT ExecuTorchTargets
DESTINATION ${CMAKE_INSTALL_LIBDIR}
)

# ==============================================================================
# AOTI common shims using SlimTensor (for CUDA backend) Uses SlimTensor for all
# tensor operations
# TODO(gasoonjia): Replace aoti_common with this one after metal migration
# ==============================================================================
add_library(aoti_common_shims_slim STATIC common_shims_slim.cpp)
target_include_directories(
aoti_common_shims_slim
PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
)
target_compile_options(
aoti_common_shims_slim
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
)
target_compile_definitions(
aoti_common_shims_slim PUBLIC $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
)

# Add CUDA include directories and link CUDA runtime when building with CUDA
if(EXECUTORCH_BUILD_CUDA)
find_package(CUDAToolkit REQUIRED)
target_include_directories(
aoti_common_shims_slim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
)
target_link_libraries(aoti_common_shims_slim PUBLIC CUDA::cudart)
endif()

target_link_libraries(
aoti_common_shims_slim PUBLIC slimtensor extension_tensor ${CMAKE_DL_LIBS}
)

install(
TARGETS aoti_common_shims_slim
EXPORT ExecuTorchTargets
DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
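On Windows the new library is compiled with `EXPORT_AOTI_FUNCTIONS` defined, which conventionally drives a dllexport/dllimport switch in `export.h`. A hedged sketch of that conventional pattern; the macro and function names below are illustrative, and the repository's actual `export.h` may differ:

```cpp
// Conventional export-macro pattern; illustrative, not the repository's export.h.
#pragma once

#if defined(_WIN32)
  #if defined(EXPORT_AOTI_FUNCTIONS)
    // Building the shim library itself: mark symbols for export from the DLL.
    #define AOTI_SHIM_EXPORT __declspec(dllexport)
  #else
    // Consuming the shim library: import the symbols.
    #define AOTI_SHIM_EXPORT __declspec(dllimport)
  #endif
#else
  // Non-Windows builds rely on default symbol visibility.
  #define AOTI_SHIM_EXPORT __attribute__((visibility("default")))
#endif

// Example of a shim declaration that would use the macro (hypothetical name).
extern "C" AOTI_SHIM_EXPORT void aoti_torch_example_shim();
```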
21 changes: 20 additions & 1 deletion backends/aoti/targets.bzl
@@ -33,7 +33,8 @@ def define_common_targets():
],
)

# AOTI common shims functionality
# AOTI common shims functionality using ETensor
# TODO(gasoonjia): Remove this after metal migration
runtime.cxx_library(
name = "common_shims",
srcs = [
@@ -89,6 +90,7 @@

# SlimTensor-based common shims library
# Uses SlimTensor for all tensor operations
# TODO(gasoonjia): Replace common_shims with this one after metal migration
runtime.cxx_library(
name = "common_shims_slim",
srcs = [
@@ -97,10 +99,27 @@
headers = [
"common_shims_slim.h",
"export.h",
"utils.h",
],
visibility = ["@EXECUTORCH_CLIENTS"],
exported_deps = [
"//executorch/runtime/core:core",
"//executorch/runtime/core/exec_aten:lib",
"//executorch/backends/aoti/slim/core:slimtensor",
],
)

# Common AOTI functionality for SlimTensor-based backends (combining common_shims_slim and delegate_handle)
# All CUDA backend code should depend on this target
# TODO(gasoonjia): Replace aoti_common with this one after metal migration
runtime.cxx_library(
name = "aoti_common_slim",
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
link_whole = True,
supports_python_dlopen = True,
visibility = ["PUBLIC"],
exported_deps = [
":common_shims_slim",
":delegate_handle",
],
)
11 changes: 8 additions & 3 deletions backends/cuda/CMakeLists.txt
@@ -99,13 +99,18 @@ install(

# CUDA-specific AOTI shim symbols (dynamically linked)
set(_aoti_cuda_shim_sources
runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp
runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu
${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp
runtime/shims/memory.cpp
runtime/shims/cuda_guard.cpp
runtime/shims/int4mm.cu
${EXECUTORCH_ROOT}/backends/aoti/common_shims_slim.cpp
${EXECUTORCH_ROOT}/backends/aoti/slim/cuda/guard.cpp
)

add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources})

# Define CUDA_AVAILABLE to use SlimTensor on GPU in common_shims_slim.h
target_compile_definitions(aoti_cuda_shims PRIVATE CUDA_AVAILABLE=1)

# Define export macros for shared library
if(MSVC)
target_compile_definitions(aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS)
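The shim source list now pulls in the SlimTensor CUDA guard (`backends/aoti/slim/cuda/guard.cpp`) in place of the old `runtime/guard.cpp`. Device guards of this kind generally follow an RAII save/set/restore pattern over the CUDA runtime. A generic sketch under that assumption; the class and member names are hypothetical and only `cudaGetDevice`/`cudaSetDevice` are taken from the real API:

```cpp
// Generic RAII device-guard sketch; not the guard implementation in this PR.
#include <cuda_runtime.h>

class ScopedDeviceGuard {
 public:
  // Remember the current device, then switch to `device` if it differs.
  explicit ScopedDeviceGuard(int device) {
    cudaGetDevice(&prev_device_);
    if (device != prev_device_) {
      cudaSetDevice(device);
      switched_ = true;
    }
  }

  // Restore the previously active device on scope exit.
  ~ScopedDeviceGuard() {
    if (switched_) {
      cudaSetDevice(prev_device_);
    }
  }

  ScopedDeviceGuard(const ScopedDeviceGuard&) = delete;
  ScopedDeviceGuard& operator=(const ScopedDeviceGuard&) = delete;

 private:
  int prev_device_ = 0;
  bool switched_ = false;
};
```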
67 changes: 12 additions & 55 deletions backends/cuda/runtime/TARGETS
@@ -3,28 +3,6 @@ load("//tools/build/buck:nvcc_flags.bzl", "get_nvcc_arch_args")

oncall("executorch")

runtime.cxx_library(
name = "guard",
srcs = [
"guard.cpp",
],
headers = [
"guard.h",
"utils.h",
],
visibility = ["PUBLIC"],
deps = [
"//executorch/runtime/platform:platform",
],
exported_deps = [
"//executorch/runtime/core:core",
"//executorch/runtime/core/exec_aten:lib",
],
external_deps = [
("cuda", None, "cuda-lazy"),
],
)

runtime.cxx_library(
name = "cuda_platform",
srcs = [
@@ -71,14 +49,12 @@ runtime.cxx_library(
runtime.cxx_library(
name = "runtime_shims",
srcs = [
"guard.cpp",
"shims/cuda_guard.cpp",
"shims/int4mm.cu",
"shims/memory.cpp",
"shims/tensor_attribute.cpp",
],
headers = [
"guard.h",
"shims/cuda_guard.h",
"shims/int4mm.cuh",
"shims/int4mm.h",
@@ -91,43 +67,18 @@ runtime.cxx_library(
supports_python_dlopen = True,
# Constructor needed for backend registration.
compiler_flags = ["-Wno-global-constructors"],
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
visibility = ["PUBLIC"],
deps = [
":tensor_maker",
"//executorch/backends/aoti:common_shims",
"//executorch/runtime/core:core",
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/platform:platform",
"//executorch/backends/cuda/runtime:cuda_platform",
],
nvcc_flags = get_nvcc_arch_args() + [
"-_NVCC_HOST_COMPILER_FLAG_",
"gcc",
],
external_deps = [
("cuda", None, "cuda-lazy"),
],
)

runtime.cxx_library(
name = "runtime_shims_slim",
srcs = [
"shims/memory_slim.cpp",
],
headers = [
"shims/memory_slim.h",
],
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
link_whole = True,
supports_python_dlopen = True,
visibility = ["@EXECUTORCH_CLIENTS"],
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
deps = [
"//executorch/backends/aoti:aoti_common_slim",
"//executorch/backends/aoti/slim/core:slimtensor",
"//executorch/backends/aoti/slim/factory:empty",
"//executorch/backends/aoti/slim/factory:from_blob",
"//executorch/backends/aoti:common_shims",
"//executorch/backends/aoti/slim/cuda:guard",
"//executorch/runtime/core:core",
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:tensor_util",
"//executorch/runtime/platform:platform",
],
nvcc_flags = get_nvcc_arch_args() + [
@@ -149,10 +100,16 @@ runtime.cxx_library(
supports_python_dlopen = True,
# Constructor needed for backend registration.
compiler_flags = ["-Wno-global-constructors"],
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
visibility = ["PUBLIC"],
deps = [
":runtime_shims",
"//executorch/backends/aoti:aoti_common",
"//executorch/backends/aoti:aoti_common_slim",
"//executorch/backends/aoti/slim/core:slimtensor",
"//executorch/backends/aoti/slim/factory:empty",
"//executorch/backends/aoti/slim/factory:from_blob",
"//executorch/backends/aoti/slim/factory:from_etensor",
"//executorch/extension/tensor:tensor",
"//executorch/runtime/backend:interface",
"//executorch/runtime/core/exec_aten/util:tensor_util",
],
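The new dependencies on the SlimTensor factory targets (`empty`, `from_blob`, `from_etensor`) suggest the usual tensor-factory split: allocate fresh storage, wrap existing memory without copying, or convert from an ETensor. A generic sketch of the `from_blob` idea under those assumptions; the types and function below are hypothetical and do not reproduce the SlimTensor API:

```cpp
// Conceptual from_blob sketch: wrap caller-owned memory in a non-owning view.
// Hypothetical types; the real SlimTensor factories are not shown here.
#include <cstdint>
#include <vector>

struct TensorView {
  void* data;                    // borrowed pointer, never freed by the view
  std::vector<int64_t> sizes;    // logical shape
  std::vector<int64_t> strides;  // element strides, row-major contiguous here
};

// Build a non-owning view over `data` with the given sizes; the caller keeps
// ownership of the buffer, which must outlive the returned view.
inline TensorView from_blob(void* data, std::vector<int64_t> sizes) {
  std::vector<int64_t> strides(sizes.size(), 1);
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * sizes[i + 1];
  }
  return TensorView{data, std::move(sizes), std::move(strides)};
}
```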