3 changes: 3 additions & 0 deletions .gitmodules
@@ -20,3 +20,6 @@
[submodule "3rdparty/xgrammar"]
path = 3rdparty/xgrammar
url = https://github.com/mlc-ai/xgrammar.git
[submodule "3rdparty/nanobind"]
path = 3rdparty/nanobind
url = https://github.com/wjakob/nanobind
1 change: 1 addition & 0 deletions 3rdparty/nanobind
Submodule nanobind added at 3d577d
4 changes: 3 additions & 1 deletion cpp/CMakeLists.txt
@@ -170,6 +170,7 @@ get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)

set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
add_subdirectory(${3RDPARTY_DIR}/nanobind ${CMAKE_CURRENT_BINARY_DIR}/nanobind)

# include as system to suppress warnings
include_directories(
@@ -181,7 +182,8 @@ include_directories(
${3RDPARTY_DIR}/cutlass/tools/util/include
${3RDPARTY_DIR}/NVTX/include
${3RDPARTY_DIR}/json/include
${3RDPARTY_DIR}/pybind11/include)
${3RDPARTY_DIR}/pybind11/include
${3RDPARTY_DIR}/nanobind/include)

if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11")
add_definitions("-DENABLE_BF16")
7 changes: 3 additions & 4 deletions cpp/tensorrt_llm/pybind/CMakeLists.txt
@@ -23,7 +23,7 @@ set(SRCS

include_directories(${PROJECT_SOURCE_DIR}/include)

pybind11_add_module(${TRTLLM_PYBIND_MODULE} ${SRCS})
nanobind_add_module(${TRTLLM_PYBIND_MODULE} ${SRCS})

set_property(TARGET ${TRTLLM_PYBIND_MODULE} PROPERTY POSITION_INDEPENDENT_CODE
ON)
@@ -34,9 +34,8 @@ target_link_libraries(
${TRTLLM_PYBIND_MODULE}
PUBLIC ${SHARED_TARGET} ${UNDEFINED_FLAG} ${NO_AS_NEEDED_FLAG}
${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python)
target_compile_definitions(
${TRTLLM_PYBIND_MODULE} PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE}
PYBIND11_DETAILED_ERROR_MESSAGES=1)
target_compile_definitions(${TRTLLM_PYBIND_MODULE}
PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE})

if(NOT WIN32)
set_target_properties(
12 changes: 5 additions & 7 deletions cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -30,14 +30,12 @@
#include "tensorrt_llm/runtime/torchView.h"

#include <ATen/ATen.h>
#include <pybind11/chrono.h>
#include <pybind11/functional.h>
#include <pybind11/operators.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>

#include <nanobind/nanobind.h>
#include <nanobind/operators.h>
#include <torch/extension.h>

namespace py = pybind11;
namespace py = nanobind;
namespace tb = tensorrt_llm::batch_manager;
namespace tle = tensorrt_llm::executor;
namespace tr = tensorrt_llm::runtime;
@@ -47,7 +45,7 @@ using namespace tensorrt_llm::runtime;
namespace tensorrt_llm::pybind::batch_manager
{

void initBindings(pybind11::module_& m)
void initBindings(py::module_& m)
{
using GenLlmReq = tb::GenericLlmRequest<runtime::ITensor::SharedPtr>;

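Note: the hunk above removes pybind11's blanket caster headers (`<pybind11/stl.h>`, `<pybind11/chrono.h>`, `<pybind11/functional.h>`, `<pybind11/stl_bind.h>`). nanobind ships the equivalent casters as fine-grained headers under `nanobind/stl/`, so each translation unit has to include exactly the casters its bound signatures use. A minimal sketch of that pattern, using a hypothetical `summarize` binding rather than anything from this file:

```cpp
// Minimal sketch: nanobind splits pybind11's catch-all caster headers into
// per-type headers; include only what the bound signatures actually use.
// The bound function below is hypothetical, not part of TensorRT-LLM.
#include <nanobind/nanobind.h>
#include <nanobind/stl/string.h>    // std::string <-> str
#include <nanobind/stl/vector.h>    // std::vector <-> list
#include <nanobind/stl/optional.h>  // std::optional <-> None / value
#include <nanobind/stl/chrono.h>    // std::chrono durations <-> datetime.timedelta

#include <chrono>
#include <optional>
#include <string>
#include <vector>

namespace nb = nanobind;

void initExampleBindings(nb::module_& m)
{
    // Without the stl/*.h includes above, nanobind cannot convert these
    // STL arguments, and the binding fails instead of silently working.
    m.def(
        "summarize",
        [](std::vector<std::string> const& names, std::optional<std::chrono::milliseconds> timeout)
        { return names.size() + (timeout ? 1 : 0); },
        nb::arg("names"), nb::arg("timeout"));
}
```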
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/pybind/batch_manager/bindings.h
@@ -18,11 +18,11 @@
#pragma once

#include "tensorrt_llm/pybind/common/customCasters.h"
#include <pybind11/pybind11.h>
#include <nanobind/nanobind.h>

namespace tensorrt_llm::pybind::batch_manager
{

void initBindings(pybind11::module_& m);
void initBindings(nanobind::module_& m);

}
8 changes: 2 additions & 6 deletions cpp/tensorrt_llm/pybind/batch_manager/buffers.cpp
@@ -23,13 +23,9 @@
#include "tensorrt_llm/batch_manager/transformerBuffers.h"

#include <ATen/ATen.h>
#include <pybind11/functional.h>
#include <pybind11/operators.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include <torch/extension.h>

namespace py = pybind11;
namespace py = nanobind;
namespace tb = tensorrt_llm::batch_manager;
namespace tr = tensorrt_llm::runtime;

@@ -38,7 +34,7 @@ using tr::SizeType32;
namespace tensorrt_llm::pybind::batch_manager
{

void Buffers::initBindings(pybind11::module_& m)
void Buffers::initBindings(py::module_& m)
{
py::class_<tb::TransformerBuffers>(m, "TransformerBuffers")
.def(py::init<SizeType32, SizeType32, std::vector<SizeType32> const&, SizeType32, SizeType32,
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/pybind/batch_manager/buffers.h
@@ -18,13 +18,13 @@
#pragma once

#include "tensorrt_llm/pybind/common/customCasters.h"
#include <pybind11/pybind11.h>
#include <nanobind/nanobind.h>

namespace tensorrt_llm::pybind::batch_manager
{
class Buffers
{
public:
static void initBindings(pybind11::module_& m);
static void initBindings(nanobind::module_& m);
};
} // namespace tensorrt_llm::pybind::batch_manager
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/pybind/batch_manager/cacheTransceiver.h
@@ -18,13 +18,13 @@
#pragma once

#include "tensorrt_llm/pybind/common/customCasters.h"
#include <pybind11/pybind11.h>
#include <nanobind/nanobind.h>

namespace tensorrt_llm::batch_manager
{
class CacheTransceiverBindings
{
public:
static void initBindings(pybind11::module_& m);
static void initBindings(nanobind::module_& m);
};
} // namespace tensorrt_llm::batch_manager
6 changes: 1 addition & 5 deletions cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
@@ -23,16 +23,12 @@
#include "tensorrt_llm/runtime/torchView.h"

#include <ATen/ATen.h>
#include <pybind11/functional.h>
#include <pybind11/operators.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include <torch/extension.h>

namespace tb = tensorrt_llm::batch_manager;
namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
namespace tr = tensorrt_llm::runtime;
namespace py = pybind11;
namespace py = nanobind;
using BlockKey = tbk::BlockKey;
using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
using SizeType32 = tensorrt_llm::runtime::SizeType32;
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h
@@ -18,14 +18,14 @@
#pragma once

#include "tensorrt_llm/pybind/common/customCasters.h"
#include <pybind11/pybind11.h>
#include <nanobind/nanobind.h>

namespace tensorrt_llm::batch_manager::kv_cache_manager
{
class KVCacheManagerBindings
{
public:
static void initBindings(pybind11::module_& m);
static void initBindings(nanobind::module_& m);
};
} // namespace tensorrt_llm::batch_manager::kv_cache_manager

@@ -34,6 +34,6 @@ namespace tensorrt_llm::batch_manager
class BasePeftCacheManagerBindings
{
public:
static void initBindings(pybind11::module_& m);
static void initBindings(nanobind::module_& m);
};
} // namespace tensorrt_llm::batch_manager
11 changes: 4 additions & 7 deletions cpp/tensorrt_llm/pybind/bindings.cpp
@@ -15,11 +15,8 @@
* limitations under the License.
*/

#include <pybind11/cast.h>
#include <pybind11/functional.h>
#include <pybind11/operators.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <nanobind/nanobind.h>
#include <nanobind/operators.h>
#include <torch/extension.h>
#include <vector>

@@ -45,7 +42,7 @@
#include "tensorrt_llm/runtime/samplingConfig.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"

namespace py = pybind11;
namespace py = nanobind;
namespace tb = tensorrt_llm::batch_manager;
namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
namespace tpb = tensorrt_llm::pybind::batch_manager;
@@ -69,7 +66,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con
}
} // namespace

PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
NB_MODULE(TRTLLM_PYBIND_MODULE, m)
{
m.doc() = "TensorRT-LLM Python bindings for C++ runtime";

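Note: this is where the module entry point changes from `PYBIND11_MODULE` to nanobind's `NB_MODULE`; the macro arguments and the `m.doc()` / submodule plumbing carry over unchanged. A minimal sketch of the resulting layout, with a hypothetical module name standing in for the `TRTLLM_PYBIND_MODULE` macro:

```cpp
// Minimal sketch of the nanobind entry point. "example_bindings" is a
// hypothetical module name; the real one is supplied via TRTLLM_PYBIND_MODULE.
#include <nanobind/nanobind.h>

namespace nb = nanobind;

namespace example
{
void initBindings(nb::module_& m)
{
    m.def("add", [](int a, int b) { return a + b; });
}
} // namespace example

NB_MODULE(example_bindings, m)
{
    m.doc() = "Example nanobind module mirroring the PYBIND11_MODULE -> NB_MODULE swap";

    // Nested submodules work the same way as in pybind11.
    nb::module_ internal = m.def_submodule("internal", "Nested bindings");
    example::initBindings(internal);
}
```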
14 changes: 5 additions & 9 deletions cpp/tensorrt_llm/pybind/executor/bindings.cpp
@@ -22,31 +22,27 @@
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/types.h"

#include <pybind11/cast.h>
#include <pybind11/chrono.h>
#include <pybind11/functional.h>
#include <pybind11/operators.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <nanobind/nanobind.h>
#include <nanobind/operators.h>

#include <optional>

namespace py = pybind11;
namespace py = nanobind;
namespace tle = tensorrt_llm::executor;
using SizeType32 = tle::SizeType32;

namespace tensorrt_llm::pybind::executor
{

template <typename T>
void instantiateEventDiff(pybind11::module& m, std::string const& name)
void instantiateEventDiff(py::module& m, std::string const& name)
{
py::class_<tle::KVCacheEventDiff<T>>(m, ("KVCacheEventDiff" + name).c_str())
.def_readonly("old_value", &tle::KVCacheEventDiff<T>::oldValue)
.def_readonly("new_value", &tle::KVCacheEventDiff<T>::newValue);
}

void initBindings(pybind11::module_& m)
void initBindings(py::module_& m)
{
m.attr("__version__") = tle::version();
py::enum_<tle::ModelType>(m, "ModelType")
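Note: `py::class_`, `py::enum_`, and `py::init` keep the same spelling under the `namespace py = nanobind;` alias, but nanobind renames the member-binding helpers: pybind11's `def_readonly`/`def_readwrite` are `def_ro`/`def_rw` in nanobind, so context lines such as the `def_readonly` calls above would presumably need the same rename as this migration proceeds. A minimal sketch with a hypothetical `EventDiff` struct standing in for `tle::KVCacheEventDiff`:

```cpp
// Minimal sketch of member and enum bindings under nanobind. EventDiff and
// ModelKind are hypothetical stand-ins, not TensorRT-LLM types.
#include <nanobind/nanobind.h>

namespace nb = nanobind;

struct EventDiff
{
    int oldValue;
    int newValue;
};

enum class ModelKind
{
    kDecoderOnly,
    kEncoderDecoder
};

NB_MODULE(executor_example, m)
{
    // pybind11's def_readonly/def_readwrite are spelled def_ro/def_rw here.
    nb::class_<EventDiff>(m, "EventDiff")
        .def_ro("old_value", &EventDiff::oldValue)
        .def_ro("new_value", &EventDiff::newValue);

    // nb::enum_ keeps the pybind11-style value() interface.
    nb::enum_<ModelKind>(m, "ModelKind")
        .value("DECODER_ONLY", ModelKind::kDecoderOnly)
        .value("ENCODER_DECODER", ModelKind::kEncoderDecoder);
}
```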
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/pybind/executor/bindings.h
@@ -18,12 +18,12 @@
#pragma once

#include "tensorrt_llm/pybind/common/customCasters.h"
#include <pybind11/pybind11.h>
#include <nanobind/nanobind.h>

namespace tensorrt_llm::pybind::executor
{

// Register bindings for executor API.
void initBindings(pybind11::module_& m);
void initBindings(nanobind::module_& m);

} // namespace tensorrt_llm::pybind::executor
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp
@@ -19,13 +19,13 @@
#include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
#include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h"

namespace py = pybind11;
namespace py = nanobind;
namespace tub = tensorrt_llm::runtime::ub;

namespace tensorrt_llm::kernels::userbuffers
{

void UserBufferBindings::initBindings(pybind11::module_& m)
void UserBufferBindings::initBindings(py::module_& m)
{
py::class_<tub::UBBuffer>(m, "UBBuffer")
.def_readonly("size", &tub::UBBuffer::size)
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/pybind/userbuffers/bindings.h
@@ -18,13 +18,13 @@
#pragma once

#include "tensorrt_llm/pybind/common/customCasters.h"
#include <pybind11/pybind11.h>
#include <nanobind/nanobind.h>

namespace tensorrt_llm::kernels::userbuffers
{
class UserBufferBindings
{
public:
static void initBindings(pybind11::module_& m);
static void initBindings(nanobind::module_& m);
};
} // namespace tensorrt_llm::kernels::userbuffers