From 8d872cf315924e9bce556d922ff0744da2172f8b Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Thu, 12 Jun 2025 11:51:49 +0800
Subject: [PATCH 1/8] switch pybind11 to nanobind

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 .gitmodules                                   |  3 +
 3rdparty/nanobind                             |  1 +
 cpp/CMakeLists.txt                            |  4 +-
 cpp/tensorrt_llm/pybind/CMakeLists.txt        |  7 +-
 .../pybind/batch_manager/bindings.cpp         | 13 ++-
 .../pybind/batch_manager/bindings.h           |  4 +-
 cpp/tensorrt_llm/pybind/bindings.cpp          | 12 +--
 cpp/tensorrt_llm/pybind/executor/bindings.cpp | 15 ++-
 cpp/tensorrt_llm/pybind/executor/bindings.h   |  4 +-
 cpp/tensorrt_llm/pybind/runtime/bindings.cpp  | 99 ++++++++++---------
 cpp/tensorrt_llm/pybind/runtime/bindings.h    |  4 +-
 .../pybind/userbuffers/bindings.cpp           |  4 +-
 .../pybind/userbuffers/bindings.h             |  4 +-
 13 files changed, 88 insertions(+), 86 deletions(-)
 create mode 160000 3rdparty/nanobind

diff --git a/.gitmodules b/.gitmodules
index 555349aa253..31970ad4054 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -20,3 +20,6 @@
 [submodule "3rdparty/xgrammar"]
 	path = 3rdparty/xgrammar
 	url = https://github.com/mlc-ai/xgrammar.git
+[submodule "3rdparty/nanobind"]
+	path = 3rdparty/nanobind
+	url = https://github.com/wjakob/nanobind
diff --git a/3rdparty/nanobind b/3rdparty/nanobind
new file mode 160000
index 00000000000..3d577d099a0
--- /dev/null
+++ b/3rdparty/nanobind
@@ -0,0 +1 @@
+Subproject commit 3d577d099a05f71a7860d8c6d80d2dd1fb92d9e1
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d82bebb73c4..cdde865a564 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -170,6 +170,7 @@ get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
 
 set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
 add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
+add_subdirectory(${3RDPARTY_DIR}/nanobind)
 
 # include as system to suppress warnings
 include_directories(
@@ -181,7 +182,8 @@ include_directories(
   ${3RDPARTY_DIR}/cutlass/tools/util/include
   ${3RDPARTY_DIR}/NVTX/include
   ${3RDPARTY_DIR}/json/include
-  ${3RDPARTY_DIR}/pybind11/include)
+  ${3RDPARTY_DIR}/pybind11/include
+  ${3RDPARTY_DIR}/nanobind/include)
 
 if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11")
   add_definitions("-DENABLE_BF16")
diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt
index f02599e6089..834d7235743 100755
--- a/cpp/tensorrt_llm/pybind/CMakeLists.txt
+++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt
@@ -23,7 +23,7 @@ set(SRCS
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
 
-pybind11_add_module(${TRTLLM_PYBIND_MODULE} ${SRCS})
+nanobind_add_module(${TRTLLM_PYBIND_MODULE} ${SRCS})
 
 set_property(TARGET ${TRTLLM_PYBIND_MODULE} PROPERTY POSITION_INDEPENDENT_CODE
                                                      ON)
@@ -34,9 +34,8 @@ target_link_libraries(
   ${TRTLLM_PYBIND_MODULE}
   PUBLIC ${SHARED_TARGET} ${UNDEFINED_FLAG} ${NO_AS_NEEDED_FLAG}
          ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python)
-target_compile_definitions(
-  ${TRTLLM_PYBIND_MODULE} PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE}
-                                 PYBIND11_DETAILED_ERROR_MESSAGES=1)
+target_compile_definitions(${TRTLLM_PYBIND_MODULE}
+                           PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE})
 
 if(NOT WIN32)
   set_target_properties(
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
index 4274bbe62dc..f3a90fa68e5 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -30,14 +30,13 @@
 #include "tensorrt_llm/runtime/torchView.h"
 
 #include <ATen/ATen.h>
-#include <pybind11/chrono.h>
-#include <pybind11/functional.h>
-#include <pybind11/operators.h>
-#include <pybind11/stl.h>
-#include <pybind11/stl_bind.h>
+
+#include <nanobind/nanobind.h>
+#include <nanobind/nb_cast.h>
+#include <nanobind/operators.h>
 #include <torch/extension.h>
 
-namespace py = pybind11;
+namespace py = nanobind;
 namespace tb = tensorrt_llm::batch_manager;
 namespace tle = tensorrt_llm::executor;
 namespace tr = tensorrt_llm::runtime;
@@ -47,7 +46,7 @@ using namespace tensorrt_llm::runtime;
 namespace tensorrt_llm::pybind::batch_manager
 {
 
-void initBindings(pybind11::module_& m)
+void initBindings(py::module_& m)
 {
     using GenLlmReq = tb::GenericLlmRequest<runtime::ITensor::SharedPtr>;
 
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.h b/cpp/tensorrt_llm/pybind/batch_manager/bindings.h
index 4c36ea3f78c..d57694e72cc 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.h
+++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.h
@@ -18,11 +18,11 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <pybind11/pybind11.h>
+#include <nanobind/nanobind.h>
 
 namespace tensorrt_llm::pybind::batch_manager
 {
 
-void initBindings(pybind11::module_& m);
+void initBindings(nanobind::module_& m);
 
 }
diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp
index ebda5773abb..5ca99d6a5e1 100644
--- a/cpp/tensorrt_llm/pybind/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/bindings.cpp
@@ -15,11 +15,9 @@
  * limitations under the License.
  */
 
-#include <pybind11/cast.h>
-#include <pybind11/functional.h>
-#include <pybind11/operators.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/nb_cast.h>
+#include <nanobind/operators.h>
 #include <torch/extension.h>
 #include <vector>
 
@@ -45,7 +43,7 @@
 #include "tensorrt_llm/runtime/samplingConfig.h"
 #include "tensorrt_llm/runtime/utils/mpiUtils.h"
 
-namespace py = pybind11;
+namespace py = nanobind;
 namespace tb = tensorrt_llm::batch_manager;
 namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
 namespace tpb = tensorrt_llm::pybind::batch_manager;
@@ -69,7 +67,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con
 }
 } // namespace
 
-PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
+NB_MODULE(TRTLLM_PYBIND_MODULE, m)
 {
     m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
 
diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp
index 502ab705374..8f3cd0b0b43 100644
--- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp
@@ -22,16 +22,13 @@
 #include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/executor/types.h"
 
-#include <pybind11/cast.h>
-#include <pybind11/chrono.h>
-#include <pybind11/functional.h>
-#include <pybind11/operators.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/nb_cast.h>
+#include <nanobind/operators.h>
 
 #include <optional>
 
-namespace py = pybind11;
+namespace py = nanobind;
 namespace tle = tensorrt_llm::executor;
 using SizeType32 = tle::SizeType32;
 
@@ -39,14 +36,14 @@ namespace tensorrt_llm::pybind::executor
 {
 
 template <typename T>
-void instantiateEventDiff(pybind11::module& m, std::string const& name)
+void instantiateEventDiff(py::module_& m, std::string const& name)
 {
     py::class_<tle::KVCacheEventDiff<T>>(m, ("KVCacheEventDiff" + name).c_str())
         .def_readonly("old_value", &tle::KVCacheEventDiff<T>::oldValue)
         .def_readonly("new_value", &tle::KVCacheEventDiff<T>::newValue);
 }
 
-void initBindings(pybind11::module_& m)
+void initBindings(py::module_& m)
 {
     m.attr("__version__") = tle::version();
     py::enum_<tle::ModelType>(m, "ModelType")
diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.h b/cpp/tensorrt_llm/pybind/executor/bindings.h
index ea9946d46d0..3ad76c17838 100644
--- a/cpp/tensorrt_llm/pybind/executor/bindings.h
+++ b/cpp/tensorrt_llm/pybind/executor/bindings.h
@@ -18,12 +18,12 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <pybind11/pybind11.h>
+#include <nanobind/nanobind.h>
 
 namespace tensorrt_llm::pybind::executor
 {
 
 // Register bindings for executor API.
-void initBindings(pybind11::module_& m);
+void initBindings(nanobind::module_& m);
 
 } // namespace tensorrt_llm::pybind::executor
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
index 6a9a2e0dcd2..07d6d348852 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
@@ -40,8 +40,11 @@
 #include "tensorrt_llm/runtime/torchView.h"
 #include <ATen/ATen.h>
 #include <c10/cuda/CUDAStream.h>
-#include <pybind11/stl.h>
-#include <pybind11/stl_bind.h>
+
+#include <nanobind/nanobind.h>
+#include <nanobind/nb_cast.h>
+#include <nanobind/operators.h>
+
 #include <torch/extension.h>
 
 namespace tr = tensorrt_llm::runtime;
@@ -54,73 +57,73 @@ class PyITensor : public tensorrt_llm::runtime::ITensor
 
     [[nodiscard]] void* data() override
     {
-        PYBIND11_OVERRIDE_PURE(void*, /* Return type */
-            ITensor,                  /* Parent class */
-            data                      /* Name of function in C++ (must match Python name) */
-                                      /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; return type and parent class come from NB_TRAMPOLINE.
+        NB_OVERRIDE_PURE(data /* Name of function in C++ (must match Python name) */
+                              /* Argument(s) */
         );
     }
 
     [[nodiscard]] void const* data() const override
     {
-        PYBIND11_OVERRIDE_PURE(void const*, /* Return type */
-            ITensor,                        /* Parent class */
-            data                            /* Name of function in C++ (must match Python name) */
-                                            /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; return type and parent class come from NB_TRAMPOLINE.
+        NB_OVERRIDE_PURE(data /* Name of function in C++ (must match Python name) */
+                              /* Argument(s) */
         );
     }
 
     [[nodiscard]] std::size_t getSize() const override
     {
-        PYBIND11_OVERRIDE_PURE(std::size_t, /* Return type */
-            ITensor,                        /* Parent class */
-            getSize                         /* Name of function in C++ (must match Python name) */
-                                            /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; return type and parent class come from NB_TRAMPOLINE.
+        NB_OVERRIDE_PURE(getSize /* Name of function in C++ (must match Python name) */
+                                 /* Argument(s) */
         );
     }
 
     [[nodiscard]] std::size_t getCapacity() const override
     {
-        PYBIND11_OVERRIDE_PURE(std::size_t, /* Return type */
-            ITensor,                        /* Parent class */
-            getCapacity                     /* Name of function in C++ (must match Python name) */
-                                            /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; return type and parent class come from NB_TRAMPOLINE.
+        NB_OVERRIDE_PURE(getCapacity /* Name of function in C++ (must match Python name) */
+                                     /* Argument(s) */
         );
     }
 
     [[nodiscard]] DataType getDataType() const override
     {
-        PYBIND11_OVERRIDE_PURE(DataType, /* Return type */
-            ITensor,                     /* Parent class */
-            getDataType                  /* Name of function in C++ (must match Python name) */
-                                         /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; return type and parent class come from NB_TRAMPOLINE.
+        NB_OVERRIDE_PURE(getDataType /* Name of function in C++ (must match Python name) */
+                                     /* Argument(s) */
         );
     }
 
     [[nodiscard]] tr::MemoryType getMemoryType() const override
     {
-        PYBIND11_OVERRIDE_PURE(tr::MemoryType, /* Return type */
-            ITensor,                           /* Parent class */
-            getMemoryType                      /* Name of function in C++ (must match Python name) */
-                                               /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; return type and parent class come from NB_TRAMPOLINE.
+        NB_OVERRIDE_PURE(getMemoryType /* Name of function in C++ (must match Python name) */
+                                       /* Argument(s) */
         );
     }
 
     [[nodiscard]] char const* getMemoryTypeName() const override
     {
-        PYBIND11_OVERRIDE_PURE(char const*, /* Return type */
-            ITensor,                        /* Parent class */
-            getMemoryTypeName               /* Name of function in C++ (must match Python name) */
-                                            /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; return type and parent class come from NB_TRAMPOLINE.
+        NB_OVERRIDE_PURE(getMemoryTypeName /* Name of function in C++ (must match Python name) */
+                                           /* Argument(s) */
         );
     }
 
     virtual void resize(std::size_t newSize) override
     {
-        PYBIND11_OVERRIDE_PURE(void, /* Return type */
-            ITensor,                 /* Parent class */
-            resize                   /* Name of function in C++ (must match Python name) */
-                                     /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; newSize has to be forwarded to the Python override.
+        NB_OVERRIDE_PURE(resize, /* Name of function in C++ (must match Python name) */
+            newSize              /* Argument(s) */
         );
     }
 
@@ -135,19 +138,19 @@ class PyITensor : public tensorrt_llm::runtime::ITensor
 
     [[nodiscard]] Shape const& getShape() const override
     {
-        PYBIND11_OVERRIDE_PURE(Shape const&, /* Return type */
-            ITensor,                         /* Parent class */
-            getShape                         /* Name of function in C++ (must match Python name) */
-                                             /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; return type and parent class come from NB_TRAMPOLINE.
+        NB_OVERRIDE_PURE(getShape /* Name of function in C++ (must match Python name) */
+                                  /* Argument(s) */
         );
     }
 
     void reshape(Shape const& dims) override
     {
-        PYBIND11_OVERRIDE_PURE(void, /* Return type */
-            ITensor,                 /* Parent class */
-            reshape,                 /* Name of function in C++ (must match Python name) */
-            dims                     /* Argument(s) */
+        // nanobind's NB_OVERRIDE_PURE takes only the function name and its
+        // arguments; return type and parent class come from NB_TRAMPOLINE.
+        NB_OVERRIDE_PURE(reshape, /* Name of function in C++ (must match Python name) */
+            dims                  /* Argument(s) */
         );
     }
 };
@@ -162,35 +165,35 @@ class PyIGptDecoder : public tr::IGptDecoder
         std::optional<tr::DecodingOutput> const& output = std::nullopt,
         std::optional<std::vector<tr::decoder_batch::Request> const> const& requests = std::nullopt) override
     {
-        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, setup, samplingConfig, batchSize, batchSlots, output, requests);
+        NB_OVERRIDE_PURE(setup, samplingConfig, batchSize, batchSlots, output, requests);
     }
 
     void forwardAsync(tr::DecodingOutput& output, tr::DecodingInput const& input) override
     {
-        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, forwardAsync, output, input);
+        NB_OVERRIDE_PURE(forwardAsync, output, input);
     }
 
     void forwardSync(tr::DecodingOutput& output, tr::DecodingInput const& input) override
     {
-        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, forwardSync, output, input);
+        NB_OVERRIDE_PURE(forwardSync, output, input);
     }
 
     tr::SamplingConfig const& getSamplingConfig() override
     {
-        PYBIND11_OVERRIDE_PURE(tr::SamplingConfig const&, IGptDecoder, getSamplingConfig);
+        NB_OVERRIDE_PURE(getSamplingConfig);
     }
 
     void disableLookahead(std::optional<tr::SamplingConfig> const& samplingConfig, tr::SizeType32 batchSize,
         tr::DecodingInput::TensorConstPtr batchSlots) override
     {
-        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, disableLookahead, samplingConfig, batchSize, batchSlots);
+        NB_OVERRIDE_PURE(disableLookahead, samplingConfig, batchSize, batchSlots);
     }
 };
 
 namespace tensorrt_llm::pybind::runtime
 {
 
-void initBindings(pybind11::module_& m)
+void initBindings(py::module_& m)
 {
     py::classh<tr::ITensor, PyITensor>(m, "ITensor").def(py::init());
     py::class_<tr::LoraCache::TaskLayerModuleConfig>(m, "TaskLayerModuleConfig")
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.h b/cpp/tensorrt_llm/pybind/runtime/bindings.h
index b8e1ab66574..7acac226212 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.h
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.h
@@ -18,9 +18,9 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <pybind11/pybind11.h>
+#include <nanobind/nanobind.h>
 
-namespace py = pybind11;
+namespace py = nanobind;
 
 namespace tensorrt_llm::pybind::runtime
 {
diff --git a/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp b/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp
index c8c32e5589b..50ce8a01317 100644
--- a/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp
@@ -19,13 +19,13 @@
 #include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
 #include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h"
 
-namespace py = pybind11;
+namespace py = nanobind;
 namespace tub = tensorrt_llm::runtime::ub;
 
 namespace tensorrt_llm::kernels::userbuffers
 {
 
-void UserBufferBindings::initBindings(pybind11::module_& m)
+void UserBufferBindings::initBindings(py::module_& m)
 {
     py::class_<tub::UBBuffer>(m, "UBBuffer")
         .def_readonly("size", &tub::UBBuffer::size)
diff --git a/cpp/tensorrt_llm/pybind/userbuffers/bindings.h b/cpp/tensorrt_llm/pybind/userbuffers/bindings.h
index 3a8fba2cc6f..e8913a5d846 100644
--- a/cpp/tensorrt_llm/pybind/userbuffers/bindings.h
+++ b/cpp/tensorrt_llm/pybind/userbuffers/bindings.h
@@ -18,13 +18,13 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <pybind11/pybind11.h>
+#include <nanobind/nanobind.h>
 
 namespace tensorrt_llm::kernels::userbuffers
 {
 class UserBufferBindings
 {
 public:
-    static void initBindings(pybind11::module_& m);
+    static void initBindings(nanobind::module_& m);
 };
 } // namespace tensorrt_llm::kernels::userbuffers

From fe06017ec0a9270ce3f36a25dd2f4cc2a6e232d6 Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Thu, 12 Jun 2025 12:44:43 +0800
Subject: [PATCH 2/8] cmake: pass an explicit binary dir for the nanobind subdirectory

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index cdde865a564..1699d0dc640 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -170,7 +170,7 @@ get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
 
 set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
 add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
-add_subdirectory(${3RDPARTY_DIR}/nanobind)
+add_subdirectory(${3RDPARTY_DIR}/nanobind ${CMAKE_CURRENT_BINARY_DIR}/nanobind)
 
 # include as system to suppress warnings
 include_directories(

From b059ebb1de69c5c4068ca35a71901e56996f54a6 Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Thu, 12 Jun 2025 13:37:20 +0800
Subject: [PATCH 3/8] runtime bindings: include nanobind/trampoline.h

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 cpp/tensorrt_llm/pybind/runtime/bindings.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
index 07d6d348852..45b621c2e51 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
@@ -44,6 +44,7 @@
 #include <nanobind/nanobind.h>
 #include <nanobind/nb_cast.h>
 #include <nanobind/operators.h>
+#include <nanobind/trampoline.h>
 
 #include <torch/extension.h>
 

From b80a28ab25b43b8994f7aa488ca09a9a5a5e4a6e Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Thu, 12 Jun 2025 13:57:53 +0800
Subject: [PATCH 4/8] switch custom casters and common binding headers to nanobind

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 .../pybind/batch_manager/cacheTransceiver.h   |  4 +-
 cpp/tensorrt_llm/pybind/common/bindTypes.h    |  4 +-
 .../pybind/common/customCasters.h             | 54 +++++++++----------
 3 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/cpp/tensorrt_llm/pybind/batch_manager/cacheTransceiver.h b/cpp/tensorrt_llm/pybind/batch_manager/cacheTransceiver.h
index 71221d8a7cb..49f65619e60 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/cacheTransceiver.h
+++ b/cpp/tensorrt_llm/pybind/batch_manager/cacheTransceiver.h
@@ -18,13 +18,13 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <pybind11/pybind11.h>
+#include <nanobind/nanobind.h>
 
 namespace tensorrt_llm::batch_manager
 {
 class CacheTransceiverBindings
 {
 public:
-    static void initBindings(pybind11::module_& m);
+    static void initBindings(nanobind::module_& m);
 };
 } // namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/pybind/common/bindTypes.h b/cpp/tensorrt_llm/pybind/common/bindTypes.h
index 5959bc8f70c..9ce6da21e9f 100644
--- a/cpp/tensorrt_llm/pybind/common/bindTypes.h
+++ b/cpp/tensorrt_llm/pybind/common/bindTypes.h
@@ -18,12 +18,12 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <pybind11/pybind11.h>
+#include <nanobind/nanobind.h>
 
 namespace PybindUtils
 {
 
-namespace py = pybind11;
+namespace py = nanobind;
 
 template <typename T>
 void bindList(py::module& m, std::string const& name)
diff --git a/cpp/tensorrt_llm/pybind/common/customCasters.h b/cpp/tensorrt_llm/pybind/common/customCasters.h
index 3d1eea7e3f3..a629c1ad0a5 100644
--- a/cpp/tensorrt_llm/pybind/common/customCasters.h
+++ b/cpp/tensorrt_llm/pybind/common/customCasters.h
@@ -17,12 +17,6 @@
 
 #pragma once
 
-#include "pybind11/cast.h"
-#include "pybind11/detail/common.h"
-#include "pybind11/detail/descr.h"
-#include "pybind11/pybind11.h"
-#include "pybind11/pytypes.h"
-
 #include "tensorrt_llm/batch_manager/common.h"
 #include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/common/optionalRef.h"
@@ -33,19 +27,20 @@
 #include "tensorrt_llm/runtime/torchView.h"
 
 #include <filesystem>
-#include <pybind11/stl_bind.h>
 #include <torch/extension.h>
 
+#include <nanobind/nanobind.h>
+
 // Pybind requires to have a central include in order for type casters to work.
 // Opaque bindings add a type caster, so they have the same requirement.
 // See the warning in https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html
 
 // Opaque bindings
-PYBIND11_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
-PYBIND11_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
+NB_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
+NB_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
 
 // Custom casters
-namespace PYBIND11_NAMESPACE
+namespace NB_NAMESPACE
 {
 
 namespace detail
@@ -56,9 +51,9 @@ struct type_caster<tensorrt_llm::common::OptionalRef<T>>
 {
     using value_conv = make_caster<T>;
 
-    PYBIND11_TYPE_CASTER(tensorrt_llm::common::OptionalRef<T>, value_conv::name);
+    NB_TYPE_CASTER(tensorrt_llm::common::OptionalRef<T>, value_conv::Name);
 
-    bool load(handle src, bool convert)
+    bool from_python(handle src, bool convert)
     {
         if (src.is_none())
         {
@@ -68,7 +63,7 @@ struct type_caster<tensorrt_llm::common::OptionalRef<T>>
         }
 
         value_conv conv;
-        if (!conv.load(src, convert))
+        if (!conv.from_python(src, convert))
             return false;
 
         // Create an OptionalRef with a reference to the converted value
@@ -101,7 +96,7 @@ struct PathCaster
     }
 
 public:
-    static handle cast(T const& path, return_value_policy, handle)
+    static handle from_cpp(T const& path, return_value_policy, handle)
     {
         if (auto py_str = unicode_from_fs_native(path.native()))
         {
@@ -110,7 +105,7 @@ struct PathCaster
         return nullptr;
     }
 
-    bool load(handle handle, bool)
+    bool from_python(handle handle, bool)
     {
         PyObject* native = nullptr;
         if constexpr (std::is_same_v<typename T::value_type, char>)
@@ -146,7 +141,7 @@ struct PathCaster
         return true;
     }
 
-    PYBIND11_TYPE_CASTER(T, const_name("os.PathLike"));
+    NB_TYPE_CASTER(T, const_name("os.PathLike"));
 };
 
 template <>
@@ -158,9 +153,9 @@ template <>
 class type_caster<tensorrt_llm::executor::StreamPtr>
 {
 public:
-    PYBIND11_TYPE_CASTER(tensorrt_llm::executor::StreamPtr, _("int"));
+    NB_TYPE_CASTER(tensorrt_llm::executor::StreamPtr, const_name("int"));
 
-    bool load([[maybe_unused]] handle src, bool)
+    bool from_python([[maybe_unused]] handle src, bool)
     {
         auto stream_ptr = src.cast<uintptr_t>();
         value = std::make_shared<tensorrt_llm::runtime::CudaStream>(reinterpret_cast<cudaStream_t>(stream_ptr));
@@ -168,7 +163,7 @@ class type_caster<tensorrt_llm::executor::StreamPtr>
         return true;
     }
 
-    static handle cast(
+    static handle from_cpp(
         tensorrt_llm::executor::StreamPtr const& src, return_value_policy /* policy */, handle /* parent */)
     {
         // Return cudaStream_t as integer.
@@ -180,10 +175,10 @@ template <>
 struct type_caster<tensorrt_llm::executor::Tensor>
 {
 public:
-    PYBIND11_TYPE_CASTER(tensorrt_llm::executor::Tensor, _("torch.Tensor"));
+    NB_TYPE_CASTER(tensorrt_llm::executor::Tensor, const_name("torch.Tensor"));
 
     // Convert PyObject(torch.Tensor) -> tensorrt_llm::executor::Tensor
-    bool load(handle src, bool)
+    bool from_python(handle src, bool)
     {
         PyObject* obj = src.ptr();
         if (THPVariable_Check(obj))
@@ -196,7 +191,8 @@ struct type_caster<tensorrt_llm::executor::Tensor>
     }
 
     // Convert tensorrt_llm::executor::Tensor -> PyObject(torch.Tensor)
-    static handle cast(tensorrt_llm::executor::Tensor const& src, return_value_policy /* policy */, handle /* parent */)
+    static handle from_cpp(
+        tensorrt_llm::executor::Tensor const& src, return_value_policy /* policy */, handle /* parent */)
     {
         return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(tensorrt_llm::executor::detail::toITensor(src)));
     }
@@ -206,10 +202,10 @@ template <>
 struct type_caster<tensorrt_llm::runtime::ITensor::SharedPtr>
 {
 public:
-    PYBIND11_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedPtr, _("torch.Tensor"));
+    NB_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedPtr, const_name("torch.Tensor"));
 
     // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedPtr
-    bool load(handle src, bool)
+    bool from_python(handle src, bool)
     {
         PyObject* obj = src.ptr();
         if (THPVariable_Check(obj))
@@ -222,7 +218,7 @@ struct type_caster<tensorrt_llm::runtime::ITensor::SharedPtr>
     }
 
     // Convert tensorrt_llm::runtime::ITensor::SharedPtr -> PyObject(torch.Tensor)
-    static handle cast(
+    static handle from_cpp(
         tensorrt_llm::runtime::ITensor::SharedPtr const& src, return_value_policy /* policy */, handle /* parent */)
     {
         if (src == nullptr)
@@ -237,10 +233,10 @@ template <>
 struct type_caster<tensorrt_llm::runtime::ITensor::SharedConstPtr>
 {
 public:
-    PYBIND11_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedConstPtr, _("torch.Tensor"));
+    NB_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedConstPtr, const_name("torch.Tensor"));
 
     // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedConstPtr
-    bool load(handle src, bool)
+    bool from_python(handle src, bool)
     {
         PyObject* obj = src.ptr();
         if (THPVariable_Check(obj))
@@ -253,7 +249,7 @@ struct type_caster<tensorrt_llm::runtime::ITensor::SharedConstPtr>
     }
 
     // Convert tensorrt_llm::runtime::ITensor::SharedConstPtr -> PyObject(torch.Tensor)
-    static handle cast(tensorrt_llm::runtime::ITensor::SharedConstPtr const& src, return_value_policy /* policy */,
+    static handle from_cpp(tensorrt_llm::runtime::ITensor::SharedConstPtr const& src, return_value_policy /* policy */,
         handle /* parent */)
     {
         if (src == nullptr)
@@ -266,4 +262,4 @@ struct type_caster<tensorrt_llm::runtime::ITensor::SharedConstPtr>
 };
 
 } // namespace detail
-} // namespace PYBIND11_NAMESPACE
+} // namespace NB_NAMESPACE

From 43da2fc35744f03c41f7c34a02688011bec1d04b Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Thu, 12 Jun 2025 14:11:41 +0800
Subject: [PATCH 5/8] switch batch_manager buffers and kvCacheManager bindings to nanobind

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 cpp/tensorrt_llm/pybind/batch_manager/buffers.cpp        | 8 ++------
 cpp/tensorrt_llm/pybind/batch_manager/buffers.h          | 4 ++--
 cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp | 6 +-----
 cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h   | 6 +++---
 cpp/tensorrt_llm/pybind/bindings.cpp                     | 1 -
 5 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/cpp/tensorrt_llm/pybind/batch_manager/buffers.cpp b/cpp/tensorrt_llm/pybind/batch_manager/buffers.cpp
index 721b12f6872..1f438ec027f 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/buffers.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/buffers.cpp
@@ -23,13 +23,9 @@
 #include "tensorrt_llm/batch_manager/transformerBuffers.h"
 
 #include <ATen/ATen.h>
-#include <pybind11/functional.h>
-#include <pybind11/operators.h>
-#include <pybind11/stl.h>
-#include <pybind11/stl_bind.h>
 #include <torch/extension.h>
 
-namespace py = pybind11;
+namespace py = nanobind;
 namespace tb = tensorrt_llm::batch_manager;
 namespace tr = tensorrt_llm::runtime;
 
@@ -38,7 +34,7 @@ using tr::SizeType32;
 namespace tensorrt_llm::pybind::batch_manager
 {
 
-void Buffers::initBindings(pybind11::module_& m)
+void Buffers::initBindings(py::module_& m)
 {
     py::class_<tb::TransformerBuffers>(m, "TransformerBuffers")
         .def(py::init<SizeType32, SizeType32, std::vector<SizeType32> const&, SizeType32, SizeType32,
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/buffers.h b/cpp/tensorrt_llm/pybind/batch_manager/buffers.h
index bfe06c0e8e8..29cba8fdfc5 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/buffers.h
+++ b/cpp/tensorrt_llm/pybind/batch_manager/buffers.h
@@ -18,13 +18,13 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <pybind11/pybind11.h>
+#include <nanobind/nanobind.h>
 
 namespace tensorrt_llm::pybind::batch_manager
 {
 class Buffers
 {
 public:
-    static void initBindings(pybind11::module_& m);
+    static void initBindings(nanobind::module_& m);
 };
 } // namespace tensorrt_llm::pybind::batch_manager
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
index 5be47790c9a..ee7032851e1 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
@@ -23,16 +23,12 @@
 #include "tensorrt_llm/runtime/torchView.h"
 
 #include <ATen/ATen.h>
-#include <pybind11/functional.h>
-#include <pybind11/operators.h>
-#include <pybind11/stl.h>
-#include <pybind11/stl_bind.h>
 #include <torch/extension.h>
 
 namespace tb = tensorrt_llm::batch_manager;
 namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
 namespace tr = tensorrt_llm::runtime;
-namespace py = pybind11;
+namespace py = nanobind;
 using BlockKey = tbk::BlockKey;
 using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
 using SizeType32 = tensorrt_llm::runtime::SizeType32;
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h
index 67d8b13ca71..96c9235c85d 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h
+++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h
@@ -18,14 +18,14 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <pybind11/pybind11.h>
+#include <nanobind/nanobind.h>
 
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 class KVCacheManagerBindings
 {
 public:
-    static void initBindings(pybind11::module_& m);
+    static void initBindings(nanobind::module_& m);
 };
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
 
@@ -34,6 +34,6 @@ namespace tensorrt_llm::batch_manager
 class BasePeftCacheManagerBindings
 {
 public:
-    static void initBindings(pybind11::module_& m);
+    static void initBindings(nanobind::module_& m);
 };
 } // namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp
index 5ca99d6a5e1..178b7fcb957 100644
--- a/cpp/tensorrt_llm/pybind/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/bindings.cpp
@@ -16,7 +16,6 @@
  */
 
 #include <nanobind/nanobind.h>
-#include <nanobind/nb_cast.h>
 #include <nanobind/operators.h>
 #include <torch/extension.h>
 #include <vector>

From ade7e1dea792332bb7069b330b03f58eb6c0153e Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Thu, 12 Jun 2025 14:17:47 +0800
Subject: [PATCH 6/8] fix: drop nanobind/nb_cast.h includes and use py::module_ in bindTypes.h

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 1 -
 cpp/tensorrt_llm/pybind/common/bindTypes.h         | 4 ++--
 cpp/tensorrt_llm/pybind/executor/bindings.cpp      | 1 -
 cpp/tensorrt_llm/pybind/runtime/bindings.cpp       | 1 -
 4 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
index f3a90fa68e5..bc8d327a19e 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -32,7 +32,6 @@
 #include <ATen/ATen.h>
 
 #include <nanobind/nanobind.h>
-#include <nanobind/nb_cast.h>
 #include <nanobind/operators.h>
 #include <torch/extension.h>
 
diff --git a/cpp/tensorrt_llm/pybind/common/bindTypes.h b/cpp/tensorrt_llm/pybind/common/bindTypes.h
index 9ce6da21e9f..8d0b55756ed 100644
--- a/cpp/tensorrt_llm/pybind/common/bindTypes.h
+++ b/cpp/tensorrt_llm/pybind/common/bindTypes.h
@@ -26,7 +26,7 @@ namespace PybindUtils
 namespace py = nanobind;
 
 template <typename T>
-void bindList(py::module& m, std::string const& name)
+void bindList(py::module_& m, std::string const& name)
 {
     py::class_<T>(m, name.c_str())
         .def(py::init())
@@ -58,7 +58,7 @@ void bindList(py::module& m, std::string const& name)
 }
 
 template <typename T>
-void bindSet(py::module& m, std::string const& name)
+void bindSet(py::module_& m, std::string const& name)
 {
     py::class_<T>(m, name.c_str())
         .def(py::init())
diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp
index 8f3cd0b0b43..5403031716d 100644
--- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp
@@ -23,7 +23,6 @@
 #include "tensorrt_llm/executor/types.h"
 
 #include <nanobind/nanobind.h>
-#include <nanobind/nb_cast.h>
 #include <nanobind/operators.h>
 
 #include <optional>
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
index 45b621c2e51..0775bf47088 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
@@ -42,7 +42,6 @@
 #include <c10/cuda/CUDAStream.h>
 
 #include <nanobind/nanobind.h>
-#include <nanobind/nb_cast.h>
 #include <nanobind/operators.h>
 #include <nanobind/trampoline.h>
 

From 32b5d362df689e760185152466683b2a8e67fdfe Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Thu, 12 Jun 2025 14:23:02 +0800
Subject: [PATCH 7/8] fix: replace nb::cast with nanobind::cast in customCasters.h

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 cpp/tensorrt_llm/pybind/common/customCasters.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/tensorrt_llm/pybind/common/customCasters.h b/cpp/tensorrt_llm/pybind/common/customCasters.h
index a629c1ad0a5..29c8ee0df45 100644
--- a/cpp/tensorrt_llm/pybind/common/customCasters.h
+++ b/cpp/tensorrt_llm/pybind/common/customCasters.h
@@ -51,7 +51,7 @@ struct type_caster<tensorrt_llm::common::OptionalRef<T>>
 {
     using value_conv = make_caster<T>;
 
-    nb::cast(tensorrt_llm::common::OptionalRef<T>, value_conv::name);
+    nanobind::cast(tensorrt_llm::common::OptionalRef<T>, value_conv::name);
 
     bool from_python(handle src, bool convert)
     {
@@ -141,7 +141,7 @@ struct PathCaster
         return true;
     }
 
-    nb::cast(T, const_name("os.PathLike"));
+    nanobind::cast(T, const_name("os.PathLike"));
 };
 
 template <>
@@ -153,7 +153,7 @@ template <>
 class type_caster<tensorrt_llm::executor::StreamPtr>
 {
 public:
-    nb::cast(tensorrt_llm::executor::StreamPtr, _("int"));
+    nanobind::cast(tensorrt_llm::executor::StreamPtr, _("int"));
 
     bool from_python([[maybe_unused]] handle src, bool)
     {
@@ -175,7 +175,7 @@ template <>
 struct type_caster<tensorrt_llm::executor::Tensor>
 {
 public:
-    nb::cast(tensorrt_llm::executor::Tensor, _("torch.Tensor"));
+    nanobind::cast(tensorrt_llm::executor::Tensor, _("torch.Tensor"));
 
     // Convert PyObject(torch.Tensor) -> tensorrt_llm::executor::Tensor
     bool from_python(handle src, bool)
@@ -202,7 +202,7 @@ template <>
 struct type_caster<tensorrt_llm::runtime::ITensor::SharedPtr>
 {
 public:
-    nb::cast(tensorrt_llm::runtime::ITensor::SharedPtr, _("torch.Tensor"));
+    nanobind::cast(tensorrt_llm::runtime::ITensor::SharedPtr, _("torch.Tensor"));
 
     // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedPtr
     bool from_python(handle src, bool)
@@ -233,7 +233,7 @@ template <>
 struct type_caster<tensorrt_llm::runtime::ITensor::SharedConstPtr>
 {
 public:
-    nb::cast(tensorrt_llm::runtime::ITensor::SharedConstPtr, _("torch.Tensor"));
+    nanobind::cast(tensorrt_llm::runtime::ITensor::SharedConstPtr, _("torch.Tensor"));
 
     // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedConstPtr
     bool from_python(handle src, bool)

From 7209b6cd384c0c3144dcbc22d378d3e9b9341b21 Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Thu, 12 Jun 2025 17:28:44 +0800
Subject: [PATCH 8/8] clean: restore pybind11 in common and runtime bindings

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 cpp/tensorrt_llm/pybind/common/bindTypes.h    |  8 +-
 .../pybind/common/customCasters.h             | 54 +++++-----
 cpp/tensorrt_llm/pybind/runtime/bindings.cpp  | 99 +++++++++----------
 cpp/tensorrt_llm/pybind/runtime/bindings.h    |  4 +-
 4 files changed, 83 insertions(+), 82 deletions(-)

diff --git a/cpp/tensorrt_llm/pybind/common/bindTypes.h b/cpp/tensorrt_llm/pybind/common/bindTypes.h
index 8d0b55756ed..5959bc8f70c 100644
--- a/cpp/tensorrt_llm/pybind/common/bindTypes.h
+++ b/cpp/tensorrt_llm/pybind/common/bindTypes.h
@@ -18,15 +18,15 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <nanobind/nanobind.h>
+#include <pybind11/pybind11.h>
 
 namespace PybindUtils
 {
 
-namespace py = nanobind;
+namespace py = pybind11;
 
 template <typename T>
-void bindList(py::module_& m, std::string const& name)
+void bindList(py::module& m, std::string const& name)
 {
     py::class_<T>(m, name.c_str())
         .def(py::init())
@@ -58,7 +58,7 @@ void bindList(py::module_& m, std::string const& name)
 }
 
 template <typename T>
-void bindSet(py::module_& m, std::string const& name)
+void bindSet(py::module& m, std::string const& name)
 {
     py::class_<T>(m, name.c_str())
         .def(py::init())
diff --git a/cpp/tensorrt_llm/pybind/common/customCasters.h b/cpp/tensorrt_llm/pybind/common/customCasters.h
index 29c8ee0df45..3d1eea7e3f3 100644
--- a/cpp/tensorrt_llm/pybind/common/customCasters.h
+++ b/cpp/tensorrt_llm/pybind/common/customCasters.h
@@ -17,6 +17,12 @@
 
 #pragma once
 
+#include "pybind11/cast.h"
+#include "pybind11/detail/common.h"
+#include "pybind11/detail/descr.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/pytypes.h"
+
 #include "tensorrt_llm/batch_manager/common.h"
 #include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/common/optionalRef.h"
@@ -27,20 +33,19 @@
 #include "tensorrt_llm/runtime/torchView.h"
 
 #include <filesystem>
+#include <pybind11/stl_bind.h>
 #include <torch/extension.h>
 
-#include <nanobind/nanobind.h>
-
 // Pybind requires to have a central include in order for type casters to work.
 // Opaque bindings add a type caster, so they have the same requirement.
 // See the warning in https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html
 
 // Opaque bindings
-NB_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
-NB_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
+PYBIND11_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
+PYBIND11_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
 
 // Custom casters
-namespace NB_NAMESPACE
+namespace PYBIND11_NAMESPACE
 {
 
 namespace detail
@@ -51,9 +56,9 @@ struct type_caster<tensorrt_llm::common::OptionalRef<T>>
 {
     using value_conv = make_caster<T>;
 
-    nanobind::cast(tensorrt_llm::common::OptionalRef<T>, value_conv::name);
+    PYBIND11_TYPE_CASTER(tensorrt_llm::common::OptionalRef<T>, value_conv::name);
 
-    bool from_python(handle src, bool convert)
+    bool load(handle src, bool convert)
     {
         if (src.is_none())
         {
@@ -63,7 +68,7 @@ struct type_caster<tensorrt_llm::common::OptionalRef<T>>
         }
 
         value_conv conv;
-        if (!conv.from_python(src, convert))
+        if (!conv.load(src, convert))
             return false;
 
         // Create an OptionalRef with a reference to the converted value
@@ -96,7 +101,7 @@ struct PathCaster
     }
 
 public:
-    static handle from_cpp(T const& path, return_value_policy, handle)
+    static handle cast(T const& path, return_value_policy, handle)
     {
         if (auto py_str = unicode_from_fs_native(path.native()))
         {
@@ -105,7 +110,7 @@ struct PathCaster
         return nullptr;
     }
 
-    bool from_python(handle handle, bool)
+    bool load(handle handle, bool)
     {
         PyObject* native = nullptr;
         if constexpr (std::is_same_v<typename T::value_type, char>)
@@ -141,7 +146,7 @@ struct PathCaster
         return true;
     }
 
-    nanobind::cast(T, const_name("os.PathLike"));
+    PYBIND11_TYPE_CASTER(T, const_name("os.PathLike"));
 };
 
 template <>
@@ -153,9 +158,9 @@ template <>
 class type_caster<tensorrt_llm::executor::StreamPtr>
 {
 public:
-    nanobind::cast(tensorrt_llm::executor::StreamPtr, _("int"));
+    PYBIND11_TYPE_CASTER(tensorrt_llm::executor::StreamPtr, _("int"));
 
-    bool from_python([[maybe_unused]] handle src, bool)
+    bool load([[maybe_unused]] handle src, bool)
     {
         auto stream_ptr = src.cast<uintptr_t>();
         value = std::make_shared<tensorrt_llm::runtime::CudaStream>(reinterpret_cast<cudaStream_t>(stream_ptr));
@@ -163,7 +168,7 @@ class type_caster<tensorrt_llm::executor::StreamPtr>
         return true;
     }
 
-    static handle from_cpp(
+    static handle cast(
         tensorrt_llm::executor::StreamPtr const& src, return_value_policy /* policy */, handle /* parent */)
     {
         // Return cudaStream_t as integer.
@@ -175,10 +180,10 @@ template <>
 struct type_caster<tensorrt_llm::executor::Tensor>
 {
 public:
-    nanobind::cast(tensorrt_llm::executor::Tensor, _("torch.Tensor"));
+    PYBIND11_TYPE_CASTER(tensorrt_llm::executor::Tensor, _("torch.Tensor"));
 
     // Convert PyObject(torch.Tensor) -> tensorrt_llm::executor::Tensor
-    bool from_python(handle src, bool)
+    bool load(handle src, bool)
     {
         PyObject* obj = src.ptr();
         if (THPVariable_Check(obj))
@@ -191,8 +196,7 @@ struct type_caster<tensorrt_llm::executor::Tensor>
     }
 
     // Convert tensorrt_llm::executor::Tensor -> PyObject(torch.Tensor)
-    static handle from_cpp(
-        tensorrt_llm::executor::Tensor const& src, return_value_policy /* policy */, handle /* parent */)
+    static handle cast(tensorrt_llm::executor::Tensor const& src, return_value_policy /* policy */, handle /* parent */)
     {
         return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(tensorrt_llm::executor::detail::toITensor(src)));
     }
@@ -202,10 +206,10 @@ template <>
 struct type_caster<tensorrt_llm::runtime::ITensor::SharedPtr>
 {
 public:
-    nanobind::cast(tensorrt_llm::runtime::ITensor::SharedPtr, _("torch.Tensor"));
+    PYBIND11_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedPtr, _("torch.Tensor"));
 
     // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedPtr
-    bool from_python(handle src, bool)
+    bool load(handle src, bool)
     {
         PyObject* obj = src.ptr();
         if (THPVariable_Check(obj))
@@ -218,7 +222,7 @@ struct type_caster<tensorrt_llm::runtime::ITensor::SharedPtr>
     }
 
     // Convert tensorrt_llm::runtime::ITensor::SharedPtr -> PyObject(torch.Tensor)
-    static handle from_cpp(
+    static handle cast(
         tensorrt_llm::runtime::ITensor::SharedPtr const& src, return_value_policy /* policy */, handle /* parent */)
     {
         if (src == nullptr)
@@ -233,10 +237,10 @@ template <>
 struct type_caster<tensorrt_llm::runtime::ITensor::SharedConstPtr>
 {
 public:
-    nanobind::cast(tensorrt_llm::runtime::ITensor::SharedConstPtr, _("torch.Tensor"));
+    PYBIND11_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedConstPtr, _("torch.Tensor"));
 
     // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedConstPtr
-    bool from_python(handle src, bool)
+    bool load(handle src, bool)
     {
         PyObject* obj = src.ptr();
         if (THPVariable_Check(obj))
@@ -249,7 +253,7 @@ struct type_caster<tensorrt_llm::runtime::ITensor::SharedConstPtr>
     }
 
     // Convert tensorrt_llm::runtime::ITensor::SharedConstPtr -> PyObject(torch.Tensor)
-    static handle from_cpp(tensorrt_llm::runtime::ITensor::SharedConstPtr const& src, return_value_policy /* policy */,
+    static handle cast(tensorrt_llm::runtime::ITensor::SharedConstPtr const& src, return_value_policy /* policy */,
         handle /* parent */)
     {
         if (src == nullptr)
@@ -262,4 +266,4 @@ struct type_caster<tensorrt_llm::runtime::ITensor::SharedConstPtr>
 };
 
 } // namespace detail
-} // namespace NB_NAMESPACE
+} // namespace PYBIND11_NAMESPACE
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
index 0775bf47088..6a9a2e0dcd2 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
@@ -40,11 +40,8 @@
 #include "tensorrt_llm/runtime/torchView.h"
 #include <ATen/ATen.h>
 #include <c10/cuda/CUDAStream.h>
-
-#include <nanobind/nanobind.h>
-#include <nanobind/operators.h>
-#include <nanobind/trampoline.h>
-
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
 #include <torch/extension.h>
 
 namespace tr = tensorrt_llm::runtime;
@@ -57,73 +54,73 @@ class PyITensor : public tensorrt_llm::runtime::ITensor
 
     [[nodiscard]] void* data() override
     {
-        NB_OVERRIDE_PURE(void*, /* Return type */
-            ITensor,            /* Parent class */
-            data                /* Name of function in C++ (must match Python name) */
-                                /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(void*, /* Return type */
+            ITensor,                  /* Parent class */
+            data                      /* Name of function in C++ (must match Python name) */
+                                      /* Argument(s) */
         );
     }
 
     [[nodiscard]] void const* data() const override
     {
-        NB_OVERRIDE_PURE(void const*, /* Return type */
-            ITensor,                  /* Parent class */
-            data                      /* Name of function in C++ (must match Python name) */
-                                      /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(void const*, /* Return type */
+            ITensor,                        /* Parent class */
+            data                            /* Name of function in C++ (must match Python name) */
+                                            /* Argument(s) */
         );
     }
 
     [[nodiscard]] std::size_t getSize() const override
     {
-        NB_OVERRIDE_PURE(std::size_t, /* Return type */
-            ITensor,                  /* Parent class */
-            getSize                   /* Name of function in C++ (must match Python name) */
-                                      /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(std::size_t, /* Return type */
+            ITensor,                        /* Parent class */
+            getSize                         /* Name of function in C++ (must match Python name) */
+                                            /* Argument(s) */
         );
     }
 
     [[nodiscard]] std::size_t getCapacity() const override
     {
-        NB_OVERRIDE_PURE(std::size_t, /* Return type */
-            ITensor,                  /* Parent class */
-            getCapacity               /* Name of function in C++ (must match Python name) */
-                                      /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(std::size_t, /* Return type */
+            ITensor,                        /* Parent class */
+            getCapacity                     /* Name of function in C++ (must match Python name) */
+                                            /* Argument(s) */
         );
     }
 
     [[nodiscard]] DataType getDataType() const override
     {
-        NB_OVERRIDE_PURE(DataType, /* Return type */
-            ITensor,               /* Parent class */
-            getDataType            /* Name of function in C++ (must match Python name) */
-                                   /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(DataType, /* Return type */
+            ITensor,                     /* Parent class */
+            getDataType                  /* Name of function in C++ (must match Python name) */
+                                         /* Argument(s) */
         );
     }
 
     [[nodiscard]] tr::MemoryType getMemoryType() const override
     {
-        NB_OVERRIDE_PURE(tr::MemoryType, /* Return type */
-            ITensor,                     /* Parent class */
-            getMemoryType                /* Name of function in C++ (must match Python name) */
-                                         /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(tr::MemoryType, /* Return type */
+            ITensor,                           /* Parent class */
+            getMemoryType                      /* Name of function in C++ (must match Python name) */
+                                               /* Argument(s) */
         );
     }
 
     [[nodiscard]] char const* getMemoryTypeName() const override
     {
-        NB_OVERRIDE_PURE(char const*, /* Return type */
-            ITensor,                  /* Parent class */
-            getMemoryTypeName         /* Name of function in C++ (must match Python name) */
-                                      /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(char const*, /* Return type */
+            ITensor,                        /* Parent class */
+            getMemoryTypeName               /* Name of function in C++ (must match Python name) */
+                                            /* Argument(s) */
         );
     }
 
     virtual void resize(std::size_t newSize) override
     {
-        NB_OVERRIDE_PURE(void, /* Return type */
-            ITensor,           /* Parent class */
-            resize             /* Name of function in C++ (must match Python name) */
-                               /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(void, /* Return type */
+            ITensor,                 /* Parent class */
+            resize                   /* Name of function in C++ (must match Python name) */
+                                     /* Argument(s) */
         );
     }
 
@@ -138,19 +135,19 @@ class PyITensor : public tensorrt_llm::runtime::ITensor
 
     [[nodiscard]] Shape const& getShape() const override
     {
-        NB_OVERRIDE_PURE(Shape const&, /* Return type */
-            ITensor,                   /* Parent class */
-            getShape                   /* Name of function in C++ (must match Python name) */
-                                       /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(Shape const&, /* Return type */
+            ITensor,                         /* Parent class */
+            getShape                         /* Name of function in C++ (must match Python name) */
+                                             /* Argument(s) */
         );
     }
 
     void reshape(Shape const& dims) override
     {
-        NB_OVERRIDE_PURE(void, /* Return type */
-            ITensor,           /* Parent class */
-            reshape,           /* Name of function in C++ (must match Python name) */
-            dims               /* Argument(s) */
+        PYBIND11_OVERRIDE_PURE(void, /* Return type */
+            ITensor,                 /* Parent class */
+            reshape,                 /* Name of function in C++ (must match Python name) */
+            dims                     /* Argument(s) */
         );
     }
 };
@@ -165,35 +162,35 @@ class PyIGptDecoder : public tr::IGptDecoder
         std::optional<tr::DecodingOutput> const& output = std::nullopt,
         std::optional<std::vector<tr::decoder_batch::Request> const> const& requests = std::nullopt) override
     {
-        NB_OVERRIDE_PURE(void, IGptDecoder, setup, samplingConfig, batchSize, batchSlots, output, requests);
+        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, setup, samplingConfig, batchSize, batchSlots, output, requests);
     }
 
     void forwardAsync(tr::DecodingOutput& output, tr::DecodingInput const& input) override
     {
-        NB_OVERRIDE_PURE(void, IGptDecoder, forwardAsync, output, input);
+        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, forwardAsync, output, input);
     }
 
     void forwardSync(tr::DecodingOutput& output, tr::DecodingInput const& input) override
     {
-        NB_OVERRIDE_PURE(void, IGptDecoder, forwardSync, output, input);
+        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, forwardSync, output, input);
     }
 
     tr::SamplingConfig const& getSamplingConfig() override
     {
-        NB_OVERRIDE_PURE(tr::SamplingConfig const&, IGptDecoder, getSamplingConfig);
+        PYBIND11_OVERRIDE_PURE(tr::SamplingConfig const&, IGptDecoder, getSamplingConfig);
     }
 
     void disableLookahead(std::optional<tr::SamplingConfig> const& samplingConfig, tr::SizeType32 batchSize,
         tr::DecodingInput::TensorConstPtr batchSlots) override
     {
-        NB_OVERRIDE_PURE(void, IGptDecoder, disableLookahead, samplingConfig, batchSize, batchSlots);
+        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, disableLookahead, samplingConfig, batchSize, batchSlots);
     }
 };
 
 namespace tensorrt_llm::pybind::runtime
 {
 
-void initBindings(py::module_& m)
+void initBindings(pybind11::module_& m)
 {
     py::classh<tr::ITensor, PyITensor>(m, "ITensor").def(py::init());
     py::class_<tr::LoraCache::TaskLayerModuleConfig>(m, "TaskLayerModuleConfig")
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.h b/cpp/tensorrt_llm/pybind/runtime/bindings.h
index 7acac226212..b8e1ab66574 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.h
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.h
@@ -18,9 +18,9 @@
 #pragma once
 
 #include "tensorrt_llm/pybind/common/customCasters.h"
-#include <nanobind/nanobind.h>
+#include <pybind11/pybind11.h>
 
-namespace py = nanobind;
+namespace py = pybind11;
 
 namespace tensorrt_llm::pybind::runtime
 {